/* Subroutines used for code generation on IA-32.
   Copyright (C) 1988-2017 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "cfgbuild.h"
#include "alias.h"
#include "fold-const.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "explow.h"
#include "expr.h"
#include "cfgrtl.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
#include "params.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
#include "tree-pass.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tree-iterator.h"
#include "tree-chkp.h"
#include "rtl-chkp.h"
#include "dbgcnt.h"
#include "case-cfn-macros.h"
#include "regrename.h"
#include "dojump.h"
#include "fold-const-call.h"
#include "tree-vrp.h"
#include "tree-ssanames.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
#include "symbol-summary.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"

/* This file should be included last.  */
#include "target-def.h"

#include "x86-tune-costs.h"
static rtx legitimize_dllimport_symbol (rtx, bool);
static rtx legitimize_pe_coff_extern_decl (rtx, bool);
static rtx legitimize_pe_coff_symbol (rtx, bool);
static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
static bool ix86_save_reg (unsigned int, bool, bool);
static bool ix86_function_naked (const_tree);
static bool ix86_notrack_prefixed_insn_p (rtx);
static void ix86_emit_restore_reg_using_pop (rtx);

#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif
/* Return index of given mode in mult and division cost tables.  */
#define MODE_INDEX(mode)		\
  ((mode) == QImode ? 0			\
   : (mode) == HImode ? 1		\
   : (mode) == SImode ? 2		\
   : (mode) == DImode ? 3		\
   : 4)
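
/* For example, MODE_INDEX (SImode) evaluates to 2 and selects the SImode
   entry of the mult and division cost tables mentioned above; any mode
   other than QImode, HImode, SImode or DImode maps to index 4.  */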
/* Set by -mtune.  */
const struct processor_costs *ix86_tune_cost = NULL;

/* Set by -mtune or -Os.  */
const struct processor_costs *ix86_cost = NULL;

/* Processor feature/optimization bitmasks.  */
#define m_386 (1U<<PROCESSOR_I386)
#define m_486 (1U<<PROCESSOR_I486)
#define m_PENT (1U<<PROCESSOR_PENTIUM)
#define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
#define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
#define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
#define m_NOCONA (1U<<PROCESSOR_NOCONA)
#define m_P4_NOCONA (m_PENT4 | m_NOCONA)
#define m_CORE2 (1U<<PROCESSOR_CORE2)
#define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
#define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
#define m_HASWELL (1U<<PROCESSOR_HASWELL)
#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
#define m_BONNELL (1U<<PROCESSOR_BONNELL)
#define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
#define m_KNL (1U<<PROCESSOR_KNL)
#define m_KNM (1U<<PROCESSOR_KNM)
#define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
#define m_CANNONLAKE (1U<<PROCESSOR_CANNONLAKE)
#define m_INTEL (1U<<PROCESSOR_INTEL)

#define m_GEODE (1U<<PROCESSOR_GEODE)
#define m_K6 (1U<<PROCESSOR_K6)
#define m_K6_GEODE (m_K6 | m_GEODE)
#define m_K8 (1U<<PROCESSOR_K8)
#define m_ATHLON (1U<<PROCESSOR_ATHLON)
#define m_ATHLON_K8 (m_K8 | m_ATHLON)
#define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
#define m_BDVER1 (1U<<PROCESSOR_BDVER1)
#define m_BDVER2 (1U<<PROCESSOR_BDVER2)
#define m_BDVER3 (1U<<PROCESSOR_BDVER3)
#define m_BDVER4 (1U<<PROCESSOR_BDVER4)
#define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
#define m_BTVER1 (1U<<PROCESSOR_BTVER1)
#define m_BTVER2 (1U<<PROCESSOR_BTVER2)
#define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
#define m_BTVER (m_BTVER1 | m_BTVER2)
#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
			| m_ZNVER1)

#define m_GENERIC (1U<<PROCESSOR_GENERIC)
const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
#undef DEF_TUNE
#define DEF_TUNE(tune, name, selector) name,
#include "x86-tune.def"
#undef DEF_TUNE
};

/* Feature tests against the various tunings.  */
unsigned char ix86_tune_features[X86_TUNE_LAST];

/* Feature tests against the various tunings used to create ix86_tune_features
   based on the processor mask.  */
static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
#undef DEF_TUNE
#define DEF_TUNE(tune, name, selector) selector,
#include "x86-tune.def"
#undef DEF_TUNE
};
/* Feature tests against the various architecture variations.  */
unsigned char ix86_arch_features[X86_ARCH_LAST];

/* Feature tests against the various architecture variations, used to create
   ix86_arch_features based on the processor mask.  */
static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
  /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
  ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),

  /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
  ~m_386,

  /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
  ~(m_386 | m_486),

  /* X86_ARCH_XADD: Exchange and add was added for 80486.  */
  ~m_386,

  /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
  ~m_386,
};
/* In case the average insn count for single function invocation is
   lower than this constant, emit fast (but longer) prologue and
   epilogue code.  */
#define FAST_PROLOGUE_INSN_COUNT 20

/* Names for 8 (low), 8 (high), and 16-bit registers, respectively.  */
static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
/* Array of the smallest class containing reg number REGNO, indexed by
   REGNO.  Used by REGNO_REG_CLASS in i386.h.  */

enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
{
  /* ax, dx, cx, bx */
  AREG, DREG, CREG, BREG,
  /* si, di, bp, sp */
  SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
  /* FP registers */
  FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
  FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
  /* arg pointer */
  NON_Q_REGS,
  /* flags, fpsr, fpcr, frame */
  NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
  /* SSE registers */
  SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
  SSE_REGS, SSE_REGS,
  /* MMX registers */
  MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
  MMX_REGS, MMX_REGS,
  /* REX registers */
  NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
  NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
  /* SSE REX registers */
  SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
  SSE_REGS, SSE_REGS,
  /* AVX-512 SSE registers */
  EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
  EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
  EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
  EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
  /* Mask registers.  */
  MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
  MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
  /* MPX bound registers */
  BND_REGS, BND_REGS, BND_REGS, BND_REGS,
};
/* The "default" register map used in 32bit mode.  */

int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
{
  0, 2, 1, 3, 6, 7, 4, 5,                 /* general regs */
  12, 13, 14, 15, 16, 17, 18, 19,         /* fp regs */
  -1, -1, -1, -1, -1,                     /* arg, flags, fpsr, fpcr, frame */
  21, 22, 23, 24, 25, 26, 27, 28,         /* SSE */
  29, 30, 31, 32, 33, 34, 35, 36,         /* MMX */
  -1, -1, -1, -1, -1, -1, -1, -1,         /* extended integer registers */
  -1, -1, -1, -1, -1, -1, -1, -1,         /* extended SSE registers */
  -1, -1, -1, -1, -1, -1, -1, -1,         /* AVX-512 registers 16-23 */
  -1, -1, -1, -1, -1, -1, -1, -1,         /* AVX-512 registers 24-31 */
  93, 94, 95, 96, 97, 98, 99, 100,        /* Mask registers */
  101, 102, 103, 104,                     /* bound registers */
};

/* The "default" register map used in 64bit mode.  */

int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
{
  0, 1, 2, 3, 4, 5, 6, 7,                 /* general regs */
  33, 34, 35, 36, 37, 38, 39, 40,         /* fp regs */
  -1, -1, -1, -1, -1,                     /* arg, flags, fpsr, fpcr, frame */
  17, 18, 19, 20, 21, 22, 23, 24,         /* SSE */
  41, 42, 43, 44, 45, 46, 47, 48,         /* MMX */
  8, 9, 10, 11, 12, 13, 14, 15,           /* extended integer registers */
  25, 26, 27, 28, 29, 30, 31, 32,         /* extended SSE registers */
  67, 68, 69, 70, 71, 72, 73, 74,         /* AVX-512 registers 16-23 */
  75, 76, 77, 78, 79, 80, 81, 82,         /* AVX-512 registers 24-31 */
  118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
  126, 127, 128, 129,                     /* bound registers */
};
/* Define the register numbers to be used in Dwarf debugging information.
   The SVR4 reference port C compiler uses the following register numbers
   in its Dwarf output code:
	0 for %eax (gcc regno = 0)
	1 for %ecx (gcc regno = 2)
	2 for %edx (gcc regno = 1)
	3 for %ebx (gcc regno = 3)
	4 for %esp (gcc regno = 7)
	5 for %ebp (gcc regno = 6)
	6 for %esi (gcc regno = 4)
	7 for %edi (gcc regno = 5)
   The following three DWARF register numbers are never generated by
   the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
   believed these numbers have these meanings.
	8  for %eip    (no gcc equivalent)
	9  for %eflags (gcc regno = 17)
	10 for %trapno (no gcc equivalent)
   It is not at all clear how we should number the FP stack registers
   for the x86 architecture.  If the version of SDB on x86/svr4 were
   a bit less brain dead with respect to floating-point then we would
   have a precedent to follow with respect to DWARF register numbers
   for x86 FP registers, but the SDB on x86/svr4 was so completely
   broken with respect to FP registers that it is hardly worth thinking
   of it as something to strive for compatibility with.
   The version of x86/svr4 SDB I had does (partially)
   seem to believe that DWARF register number 11 is associated with
   the x86 register %st(0), but that's about all.  Higher DWARF
   register numbers don't seem to be associated with anything in
   particular, and even for DWARF regno 11, SDB only seemed to under-
   stand that it should say that a variable lives in %st(0) (when
   asked via an `=' command) if we said it was in DWARF regno 11,
   but SDB still printed garbage when asked for the value of the
   variable in question (via a `/' command).
   (Also note that the labels SDB printed for various FP stack regs
   when doing an `x' command were all wrong.)
   Note that these problems generally don't affect the native SVR4
   C compiler because it doesn't allow the use of -O with -g and
   because when it is *not* optimizing, it allocates a memory
   location for each floating-point variable, and the memory
   location is what gets described in the DWARF AT_location
   attribute for the variable in question.
   Regardless of the severe mental illness of the x86/svr4 SDB, we
   do something sensible here and we use the following DWARF
   register numbers.  Note that these are all stack-top-relative
   numbers.
	11 for %st(0) (gcc regno = 8)
	12 for %st(1) (gcc regno = 9)
	13 for %st(2) (gcc regno = 10)
	14 for %st(3) (gcc regno = 11)
	15 for %st(4) (gcc regno = 12)
	16 for %st(5) (gcc regno = 13)
	17 for %st(6) (gcc regno = 14)
	18 for %st(7) (gcc regno = 15)  */

int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
{
  0, 2, 1, 3, 6, 7, 5, 4,                 /* general regs */
  11, 12, 13, 14, 15, 16, 17, 18,         /* fp regs */
  -1, 9, -1, -1, -1,                      /* arg, flags, fpsr, fpcr, frame */
  21, 22, 23, 24, 25, 26, 27, 28,         /* SSE registers */
  29, 30, 31, 32, 33, 34, 35, 36,         /* MMX registers */
  -1, -1, -1, -1, -1, -1, -1, -1,         /* extended integer registers */
  -1, -1, -1, -1, -1, -1, -1, -1,         /* extended SSE registers */
  -1, -1, -1, -1, -1, -1, -1, -1,         /* AVX-512 registers 16-23 */
  -1, -1, -1, -1, -1, -1, -1, -1,         /* AVX-512 registers 24-31 */
  93, 94, 95, 96, 97, 98, 99, 100,        /* Mask registers */
  101, 102, 103, 104,                     /* bound registers */
};
/* Define parameter passing and return registers.  */

static int const x86_64_int_parameter_registers[6] =
{
  DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
};

static int const x86_64_ms_abi_int_parameter_registers[4] =
{
  CX_REG, DX_REG, R8_REG, R9_REG
};

static int const x86_64_int_return_registers[4] =
{
  AX_REG, DX_REG, DI_REG, SI_REG
};

/* Additional registers that are clobbered by SYSV calls.  */

#define NUM_X86_64_MS_CLOBBERED_REGS 12
static int const x86_64_ms_sysv_extra_clobbered_registers
		 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
enum xlogue_stub {
  XLOGUE_STUB_SAVE,
  XLOGUE_STUB_RESTORE,
  XLOGUE_STUB_RESTORE_TAIL,
  XLOGUE_STUB_SAVE_HFP,
  XLOGUE_STUB_RESTORE_HFP,
  XLOGUE_STUB_RESTORE_HFP_TAIL,

  XLOGUE_STUB_COUNT
};

enum xlogue_stub_sets {
  XLOGUE_SET_ALIGNED,
  XLOGUE_SET_ALIGNED_PLUS_8,
  XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
  XLOGUE_SET_HFP_ALIGNED_PLUS_8,

  XLOGUE_SET_COUNT
};
/* Register save/restore layout used by out-of-line stubs.  */
class xlogue_layout {
public:
  struct reginfo
  {
    unsigned regno;
    HOST_WIDE_INT offset;	/* Offset used by stub base pointer (rax or
				   rsi) to where each register is stored.  */
  };

  unsigned get_nregs () const {return m_nregs;}
  HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}

  const reginfo &get_reginfo (unsigned reg) const
  {
    gcc_assert (reg < m_nregs);
    return m_regs[reg];
  }

  static const char *get_stub_name (enum xlogue_stub stub,
				    unsigned n_extra_args);

  /* Returns an rtx for the stub's symbol based upon
       1.) the specified stub (save, restore or restore_ret) and
       2.) the value of cfun->machine->call_ms2sysv_extra_regs and
       3.) whether or not stack alignment is being performed.  */
  static rtx get_stub_rtx (enum xlogue_stub stub);

  /* Returns the amount of stack space (including padding) that the stub
     needs to store registers based upon data in the machine_function.  */
  HOST_WIDE_INT get_stack_space_used () const
  {
    const struct machine_function *m = cfun->machine;
    unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;

    gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
    return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
  }

  /* Returns the offset for the base pointer used by the stub.  */
  HOST_WIDE_INT get_stub_ptr_offset () const
  {
    return STUB_INDEX_OFFSET + m_stack_align_off_in;
  }

  static const struct xlogue_layout &get_instance ();
  static unsigned count_stub_managed_regs ();
  static bool is_stub_managed_reg (unsigned regno, unsigned count);

  static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
  static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
  static const unsigned MAX_REGS = 18;
  static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
  static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
  static const unsigned STUB_NAME_MAX_LEN = 20;
  static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
  static const unsigned REG_ORDER[MAX_REGS];
  static const unsigned REG_ORDER_REALIGN[MAX_REGS];

private:
  xlogue_layout ();
  xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
  xlogue_layout (const xlogue_layout &);

  /* True if hard frame pointer is used.  */
  bool m_hfp;

  /* Max number of register this layout manages.  */
  unsigned m_nregs;

  /* Incoming offset from 16-byte alignment.  */
  HOST_WIDE_INT m_stack_align_off_in;

  /* Register order and offsets.  */
  struct reginfo m_regs[MAX_REGS];

  /* Lazy-inited cache of symbol names for stubs.  */
  static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
			  [STUB_NAME_MAX_LEN];

  static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
};
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
  "savms64",
  "resms64",
  "resms64x",
  "savms64f",
  "resms64f",
  "resms64fx"
};

const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    s_instances:   0            1            2              3
    Offset:                                  realigned or   aligned + 8
    Register       aligned      aligned + 8  aligned w/HFP  w/HFP */
    XMM15_REG,	/* 0x10        0x18         0x10           0x18 */
    XMM14_REG,	/* 0x20        0x28         0x20           0x28 */
    XMM13_REG,	/* 0x30        0x38         0x30           0x38 */
    XMM12_REG,	/* 0x40        0x48         0x40           0x48 */
    XMM11_REG,	/* 0x50        0x58         0x50           0x58 */
    XMM10_REG,	/* 0x60        0x68         0x60           0x68 */
    XMM9_REG,	/* 0x70        0x78         0x70           0x78 */
    XMM8_REG,	/* 0x80        0x88         0x80           0x88 */
    XMM7_REG,	/* 0x90        0x98         0x90           0x98 */
    XMM6_REG,	/* 0xa0        0xa8         0xa0           0xa8 */
    SI_REG,	/* 0xa8        0xb0         0xa8           0xb0 */
    DI_REG,	/* 0xb0        0xb8         0xb0           0xb8 */
    BX_REG,	/* 0xb8        0xc0         0xb8           0xc0 */
    BP_REG,	/* 0xc0        0xc8         N/A            N/A  */
    R12_REG,	/* 0xc8        0xd0         0xc0           0xc8 */
    R13_REG,	/* 0xd0        0xd8         0xc8           0xd0 */
    R14_REG,	/* 0xd8        0xe0         0xd0           0xd8 */
    R15_REG,	/* 0xe0        0xe8         0xd8           0xe0 */
};
/* Instantiate static const values.  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;

/* Initialize xlogue_layout::s_stub_names to zero.  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				[STUB_NAME_MAX_LEN];

/* Instantiates all xlogue_layout instances.  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),
  xlogue_layout (8, false),
  xlogue_layout (0, true),
  xlogue_layout (8, true)
};
/* Return an appropriate const instance of xlogue_layout based upon values
   in cfun->machine and crtl.  */
const struct xlogue_layout &
xlogue_layout::get_instance ()
{
  enum xlogue_stub_sets stub_set;
  bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;

  if (stack_realign_fp)
    stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else if (frame_pointer_needed)
    stub_set = aligned_plus_8
	       ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
	       : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else
    stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;

  return s_instances[stub_set];
}
/* Determine how many clobbered registers can be saved by the stub.
   Returns the count of registers the stub will save and restore.  */
unsigned
xlogue_layout::count_stub_managed_regs ()
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i, count;
  unsigned regno;

  for (count = i = MIN_REGS; i < MAX_REGS; ++i)
    {
      regno = REG_ORDER[i];
      if (regno == BP_REG && hfp)
	continue;
      if (!ix86_save_reg (regno, false, false))
	break;
      ++count;
    }
  return count;
}
/* Determine if register REGNO is a stub managed register given the
   total COUNT of stub managed registers.  */
bool
xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i;

  for (i = 0; i < count; ++i)
    {
      gcc_assert (i < MAX_REGS);
      if (REG_ORDER[i] == BP_REG && hfp)
	++count;
      else if (REG_ORDER[i] == regno)
	return true;
    }
  return false;
}
/* Constructor for xlogue_layout.  */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
  : m_hfp (hfp), m_nregs (hfp ? 17 : 18),
    m_stack_align_off_in (stack_align_off_in)
{
  HOST_WIDE_INT offset = stack_align_off_in;
  unsigned i, j;

  for (i = j = 0; i < MAX_REGS; ++i)
    {
      unsigned regno = REG_ORDER[i];

      if (regno == BP_REG && hfp)
	continue;
      if (SSE_REGNO_P (regno))
	{
	  offset += 16;
	  /* Verify that SSE regs are always aligned.  */
	  gcc_assert (!((stack_align_off_in + offset) & 15));
	}
      else
	offset += 8;

      m_regs[j].regno = regno;
      m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
    }
  gcc_assert (j == m_nregs);
}
const char *
xlogue_layout::get_stub_name (enum xlogue_stub stub,
			      unsigned n_extra_regs)
{
  const int have_avx = TARGET_AVX;
  char *name = s_stub_names[!!have_avx][stub][n_extra_regs];

  /* Lazy init */
  if (!*name)
    {
      int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
			  (have_avx ? "avx" : "sse"),
			  STUB_BASE_NAMES[stub],
			  MIN_REGS + n_extra_regs);
      gcc_checking_assert (res < (int) STUB_NAME_MAX_LEN);
    }

  return name;
}
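
/* For example, with !TARGET_AVX and no extra registers this yields
   "__sse_savms64_12" for XLOGUE_STUB_SAVE, while TARGET_AVX with the
   maximum of six extra registers yields names such as
   "__avx_resms64x_18" (MIN_REGS is 12, so counts range from 12 to 18).  */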
/* Return rtx of a symbol ref for the entry point (based upon
   cfun->machine->call_ms2sysv_extra_regs) of the specified stub.  */
rtx
xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
{
  const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
  gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
  gcc_assert (stub < XLOGUE_STUB_COUNT);
  gcc_assert (crtl->stack_realign_finalized);

  return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
}
/* Define the structure for the machine field in struct function.  */

struct GTY(()) stack_local_entry {
  unsigned short mode;
  unsigned short n;
  rtx rtl;
  struct stack_local_entry *next;
};
685 /* Which cpu are we scheduling for. */
686 enum attr_cpu ix86_schedule;
688 /* Which cpu are we optimizing for. */
689 enum processor_type ix86_tune;
691 /* Which instruction set architecture to use. */
692 enum processor_type ix86_arch;
694 /* True if processor has SSE prefetch instruction. */
695 unsigned char x86_prefetch_sse;
697 /* -mstackrealign option */
698 static const char ix86_force_align_arg_pointer_string[]
699 = "force_align_arg_pointer";
701 static rtx (*ix86_gen_leave) (void);
702 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
703 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
704 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
705 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
706 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
707 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
708 static rtx (*ix86_gen_clzero) (rtx);
709 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
711 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
713 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
714 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
716 /* Preferred alignment for stack boundary in bits. */
717 unsigned int ix86_preferred_stack_boundary;
719 /* Alignment for incoming stack boundary in bits specified at
720 command line. */
721 static unsigned int ix86_user_incoming_stack_boundary;
723 /* Default alignment for incoming stack boundary in bits. */
724 static unsigned int ix86_default_incoming_stack_boundary;
726 /* Alignment for incoming stack boundary in bits. */
727 unsigned int ix86_incoming_stack_boundary;
729 /* Calling abi specific va_list type nodes. */
730 static GTY(()) tree sysv_va_list_type_node;
731 static GTY(()) tree ms_va_list_type_node;
733 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
734 char internal_label_prefix[16];
735 int internal_label_prefix_len;
737 /* Fence to use after loop using movnt. */
738 tree x86_mfence;
/* Register class used for passing given 64bit part of the argument.
   These represent classes as documented by the PS ABI, with the exception
   of SSESF, SSEDF classes, that are basically SSE class, just gcc will
   use SF or DFmode move instead of DImode to avoid reformatting penalties.

   Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
   whenever possible (upper half does contain padding).  */
enum x86_64_reg_class
  {
    X86_64_NO_CLASS,
    X86_64_INTEGER_CLASS,
    X86_64_INTEGERSI_CLASS,
    X86_64_SSE_CLASS,
    X86_64_SSESF_CLASS,
    X86_64_SSEDF_CLASS,
    X86_64_SSEUP_CLASS,
    X86_64_X87_CLASS,
    X86_64_X87UP_CLASS,
    X86_64_COMPLEX_X87_CLASS,
    X86_64_MEMORY_CLASS
  };

#define MAX_CLASSES 8
764 /* Table of constants used by fldpi, fldln2, etc.... */
765 static REAL_VALUE_TYPE ext_80387_constants_table [5];
766 static bool ext_80387_constants_init;
769 static struct machine_function * ix86_init_machine_status (void);
770 static rtx ix86_function_value (const_tree, const_tree, bool);
771 static bool ix86_function_value_regno_p (const unsigned int);
772 static unsigned int ix86_function_arg_boundary (machine_mode,
773 const_tree);
774 static rtx ix86_static_chain (const_tree, bool);
775 static int ix86_function_regparm (const_tree, const_tree);
776 static void ix86_compute_frame_layout (void);
777 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
778 rtx, rtx, int);
779 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
780 static tree ix86_canonical_va_list_type (tree);
781 static void predict_jump (int);
782 static unsigned int split_stack_prologue_scratch_regno (void);
783 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
785 enum ix86_function_specific_strings
787 IX86_FUNCTION_SPECIFIC_ARCH,
788 IX86_FUNCTION_SPECIFIC_TUNE,
789 IX86_FUNCTION_SPECIFIC_MAX
792 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
793 const char *, const char *, enum fpmath_unit,
794 bool);
795 static void ix86_function_specific_save (struct cl_target_option *,
796 struct gcc_options *opts);
797 static void ix86_function_specific_restore (struct gcc_options *opts,
798 struct cl_target_option *);
799 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
800 static void ix86_function_specific_print (FILE *, int,
801 struct cl_target_option *);
802 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
803 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
804 struct gcc_options *,
805 struct gcc_options *,
806 struct gcc_options *);
807 static bool ix86_can_inline_p (tree, tree);
808 static void ix86_set_current_function (tree);
809 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
811 static enum calling_abi ix86_function_abi (const_tree);
814 #ifndef SUBTARGET32_DEFAULT_CPU
815 #define SUBTARGET32_DEFAULT_CPU "i386"
816 #endif
818 /* Whether -mtune= or -march= were specified */
819 static int ix86_tune_defaulted;
820 static int ix86_arch_specified;
822 /* Vectorization library interface and handlers. */
823 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
825 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
826 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
828 /* Processor target table, indexed by processor number */
829 struct ptt
831 const char *const name; /* processor name */
832 const struct processor_costs *cost; /* Processor costs */
833 const int align_loop; /* Default alignments. */
834 const int align_loop_max_skip;
835 const int align_jump;
836 const int align_jump_max_skip;
837 const int align_func;
840 /* This table must be in sync with enum processor_type in i386.h. */
841 static const struct ptt processor_target_table[PROCESSOR_max] =
843 {"generic", &generic_cost, 16, 10, 16, 10, 16},
844 {"i386", &i386_cost, 4, 3, 4, 3, 4},
845 {"i486", &i486_cost, 16, 15, 16, 15, 16},
846 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
847 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
848 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
849 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
850 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
851 {"core2", &core_cost, 16, 10, 16, 10, 16},
852 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
853 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
854 {"haswell", &core_cost, 16, 10, 16, 10, 16},
855 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
856 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
857 {"knl", &slm_cost, 16, 15, 16, 7, 16},
858 {"knm", &slm_cost, 16, 15, 16, 7, 16},
859 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
860 {"cannonlake", &core_cost, 16, 10, 16, 10, 16},
861 {"intel", &intel_cost, 16, 15, 16, 7, 16},
862 {"geode", &geode_cost, 0, 0, 0, 0, 0},
863 {"k6", &k6_cost, 32, 7, 32, 7, 32},
864 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
865 {"k8", &k8_cost, 16, 7, 16, 7, 16},
866 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
867 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
868 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
869 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
870 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
871 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
872 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
873 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
876 static unsigned int
877 rest_of_handle_insert_vzeroupper (void)
879 int i;
881 /* vzeroupper instructions are inserted immediately after reload to
882 account for possible spills from 256bit or 512bit registers. The pass
883 reuses mode switching infrastructure by re-running mode insertion
884 pass, so disable entities that have already been processed. */
885 for (i = 0; i < MAX_386_ENTITIES; i++)
886 ix86_optimize_mode_switching[i] = 0;
888 ix86_optimize_mode_switching[AVX_U128] = 1;
890 /* Call optimize_mode_switching. */
891 g->get_passes ()->execute_pass_mode_switching ();
892 return 0;
895 /* Return 1 if INSN uses or defines a hard register.
896 Hard register uses in a memory address are ignored.
897 Clobbers and flags definitions are ignored. */
899 static bool
900 has_non_address_hard_reg (rtx_insn *insn)
902 df_ref ref;
903 FOR_EACH_INSN_DEF (ref, insn)
904 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
905 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
906 && DF_REF_REGNO (ref) != FLAGS_REG)
907 return true;
909 FOR_EACH_INSN_USE (ref, insn)
910 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
911 return true;
913 return false;
916 /* Check if comparison INSN may be transformed
917 into vector comparison. Currently we transform
918 zero checks only which look like:
920 (set (reg:CCZ 17 flags)
921 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
922 (subreg:SI (reg:DI x) 0))
923 (const_int 0 [0]))) */
925 static bool
926 convertible_comparison_p (rtx_insn *insn)
928 if (!TARGET_SSE4_1)
929 return false;
931 rtx def_set = single_set (insn);
933 gcc_assert (def_set);
935 rtx src = SET_SRC (def_set);
936 rtx dst = SET_DEST (def_set);
938 gcc_assert (GET_CODE (src) == COMPARE);
940 if (GET_CODE (dst) != REG
941 || REGNO (dst) != FLAGS_REG
942 || GET_MODE (dst) != CCZmode)
943 return false;
945 rtx op1 = XEXP (src, 0);
946 rtx op2 = XEXP (src, 1);
948 if (op2 != CONST0_RTX (GET_MODE (op2)))
949 return false;
951 if (GET_CODE (op1) != IOR)
952 return false;
954 op2 = XEXP (op1, 1);
955 op1 = XEXP (op1, 0);
957 if (!SUBREG_P (op1)
958 || !SUBREG_P (op2)
959 || GET_MODE (op1) != SImode
960 || GET_MODE (op2) != SImode
961 || ((SUBREG_BYTE (op1) != 0
962 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
963 && (SUBREG_BYTE (op2) != 0
964 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
965 return false;
967 op1 = SUBREG_REG (op1);
968 op2 = SUBREG_REG (op2);
970 if (op1 != op2
971 || !REG_P (op1)
972 || GET_MODE (op1) != DImode)
973 return false;
975 return true;
978 /* The DImode version of scalar_to_vector_candidate_p. */
980 static bool
981 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
983 rtx def_set = single_set (insn);
985 if (!def_set)
986 return false;
988 if (has_non_address_hard_reg (insn))
989 return false;
991 rtx src = SET_SRC (def_set);
992 rtx dst = SET_DEST (def_set);
994 if (GET_CODE (src) == COMPARE)
995 return convertible_comparison_p (insn);
997 /* We are interested in DImode promotion only. */
998 if ((GET_MODE (src) != DImode
999 && !CONST_INT_P (src))
1000 || GET_MODE (dst) != DImode)
1001 return false;
1003 if (!REG_P (dst) && !MEM_P (dst))
1004 return false;
1006 switch (GET_CODE (src))
1008 case ASHIFTRT:
1009 if (!TARGET_AVX512VL)
1010 return false;
1011 /* FALLTHRU */
1013 case ASHIFT:
1014 case LSHIFTRT:
1015 if (!REG_P (XEXP (src, 1))
1016 && (!SUBREG_P (XEXP (src, 1))
1017 || SUBREG_BYTE (XEXP (src, 1)) != 0
1018 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1019 && (!CONST_INT_P (XEXP (src, 1))
1020 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1021 return false;
1023 if (GET_MODE (XEXP (src, 1)) != QImode
1024 && !CONST_INT_P (XEXP (src, 1)))
1025 return false;
1026 break;
1028 case PLUS:
1029 case MINUS:
1030 case IOR:
1031 case XOR:
1032 case AND:
1033 if (!REG_P (XEXP (src, 1))
1034 && !MEM_P (XEXP (src, 1))
1035 && !CONST_INT_P (XEXP (src, 1)))
1036 return false;
1038 if (GET_MODE (XEXP (src, 1)) != DImode
1039 && !CONST_INT_P (XEXP (src, 1)))
1040 return false;
1041 break;
1043 case NEG:
1044 case NOT:
1045 break;
1047 case REG:
1048 return true;
1050 case MEM:
1051 case CONST_INT:
1052 return REG_P (dst);
1054 default:
1055 return false;
1058 if (!REG_P (XEXP (src, 0))
1059 && !MEM_P (XEXP (src, 0))
1060 && !CONST_INT_P (XEXP (src, 0))
1061 /* Check for andnot case. */
1062 && (GET_CODE (src) != AND
1063 || GET_CODE (XEXP (src, 0)) != NOT
1064 || !REG_P (XEXP (XEXP (src, 0), 0))))
1065 return false;
1067 if (GET_MODE (XEXP (src, 0)) != DImode
1068 && !CONST_INT_P (XEXP (src, 0)))
1069 return false;
1071 return true;
1074 /* The TImode version of scalar_to_vector_candidate_p. */
1076 static bool
1077 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1079 rtx def_set = single_set (insn);
1081 if (!def_set)
1082 return false;
1084 if (has_non_address_hard_reg (insn))
1085 return false;
1087 rtx src = SET_SRC (def_set);
1088 rtx dst = SET_DEST (def_set);
1090 /* Only TImode load and store are allowed. */
1091 if (GET_MODE (dst) != TImode)
1092 return false;
1094 if (MEM_P (dst))
1096 /* Check for store. Memory must be aligned or unaligned store
1097 is optimal. Only support store from register, standard SSE
1098 constant or CONST_WIDE_INT generated from piecewise store.
1100 ??? Verify performance impact before enabling CONST_INT for
1101 __int128 store. */
1102 if (misaligned_operand (dst, TImode)
1103 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1104 return false;
1106 switch (GET_CODE (src))
1108 default:
1109 return false;
1111 case REG:
1112 case CONST_WIDE_INT:
1113 return true;
1115 case CONST_INT:
1116 return standard_sse_constant_p (src, TImode);
1119 else if (MEM_P (src))
1121 /* Check for load. Memory must be aligned or unaligned load is
1122 optimal. */
1123 return (REG_P (dst)
1124 && (!misaligned_operand (src, TImode)
1125 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1128 return false;
1131 /* Return 1 if INSN may be converted into vector
1132 instruction. */
1134 static bool
1135 scalar_to_vector_candidate_p (rtx_insn *insn)
1137 if (TARGET_64BIT)
1138 return timode_scalar_to_vector_candidate_p (insn);
1139 else
1140 return dimode_scalar_to_vector_candidate_p (insn);
1143 /* The DImode version of remove_non_convertible_regs. */
1145 static void
1146 dimode_remove_non_convertible_regs (bitmap candidates)
1148 bitmap_iterator bi;
1149 unsigned id;
1150 bitmap regs = BITMAP_ALLOC (NULL);
1152 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1154 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1155 rtx reg = SET_DEST (def_set);
1157 if (!REG_P (reg)
1158 || bitmap_bit_p (regs, REGNO (reg))
1159 || HARD_REGISTER_P (reg))
1160 continue;
1162 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1163 def;
1164 def = DF_REF_NEXT_REG (def))
1166 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1168 if (dump_file)
1169 fprintf (dump_file,
1170 "r%d has non convertible definition in insn %d\n",
1171 REGNO (reg), DF_REF_INSN_UID (def));
1173 bitmap_set_bit (regs, REGNO (reg));
1174 break;
1179 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1181 for (df_ref def = DF_REG_DEF_CHAIN (id);
1182 def;
1183 def = DF_REF_NEXT_REG (def))
1184 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1186 if (dump_file)
1187 fprintf (dump_file, "Removing insn %d from candidates list\n",
1188 DF_REF_INSN_UID (def));
1190 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1194 BITMAP_FREE (regs);
1197 /* For a register REGNO, scan instructions for its defs and uses.
1198 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1200 static void
1201 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1202 unsigned int regno)
1204 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1205 def;
1206 def = DF_REF_NEXT_REG (def))
1208 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1210 if (dump_file)
1211 fprintf (dump_file,
1212 "r%d has non convertible def in insn %d\n",
1213 regno, DF_REF_INSN_UID (def));
1215 bitmap_set_bit (regs, regno);
1216 break;
1220 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1221 ref;
1222 ref = DF_REF_NEXT_REG (ref))
1224 /* Debug instructions are skipped. */
1225 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1226 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1228 if (dump_file)
1229 fprintf (dump_file,
1230 "r%d has non convertible use in insn %d\n",
1231 regno, DF_REF_INSN_UID (ref));
1233 bitmap_set_bit (regs, regno);
1234 break;
1239 /* The TImode version of remove_non_convertible_regs. */
1241 static void
1242 timode_remove_non_convertible_regs (bitmap candidates)
1244 bitmap_iterator bi;
1245 unsigned id;
1246 bitmap regs = BITMAP_ALLOC (NULL);
1248 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1250 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1251 rtx dest = SET_DEST (def_set);
1252 rtx src = SET_SRC (def_set);
1254 if ((!REG_P (dest)
1255 || bitmap_bit_p (regs, REGNO (dest))
1256 || HARD_REGISTER_P (dest))
1257 && (!REG_P (src)
1258 || bitmap_bit_p (regs, REGNO (src))
1259 || HARD_REGISTER_P (src)))
1260 continue;
1262 if (REG_P (dest))
1263 timode_check_non_convertible_regs (candidates, regs,
1264 REGNO (dest));
1266 if (REG_P (src))
1267 timode_check_non_convertible_regs (candidates, regs,
1268 REGNO (src));
1271 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1273 for (df_ref def = DF_REG_DEF_CHAIN (id);
1274 def;
1275 def = DF_REF_NEXT_REG (def))
1276 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1278 if (dump_file)
1279 fprintf (dump_file, "Removing insn %d from candidates list\n",
1280 DF_REF_INSN_UID (def));
1282 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1285 for (df_ref ref = DF_REG_USE_CHAIN (id);
1286 ref;
1287 ref = DF_REF_NEXT_REG (ref))
1288 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1290 if (dump_file)
1291 fprintf (dump_file, "Removing insn %d from candidates list\n",
1292 DF_REF_INSN_UID (ref));
1294 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1298 BITMAP_FREE (regs);
/* For a given bitmap of insn UIDs, scan all instructions and
   remove an insn from CANDIDATES if it has both convertible
   and non-convertible definitions.

   All insns in a bitmap are conversion candidates according to
   scalar_to_vector_candidate_p.  Currently it implies all insns
   are single_set.  */
1309 static void
1310 remove_non_convertible_regs (bitmap candidates)
1312 if (TARGET_64BIT)
1313 timode_remove_non_convertible_regs (candidates);
1314 else
1315 dimode_remove_non_convertible_regs (candidates);
1318 class scalar_chain
1320 public:
1321 scalar_chain ();
1322 virtual ~scalar_chain ();
1324 static unsigned max_id;
1326 /* ID of a chain. */
1327 unsigned int chain_id;
1328 /* A queue of instructions to be included into a chain. */
1329 bitmap queue;
1330 /* Instructions included into a chain. */
1331 bitmap insns;
1332 /* All registers defined by a chain. */
1333 bitmap defs;
  /* Registers used in both vector and scalar modes.  */
1335 bitmap defs_conv;
1337 void build (bitmap candidates, unsigned insn_uid);
1338 virtual int compute_convert_gain () = 0;
1339 int convert ();
1341 protected:
1342 void add_to_queue (unsigned insn_uid);
1343 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1345 private:
1346 void add_insn (bitmap candidates, unsigned insn_uid);
1347 void analyze_register_chain (bitmap candidates, df_ref ref);
1348 virtual void mark_dual_mode_def (df_ref def) = 0;
1349 virtual void convert_insn (rtx_insn *insn) = 0;
1350 virtual void convert_registers () = 0;
1353 class dimode_scalar_chain : public scalar_chain
1355 public:
1356 int compute_convert_gain ();
1357 private:
1358 void mark_dual_mode_def (df_ref def);
1359 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1360 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1361 void convert_insn (rtx_insn *insn);
1362 void convert_op (rtx *op, rtx_insn *insn);
1363 void convert_reg (unsigned regno);
1364 void make_vector_copies (unsigned regno);
1365 void convert_registers ();
1366 int vector_const_cost (rtx exp);
1369 class timode_scalar_chain : public scalar_chain
1371 public:
  /* Converting from TImode to V1TImode is always faster.  */
1373 int compute_convert_gain () { return 1; }
1375 private:
1376 void mark_dual_mode_def (df_ref def);
1377 void fix_debug_reg_uses (rtx reg);
1378 void convert_insn (rtx_insn *insn);
  /* We don't convert registers to a different size.  */
1380 void convert_registers () {}
1383 unsigned scalar_chain::max_id = 0;
1385 /* Initialize new chain. */
1387 scalar_chain::scalar_chain ()
1389 chain_id = ++max_id;
1391 if (dump_file)
1392 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1394 bitmap_obstack_initialize (NULL);
1395 insns = BITMAP_ALLOC (NULL);
1396 defs = BITMAP_ALLOC (NULL);
1397 defs_conv = BITMAP_ALLOC (NULL);
1398 queue = NULL;
1401 /* Free chain's data. */
1403 scalar_chain::~scalar_chain ()
1405 BITMAP_FREE (insns);
1406 BITMAP_FREE (defs);
1407 BITMAP_FREE (defs_conv);
1408 bitmap_obstack_release (NULL);
1411 /* Add instruction into chains' queue. */
1413 void
1414 scalar_chain::add_to_queue (unsigned insn_uid)
1416 if (bitmap_bit_p (insns, insn_uid)
1417 || bitmap_bit_p (queue, insn_uid))
1418 return;
1420 if (dump_file)
1421 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1422 insn_uid, chain_id);
1423 bitmap_set_bit (queue, insn_uid);
1426 /* For DImode conversion, mark register defined by DEF as requiring
1427 conversion. */
1429 void
1430 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1432 gcc_assert (DF_REF_REG_DEF_P (def));
1434 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1435 return;
1437 if (dump_file)
1438 fprintf (dump_file,
1439 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1440 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1442 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1445 /* For TImode conversion, it is unused. */
1447 void
1448 timode_scalar_chain::mark_dual_mode_def (df_ref)
1450 gcc_unreachable ();
1453 /* Check REF's chain to add new insns into a queue
1454 and find registers requiring conversion. */
1456 void
1457 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1459 df_link *chain;
1461 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1462 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1463 add_to_queue (DF_REF_INSN_UID (ref));
1465 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1467 unsigned uid = DF_REF_INSN_UID (chain->ref);
1469 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1470 continue;
1472 if (!DF_REF_REG_MEM_P (chain->ref))
1474 if (bitmap_bit_p (insns, uid))
1475 continue;
1477 if (bitmap_bit_p (candidates, uid))
1479 add_to_queue (uid);
1480 continue;
1484 if (DF_REF_REG_DEF_P (chain->ref))
1486 if (dump_file)
1487 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1488 DF_REF_REGNO (chain->ref), uid);
1489 mark_dual_mode_def (chain->ref);
1491 else
1493 if (dump_file)
1494 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1495 DF_REF_REGNO (chain->ref), uid);
1496 mark_dual_mode_def (ref);
1501 /* Add instruction into a chain. */
1503 void
1504 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1506 if (bitmap_bit_p (insns, insn_uid))
1507 return;
1509 if (dump_file)
1510 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1512 bitmap_set_bit (insns, insn_uid);
1514 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1515 rtx def_set = single_set (insn);
1516 if (def_set && REG_P (SET_DEST (def_set))
1517 && !HARD_REGISTER_P (SET_DEST (def_set)))
1518 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1520 df_ref ref;
1521 df_ref def;
1522 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1523 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1524 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1525 def;
1526 def = DF_REF_NEXT_REG (def))
1527 analyze_register_chain (candidates, def);
1528 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1529 if (!DF_REF_REG_MEM_P (ref))
1530 analyze_register_chain (candidates, ref);
1533 /* Build new chain starting from insn INSN_UID recursively
1534 adding all dependent uses and definitions. */
1536 void
1537 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1539 queue = BITMAP_ALLOC (NULL);
1540 bitmap_set_bit (queue, insn_uid);
1542 if (dump_file)
1543 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1545 while (!bitmap_empty_p (queue))
1547 insn_uid = bitmap_first_set_bit (queue);
1548 bitmap_clear_bit (queue, insn_uid);
1549 bitmap_clear_bit (candidates, insn_uid);
1550 add_insn (candidates, insn_uid);
1553 if (dump_file)
1555 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1556 fprintf (dump_file, " insns: ");
1557 dump_bitmap (dump_file, insns);
1558 if (!bitmap_empty_p (defs_conv))
1560 bitmap_iterator bi;
1561 unsigned id;
1562 const char *comma = "";
1563 fprintf (dump_file, " defs to convert: ");
1564 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1566 fprintf (dump_file, "%sr%d", comma, id);
1567 comma = ", ";
1569 fprintf (dump_file, "\n");
1573 BITMAP_FREE (queue);
/* Return the cost of building a vector constant
   instead of using a scalar one.  */

int
dimode_scalar_chain::vector_const_cost (rtx exp)
1582 gcc_assert (CONST_INT_P (exp));
1584 if (standard_sse_constant_p (exp, V2DImode))
1585 return COSTS_N_INSNS (1);
1586 return ix86_cost->sse_load[1];
/* Compute a gain for chain conversion.  */

int
dimode_scalar_chain::compute_convert_gain ()
1594 bitmap_iterator bi;
1595 unsigned insn_uid;
1596 int gain = 0;
1597 int cost = 0;
1599 if (dump_file)
1600 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1602 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1604 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1605 rtx def_set = single_set (insn);
1606 rtx src = SET_SRC (def_set);
1607 rtx dst = SET_DEST (def_set);
1609 if (REG_P (src) && REG_P (dst))
1610 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1611 else if (REG_P (src) && MEM_P (dst))
1612 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1613 else if (MEM_P (src) && REG_P (dst))
1614 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1615 else if (GET_CODE (src) == ASHIFT
1616 || GET_CODE (src) == ASHIFTRT
1617 || GET_CODE (src) == LSHIFTRT)
1619 if (CONST_INT_P (XEXP (src, 0)))
1620 gain -= vector_const_cost (XEXP (src, 0));
1621 if (CONST_INT_P (XEXP (src, 1)))
1623 gain += ix86_cost->shift_const;
1624 if (INTVAL (XEXP (src, 1)) >= 32)
1625 gain -= COSTS_N_INSNS (1);
1627 else
1628 /* Additional gain for omitting two CMOVs. */
1629 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1631 else if (GET_CODE (src) == PLUS
1632 || GET_CODE (src) == MINUS
1633 || GET_CODE (src) == IOR
1634 || GET_CODE (src) == XOR
1635 || GET_CODE (src) == AND)
1637 gain += ix86_cost->add;
1638 /* Additional gain for andnot for targets without BMI. */
1639 if (GET_CODE (XEXP (src, 0)) == NOT
1640 && !TARGET_BMI)
1641 gain += 2 * ix86_cost->add;
1643 if (CONST_INT_P (XEXP (src, 0)))
1644 gain -= vector_const_cost (XEXP (src, 0));
1645 if (CONST_INT_P (XEXP (src, 1)))
1646 gain -= vector_const_cost (XEXP (src, 1));
1648 else if (GET_CODE (src) == NEG
1649 || GET_CODE (src) == NOT)
1650 gain += ix86_cost->add - COSTS_N_INSNS (1);
1651 else if (GET_CODE (src) == COMPARE)
1653 /* Assume comparison cost is the same. */
1655 else if (CONST_INT_P (src))
1657 if (REG_P (dst))
1658 gain += COSTS_N_INSNS (2);
1659 else if (MEM_P (dst))
1660 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1661 gain -= vector_const_cost (src);
1663 else
1664 gcc_unreachable ();
1667 if (dump_file)
1668 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1670 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1671 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1673 if (dump_file)
1674 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1676 gain -= cost;
1678 if (dump_file)
1679 fprintf (dump_file, " Total gain: %d\n", gain);
1681 return gain;
/* Replace REG in X with a V2DI subreg of NEW_REG.  */

rtx
dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
{
  if (x == reg)
    return gen_rtx_SUBREG (V2DImode, new_reg, 0);

  const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
  int i, j;
  for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
    {
      if (fmt[i] == 'e')
	XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
      else if (fmt[i] == 'E')
	for (j = XVECLEN (x, i) - 1; j >= 0; j--)
	  XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
						   reg, new_reg);
    }

  return x;
}
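
/* Illustration (pseudo register numbers are made up): with REG = (reg:DI 100)
   and NEW_REG = (reg:DI 110), every occurrence of (reg:DI 100) inside X is
   rewritten as (subreg:V2DI (reg:DI 110) 0).  */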
1707 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1709 void
1710 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1711 rtx reg, rtx new_reg)
1713 replace_with_subreg (single_set (insn), reg, new_reg);
/* Insert generated conversion instruction sequence INSNS
   after instruction AFTER.  A new BB may be required in case
   the instruction has an EH region attached.  */
1720 void
1721 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1723 if (!control_flow_insn_p (after))
1725 emit_insn_after (insns, after);
1726 return;
1729 basic_block bb = BLOCK_FOR_INSN (after);
1730 edge e = find_fallthru_edge (bb->succs);
1731 gcc_assert (e);
1733 basic_block new_bb = split_edge (e);
1734 emit_insn_after (insns, BB_HEAD (new_bb));
1737 /* Make vector copies for all register REGNO definitions
1738 and replace its uses in a chain. */
1740 void
1741 dimode_scalar_chain::make_vector_copies (unsigned regno)
1743 rtx reg = regno_reg_rtx[regno];
1744 rtx vreg = gen_reg_rtx (DImode);
1745 bool count_reg = false;
1746 df_ref ref;
1748 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1749 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1751 df_ref use;
1753 /* Detect the count register of a shift instruction. */
1754 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1755 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1757 rtx_insn *insn = DF_REF_INSN (use);
1758 rtx def_set = single_set (insn);
1760 gcc_assert (def_set);
1762 rtx src = SET_SRC (def_set);
1764 if ((GET_CODE (src) == ASHIFT
1765 || GET_CODE (src) == ASHIFTRT
1766 || GET_CODE (src) == LSHIFTRT)
1767 && !CONST_INT_P (XEXP (src, 1))
1768 && reg_or_subregno (XEXP (src, 1)) == regno)
1769 count_reg = true;
1772 start_sequence ();
1773 if (count_reg)
1775 rtx qreg = gen_lowpart (QImode, reg);
1776 rtx tmp = gen_reg_rtx (SImode);
1778 if (TARGET_ZERO_EXTEND_WITH_AND
1779 && optimize_function_for_speed_p (cfun))
1781 emit_move_insn (tmp, const0_rtx);
1782 emit_insn (gen_movstrictqi
1783 (gen_lowpart (QImode, tmp), qreg));
1785 else
1786 emit_insn (gen_rtx_SET
1787 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1789 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1791 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1792 emit_move_insn (slot, tmp);
1793 tmp = copy_rtx (slot);
1796 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1798 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1800 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1801 emit_move_insn (adjust_address (tmp, SImode, 0),
1802 gen_rtx_SUBREG (SImode, reg, 0));
1803 emit_move_insn (adjust_address (tmp, SImode, 4),
1804 gen_rtx_SUBREG (SImode, reg, 4));
1805 emit_move_insn (vreg, tmp);
1807 else if (TARGET_SSE4_1)
1809 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1810 CONST0_RTX (V4SImode),
1811 gen_rtx_SUBREG (SImode, reg, 0)));
1812 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1813 gen_rtx_SUBREG (V4SImode, vreg, 0),
1814 gen_rtx_SUBREG (SImode, reg, 4),
1815 GEN_INT (2)));
1817 else
1819 rtx tmp = gen_reg_rtx (DImode);
1820 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1821 CONST0_RTX (V4SImode),
1822 gen_rtx_SUBREG (SImode, reg, 0)));
1823 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1824 CONST0_RTX (V4SImode),
1825 gen_rtx_SUBREG (SImode, reg, 4)));
1826 emit_insn (gen_vec_interleave_lowv4si
1827 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1828 gen_rtx_SUBREG (V4SImode, vreg, 0),
1829 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1831 rtx_insn *seq = get_insns ();
1832 end_sequence ();
1833 rtx_insn *insn = DF_REF_INSN (ref);
1834 emit_conversion_insns (seq, insn);
1836 if (dump_file)
1837 fprintf (dump_file,
1838 " Copied r%d to a vector register r%d for insn %d\n",
1839 regno, REGNO (vreg), INSN_UID (insn));
1842 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1843 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1845 rtx_insn *insn = DF_REF_INSN (ref);
1846 if (count_reg)
1848 rtx def_set = single_set (insn);
1849 gcc_assert (def_set);
1851 rtx src = SET_SRC (def_set);
1853 if ((GET_CODE (src) == ASHIFT
1854 || GET_CODE (src) == ASHIFTRT
1855 || GET_CODE (src) == LSHIFTRT)
1856 && !CONST_INT_P (XEXP (src, 1))
1857 && reg_or_subregno (XEXP (src, 1)) == regno)
1858 XEXP (src, 1) = vreg;
1860 else
1861 replace_with_subreg_in_insn (insn, reg, vreg);
1863 if (dump_file)
1864 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1865 regno, REGNO (vreg), INSN_UID (insn));
/* Convert all definitions of register REGNO
   and fix its uses.  Scalar copies may be created
   in case the register is used in a non-convertible insn.  */
1873 void
1874 dimode_scalar_chain::convert_reg (unsigned regno)
1876 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1877 rtx reg = regno_reg_rtx[regno];
1878 rtx scopy = NULL_RTX;
1879 df_ref ref;
1880 bitmap conv;
1882 conv = BITMAP_ALLOC (NULL);
1883 bitmap_copy (conv, insns);
1885 if (scalar_copy)
1886 scopy = gen_reg_rtx (DImode);
1888 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1890 rtx_insn *insn = DF_REF_INSN (ref);
1891 rtx def_set = single_set (insn);
1892 rtx src = SET_SRC (def_set);
1893 rtx reg = DF_REF_REG (ref);
1895 if (!MEM_P (src))
1897 replace_with_subreg_in_insn (insn, reg, reg);
1898 bitmap_clear_bit (conv, INSN_UID (insn));
1901 if (scalar_copy)
1903 start_sequence ();
1904 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1906 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1907 emit_move_insn (tmp, reg);
1908 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1909 adjust_address (tmp, SImode, 0));
1910 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1911 adjust_address (tmp, SImode, 4));
1913 else if (TARGET_SSE4_1)
1915 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1916 emit_insn
1917 (gen_rtx_SET
1918 (gen_rtx_SUBREG (SImode, scopy, 0),
1919 gen_rtx_VEC_SELECT (SImode,
1920 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1922 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1923 emit_insn
1924 (gen_rtx_SET
1925 (gen_rtx_SUBREG (SImode, scopy, 4),
1926 gen_rtx_VEC_SELECT (SImode,
1927 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1929 else
1931 rtx vcopy = gen_reg_rtx (V2DImode);
1932 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1933 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1934 gen_rtx_SUBREG (SImode, vcopy, 0));
1935 emit_move_insn (vcopy,
1936 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1937 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1938 gen_rtx_SUBREG (SImode, vcopy, 0));
1940 rtx_insn *seq = get_insns ();
1941 end_sequence ();
1942 emit_conversion_insns (seq, insn);
1944 if (dump_file)
1945 fprintf (dump_file,
1946 " Copied r%d to a scalar register r%d for insn %d\n",
1947 regno, REGNO (scopy), INSN_UID (insn));
1951 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1952 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1954 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1956 rtx_insn *insn = DF_REF_INSN (ref);
1958 rtx def_set = single_set (insn);
1959 gcc_assert (def_set);
1961 rtx src = SET_SRC (def_set);
1962 rtx dst = SET_DEST (def_set);
1964 if ((GET_CODE (src) == ASHIFT
1965 || GET_CODE (src) == ASHIFTRT
1966 || GET_CODE (src) == LSHIFTRT)
1967 && !CONST_INT_P (XEXP (src, 1))
1968 && reg_or_subregno (XEXP (src, 1)) == regno)
1970 rtx tmp2 = gen_reg_rtx (V2DImode);
1972 start_sequence ();
1974 if (TARGET_SSE4_1)
1975 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1976 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1977 else
1979 rtx vec_cst
1980 = gen_rtx_CONST_VECTOR (V2DImode,
1981 gen_rtvec (2, GEN_INT (0xff),
1982 const0_rtx));
1983 vec_cst
1984 = validize_mem (force_const_mem (V2DImode, vec_cst));
1986 emit_insn (gen_rtx_SET
1987 (tmp2,
1988 gen_rtx_AND (V2DImode,
1989 gen_rtx_SUBREG (V2DImode, reg, 0),
1990 vec_cst)));
1992 rtx_insn *seq = get_insns ();
1993 end_sequence ();
1995 emit_insn_before (seq, insn);
1997 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
1999 else if (!MEM_P (dst) || !REG_P (src))
2000 replace_with_subreg_in_insn (insn, reg, reg);
2002 bitmap_clear_bit (conv, INSN_UID (insn));
2005 /* Skip debug insns and uninitialized uses. */
2006 else if (DF_REF_CHAIN (ref)
2007 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2009 gcc_assert (scopy);
2010 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2011 df_insn_rescan (DF_REF_INSN (ref));
2014 BITMAP_FREE (conv);
2017 /* Convert operand OP in INSN. We should handle
2018 memory operands and uninitialized registers.
2019 All other register uses are converted during
2020 register conversion. */
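/* A minimal sketch of the two interesting cases, with made-up register
   numbers: a DImode memory operand such as (mem:DI (reg:SI 90)) is first
   loaded into a fresh DImode register and then referenced as
   (subreg:V2DI (reg:DI tmp) 0); a (const_int 42) becomes the V2DImode
   constant vector { 42, 0 } (or an all-ones vector for -1), again
   accessed through a V2DImode subreg of a DImode register. */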
2022 void
2023 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2025 *op = copy_rtx_if_shared (*op);
2027 if (GET_CODE (*op) == NOT)
2029 convert_op (&XEXP (*op, 0), insn);
2030 PUT_MODE (*op, V2DImode);
2032 else if (MEM_P (*op))
2034 rtx tmp = gen_reg_rtx (DImode);
2036 emit_insn_before (gen_move_insn (tmp, *op), insn);
2037 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2039 if (dump_file)
2040 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2041 INSN_UID (insn), REGNO (tmp));
2043 else if (REG_P (*op))
2045 /* We may not have converted the register use in case
2046 this register has no definition. Otherwise it
2047 should have been converted in convert_reg. */
2048 df_ref ref;
2049 FOR_EACH_INSN_USE (ref, insn)
2050 if (DF_REF_REGNO (ref) == REGNO (*op))
2052 gcc_assert (!DF_REF_CHAIN (ref));
2053 break;
2055 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2057 else if (CONST_INT_P (*op))
2059 rtx vec_cst;
2060 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2062 /* Prefer an all-ones vector in case of -1. */
2063 if (constm1_operand (*op, GET_MODE (*op)))
2064 vec_cst = CONSTM1_RTX (V2DImode);
2065 else
2066 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2067 gen_rtvec (2, *op, const0_rtx));
2069 if (!standard_sse_constant_p (vec_cst, V2DImode))
2071 start_sequence ();
2072 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2073 rtx_insn *seq = get_insns ();
2074 end_sequence ();
2075 emit_insn_before (seq, insn);
2078 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2079 *op = tmp;
2081 else
2083 gcc_assert (SUBREG_P (*op));
2084 gcc_assert (GET_MODE (*op) == V2DImode);
2088 /* Convert INSN to vector mode. */
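/* A minimal sketch, with made-up register numbers: a chain member such as

     (set (reg:DI 100) (plus:DI (reg:DI 101) (reg:DI 102)))

   ends up, once its registers and operands have been converted, as

     (set (subreg:V2DI (reg:DI 100) 0)
          (plus:V2DI (subreg:V2DI (reg:DI 101) 0)
                     (subreg:V2DI (reg:DI 102) 0)))

   where only the low 64 bits of each vector value are meaningful. */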
2090 void
2091 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2093 rtx def_set = single_set (insn);
2094 rtx src = SET_SRC (def_set);
2095 rtx dst = SET_DEST (def_set);
2096 rtx subreg;
2098 if (MEM_P (dst) && !REG_P (src))
2100 /* There are no scalar integer instructions and therefore
2101 temporary register usage is required. */
2102 rtx tmp = gen_reg_rtx (DImode);
2103 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2104 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2107 switch (GET_CODE (src))
2109 case ASHIFT:
2110 case ASHIFTRT:
2111 case LSHIFTRT:
2112 convert_op (&XEXP (src, 0), insn);
2113 PUT_MODE (src, V2DImode);
2114 break;
2116 case PLUS:
2117 case MINUS:
2118 case IOR:
2119 case XOR:
2120 case AND:
2121 convert_op (&XEXP (src, 0), insn);
2122 convert_op (&XEXP (src, 1), insn);
2123 PUT_MODE (src, V2DImode);
2124 break;
2126 case NEG:
2127 src = XEXP (src, 0);
2128 convert_op (&src, insn);
2129 subreg = gen_reg_rtx (V2DImode);
2130 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2131 src = gen_rtx_MINUS (V2DImode, subreg, src);
2132 break;
2134 case NOT:
2135 src = XEXP (src, 0);
2136 convert_op (&src, insn);
2137 subreg = gen_reg_rtx (V2DImode);
2138 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2139 src = gen_rtx_XOR (V2DImode, src, subreg);
2140 break;
2142 case MEM:
2143 if (!REG_P (dst))
2144 convert_op (&src, insn);
2145 break;
2147 case REG:
2148 if (!MEM_P (dst))
2149 convert_op (&src, insn);
2150 break;
2152 case SUBREG:
2153 gcc_assert (GET_MODE (src) == V2DImode);
2154 break;
2156 case COMPARE:
2157 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2159 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2160 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2162 if (REG_P (src))
2163 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2164 else
2165 subreg = copy_rtx_if_shared (src);
2166 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2167 copy_rtx_if_shared (subreg),
2168 copy_rtx_if_shared (subreg)),
2169 insn);
2170 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2171 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2172 copy_rtx_if_shared (src)),
2173 UNSPEC_PTEST);
2174 break;
2176 case CONST_INT:
2177 convert_op (&src, insn);
2178 break;
2180 default:
2181 gcc_unreachable ();
2184 SET_SRC (def_set) = src;
2185 SET_DEST (def_set) = dst;
2187 /* Drop possible dead definitions. */
2188 PATTERN (insn) = def_set;
2190 INSN_CODE (insn) = -1;
2191 recog_memoized (insn);
2192 df_insn_rescan (insn);
2195 /* Fix uses of converted REG in debug insns. */
2197 void
2198 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2200 if (!flag_var_tracking)
2201 return;
2203 df_ref ref, next;
2204 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2206 rtx_insn *insn = DF_REF_INSN (ref);
2207 /* Make sure the next ref is for a different instruction,
2208 so that we're not affected by the rescan. */
2209 next = DF_REF_NEXT_REG (ref);
2210 while (next && DF_REF_INSN (next) == insn)
2211 next = DF_REF_NEXT_REG (next);
2213 if (DEBUG_INSN_P (insn))
2215 /* It may be a debug insn with a TImode variable in
2216 a register. */
2217 bool changed = false;
2218 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2220 rtx *loc = DF_REF_LOC (ref);
2221 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2223 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2224 changed = true;
2227 if (changed)
2228 df_insn_rescan (insn);
2233 /* Convert INSN from TImode to V1TImode. */
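/* For example (illustrative only), a TImode load such as

     (set (reg:TI 100) (mem:TI (reg:DI 101)))

   is simply re-moded in place to

     (set (reg:V1TI 100) (mem:V1TI (reg:DI 101)))

   so that it can be carried out as a single SSE move. */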
2235 void
2236 timode_scalar_chain::convert_insn (rtx_insn *insn)
2238 rtx def_set = single_set (insn);
2239 rtx src = SET_SRC (def_set);
2240 rtx dst = SET_DEST (def_set);
2242 switch (GET_CODE (dst))
2244 case REG:
2246 rtx tmp = find_reg_equal_equiv_note (insn);
2247 if (tmp)
2248 PUT_MODE (XEXP (tmp, 0), V1TImode);
2249 PUT_MODE (dst, V1TImode);
2250 fix_debug_reg_uses (dst);
2252 break;
2253 case MEM:
2254 PUT_MODE (dst, V1TImode);
2255 break;
2257 default:
2258 gcc_unreachable ();
2261 switch (GET_CODE (src))
2263 case REG:
2264 PUT_MODE (src, V1TImode);
2265 /* Call fix_debug_reg_uses only if SRC is never defined. */
2266 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2267 fix_debug_reg_uses (src);
2268 break;
2270 case MEM:
2271 PUT_MODE (src, V1TImode);
2272 break;
2274 case CONST_WIDE_INT:
2275 if (NONDEBUG_INSN_P (insn))
2277 /* Since there are no instructions to store a 128-bit constant,
2278 temporary register usage is required. */
2279 rtx tmp = gen_reg_rtx (V1TImode);
2280 start_sequence ();
2281 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2282 src = validize_mem (force_const_mem (V1TImode, src));
2283 rtx_insn *seq = get_insns ();
2284 end_sequence ();
2285 if (seq)
2286 emit_insn_before (seq, insn);
2287 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2288 dst = tmp;
2290 break;
2292 case CONST_INT:
2293 switch (standard_sse_constant_p (src, TImode))
2295 case 1:
2296 src = CONST0_RTX (GET_MODE (dst));
2297 break;
2298 case 2:
2299 src = CONSTM1_RTX (GET_MODE (dst));
2300 break;
2301 default:
2302 gcc_unreachable ();
2304 if (NONDEBUG_INSN_P (insn))
2306 rtx tmp = gen_reg_rtx (V1TImode);
2307 /* Since there are no instructions to store a standard SSE
2308 constant, temporary register usage is required. */
2309 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2310 dst = tmp;
2312 break;
2314 default:
2315 gcc_unreachable ();
2318 SET_SRC (def_set) = src;
2319 SET_DEST (def_set) = dst;
2321 /* Drop possible dead definitions. */
2322 PATTERN (insn) = def_set;
2324 INSN_CODE (insn) = -1;
2325 recog_memoized (insn);
2326 df_insn_rescan (insn);
2329 void
2330 dimode_scalar_chain::convert_registers ()
2332 bitmap_iterator bi;
2333 unsigned id;
2335 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2336 convert_reg (id);
2338 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2339 make_vector_copies (id);
2342 /* Convert the whole chain, creating required register
2343 conversions and copies. Return the number of converted insns. */
2345 int
2346 scalar_chain::convert ()
2348 bitmap_iterator bi;
2349 unsigned id;
2350 int converted_insns = 0;
2352 if (!dbg_cnt (stv_conversion))
2353 return 0;
2355 if (dump_file)
2356 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2358 convert_registers ();
2360 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2362 convert_insn (DF_INSN_UID_GET (id)->insn);
2363 converted_insns++;
2366 return converted_insns;
2369 /* Main STV pass function. Find and convert scalar
2370 instructions into vector mode when profitable. */
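/* In outline, the pass proceeds in four steps: compute def-use chains,
   mark every insn that scalar_to_vector_candidate_p accepts, group the
   remaining candidates into chains with scalar_chain::build, and convert
   each chain whose estimated gain (compute_convert_gain) is positive. */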
2372 static unsigned int
2373 convert_scalars_to_vector ()
2375 basic_block bb;
2376 bitmap candidates;
2377 int converted_insns = 0;
2379 bitmap_obstack_initialize (NULL);
2380 candidates = BITMAP_ALLOC (NULL);
2382 calculate_dominance_info (CDI_DOMINATORS);
2383 df_set_flags (DF_DEFER_INSN_RESCAN);
2384 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2385 df_md_add_problem ();
2386 df_analyze ();
2388 /* Find all instructions we want to convert into vector mode. */
2389 if (dump_file)
2390 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2392 FOR_EACH_BB_FN (bb, cfun)
2394 rtx_insn *insn;
2395 FOR_BB_INSNS (bb, insn)
2396 if (scalar_to_vector_candidate_p (insn))
2398 if (dump_file)
2399 fprintf (dump_file, " insn %d is marked as a candidate\n",
2400 INSN_UID (insn));
2402 bitmap_set_bit (candidates, INSN_UID (insn));
2406 remove_non_convertible_regs (candidates);
2408 if (bitmap_empty_p (candidates))
2409 if (dump_file)
2410 fprintf (dump_file, "There are no candidates for optimization.\n");
2412 while (!bitmap_empty_p (candidates))
2414 unsigned uid = bitmap_first_set_bit (candidates);
2415 scalar_chain *chain;
2417 if (TARGET_64BIT)
2418 chain = new timode_scalar_chain;
2419 else
2420 chain = new dimode_scalar_chain;
2422 /* Find the instruction chain we want to convert to vector mode.
2423 Check all uses and definitions to estimate all required
2424 conversions. */
2425 chain->build (candidates, uid);
2427 if (chain->compute_convert_gain () > 0)
2428 converted_insns += chain->convert ();
2429 else
2430 if (dump_file)
2431 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2432 chain->chain_id);
2434 delete chain;
2437 if (dump_file)
2438 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2440 BITMAP_FREE (candidates);
2441 bitmap_obstack_release (NULL);
2442 df_process_deferred_rescans ();
2444 /* Conversion means we may have 128-bit register spills/fills
2445 which require an aligned stack. */
2446 if (converted_insns)
2448 if (crtl->stack_alignment_needed < 128)
2449 crtl->stack_alignment_needed = 128;
2450 if (crtl->stack_alignment_estimated < 128)
2451 crtl->stack_alignment_estimated = 128;
2452 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2453 if (TARGET_64BIT)
2454 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2455 parm; parm = DECL_CHAIN (parm))
2457 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2458 continue;
2459 if (DECL_RTL_SET_P (parm)
2460 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2462 rtx r = DECL_RTL (parm);
2463 if (REG_P (r))
2464 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2466 if (DECL_INCOMING_RTL (parm)
2467 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2469 rtx r = DECL_INCOMING_RTL (parm);
2470 if (REG_P (r))
2471 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2476 return 0;
2479 namespace {
2481 const pass_data pass_data_insert_vzeroupper =
2483 RTL_PASS, /* type */
2484 "vzeroupper", /* name */
2485 OPTGROUP_NONE, /* optinfo_flags */
2486 TV_MACH_DEP, /* tv_id */
2487 0, /* properties_required */
2488 0, /* properties_provided */
2489 0, /* properties_destroyed */
2490 0, /* todo_flags_start */
2491 TODO_df_finish, /* todo_flags_finish */
2494 class pass_insert_vzeroupper : public rtl_opt_pass
2496 public:
2497 pass_insert_vzeroupper(gcc::context *ctxt)
2498 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2501 /* opt_pass methods: */
2502 virtual bool gate (function *)
2504 return TARGET_AVX
2505 && TARGET_VZEROUPPER && flag_expensive_optimizations
2506 && !optimize_size;
2509 virtual unsigned int execute (function *)
2511 return rest_of_handle_insert_vzeroupper ();
2514 }; // class pass_insert_vzeroupper
2516 const pass_data pass_data_stv =
2518 RTL_PASS, /* type */
2519 "stv", /* name */
2520 OPTGROUP_NONE, /* optinfo_flags */
2521 TV_MACH_DEP, /* tv_id */
2522 0, /* properties_required */
2523 0, /* properties_provided */
2524 0, /* properties_destroyed */
2525 0, /* todo_flags_start */
2526 TODO_df_finish, /* todo_flags_finish */
2529 class pass_stv : public rtl_opt_pass
2531 public:
2532 pass_stv (gcc::context *ctxt)
2533 : rtl_opt_pass (pass_data_stv, ctxt),
2534 timode_p (false)
2537 /* opt_pass methods: */
2538 virtual bool gate (function *)
2540 return (timode_p == !!TARGET_64BIT
2541 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2544 virtual unsigned int execute (function *)
2546 return convert_scalars_to_vector ();
2549 opt_pass *clone ()
2551 return new pass_stv (m_ctxt);
2554 void set_pass_param (unsigned int n, bool param)
2556 gcc_assert (n == 0);
2557 timode_p = param;
2560 private:
2561 bool timode_p;
2562 }; // class pass_stv
2564 } // anon namespace
2566 rtl_opt_pass *
2567 make_pass_insert_vzeroupper (gcc::context *ctxt)
2569 return new pass_insert_vzeroupper (ctxt);
2572 rtl_opt_pass *
2573 make_pass_stv (gcc::context *ctxt)
2575 return new pass_stv (ctxt);
2578 /* Inserting ENDBRANCH instructions. */
2580 static unsigned int
2581 rest_of_insert_endbranch (void)
2583 timevar_push (TV_MACH_DEP);
2585 rtx cet_eb;
2586 rtx_insn *insn;
2587 basic_block bb;
2589 /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is
2590 absent among the function attributes. Later an optimization will be
2591 introduced to analyze whether the address of a static function is
2592 taken. A static function whose address is not taken will get a
2593 nocf_check attribute. This will allow reducing the number of EBs. */
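/* In outline: insert ENDBR at the function entry (subject to the check
   above); after every call insn that carries a REG_SETJMP note; at each
   successor block of a switch-table jump when flag_cet_switch is set;
   and after labels whose address may escape (LABEL_PRESERVE_P) as well
   as NOTE_INSN_DELETED_LABEL notes. */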
2595 if (!lookup_attribute ("nocf_check",
2596 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2597 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2599 cet_eb = gen_nop_endbr ();
2601 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2602 insn = BB_HEAD (bb);
2603 emit_insn_before (cet_eb, insn);
2606 bb = 0;
2607 FOR_EACH_BB_FN (bb, cfun)
2609 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2610 insn = NEXT_INSN (insn))
2612 if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN)
2614 if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2615 continue;
2616 /* Generate ENDBRANCH after a CALL that can return more than
2617 once, i.e. setjmp-like functions. */
2619 /* Skip notes and debug insns that must be next to the
2620 call insn. ??? This might skip a lot more than
2621 that... ??? Skipping barriers and emitting code
2622 after them surely looks like a mistake; we probably
2623 won't ever hit it, for we'll hit BB_END first. */
2624 rtx_insn *next_insn = insn;
2625 while ((next_insn != BB_END (bb))
2626 && (DEBUG_INSN_P (NEXT_INSN (next_insn))
2627 || NOTE_P (NEXT_INSN (next_insn))
2628 || BARRIER_P (NEXT_INSN (next_insn))))
2629 next_insn = NEXT_INSN (next_insn);
2631 cet_eb = gen_nop_endbr ();
2632 emit_insn_after_setloc (cet_eb, next_insn, INSN_LOCATION (insn));
2633 continue;
2636 if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch)
2638 rtx target = JUMP_LABEL (insn);
2639 if (target == NULL_RTX || ANY_RETURN_P (target))
2640 continue;
2642 /* Check that the jump is a switch-table jump. */
2643 rtx_insn *label = as_a<rtx_insn *> (target);
2644 rtx_insn *table = next_insn (label);
2645 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2646 continue;
2648 /* For the indirect jump, find all the places it jumps to and insert
2649 ENDBRANCH there. This is done under a special flag that
2650 controls ENDBRANCH generation for switch statements. */
2651 edge_iterator ei;
2652 edge e;
2653 basic_block dest_blk;
2655 FOR_EACH_EDGE (e, ei, bb->succs)
2657 rtx_insn *insn;
2659 dest_blk = e->dest;
2660 insn = BB_HEAD (dest_blk);
2661 gcc_assert (LABEL_P (insn));
2662 cet_eb = gen_nop_endbr ();
2663 emit_insn_after (cet_eb, insn);
2665 continue;
2668 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2669 || (NOTE_P (insn)
2670 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2671 /* TODO. Check /s bit also. */
2673 cet_eb = gen_nop_endbr ();
2674 emit_insn_after (cet_eb, insn);
2675 continue;
2680 timevar_pop (TV_MACH_DEP);
2681 return 0;
2684 namespace {
2686 const pass_data pass_data_insert_endbranch =
2688 RTL_PASS, /* type. */
2689 "cet", /* name. */
2690 OPTGROUP_NONE, /* optinfo_flags. */
2691 TV_MACH_DEP, /* tv_id. */
2692 0, /* properties_required. */
2693 0, /* properties_provided. */
2694 0, /* properties_destroyed. */
2695 0, /* todo_flags_start. */
2696 0, /* todo_flags_finish. */
2699 class pass_insert_endbranch : public rtl_opt_pass
2701 public:
2702 pass_insert_endbranch (gcc::context *ctxt)
2703 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2706 /* opt_pass methods: */
2707 virtual bool gate (function *)
2709 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2712 virtual unsigned int execute (function *)
2714 return rest_of_insert_endbranch ();
2717 }; // class pass_insert_endbranch
2719 } // anon namespace
2721 rtl_opt_pass *
2722 make_pass_insert_endbranch (gcc::context *ctxt)
2724 return new pass_insert_endbranch (ctxt);
2727 /* Return true if a red-zone is in use. */
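/* The red zone is the 128-byte area below the stack pointer that leaf
   functions may use without adjusting the stack pointer; it is part of
   the 64-bit SysV ABI but not of the 64-bit MS ABI. */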
2729 bool
2730 ix86_using_red_zone (void)
2732 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2735 /* Return a string that documents the current -m options. The caller is
2736 responsible for freeing the string. */
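/* A typical result (illustrative only; the exact contents depend on the
   flags passed in) is a single string such as

     -march=haswell -mtune=haswell -m64 -mavx2 -mavx -msse4.2 -mfpmath=sse

   where, if ADD_NL_P is true, a backslash-newline separator is emitted
   whenever a line would grow past 70 characters. */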
2738 static char *
2739 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2740 int flags, int flags2,
2741 const char *arch, const char *tune,
2742 enum fpmath_unit fpmath, bool add_nl_p)
2744 struct ix86_target_opts
2746 const char *option; /* option string */
2747 HOST_WIDE_INT mask; /* isa mask options */
2750 /* This table is ordered so that options like -msse4.2 that imply other
2751 ISAs come first. The target string will be displayed in the same order. */
2752 static struct ix86_target_opts isa2_opts[] =
2754 { "-mcx16", OPTION_MASK_ISA_CX16 },
2755 { "-mmpx", OPTION_MASK_ISA_MPX },
2756 { "-mvaes", OPTION_MASK_ISA_VAES },
2757 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2758 { "-msgx", OPTION_MASK_ISA_SGX },
2759 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2760 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2761 { "-mibt", OPTION_MASK_ISA_IBT },
2762 { "-mhle", OPTION_MASK_ISA_HLE },
2763 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2764 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2765 { "-mmwaitx", OPTION_MASK_ISA_MWAITX }
2767 static struct ix86_target_opts isa_opts[] =
2769 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2770 { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
2771 { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ },
2772 { "-mgfni", OPTION_MASK_ISA_GFNI },
2773 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2774 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2775 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2776 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2777 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2778 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2779 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2780 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2781 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2782 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2783 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2784 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2785 { "-mfma", OPTION_MASK_ISA_FMA },
2786 { "-mxop", OPTION_MASK_ISA_XOP },
2787 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2788 { "-mf16c", OPTION_MASK_ISA_F16C },
2789 { "-mavx", OPTION_MASK_ISA_AVX },
2790 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2791 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2792 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2793 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2794 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2795 { "-msse3", OPTION_MASK_ISA_SSE3 },
2796 { "-maes", OPTION_MASK_ISA_AES },
2797 { "-msha", OPTION_MASK_ISA_SHA },
2798 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2799 { "-msse2", OPTION_MASK_ISA_SSE2 },
2800 { "-msse", OPTION_MASK_ISA_SSE },
2801 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2802 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2803 { "-mmmx", OPTION_MASK_ISA_MMX },
2804 { "-mrtm", OPTION_MASK_ISA_RTM },
2805 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2806 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2807 { "-madx", OPTION_MASK_ISA_ADX },
2808 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2809 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2810 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2811 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2812 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2813 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2814 { "-mabm", OPTION_MASK_ISA_ABM },
2815 { "-mbmi", OPTION_MASK_ISA_BMI },
2816 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2817 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2818 { "-mtbm", OPTION_MASK_ISA_TBM },
2819 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2820 { "-msahf", OPTION_MASK_ISA_SAHF },
2821 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2822 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2823 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2824 { "-mpku", OPTION_MASK_ISA_PKU },
2825 { "-mlwp", OPTION_MASK_ISA_LWP },
2826 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2827 { "-mclwb", OPTION_MASK_ISA_CLWB },
2828 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2831 /* Flag options. */
2832 static struct ix86_target_opts flag_opts[] =
2834 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2835 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2836 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2837 { "-m80387", MASK_80387 },
2838 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2839 { "-malign-double", MASK_ALIGN_DOUBLE },
2840 { "-mcld", MASK_CLD },
2841 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2842 { "-mieee-fp", MASK_IEEE_FP },
2843 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2844 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2845 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2846 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2847 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2848 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2849 { "-mno-red-zone", MASK_NO_RED_ZONE },
2850 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2851 { "-mrecip", MASK_RECIP },
2852 { "-mrtd", MASK_RTD },
2853 { "-msseregparm", MASK_SSEREGPARM },
2854 { "-mstack-arg-probe", MASK_STACK_PROBE },
2855 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2856 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2857 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2858 { "-mvzeroupper", MASK_VZEROUPPER },
2859 { "-mstv", MASK_STV },
2860 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2861 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2862 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2865 /* Additional flag options. */
2866 static struct ix86_target_opts flag2_opts[] =
2868 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2871 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2872 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2874 char isa_other[40];
2875 char isa2_other[40];
2876 char flags_other[40];
2877 char flags2_other[40];
2878 unsigned num = 0;
2879 unsigned i, j;
2880 char *ret;
2881 char *ptr;
2882 size_t len;
2883 size_t line_len;
2884 size_t sep_len;
2885 const char *abi;
2887 memset (opts, '\0', sizeof (opts));
2889 /* Add -march= option. */
2890 if (arch)
2892 opts[num][0] = "-march=";
2893 opts[num++][1] = arch;
2896 /* Add -mtune= option. */
2897 if (tune)
2899 opts[num][0] = "-mtune=";
2900 opts[num++][1] = tune;
2903 /* Add -m32/-m64/-mx32. */
2904 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2906 if ((isa & OPTION_MASK_ABI_64) != 0)
2907 abi = "-m64";
2908 else
2909 abi = "-mx32";
2910 isa &= ~ (OPTION_MASK_ISA_64BIT
2911 | OPTION_MASK_ABI_64
2912 | OPTION_MASK_ABI_X32);
2914 else
2915 abi = "-m32";
2916 opts[num++][0] = abi;
2918 /* Pick out the options in isa2 options. */
2919 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2921 if ((isa2 & isa2_opts[i].mask) != 0)
2923 opts[num++][0] = isa2_opts[i].option;
2924 isa2 &= ~ isa2_opts[i].mask;
2928 if (isa2 && add_nl_p)
2930 opts[num++][0] = isa2_other;
2931 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2934 /* Pick out the options in isa options. */
2935 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2937 if ((isa & isa_opts[i].mask) != 0)
2939 opts[num++][0] = isa_opts[i].option;
2940 isa &= ~ isa_opts[i].mask;
2944 if (isa && add_nl_p)
2946 opts[num++][0] = isa_other;
2947 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2950 /* Add flag options. */
2951 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2953 if ((flags & flag_opts[i].mask) != 0)
2955 opts[num++][0] = flag_opts[i].option;
2956 flags &= ~ flag_opts[i].mask;
2960 if (flags && add_nl_p)
2962 opts[num++][0] = flags_other;
2963 sprintf (flags_other, "(other flags: %#x)", flags);
2966 /* Add additional flag options. */
2967 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2969 if ((flags2 & flag2_opts[i].mask) != 0)
2971 opts[num++][0] = flag2_opts[i].option;
2972 flags2 &= ~ flag2_opts[i].mask;
2976 if (flags2 && add_nl_p)
2978 opts[num++][0] = flags2_other;
2979 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2982 /* Add -fpmath= option. */
2983 if (fpmath)
2985 opts[num][0] = "-mfpmath=";
2986 switch ((int) fpmath)
2988 case FPMATH_387:
2989 opts[num++][1] = "387";
2990 break;
2992 case FPMATH_SSE:
2993 opts[num++][1] = "sse";
2994 break;
2996 case FPMATH_387 | FPMATH_SSE:
2997 opts[num++][1] = "sse+387";
2998 break;
3000 default:
3001 gcc_unreachable ();
3005 /* Any options? */
3006 if (num == 0)
3007 return NULL;
3009 gcc_assert (num < ARRAY_SIZE (opts));
3011 /* Size the string. */
3012 len = 0;
3013 sep_len = (add_nl_p) ? 3 : 1;
3014 for (i = 0; i < num; i++)
3016 len += sep_len;
3017 for (j = 0; j < 2; j++)
3018 if (opts[i][j])
3019 len += strlen (opts[i][j]);
3022 /* Build the string. */
3023 ret = ptr = (char *) xmalloc (len);
3024 line_len = 0;
3026 for (i = 0; i < num; i++)
3028 size_t len2[2];
3030 for (j = 0; j < 2; j++)
3031 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3033 if (i != 0)
3035 *ptr++ = ' ';
3036 line_len++;
3038 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3040 *ptr++ = '\\';
3041 *ptr++ = '\n';
3042 line_len = 0;
3046 for (j = 0; j < 2; j++)
3047 if (opts[i][j])
3049 memcpy (ptr, opts[i][j], len2[j]);
3050 ptr += len2[j];
3051 line_len += len2[j];
3055 *ptr = '\0';
3056 gcc_assert (ret + len >= ptr);
3058 return ret;
3061 /* Return true if profiling code should be emitted before
3062 the prologue, false otherwise.
3063 Note: for x86 with "hotfix" this is sorried (not supported). */
3064 static bool
3065 ix86_profile_before_prologue (void)
3067 return flag_fentry != 0;
3070 /* Function that is callable from the debugger to print the current
3071 options. */
3072 void ATTRIBUTE_UNUSED
3073 ix86_debug_options (void)
3075 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3076 target_flags, ix86_target_flags,
3077 ix86_arch_string, ix86_tune_string,
3078 ix86_fpmath, true);
3080 if (opts)
3082 fprintf (stderr, "%s\n\n", opts);
3083 free (opts);
3085 else
3086 fputs ("<no options>\n\n", stderr);
3088 return;
3091 /* Return true if T is one of the bytes we should avoid with
3092 -fmitigate-rop. */
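/* These byte values are the near and far return opcodes (0xc3 ret,
   0xc2 ret imm16, 0xcb retf, 0xca retf imm16), which ROP gadgets
   typically end with. */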
3094 static bool
3095 ix86_rop_should_change_byte_p (int t)
3097 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3100 static const char *stringop_alg_names[] = {
3101 #define DEF_ENUM
3102 #define DEF_ALG(alg, name) #name,
3103 #include "stringop.def"
3104 #undef DEF_ENUM
3105 #undef DEF_ALG
3108 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3109 The string is of the following form (or comma separated list of it):
3111 strategy_alg:max_size:[align|noalign]
3113 where the full size range for the strategy is either [0, max_size] or
3114 [min_size, max_size], in which min_size is the max_size + 1 of the
3115 preceding range. The last size range must have max_size == -1.
3117 Examples:
3120 -mmemcpy-strategy=libcall:-1:noalign
3122 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3126 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3128 This is to tell the compiler to use the following strategy for memset
3129 1) when the expected size is between [1, 16], use rep_8byte strategy;
3130 2) when the size is between [17, 2048], use vector_loop;
3131 3) when the size is > 2048, use libcall. */
3133 struct stringop_size_range
3135 int max;
3136 stringop_alg alg;
3137 bool noalign;
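/* For example (illustrative only, assuming the field order above),
   -mmemset-strategy=rep_8byte:16:noalign,libcall:-1:noalign is parsed into

     input_ranges[0] = { 16, rep_prefix_8_byte, true };
     input_ranges[1] = { -1, libcall,           true };

   which then overrides the first two entries of the default algorithm
   table for memset. */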
3140 static void
3141 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3143 const struct stringop_algs *default_algs;
3144 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3145 char *curr_range_str, *next_range_str;
3146 const char *opt = is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=";
3147 int i = 0, n = 0;
3149 if (is_memset)
3150 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3151 else
3152 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3154 curr_range_str = strategy_str;
3158 int maxs;
3159 char alg_name[128];
3160 char align[16];
3161 next_range_str = strchr (curr_range_str, ',');
3162 if (next_range_str)
3163 *next_range_str++ = '\0';
3165 if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
3166 align) != 3)
3168 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3169 return;
3172 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3174 error ("size ranges of option %qs should be increasing", opt);
3175 return;
3178 for (i = 0; i < last_alg; i++)
3179 if (!strcmp (alg_name, stringop_alg_names[i]))
3180 break;
3182 if (i == last_alg)
3184 error ("wrong strategy name %qs specified for option %qs",
3185 alg_name, opt);
3187 auto_vec <const char *> candidates;
3188 for (i = 0; i < last_alg; i++)
3189 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3190 candidates.safe_push (stringop_alg_names[i]);
3192 char *s;
3193 const char *hint
3194 = candidates_list_and_hint (alg_name, s, candidates);
3195 if (hint)
3196 inform (input_location,
3197 "valid arguments to %qs are: %s; did you mean %qs?",
3198 opt, s, hint);
3199 else
3200 inform (input_location, "valid arguments to %qs are: %s",
3201 opt, s);
3202 XDELETEVEC (s);
3203 return;
3206 if ((stringop_alg) i == rep_prefix_8_byte
3207 && !TARGET_64BIT)
3209 /* rep; movq isn't available in 32-bit code. */
3210 error ("strategy name %qs specified for option %qs "
3211 "not supported for 32-bit code", alg_name, opt);
3212 return;
3215 input_ranges[n].max = maxs;
3216 input_ranges[n].alg = (stringop_alg) i;
3217 if (!strcmp (align, "align"))
3218 input_ranges[n].noalign = false;
3219 else if (!strcmp (align, "noalign"))
3220 input_ranges[n].noalign = true;
3221 else
3223 error ("unknown alignment %qs specified for option %qs", align, opt);
3224 return;
3226 n++;
3227 curr_range_str = next_range_str;
3229 while (curr_range_str);
3231 if (input_ranges[n - 1].max != -1)
3233 error ("the max value for the last size range should be -1"
3234 " for option %qs", opt);
3235 return;
3238 if (n > MAX_STRINGOP_ALGS)
3240 error ("too many size ranges specified in option %qs", opt);
3241 return;
3244 /* Now override the default algs array. */
3245 for (i = 0; i < n; i++)
3247 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3248 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3249 = input_ranges[i].alg;
3250 *const_cast<int *>(&default_algs->size[i].noalign)
3251 = input_ranges[i].noalign;
3256 /* Parse the -mtune-ctrl= option. When DUMP is true,
3257 print the features that are explicitly set. */
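/* For example (the feature names here are only illustrative; the valid
   set is given by ix86_tune_feature_names), -mtune-ctrl=use_leave,^push_memory
   would explicitly set the "use_leave" feature and, because of the
   leading '^', explicitly clear "push_memory". */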
3259 static void
3260 parse_mtune_ctrl_str (bool dump)
3262 if (!ix86_tune_ctrl_string)
3263 return;
3265 char *next_feature_string = NULL;
3266 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3267 char *orig = curr_feature_string;
3268 int i;
3271 bool clear = false;
3273 next_feature_string = strchr (curr_feature_string, ',');
3274 if (next_feature_string)
3275 *next_feature_string++ = '\0';
3276 if (*curr_feature_string == '^')
3278 curr_feature_string++;
3279 clear = true;
3281 for (i = 0; i < X86_TUNE_LAST; i++)
3283 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3285 ix86_tune_features[i] = !clear;
3286 if (dump)
3287 fprintf (stderr, "Explicitly %s feature %s\n",
3288 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3289 break;
3292 if (i == X86_TUNE_LAST)
3293 error ("unknown parameter to option -mtune-ctrl: %s",
3294 clear ? curr_feature_string - 1 : curr_feature_string);
3295 curr_feature_string = next_feature_string;
3297 while (curr_feature_string);
3298 free (orig);
3301 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3302 processor type. */
3304 static void
3305 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3307 unsigned int ix86_tune_mask = 1u << ix86_tune;
3308 int i;
3310 for (i = 0; i < X86_TUNE_LAST; ++i)
3312 if (ix86_tune_no_default)
3313 ix86_tune_features[i] = 0;
3314 else
3315 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3318 if (dump)
3320 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3321 for (i = 0; i < X86_TUNE_LAST; i++)
3322 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3323 ix86_tune_features[i] ? "on" : "off");
3326 parse_mtune_ctrl_str (dump);
3330 /* Default align_* from the processor table. */
3332 static void
3333 ix86_default_align (struct gcc_options *opts)
3335 if (opts->x_align_loops == 0)
3337 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3338 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3340 if (opts->x_align_jumps == 0)
3342 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3343 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3345 if (opts->x_align_functions == 0)
3347 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3351 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3353 static void
3354 ix86_override_options_after_change (void)
3356 ix86_default_align (&global_options);
3359 /* Override various settings based on options. If MAIN_ARGS_P, the
3360 options are from the command line, otherwise they are from
3361 attributes. Return false if there is an error related to the
3362 -march option. */
3364 static bool
3365 ix86_option_override_internal (bool main_args_p,
3366 struct gcc_options *opts,
3367 struct gcc_options *opts_set)
3369 int i;
3370 unsigned int ix86_arch_mask;
3371 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3373 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3374 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3375 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3376 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3377 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3378 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3379 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3380 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3381 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3382 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3383 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3384 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3385 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3386 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3387 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3388 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3389 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3390 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3391 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3392 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3393 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3394 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3395 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3396 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3397 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3398 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3399 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3400 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3401 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3402 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3403 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3404 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3405 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3406 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3407 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3408 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3409 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3410 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3411 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3412 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3413 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3414 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3415 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3416 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3417 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3418 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
3419 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
3420 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
3421 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
3422 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
3423 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
3424 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
3425 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
3426 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
3427 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
3428 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
3429 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
3430 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
3431 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
3432 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
3433 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
3434 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
3435 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
3436 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
3438 #define PTA_CORE2 \
3439 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
3440 | PTA_CX16 | PTA_FXSR)
3441 #define PTA_NEHALEM \
3442 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
3443 #define PTA_WESTMERE \
3444 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
3445 #define PTA_SANDYBRIDGE \
3446 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
3447 #define PTA_IVYBRIDGE \
3448 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
3449 #define PTA_HASWELL \
3450 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
3451 | PTA_FMA | PTA_MOVBE | PTA_HLE)
3452 #define PTA_BROADWELL \
3453 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
3454 #define PTA_SKYLAKE \
3455 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
3456 #define PTA_SKYLAKE_AVX512 \
3457 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
3458 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU | PTA_CLWB)
3459 #define PTA_CANNONLAKE \
3460 (PTA_SKYLAKE_AVX512 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA)
3461 #define PTA_KNL \
3462 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
3463 #define PTA_BONNELL \
3464 (PTA_CORE2 | PTA_MOVBE)
3465 #define PTA_SILVERMONT \
3466 (PTA_WESTMERE | PTA_MOVBE)
3467 #define PTA_KNM \
3468 (PTA_KNL | PTA_AVX5124VNNIW | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ)
3470 /* If this reaches 64, we need to widen the struct pta flags below. */
3472 static struct pta
3474 const char *const name; /* processor name or nickname. */
3475 const enum processor_type processor;
3476 const enum attr_cpu schedule;
3477 const unsigned HOST_WIDE_INT flags;
3479 const processor_alias_table[] =
3481 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3482 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3483 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3484 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3485 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3486 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3487 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3488 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3489 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3490 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3491 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3492 PTA_MMX | PTA_SSE | PTA_FXSR},
3493 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3494 PTA_MMX | PTA_SSE | PTA_FXSR},
3495 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3496 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3497 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3498 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3499 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3500 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3501 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3502 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3503 PTA_MMX | PTA_SSE | PTA_FXSR},
3504 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3505 PTA_MMX | PTA_SSE | PTA_FXSR},
3506 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3507 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3508 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3509 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3510 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3511 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3512 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3513 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3514 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3515 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3516 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3517 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3518 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3519 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3520 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3521 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3522 PTA_SANDYBRIDGE},
3523 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3524 PTA_SANDYBRIDGE},
3525 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3526 PTA_IVYBRIDGE},
3527 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3528 PTA_IVYBRIDGE},
3529 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3530 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3531 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3532 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3533 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3534 PTA_SKYLAKE_AVX512},
3535 {"cannonlake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_CANNONLAKE},
3536 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3537 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3538 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3539 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3540 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3541 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3542 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3543 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3544 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3545 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3546 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3547 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3548 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3549 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3550 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3551 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3552 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3553 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3554 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3555 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3556 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3557 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3558 {"x86-64", PROCESSOR_K8, CPU_K8,
3559 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3560 {"eden-x2", PROCESSOR_K8, CPU_K8,
3561 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3562 {"nano", PROCESSOR_K8, CPU_K8,
3563 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3564 | PTA_SSSE3 | PTA_FXSR},
3565 {"nano-1000", PROCESSOR_K8, CPU_K8,
3566 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3567 | PTA_SSSE3 | PTA_FXSR},
3568 {"nano-2000", PROCESSOR_K8, CPU_K8,
3569 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3570 | PTA_SSSE3 | PTA_FXSR},
3571 {"nano-3000", PROCESSOR_K8, CPU_K8,
3572 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3573 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3574 {"nano-x2", PROCESSOR_K8, CPU_K8,
3575 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3576 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3577 {"eden-x4", PROCESSOR_K8, CPU_K8,
3578 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3579 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3580 {"nano-x4", PROCESSOR_K8, CPU_K8,
3581 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3582 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3583 {"k8", PROCESSOR_K8, CPU_K8,
3584 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3585 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3586 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3587 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3588 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3589 {"opteron", PROCESSOR_K8, CPU_K8,
3590 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3591 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3592 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3593 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3594 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3595 {"athlon64", PROCESSOR_K8, CPU_K8,
3596 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3597 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3598 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3599 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3600 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3601 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3602 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3603 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3604 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3605 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3606 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3607 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3608 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3609 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3610 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3611 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3612 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3613 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3614 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3615 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3616 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3617 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3618 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3619 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3620 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3621 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3622 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3623 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3624 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3625 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3626 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3627 | PTA_XSAVEOPT | PTA_FSGSBASE},
3628 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3629 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3630 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3631 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3632 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3633 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3634 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3635 | PTA_MOVBE | PTA_MWAITX},
3636 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3637 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3638 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3639 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3640 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3641 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3642 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3643 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3644 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3645 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3646 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3647 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3648 | PTA_FXSR | PTA_XSAVE},
3649 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3650 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3651 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3652 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3653 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3654 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3656 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3657 PTA_64BIT
3658 | PTA_HLE /* flags are only used for -march switch. */ },
3661 /* -mrecip options. */
3662 static struct
3664 const char *string; /* option name */
3665 unsigned int mask; /* mask bits to set */
3667 const recip_options[] =
3669 { "all", RECIP_MASK_ALL },
3670 { "none", RECIP_MASK_NONE },
3671 { "div", RECIP_MASK_DIV },
3672 { "sqrt", RECIP_MASK_SQRT },
3673 { "vec-div", RECIP_MASK_VEC_DIV },
3674 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3677 int const pta_size = ARRAY_SIZE (processor_alias_table);
3679 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3680 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3681 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3682 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3683 #ifdef TARGET_BI_ARCH
3684 else
3686 #if TARGET_BI_ARCH == 1
3687 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3688 is on and OPTION_MASK_ABI_X32 is off. We turn off
3689 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3690 -mx32. */
3691 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3692 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3693 #else
3694 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3695 on and OPTION_MASK_ABI_64 is off. We turn off
3696 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3697 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3698 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3699 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3700 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3701 #endif
3702 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3703 && TARGET_IAMCU_P (opts->x_target_flags))
3704 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3705 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3707 #endif
3709 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3711 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3712 OPTION_MASK_ABI_64 for TARGET_X32. */
3713 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3714 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3716 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3717 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3718 | OPTION_MASK_ABI_X32
3719 | OPTION_MASK_ABI_64);
3720 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3722 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3723 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3724 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3725 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3728 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3729 SUBTARGET_OVERRIDE_OPTIONS;
3730 #endif
3732 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3733 SUBSUBTARGET_OVERRIDE_OPTIONS;
3734 #endif
3736 /* -fPIC is the default for x86_64. */
3737 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3738 opts->x_flag_pic = 2;
3740 /* Need to check -mtune=generic first. */
3741 if (opts->x_ix86_tune_string)
3743 /* As special support for cross compilers we read -mtune=native
3744 as -mtune=generic. With native compilers we won't see
3745 -mtune=native, as it was changed by the driver. */
3746 if (!strcmp (opts->x_ix86_tune_string, "native"))
3748 opts->x_ix86_tune_string = "generic";
3750 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3751 warning (OPT_Wdeprecated,
3752 main_args_p
3753 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3754 "or %<-mtune=generic%> instead as appropriate")
3755 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3756 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3757 " instead as appropriate"));
3759 else
3761 if (opts->x_ix86_arch_string)
3762 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3763 if (!opts->x_ix86_tune_string)
3765 opts->x_ix86_tune_string
3766 = processor_target_table[TARGET_CPU_DEFAULT].name;
3767 ix86_tune_defaulted = 1;
3770 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3771 or defaulted. We need to use a sensible tune option. */
3772 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3774 opts->x_ix86_tune_string = "generic";
3778 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3779 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3781 /* rep; movq isn't available in 32-bit code. */
3782 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3783 opts->x_ix86_stringop_alg = no_stringop;
3786 if (!opts->x_ix86_arch_string)
3787 opts->x_ix86_arch_string
3788 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3789 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3790 else
3791 ix86_arch_specified = 1;
3793 if (opts_set->x_ix86_pmode)
3795 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3796 && opts->x_ix86_pmode == PMODE_SI)
3797 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3798 && opts->x_ix86_pmode == PMODE_DI))
3799 error ("address mode %qs not supported in the %s bit mode",
3800 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3801 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3803 else
3804 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3805 ? PMODE_DI : PMODE_SI;
3807 if (!opts_set->x_ix86_abi)
3808 opts->x_ix86_abi = DEFAULT_ABI;
3810 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3811 error ("-mabi=ms not supported with X32 ABI");
3812 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3814 /* For targets using the MS ABI enable ms-extensions, if not
3815 explicitly turned off. For non-MS ABI we turn off this
3816 option. */
3817 if (!opts_set->x_flag_ms_extensions)
3818 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3820 if (opts_set->x_ix86_cmodel)
3822 switch (opts->x_ix86_cmodel)
3824 case CM_SMALL:
3825 case CM_SMALL_PIC:
3826 if (opts->x_flag_pic)
3827 opts->x_ix86_cmodel = CM_SMALL_PIC;
3828 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3829 error ("code model %qs not supported in the %s bit mode",
3830 "small", "32");
3831 break;
3833 case CM_MEDIUM:
3834 case CM_MEDIUM_PIC:
3835 if (opts->x_flag_pic)
3836 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3837 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3838 error ("code model %qs not supported in the %s bit mode",
3839 "medium", "32");
3840 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3841 error ("code model %qs not supported in x32 mode",
3842 "medium");
3843 break;
3845 case CM_LARGE:
3846 case CM_LARGE_PIC:
3847 if (opts->x_flag_pic)
3848 opts->x_ix86_cmodel = CM_LARGE_PIC;
3849 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3850 error ("code model %qs not supported in the %s bit mode",
3851 "large", "32");
3852 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3853 error ("code model %qs not supported in x32 mode",
3854 "large");
3855 break;
3857 case CM_32:
3858 if (opts->x_flag_pic)
3859 error ("code model %s does not support PIC mode", "32");
3860 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3861 error ("code model %qs not supported in the %s bit mode",
3862 "32", "64");
3863 break;
3865 case CM_KERNEL:
3866 if (opts->x_flag_pic)
3868 error ("code model %s does not support PIC mode", "kernel");
3869 opts->x_ix86_cmodel = CM_32;
3871 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3872 error ("code model %qs not supported in the %s bit mode",
3873 "kernel", "32");
3874 break;
3876 default:
3877 gcc_unreachable ();
3880 else
3882 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3883 use of rip-relative addressing. This eliminates fixups that
3884 would otherwise be needed if this object is to be placed in a
3885 DLL, and is essentially just as efficient as direct addressing. */
3886 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3887 && (TARGET_RDOS || TARGET_PECOFF))
3888 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3889 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3890 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3891 else
3892 opts->x_ix86_cmodel = CM_32;
3894 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3896 error ("-masm=intel not supported in this configuration");
3897 opts->x_ix86_asm_dialect = ASM_ATT;
3899 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3900 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3901 sorry ("%i-bit mode not compiled in",
3902 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3904 for (i = 0; i < pta_size; i++)
3905 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3907 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3909 error (main_args_p
3910 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3911 "switch")
3912 : G_("%<generic%> CPU can be used only for "
3913 "%<target(\"tune=\")%> attribute"));
3914 return false;
3916 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3918 error (main_args_p
3919 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3920 "switch")
3921 : G_("%<intel%> CPU can be used only for "
3922 "%<target(\"tune=\")%> attribute"));
3923 return false;
3926 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3927 && !(processor_alias_table[i].flags & PTA_64BIT))
3929 error ("CPU you selected does not support x86-64 "
3930 "instruction set");
3931 return false;
3934 ix86_schedule = processor_alias_table[i].schedule;
3935 ix86_arch = processor_alias_table[i].processor;
3936 /* Default cpu tuning to the architecture. */
3937 ix86_tune = ix86_arch;
3939 if (processor_alias_table[i].flags & PTA_MMX
3940 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3941 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3942 if (processor_alias_table[i].flags & PTA_3DNOW
3943 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3944 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3945 if (processor_alias_table[i].flags & PTA_3DNOW_A
3946 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3947 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3948 if (processor_alias_table[i].flags & PTA_SSE
3949 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3950 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3951 if (processor_alias_table[i].flags & PTA_SSE2
3952 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3953 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3954 if (processor_alias_table[i].flags & PTA_SSE3
3955 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3956 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3957 if (processor_alias_table[i].flags & PTA_SSSE3
3958 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3959 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3960 if (processor_alias_table[i].flags & PTA_SSE4_1
3961 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3962 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3963 if (processor_alias_table[i].flags & PTA_SSE4_2
3964 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3965 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3966 if (processor_alias_table[i].flags & PTA_AVX
3967 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3968 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3969 if (processor_alias_table[i].flags & PTA_AVX2
3970 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3971 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3972 if (processor_alias_table[i].flags & PTA_FMA
3973 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3974 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3975 if (processor_alias_table[i].flags & PTA_SSE4A
3976 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3977 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3978 if (processor_alias_table[i].flags & PTA_FMA4
3979 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3980 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3981 if (processor_alias_table[i].flags & PTA_XOP
3982 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3983 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3984 if (processor_alias_table[i].flags & PTA_LWP
3985 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3986 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3987 if (processor_alias_table[i].flags & PTA_ABM
3988 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3989 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3990 if (processor_alias_table[i].flags & PTA_BMI
3991 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3992 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3993 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3994 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3995 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3996 if (processor_alias_table[i].flags & PTA_TBM
3997 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3998 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3999 if (processor_alias_table[i].flags & PTA_BMI2
4000 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
4001 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
4002 if (processor_alias_table[i].flags & PTA_CX16
4003 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
4004 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
4005 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
4006 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4007 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4008 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4009 && (processor_alias_table[i].flags & PTA_NO_SAHF))
4010 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4011 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4012 if (processor_alias_table[i].flags & PTA_MOVBE
4013 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
4014 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
4015 if (processor_alias_table[i].flags & PTA_AES
4016 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4017 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
4018 if (processor_alias_table[i].flags & PTA_SHA
4019 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4020 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4021 if (processor_alias_table[i].flags & PTA_PCLMUL
4022 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4023 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4024 if (processor_alias_table[i].flags & PTA_FSGSBASE
4025 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4026 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4027 if (processor_alias_table[i].flags & PTA_RDRND
4028 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4029 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4030 if (processor_alias_table[i].flags & PTA_F16C
4031 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4032 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4033 if (processor_alias_table[i].flags & PTA_RTM
4034 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4035 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4036 if (processor_alias_table[i].flags & PTA_HLE
4037 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
4038 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
4039 if (processor_alias_table[i].flags & PTA_PRFCHW
4040 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4041 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4042 if (processor_alias_table[i].flags & PTA_RDSEED
4043 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4044 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4045 if (processor_alias_table[i].flags & PTA_ADX
4046 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4047 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4048 if (processor_alias_table[i].flags & PTA_FXSR
4049 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4050 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4051 if (processor_alias_table[i].flags & PTA_XSAVE
4052 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4053 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4054 if (processor_alias_table[i].flags & PTA_XSAVEOPT
4055 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4056 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4057 if (processor_alias_table[i].flags & PTA_AVX512F
4058 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4059 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4060 if (processor_alias_table[i].flags & PTA_AVX512ER
4061 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4062 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4063 if (processor_alias_table[i].flags & PTA_AVX512PF
4064 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4065 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4066 if (processor_alias_table[i].flags & PTA_AVX512CD
4067 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4068 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4069 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
4070 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4071 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4072 if (processor_alias_table[i].flags & PTA_CLWB
4073 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4074 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4075 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
4076 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4077 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4078 if (processor_alias_table[i].flags & PTA_CLZERO
4079 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
4080 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
4081 if (processor_alias_table[i].flags & PTA_XSAVEC
4082 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4083 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4084 if (processor_alias_table[i].flags & PTA_XSAVES
4085 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4086 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4087 if (processor_alias_table[i].flags & PTA_AVX512DQ
4088 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4089 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4090 if (processor_alias_table[i].flags & PTA_AVX512BW
4091 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4092 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4093 if (processor_alias_table[i].flags & PTA_AVX512VL
4094 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4095 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4096 if (processor_alias_table[i].flags & PTA_MPX
4097 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4098 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4099 if (processor_alias_table[i].flags & PTA_AVX512VBMI
4100 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4101 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4102 if (processor_alias_table[i].flags & PTA_AVX512IFMA
4103 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4104 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4106 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
4107 && !(opts->x_ix86_isa_flags2_explicit
4108 & OPTION_MASK_ISA_AVX5124VNNIW))
4109 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4110 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
4111 && !(opts->x_ix86_isa_flags2_explicit
4112 & OPTION_MASK_ISA_AVX5124FMAPS))
4113 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4114 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
4115 && !(opts->x_ix86_isa_flags_explicit
4116 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4117 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4118 if (processor_alias_table[i].flags & PTA_SGX
4119 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4120 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4122 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
4123 x86_prefetch_sse = true;
4124 if (processor_alias_table[i].flags & PTA_MWAITX
4125 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
4126 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
4127 if (processor_alias_table[i].flags & PTA_PKU
4128 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4129 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4131 /* Don't enable x87 instructions if only
4132 general registers are allowed. */
4133 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4134 && !(opts_set->x_target_flags & MASK_80387))
4136 if (processor_alias_table[i].flags & PTA_NO_80387)
4137 opts->x_target_flags &= ~MASK_80387;
4138 else
4139 opts->x_target_flags |= MASK_80387;
4141 break;
4144 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4145 error ("Intel MPX does not support x32");
4150 if (i == pta_size)
4152 error (main_args_p
4153 ? G_("bad value (%qs) for %<-march=%> switch")
4154 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4155 opts->x_ix86_arch_string);
4157 auto_vec <const char *> candidates;
4158 for (i = 0; i < pta_size; i++)
4159 if (strcmp (processor_alias_table[i].name, "generic")
4160 && strcmp (processor_alias_table[i].name, "intel")
4161 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4162 || (processor_alias_table[i].flags & PTA_64BIT)))
4163 candidates.safe_push (processor_alias_table[i].name);
4165 char *s;
4166 const char *hint
4167 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4168 if (hint)
4169 inform (input_location,
4170 main_args_p
4171 ? G_("valid arguments to %<-march=%> switch are: "
4172 "%s; did you mean %qs?")
4173 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4174 "%s; did you mean %qs?"), s, hint);
4175 else
4176 inform (input_location,
4177 main_args_p
4178 ? G_("valid arguments to %<-march=%> switch are: %s")
4179 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4180 "are: %s"), s);
4181 XDELETEVEC (s);
4184 ix86_arch_mask = 1u << ix86_arch;
4185 for (i = 0; i < X86_ARCH_LAST; ++i)
4186 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
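/* Example (illustrative): ix86_arch_mask has only the bit for the selected
   -march processor set, so each X86_ARCH_* feature is enabled exactly when
   that processor's bit appears in its initial_ix86_arch_features mask.  */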
4188 for (i = 0; i < pta_size; i++)
4189 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4191 ix86_schedule = processor_alias_table[i].schedule;
4192 ix86_tune = processor_alias_table[i].processor;
4193 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4195 if (!(processor_alias_table[i].flags & PTA_64BIT))
4197 if (ix86_tune_defaulted)
4199 opts->x_ix86_tune_string = "x86-64";
4200 for (i = 0; i < pta_size; i++)
4201 if (! strcmp (opts->x_ix86_tune_string,
4202 processor_alias_table[i].name))
4203 break;
4204 ix86_schedule = processor_alias_table[i].schedule;
4205 ix86_tune = processor_alias_table[i].processor;
4207 else
4208 error ("CPU you selected does not support x86-64 "
4209 "instruction set");
4212 /* Intel CPUs have always interpreted SSE prefetch instructions as
4213 NOPs; so, we can enable SSE prefetch instructions even when
4214 -mtune (rather than -march) points us to a processor that has them.
4215 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4216 higher processors. */
4217 if (TARGET_CMOV
4218 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
4219 x86_prefetch_sse = true;
4220 break;
4223 if (ix86_tune_specified && i == pta_size)
4225 error (main_args_p
4226 ? G_("bad value (%qs) for %<-mtune=%> switch")
4227 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4228 opts->x_ix86_tune_string);
4230 auto_vec <const char *> candidates;
4231 for (i = 0; i < pta_size; i++)
4232 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4233 || (processor_alias_table[i].flags & PTA_64BIT))
4234 candidates.safe_push (processor_alias_table[i].name);
4236 char *s;
4237 const char *hint
4238 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4239 if (hint)
4240 inform (input_location,
4241 main_args_p
4242 ? G_("valid arguments to %<-mtune=%> switch are: "
4243 "%s; did you mean %qs?")
4244 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4245 "%s; did you mean %qs?"), s, hint);
4246 else
4247 inform (input_location,
4248 main_args_p
4249 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4250 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4251 "are: %s"), s);
4252 XDELETEVEC (s);
4255 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4257 #ifndef USE_IX86_FRAME_POINTER
4258 #define USE_IX86_FRAME_POINTER 0
4259 #endif
4261 #ifndef USE_X86_64_FRAME_POINTER
4262 #define USE_X86_64_FRAME_POINTER 0
4263 #endif
4265 /* Set the default values for switches whose default depends on TARGET_64BIT
4266 in case they weren't overwritten by command line options. */
4267 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4269 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4270 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4271 if (opts->x_flag_asynchronous_unwind_tables
4272 && !opts_set->x_flag_unwind_tables
4273 && TARGET_64BIT_MS_ABI)
4274 opts->x_flag_unwind_tables = 1;
4275 if (opts->x_flag_asynchronous_unwind_tables == 2)
4276 opts->x_flag_unwind_tables
4277 = opts->x_flag_asynchronous_unwind_tables = 1;
4278 if (opts->x_flag_pcc_struct_return == 2)
4279 opts->x_flag_pcc_struct_return = 0;
4281 else
4283 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4284 opts->x_flag_omit_frame_pointer
4285 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4286 if (opts->x_flag_asynchronous_unwind_tables == 2)
4287 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4288 if (opts->x_flag_pcc_struct_return == 2)
4290 /* Intel MCU psABI specifies that -freg-struct-return should
4291 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4292 we check -miamcu so that -freg-struct-return is always
4293 turned on if -miamcu is used. */
4294 if (TARGET_IAMCU_P (opts->x_target_flags))
4295 opts->x_flag_pcc_struct_return = 0;
4296 else
4297 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4301 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4302 /* TODO: ix86_cost should be chosen at instruction or function granularity,
4303 so that for cold code we use size_cost even in !optimize_size compilation. */
4304 if (opts->x_optimize_size)
4305 ix86_cost = &ix86_size_cost;
4306 else
4307 ix86_cost = ix86_tune_cost;
4309 /* Arrange to set up i386_stack_locals for all functions. */
4310 init_machine_status = ix86_init_machine_status;
4312 /* Validate -mregparm= value. */
4313 if (opts_set->x_ix86_regparm)
4315 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4316 warning (0, "-mregparm is ignored in 64-bit mode");
4317 else if (TARGET_IAMCU_P (opts->x_target_flags))
4318 warning (0, "-mregparm is ignored for Intel MCU psABI");
4319 if (opts->x_ix86_regparm > REGPARM_MAX)
4321 error ("-mregparm=%d is not between 0 and %d",
4322 opts->x_ix86_regparm, REGPARM_MAX);
4323 opts->x_ix86_regparm = 0;
4326 if (TARGET_IAMCU_P (opts->x_target_flags)
4327 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4328 opts->x_ix86_regparm = REGPARM_MAX;
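/* Example (illustrative): for 32-bit code, -mregparm=3 allows up to three
   integral arguments to be passed in registers (EAX, EDX, ECX); values above
   REGPARM_MAX are rejected above.  For 64-bit and IAMCU targets the
   register-passing convention is fixed by the ABI, so regparm is forced to
   REGPARM_MAX here and the option itself is ignored.  */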
4330 /* Default align_* from the processor table. */
4331 ix86_default_align (opts);
4333 /* Provide default for -mbranch-cost= value. */
4334 if (!opts_set->x_ix86_branch_cost)
4335 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4337 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4339 opts->x_target_flags
4340 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4342 /* Enable by default the SSE and MMX builtins. Do allow the user to
4343 explicitly disable any of these. In particular, disabling SSE and
4344 MMX for kernel code is extremely useful. */
4345 if (!ix86_arch_specified)
4346 opts->x_ix86_isa_flags
4347 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4348 | TARGET_SUBTARGET64_ISA_DEFAULT)
4349 & ~opts->x_ix86_isa_flags_explicit);
4351 if (TARGET_RTD_P (opts->x_target_flags))
4352 warning (0,
4353 main_args_p
4354 ? G_("%<-mrtd%> is ignored in 64bit mode")
4355 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4357 else
4359 opts->x_target_flags
4360 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4362 if (!ix86_arch_specified)
4363 opts->x_ix86_isa_flags
4364 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4366 /* The i386 ABI does not specify a red zone. It still makes sense to use it
4367 when the programmer takes care to keep the stack from being destroyed. */
4368 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4369 opts->x_target_flags |= MASK_NO_RED_ZONE;
4372 /* Keep nonleaf frame pointers. */
4373 if (opts->x_flag_omit_frame_pointer)
4374 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4375 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4376 opts->x_flag_omit_frame_pointer = 1;
4378 /* If we're doing fast math, we don't care about comparison order
4379 wrt NaNs. This lets us use a shorter comparison sequence. */
4380 if (opts->x_flag_finite_math_only)
4381 opts->x_target_flags &= ~MASK_IEEE_FP;
4383 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4384 since the insns won't need emulation. */
4385 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4386 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4388 /* Likewise, if the target doesn't have a 387, or we've specified
4389 software floating point, don't use 387 inline intrinsics. */
4390 if (!TARGET_80387_P (opts->x_target_flags))
4391 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4393 /* Turn on MMX builtins for -msse. */
4394 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4395 opts->x_ix86_isa_flags
4396 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4398 /* Enable SSE prefetch. */
4399 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4400 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4401 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4402 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4403 x86_prefetch_sse = true;
4405 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4406 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4407 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4408 opts->x_ix86_isa_flags
4409 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4411 /* Enable lzcnt instruction for -mabm. */
4412 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
4413 opts->x_ix86_isa_flags
4414 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4416 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4417 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
4418 opts->x_ix86_isa_flags
4419 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4420 & ~opts->x_ix86_isa_flags_explicit);
4422 /* Validate -mpreferred-stack-boundary= value or default it to
4423 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4424 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4425 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4427 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4428 int max = TARGET_SEH ? 4 : 12;
4430 if (opts->x_ix86_preferred_stack_boundary_arg < min
4431 || opts->x_ix86_preferred_stack_boundary_arg > max)
4433 if (min == max)
4434 error ("-mpreferred-stack-boundary is not supported "
4435 "for this target");
4436 else
4437 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4438 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4440 else
4441 ix86_preferred_stack_boundary
4442 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
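/* Worked example (illustrative): -mpreferred-stack-boundary=4 yields
   (1 << 4) * BITS_PER_UNIT = 16 * 8 = 128 bits, i.e. the usual 16-byte
   alignment; the 32-bit minimum of 2 corresponds to 4 bytes and the
   maximum of 12 to 4096 bytes.  */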
4445 /* Set the default value for -mstackrealign. */
4446 if (!opts_set->x_ix86_force_align_arg_pointer)
4447 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4449 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4451 /* Validate -mincoming-stack-boundary= value or default it to
4452 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4453 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4454 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4456 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4458 if (opts->x_ix86_incoming_stack_boundary_arg < min
4459 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4460 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4461 opts->x_ix86_incoming_stack_boundary_arg, min);
4462 else
4464 ix86_user_incoming_stack_boundary
4465 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4466 ix86_incoming_stack_boundary
4467 = ix86_user_incoming_stack_boundary;
4471 #ifndef NO_PROFILE_COUNTERS
4472 if (flag_nop_mcount)
4473 error ("-mnop-mcount is not compatible with this target");
4474 #endif
4475 if (flag_nop_mcount && flag_pic)
4476 error ("-mnop-mcount is not implemented for -fPIC");
4478 /* Accept -msseregparm only if at least SSE support is enabled. */
4479 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4480 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4481 error (main_args_p
4482 ? G_("%<-msseregparm%> used without SSE enabled")
4483 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4485 if (opts_set->x_ix86_fpmath)
4487 if (opts->x_ix86_fpmath & FPMATH_SSE)
4489 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4491 if (TARGET_80387_P (opts->x_target_flags))
4493 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4494 opts->x_ix86_fpmath = FPMATH_387;
4497 else if ((opts->x_ix86_fpmath & FPMATH_387)
4498 && !TARGET_80387_P (opts->x_target_flags))
4500 warning (0, "387 instruction set disabled, using SSE arithmetics");
4501 opts->x_ix86_fpmath = FPMATH_SSE;
4505 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4506 fpmath=387. The latter is however the default on many targets, since the
4507 extra 80-bit precision of temporaries is considered to be part of the ABI.
4508 Overwrite the default at least for -ffast-math.
4509 TODO: -mfpmath=both seems to produce similarly performing code with
4510 slightly smaller binaries. It is however not clear whether register
4511 allocation is ready for this setting.
4512 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
4513 codegen. We may switch to 387 with -ffast-math for size-optimized
4514 functions. */
4515 else if (fast_math_flags_set_p (&global_options)
4516 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4517 opts->x_ix86_fpmath = FPMATH_SSE;
4518 else
4519 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4521 /* Use an external vectorized library when vectorizing intrinsics. */
4522 if (opts_set->x_ix86_veclibabi_type)
4523 switch (opts->x_ix86_veclibabi_type)
4525 case ix86_veclibabi_type_svml:
4526 ix86_veclib_handler = ix86_veclibabi_svml;
4527 break;
4529 case ix86_veclibabi_type_acml:
4530 ix86_veclib_handler = ix86_veclibabi_acml;
4531 break;
4533 default:
4534 gcc_unreachable ();
4537 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4538 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4539 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4541 /* If stack probes are required, the space used for large function
4542 arguments on the stack must also be probed, so enable
4543 -maccumulate-outgoing-args so this happens in the prologue. */
4544 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4545 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4547 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4548 warning (0,
4549 main_args_p
4550 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4551 "for correctness")
4552 : G_("stack probing requires "
4553 "%<target(\"accumulate-outgoing-args\")%> for "
4554 "correctness"));
4555 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4558 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4559 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4560 if (fixed_regs[BP_REG]
4561 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4563 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4564 warning (0,
4565 main_args_p
4566 ? G_("fixed ebp register requires "
4567 "%<-maccumulate-outgoing-args%>")
4568 : G_("fixed ebp register requires "
4569 "%<target(\"accumulate-outgoing-args\")%>"));
4570 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4573 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4575 char *p;
4576 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4577 p = strchr (internal_label_prefix, 'X');
4578 internal_label_prefix_len = p - internal_label_prefix;
4579 *p = '\0';
4582 /* When the scheduling description is not available, disable the scheduler
4583 pass so it won't slow down compilation and make x87 code slower. */
4584 if (!TARGET_SCHEDULE)
4585 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4587 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4588 ix86_tune_cost->simultaneous_prefetches,
4589 opts->x_param_values,
4590 opts_set->x_param_values);
4591 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4592 ix86_tune_cost->prefetch_block,
4593 opts->x_param_values,
4594 opts_set->x_param_values);
4595 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4596 ix86_tune_cost->l1_cache_size,
4597 opts->x_param_values,
4598 opts_set->x_param_values);
4599 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4600 ix86_tune_cost->l2_cache_size,
4601 opts->x_param_values,
4602 opts_set->x_param_values);
4604 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4605 if (opts->x_flag_prefetch_loop_arrays < 0
4606 && HAVE_prefetch
4607 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4608 && !opts->x_optimize_size
4609 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4610 opts->x_flag_prefetch_loop_arrays = 1;
4612 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4613 can be optimized to ap = __builtin_next_arg (0). */
4614 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4615 targetm.expand_builtin_va_start = NULL;
4617 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4619 ix86_gen_leave = gen_leave_rex64;
4620 if (Pmode == DImode)
4622 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4623 ix86_gen_tls_local_dynamic_base_64
4624 = gen_tls_local_dynamic_base_64_di;
4626 else
4628 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4629 ix86_gen_tls_local_dynamic_base_64
4630 = gen_tls_local_dynamic_base_64_si;
4633 else
4634 ix86_gen_leave = gen_leave;
4636 if (Pmode == DImode)
4638 ix86_gen_add3 = gen_adddi3;
4639 ix86_gen_sub3 = gen_subdi3;
4640 ix86_gen_sub3_carry = gen_subdi3_carry;
4641 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4642 ix86_gen_andsp = gen_anddi3;
4643 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4644 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4645 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4646 ix86_gen_monitor = gen_sse3_monitor_di;
4647 ix86_gen_monitorx = gen_monitorx_di;
4648 ix86_gen_clzero = gen_clzero_di;
4650 else
4652 ix86_gen_add3 = gen_addsi3;
4653 ix86_gen_sub3 = gen_subsi3;
4654 ix86_gen_sub3_carry = gen_subsi3_carry;
4655 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4656 ix86_gen_andsp = gen_andsi3;
4657 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4658 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4659 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4660 ix86_gen_monitor = gen_sse3_monitor_si;
4661 ix86_gen_monitorx = gen_monitorx_si;
4662 ix86_gen_clzero = gen_clzero_si;
4665 #ifdef USE_IX86_CLD
4666 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4667 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4668 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4669 #endif
4671 /* Set the default value for -mfentry. */
4672 if (!opts_set->x_flag_fentry)
4673 opts->x_flag_fentry = TARGET_SEH;
4674 else
4676 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4677 && opts->x_flag_fentry)
4678 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4679 "with -fpic");
4680 else if (TARGET_SEH && !opts->x_flag_fentry)
4681 sorry ("-mno-fentry isn%'t compatible with SEH");
4684 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4685 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4687 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4688 && TARGET_EMIT_VZEROUPPER)
4689 opts->x_target_flags |= MASK_VZEROUPPER;
4690 if (!(opts_set->x_target_flags & MASK_STV))
4691 opts->x_target_flags |= MASK_STV;
4692 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4693 -mincoming-stack-boundary={2,3} or -mstackrealign is in effect - the
4694 needed stack realignment would be an extra cost the pass doesn't take
4695 into account, and the pass can't realign the stack. */
4696 if (ix86_preferred_stack_boundary < 128
4697 || ix86_incoming_stack_boundary < 128
4698 || opts->x_ix86_force_align_arg_pointer)
4699 opts->x_target_flags &= ~MASK_STV;
4700 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4701 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4702 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4703 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4704 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4705 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4707 /* Enable 128-bit AVX instruction generation
4708 for the auto-vectorizer. */
4709 if (TARGET_AVX128_OPTIMAL
4710 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4711 opts->x_prefer_vector_width_type = PVW_AVX128;
4713 /* Use 256-bit AVX instruction generation
4714 in the auto-vectorizer. */
4715 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4716 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4717 opts->x_prefer_vector_width_type = PVW_AVX256;
4719 if (opts->x_ix86_recip_name)
4721 char *p = ASTRDUP (opts->x_ix86_recip_name);
4722 char *q;
4723 unsigned int mask, i;
4724 bool invert;
4726 while ((q = strtok (p, ",")) != NULL)
4728 p = NULL;
4729 if (*q == '!')
4731 invert = true;
4732 q++;
4734 else
4735 invert = false;
4737 if (!strcmp (q, "default"))
4738 mask = RECIP_MASK_ALL;
4739 else
4741 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4742 if (!strcmp (q, recip_options[i].string))
4744 mask = recip_options[i].mask;
4745 break;
4748 if (i == ARRAY_SIZE (recip_options))
4750 error ("unknown option for -mrecip=%s", q);
4751 invert = false;
4752 mask = RECIP_MASK_NONE;
4756 opts->x_recip_mask_explicit |= mask;
4757 if (invert)
4758 opts->x_recip_mask &= ~mask;
4759 else
4760 opts->x_recip_mask |= mask;
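/* Illustrative example of the parsing above (assuming "sqrt" is one of the
   recip_options entries): -mrecip=default,!sqrt first ORs RECIP_MASK_ALL
   (the "default" token) into recip_mask, then the "!sqrt" token clears the
   corresponding bits.  recip_mask_explicit records which bits the user
   touched, so the -mrecip/-mno-recip handling below only affects the rest.  */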
4764 if (TARGET_RECIP_P (opts->x_target_flags))
4765 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4766 else if (opts_set->x_target_flags & MASK_RECIP)
4767 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4769 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4770 for 64-bit Bionic. Also default long double to 64-bit for Intel
4771 MCU psABI. */
4772 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4773 && !(opts_set->x_target_flags
4774 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4775 opts->x_target_flags |= (TARGET_64BIT
4776 ? MASK_LONG_DOUBLE_128
4777 : MASK_LONG_DOUBLE_64);
4779 /* Only one of them can be active. */
4780 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4781 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4783 /* Handle the stack protector. */
4784 if (!opts_set->x_ix86_stack_protector_guard)
4785 opts->x_ix86_stack_protector_guard
4786 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4788 #ifdef TARGET_THREAD_SSP_OFFSET
4789 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4790 #endif
4792 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4794 char *endp;
4795 const char *str = ix86_stack_protector_guard_offset_str;
4797 errno = 0;
4798 int64_t offset;
4800 #if defined(INT64_T_IS_LONG)
4801 offset = strtol (str, &endp, 0);
4802 #else
4803 offset = strtoll (str, &endp, 0);
4804 #endif
4806 if (!*str || *endp || errno)
4807 error ("%qs is not a valid number "
4808 "in -mstack-protector-guard-offset=", str);
4810 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4811 HOST_WIDE_INT_C (0x7fffffff)))
4812 error ("%qs is not a valid offset "
4813 "in -mstack-protector-guard-offset=", str);
4815 ix86_stack_protector_guard_offset = offset;
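/* Example (illustrative): -mstack-protector-guard-offset=0x28 is parsed
   with base 0, so hexadecimal, octal, and decimal forms are all accepted;
   the value must fit in a signed 32-bit range or it is rejected above.  */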
4818 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4820 /* The kernel uses a different segment register for performance
4821 reasons; a system call would not have to trash the userspace
4822 segment register, which would be expensive. */
4823 if (ix86_cmodel == CM_KERNEL)
4824 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4826 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4828 const char *str = ix86_stack_protector_guard_reg_str;
4829 addr_space_t seg = ADDR_SPACE_GENERIC;
4831 /* Discard optional register prefix. */
4832 if (str[0] == '%')
4833 str++;
4835 if (strlen (str) == 2 && str[1] == 's')
4837 if (str[0] == 'f')
4838 seg = ADDR_SPACE_SEG_FS;
4839 else if (str[0] == 'g')
4840 seg = ADDR_SPACE_SEG_GS;
4843 if (seg == ADDR_SPACE_GENERIC)
4844 error ("%qs is not a valid base register "
4845 "in -mstack-protector-guard-reg=",
4846 ix86_stack_protector_guard_reg_str);
4848 ix86_stack_protector_guard_reg = seg;
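/* Example (illustrative): -mstack-protector-guard-reg=%fs (or just "fs")
   selects ADDR_SPACE_SEG_FS and "gs" selects ADDR_SPACE_SEG_GS; any other
   string is rejected above.  */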
4851 /* Handle -mmemcpy-strategy= and -mmemset-strategy=. */
4852 if (opts->x_ix86_tune_memcpy_strategy)
4854 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4855 ix86_parse_stringop_strategy_string (str, false);
4856 free (str);
4859 if (opts->x_ix86_tune_memset_strategy)
4861 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4862 ix86_parse_stringop_strategy_string (str, true);
4863 free (str);
4866 /* Save the initial options in case the user does function specific
4867 options. */
4868 if (main_args_p)
4869 target_option_default_node = target_option_current_node
4870 = build_target_option_node (opts);
4872 /* Do not support control flow instrumentation if CET is not enabled. */
4873 if (opts->x_flag_cf_protection != CF_NONE)
4875 if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2)
4876 || TARGET_SHSTK_P (opts->x_ix86_isa_flags)))
4878 if (flag_cf_protection == CF_FULL)
4880 error ("%<-fcf-protection=full%> requires CET support "
4881 "on this target. Use -mcet or one of -mibt, "
4882 "-mshstk options to enable CET");
4884 else if (flag_cf_protection == CF_BRANCH)
4886 error ("%<-fcf-protection=branch%> requires CET support "
4887 "on this target. Use -mcet or one of -mibt, "
4888 "-mshstk options to enable CET");
4890 else if (flag_cf_protection == CF_RETURN)
4892 error ("%<-fcf-protection=return%> requires CET support "
4893 "on this target. Use -mcet or one of -mibt, "
4894 "-mshstk options to enable CET");
4896 flag_cf_protection = CF_NONE;
4897 return false;
4899 opts->x_flag_cf_protection =
4900 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4903 return true;
4906 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4908 static void
4909 ix86_option_override (void)
4911 ix86_option_override_internal (true, &global_options, &global_options_set);
4914 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4915 static char *
4916 ix86_offload_options (void)
4918 if (TARGET_LP64)
4919 return xstrdup ("-foffload-abi=lp64");
4920 return xstrdup ("-foffload-abi=ilp32");
4923 /* Update register usage after having seen the compiler flags. */
4925 static void
4926 ix86_conditional_register_usage (void)
4928 int i, c_mask;
4930 /* If there are no caller-saved registers, preserve all registers,
4931 except fixed_regs and registers used for the function return value,
4932 since aggregate_value_p checks call_used_regs[regno] on the return
4933 value. */
4934 if (cfun && cfun->machine->no_caller_saved_registers)
4935 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4936 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4937 call_used_regs[i] = 0;
4939 /* For 32-bit targets, squash the REX registers. */
4940 if (! TARGET_64BIT)
4942 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4943 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4944 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4945 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4946 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4947 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4950 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4951 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
4953 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4955 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4957 /* Set/reset conditionally defined registers from
4958 CALL_USED_REGISTERS initializer. */
4959 if (call_used_regs[i] > 1)
4960 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4962 /* Build the CLOBBERED_REGS register set from the call-used
4963 registers of the GENERAL_REGS register set. */
4964 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4965 && call_used_regs[i])
4966 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4969 /* If MMX is disabled, squash the registers. */
4970 if (! TARGET_MMX)
4971 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4972 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4973 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4975 /* If SSE is disabled, squash the registers. */
4976 if (! TARGET_SSE)
4977 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4978 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4979 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4981 /* If the FPU is disabled, squash the registers. */
4982 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4983 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4984 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4985 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4987 /* If AVX512F is disabled, squash the registers. */
4988 if (! TARGET_AVX512F)
4990 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4991 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4993 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4994 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4997 /* If MPX is disabled, squash the registers. */
4998 if (! TARGET_MPX)
4999 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
5000 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5003 /* Canonicalize a comparison from one we don't have to one we do have. */
5005 static void
5006 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5007 bool op0_preserve_value)
5009 /* The order of operands in the x87 ficom compare is forced by combine in
5010 the simplify_comparison () function. A float operator is treated as RTX_OBJ
5011 with precedence over other operators and is always placed first.
5012 Swap the condition and operands to match the ficom instruction. */
5013 if (!op0_preserve_value
5014 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5016 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5018 /* We are called only for compares that are split to the SAHF instruction.
5019 Ensure that we have a setcc/jcc insn for the swapped condition. */
5020 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5022 std::swap (*op0, *op1);
5023 *code = (int) scode;
5028 /* Save the current options */
5030 static void
5031 ix86_function_specific_save (struct cl_target_option *ptr,
5032 struct gcc_options *opts)
5034 ptr->arch = ix86_arch;
5035 ptr->schedule = ix86_schedule;
5036 ptr->prefetch_sse = x86_prefetch_sse;
5037 ptr->tune = ix86_tune;
5038 ptr->branch_cost = ix86_branch_cost;
5039 ptr->tune_defaulted = ix86_tune_defaulted;
5040 ptr->arch_specified = ix86_arch_specified;
5041 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5042 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5043 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5044 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5045 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5046 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5047 ptr->x_ix86_abi = opts->x_ix86_abi;
5048 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5049 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5050 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5051 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5052 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5053 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5054 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5055 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5056 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5057 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5058 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5059 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5060 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5061 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5062 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5063 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5064 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5065 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5066 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5067 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5069 /* The fields are char but the variables are not; make sure the
5070 values fit in the fields. */
5071 gcc_assert (ptr->arch == ix86_arch);
5072 gcc_assert (ptr->schedule == ix86_schedule);
5073 gcc_assert (ptr->tune == ix86_tune);
5074 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5077 /* Restore the current options */
5079 static void
5080 ix86_function_specific_restore (struct gcc_options *opts,
5081 struct cl_target_option *ptr)
5083 enum processor_type old_tune = ix86_tune;
5084 enum processor_type old_arch = ix86_arch;
5085 unsigned int ix86_arch_mask;
5086 int i;
5088 /* We don't change -fPIC. */
5089 opts->x_flag_pic = flag_pic;
5091 ix86_arch = (enum processor_type) ptr->arch;
5092 ix86_schedule = (enum attr_cpu) ptr->schedule;
5093 ix86_tune = (enum processor_type) ptr->tune;
5094 x86_prefetch_sse = ptr->prefetch_sse;
5095 opts->x_ix86_branch_cost = ptr->branch_cost;
5096 ix86_tune_defaulted = ptr->tune_defaulted;
5097 ix86_arch_specified = ptr->arch_specified;
5098 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5099 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5100 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5101 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5102 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5103 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5104 opts->x_ix86_abi = ptr->x_ix86_abi;
5105 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5106 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5107 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5108 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5109 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5110 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5111 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5112 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5113 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5114 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5115 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5116 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5117 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5118 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5119 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5120 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5121 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5122 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5123 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5124 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5125 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5126 /* TODO: ix86_cost should be chosen at instruction or function granularity,
5127 so that for cold code we use size_cost even in !optimize_size compilation. */
5128 if (opts->x_optimize_size)
5129 ix86_cost = &ix86_size_cost;
5130 else
5131 ix86_cost = ix86_tune_cost;
5133 /* Recreate the arch feature tests if the arch changed */
5134 if (old_arch != ix86_arch)
5136 ix86_arch_mask = 1u << ix86_arch;
5137 for (i = 0; i < X86_ARCH_LAST; ++i)
5138 ix86_arch_features[i]
5139 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5142 /* Recreate the tune optimization tests */
5143 if (old_tune != ix86_tune)
5144 set_ix86_tune_features (ix86_tune, false);
5147 /* Adjust target options after streaming them in. This is mainly about
5148 reconciling them with global options. */
5150 static void
5151 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5153 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5154 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5155 for PIC, or error out. */
5156 if (flag_pic)
5157 switch (ptr->x_ix86_cmodel)
5159 case CM_SMALL:
5160 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5161 break;
5163 case CM_MEDIUM:
5164 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5165 break;
5167 case CM_LARGE:
5168 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5169 break;
5171 case CM_KERNEL:
5172 error ("code model %s does not support PIC mode", "kernel");
5173 break;
5175 default:
5176 break;
5178 else
5179 switch (ptr->x_ix86_cmodel)
5181 case CM_SMALL_PIC:
5182 ptr->x_ix86_cmodel = CM_SMALL;
5183 break;
5185 case CM_MEDIUM_PIC:
5186 ptr->x_ix86_cmodel = CM_MEDIUM;
5187 break;
5189 case CM_LARGE_PIC:
5190 ptr->x_ix86_cmodel = CM_LARGE;
5191 break;
5193 default:
5194 break;
5198 /* Print the current options */
5200 static void
5201 ix86_function_specific_print (FILE *file, int indent,
5202 struct cl_target_option *ptr)
5204 char *target_string
5205 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5206 ptr->x_target_flags, ptr->x_ix86_target_flags,
5207 NULL, NULL, ptr->x_ix86_fpmath, false);
5209 gcc_assert (ptr->arch < PROCESSOR_max);
5210 fprintf (file, "%*sarch = %d (%s)\n",
5211 indent, "",
5212 ptr->arch, processor_target_table[ptr->arch].name);
5214 gcc_assert (ptr->tune < PROCESSOR_max);
5215 fprintf (file, "%*stune = %d (%s)\n",
5216 indent, "",
5217 ptr->tune, processor_target_table[ptr->tune].name);
5219 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5221 if (target_string)
5223 fprintf (file, "%*s%s\n", indent, "", target_string);
5224 free (target_string);
5229 /* Inner function to process the attribute ((target (...))): take an argument
5230 and set the current options from it. If we have a list, recursively go
5231 over the list. */
5233 static bool
5234 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5235 struct gcc_options *opts,
5236 struct gcc_options *opts_set,
5237 struct gcc_options *enum_opts_set)
5239 char *next_optstr;
5240 bool ret = true;
5242 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5243 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5244 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5245 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5246 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5248 enum ix86_opt_type
5250 ix86_opt_unknown,
5251 ix86_opt_yes,
5252 ix86_opt_no,
5253 ix86_opt_str,
5254 ix86_opt_enum,
5255 ix86_opt_isa
5258 static const struct
5260 const char *string;
5261 size_t len;
5262 enum ix86_opt_type type;
5263 int opt;
5264 int mask;
5265 } attrs[] = {
5266 /* isa options */
5267 IX86_ATTR_ISA ("sgx", OPT_msgx),
5268 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5269 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5270 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5271 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5272 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5273 IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
5275 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5276 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5277 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5278 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5279 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5280 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5281 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5282 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5283 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5284 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5285 IX86_ATTR_ISA ("fma", OPT_mfma),
5286 IX86_ATTR_ISA ("xop", OPT_mxop),
5287 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5288 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5289 IX86_ATTR_ISA ("avx", OPT_mavx),
5290 IX86_ATTR_ISA ("sse4", OPT_msse4),
5291 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5292 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5293 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5294 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5295 IX86_ATTR_ISA ("sse3", OPT_msse3),
5296 IX86_ATTR_ISA ("aes", OPT_maes),
5297 IX86_ATTR_ISA ("sha", OPT_msha),
5298 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5299 IX86_ATTR_ISA ("sse2", OPT_msse2),
5300 IX86_ATTR_ISA ("sse", OPT_msse),
5301 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5302 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5303 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5304 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5305 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5306 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5307 IX86_ATTR_ISA ("adx", OPT_madx),
5308 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5309 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5310 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5311 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5312 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5313 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5314 IX86_ATTR_ISA ("abm", OPT_mabm),
5315 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5316 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5317 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5318 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5319 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5320 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5321 IX86_ATTR_ISA ("sahf", OPT_msahf),
5322 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5323 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5324 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5325 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5326 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5327 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5328 IX86_ATTR_ISA ("pku", OPT_mpku),
5329 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5330 IX86_ATTR_ISA ("hle", OPT_mhle),
5331 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5332 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5333 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5334 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5335 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5336 IX86_ATTR_ISA ("ibt", OPT_mibt),
5337 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5338 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5339 IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
5341 /* enum options */
5342 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5344 /* string options */
5345 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5346 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5348 /* flag options */
5349 IX86_ATTR_YES ("cld",
5350 OPT_mcld,
5351 MASK_CLD),
5353 IX86_ATTR_NO ("fancy-math-387",
5354 OPT_mfancy_math_387,
5355 MASK_NO_FANCY_MATH_387),
5357 IX86_ATTR_YES ("ieee-fp",
5358 OPT_mieee_fp,
5359 MASK_IEEE_FP),
5361 IX86_ATTR_YES ("inline-all-stringops",
5362 OPT_minline_all_stringops,
5363 MASK_INLINE_ALL_STRINGOPS),
5365 IX86_ATTR_YES ("inline-stringops-dynamically",
5366 OPT_minline_stringops_dynamically,
5367 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5369 IX86_ATTR_NO ("align-stringops",
5370 OPT_mno_align_stringops,
5371 MASK_NO_ALIGN_STRINGOPS),
5373 IX86_ATTR_YES ("recip",
5374 OPT_mrecip,
5375 MASK_RECIP),
5379 /* If this is a list, recurse to get the options. */
5380 if (TREE_CODE (args) == TREE_LIST)
5382 bool ret = true;
5384 for (; args; args = TREE_CHAIN (args))
5385 if (TREE_VALUE (args)
5386 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5387 p_strings, opts, opts_set,
5388 enum_opts_set))
5389 ret = false;
5391 return ret;
5394 else if (TREE_CODE (args) != STRING_CST)
5396 error ("attribute %<target%> argument not a string");
5397 return false;
5400 /* Handle multiple arguments separated by commas. */
5401 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5403 while (next_optstr && *next_optstr != '\0')
5405 char *p = next_optstr;
5406 char *orig_p = p;
5407 char *comma = strchr (next_optstr, ',');
5408 const char *opt_string;
5409 size_t len, opt_len;
5410 int opt;
5411 bool opt_set_p;
5412 char ch;
5413 unsigned i;
5414 enum ix86_opt_type type = ix86_opt_unknown;
5415 int mask = 0;
5417 if (comma)
5419 *comma = '\0';
5420 len = comma - next_optstr;
5421 next_optstr = comma + 1;
5423 else
5425 len = strlen (p);
5426 next_optstr = NULL;
5429 /* Recognize no-xxx. */
5430 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5432 opt_set_p = false;
5433 p += 3;
5434 len -= 3;
5436 else
5437 opt_set_p = true;
5439 /* Find the option. */
5440 ch = *p;
5441 opt = N_OPTS;
5442 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5444 type = attrs[i].type;
5445 opt_len = attrs[i].len;
5446 if (ch == attrs[i].string[0]
5447 && ((type != ix86_opt_str && type != ix86_opt_enum)
5448 ? len == opt_len
5449 : len > opt_len)
5450 && memcmp (p, attrs[i].string, opt_len) == 0)
5452 opt = attrs[i].opt;
5453 mask = attrs[i].mask;
5454 opt_string = attrs[i].string;
5455 break;
5459 /* Process the option. */
5460 if (opt == N_OPTS)
5462 error ("attribute(target(\"%s\")) is unknown", orig_p);
5463 ret = false;
5466 else if (type == ix86_opt_isa)
5468 struct cl_decoded_option decoded;
5470 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5471 ix86_handle_option (opts, opts_set,
5472 &decoded, input_location);
5475 else if (type == ix86_opt_yes || type == ix86_opt_no)
5477 if (type == ix86_opt_no)
5478 opt_set_p = !opt_set_p;
5480 if (opt_set_p)
5481 opts->x_target_flags |= mask;
5482 else
5483 opts->x_target_flags &= ~mask;
5486 else if (type == ix86_opt_str)
5488 if (p_strings[opt])
5490 error ("option(\"%s\") was already specified", opt_string);
5491 ret = false;
5493 else
5494 p_strings[opt] = xstrdup (p + opt_len);
5497 else if (type == ix86_opt_enum)
5499 bool arg_ok;
5500 int value;
5502 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5503 if (arg_ok)
5504 set_option (opts, enum_opts_set, opt, value,
5505 p + opt_len, DK_UNSPECIFIED, input_location,
5506 global_dc);
5507 else
5509 error ("attribute(target(\"%s\")) is unknown", orig_p);
5510 ret = false;
5514 else
5515 gcc_unreachable ();
5518 return ret;
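/* Illustrative example (not part of the original source; the function and
   option names in the attribute string are hypothetical but all keywords
   appear in the attrs[] table above): a user-level declaration such as

     __attribute__ ((target ("avx2,no-avx512f,arch=haswell,fpmath=sse")))
     void hot_loop (float *dst, const float *src, int n);

   is split on commas and dispatched through the table: "avx2" sets an ISA
   flag, "no-avx512f" clears one via the "no-" prefix, "arch=" is a string
   option and "fpmath=" is an enum option.  */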
5521 /* Release allocated strings. */
5522 static void
5523 release_options_strings (char **option_strings)
5525 /* Free up memory allocated to hold the strings */
5526 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5527 free (option_strings[i]);
5530 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5532 tree
5533 ix86_valid_target_attribute_tree (tree args,
5534 struct gcc_options *opts,
5535 struct gcc_options *opts_set)
5537 const char *orig_arch_string = opts->x_ix86_arch_string;
5538 const char *orig_tune_string = opts->x_ix86_tune_string;
5539 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5540 int orig_tune_defaulted = ix86_tune_defaulted;
5541 int orig_arch_specified = ix86_arch_specified;
5542 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5543 tree t = NULL_TREE;
5544 struct cl_target_option *def
5545 = TREE_TARGET_OPTION (target_option_default_node);
5546 struct gcc_options enum_opts_set;
5548 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5550 /* Process each of the options on the chain. */
5551 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5552 opts_set, &enum_opts_set))
5553 return error_mark_node;
5555 /* If the changed options are different from the default, rerun
5556 ix86_option_override_internal, and then save the options away.
5557 The string options are attribute options, and will be undone
5558 when we copy the save structure. */
5559 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5560 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5561 || opts->x_target_flags != def->x_target_flags
5562 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5563 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5564 || enum_opts_set.x_ix86_fpmath)
5566 /* If we are using the default tune= or arch=, undo the string assigned,
5567 and use the default. */
5568 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5570 opts->x_ix86_arch_string
5571 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5573 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5574 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5575 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5576 | OPTION_MASK_ABI_64
5577 | OPTION_MASK_ABI_X32
5578 | OPTION_MASK_CODE16);
5579 opts->x_ix86_isa_flags2 = 0;
5581 else if (!orig_arch_specified)
5582 opts->x_ix86_arch_string = NULL;
5584 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5585 opts->x_ix86_tune_string
5586 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5587 else if (orig_tune_defaulted)
5588 opts->x_ix86_tune_string = NULL;
5590 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5591 if (enum_opts_set.x_ix86_fpmath)
5592 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5594 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5595 bool r = ix86_option_override_internal (false, opts, opts_set);
5596 if (!r)
5598 release_options_strings (option_strings);
5599 return error_mark_node;
5602 /* Add any builtin functions with the new isa if any. */
5603 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5605 /* Save the current options unless we are validating options for
5606 #pragma. */
5607 t = build_target_option_node (opts);
5609 opts->x_ix86_arch_string = orig_arch_string;
5610 opts->x_ix86_tune_string = orig_tune_string;
5611 opts_set->x_ix86_fpmath = orig_fpmath_set;
5613 release_options_strings (option_strings);
5616 return t;
5619 /* Hook to validate attribute((target("string"))). */
5621 static bool
5622 ix86_valid_target_attribute_p (tree fndecl,
5623 tree ARG_UNUSED (name),
5624 tree args,
5625 int ARG_UNUSED (flags))
5627 struct gcc_options func_options;
5628 tree new_target, new_optimize;
5629 bool ret = true;
5631 /* attribute((target("default"))) does nothing, beyond
5632 affecting multi-versioning. */
5633 if (TREE_VALUE (args)
5634 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5635 && TREE_CHAIN (args) == NULL_TREE
5636 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5637 return true;
5639 tree old_optimize = build_optimization_node (&global_options);
5641 /* Get the optimization options of the current function. */
5642 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5644 if (!func_optimize)
5645 func_optimize = old_optimize;
5647 /* Init func_options. */
5648 memset (&func_options, 0, sizeof (func_options));
5649 init_options_struct (&func_options, NULL);
5650 lang_hooks.init_options_struct (&func_options);
5652 cl_optimization_restore (&func_options,
5653 TREE_OPTIMIZATION (func_optimize));
5655 /* Initialize func_options to the default before its target options can
5656 be set. */
5657 cl_target_option_restore (&func_options,
5658 TREE_TARGET_OPTION (target_option_default_node));
5660 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5661 &global_options_set);
5663 new_optimize = build_optimization_node (&func_options);
5665 if (new_target == error_mark_node)
5666 ret = false;
5668 else if (fndecl && new_target)
5670 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5672 if (old_optimize != new_optimize)
5673 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5676 finalize_options_struct (&func_options);
5678 return ret;
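/* Illustrative sketch (not from the original source; names are
   hypothetical): the early return on "default" above exists because that
   spelling only selects the fallback version in function multi-versioning,
   e.g.

     __attribute__ ((target ("default"))) int dispatch_me (void);
     __attribute__ ((target ("avx2"))) int dispatch_me (void);

   Only the second declaration needs a real target-option node built by
   ix86_valid_target_attribute_tree.  */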
5682 /* Hook to determine if one function can safely inline another. */
5684 static bool
5685 ix86_can_inline_p (tree caller, tree callee)
5687 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5688 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5689 if (!callee_tree)
5690 callee_tree = target_option_default_node;
5691 if (!caller_tree)
5692 caller_tree = target_option_default_node;
5693 if (callee_tree == caller_tree)
5694 return true;
5696 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5697 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5698 bool ret = false;
5700 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
5701 function can inline an SSE2 function but an SSE2 function can't inline
5702 an SSE4 function. */
5703 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5704 != callee_opts->x_ix86_isa_flags)
5705 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5706 != callee_opts->x_ix86_isa_flags2))
5707 ret = false;
5709 /* See if we have the same non-isa options. */
5710 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5711 ret = false;
5713 /* See if arch, tune, etc. are the same. */
5714 else if (caller_opts->arch != callee_opts->arch)
5715 ret = false;
5717 else if (caller_opts->tune != callee_opts->tune)
5718 ret = false;
5720 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5721 /* If the callee doesn't use FP expressions, differences in
5722 ix86_fpmath can be ignored. We are called from FEs
5723 for multi-versioning call optimization, so beware of
5724 ipa_fn_summaries not being available. */
5725 && (! ipa_fn_summaries
5726 || ipa_fn_summaries->get
5727 (cgraph_node::get (callee))->fp_expressions))
5728 ret = false;
5730 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5731 ret = false;
5733 else
5734 ret = true;
5736 return ret;
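/* Worked example (illustrative, not part of the original source): under the
   subset rule above, a caller whose target attribute enables AVX2 may
   inline a callee built with the default ISA,

     __attribute__ ((target ("avx2"))) void caller (void);
     void callee_plain (void);

   because callee_plain's ISA flags are a subset of the caller's.  Inlining
   an AVX2 callee into a caller without AVX2 is rejected for the opposite
   reason.  */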
5740 /* Remember the last target of ix86_set_current_function. */
5741 static GTY(()) tree ix86_previous_fndecl;
5743 /* Set targets globals to the default (or current #pragma GCC target
5744 if active). Invalidate ix86_previous_fndecl cache. */
5746 void
5747 ix86_reset_previous_fndecl (void)
5749 tree new_tree = target_option_current_node;
5750 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5751 if (TREE_TARGET_GLOBALS (new_tree))
5752 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5753 else if (new_tree == target_option_default_node)
5754 restore_target_globals (&default_target_globals);
5755 else
5756 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5757 ix86_previous_fndecl = NULL_TREE;
5760 /* Set the func_type field from the function FNDECL. */
5762 static void
5763 ix86_set_func_type (tree fndecl)
5765 if (cfun->machine->func_type == TYPE_UNKNOWN)
5767 if (lookup_attribute ("interrupt",
5768 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5770 if (ix86_function_naked (fndecl))
5771 error_at (DECL_SOURCE_LOCATION (fndecl),
5772 "interrupt and naked attributes are not compatible");
5774 int nargs = 0;
5775 for (tree arg = DECL_ARGUMENTS (fndecl);
5776 arg;
5777 arg = TREE_CHAIN (arg))
5778 nargs++;
5779 cfun->machine->no_caller_saved_registers = true;
5780 cfun->machine->func_type
5781 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5783 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5785 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5786 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5787 sorry ("Only DWARF debug format is supported for interrupt "
5788 "service routine.");
5790 else
5792 cfun->machine->func_type = TYPE_NORMAL;
5793 if (lookup_attribute ("no_caller_saved_registers",
5794 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5795 cfun->machine->no_caller_saved_registers = true;
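/* Usage sketch (illustrative; the type names follow the GCC documentation
   for x86 interrupt handlers and are assumed to be defined by the user, not
   by this file): a one-argument handler is classified as TYPE_INTERRUPT, a
   two-argument handler (frame plus error code) as TYPE_EXCEPTION, and both
   imply no_caller_saved_registers.

     void __attribute__ ((interrupt))
     irq_handler (struct interrupt_frame *frame);

     void __attribute__ ((interrupt))
     fault_handler (struct interrupt_frame *frame, uword_t error_code);
*/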
5800 /* Establish appropriate back-end context for processing the function
5801 FNDECL. The argument might be NULL to indicate processing at top
5802 level, outside of any function scope. */
5803 static void
5804 ix86_set_current_function (tree fndecl)
5806 /* Only change the context if the function changes. This hook is called
5807 several times in the course of compiling a function, and we don't want to
5808 slow things down too much or call target_reinit when it isn't safe. */
5809 if (fndecl == ix86_previous_fndecl)
5811 /* There may be 2 function bodies for the same function FNDECL,
5812 one is extern inline and one isn't. Call ix86_set_func_type
5813 to set the func_type field. */
5814 if (fndecl != NULL_TREE)
5815 ix86_set_func_type (fndecl);
5816 return;
5819 tree old_tree;
5820 if (ix86_previous_fndecl == NULL_TREE)
5821 old_tree = target_option_current_node;
5822 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5823 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5824 else
5825 old_tree = target_option_default_node;
5827 if (fndecl == NULL_TREE)
5829 if (old_tree != target_option_current_node)
5830 ix86_reset_previous_fndecl ();
5831 return;
5834 ix86_set_func_type (fndecl);
5836 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5837 if (new_tree == NULL_TREE)
5838 new_tree = target_option_default_node;
5840 if (old_tree != new_tree)
5842 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5843 if (TREE_TARGET_GLOBALS (new_tree))
5844 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5845 else if (new_tree == target_option_default_node)
5846 restore_target_globals (&default_target_globals);
5847 else
5848 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5850 ix86_previous_fndecl = fndecl;
5852 static bool prev_no_caller_saved_registers;
5854 /* 64-bit MS and SYSV ABI have different set of call used registers.
5855 Avoid expensive re-initialization of init_regs each time we switch
5856 function context. */
5857 if (TARGET_64BIT
5858 && (call_used_regs[SI_REG]
5859 == (cfun->machine->call_abi == MS_ABI)))
5860 reinit_regs ();
5861 /* Need to re-initialize init_regs if caller-saved registers are
5862 changed. */
5863 else if (prev_no_caller_saved_registers
5864 != cfun->machine->no_caller_saved_registers)
5865 reinit_regs ();
5867 if (cfun->machine->func_type != TYPE_NORMAL
5868 || cfun->machine->no_caller_saved_registers)
5870 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
5871 may change processor state. */
5872 const char *isa;
5873 if (TARGET_MPX)
5874 isa = "MPX";
5875 else if (TARGET_SSE)
5876 isa = "SSE";
5877 else if (TARGET_MMX)
5878 isa = "MMX/3Dnow";
5879 else if (TARGET_80387)
5880 isa = "80387";
5881 else
5882 isa = NULL;
5883 if (isa != NULL)
5885 if (cfun->machine->func_type != TYPE_NORMAL)
5886 sorry ("%s instructions aren't allowed in %s service routine",
5887 isa, (cfun->machine->func_type == TYPE_EXCEPTION
5888 ? "exception" : "interrupt"));
5889 else
5890 sorry ("%s instructions aren't allowed in function with "
5891 "no_caller_saved_registers attribute", isa);
5892 /* Don't issue the same error twice. */
5893 cfun->machine->func_type = TYPE_NORMAL;
5894 cfun->machine->no_caller_saved_registers = false;
5898 prev_no_caller_saved_registers
5899 = cfun->machine->no_caller_saved_registers;
5903 /* Return true if this goes in large data/bss. */
5905 static bool
5906 ix86_in_large_data_p (tree exp)
5908 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
5909 return false;
5911 if (exp == NULL_TREE)
5912 return false;
5914 /* Functions are never large data. */
5915 if (TREE_CODE (exp) == FUNCTION_DECL)
5916 return false;
5918 /* Automatic variables are never large data. */
5919 if (VAR_P (exp) && !is_global_var (exp))
5920 return false;
5922 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
5924 const char *section = DECL_SECTION_NAME (exp);
5925 if (strcmp (section, ".ldata") == 0
5926 || strcmp (section, ".lbss") == 0)
5927 return true;
5928 return false;
5930 else
5932 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
5934 /* If this is an incomplete type with size 0, then we can't put it
5935 in data because it might be too big when completed. Also,
5936 int_size_in_bytes returns -1 if the size can vary or is larger than
5937 an integer, in which case it is also safer to assume that it goes in
5938 large data. */
5939 if (size <= 0 || size > ix86_section_threshold)
5940 return true;
5943 return false;
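/* Illustrative example (not part of the original source; object names are
   hypothetical): with -mcmodel=medium, a definition whose size exceeds
   ix86_section_threshold (the -mlarge-data-threshold= value) counts as
   large data, and an explicit ".ldata" or ".lbss" section forces the same
   answer regardless of size:

     static char big_table[1 << 20];                       /+ large data +/
     int counter __attribute__ ((section (".ldata")));     /+ large data +/

   (The "/+ +/" markers stand in for nested comments in this sketch.)  */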
5946 /* i386-specific section flag to mark large sections. */
5947 #define SECTION_LARGE SECTION_MACH_DEP
5949 /* Switch to the appropriate section for output of DECL.
5950 DECL is either a `VAR_DECL' node or a constant of some sort.
5951 RELOC indicates whether forming the initial value of DECL requires
5952 link-time relocations. */
5954 ATTRIBUTE_UNUSED static section *
5955 x86_64_elf_select_section (tree decl, int reloc,
5956 unsigned HOST_WIDE_INT align)
5958 if (ix86_in_large_data_p (decl))
5960 const char *sname = NULL;
5961 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
5962 switch (categorize_decl_for_section (decl, reloc))
5964 case SECCAT_DATA:
5965 sname = ".ldata";
5966 break;
5967 case SECCAT_DATA_REL:
5968 sname = ".ldata.rel";
5969 break;
5970 case SECCAT_DATA_REL_LOCAL:
5971 sname = ".ldata.rel.local";
5972 break;
5973 case SECCAT_DATA_REL_RO:
5974 sname = ".ldata.rel.ro";
5975 break;
5976 case SECCAT_DATA_REL_RO_LOCAL:
5977 sname = ".ldata.rel.ro.local";
5978 break;
5979 case SECCAT_BSS:
5980 sname = ".lbss";
5981 flags |= SECTION_BSS;
5982 break;
5983 case SECCAT_RODATA:
5984 case SECCAT_RODATA_MERGE_STR:
5985 case SECCAT_RODATA_MERGE_STR_INIT:
5986 case SECCAT_RODATA_MERGE_CONST:
5987 sname = ".lrodata";
5988 flags &= ~SECTION_WRITE;
5989 break;
5990 case SECCAT_SRODATA:
5991 case SECCAT_SDATA:
5992 case SECCAT_SBSS:
5993 gcc_unreachable ();
5994 case SECCAT_TEXT:
5995 case SECCAT_TDATA:
5996 case SECCAT_TBSS:
5997 /* We don't split these for the medium model. Place them into
5998 default sections and hope for the best. */
5999 break;
6001 if (sname)
6003 /* We might get called with string constants, but get_named_section
6004 doesn't like them as they are not DECLs. Also, we need to set
6005 flags in that case. */
6006 if (!DECL_P (decl))
6007 return get_section (sname, flags, NULL);
6008 return get_named_section (decl, sname, reloc);
6011 return default_elf_select_section (decl, reloc, align);
6014 /* Select a set of attributes for section NAME based on the properties
6015 of DECL and whether or not RELOC indicates that DECL's initializer
6016 might contain runtime relocations. */
6018 static unsigned int ATTRIBUTE_UNUSED
6019 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6021 unsigned int flags = default_section_type_flags (decl, name, reloc);
6023 if (ix86_in_large_data_p (decl))
6024 flags |= SECTION_LARGE;
6026 if (decl == NULL_TREE
6027 && (strcmp (name, ".ldata.rel.ro") == 0
6028 || strcmp (name, ".ldata.rel.ro.local") == 0))
6029 flags |= SECTION_RELRO;
6031 if (strcmp (name, ".lbss") == 0
6032 || strncmp (name, ".lbss.", 5) == 0
6033 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
6034 flags |= SECTION_BSS;
6036 return flags;
6039 /* Build up a unique section name, expressed as a
6040 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6041 RELOC indicates whether the initial value of EXP requires
6042 link-time relocations. */
6044 static void ATTRIBUTE_UNUSED
6045 x86_64_elf_unique_section (tree decl, int reloc)
6047 if (ix86_in_large_data_p (decl))
6049 const char *prefix = NULL;
6050 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6051 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6053 switch (categorize_decl_for_section (decl, reloc))
6055 case SECCAT_DATA:
6056 case SECCAT_DATA_REL:
6057 case SECCAT_DATA_REL_LOCAL:
6058 case SECCAT_DATA_REL_RO:
6059 case SECCAT_DATA_REL_RO_LOCAL:
6060 prefix = one_only ? ".ld" : ".ldata";
6061 break;
6062 case SECCAT_BSS:
6063 prefix = one_only ? ".lb" : ".lbss";
6064 break;
6065 case SECCAT_RODATA:
6066 case SECCAT_RODATA_MERGE_STR:
6067 case SECCAT_RODATA_MERGE_STR_INIT:
6068 case SECCAT_RODATA_MERGE_CONST:
6069 prefix = one_only ? ".lr" : ".lrodata";
6070 break;
6071 case SECCAT_SRODATA:
6072 case SECCAT_SDATA:
6073 case SECCAT_SBSS:
6074 gcc_unreachable ();
6075 case SECCAT_TEXT:
6076 case SECCAT_TDATA:
6077 case SECCAT_TBSS:
6078 /* We don't split these for the medium model. Place them into
6079 default sections and hope for the best. */
6080 break;
6082 if (prefix)
6084 const char *name, *linkonce;
6085 char *string;
6087 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6088 name = targetm.strip_name_encoding (name);
6090 /* If we're using one_only, then there needs to be a .gnu.linkonce
6091 prefix to the section name. */
6092 linkonce = one_only ? ".gnu.linkonce" : "";
6094 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6096 set_decl_section_name (decl, string);
6097 return;
6100 default_unique_section (decl, reloc);
6103 #ifdef COMMON_ASM_OP
6105 #ifndef LARGECOMM_SECTION_ASM_OP
6106 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6107 #endif
6109 /* This says how to output assembler code to declare an
6110 uninitialized external linkage data object.
6112 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
6113 directive for large objects. */
6114 void
6115 x86_elf_aligned_decl_common (FILE *file, tree decl,
6116 const char *name, unsigned HOST_WIDE_INT size,
6117 int align)
6119 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6120 && size > (unsigned int)ix86_section_threshold)
6122 switch_to_section (get_named_section (decl, ".lbss", 0));
6123 fputs (LARGECOMM_SECTION_ASM_OP, file);
6125 else
6126 fputs (COMMON_ASM_OP, file);
6127 assemble_name (file, name);
6128 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6129 size, align / BITS_PER_UNIT);
6131 #endif
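/* Illustrative output (assumed symbol names and sizes): for a small common
   symbol the function above emits the usual directive, while an object
   larger than the section threshold under -mcmodel=medium is announced in
   .lbss instead, both in the "name,size,alignment-in-bytes" form of the
   fprintf call:

     .comm      small_buf,64,8
     .largecomm big_buf,1048576,32
*/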
6133 /* Utility function for targets to use in implementing
6134 ASM_OUTPUT_ALIGNED_BSS. */
6136 void
6137 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6138 unsigned HOST_WIDE_INT size, int align)
6140 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6141 && size > (unsigned int)ix86_section_threshold)
6142 switch_to_section (get_named_section (decl, ".lbss", 0));
6143 else
6144 switch_to_section (bss_section);
6145 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6146 #ifdef ASM_DECLARE_OBJECT_NAME
6147 last_assemble_variable_decl = decl;
6148 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6149 #else
6150 /* Standard thing is just output label for the object. */
6151 ASM_OUTPUT_LABEL (file, name);
6152 #endif /* ASM_DECLARE_OBJECT_NAME */
6153 ASM_OUTPUT_SKIP (file, size ? size : 1);
6156 /* Decide whether we must probe the stack before any space allocation
6157 on this target. It's essentially TARGET_STACK_PROBE except when
6158 -fstack-check causes the stack to be already probed differently. */
6160 bool
6161 ix86_target_stack_probe (void)
6163 /* Do not probe the stack twice if static stack checking is enabled. */
6164 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6165 return false;
6167 return TARGET_STACK_PROBE;
6170 /* Decide whether we can make a sibling call to a function. DECL is the
6171 declaration of the function being targeted by the call and EXP is the
6172 CALL_EXPR representing the call. */
6174 static bool
6175 ix86_function_ok_for_sibcall (tree decl, tree exp)
6177 tree type, decl_or_type;
6178 rtx a, b;
6179 bool bind_global = decl && !targetm.binds_local_p (decl);
6181 if (ix86_function_naked (current_function_decl))
6182 return false;
6184 /* Sibling call isn't OK if there are no caller-saved registers
6185 since all registers must be preserved before return. */
6186 if (cfun->machine->no_caller_saved_registers)
6187 return false;
6189 /* If we are generating position-independent code, we cannot sibcall
6190 optimize direct calls to global functions, as the PLT requires
6191 %ebx be live. (Darwin does not have a PLT.) */
6192 if (!TARGET_MACHO
6193 && !TARGET_64BIT
6194 && flag_pic
6195 && flag_plt
6196 && bind_global)
6197 return false;
6199 /* If we need to align the outgoing stack, then sibcalling would
6200 unalign the stack, which may break the called function. */
6201 if (ix86_minimum_incoming_stack_boundary (true)
6202 < PREFERRED_STACK_BOUNDARY)
6203 return false;
6205 if (decl)
6207 decl_or_type = decl;
6208 type = TREE_TYPE (decl);
6210 else
6212 /* We're looking at the CALL_EXPR, we need the type of the function. */
6213 type = CALL_EXPR_FN (exp); /* pointer expression */
6214 type = TREE_TYPE (type); /* pointer type */
6215 type = TREE_TYPE (type); /* function type */
6216 decl_or_type = type;
6219 /* Check that the return value locations are the same. Like
6220 if we are returning floats on the 80387 register stack, we cannot
6221 make a sibcall from a function that doesn't return a float to a
6222 function that does or, conversely, from a function that does return
6223 a float to a function that doesn't; the necessary stack adjustment
6224 would not be executed. This is also the place we notice
6225 differences in the return value ABI. Note that it is ok for one
6226 of the functions to have void return type as long as the return
6227 value of the other is passed in a register. */
6228 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6229 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6230 cfun->decl, false);
6231 if (STACK_REG_P (a) || STACK_REG_P (b))
6233 if (!rtx_equal_p (a, b))
6234 return false;
6236 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6238 else if (!rtx_equal_p (a, b))
6239 return false;
6241 if (TARGET_64BIT)
6243 /* The SYSV ABI has more call-clobbered registers;
6244 disallow sibcalls from MS to SYSV. */
6245 if (cfun->machine->call_abi == MS_ABI
6246 && ix86_function_type_abi (type) == SYSV_ABI)
6247 return false;
6249 else
6251 /* If this call is indirect, we'll need to be able to use a
6252 call-clobbered register for the address of the target function.
6253 Make sure that all such registers are not used for passing
6254 parameters. Note that DLLIMPORT functions and calls to global
6255 functions via the GOT slot are indirect. */
6256 if (!decl
6257 || (bind_global && flag_pic && !flag_plt)
6258 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
6260 /* Check if regparm >= 3 since arg_reg_available is set to
6261 false if regparm == 0. If regparm is 1 or 2, there is
6262 always a call-clobbered register available.
6264 ??? The symbol indirect call doesn't need a call-clobbered
6265 register. But we don't know if this is a symbol indirect
6266 call or not here. */
6267 if (ix86_function_regparm (type, NULL) >= 3
6268 && !cfun->machine->arg_reg_available)
6269 return false;
6273 /* Otherwise okay. That also includes certain types of indirect calls. */
6274 return true;
6277 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6278 and "sseregparm" calling convention attributes;
6279 arguments as in struct attribute_spec.handler. */
6281 static tree
6282 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6283 bool *no_add_attrs)
6285 if (TREE_CODE (*node) != FUNCTION_TYPE
6286 && TREE_CODE (*node) != METHOD_TYPE
6287 && TREE_CODE (*node) != FIELD_DECL
6288 && TREE_CODE (*node) != TYPE_DECL)
6290 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6291 name);
6292 *no_add_attrs = true;
6293 return NULL_TREE;
6296 /* Can combine regparm with all attributes but fastcall, and thiscall. */
6297 if (is_attribute_p ("regparm", name))
6299 tree cst;
6301 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6303 error ("fastcall and regparm attributes are not compatible");
6306 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6308 error ("regparam and thiscall attributes are not compatible");
6311 cst = TREE_VALUE (args);
6312 if (TREE_CODE (cst) != INTEGER_CST)
6314 warning (OPT_Wattributes,
6315 "%qE attribute requires an integer constant argument",
6316 name);
6317 *no_add_attrs = true;
6319 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6321 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6322 name, REGPARM_MAX);
6323 *no_add_attrs = true;
6326 return NULL_TREE;
6329 if (TARGET_64BIT)
6331 /* Do not warn when emulating the MS ABI. */
6332 if ((TREE_CODE (*node) != FUNCTION_TYPE
6333 && TREE_CODE (*node) != METHOD_TYPE)
6334 || ix86_function_type_abi (*node) != MS_ABI)
6335 warning (OPT_Wattributes, "%qE attribute ignored",
6336 name);
6337 *no_add_attrs = true;
6338 return NULL_TREE;
6341 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6342 if (is_attribute_p ("fastcall", name))
6344 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6346 error ("fastcall and cdecl attributes are not compatible");
6348 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6350 error ("fastcall and stdcall attributes are not compatible");
6352 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6354 error ("fastcall and regparm attributes are not compatible");
6356 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6358 error ("fastcall and thiscall attributes are not compatible");
6362 /* Can combine stdcall with fastcall (redundant), regparm and
6363 sseregparm. */
6364 else if (is_attribute_p ("stdcall", name))
6366 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6368 error ("stdcall and cdecl attributes are not compatible");
6370 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6372 error ("stdcall and fastcall attributes are not compatible");
6374 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6376 error ("stdcall and thiscall attributes are not compatible");
6380 /* Can combine cdecl with regparm and sseregparm. */
6381 else if (is_attribute_p ("cdecl", name))
6383 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6385 error ("stdcall and cdecl attributes are not compatible");
6387 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6389 error ("fastcall and cdecl attributes are not compatible");
6391 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6393 error ("cdecl and thiscall attributes are not compatible");
6396 else if (is_attribute_p ("thiscall", name))
6398 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6399 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6400 name);
6401 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6403 error ("stdcall and thiscall attributes are not compatible");
6405 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6407 error ("fastcall and thiscall attributes are not compatible");
6409 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6411 error ("cdecl and thiscall attributes are not compatible");
6415 /* Can combine sseregparm with all attributes. */
6417 return NULL_TREE;
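/* Illustrative declarations (hypothetical names, 32-bit only): regparm
   passes the first N integer arguments in registers, fastcall fixes that
   set to ECX/EDX, and the checks above reject contradictory combinations.

     int __attribute__ ((regparm (3))) f3 (int a, int b, int c);
     int __attribute__ ((fastcall)) g (int a, int b);
     int __attribute__ ((fastcall, regparm (2))) bad (int a);

   The first two are accepted; the last one triggers the "fastcall and
   regparm attributes are not compatible" error.  */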
6420 /* The transactional memory builtins are implicitly regparm or fastcall
6421 depending on the ABI. Override the generic do-nothing attribute that
6422 these builtins were declared with, and replace it with one of the two
6423 attributes that we expect elsewhere. */
6425 static tree
6426 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6427 int flags, bool *no_add_attrs)
6429 tree alt;
6431 /* In no case do we want to add the placeholder attribute. */
6432 *no_add_attrs = true;
6434 /* The 64-bit ABI is unchanged for transactional memory. */
6435 if (TARGET_64BIT)
6436 return NULL_TREE;
6438 /* ??? Is there a better way to validate 32-bit windows? We have
6439 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6440 if (CHECK_STACK_LIMIT > 0)
6441 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6442 else
6444 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6445 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6447 decl_attributes (node, alt, flags);
6449 return NULL_TREE;
6452 /* This function determines from TYPE the calling-convention. */
6454 unsigned int
6455 ix86_get_callcvt (const_tree type)
6457 unsigned int ret = 0;
6458 bool is_stdarg;
6459 tree attrs;
6461 if (TARGET_64BIT)
6462 return IX86_CALLCVT_CDECL;
6464 attrs = TYPE_ATTRIBUTES (type);
6465 if (attrs != NULL_TREE)
6467 if (lookup_attribute ("cdecl", attrs))
6468 ret |= IX86_CALLCVT_CDECL;
6469 else if (lookup_attribute ("stdcall", attrs))
6470 ret |= IX86_CALLCVT_STDCALL;
6471 else if (lookup_attribute ("fastcall", attrs))
6472 ret |= IX86_CALLCVT_FASTCALL;
6473 else if (lookup_attribute ("thiscall", attrs))
6474 ret |= IX86_CALLCVT_THISCALL;
6476 /* Regparm isn't allowed for thiscall and fastcall. */
6477 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6479 if (lookup_attribute ("regparm", attrs))
6480 ret |= IX86_CALLCVT_REGPARM;
6481 if (lookup_attribute ("sseregparm", attrs))
6482 ret |= IX86_CALLCVT_SSEREGPARM;
6485 if (IX86_BASE_CALLCVT(ret) != 0)
6486 return ret;
6489 is_stdarg = stdarg_p (type);
6490 if (TARGET_RTD && !is_stdarg)
6491 return IX86_CALLCVT_STDCALL | ret;
6493 if (ret != 0
6494 || is_stdarg
6495 || TREE_CODE (type) != METHOD_TYPE
6496 || ix86_function_type_abi (type) != MS_ABI)
6497 return IX86_CALLCVT_CDECL | ret;
6499 return IX86_CALLCVT_THISCALL;
6502 /* Return 0 if the attributes for two types are incompatible, 1 if they
6503 are compatible, and 2 if they are nearly compatible (which causes a
6504 warning to be generated). */
6506 static int
6507 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6509 unsigned int ccvt1, ccvt2;
6511 if (TREE_CODE (type1) != FUNCTION_TYPE
6512 && TREE_CODE (type1) != METHOD_TYPE)
6513 return 1;
6515 ccvt1 = ix86_get_callcvt (type1);
6516 ccvt2 = ix86_get_callcvt (type2);
6517 if (ccvt1 != ccvt2)
6518 return 0;
6519 if (ix86_function_regparm (type1, NULL)
6520 != ix86_function_regparm (type2, NULL))
6521 return 0;
6523 return 1;
6526 /* Return the regparm value for a function with the indicated TYPE and DECL.
6527 DECL may be NULL when calling function indirectly
6528 or considering a libcall. */
6530 static int
6531 ix86_function_regparm (const_tree type, const_tree decl)
6533 tree attr;
6534 int regparm;
6535 unsigned int ccvt;
6537 if (TARGET_64BIT)
6538 return (ix86_function_type_abi (type) == SYSV_ABI
6539 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6540 ccvt = ix86_get_callcvt (type);
6541 regparm = ix86_regparm;
6543 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6545 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6546 if (attr)
6548 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6549 return regparm;
6552 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6553 return 2;
6554 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6555 return 1;
6557 /* Use register calling convention for local functions when possible. */
6558 if (decl
6559 && TREE_CODE (decl) == FUNCTION_DECL)
6561 cgraph_node *target = cgraph_node::get (decl);
6562 if (target)
6563 target = target->function_symbol ();
6565 /* Caller and callee must agree on the calling convention, so
6566 checking just the current function's optimize setting here would
6567 mean that with __attribute__((optimize (...))) the caller could use
6568 the regparm convention and the callee not, or vice versa. Instead
6569 look at whether the callee is optimized or not. */
6570 if (target && opt_for_fn (target->decl, optimize)
6571 && !(profile_flag && !flag_fentry))
6573 cgraph_local_info *i = &target->local;
6574 if (i && i->local && i->can_change_signature)
6576 int local_regparm, globals = 0, regno;
6578 /* Make sure no regparm register is taken by a
6579 fixed register variable. */
6580 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6581 local_regparm++)
6582 if (fixed_regs[local_regparm])
6583 break;
6585 /* We don't want to use regparm(3) for nested functions as
6586 these use a static chain pointer in the third argument. */
6587 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6588 local_regparm = 2;
6590 /* Save a register for the split stack. */
6591 if (flag_split_stack)
6593 if (local_regparm == 3)
6594 local_regparm = 2;
6595 else if (local_regparm == 2
6596 && DECL_STATIC_CHAIN (target->decl))
6597 local_regparm = 1;
6600 /* Each fixed register usage increases register pressure,
6601 so fewer registers should be used for argument passing.
6602 This functionality can be overridden by an explicit
6603 regparm value. */
6604 for (regno = AX_REG; regno <= DI_REG; regno++)
6605 if (fixed_regs[regno])
6606 globals++;
6608 local_regparm
6609 = globals < local_regparm ? local_regparm - globals : 0;
6611 if (local_regparm > regparm)
6612 regparm = local_regparm;
6617 return regparm;
6620 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6621 DFmode (2) arguments in SSE registers for a function with the
6622 indicated TYPE and DECL. DECL may be NULL when calling function
6623 indirectly or considering a libcall. Return -1 if any FP parameter
6624 should be rejected by error. This is used in situations where we imply
6625 the SSE calling convention but the function is called from another
6626 function with SSE disabled. Otherwise return 0. */
6628 static int
6629 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6631 gcc_assert (!TARGET_64BIT);
6633 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6634 by the sseregparm attribute. */
6635 if (TARGET_SSEREGPARM
6636 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6638 if (!TARGET_SSE)
6640 if (warn)
6642 if (decl)
6643 error ("calling %qD with attribute sseregparm without "
6644 "SSE/SSE2 enabled", decl);
6645 else
6646 error ("calling %qT with attribute sseregparm without "
6647 "SSE/SSE2 enabled", type);
6649 return 0;
6652 return 2;
6655 if (!decl)
6656 return 0;
6658 cgraph_node *target = cgraph_node::get (decl);
6659 if (target)
6660 target = target->function_symbol ();
6662 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6663 (and DFmode for SSE2) arguments in SSE registers. */
6664 if (target
6665 /* TARGET_SSE_MATH */
6666 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6667 && opt_for_fn (target->decl, optimize)
6668 && !(profile_flag && !flag_fentry))
6670 cgraph_local_info *i = &target->local;
6671 if (i && i->local && i->can_change_signature)
6673 /* Refuse to produce wrong code when local function with SSE enabled
6674 is called from SSE disabled function.
6675 FIXME: We need a way to detect these cases cross-ltrans partition
6676 and avoid using SSE calling conventions on local functions called
6677 from function with SSE disabled. For now at least delay the
6678 warning until we know we are going to produce wrong code.
6679 See PR66047 */
6680 if (!TARGET_SSE && warn)
6681 return -1;
6682 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6683 ->x_ix86_isa_flags) ? 2 : 1;
6687 return 0;
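/* Usage sketch (illustrative, 32-bit, hypothetical function name): with the
   sseregparm attribute the scalar float arguments below are expected in
   SSE registers instead of on the stack.

     float __attribute__ ((sseregparm)) dot2 (float a, float b);

   Compiled with -m32 -msse2 this is expected to receive A and B in %xmm0
   and %xmm1; compiled with -m32 -mno-sse it is rejected with the
   "attribute sseregparm without SSE/SSE2 enabled" error issued above.  */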
6690 /* Return true if EAX is live at the start of the function. Used by
6691 ix86_expand_prologue to determine if we need special help before
6692 calling allocate_stack_worker. */
6694 static bool
6695 ix86_eax_live_at_start_p (void)
6697 /* Cheat. Don't bother working forward from ix86_function_regparm
6698 to the function type to whether an actual argument is located in
6699 eax. Instead just look at cfg info, which is still close enough
6700 to correct at this point. This gives false positives for broken
6701 functions that might use uninitialized data that happens to be
6702 allocated in eax, but who cares? */
6703 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6706 static bool
6707 ix86_keep_aggregate_return_pointer (tree fntype)
6709 tree attr;
6711 if (!TARGET_64BIT)
6713 attr = lookup_attribute ("callee_pop_aggregate_return",
6714 TYPE_ATTRIBUTES (fntype));
6715 if (attr)
6716 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6718 /* For 32-bit MS-ABI the default is to keep aggregate
6719 return pointer. */
6720 if (ix86_function_type_abi (fntype) == MS_ABI)
6721 return true;
6723 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6726 /* Value is the number of bytes of arguments automatically
6727 popped when returning from a subroutine call.
6728 FUNDECL is the declaration node of the function (as a tree),
6729 FUNTYPE is the data type of the function (as a tree),
6730 or for a library call it is an identifier node for the subroutine name.
6731 SIZE is the number of bytes of arguments passed on the stack.
6733 On the 80386, the RTD insn may be used to pop them if the number
6734 of args is fixed, but if the number is variable then the caller
6735 must pop them all. RTD can't be used for library calls now
6736 because the library is compiled with the Unix compiler.
6737 Use of RTD is a selectable option, since it is incompatible with
6738 standard Unix calling sequences. If the option is not selected,
6739 the caller must always pop the args.
6741 The attribute stdcall is equivalent to RTD on a per module basis. */
6743 static poly_int64
6744 ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
6746 unsigned int ccvt;
6748 /* None of the 64-bit ABIs pop arguments. */
6749 if (TARGET_64BIT)
6750 return 0;
6752 ccvt = ix86_get_callcvt (funtype);
6754 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6755 | IX86_CALLCVT_THISCALL)) != 0
6756 && ! stdarg_p (funtype))
6757 return size;
6759 /* Lose any fake structure return argument if it is passed on the stack. */
6760 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6761 && !ix86_keep_aggregate_return_pointer (funtype))
6763 int nregs = ix86_function_regparm (funtype, fundecl);
6764 if (nregs == 0)
6765 return GET_MODE_SIZE (Pmode);
6768 return 0;
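/* Worked example (illustrative, 32-bit, hypothetical name): for a stdcall
   function the size of the stack arguments is returned above, so the
   callee pops them itself on return.

     int __attribute__ ((stdcall)) add2 (int a, int b);

   Here SIZE is 8 and the callee returns with "ret $8"; a cdecl version of
   the same function would return 0 here and leave the cleanup to the
   caller.  */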
6771 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6773 static bool
6774 ix86_legitimate_combined_insn (rtx_insn *insn)
6776 int i;
6778 /* Check operand constraints in case hard registers were propagated
6779 into insn pattern. This check prevents combine pass from
6780 generating insn patterns with invalid hard register operands.
6781 These invalid insns can eventually confuse reload to error out
6782 with a spill failure. See also PRs 46829 and 46843. */
6784 gcc_assert (INSN_CODE (insn) >= 0);
6786 extract_insn (insn);
6787 preprocess_constraints (insn);
6789 int n_operands = recog_data.n_operands;
6790 int n_alternatives = recog_data.n_alternatives;
6791 for (i = 0; i < n_operands; i++)
6793 rtx op = recog_data.operand[i];
6794 machine_mode mode = GET_MODE (op);
6795 const operand_alternative *op_alt;
6796 int offset = 0;
6797 bool win;
6798 int j;
6800 /* A unary operator may be accepted by the predicate, but it
6801 is irrelevant for matching constraints. */
6802 if (UNARY_P (op))
6803 op = XEXP (op, 0);
6805 if (SUBREG_P (op))
6807 if (REG_P (SUBREG_REG (op))
6808 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6809 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6810 GET_MODE (SUBREG_REG (op)),
6811 SUBREG_BYTE (op),
6812 GET_MODE (op));
6813 op = SUBREG_REG (op);
6816 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6817 continue;
6819 op_alt = recog_op_alt;
6821 /* Operand has no constraints, anything is OK. */
6822 win = !n_alternatives;
6824 alternative_mask preferred = get_preferred_alternatives (insn);
6825 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6827 if (!TEST_BIT (preferred, j))
6828 continue;
6829 if (op_alt[i].anything_ok
6830 || (op_alt[i].matches != -1
6831 && operands_match_p
6832 (recog_data.operand[i],
6833 recog_data.operand[op_alt[i].matches]))
6834 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6836 win = true;
6837 break;
6841 if (!win)
6842 return false;
6845 return true;
6848 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
6850 static unsigned HOST_WIDE_INT
6851 ix86_asan_shadow_offset (void)
6853 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6854 : HOST_WIDE_INT_C (0x7fff8000))
6855 : (HOST_WIDE_INT_1 << 29);
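/* Illustrative use (assuming the default ASan shadow scale of 3): the
   sanitizer maps an application address to its shadow byte as

     shadow = (addr >> 3) + ix86_asan_shadow_offset ()

   i.e. 0x7fff8000 is added on LP64 targets other than Mach-O, 1 << 44 on
   LP64 Mach-O, and 1 << 29 in 32-bit mode.  */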
6858 /* Argument support functions. */
6860 /* Return true when register may be used to pass function parameters. */
6861 bool
6862 ix86_function_arg_regno_p (int regno)
6864 int i;
6865 enum calling_abi call_abi;
6866 const int *parm_regs;
6868 if (TARGET_MPX && BND_REGNO_P (regno))
6869 return true;
6871 if (!TARGET_64BIT)
6873 if (TARGET_MACHO)
6874 return (regno < REGPARM_MAX
6875 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
6876 else
6877 return (regno < REGPARM_MAX
6878 || (TARGET_MMX && MMX_REGNO_P (regno)
6879 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
6880 || (TARGET_SSE && SSE_REGNO_P (regno)
6881 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
6884 if (TARGET_SSE && SSE_REGNO_P (regno)
6885 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
6886 return true;
6888 /* TODO: The function should depend on current function ABI but
6889 builtins.c would need updating then. Therefore we use the
6890 default ABI. */
6891 call_abi = ix86_cfun_abi ();
6893 /* RAX is used as hidden argument to va_arg functions. */
6894 if (call_abi == SYSV_ABI && regno == AX_REG)
6895 return true;
6897 if (call_abi == MS_ABI)
6898 parm_regs = x86_64_ms_abi_int_parameter_registers;
6899 else
6900 parm_regs = x86_64_int_parameter_registers;
6902 for (i = 0; i < (call_abi == MS_ABI
6903 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
6904 if (regno == parm_regs[i])
6905 return true;
6906 return false;
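/* For reference (standard ABI register assignment, summarized here rather
   than read from the parameter-register tables above): a 64-bit SYSV call
   such as

     res = f (a, b, c, d, e, g);

   passes the six integer arguments in RDI, RSI, RDX, RCX, R8 and R9, while
   the 64-bit MS ABI uses only RCX, RDX, R8 and R9; RAX additionally
   carries the number of vector registers used when calling SYSV varargs
   functions, which is why it is accepted above.  */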
6909 /* Return true if we do not know how to pass TYPE solely in registers. */
6911 static bool
6912 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
6914 if (must_pass_in_stack_var_size_or_pad (mode, type))
6915 return true;
6917 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
6918 The layout_type routine is crafty and tries to trick us into passing
6919 currently unsupported vector types on the stack by using TImode. */
6920 return (!TARGET_64BIT && mode == TImode
6921 && type && TREE_CODE (type) != VECTOR_TYPE);
6924 /* Return the size, in bytes, of the area reserved for arguments passed
6925 in registers for the function represented by FNDECL, depending on the
6926 ABI format used. */
6927 int
6928 ix86_reg_parm_stack_space (const_tree fndecl)
6930 enum calling_abi call_abi = SYSV_ABI;
6931 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
6932 call_abi = ix86_function_abi (fndecl);
6933 else
6934 call_abi = ix86_function_type_abi (fndecl);
6935 if (TARGET_64BIT && call_abi == MS_ABI)
6936 return 32;
6937 return 0;
6940 /* We add this as a workaround in order to use libc_has_function
6941 hook in i386.md. */
6942 bool
6943 ix86_libc_has_function (enum function_class fn_class)
6945 return targetm.libc_has_function (fn_class);
6948 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE,
6949 specifying the call ABI used. */
6950 enum calling_abi
6951 ix86_function_type_abi (const_tree fntype)
6953 enum calling_abi abi = ix86_abi;
6955 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
6956 return abi;
6958 if (abi == SYSV_ABI
6959 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
6961 static int warned;
6962 if (TARGET_X32 && !warned)
6964 error ("X32 does not support ms_abi attribute");
6965 warned = 1;
6968 abi = MS_ABI;
6970 else if (abi == MS_ABI
6971 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
6972 abi = SYSV_ABI;
6974 return abi;
6977 static enum calling_abi
6978 ix86_function_abi (const_tree fndecl)
6980 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
6983 /* Return SYSV_ABI or MS_ABI, depending on CFUN,
6984 specifying the call ABI used. */
6985 enum calling_abi
6986 ix86_cfun_abi (void)
6988 return cfun ? cfun->machine->call_abi : ix86_abi;
6991 static bool
6992 ix86_function_ms_hook_prologue (const_tree fn)
6994 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
6996 if (decl_function_context (fn) != NULL_TREE)
6997 error_at (DECL_SOURCE_LOCATION (fn),
6998 "ms_hook_prologue is not compatible with nested function");
6999 else
7000 return true;
7002 return false;
7005 static bool
7006 ix86_function_naked (const_tree fn)
7008 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7009 return true;
7011 return false;
7014 /* Write the extra assembler code needed to declare a function properly. */
7016 void
7017 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7018 tree decl)
7020 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7022 if (is_ms_hook)
7024 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7025 unsigned int filler_cc = 0xcccccccc;
7027 for (i = 0; i < filler_count; i += 4)
7028 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7031 #ifdef SUBTARGET_ASM_UNWIND_INIT
7032 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7033 #endif
7035 ASM_OUTPUT_LABEL (asm_out_file, fname);
7037 /* Output magic byte marker, if hot-patch attribute is set. */
7038 if (is_ms_hook)
7040 if (TARGET_64BIT)
7042 /* leaq [%rsp + 0], %rsp */
7043 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7044 asm_out_file);
7046 else
7048 /* movl.s %edi, %edi
7049 push %ebp
7050 movl.s %esp, %ebp */
7051 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
7056 /* Implementation of the call ABI switching target hook. The call
7057 register sets specific to FNDECL are selected. See also
7058 ix86_conditional_register_usage for more details. */
7059 void
7060 ix86_call_abi_override (const_tree fndecl)
7062 cfun->machine->call_abi = ix86_function_abi (fndecl);
7065 /* Return 1 if pseudo register should be created and used to hold
7066 GOT address for PIC code. */
7067 bool
7068 ix86_use_pseudo_pic_reg (void)
7070 if ((TARGET_64BIT
7071 && (ix86_cmodel == CM_SMALL_PIC
7072 || TARGET_PECOFF))
7073 || !flag_pic)
7074 return false;
7075 return true;
7078 /* Initialize large model PIC register. */
7080 static void
7081 ix86_init_large_pic_reg (unsigned int tmp_regno)
7083 rtx_code_label *label;
7084 rtx tmp_reg;
7086 gcc_assert (Pmode == DImode);
7087 label = gen_label_rtx ();
7088 emit_label (label);
7089 LABEL_PRESERVE_P (label) = 1;
7090 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7091 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7092 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7093 label));
7094 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7095 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7096 pic_offset_table_rtx, tmp_reg));
7097 const char *name = LABEL_NAME (label);
7098 PUT_CODE (label, NOTE);
7099 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7100 NOTE_DELETED_LABEL_NAME (label) = name;
7103 /* Create and initialize PIC register if required. */
7104 static void
7105 ix86_init_pic_reg (void)
7107 edge entry_edge;
7108 rtx_insn *seq;
7110 if (!ix86_use_pseudo_pic_reg ())
7111 return;
7113 start_sequence ();
7115 if (TARGET_64BIT)
7117 if (ix86_cmodel == CM_LARGE_PIC)
7118 ix86_init_large_pic_reg (R11_REG);
7119 else
7120 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7122 else
7124 /* If there is a future mcount call in the function, it is more profitable
7125 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7126 rtx reg = crtl->profile
7127 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7128 : pic_offset_table_rtx;
7129 rtx_insn *insn = emit_insn (gen_set_got (reg));
7130 RTX_FRAME_RELATED_P (insn) = 1;
7131 if (crtl->profile)
7132 emit_move_insn (pic_offset_table_rtx, reg);
7133 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7136 seq = get_insns ();
7137 end_sequence ();
7139 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7140 insert_insn_on_edge (seq, entry_edge);
7141 commit_one_edge_insertion (entry_edge);
7144 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7145 for a call to a function whose data type is FNTYPE.
7146 For a library call, FNTYPE is 0. */
7148 void
7149 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7150 tree fntype, /* tree ptr for function decl */
7151 rtx libname, /* SYMBOL_REF of library name or 0 */
7152 tree fndecl,
7153 int caller)
7155 struct cgraph_local_info *i = NULL;
7156 struct cgraph_node *target = NULL;
7158 memset (cum, 0, sizeof (*cum));
7160 if (fndecl)
7162 target = cgraph_node::get (fndecl);
7163 if (target)
7165 target = target->function_symbol ();
7166 i = cgraph_node::local_info (target->decl);
7167 cum->call_abi = ix86_function_abi (target->decl);
7169 else
7170 cum->call_abi = ix86_function_abi (fndecl);
7172 else
7173 cum->call_abi = ix86_function_type_abi (fntype);
7175 cum->caller = caller;
7177 /* Set up the number of registers to use for passing arguments. */
7178 cum->nregs = ix86_regparm;
7179 if (TARGET_64BIT)
7181 cum->nregs = (cum->call_abi == SYSV_ABI
7182 ? X86_64_REGPARM_MAX
7183 : X86_64_MS_REGPARM_MAX);
7185 if (TARGET_SSE)
7187 cum->sse_nregs = SSE_REGPARM_MAX;
7188 if (TARGET_64BIT)
7190 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7191 ? X86_64_SSE_REGPARM_MAX
7192 : X86_64_MS_SSE_REGPARM_MAX);
7195 if (TARGET_MMX)
7196 cum->mmx_nregs = MMX_REGPARM_MAX;
7197 cum->warn_avx512f = true;
7198 cum->warn_avx = true;
7199 cum->warn_sse = true;
7200 cum->warn_mmx = true;
7202 /* Because the type might mismatch between caller and callee, we need to
7203 use the actual type of the function for local calls.
7204 FIXME: cgraph_analyze can be told to actually record whether a function
7205 uses va_start, so for local functions maybe_vaarg could be made more
7206 aggressive, helping K&R code.
7207 FIXME: once the typesystem is fixed, we won't need this code anymore. */
7208 if (i && i->local && i->can_change_signature)
7209 fntype = TREE_TYPE (target->decl);
7210 cum->stdarg = stdarg_p (fntype);
7211 cum->maybe_vaarg = (fntype
7212 ? (!prototype_p (fntype) || stdarg_p (fntype))
7213 : !libname);
7215 cum->bnd_regno = FIRST_BND_REG;
7216 cum->bnds_in_bt = 0;
7217 cum->force_bnd_pass = 0;
7218 cum->decl = fndecl;
7220 cum->warn_empty = !warn_abi || cum->stdarg;
7221 if (!cum->warn_empty && fntype)
7223 function_args_iterator iter;
7224 tree argtype;
7225 bool seen_empty_type = false;
7226 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7228 if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7229 break;
7230 if (TYPE_EMPTY_P (argtype))
7231 seen_empty_type = true;
7232 else if (seen_empty_type)
7234 cum->warn_empty = true;
7235 break;
7240 if (!TARGET_64BIT)
7242 /* If there are variable arguments, then we won't pass anything
7243 in registers in 32-bit mode. */
7244 if (stdarg_p (fntype))
7246 cum->nregs = 0;
7247 /* Since in 32-bit mode variable arguments are always passed on
7248 the stack, there is a scratch register available for an indirect
7249 sibcall. */
7250 cfun->machine->arg_reg_available = true;
7251 cum->sse_nregs = 0;
7252 cum->mmx_nregs = 0;
7253 cum->warn_avx512f = false;
7254 cum->warn_avx = false;
7255 cum->warn_sse = false;
7256 cum->warn_mmx = false;
7257 return;
7260 /* Use ecx and edx registers if function has fastcall attribute,
7261 else look for regparm information. */
7262 if (fntype)
7264 unsigned int ccvt = ix86_get_callcvt (fntype);
7265 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7267 cum->nregs = 1;
7268 cum->fastcall = 1; /* Same first register as in fastcall. */
7270 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7272 cum->nregs = 2;
7273 cum->fastcall = 1;
7275 else
7276 cum->nregs = ix86_function_regparm (fntype, fndecl);
7279 /* Set up the number of SSE registers used for passing SFmode
7280 and DFmode arguments. Warn for mismatching ABI. */
7281 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7284 cfun->machine->arg_reg_available = (cum->nregs > 0);
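/* Illustrative note (not part of the original source): on 64-bit targets the
   counts set above amount to six integer registers (RDI, RSI, RDX, RCX, R8,
   R9) and eight SSE registers (XMM0-XMM7) for the SysV ABI, versus four of
   each (RCX, RDX, R8, R9 and XMM0-XMM3) for the MS ABI.  In 32-bit mode the
   integer count instead comes from -mregparm or from the fastcall/thiscall
   handling just above.  */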
7287 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7288 But in the case of vector types, it is some vector mode.
7290 When we have only some of our vector isa extensions enabled, then there
7291 are some modes for which vector_mode_supported_p is false. For these
7292 modes, the generic vector support in gcc will choose some non-vector mode
7293 in order to implement the type. By computing the natural mode, we'll
7294 select the proper ABI location for the operand and not depend on whatever
7295 the middle-end decides to do with these vector types.
7297 The middle-end can't deal with vector types larger than 16 bytes. In this
7298 case, we return the original mode and warn about the ABI change if CUM isn't
7299 NULL.
7301 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
7302 available for the function return value. */
7304 static machine_mode
7305 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7306 bool in_return)
7308 machine_mode mode = TYPE_MODE (type);
7310 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7312 HOST_WIDE_INT size = int_size_in_bytes (type);
7313 if ((size == 8 || size == 16 || size == 32 || size == 64)
7314 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7315 && TYPE_VECTOR_SUBPARTS (type) > 1)
7317 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7319 /* There are no XFmode vector modes. */
7320 if (innermode == XFmode)
7321 return mode;
7323 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7324 mode = MIN_MODE_VECTOR_FLOAT;
7325 else
7326 mode = MIN_MODE_VECTOR_INT;
7328 /* Get the mode which has this inner mode and number of units. */
7329 FOR_EACH_MODE_FROM (mode, mode)
7330 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7331 && GET_MODE_INNER (mode) == innermode)
7333 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7335 static bool warnedavx512f;
7336 static bool warnedavx512f_ret;
7338 if (cum && cum->warn_avx512f && !warnedavx512f)
7340 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7341 "without AVX512F enabled changes the ABI"))
7342 warnedavx512f = true;
7344 else if (in_return && !warnedavx512f_ret)
7346 if (warning (OPT_Wpsabi, "AVX512F vector return "
7347 "without AVX512F enabled changes the ABI"))
7348 warnedavx512f_ret = true;
7351 return TYPE_MODE (type);
7353 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7355 static bool warnedavx;
7356 static bool warnedavx_ret;
7358 if (cum && cum->warn_avx && !warnedavx)
7360 if (warning (OPT_Wpsabi, "AVX vector argument "
7361 "without AVX enabled changes the ABI"))
7362 warnedavx = true;
7364 else if (in_return && !warnedavx_ret)
7366 if (warning (OPT_Wpsabi, "AVX vector return "
7367 "without AVX enabled changes the ABI"))
7368 warnedavx_ret = true;
7371 return TYPE_MODE (type);
7373 else if (((size == 8 && TARGET_64BIT) || size == 16)
7374 && !TARGET_SSE
7375 && !TARGET_IAMCU)
7377 static bool warnedsse;
7378 static bool warnedsse_ret;
7380 if (cum && cum->warn_sse && !warnedsse)
7382 if (warning (OPT_Wpsabi, "SSE vector argument "
7383 "without SSE enabled changes the ABI"))
7384 warnedsse = true;
7386 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7388 if (warning (OPT_Wpsabi, "SSE vector return "
7389 "without SSE enabled changes the ABI"))
7390 warnedsse_ret = true;
7393 else if ((size == 8 && !TARGET_64BIT)
7394 && (!cfun
7395 || cfun->machine->func_type == TYPE_NORMAL)
7396 && !TARGET_MMX
7397 && !TARGET_IAMCU)
7399 static bool warnedmmx;
7400 static bool warnedmmx_ret;
7402 if (cum && cum->warn_mmx && !warnedmmx)
7404 if (warning (OPT_Wpsabi, "MMX vector argument "
7405 "without MMX enabled changes the ABI"))
7406 warnedmmx = true;
7408 else if (in_return && !warnedmmx_ret)
7410 if (warning (OPT_Wpsabi, "MMX vector return "
7411 "without MMX enabled changes the ABI"))
7412 warnedmmx_ret = true;
7415 return mode;
7418 gcc_unreachable ();
7422 return mode;
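/* Illustrative example (not part of the original source): for a generic
   vector type such as

       typedef int v4si __attribute__ ((vector_size (16)));

   the loop above recovers V4SImode even when the middle-end had to pick a
   non-vector mode because SSE is disabled; in that case the -Wpsabi warning
   is emitted, but the vector mode is still returned so the psABI argument
   location does not depend on which ISA extensions happen to be enabled.  */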
7425 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7426 this may not agree with the mode that the type system has chosen for the
7427 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7428 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7430 static rtx
7431 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7432 unsigned int regno)
7434 rtx tmp;
7436 if (orig_mode != BLKmode)
7437 tmp = gen_rtx_REG (orig_mode, regno);
7438 else
7440 tmp = gen_rtx_REG (mode, regno);
7441 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7442 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7445 return tmp;
7448 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
7449 of this code is to classify each 8bytes of incoming argument by the register
7450 class and assign registers accordingly. */
7452 /* Return the union class of CLASS1 and CLASS2.
7453 See the x86-64 PS ABI for details. */
7455 static enum x86_64_reg_class
7456 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7458 /* Rule #1: If both classes are equal, this is the resulting class. */
7459 if (class1 == class2)
7460 return class1;
7462 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7463 the other class. */
7464 if (class1 == X86_64_NO_CLASS)
7465 return class2;
7466 if (class2 == X86_64_NO_CLASS)
7467 return class1;
7469 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7470 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7471 return X86_64_MEMORY_CLASS;
7473 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7474 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7475 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7476 return X86_64_INTEGERSI_CLASS;
7477 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7478 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7479 return X86_64_INTEGER_CLASS;
7481 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7482 MEMORY is used. */
7483 if (class1 == X86_64_X87_CLASS
7484 || class1 == X86_64_X87UP_CLASS
7485 || class1 == X86_64_COMPLEX_X87_CLASS
7486 || class2 == X86_64_X87_CLASS
7487 || class2 == X86_64_X87UP_CLASS
7488 || class2 == X86_64_COMPLEX_X87_CLASS)
7489 return X86_64_MEMORY_CLASS;
7491 /* Rule #6: Otherwise class SSE is used. */
7492 return X86_64_SSE_CLASS;
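/* Illustrative example (not part of the original source): for a single
   eightbyte covering the hypothetical

       union u { int i; float f; };

   the int member classifies as X86_64_INTEGERSI_CLASS and the float member
   as X86_64_SSESF_CLASS; rule #4 above merges the pair to
   X86_64_INTEGERSI_CLASS, so the union travels in a general purpose
   register.  */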
7495 /* Classify the argument of type TYPE and mode MODE.
7496 CLASSES will be filled by the register class used to pass each word
7497 of the operand. The number of words is returned. In case the parameter
7498 should be passed in memory, 0 is returned. As a special case for zero
7499 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7501 BIT_OFFSET is used internally for handling records; it specifies the
7502 offset in bits modulo 512 to avoid overflow cases.
7504 See the x86-64 PS ABI for details.
7507 static int
7508 classify_argument (machine_mode mode, const_tree type,
7509 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7511 HOST_WIDE_INT bytes =
7512 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7513 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7515 /* Variable sized entities are always passed/returned in memory. */
7516 if (bytes < 0)
7517 return 0;
7519 if (mode != VOIDmode
7520 && targetm.calls.must_pass_in_stack (mode, type))
7521 return 0;
7523 if (type && AGGREGATE_TYPE_P (type))
7525 int i;
7526 tree field;
7527 enum x86_64_reg_class subclasses[MAX_CLASSES];
7529 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7530 if (bytes > 64)
7531 return 0;
7533 for (i = 0; i < words; i++)
7534 classes[i] = X86_64_NO_CLASS;
7536 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
7537 signal the memory class, so handle it as a special case. */
7538 if (!words)
7540 classes[0] = X86_64_NO_CLASS;
7541 return 1;
7544 /* Classify each field of record and merge classes. */
7545 switch (TREE_CODE (type))
7547 case RECORD_TYPE:
7548 /* And now merge the fields of structure. */
7549 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7551 if (TREE_CODE (field) == FIELD_DECL)
7553 int num;
7555 if (TREE_TYPE (field) == error_mark_node)
7556 continue;
7558 /* Bitfields are always classified as integer. Handle them
7559 early, since later code would consider them to be
7560 misaligned integers. */
7561 if (DECL_BIT_FIELD (field))
7563 for (i = (int_bit_position (field)
7564 + (bit_offset % 64)) / 8 / 8;
7565 i < ((int_bit_position (field) + (bit_offset % 64))
7566 + tree_to_shwi (DECL_SIZE (field))
7567 + 63) / 8 / 8; i++)
7568 classes[i] =
7569 merge_classes (X86_64_INTEGER_CLASS,
7570 classes[i]);
7572 else
7574 int pos;
7576 type = TREE_TYPE (field);
7578 /* Flexible array member is ignored. */
7579 if (TYPE_MODE (type) == BLKmode
7580 && TREE_CODE (type) == ARRAY_TYPE
7581 && TYPE_SIZE (type) == NULL_TREE
7582 && TYPE_DOMAIN (type) != NULL_TREE
7583 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7584 == NULL_TREE))
7586 static bool warned;
7588 if (!warned && warn_psabi)
7590 warned = true;
7591 inform (input_location,
7592 "the ABI of passing struct with"
7593 " a flexible array member has"
7594 " changed in GCC 4.4");
7596 continue;
7598 num = classify_argument (TYPE_MODE (type), type,
7599 subclasses,
7600 (int_bit_position (field)
7601 + bit_offset) % 512);
7602 if (!num)
7603 return 0;
7604 pos = (int_bit_position (field)
7605 + (bit_offset % 64)) / 8 / 8;
7606 for (i = 0; i < num && (i + pos) < words; i++)
7607 classes[i + pos] =
7608 merge_classes (subclasses[i], classes[i + pos]);
7612 break;
7614 case ARRAY_TYPE:
7615 /* Arrays are handled as small records. */
7617 int num;
7618 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7619 TREE_TYPE (type), subclasses, bit_offset);
7620 if (!num)
7621 return 0;
7623 /* The partial classes are now full classes. */
7624 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7625 subclasses[0] = X86_64_SSE_CLASS;
7626 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7627 && !((bit_offset % 64) == 0 && bytes == 4))
7628 subclasses[0] = X86_64_INTEGER_CLASS;
7630 for (i = 0; i < words; i++)
7631 classes[i] = subclasses[i % num];
7633 break;
7635 case UNION_TYPE:
7636 case QUAL_UNION_TYPE:
7637 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
7639 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7641 if (TREE_CODE (field) == FIELD_DECL)
7643 int num;
7645 if (TREE_TYPE (field) == error_mark_node)
7646 continue;
7648 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7649 TREE_TYPE (field), subclasses,
7650 bit_offset);
7651 if (!num)
7652 return 0;
7653 for (i = 0; i < num && i < words; i++)
7654 classes[i] = merge_classes (subclasses[i], classes[i]);
7657 break;
7659 default:
7660 gcc_unreachable ();
7663 if (words > 2)
7665 /* When the size exceeds 16 bytes, if the first eightbyte isn't
7666 X86_64_SSE_CLASS or any of the others isn't
7667 X86_64_SSEUP_CLASS, everything should be passed in
7668 memory. */
7669 if (classes[0] != X86_64_SSE_CLASS)
7670 return 0;
7672 for (i = 1; i < words; i++)
7673 if (classes[i] != X86_64_SSEUP_CLASS)
7674 return 0;
7677 /* Final merger cleanup. */
7678 for (i = 0; i < words; i++)
7680 /* If one class is MEMORY, everything should be passed in
7681 memory. */
7682 if (classes[i] == X86_64_MEMORY_CLASS)
7683 return 0;
7685 /* The X86_64_SSEUP_CLASS should always be preceded by
7686 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7687 if (classes[i] == X86_64_SSEUP_CLASS
7688 && classes[i - 1] != X86_64_SSE_CLASS
7689 && classes[i - 1] != X86_64_SSEUP_CLASS)
7691 /* The first one should never be X86_64_SSEUP_CLASS. */
7692 gcc_assert (i != 0);
7693 classes[i] = X86_64_SSE_CLASS;
7696 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7697 everything should be passed in memory. */
7698 if (classes[i] == X86_64_X87UP_CLASS
7699 && (classes[i - 1] != X86_64_X87_CLASS))
7701 static bool warned;
7703 /* The first one should never be X86_64_X87UP_CLASS. */
7704 gcc_assert (i != 0);
7705 if (!warned && warn_psabi)
7707 warned = true;
7708 inform (input_location,
7709 "the ABI of passing union with long double"
7710 " has changed in GCC 4.4");
7712 return 0;
7715 return words;
7718 /* Compute the alignment needed. We align all types to their natural
7719 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
7720 if (mode != VOIDmode && mode != BLKmode)
7722 int mode_alignment = GET_MODE_BITSIZE (mode);
7724 if (mode == XFmode)
7725 mode_alignment = 128;
7726 else if (mode == XCmode)
7727 mode_alignment = 256;
7728 if (COMPLEX_MODE_P (mode))
7729 mode_alignment /= 2;
7730 /* Misaligned fields are always returned in memory. */
7731 if (bit_offset % mode_alignment)
7732 return 0;
7735 /* for V1xx modes, just use the base mode */
7736 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7737 && GET_MODE_UNIT_SIZE (mode) == bytes)
7738 mode = GET_MODE_INNER (mode);
7740 /* Classification of atomic types. */
7741 switch (mode)
7743 case E_SDmode:
7744 case E_DDmode:
7745 classes[0] = X86_64_SSE_CLASS;
7746 return 1;
7747 case E_TDmode:
7748 classes[0] = X86_64_SSE_CLASS;
7749 classes[1] = X86_64_SSEUP_CLASS;
7750 return 2;
7751 case E_DImode:
7752 case E_SImode:
7753 case E_HImode:
7754 case E_QImode:
7755 case E_CSImode:
7756 case E_CHImode:
7757 case E_CQImode:
7759 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7761 /* Analyze last 128 bits only. */
7762 size = (size - 1) & 0x7f;
7764 if (size < 32)
7766 classes[0] = X86_64_INTEGERSI_CLASS;
7767 return 1;
7769 else if (size < 64)
7771 classes[0] = X86_64_INTEGER_CLASS;
7772 return 1;
7774 else if (size < 64+32)
7776 classes[0] = X86_64_INTEGER_CLASS;
7777 classes[1] = X86_64_INTEGERSI_CLASS;
7778 return 2;
7780 else if (size < 64+64)
7782 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7783 return 2;
7785 else
7786 gcc_unreachable ();
7788 case E_CDImode:
7789 case E_TImode:
7790 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7791 return 2;
7792 case E_COImode:
7793 case E_OImode:
7794 /* OImode shouldn't be used directly. */
7795 gcc_unreachable ();
7796 case E_CTImode:
7797 return 0;
7798 case E_SFmode:
7799 if (!(bit_offset % 64))
7800 classes[0] = X86_64_SSESF_CLASS;
7801 else
7802 classes[0] = X86_64_SSE_CLASS;
7803 return 1;
7804 case E_DFmode:
7805 classes[0] = X86_64_SSEDF_CLASS;
7806 return 1;
7807 case E_XFmode:
7808 classes[0] = X86_64_X87_CLASS;
7809 classes[1] = X86_64_X87UP_CLASS;
7810 return 2;
7811 case E_TFmode:
7812 classes[0] = X86_64_SSE_CLASS;
7813 classes[1] = X86_64_SSEUP_CLASS;
7814 return 2;
7815 case E_SCmode:
7816 classes[0] = X86_64_SSE_CLASS;
7817 if (!(bit_offset % 64))
7818 return 1;
7819 else
7821 static bool warned;
7823 if (!warned && warn_psabi)
7825 warned = true;
7826 inform (input_location,
7827 "the ABI of passing structure with complex float"
7828 " member has changed in GCC 4.4");
7830 classes[1] = X86_64_SSESF_CLASS;
7831 return 2;
7833 case E_DCmode:
7834 classes[0] = X86_64_SSEDF_CLASS;
7835 classes[1] = X86_64_SSEDF_CLASS;
7836 return 2;
7837 case E_XCmode:
7838 classes[0] = X86_64_COMPLEX_X87_CLASS;
7839 return 1;
7840 case E_TCmode:
7841 /* This mode is larger than 16 bytes. */
7842 return 0;
7843 case E_V8SFmode:
7844 case E_V8SImode:
7845 case E_V32QImode:
7846 case E_V16HImode:
7847 case E_V4DFmode:
7848 case E_V4DImode:
7849 classes[0] = X86_64_SSE_CLASS;
7850 classes[1] = X86_64_SSEUP_CLASS;
7851 classes[2] = X86_64_SSEUP_CLASS;
7852 classes[3] = X86_64_SSEUP_CLASS;
7853 return 4;
7854 case E_V8DFmode:
7855 case E_V16SFmode:
7856 case E_V8DImode:
7857 case E_V16SImode:
7858 case E_V32HImode:
7859 case E_V64QImode:
7860 classes[0] = X86_64_SSE_CLASS;
7861 classes[1] = X86_64_SSEUP_CLASS;
7862 classes[2] = X86_64_SSEUP_CLASS;
7863 classes[3] = X86_64_SSEUP_CLASS;
7864 classes[4] = X86_64_SSEUP_CLASS;
7865 classes[5] = X86_64_SSEUP_CLASS;
7866 classes[6] = X86_64_SSEUP_CLASS;
7867 classes[7] = X86_64_SSEUP_CLASS;
7868 return 8;
7869 case E_V4SFmode:
7870 case E_V4SImode:
7871 case E_V16QImode:
7872 case E_V8HImode:
7873 case E_V2DFmode:
7874 case E_V2DImode:
7875 classes[0] = X86_64_SSE_CLASS;
7876 classes[1] = X86_64_SSEUP_CLASS;
7877 return 2;
7878 case E_V1TImode:
7879 case E_V1DImode:
7880 case E_V2SFmode:
7881 case E_V2SImode:
7882 case E_V4HImode:
7883 case E_V8QImode:
7884 classes[0] = X86_64_SSE_CLASS;
7885 return 1;
7886 case E_BLKmode:
7887 case E_VOIDmode:
7888 return 0;
7889 default:
7890 gcc_assert (VECTOR_MODE_P (mode));
7892 if (bytes > 16)
7893 return 0;
7895 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
7897 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
7898 classes[0] = X86_64_INTEGERSI_CLASS;
7899 else
7900 classes[0] = X86_64_INTEGER_CLASS;
7901 classes[1] = X86_64_INTEGER_CLASS;
7902 return 1 + (bytes > 8);
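/* Illustrative example (not part of the original source): for the
   hypothetical argument type

       struct s { double d; long l; };   (16 bytes, two eightbytes)

   classify_argument returns 2 with classes[0] = X86_64_SSEDF_CLASS for the
   double and classes[1] = X86_64_INTEGER_CLASS for the long.  Aggregates
   larger than 16 bytes only stay in registers when they classify as one
   X86_64_SSE_CLASS eightbyte followed by X86_64_SSEUP_CLASS eightbytes
   (e.g. __m256 or __m512); otherwise the words > 2 check above forces a
   return value of 0, i.e. memory.  */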
7906 /* Examine the argument and set the number of registers required in each
7907 class. Return true iff the parameter should be passed in memory. */
7909 static bool
7910 examine_argument (machine_mode mode, const_tree type, int in_return,
7911 int *int_nregs, int *sse_nregs)
7913 enum x86_64_reg_class regclass[MAX_CLASSES];
7914 int n = classify_argument (mode, type, regclass, 0);
7916 *int_nregs = 0;
7917 *sse_nregs = 0;
7919 if (!n)
7920 return true;
7921 for (n--; n >= 0; n--)
7922 switch (regclass[n])
7924 case X86_64_INTEGER_CLASS:
7925 case X86_64_INTEGERSI_CLASS:
7926 (*int_nregs)++;
7927 break;
7928 case X86_64_SSE_CLASS:
7929 case X86_64_SSESF_CLASS:
7930 case X86_64_SSEDF_CLASS:
7931 (*sse_nregs)++;
7932 break;
7933 case X86_64_NO_CLASS:
7934 case X86_64_SSEUP_CLASS:
7935 break;
7936 case X86_64_X87_CLASS:
7937 case X86_64_X87UP_CLASS:
7938 case X86_64_COMPLEX_X87_CLASS:
7939 if (!in_return)
7940 return true;
7941 break;
7942 case X86_64_MEMORY_CLASS:
7943 gcc_unreachable ();
7946 return false;
7949 /* Construct container for the argument used by GCC interface. See
7950 FUNCTION_ARG for the detailed description. */
7952 static rtx
7953 construct_container (machine_mode mode, machine_mode orig_mode,
7954 const_tree type, int in_return, int nintregs, int nsseregs,
7955 const int *intreg, int sse_regno)
7957 /* The following variables hold the static issued_error state. */
7958 static bool issued_sse_arg_error;
7959 static bool issued_sse_ret_error;
7960 static bool issued_x87_ret_error;
7962 machine_mode tmpmode;
7963 int bytes =
7964 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7965 enum x86_64_reg_class regclass[MAX_CLASSES];
7966 int n;
7967 int i;
7968 int nexps = 0;
7969 int needed_sseregs, needed_intregs;
7970 rtx exp[MAX_CLASSES];
7971 rtx ret;
7973 n = classify_argument (mode, type, regclass, 0);
7974 if (!n)
7975 return NULL;
7976 if (examine_argument (mode, type, in_return, &needed_intregs,
7977 &needed_sseregs))
7978 return NULL;
7979 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
7980 return NULL;
7982 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
7983 some less clueful developer tries to use floating-point anyway. */
7984 if (needed_sseregs && !TARGET_SSE)
7986 if (in_return)
7988 if (!issued_sse_ret_error)
7990 error ("SSE register return with SSE disabled");
7991 issued_sse_ret_error = true;
7994 else if (!issued_sse_arg_error)
7996 error ("SSE register argument with SSE disabled");
7997 issued_sse_arg_error = true;
7999 return NULL;
8002 /* Likewise, error if the ABI requires us to return values in the
8003 x87 registers and the user specified -mno-80387. */
8004 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
8005 for (i = 0; i < n; i++)
8006 if (regclass[i] == X86_64_X87_CLASS
8007 || regclass[i] == X86_64_X87UP_CLASS
8008 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8010 if (!issued_x87_ret_error)
8012 error ("x87 register return with x87 disabled");
8013 issued_x87_ret_error = true;
8015 return NULL;
8018 /* First construct the simple cases. Avoid SCmode, since we want to use a
8019 single register to pass this type. */
8020 if (n == 1 && mode != SCmode)
8021 switch (regclass[0])
8023 case X86_64_INTEGER_CLASS:
8024 case X86_64_INTEGERSI_CLASS:
8025 return gen_rtx_REG (mode, intreg[0]);
8026 case X86_64_SSE_CLASS:
8027 case X86_64_SSESF_CLASS:
8028 case X86_64_SSEDF_CLASS:
8029 if (mode != BLKmode)
8030 return gen_reg_or_parallel (mode, orig_mode,
8031 SSE_REGNO (sse_regno));
8032 break;
8033 case X86_64_X87_CLASS:
8034 case X86_64_COMPLEX_X87_CLASS:
8035 return gen_rtx_REG (mode, FIRST_STACK_REG);
8036 case X86_64_NO_CLASS:
8037 /* Zero sized array, struct or class. */
8038 return NULL;
8039 default:
8040 gcc_unreachable ();
8042 if (n == 2
8043 && regclass[0] == X86_64_SSE_CLASS
8044 && regclass[1] == X86_64_SSEUP_CLASS
8045 && mode != BLKmode)
8046 return gen_reg_or_parallel (mode, orig_mode,
8047 SSE_REGNO (sse_regno));
8048 if (n == 4
8049 && regclass[0] == X86_64_SSE_CLASS
8050 && regclass[1] == X86_64_SSEUP_CLASS
8051 && regclass[2] == X86_64_SSEUP_CLASS
8052 && regclass[3] == X86_64_SSEUP_CLASS
8053 && mode != BLKmode)
8054 return gen_reg_or_parallel (mode, orig_mode,
8055 SSE_REGNO (sse_regno));
8056 if (n == 8
8057 && regclass[0] == X86_64_SSE_CLASS
8058 && regclass[1] == X86_64_SSEUP_CLASS
8059 && regclass[2] == X86_64_SSEUP_CLASS
8060 && regclass[3] == X86_64_SSEUP_CLASS
8061 && regclass[4] == X86_64_SSEUP_CLASS
8062 && regclass[5] == X86_64_SSEUP_CLASS
8063 && regclass[6] == X86_64_SSEUP_CLASS
8064 && regclass[7] == X86_64_SSEUP_CLASS
8065 && mode != BLKmode)
8066 return gen_reg_or_parallel (mode, orig_mode,
8067 SSE_REGNO (sse_regno));
8068 if (n == 2
8069 && regclass[0] == X86_64_X87_CLASS
8070 && regclass[1] == X86_64_X87UP_CLASS)
8071 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8073 if (n == 2
8074 && regclass[0] == X86_64_INTEGER_CLASS
8075 && regclass[1] == X86_64_INTEGER_CLASS
8076 && (mode == CDImode || mode == TImode)
8077 && intreg[0] + 1 == intreg[1])
8078 return gen_rtx_REG (mode, intreg[0]);
8080 /* Otherwise figure out the entries of the PARALLEL. */
8081 for (i = 0; i < n; i++)
8083 int pos;
8085 switch (regclass[i])
8087 case X86_64_NO_CLASS:
8088 break;
8089 case X86_64_INTEGER_CLASS:
8090 case X86_64_INTEGERSI_CLASS:
8091 /* Merge TImodes on aligned occasions here too. */
8092 if (i * 8 + 8 > bytes)
8094 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8095 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8096 /* We've requested 24 bytes for which we
8097 don't have a mode. Use DImode. */
8098 tmpmode = DImode;
8100 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8101 tmpmode = SImode;
8102 else
8103 tmpmode = DImode;
8104 exp [nexps++]
8105 = gen_rtx_EXPR_LIST (VOIDmode,
8106 gen_rtx_REG (tmpmode, *intreg),
8107 GEN_INT (i*8));
8108 intreg++;
8109 break;
8110 case X86_64_SSESF_CLASS:
8111 exp [nexps++]
8112 = gen_rtx_EXPR_LIST (VOIDmode,
8113 gen_rtx_REG (SFmode,
8114 SSE_REGNO (sse_regno)),
8115 GEN_INT (i*8));
8116 sse_regno++;
8117 break;
8118 case X86_64_SSEDF_CLASS:
8119 exp [nexps++]
8120 = gen_rtx_EXPR_LIST (VOIDmode,
8121 gen_rtx_REG (DFmode,
8122 SSE_REGNO (sse_regno)),
8123 GEN_INT (i*8));
8124 sse_regno++;
8125 break;
8126 case X86_64_SSE_CLASS:
8127 pos = i;
8128 switch (n)
8130 case 1:
8131 tmpmode = DImode;
8132 break;
8133 case 2:
8134 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8136 tmpmode = TImode;
8137 i++;
8139 else
8140 tmpmode = DImode;
8141 break;
8142 case 4:
8143 gcc_assert (i == 0
8144 && regclass[1] == X86_64_SSEUP_CLASS
8145 && regclass[2] == X86_64_SSEUP_CLASS
8146 && regclass[3] == X86_64_SSEUP_CLASS);
8147 tmpmode = OImode;
8148 i += 3;
8149 break;
8150 case 8:
8151 gcc_assert (i == 0
8152 && regclass[1] == X86_64_SSEUP_CLASS
8153 && regclass[2] == X86_64_SSEUP_CLASS
8154 && regclass[3] == X86_64_SSEUP_CLASS
8155 && regclass[4] == X86_64_SSEUP_CLASS
8156 && regclass[5] == X86_64_SSEUP_CLASS
8157 && regclass[6] == X86_64_SSEUP_CLASS
8158 && regclass[7] == X86_64_SSEUP_CLASS);
8159 tmpmode = XImode;
8160 i += 7;
8161 break;
8162 default:
8163 gcc_unreachable ();
8165 exp [nexps++]
8166 = gen_rtx_EXPR_LIST (VOIDmode,
8167 gen_rtx_REG (tmpmode,
8168 SSE_REGNO (sse_regno)),
8169 GEN_INT (pos*8));
8170 sse_regno++;
8171 break;
8172 default:
8173 gcc_unreachable ();
8177 /* Empty aligned struct, union or class. */
8178 if (nexps == 0)
8179 return NULL;
8181 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8182 for (i = 0; i < nexps; i++)
8183 XVECEXP (ret, 0, i) = exp [i];
8184 return ret;
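/* Illustrative sketch (not part of the original source): for the struct
   from the classification example above (a double followed by a long) used
   as the first argument of a SysV x86-64 call, construct_container builds
   roughly

       (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                  (expr_list (reg:DI di) (const_int 8))])

   i.e. the double travels in %xmm0 and the long in %rdi, each EXPR_LIST
   recording the byte offset of its eightbyte within the argument.  */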
8187 /* Update the data in CUM to advance over an argument of mode MODE
8188 and data type TYPE. (TYPE is null for libcalls where that information
8189 may not be available.)
8191 Return the number of integer registers advanced over. */
8193 static int
8194 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8195 const_tree type, HOST_WIDE_INT bytes,
8196 HOST_WIDE_INT words)
8198 int res = 0;
8199 bool error_p = false;
8201 if (TARGET_IAMCU)
8203 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8204 bytes in registers. */
8205 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8206 goto pass_in_reg;
8207 return res;
8210 switch (mode)
8212 default:
8213 break;
8215 case E_BLKmode:
8216 if (bytes < 0)
8217 break;
8218 /* FALLTHRU */
8220 case E_DImode:
8221 case E_SImode:
8222 case E_HImode:
8223 case E_QImode:
8224 pass_in_reg:
8225 cum->words += words;
8226 cum->nregs -= words;
8227 cum->regno += words;
8228 if (cum->nregs >= 0)
8229 res = words;
8230 if (cum->nregs <= 0)
8232 cum->nregs = 0;
8233 cfun->machine->arg_reg_available = false;
8234 cum->regno = 0;
8236 break;
8238 case E_OImode:
8239 /* OImode shouldn't be used directly. */
8240 gcc_unreachable ();
8242 case E_DFmode:
8243 if (cum->float_in_sse == -1)
8244 error_p = true;
8245 if (cum->float_in_sse < 2)
8246 break;
8247 /* FALLTHRU */
8248 case E_SFmode:
8249 if (cum->float_in_sse == -1)
8250 error_p = true;
8251 if (cum->float_in_sse < 1)
8252 break;
8253 /* FALLTHRU */
8255 case E_V8SFmode:
8256 case E_V8SImode:
8257 case E_V64QImode:
8258 case E_V32HImode:
8259 case E_V16SImode:
8260 case E_V8DImode:
8261 case E_V16SFmode:
8262 case E_V8DFmode:
8263 case E_V32QImode:
8264 case E_V16HImode:
8265 case E_V4DFmode:
8266 case E_V4DImode:
8267 case E_TImode:
8268 case E_V16QImode:
8269 case E_V8HImode:
8270 case E_V4SImode:
8271 case E_V2DImode:
8272 case E_V4SFmode:
8273 case E_V2DFmode:
8274 if (!type || !AGGREGATE_TYPE_P (type))
8276 cum->sse_words += words;
8277 cum->sse_nregs -= 1;
8278 cum->sse_regno += 1;
8279 if (cum->sse_nregs <= 0)
8281 cum->sse_nregs = 0;
8282 cum->sse_regno = 0;
8285 break;
8287 case E_V8QImode:
8288 case E_V4HImode:
8289 case E_V2SImode:
8290 case E_V2SFmode:
8291 case E_V1TImode:
8292 case E_V1DImode:
8293 if (!type || !AGGREGATE_TYPE_P (type))
8295 cum->mmx_words += words;
8296 cum->mmx_nregs -= 1;
8297 cum->mmx_regno += 1;
8298 if (cum->mmx_nregs <= 0)
8300 cum->mmx_nregs = 0;
8301 cum->mmx_regno = 0;
8304 break;
8306 if (error_p)
8308 cum->float_in_sse = 0;
8309 error ("calling %qD with SSE calling convention without "
8310 "SSE/SSE2 enabled", cum->decl);
8311 sorry ("this is a GCC bug that can be worked around by adding "
8312 "attribute used to function called");
8315 return res;
8318 static int
8319 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8320 const_tree type, HOST_WIDE_INT words, bool named)
8322 int int_nregs, sse_nregs;
8324 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
8325 if (!named && (VALID_AVX512F_REG_MODE (mode)
8326 || VALID_AVX256_REG_MODE (mode)))
8327 return 0;
8329 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8330 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8332 cum->nregs -= int_nregs;
8333 cum->sse_nregs -= sse_nregs;
8334 cum->regno += int_nregs;
8335 cum->sse_regno += sse_nregs;
8336 return int_nregs;
8338 else
8340 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8341 cum->words = ROUND_UP (cum->words, align);
8342 cum->words += words;
8343 return 0;
8347 static int
8348 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8349 HOST_WIDE_INT words)
8351 /* Otherwise, this should be passed indirectly. */
8352 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8354 cum->words += words;
8355 if (cum->nregs > 0)
8357 cum->nregs -= 1;
8358 cum->regno += 1;
8359 return 1;
8361 return 0;
8364 /* Update the data in CUM to advance over an argument of mode MODE and
8365 data type TYPE. (TYPE is null for libcalls where that information
8366 may not be available.) */
8368 static void
8369 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8370 const_tree type, bool named)
8372 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8373 HOST_WIDE_INT bytes, words;
8374 int nregs;
8376 /* The argument of interrupt handler is a special case and is
8377 handled in ix86_function_arg. */
8378 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8379 return;
8381 if (mode == BLKmode)
8382 bytes = int_size_in_bytes (type);
8383 else
8384 bytes = GET_MODE_SIZE (mode);
8385 words = CEIL (bytes, UNITS_PER_WORD);
8387 if (type)
8388 mode = type_natural_mode (type, NULL, false);
8390 if ((type && POINTER_BOUNDS_TYPE_P (type))
8391 || POINTER_BOUNDS_MODE_P (mode))
8393 /* If we pass bounds in BT then just update the remaining bounds count. */
8394 if (cum->bnds_in_bt)
8396 cum->bnds_in_bt--;
8397 return;
8400 /* Update the remaining number of bounds to force. */
8401 if (cum->force_bnd_pass)
8402 cum->force_bnd_pass--;
8404 cum->bnd_regno++;
8406 return;
8409 /* The first arg not going to Bounds Tables resets this counter. */
8410 cum->bnds_in_bt = 0;
8411 /* For unnamed args we always pass bounds to avoid a bounds mess when
8412 the passed and received types do not match. If bounds do not follow an
8413 unnamed arg, still pretend the required number of bounds were passed. */
8414 if (cum->force_bnd_pass)
8416 cum->bnd_regno += cum->force_bnd_pass;
8417 cum->force_bnd_pass = 0;
8420 if (TARGET_64BIT)
8422 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8424 if (call_abi == MS_ABI)
8425 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8426 else
8427 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8429 else
8430 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8432 /* For stdarg we expect bounds to be passed for each value passed
8433 in register. */
8434 if (cum->stdarg)
8435 cum->force_bnd_pass = nregs;
8436 /* For pointers passed in memory we expect bounds passed in Bounds
8437 Table. */
8438 if (!nregs)
8440 /* Track if there are outgoing arguments on stack. */
8441 if (cum->caller)
8442 cfun->machine->outgoing_args_on_stack = true;
8444 cum->bnds_in_bt = chkp_type_bounds_count (type);
8448 /* Define where to put the arguments to a function.
8449 Value is zero to push the argument on the stack,
8450 or a hard register in which to store the argument.
8452 MODE is the argument's machine mode.
8453 TYPE is the data type of the argument (as a tree).
8454 This is null for libcalls where that information may
8455 not be available.
8456 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8457 the preceding args and about the function being called.
8458 NAMED is nonzero if this argument is a named parameter
8459 (otherwise it is an extra parameter matching an ellipsis). */
8461 static rtx
8462 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8463 machine_mode orig_mode, const_tree type,
8464 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8466 bool error_p = false;
8468 /* Avoid the AL settings for the Unix64 ABI. */
8469 if (mode == VOIDmode)
8470 return constm1_rtx;
8472 if (TARGET_IAMCU)
8474 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8475 bytes in registers. */
8476 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8477 goto pass_in_reg;
8478 return NULL_RTX;
8481 switch (mode)
8483 default:
8484 break;
8486 case E_BLKmode:
8487 if (bytes < 0)
8488 break;
8489 /* FALLTHRU */
8490 case E_DImode:
8491 case E_SImode:
8492 case E_HImode:
8493 case E_QImode:
8494 pass_in_reg:
8495 if (words <= cum->nregs)
8497 int regno = cum->regno;
8499 /* Fastcall allocates the first two DWORD (SImode) or
8500 smaller arguments to ECX and EDX if it isn't an
8501 aggregate type. */
8502 if (cum->fastcall)
8504 if (mode == BLKmode
8505 || mode == DImode
8506 || (type && AGGREGATE_TYPE_P (type)))
8507 break;
8509 /* ECX not EAX is the first allocated register. */
8510 if (regno == AX_REG)
8511 regno = CX_REG;
8513 return gen_rtx_REG (mode, regno);
8515 break;
8517 case E_DFmode:
8518 if (cum->float_in_sse == -1)
8519 error_p = true;
8520 if (cum->float_in_sse < 2)
8521 break;
8522 /* FALLTHRU */
8523 case E_SFmode:
8524 if (cum->float_in_sse == -1)
8525 error_p = true;
8526 if (cum->float_in_sse < 1)
8527 break;
8528 /* FALLTHRU */
8529 case E_TImode:
8530 /* In 32bit, we pass TImode in xmm registers. */
8531 case E_V16QImode:
8532 case E_V8HImode:
8533 case E_V4SImode:
8534 case E_V2DImode:
8535 case E_V4SFmode:
8536 case E_V2DFmode:
8537 if (!type || !AGGREGATE_TYPE_P (type))
8539 if (cum->sse_nregs)
8540 return gen_reg_or_parallel (mode, orig_mode,
8541 cum->sse_regno + FIRST_SSE_REG);
8543 break;
8545 case E_OImode:
8546 case E_XImode:
8547 /* OImode and XImode shouldn't be used directly. */
8548 gcc_unreachable ();
8550 case E_V64QImode:
8551 case E_V32HImode:
8552 case E_V16SImode:
8553 case E_V8DImode:
8554 case E_V16SFmode:
8555 case E_V8DFmode:
8556 case E_V8SFmode:
8557 case E_V8SImode:
8558 case E_V32QImode:
8559 case E_V16HImode:
8560 case E_V4DFmode:
8561 case E_V4DImode:
8562 if (!type || !AGGREGATE_TYPE_P (type))
8564 if (cum->sse_nregs)
8565 return gen_reg_or_parallel (mode, orig_mode,
8566 cum->sse_regno + FIRST_SSE_REG);
8568 break;
8570 case E_V8QImode:
8571 case E_V4HImode:
8572 case E_V2SImode:
8573 case E_V2SFmode:
8574 case E_V1TImode:
8575 case E_V1DImode:
8576 if (!type || !AGGREGATE_TYPE_P (type))
8578 if (cum->mmx_nregs)
8579 return gen_reg_or_parallel (mode, orig_mode,
8580 cum->mmx_regno + FIRST_MMX_REG);
8582 break;
8584 if (error_p)
8586 cum->float_in_sse = 0;
8587 error ("calling %qD with SSE calling convention without "
8588 "SSE/SSE2 enabled", cum->decl);
8589 sorry ("this is a GCC bug that can be worked around by adding "
8590 "attribute used to function called");
8593 return NULL_RTX;
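/* Illustrative example (not part of the original source): for the 32-bit
   declaration

       void __attribute__ ((fastcall)) f (int a, int b, int c);

   the code above hands out ECX for A (the AX_REG -> CX_REG remapping), EDX
   for B, and returns NULL_RTX for C, which therefore goes on the stack;
   aggregates and DImode values never use the fastcall registers.  */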
8596 static rtx
8597 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8598 machine_mode orig_mode, const_tree type, bool named)
8600 /* Handle a hidden AL argument containing number of registers
8601 for varargs x86-64 functions. */
8602 if (mode == VOIDmode)
8603 return GEN_INT (cum->maybe_vaarg
8604 ? (cum->sse_nregs < 0
8605 ? X86_64_SSE_REGPARM_MAX
8606 : cum->sse_regno)
8607 : -1);
8609 switch (mode)
8611 default:
8612 break;
8614 case E_V8SFmode:
8615 case E_V8SImode:
8616 case E_V32QImode:
8617 case E_V16HImode:
8618 case E_V4DFmode:
8619 case E_V4DImode:
8620 case E_V16SFmode:
8621 case E_V16SImode:
8622 case E_V64QImode:
8623 case E_V32HImode:
8624 case E_V8DFmode:
8625 case E_V8DImode:
8626 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
8627 if (!named)
8628 return NULL;
8629 break;
8632 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8633 cum->sse_nregs,
8634 &x86_64_int_parameter_registers [cum->regno],
8635 cum->sse_regno);
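/* Illustrative note (not part of the original source): the VOIDmode case
   above implements the SysV rule that a variadic callee learns in %al how
   many SSE registers hold arguments, so a call such as

       printf ("%d\n", 1);

   sets %al to 0 because no floating point argument occupies an SSE
   register.  */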
8638 static rtx
8639 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8640 machine_mode orig_mode, bool named,
8641 HOST_WIDE_INT bytes)
8643 unsigned int regno;
8645 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
8646 We use a value of -2 to specify that the current function call is MS ABI. */
8647 if (mode == VOIDmode)
8648 return GEN_INT (-2);
8650 /* If we've run out of registers, it goes on the stack. */
8651 if (cum->nregs == 0)
8652 return NULL_RTX;
8654 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8656 /* Only floating point modes are passed in anything but integer regs. */
8657 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8659 if (named)
8660 regno = cum->regno + FIRST_SSE_REG;
8661 else
8663 rtx t1, t2;
8665 /* Unnamed floating parameters are passed in both the
8666 SSE and integer registers. */
8667 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8668 t2 = gen_rtx_REG (mode, regno);
8669 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8670 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8671 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8674 /* Handle aggregate types passed in registers. */
8675 if (orig_mode == BLKmode)
8677 if (bytes > 0 && bytes <= 8)
8678 mode = (bytes > 4 ? DImode : SImode);
8679 if (mode == BLKmode)
8680 mode = DImode;
8683 return gen_reg_or_parallel (mode, orig_mode, regno);
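/* Illustrative example (not part of the original source): under the MS ABI
   each of the first four parameters owns one register slot, so for the
   hypothetical

       void f (int a, double b, int c, double d);

   A is passed in ECX, B in XMM1, C in R8D and D in XMM3.  An unnamed double
   in a varargs call comes back from the code above as a PARALLEL, so it is
   passed in both its SSE register and the matching integer register.  */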
8686 /* Return where to put the arguments to a function.
8687 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8689 MODE is the argument's machine mode. TYPE is the data type of the
8690 argument. It is null for libcalls where that information may not be
8691 available. CUM gives information about the preceding args and about
8692 the function being called. NAMED is nonzero if this argument is a
8693 named parameter (otherwise it is an extra parameter matching an
8694 ellipsis). */
8696 static rtx
8697 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8698 const_tree type, bool named)
8700 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8701 machine_mode mode = omode;
8702 HOST_WIDE_INT bytes, words;
8703 rtx arg;
8705 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8707 gcc_assert (type != NULL_TREE);
8708 if (POINTER_TYPE_P (type))
8710 /* This is the pointer argument. */
8711 gcc_assert (TYPE_MODE (type) == Pmode);
8712 /* It is at -WORD(AP) in the current frame in interrupt and
8713 exception handlers. */
8714 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8716 else
8718 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8719 && TREE_CODE (type) == INTEGER_TYPE
8720 && TYPE_MODE (type) == word_mode);
8721 /* The error code is the word-mode integer argument at
8722 -2 * WORD(AP) in the current frame of the exception
8723 handler. */
8724 arg = gen_rtx_MEM (word_mode,
8725 plus_constant (Pmode,
8726 arg_pointer_rtx,
8727 -2 * UNITS_PER_WORD));
8729 return arg;
8732 /* All pointer bounds arguments are handled separately here. */
8733 if ((type && POINTER_BOUNDS_TYPE_P (type))
8734 || POINTER_BOUNDS_MODE_P (mode))
8736 /* Return NULL if bounds are forced to go in Bounds Table. */
8737 if (cum->bnds_in_bt)
8738 arg = NULL;
8739 /* Return the next available bound reg if any. */
8740 else if (cum->bnd_regno <= LAST_BND_REG)
8741 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8742 /* Return the next special slot number otherwise. */
8743 else
8744 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8746 return arg;
8749 if (mode == BLKmode)
8750 bytes = int_size_in_bytes (type);
8751 else
8752 bytes = GET_MODE_SIZE (mode);
8753 words = CEIL (bytes, UNITS_PER_WORD);
8755 /* To simplify the code below, represent vector types with a vector mode
8756 even if MMX/SSE are not active. */
8757 if (type && TREE_CODE (type) == VECTOR_TYPE)
8758 mode = type_natural_mode (type, cum, false);
8760 if (TARGET_64BIT)
8762 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8764 if (call_abi == MS_ABI)
8765 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8766 else
8767 arg = function_arg_64 (cum, mode, omode, type, named);
8769 else
8770 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8772 /* Track if there are outgoing arguments on stack. */
8773 if (arg == NULL_RTX && cum->caller)
8774 cfun->machine->outgoing_args_on_stack = true;
8776 return arg;
8779 /* A C expression that indicates when an argument must be passed by
8780 reference. If nonzero for an argument, a copy of that argument is
8781 made in memory and a pointer to the argument is passed instead of
8782 the argument itself. The pointer is passed in whatever way is
8783 appropriate for passing a pointer to that type. */
8785 static bool
8786 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8787 const_tree type, bool)
8789 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8791 /* Bounds are never passed by reference. */
8792 if ((type && POINTER_BOUNDS_TYPE_P (type))
8793 || POINTER_BOUNDS_MODE_P (mode))
8794 return false;
8796 if (TARGET_64BIT)
8798 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8800 /* See Windows x64 Software Convention. */
8801 if (call_abi == MS_ABI)
8803 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8805 if (type)
8807 /* Arrays are passed by reference. */
8808 if (TREE_CODE (type) == ARRAY_TYPE)
8809 return true;
8811 if (RECORD_OR_UNION_TYPE_P (type))
8813 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8814 are passed by reference. */
8815 msize = int_size_in_bytes (type);
8819 /* __m128 is passed by reference. */
8820 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8822 else if (type && int_size_in_bytes (type) == -1)
8823 return true;
8826 return false;
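/* Illustrative example (not part of the original source): with the MS ABI
   check above, hypothetical types behave as follows:

       struct s8  { long long x; };    passed by value (size 8)
       struct s12 { int x, y, z; };    passed by reference (size 12)
       __m128 values                   passed by reference

   while for the 64-bit SysV ABI only variable-sized types take the
   pass-by-reference path here.  */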
8829 /* Return true when TYPE should be 128bit aligned for 32bit argument
8830 passing ABI. XXX: This function is obsolete and is only used for
8831 checking psABI compatibility with previous versions of GCC. */
8833 static bool
8834 ix86_compat_aligned_value_p (const_tree type)
8836 machine_mode mode = TYPE_MODE (type);
8837 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8838 || mode == TDmode
8839 || mode == TFmode
8840 || mode == TCmode)
8841 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8842 return true;
8843 if (TYPE_ALIGN (type) < 128)
8844 return false;
8846 if (AGGREGATE_TYPE_P (type))
8848 /* Walk the aggregates recursively. */
8849 switch (TREE_CODE (type))
8851 case RECORD_TYPE:
8852 case UNION_TYPE:
8853 case QUAL_UNION_TYPE:
8855 tree field;
8857 /* Walk all the structure fields. */
8858 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8860 if (TREE_CODE (field) == FIELD_DECL
8861 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8862 return true;
8864 break;
8867 case ARRAY_TYPE:
8868 /* Just for use if some languages pass arrays by value. */
8869 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8870 return true;
8871 break;
8873 default:
8874 gcc_unreachable ();
8877 return false;
8880 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8881 XXX: This function is obsolete and is only used for checking psABI
8882 compatibility with previous versions of GCC. */
8884 static unsigned int
8885 ix86_compat_function_arg_boundary (machine_mode mode,
8886 const_tree type, unsigned int align)
8888 /* In 32bit, only _Decimal128 and __float128 are aligned to their
8889 natural boundaries. */
8890 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
8892 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
8893 make an exception for SSE modes since these require 128bit
8894 alignment.
8896 The handling here differs from field_alignment. ICC aligns MMX
8897 arguments to 4 byte boundaries, while structure fields are aligned
8898 to 8 byte boundaries. */
8899 if (!type)
8901 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
8902 align = PARM_BOUNDARY;
8904 else
8906 if (!ix86_compat_aligned_value_p (type))
8907 align = PARM_BOUNDARY;
8910 if (align > BIGGEST_ALIGNMENT)
8911 align = BIGGEST_ALIGNMENT;
8912 return align;
8915 /* Return true when TYPE should be 128bit aligned for 32bit argument
8916 passing ABI. */
8918 static bool
8919 ix86_contains_aligned_value_p (const_tree type)
8921 machine_mode mode = TYPE_MODE (type);
8923 if (mode == XFmode || mode == XCmode)
8924 return false;
8926 if (TYPE_ALIGN (type) < 128)
8927 return false;
8929 if (AGGREGATE_TYPE_P (type))
8931 /* Walk the aggregates recursively. */
8932 switch (TREE_CODE (type))
8934 case RECORD_TYPE:
8935 case UNION_TYPE:
8936 case QUAL_UNION_TYPE:
8938 tree field;
8940 /* Walk all the structure fields. */
8941 for (field = TYPE_FIELDS (type);
8942 field;
8943 field = DECL_CHAIN (field))
8945 if (TREE_CODE (field) == FIELD_DECL
8946 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
8947 return true;
8949 break;
8952 case ARRAY_TYPE:
8953 /* Just for use if some languages pass arrays by value. */
8954 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
8955 return true;
8956 break;
8958 default:
8959 gcc_unreachable ();
8962 else
8963 return TYPE_ALIGN (type) >= 128;
8965 return false;
8968 /* Gives the alignment boundary, in bits, of an argument with the
8969 specified mode and type. */
8971 static unsigned int
8972 ix86_function_arg_boundary (machine_mode mode, const_tree type)
8974 unsigned int align;
8975 if (type)
8977 /* Since the main variant type is used for the call, convert the type
8978 to its main variant. */
8979 type = TYPE_MAIN_VARIANT (type);
8980 align = TYPE_ALIGN (type);
8981 if (TYPE_EMPTY_P (type))
8982 return PARM_BOUNDARY;
8984 else
8985 align = GET_MODE_ALIGNMENT (mode);
8986 if (align < PARM_BOUNDARY)
8987 align = PARM_BOUNDARY;
8988 else
8990 static bool warned;
8991 unsigned int saved_align = align;
8993 if (!TARGET_64BIT)
8995 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
8996 if (!type)
8998 if (mode == XFmode || mode == XCmode)
8999 align = PARM_BOUNDARY;
9001 else if (!ix86_contains_aligned_value_p (type))
9002 align = PARM_BOUNDARY;
9004 if (align < 128)
9005 align = PARM_BOUNDARY;
9008 if (warn_psabi
9009 && !warned
9010 && align != ix86_compat_function_arg_boundary (mode, type,
9011 saved_align))
9013 warned = true;
9014 inform (input_location,
9015 "The ABI for passing parameters with %d-byte"
9016 " alignment has changed in GCC 4.6",
9017 align / BITS_PER_UNIT);
9021 return align;
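/* Illustrative example (not part of the original source): in 32-bit code a
   double argument is pushed at the 4-byte PARM_BOUNDARY, whereas a
   hypothetical

       typedef float v4sf __attribute__ ((vector_size (16)));

   argument keeps its 16-byte boundary because ix86_contains_aligned_value_p
   accepts it; when the result differs from the pre-GCC 4.6 rule, the
   -Wpsabi note above is emitted once.  */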
9024 /* Return true if N is a possible register number of function value. */
9026 static bool
9027 ix86_function_value_regno_p (const unsigned int regno)
9029 switch (regno)
9031 case AX_REG:
9032 return true;
9033 case DX_REG:
9034 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9035 case DI_REG:
9036 case SI_REG:
9037 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9039 case BND0_REG:
9040 case BND1_REG:
9041 return chkp_function_instrumented_p (current_function_decl);
9043 /* Complex values are returned in %st(0)/%st(1) pair. */
9044 case ST0_REG:
9045 case ST1_REG:
9046 /* TODO: The function should depend on current function ABI but
9047 builtins.c would need updating then. Therefore we use the
9048 default ABI. */
9049 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9050 return false;
9051 return TARGET_FLOAT_RETURNS_IN_80387;
9053 /* Complex values are returned in %xmm0/%xmm1 pair. */
9054 case XMM0_REG:
9055 case XMM1_REG:
9056 return TARGET_SSE;
9058 case MM0_REG:
9059 if (TARGET_MACHO || TARGET_64BIT)
9060 return false;
9061 return TARGET_MMX;
9064 return false;
9067 /* Define how to find the value returned by a function.
9068 VALTYPE is the data type of the value (as a tree).
9069 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9070 otherwise, FUNC is 0. */
9072 static rtx
9073 function_value_32 (machine_mode orig_mode, machine_mode mode,
9074 const_tree fntype, const_tree fn)
9076 unsigned int regno;
9078 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9079 we normally prevent this case when mmx is not available. However
9080 some ABIs may require the result to be returned like DImode. */
9081 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9082 regno = FIRST_MMX_REG;
9084 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9085 we prevent this case when sse is not available. However some ABIs
9086 may require the result to be returned like integer TImode. */
9087 else if (mode == TImode
9088 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9089 regno = FIRST_SSE_REG;
9091 /* 32-byte vector modes in %ymm0. */
9092 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9093 regno = FIRST_SSE_REG;
9095 /* 64-byte vector modes in %zmm0. */
9096 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9097 regno = FIRST_SSE_REG;
9099 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9100 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9101 regno = FIRST_FLOAT_REG;
9102 else
9103 /* Most things go in %eax. */
9104 regno = AX_REG;
9106 /* Override FP return register with %xmm0 for local functions when
9107 SSE math is enabled or for functions with sseregparm attribute. */
9108 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9110 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9111 if (sse_level == -1)
9113 error ("calling %qD with SSE calling convention without "
9114 "SSE/SSE2 enabled", fn);
9115 sorry ("this is a GCC bug that can be worked around by adding "
9116 "attribute used to function called");
9118 else if ((sse_level >= 1 && mode == SFmode)
9119 || (sse_level == 2 && mode == DFmode))
9120 regno = FIRST_SSE_REG;
9123 /* OImode shouldn't be used directly. */
9124 gcc_assert (mode != OImode);
9126 return gen_rtx_REG (orig_mode, regno);
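/* Illustrative note (not part of the original source): in 32-bit code an int
   is returned in %eax, a double in %st(0) by default, and a 16-byte vector
   in %xmm0; the sseregparm / local SSE-math override above moves SFmode and
   DFmode results into %xmm0 instead of the x87 stack.  */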
9129 static rtx
9130 function_value_64 (machine_mode orig_mode, machine_mode mode,
9131 const_tree valtype)
9133 rtx ret;
9135 /* Handle libcalls, which don't provide a type node. */
9136 if (valtype == NULL)
9138 unsigned int regno;
9140 switch (mode)
9142 case E_SFmode:
9143 case E_SCmode:
9144 case E_DFmode:
9145 case E_DCmode:
9146 case E_TFmode:
9147 case E_SDmode:
9148 case E_DDmode:
9149 case E_TDmode:
9150 regno = FIRST_SSE_REG;
9151 break;
9152 case E_XFmode:
9153 case E_XCmode:
9154 regno = FIRST_FLOAT_REG;
9155 break;
9156 case E_TCmode:
9157 return NULL;
9158 default:
9159 regno = AX_REG;
9162 return gen_rtx_REG (mode, regno);
9164 else if (POINTER_TYPE_P (valtype))
9166 /* Pointers are always returned in word_mode. */
9167 mode = word_mode;
9170 ret = construct_container (mode, orig_mode, valtype, 1,
9171 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9172 x86_64_int_return_registers, 0);
9174 /* For zero sized structures, construct_container returns NULL, but we
9175 need to keep the rest of the compiler happy by returning a meaningful value. */
9176 if (!ret)
9177 ret = gen_rtx_REG (orig_mode, AX_REG);
9179 return ret;
9182 static rtx
9183 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9184 const_tree valtype)
9186 unsigned int regno = AX_REG;
9188 if (TARGET_SSE)
9190 switch (GET_MODE_SIZE (mode))
9192 case 16:
9193 if (valtype != NULL_TREE
9194 && !VECTOR_INTEGER_TYPE_P (valtype)
9196 && !INTEGRAL_TYPE_P (valtype)
9197 && !VECTOR_FLOAT_TYPE_P (valtype))
9198 break;
9199 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9200 && !COMPLEX_MODE_P (mode))
9201 regno = FIRST_SSE_REG;
9202 break;
9203 case 8:
9204 case 4:
9205 if (mode == SFmode || mode == DFmode)
9206 regno = FIRST_SSE_REG;
9207 break;
9208 default:
9209 break;
9212 return gen_rtx_REG (orig_mode, regno);
9215 static rtx
9216 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9217 machine_mode orig_mode, machine_mode mode)
9219 const_tree fn, fntype;
9221 fn = NULL_TREE;
9222 if (fntype_or_decl && DECL_P (fntype_or_decl))
9223 fn = fntype_or_decl;
9224 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9226 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9227 || POINTER_BOUNDS_MODE_P (mode))
9228 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9229 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9230 return function_value_ms_64 (orig_mode, mode, valtype);
9231 else if (TARGET_64BIT)
9232 return function_value_64 (orig_mode, mode, valtype);
9233 else
9234 return function_value_32 (orig_mode, mode, fntype, fn);
9237 static rtx
9238 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9240 machine_mode mode, orig_mode;
9242 orig_mode = TYPE_MODE (valtype);
9243 mode = type_natural_mode (valtype, NULL, true);
9244 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9247 /* Return an RTX representing a place where a function returns
9248 or receives pointer bounds, or NULL if no bounds are returned.
9250 VALTYPE is a data type of a value returned by the function.
9252 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9253 or FUNCTION_TYPE of the function.
9255 If OUTGOING is false, return a place in which the caller will
9256 see the return value. Otherwise, return a place where a
9257 function returns a value. */
9259 static rtx
9260 ix86_function_value_bounds (const_tree valtype,
9261 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9262 bool outgoing ATTRIBUTE_UNUSED)
9264 rtx res = NULL_RTX;
9266 if (BOUNDED_TYPE_P (valtype))
9267 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9268 else if (chkp_type_has_pointer (valtype))
9270 bitmap slots;
9271 rtx bounds[2];
9272 bitmap_iterator bi;
9273 unsigned i, bnd_no = 0;
9275 bitmap_obstack_initialize (NULL);
9276 slots = BITMAP_ALLOC (NULL);
9277 chkp_find_bound_slots (valtype, slots);
9279 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9281 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9282 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9283 gcc_assert (bnd_no < 2);
9284 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9287 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9289 BITMAP_FREE (slots);
9290 bitmap_obstack_release (NULL);
9292 else
9293 res = NULL_RTX;
9295 return res;
9298 /* Pointer function arguments and return values are promoted to
9299 word_mode for normal functions. */
9301 static machine_mode
9302 ix86_promote_function_mode (const_tree type, machine_mode mode,
9303 int *punsignedp, const_tree fntype,
9304 int for_return)
9306 if (cfun->machine->func_type == TYPE_NORMAL
9307 && type != NULL_TREE
9308 && POINTER_TYPE_P (type))
9310 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9311 return word_mode;
9313 return default_promote_function_mode (type, mode, punsignedp, fntype,
9314 for_return);
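/* Illustrative note (not part of the original source): for a normal function
   the hook above widens pointer parameters and return values to word_mode,
   so under the x32 ABI (32-bit pointers on a 64-bit target) pointers are
   zero-extended to 64 bits at call boundaries, POINTERS_EXTEND_UNSIGNED
   selecting zero rather than sign extension.  */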
9317 /* Return true if a structure, union or array with MODE containing FIELD
9318 should be accessed using BLKmode. */
9320 static bool
9321 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9323 /* Union with XFmode must be in BLKmode. */
9324 return (mode == XFmode
9325 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9326 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9330 ix86_libcall_value (machine_mode mode)
9332 return ix86_function_value_1 (NULL, NULL, mode, mode);
9335 /* Return true iff type is returned in memory. */
9337 static bool
9338 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9340 #ifdef SUBTARGET_RETURN_IN_MEMORY
9341 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9342 #else
9343 const machine_mode mode = type_natural_mode (type, NULL, true);
9344 HOST_WIDE_INT size;
9346 if (POINTER_BOUNDS_TYPE_P (type))
9347 return false;
9349 if (TARGET_64BIT)
9351 if (ix86_function_type_abi (fntype) == MS_ABI)
9353 size = int_size_in_bytes (type);
9355 /* __m128 is returned in xmm0. */
9356 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9357 || INTEGRAL_TYPE_P (type)
9358 || VECTOR_FLOAT_TYPE_P (type))
9359 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9360 && !COMPLEX_MODE_P (mode)
9361 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9362 return false;
9364 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
9365 return size != 1 && size != 2 && size != 4 && size != 8;
9367 else
9369 int needed_intregs, needed_sseregs;
9371 return examine_argument (mode, type, 1,
9372 &needed_intregs, &needed_sseregs);
9375 else
9377 size = int_size_in_bytes (type);
9379 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9380 bytes in registers. */
9381 if (TARGET_IAMCU)
9382 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9384 if (mode == BLKmode)
9385 return true;
9387 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9388 return false;
9390 if (VECTOR_MODE_P (mode) || mode == TImode)
9392 /* User-created vectors small enough to fit in EAX. */
9393 if (size < 8)
9394 return false;
9396 /* Unless the ABI prescribes otherwise,
9397 MMX/3dNow values are returned in MM0 if available. */
9399 if (size == 8)
9400 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9402 /* SSE values are returned in XMM0 if available. */
9403 if (size == 16)
9404 return !TARGET_SSE;
9406 /* AVX values are returned in YMM0 if available. */
9407 if (size == 32)
9408 return !TARGET_AVX;
9410 /* AVX512F values are returned in ZMM0 if available. */
9411 if (size == 64)
9412 return !TARGET_AVX512F;
9415 if (mode == XFmode)
9416 return false;
9418 if (size > 12)
9419 return true;
9421 /* OImode shouldn't be used directly. */
9422 gcc_assert (mode != OImode);
9424 return false;
9426 #endif
9430 /* Create the va_list data type. */
9432 static tree
9433 ix86_build_builtin_va_list_64 (void)
9435 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9437 record = lang_hooks.types.make_type (RECORD_TYPE);
9438 type_decl = build_decl (BUILTINS_LOCATION,
9439 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9441 f_gpr = build_decl (BUILTINS_LOCATION,
9442 FIELD_DECL, get_identifier ("gp_offset"),
9443 unsigned_type_node);
9444 f_fpr = build_decl (BUILTINS_LOCATION,
9445 FIELD_DECL, get_identifier ("fp_offset"),
9446 unsigned_type_node);
9447 f_ovf = build_decl (BUILTINS_LOCATION,
9448 FIELD_DECL, get_identifier ("overflow_arg_area"),
9449 ptr_type_node);
9450 f_sav = build_decl (BUILTINS_LOCATION,
9451 FIELD_DECL, get_identifier ("reg_save_area"),
9452 ptr_type_node);
9454 va_list_gpr_counter_field = f_gpr;
9455 va_list_fpr_counter_field = f_fpr;
9457 DECL_FIELD_CONTEXT (f_gpr) = record;
9458 DECL_FIELD_CONTEXT (f_fpr) = record;
9459 DECL_FIELD_CONTEXT (f_ovf) = record;
9460 DECL_FIELD_CONTEXT (f_sav) = record;
9462 TYPE_STUB_DECL (record) = type_decl;
9463 TYPE_NAME (record) = type_decl;
9464 TYPE_FIELDS (record) = f_gpr;
9465 DECL_CHAIN (f_gpr) = f_fpr;
9466 DECL_CHAIN (f_fpr) = f_ovf;
9467 DECL_CHAIN (f_ovf) = f_sav;
9469 layout_type (record);
9471 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9472 NULL_TREE, TYPE_ATTRIBUTES (record));
9474 /* The correct type is an array type of one element. */
9475 return build_array_type (record, build_index_type (size_zero_node));
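/* For reference, the record built above corresponds roughly to the
   following C declaration from the SysV AMD64 psABI, with the va_list
   itself being an array of one such record:

       typedef struct {
         unsigned int gp_offset;
         unsigned int fp_offset;
         void *overflow_arg_area;
         void *reg_save_area;
       } __va_list_tag;
       typedef __va_list_tag __builtin_va_list[1];  */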
9478 /* Setup the builtin va_list data type and for 64-bit the additional
9479 calling convention specific va_list data types. */
9481 static tree
9482 ix86_build_builtin_va_list (void)
9484 if (TARGET_64BIT)
9486 /* Initialize ABI specific va_list builtin types.
9488 In lto1, we can encounter two va_list types:
9489 - one as a result of the type-merge across TUs, and
9490 - the one constructed here.
9491 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9492 a type identity check in canonical_va_list_type based on
9493 TYPE_MAIN_VARIANT (which we used to have) will not work.
9494 Instead, we tag each va_list_type_node with its unique attribute, and
9495 look for the attribute in the type identity check in
9496 canonical_va_list_type.
9498 Tagging sysv_va_list_type_node directly with the attribute is
9499 problematic since it's an array of one record, which will degrade into a
9500 pointer to record when used as a parameter (see build_va_arg comments for
9501 an example), dropping the attribute in the process. So we tag the
9502 record instead. */
9504 /* For SYSV_ABI we use an array of one record. */
9505 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9507 /* For MS_ABI we use plain pointer to argument area. */
9508 tree char_ptr_type = build_pointer_type (char_type_node);
9509 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9510 TYPE_ATTRIBUTES (char_ptr_type));
9511 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9513 return ((ix86_abi == MS_ABI)
9514 ? ms_va_list_type_node
9515 : sysv_va_list_type_node);
9517 else
9519 /* For i386 we use plain pointer to argument area. */
9520 return build_pointer_type (char_type_node);
9524 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9526 static void
9527 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9529 rtx save_area, mem;
9530 alias_set_type set;
9531 int i, max;
9533 /* GPR size of varargs save area. */
9534 if (cfun->va_list_gpr_size)
9535 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9536 else
9537 ix86_varargs_gpr_size = 0;
9539 /* FPR size of varargs save area. We don't need it if we don't pass
9540 anything in SSE registers. */
9541 if (TARGET_SSE && cfun->va_list_fpr_size)
9542 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9543 else
9544 ix86_varargs_fpr_size = 0;
9546 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9547 return;
9549 save_area = frame_pointer_rtx;
9550 set = get_varargs_alias_set ();
9552 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9553 if (max > X86_64_REGPARM_MAX)
9554 max = X86_64_REGPARM_MAX;
9556 for (i = cum->regno; i < max; i++)
9558 mem = gen_rtx_MEM (word_mode,
9559 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9560 MEM_NOTRAP_P (mem) = 1;
9561 set_mem_alias_set (mem, set);
9562 emit_move_insn (mem,
9563 gen_rtx_REG (word_mode,
9564 x86_64_int_parameter_registers[i]));
9567 if (ix86_varargs_fpr_size)
9569 machine_mode smode;
9570 rtx_code_label *label;
9571 rtx test;
9573 /* Now emit code to save SSE registers. The AX parameter contains number
9574 of SSE parameter registers used to call this function, though all we
9575 actually check here is the zero/non-zero status. */
9577 label = gen_label_rtx ();
9578 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9579 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9580 label));
9582 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9583 we used movdqa (i.e. TImode) instead? Perhaps even better would
9584 be if we could determine the real mode of the data, via a hook
9585 into pass_stdarg. Ignore all that for now. */
9586 smode = V4SFmode;
9587 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9588 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9590 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9591 if (max > X86_64_SSE_REGPARM_MAX)
9592 max = X86_64_SSE_REGPARM_MAX;
9594 for (i = cum->sse_regno; i < max; ++i)
9596 mem = plus_constant (Pmode, save_area,
9597 i * 16 + ix86_varargs_gpr_size);
9598 mem = gen_rtx_MEM (smode, mem);
9599 MEM_NOTRAP_P (mem) = 1;
9600 set_mem_alias_set (mem, set);
9601 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9603 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9606 emit_label (label);
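/* Roughly, the save area written above is laid out as follows, assuming
   the full X86_64_REGPARM_MAX == 6 and X86_64_SSE_REGPARM_MAX == 8
   (offsets are relative to the start of the save area):

       0 ..  47    six GPRs (rdi, rsi, rdx, rcx, r8, r9), 8 bytes each
      48 .. 175    eight SSE registers (xmm0 .. xmm7), 16 bytes each  */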
9610 static void
9611 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9613 alias_set_type set = get_varargs_alias_set ();
9614 int i;
9616 /* Reset to zero, as a sysv va_arg may have been used
9617 before. */
9618 ix86_varargs_gpr_size = 0;
9619 ix86_varargs_fpr_size = 0;
9621 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9623 rtx reg, mem;
9625 mem = gen_rtx_MEM (Pmode,
9626 plus_constant (Pmode, virtual_incoming_args_rtx,
9627 i * UNITS_PER_WORD));
9628 MEM_NOTRAP_P (mem) = 1;
9629 set_mem_alias_set (mem, set);
9631 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9632 emit_move_insn (mem, reg);
9636 static void
9637 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9638 tree type, int *, int no_rtl)
9640 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9641 CUMULATIVE_ARGS next_cum;
9642 tree fntype;
9644 /* This argument doesn't appear to be used anymore, which is good,
9645 because the old code here didn't suppress rtl generation. */
9646 gcc_assert (!no_rtl);
9648 if (!TARGET_64BIT)
9649 return;
9651 fntype = TREE_TYPE (current_function_decl);
9653 /* For varargs, we do not want to skip the dummy va_dcl argument.
9654 For stdargs, we do want to skip the last named argument. */
9655 next_cum = *cum;
9656 if (stdarg_p (fntype))
9657 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9658 true);
9660 if (cum->call_abi == MS_ABI)
9661 setup_incoming_varargs_ms_64 (&next_cum);
9662 else
9663 setup_incoming_varargs_64 (&next_cum);
9666 static void
9667 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9668 machine_mode mode,
9669 tree type,
9670 int *pretend_size ATTRIBUTE_UNUSED,
9671 int no_rtl)
9673 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9674 CUMULATIVE_ARGS next_cum;
9675 tree fntype;
9676 rtx save_area;
9677 int bnd_reg, i, max;
9679 gcc_assert (!no_rtl);
9681 /* Do nothing if we use plain pointer to argument area. */
9682 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9683 return;
9685 fntype = TREE_TYPE (current_function_decl);
9687 /* For varargs, we do not want to skip the dummy va_dcl argument.
9688 For stdargs, we do want to skip the last named argument. */
9689 next_cum = *cum;
9690 if (stdarg_p (fntype))
9691 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9692 true);
9693 save_area = frame_pointer_rtx;
9695 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9696 if (max > X86_64_REGPARM_MAX)
9697 max = X86_64_REGPARM_MAX;
9699 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9700 if (chkp_function_instrumented_p (current_function_decl))
9701 for (i = cum->regno; i < max; i++)
9703 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9704 rtx ptr = gen_rtx_REG (Pmode,
9705 x86_64_int_parameter_registers[i]);
9706 rtx bounds;
9708 if (bnd_reg <= LAST_BND_REG)
9709 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9710 else
9712 rtx ldx_addr =
9713 plus_constant (Pmode, arg_pointer_rtx,
9714 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9715 bounds = gen_reg_rtx (BNDmode);
9716 emit_insn (BNDmode == BND64mode
9717 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9718 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9721 emit_insn (BNDmode == BND64mode
9722 ? gen_bnd64_stx (addr, ptr, bounds)
9723 : gen_bnd32_stx (addr, ptr, bounds));
9725 bnd_reg++;
9730 /* Check whether TYPE is a char * style va_list. */
9732 static bool
9733 is_va_list_char_pointer (tree type)
9735 tree canonic;
9737 /* For 32-bit it is always true. */
9738 if (!TARGET_64BIT)
9739 return true;
9740 canonic = ix86_canonical_va_list_type (type);
9741 return (canonic == ms_va_list_type_node
9742 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9745 /* Implement va_start. */
9747 static void
9748 ix86_va_start (tree valist, rtx nextarg)
9750 HOST_WIDE_INT words, n_gpr, n_fpr;
9751 tree f_gpr, f_fpr, f_ovf, f_sav;
9752 tree gpr, fpr, ovf, sav, t;
9753 tree type;
9754 rtx ovf_rtx;
9756 if (flag_split_stack
9757 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9759 unsigned int scratch_regno;
9761 /* When we are splitting the stack, we can't refer to the stack
9762 arguments using internal_arg_pointer, because they may be on
9763 the old stack. The split stack prologue will arrange to
9764 leave a pointer to the old stack arguments in a scratch
9765 register, which we here copy to a pseudo-register. The split
9766 stack prologue can't set the pseudo-register directly because
9767 it (the prologue) runs before any registers have been saved. */
9769 scratch_regno = split_stack_prologue_scratch_regno ();
9770 if (scratch_regno != INVALID_REGNUM)
9772 rtx reg;
9773 rtx_insn *seq;
9775 reg = gen_reg_rtx (Pmode);
9776 cfun->machine->split_stack_varargs_pointer = reg;
9778 start_sequence ();
9779 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9780 seq = get_insns ();
9781 end_sequence ();
9783 push_topmost_sequence ();
9784 emit_insn_after (seq, entry_of_function ());
9785 pop_topmost_sequence ();
9789 /* Only the 64-bit target needs something special. */
9790 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9792 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9793 std_expand_builtin_va_start (valist, nextarg);
9794 else
9796 rtx va_r, next;
9798 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9799 next = expand_binop (ptr_mode, add_optab,
9800 cfun->machine->split_stack_varargs_pointer,
9801 crtl->args.arg_offset_rtx,
9802 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9803 convert_move (va_r, next, 0);
9805 /* Store zero bounds for va_list. */
9806 if (chkp_function_instrumented_p (current_function_decl))
9807 chkp_expand_bounds_reset_for_mem (valist,
9808 make_tree (TREE_TYPE (valist),
9809 next));
9812 return;
9815 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9816 f_fpr = DECL_CHAIN (f_gpr);
9817 f_ovf = DECL_CHAIN (f_fpr);
9818 f_sav = DECL_CHAIN (f_ovf);
9820 valist = build_simple_mem_ref (valist);
9821 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9822 /* The following should be folded into the MEM_REF offset. */
9823 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9824 f_gpr, NULL_TREE);
9825 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9826 f_fpr, NULL_TREE);
9827 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9828 f_ovf, NULL_TREE);
9829 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9830 f_sav, NULL_TREE);
9832 /* Count number of gp and fp argument registers used. */
9833 words = crtl->args.info.words;
9834 n_gpr = crtl->args.info.regno;
9835 n_fpr = crtl->args.info.sse_regno;
9837 if (cfun->va_list_gpr_size)
9839 type = TREE_TYPE (gpr);
9840 t = build2 (MODIFY_EXPR, type,
9841 gpr, build_int_cst (type, n_gpr * 8));
9842 TREE_SIDE_EFFECTS (t) = 1;
9843 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9846 if (TARGET_SSE && cfun->va_list_fpr_size)
9848 type = TREE_TYPE (fpr);
9849 t = build2 (MODIFY_EXPR, type, fpr,
9850 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9851 TREE_SIDE_EFFECTS (t) = 1;
9852 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9855 /* Find the overflow area. */
9856 type = TREE_TYPE (ovf);
9857 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9858 ovf_rtx = crtl->args.internal_arg_pointer;
9859 else
9860 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9861 t = make_tree (type, ovf_rtx);
9862 if (words != 0)
9863 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9865 /* Store zero bounds for overflow area pointer. */
9866 if (chkp_function_instrumented_p (current_function_decl))
9867 chkp_expand_bounds_reset_for_mem (ovf, t);
9869 t = build2 (MODIFY_EXPR, type, ovf, t);
9870 TREE_SIDE_EFFECTS (t) = 1;
9871 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9873 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9875 /* Find the register save area.
9876 The function prologue saves it right above the stack frame. */
9877 type = TREE_TYPE (sav);
9878 t = make_tree (type, frame_pointer_rtx);
9879 if (!ix86_varargs_gpr_size)
9880 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
9882 /* Store zero bounds for save area pointer. */
9883 if (chkp_function_instrumented_p (current_function_decl))
9884 chkp_expand_bounds_reset_for_mem (sav, t);
9886 t = build2 (MODIFY_EXPR, type, sav, t);
9887 TREE_SIDE_EFFECTS (t) = 1;
9888 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
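/* In effect, for the SysV case the expansion above initializes the
   va_list roughly as if by the following, where n_gpr and n_fpr count
   the argument registers already consumed by named arguments:

       ap->gp_offset = n_gpr * 8;
       ap->fp_offset = 8 * X86_64_REGPARM_MAX + n_fpr * 16;
       ap->overflow_arg_area = internal arg pointer + words * UNITS_PER_WORD;
       ap->reg_save_area = the register save area set up by the prologue;  */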
9892 /* Implement va_arg. */
9894 static tree
9895 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9896 gimple_seq *post_p)
9898 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
9899 tree f_gpr, f_fpr, f_ovf, f_sav;
9900 tree gpr, fpr, ovf, sav, t;
9901 int size, rsize;
9902 tree lab_false, lab_over = NULL_TREE;
9903 tree addr, t2;
9904 rtx container;
9905 int indirect_p = 0;
9906 tree ptrtype;
9907 machine_mode nat_mode;
9908 unsigned int arg_boundary;
9910 /* Only the 64-bit target needs something special. */
9911 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9912 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
9914 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9915 f_fpr = DECL_CHAIN (f_gpr);
9916 f_ovf = DECL_CHAIN (f_fpr);
9917 f_sav = DECL_CHAIN (f_ovf);
9919 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
9920 valist, f_gpr, NULL_TREE);
9922 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
9923 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
9924 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
9926 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9927 if (indirect_p)
9928 type = build_pointer_type (type);
9929 size = arg_int_size_in_bytes (type);
9930 rsize = CEIL (size, UNITS_PER_WORD);
9932 nat_mode = type_natural_mode (type, NULL, false);
9933 switch (nat_mode)
9935 case E_V8SFmode:
9936 case E_V8SImode:
9937 case E_V32QImode:
9938 case E_V16HImode:
9939 case E_V4DFmode:
9940 case E_V4DImode:
9941 case E_V16SFmode:
9942 case E_V16SImode:
9943 case E_V64QImode:
9944 case E_V32HImode:
9945 case E_V8DFmode:
9946 case E_V8DImode:
9947 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
9948 if (!TARGET_64BIT_MS_ABI)
9950 container = NULL;
9951 break;
9953 /* FALLTHRU */
9955 default:
9956 container = construct_container (nat_mode, TYPE_MODE (type),
9957 type, 0, X86_64_REGPARM_MAX,
9958 X86_64_SSE_REGPARM_MAX, intreg,
9960 break;
9963 /* Pull the value out of the saved registers. */
9965 addr = create_tmp_var (ptr_type_node, "addr");
9967 if (container)
9969 int needed_intregs, needed_sseregs;
9970 bool need_temp;
9971 tree int_addr, sse_addr;
9973 lab_false = create_artificial_label (UNKNOWN_LOCATION);
9974 lab_over = create_artificial_label (UNKNOWN_LOCATION);
9976 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
9978 need_temp = (!REG_P (container)
9979 && ((needed_intregs && TYPE_ALIGN (type) > 64)
9980 || TYPE_ALIGN (type) > 128));
9982 /* In case we are passing a structure, verify that it is a consecutive block
9983 in the register save area. If not, we need to do moves. */
9984 if (!need_temp && !REG_P (container))
9986 /* Verify that all registers are strictly consecutive */
9987 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
9989 int i;
9991 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9993 rtx slot = XVECEXP (container, 0, i);
9994 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
9995 || INTVAL (XEXP (slot, 1)) != i * 16)
9996 need_temp = true;
9999 else
10001 int i;
10003 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10005 rtx slot = XVECEXP (container, 0, i);
10006 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
10007 || INTVAL (XEXP (slot, 1)) != i * 8)
10008 need_temp = true;
10012 if (!need_temp)
10014 int_addr = addr;
10015 sse_addr = addr;
10017 else
10019 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10020 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10023 /* First ensure that we fit completely in registers. */
10024 if (needed_intregs)
10026 t = build_int_cst (TREE_TYPE (gpr),
10027 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10028 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10029 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10030 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10031 gimplify_and_add (t, pre_p);
10033 if (needed_sseregs)
10035 t = build_int_cst (TREE_TYPE (fpr),
10036 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10037 + X86_64_REGPARM_MAX * 8);
10038 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10039 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10040 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10041 gimplify_and_add (t, pre_p);
10044 /* Compute index to start of area used for integer regs. */
10045 if (needed_intregs)
10047 /* int_addr = gpr + sav; */
10048 t = fold_build_pointer_plus (sav, gpr);
10049 gimplify_assign (int_addr, t, pre_p);
10051 if (needed_sseregs)
10053 /* sse_addr = fpr + sav; */
10054 t = fold_build_pointer_plus (sav, fpr);
10055 gimplify_assign (sse_addr, t, pre_p);
10057 if (need_temp)
10059 int i, prev_size = 0;
10060 tree temp = create_tmp_var (type, "va_arg_tmp");
10062 /* addr = &temp; */
10063 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10064 gimplify_assign (addr, t, pre_p);
10066 for (i = 0; i < XVECLEN (container, 0); i++)
10068 rtx slot = XVECEXP (container, 0, i);
10069 rtx reg = XEXP (slot, 0);
10070 machine_mode mode = GET_MODE (reg);
10071 tree piece_type;
10072 tree addr_type;
10073 tree daddr_type;
10074 tree src_addr, src;
10075 int src_offset;
10076 tree dest_addr, dest;
10077 int cur_size = GET_MODE_SIZE (mode);
10079 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10080 prev_size = INTVAL (XEXP (slot, 1));
10081 if (prev_size + cur_size > size)
10083 cur_size = size - prev_size;
10084 unsigned int nbits = cur_size * BITS_PER_UNIT;
10085 if (!int_mode_for_size (nbits, 1).exists (&mode))
10086 mode = QImode;
10088 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10089 if (mode == GET_MODE (reg))
10090 addr_type = build_pointer_type (piece_type);
10091 else
10092 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10093 true);
10094 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10095 true);
10097 if (SSE_REGNO_P (REGNO (reg)))
10099 src_addr = sse_addr;
10100 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10102 else
10104 src_addr = int_addr;
10105 src_offset = REGNO (reg) * 8;
10107 src_addr = fold_convert (addr_type, src_addr);
10108 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10110 dest_addr = fold_convert (daddr_type, addr);
10111 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10112 if (cur_size == GET_MODE_SIZE (mode))
10114 src = build_va_arg_indirect_ref (src_addr);
10115 dest = build_va_arg_indirect_ref (dest_addr);
10117 gimplify_assign (dest, src, pre_p);
10119 else
10121 tree copy
10122 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10123 3, dest_addr, src_addr,
10124 size_int (cur_size));
10125 gimplify_and_add (copy, pre_p);
10127 prev_size += cur_size;
10131 if (needed_intregs)
10133 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10134 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10135 gimplify_assign (gpr, t, pre_p);
10138 if (needed_sseregs)
10140 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10141 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10142 gimplify_assign (unshare_expr (fpr), t, pre_p);
10145 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10147 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10150 /* ... otherwise out of the overflow area. */
10152 /* When we align a parameter on the stack for the caller, if its
10153 alignment exceeds MAX_SUPPORTED_STACK_ALIGNMENT, it will be
10154 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
10155 with the caller here. */
10156 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10157 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10158 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10160 /* Care for on-stack alignment if needed. */
10161 if (arg_boundary <= 64 || size == 0)
10162 t = ovf;
10163 else
10165 HOST_WIDE_INT align = arg_boundary / 8;
10166 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10167 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10168 build_int_cst (TREE_TYPE (t), -align));
10171 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10172 gimplify_assign (addr, t, pre_p);
10174 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10175 gimplify_assign (unshare_expr (ovf), t, pre_p);
10177 if (container)
10178 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10180 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10181 addr = fold_convert (ptrtype, addr);
10183 if (indirect_p)
10184 addr = build_va_arg_indirect_ref (addr);
10185 return build_va_arg_indirect_ref (addr);
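/* The GIMPLE emitted above follows the usual psABI va_arg scheme; as a
   rough sketch, for a value needing needed_intregs GPRs and
   needed_sseregs SSE registers:

       if (ap->gp_offset <= 48 - needed_intregs * 8
           && ap->fp_offset <= 176 - needed_sseregs * 16)
         {
           read the value from ap->reg_save_area at gp_offset/fp_offset;
           ap->gp_offset += needed_intregs * 8;
           ap->fp_offset += needed_sseregs * 16;
         }
       else
         {
           align ap->overflow_arg_area if the type requires it;
           read the value from ap->overflow_arg_area;
           ap->overflow_arg_area += rsize * UNITS_PER_WORD;
         }  */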
10188 /* Return true if OPNUM's MEM should be matched
10189 in movabs* patterns. */
10191 bool
10192 ix86_check_movabs (rtx insn, int opnum)
10194 rtx set, mem;
10196 set = PATTERN (insn);
10197 if (GET_CODE (set) == PARALLEL)
10198 set = XVECEXP (set, 0, 0);
10199 gcc_assert (GET_CODE (set) == SET);
10200 mem = XEXP (set, opnum);
10201 while (SUBREG_P (mem))
10202 mem = SUBREG_REG (mem);
10203 gcc_assert (MEM_P (mem));
10204 return volatile_ok || !MEM_VOLATILE_P (mem);
10207 /* Return false if INSN contains a MEM with a non-default address space. */
10208 bool
10209 ix86_check_no_addr_space (rtx insn)
10211 subrtx_var_iterator::array_type array;
10212 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10214 rtx x = *iter;
10215 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10216 return false;
10218 return true;
10221 /* Initialize the table of extra 80387 mathematical constants. */
10223 static void
10224 init_ext_80387_constants (void)
10226 static const char * cst[5] =
10228 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10229 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10230 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10231 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10232 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10234 int i;
10236 for (i = 0; i < 5; i++)
10238 real_from_string (&ext_80387_constants_table[i], cst[i]);
10239 /* Ensure each constant is rounded to XFmode precision. */
10240 real_convert (&ext_80387_constants_table[i],
10241 XFmode, &ext_80387_constants_table[i]);
10244 ext_80387_constants_init = 1;
10247 /* Return non-zero if the constant is something that
10248 can be loaded with a special instruction. */
10251 standard_80387_constant_p (rtx x)
10253 machine_mode mode = GET_MODE (x);
10255 const REAL_VALUE_TYPE *r;
10257 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10258 return -1;
10260 if (x == CONST0_RTX (mode))
10261 return 1;
10262 if (x == CONST1_RTX (mode))
10263 return 2;
10265 r = CONST_DOUBLE_REAL_VALUE (x);
10267 /* For XFmode constants, try to find a special 80387 instruction when
10268 optimizing for size or on those CPUs that benefit from them. */
10269 if (mode == XFmode
10270 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10272 int i;
10274 if (! ext_80387_constants_init)
10275 init_ext_80387_constants ();
10277 for (i = 0; i < 5; i++)
10278 if (real_identical (r, &ext_80387_constants_table[i]))
10279 return i + 3;
10282 /* Load of the constant -0.0 or -1.0 will be split as
10283 fldz;fchs or fld1;fchs sequence. */
10284 if (real_isnegzero (r))
10285 return 8;
10286 if (real_identical (r, &dconstm1))
10287 return 9;
10289 return 0;
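/* The return values above encode: 0 = not a special constant, 1 = fldz,
   2 = fld1, 3 .. 7 = fldlg2 / fldln2 / fldl2e / fldl2t / fldpi,
   8 = fldz;fchs and 9 = fld1;fchs, matching the switch in
   standard_80387_constant_opcode below.  */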
10292 /* Return the opcode of the special instruction to be used to load
10293 the constant X. */
10295 const char *
10296 standard_80387_constant_opcode (rtx x)
10298 switch (standard_80387_constant_p (x))
10300 case 1:
10301 return "fldz";
10302 case 2:
10303 return "fld1";
10304 case 3:
10305 return "fldlg2";
10306 case 4:
10307 return "fldln2";
10308 case 5:
10309 return "fldl2e";
10310 case 6:
10311 return "fldl2t";
10312 case 7:
10313 return "fldpi";
10314 case 8:
10315 case 9:
10316 return "#";
10317 default:
10318 gcc_unreachable ();
10322 /* Return the CONST_DOUBLE representing the 80387 constant that is
10323 loaded by the specified special instruction. The argument IDX
10324 matches the return value from standard_80387_constant_p. */
10327 standard_80387_constant_rtx (int idx)
10329 int i;
10331 if (! ext_80387_constants_init)
10332 init_ext_80387_constants ();
10334 switch (idx)
10336 case 3:
10337 case 4:
10338 case 5:
10339 case 6:
10340 case 7:
10341 i = idx - 3;
10342 break;
10344 default:
10345 gcc_unreachable ();
10348 return const_double_from_real_value (ext_80387_constants_table[i],
10349 XFmode);
10352 /* Return 1 if X is all-bits-zero and 2 if X is all-bits-one,
10353 in a supported SSE/AVX vector mode. */
10356 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10358 machine_mode mode;
10360 if (!TARGET_SSE)
10361 return 0;
10363 mode = GET_MODE (x);
10365 if (x == const0_rtx || const0_operand (x, mode))
10366 return 1;
10368 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10370 /* VOIDmode integer constant, get mode from the predicate. */
10371 if (mode == VOIDmode)
10372 mode = pred_mode;
10374 switch (GET_MODE_SIZE (mode))
10376 case 64:
10377 if (TARGET_AVX512F)
10378 return 2;
10379 break;
10380 case 32:
10381 if (TARGET_AVX2)
10382 return 2;
10383 break;
10384 case 16:
10385 if (TARGET_SSE2)
10386 return 2;
10387 break;
10388 case 0:
10389 /* VOIDmode */
10390 gcc_unreachable ();
10391 default:
10392 break;
10396 return 0;
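/* For example, CONST0_RTX (V4SFmode) yields 1 (loadable with a single
   xorps/pxor), while an all-ones constant in V8SImode yields 2 only when
   TARGET_AVX2 is set and 0 otherwise; the nonzero values correspond to
   the templates chosen by standard_sse_constant_opcode below.  */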
10399 /* Return the opcode of the special instruction to be used to load
10400 the constant operands[1] into operands[0]. */
10402 const char *
10403 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10405 machine_mode mode;
10406 rtx x = operands[1];
10408 gcc_assert (TARGET_SSE);
10410 mode = GET_MODE (x);
10412 if (x == const0_rtx || const0_operand (x, mode))
10414 switch (get_attr_mode (insn))
10416 case MODE_TI:
10417 if (!EXT_REX_SSE_REG_P (operands[0]))
10418 return "%vpxor\t%0, %d0";
10419 /* FALLTHRU */
10420 case MODE_XI:
10421 case MODE_OI:
10422 if (EXT_REX_SSE_REG_P (operands[0]))
10423 return (TARGET_AVX512VL
10424 ? "vpxord\t%x0, %x0, %x0"
10425 : "vpxord\t%g0, %g0, %g0");
10426 return "vpxor\t%x0, %x0, %x0";
10428 case MODE_V2DF:
10429 if (!EXT_REX_SSE_REG_P (operands[0]))
10430 return "%vxorpd\t%0, %d0";
10431 /* FALLTHRU */
10432 case MODE_V8DF:
10433 case MODE_V4DF:
10434 if (!EXT_REX_SSE_REG_P (operands[0]))
10435 return "vxorpd\t%x0, %x0, %x0";
10436 else if (TARGET_AVX512DQ)
10437 return (TARGET_AVX512VL
10438 ? "vxorpd\t%x0, %x0, %x0"
10439 : "vxorpd\t%g0, %g0, %g0");
10440 else
10441 return (TARGET_AVX512VL
10442 ? "vpxorq\t%x0, %x0, %x0"
10443 : "vpxorq\t%g0, %g0, %g0");
10445 case MODE_V4SF:
10446 if (!EXT_REX_SSE_REG_P (operands[0]))
10447 return "%vxorps\t%0, %d0";
10448 /* FALLTHRU */
10449 case MODE_V16SF:
10450 case MODE_V8SF:
10451 if (!EXT_REX_SSE_REG_P (operands[0]))
10452 return "vxorps\t%x0, %x0, %x0";
10453 else if (TARGET_AVX512DQ)
10454 return (TARGET_AVX512VL
10455 ? "vxorps\t%x0, %x0, %x0"
10456 : "vxorps\t%g0, %g0, %g0");
10457 else
10458 return (TARGET_AVX512VL
10459 ? "vpxord\t%x0, %x0, %x0"
10460 : "vpxord\t%g0, %g0, %g0");
10462 default:
10463 gcc_unreachable ();
10466 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10468 enum attr_mode insn_mode = get_attr_mode (insn);
10470 switch (insn_mode)
10472 case MODE_XI:
10473 case MODE_V8DF:
10474 case MODE_V16SF:
10475 gcc_assert (TARGET_AVX512F);
10476 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10478 case MODE_OI:
10479 case MODE_V4DF:
10480 case MODE_V8SF:
10481 gcc_assert (TARGET_AVX2);
10482 /* FALLTHRU */
10483 case MODE_TI:
10484 case MODE_V2DF:
10485 case MODE_V4SF:
10486 gcc_assert (TARGET_SSE2);
10487 if (!EXT_REX_SSE_REG_P (operands[0]))
10488 return (TARGET_AVX
10489 ? "vpcmpeqd\t%0, %0, %0"
10490 : "pcmpeqd\t%0, %0");
10491 else if (TARGET_AVX512VL)
10492 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10493 else
10494 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10496 default:
10497 gcc_unreachable ();
10501 gcc_unreachable ();
10504 /* Returns true if INSN can be transformed from a memory load
10505 to a supported FP constant load. */
10507 bool
10508 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10510 rtx src = find_constant_src (insn);
10512 gcc_assert (REG_P (dst));
10514 if (src == NULL
10515 || (SSE_REGNO_P (REGNO (dst))
10516 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10517 || (STACK_REGNO_P (REGNO (dst))
10518 && standard_80387_constant_p (src) < 1))
10519 return false;
10521 return true;
10524 /* Returns true if OP contains a symbol reference */
10526 bool
10527 symbolic_reference_mentioned_p (rtx op)
10529 const char *fmt;
10530 int i;
10532 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10533 return true;
10535 fmt = GET_RTX_FORMAT (GET_CODE (op));
10536 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10538 if (fmt[i] == 'E')
10540 int j;
10542 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10543 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10544 return true;
10547 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10548 return true;
10551 return false;
10554 /* Return true if it is appropriate to emit `ret' instructions in the
10555 body of a function. Do this only if the epilogue is simple, needing a
10556 couple of insns. Prior to reloading, we can't tell how many registers
10557 must be saved, so return false then. Return false if there is no frame
10558 marker to de-allocate. */
10560 bool
10561 ix86_can_use_return_insn_p (void)
10563 if (ix86_function_naked (current_function_decl))
10564 return false;
10566 /* Don't use `ret' instruction in interrupt handler. */
10567 if (! reload_completed
10568 || frame_pointer_needed
10569 || cfun->machine->func_type != TYPE_NORMAL)
10570 return 0;
10572 /* Don't allow more than 32k pop, since that's all we can do
10573 with one instruction. */
10574 if (crtl->args.pops_args && crtl->args.size >= 32768)
10575 return 0;
10577 struct ix86_frame &frame = cfun->machine->frame;
10578 return (frame.stack_pointer_offset == UNITS_PER_WORD
10579 && (frame.nregs + frame.nsseregs) == 0);
10582 /* Value should be nonzero if functions must have frame pointers.
10583 Zero means the frame pointer need not be set up (and parms may
10584 be accessed via the stack pointer) in functions that seem suitable. */
10586 static bool
10587 ix86_frame_pointer_required (void)
10589 /* If we accessed previous frames, then the generated code expects
10590 to be able to access the saved ebp value in our frame. */
10591 if (cfun->machine->accesses_prev_frame)
10592 return true;
10594 /* Several x86 OSes need a frame pointer for other reasons,
10595 usually pertaining to setjmp. */
10596 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10597 return true;
10599 /* For older 32-bit runtimes setjmp requires a valid frame pointer. */
10600 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10601 return true;
10603 /* With Win64 SEH, very large frames need a frame pointer since the maximum
10604 stack allocation is 4GB. */
10605 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10606 return true;
10608 /* SSE saves require a frame pointer when the stack is misaligned. */
10609 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10610 return true;
10612 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10613 turns off the frame pointer by default. Turn it back on now if
10614 we've not got a leaf function. */
10615 if (TARGET_OMIT_LEAF_FRAME_POINTER
10616 && (!crtl->is_leaf
10617 || ix86_current_function_calls_tls_descriptor))
10618 return true;
10620 if (crtl->profile && !flag_fentry)
10621 return true;
10623 return false;
10626 /* Record that the current function accesses previous call frames. */
10628 void
10629 ix86_setup_frame_addresses (void)
10631 cfun->machine->accesses_prev_frame = 1;
10634 #ifndef USE_HIDDEN_LINKONCE
10635 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10636 # define USE_HIDDEN_LINKONCE 1
10637 # else
10638 # define USE_HIDDEN_LINKONCE 0
10639 # endif
10640 #endif
10642 static int pic_labels_used;
10644 /* Fills in the label name that should be used for a pc thunk for
10645 the given register. */
10647 static void
10648 get_pc_thunk_name (char name[32], unsigned int regno)
10650 gcc_assert (!TARGET_64BIT);
10652 if (USE_HIDDEN_LINKONCE)
10653 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
10654 else
10655 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
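/* For example, with USE_HIDDEN_LINKONCE the thunk for %ebx is named
   "__x86.get_pc_thunk.bx".  */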
10659 /* This function generates the -fpic thunks that load a given register with
10660 the return address of the caller and then return. */
10662 static void
10663 ix86_code_end (void)
10665 rtx xops[2];
10666 int regno;
10668 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
10670 char name[32];
10671 tree decl;
10673 if (!(pic_labels_used & (1 << regno)))
10674 continue;
10676 get_pc_thunk_name (name, regno);
10678 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10679 get_identifier (name),
10680 build_function_type_list (void_type_node, NULL_TREE));
10681 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10682 NULL_TREE, void_type_node);
10683 TREE_PUBLIC (decl) = 1;
10684 TREE_STATIC (decl) = 1;
10685 DECL_IGNORED_P (decl) = 1;
10687 #if TARGET_MACHO
10688 if (TARGET_MACHO)
10690 switch_to_section (darwin_sections[picbase_thunk_section]);
10691 fputs ("\t.weak_definition\t", asm_out_file);
10692 assemble_name (asm_out_file, name);
10693 fputs ("\n\t.private_extern\t", asm_out_file);
10694 assemble_name (asm_out_file, name);
10695 putc ('\n', asm_out_file);
10696 ASM_OUTPUT_LABEL (asm_out_file, name);
10697 DECL_WEAK (decl) = 1;
10699 else
10700 #endif
10701 if (USE_HIDDEN_LINKONCE)
10703 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10705 targetm.asm_out.unique_section (decl, 0);
10706 switch_to_section (get_named_section (decl, NULL, 0));
10708 targetm.asm_out.globalize_label (asm_out_file, name);
10709 fputs ("\t.hidden\t", asm_out_file);
10710 assemble_name (asm_out_file, name);
10711 putc ('\n', asm_out_file);
10712 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10714 else
10716 switch_to_section (text_section);
10717 ASM_OUTPUT_LABEL (asm_out_file, name);
10720 DECL_INITIAL (decl) = make_node (BLOCK);
10721 current_function_decl = decl;
10722 allocate_struct_function (decl, false);
10723 init_function_start (decl);
10724 /* We're about to hide the function body from callees of final_* by
10725 emitting it directly; tell them we're a thunk, if they care. */
10726 cfun->is_thunk = true;
10727 first_function_block_is_cold = false;
10728 /* Make sure unwind info is emitted for the thunk if needed. */
10729 final_start_function (emit_barrier (), asm_out_file, 1);
10731 /* Pad stack IP move with 4 instructions (two NOPs count
10732 as one instruction). */
10733 if (TARGET_PAD_SHORT_FUNCTION)
10735 int i = 8;
10737 while (i--)
10738 fputs ("\tnop\n", asm_out_file);
10741 xops[0] = gen_rtx_REG (Pmode, regno);
10742 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
10743 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
10744 output_asm_insn ("%!ret", NULL);
10745 final_end_function ();
10746 init_insn_lengths ();
10747 free_after_compilation (cfun);
10748 set_cfun (NULL);
10749 current_function_decl = NULL;
10752 if (flag_split_stack)
10753 file_end_indicate_split_stack ();
10756 /* Emit code for the SET_GOT patterns. */
10758 const char *
10759 output_set_got (rtx dest, rtx label)
10761 rtx xops[3];
10763 xops[0] = dest;
10765 if (TARGET_VXWORKS_RTP && flag_pic)
10767 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
10768 xops[2] = gen_rtx_MEM (Pmode,
10769 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
10770 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
10772 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
10773 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
10774 an unadorned address. */
10775 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
10776 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
10777 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
10778 return "";
10781 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
10783 if (flag_pic)
10785 char name[32];
10786 get_pc_thunk_name (name, REGNO (dest));
10787 pic_labels_used |= 1 << REGNO (dest);
10789 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
10790 xops[2] = gen_rtx_MEM (QImode, xops[2]);
10791 output_asm_insn ("%!call\t%X2", xops);
10793 #if TARGET_MACHO
10794 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
10795 This is what will be referenced by the Mach-O PIC subsystem. */
10796 if (machopic_should_output_picbase_label () || !label)
10797 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
10799 /* When we are restoring the pic base at the site of a nonlocal label,
10800 and we decided to emit the pic base above, we will still output a
10801 local label used for calculating the correction offset (even though
10802 the offset will be 0 in that case). */
10803 if (label)
10804 targetm.asm_out.internal_label (asm_out_file, "L",
10805 CODE_LABEL_NUMBER (label));
10806 #endif
10808 else
10810 if (TARGET_MACHO)
10811 /* We don't need a pic base, we're not producing pic. */
10812 gcc_unreachable ();
10814 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
10815 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
10816 targetm.asm_out.internal_label (asm_out_file, "L",
10817 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
10820 if (!TARGET_MACHO)
10821 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
10823 return "";
10826 /* Generate a "push" pattern for input ARG. */
10828 static rtx
10829 gen_push (rtx arg)
10831 struct machine_function *m = cfun->machine;
10833 if (m->fs.cfa_reg == stack_pointer_rtx)
10834 m->fs.cfa_offset += UNITS_PER_WORD;
10835 m->fs.sp_offset += UNITS_PER_WORD;
10837 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10838 arg = gen_rtx_REG (word_mode, REGNO (arg));
10840 return gen_rtx_SET (gen_rtx_MEM (word_mode,
10841 gen_rtx_PRE_DEC (Pmode,
10842 stack_pointer_rtx)),
10843 arg);
10846 /* Generate a "pop" pattern for input ARG. */
10848 static rtx
10849 gen_pop (rtx arg)
10851 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10852 arg = gen_rtx_REG (word_mode, REGNO (arg));
10854 return gen_rtx_SET (arg,
10855 gen_rtx_MEM (word_mode,
10856 gen_rtx_POST_INC (Pmode,
10857 stack_pointer_rtx)));
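/* For illustration, on a 64-bit target the two helpers above produce RTL
   of the shape:

       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI src))    ;; gen_push
       (set (reg:DI dst) (mem:DI (post_inc:DI (reg:DI sp))))   ;; gen_pop  */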
10860 /* Return >= 0 if there is an unused call-clobbered register available
10861 for the entire function. */
10863 static unsigned int
10864 ix86_select_alt_pic_regnum (void)
10866 if (ix86_use_pseudo_pic_reg ())
10867 return INVALID_REGNUM;
10869 if (crtl->is_leaf
10870 && !crtl->profile
10871 && !ix86_current_function_calls_tls_descriptor)
10873 int i, drap;
10874 /* Can't use the same register for both PIC and DRAP. */
10875 if (crtl->drap_reg)
10876 drap = REGNO (crtl->drap_reg);
10877 else
10878 drap = -1;
10879 for (i = 2; i >= 0; --i)
10880 if (i != drap && !df_regs_ever_live_p (i))
10881 return i;
10884 return INVALID_REGNUM;
10887 /* Return true if REGNO is used by the epilogue. */
10889 bool
10890 ix86_epilogue_uses (int regno)
10892 /* If there are no caller-saved registers, we preserve all registers,
10893 except for MMX and x87 registers which aren't supported when saving
10894 and restoring registers. Don't explicitly save SP register since
10895 it is always preserved. */
10896 return (epilogue_completed
10897 && cfun->machine->no_caller_saved_registers
10898 && !fixed_regs[regno]
10899 && !STACK_REGNO_P (regno)
10900 && !MMX_REGNO_P (regno));
10903 /* Return nonzero if register REGNO can be used as a scratch register
10904 in peephole2. */
10906 static bool
10907 ix86_hard_regno_scratch_ok (unsigned int regno)
10909 /* If there are no caller-saved registers, we can't use any register
10910 as a scratch register after epilogue and use REGNO as scratch
10911 register only if it has been used before to avoid saving and
10912 restoring it. */
10913 return (!cfun->machine->no_caller_saved_registers
10914 || (!epilogue_completed
10915 && df_regs_ever_live_p (regno)));
10918 /* Return true if register class CL should be an additional allocno
10919 class. */
10921 static bool
10922 ix86_additional_allocno_class_p (reg_class_t cl)
10924 return cl == MOD4_SSE_REGS;
10927 /* Return TRUE if we need to save REGNO. */
10929 static bool
10930 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
10932 /* If there are no caller-saved registers, we preserve all registers,
10933 except for MMX and x87 registers which aren't supported when saving
10934 and restoring registers. Don't explicitly save SP register since
10935 it is always preserved. */
10936 if (cfun->machine->no_caller_saved_registers)
10938 /* Don't preserve registers used for function return value. */
10939 rtx reg = crtl->return_rtx;
10940 if (reg)
10942 unsigned int i = REGNO (reg);
10943 unsigned int nregs = REG_NREGS (reg);
10944 while (nregs-- > 0)
10945 if ((i + nregs) == regno)
10946 return false;
10948 reg = crtl->return_bnd;
10949 if (reg)
10951 i = REGNO (reg);
10952 nregs = REG_NREGS (reg);
10953 while (nregs-- > 0)
10954 if ((i + nregs) == regno)
10955 return false;
10959 return (df_regs_ever_live_p (regno)
10960 && !fixed_regs[regno]
10961 && !STACK_REGNO_P (regno)
10962 && !MMX_REGNO_P (regno)
10963 && (regno != HARD_FRAME_POINTER_REGNUM
10964 || !frame_pointer_needed));
10967 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
10968 && pic_offset_table_rtx)
10970 if (ix86_use_pseudo_pic_reg ())
10972 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
10973 _mcount in prologue. */
10974 if (!TARGET_64BIT && flag_pic && crtl->profile)
10975 return true;
10977 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10978 || crtl->profile
10979 || crtl->calls_eh_return
10980 || crtl->uses_const_pool
10981 || cfun->has_nonlocal_label)
10982 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
10985 if (crtl->calls_eh_return && maybe_eh_return)
10987 unsigned i;
10988 for (i = 0; ; i++)
10990 unsigned test = EH_RETURN_DATA_REGNO (i);
10991 if (test == INVALID_REGNUM)
10992 break;
10993 if (test == regno)
10994 return true;
10998 if (ignore_outlined && cfun->machine->call_ms2sysv)
11000 unsigned count = cfun->machine->call_ms2sysv_extra_regs
11001 + xlogue_layout::MIN_REGS;
11002 if (xlogue_layout::is_stub_managed_reg (regno, count))
11003 return false;
11006 if (crtl->drap_reg
11007 && regno == REGNO (crtl->drap_reg)
11008 && !cfun->machine->no_drap_save_restore)
11009 return true;
11011 return (df_regs_ever_live_p (regno)
11012 && !call_used_regs[regno]
11013 && !fixed_regs[regno]
11014 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11017 /* Return the number of saved general purpose registers. */
11019 static int
11020 ix86_nsaved_regs (void)
11022 int nregs = 0;
11023 int regno;
11025 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11026 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11027 nregs ++;
11028 return nregs;
11031 /* Return number of saved SSE registers. */
11033 static int
11034 ix86_nsaved_sseregs (void)
11036 int nregs = 0;
11037 int regno;
11039 if (!TARGET_64BIT_MS_ABI)
11040 return 0;
11041 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11042 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11043 nregs ++;
11044 return nregs;
11047 /* Given FROM and TO register numbers, say whether this elimination is
11048 allowed. If stack alignment is needed, we can only replace argument
11049 pointer with hard frame pointer, or replace frame pointer with stack
11050 pointer. Otherwise, frame pointer elimination is automatically
11051 handled and all other eliminations are valid. */
11053 static bool
11054 ix86_can_eliminate (const int from, const int to)
11056 if (stack_realign_fp)
11057 return ((from == ARG_POINTER_REGNUM
11058 && to == HARD_FRAME_POINTER_REGNUM)
11059 || (from == FRAME_POINTER_REGNUM
11060 && to == STACK_POINTER_REGNUM));
11061 else
11062 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11065 /* Return the offset between two registers, one to be eliminated, and the other
11066 its replacement, at the start of a routine. */
11068 HOST_WIDE_INT
11069 ix86_initial_elimination_offset (int from, int to)
11071 struct ix86_frame &frame = cfun->machine->frame;
11073 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11074 return frame.hard_frame_pointer_offset;
11075 else if (from == FRAME_POINTER_REGNUM
11076 && to == HARD_FRAME_POINTER_REGNUM)
11077 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11078 else
11080 gcc_assert (to == STACK_POINTER_REGNUM);
11082 if (from == ARG_POINTER_REGNUM)
11083 return frame.stack_pointer_offset;
11085 gcc_assert (from == FRAME_POINTER_REGNUM);
11086 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11090 /* In a dynamically-aligned function, we can't know the offset from
11091 stack pointer to frame pointer, so we must ensure that setjmp
11092 eliminates fp against the hard fp (%ebp) rather than trying to
11093 index from %esp up to the top of the frame across a gap that is
11094 of unknown (at compile-time) size. */
11095 static rtx
11096 ix86_builtin_setjmp_frame_value (void)
11098 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11101 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11102 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11104 static bool warned_once = false;
11105 if (!warned_once)
11107 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11108 feature);
11109 warned_once = true;
11113 /* When using -fsplit-stack, the allocation routines set a field in
11114 the TCB to the bottom of the stack plus this much space, measured
11115 in bytes. */
11117 #define SPLIT_STACK_AVAILABLE 256
11119 /* Fill structure ix86_frame about frame of currently computed function. */
11121 static void
11122 ix86_compute_frame_layout (void)
11124 struct ix86_frame *frame = &cfun->machine->frame;
11125 struct machine_function *m = cfun->machine;
11126 unsigned HOST_WIDE_INT stack_alignment_needed;
11127 HOST_WIDE_INT offset;
11128 unsigned HOST_WIDE_INT preferred_alignment;
11129 HOST_WIDE_INT size = get_frame_size ();
11130 HOST_WIDE_INT to_allocate;
11132 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11133 * ms_abi functions that call a sysv function. We now need to prune away
11134 * cases where it should be disabled. */
11135 if (TARGET_64BIT && m->call_ms2sysv)
11137 gcc_assert (TARGET_64BIT_MS_ABI);
11138 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11139 gcc_assert (!TARGET_SEH);
11140 gcc_assert (TARGET_SSE);
11141 gcc_assert (!ix86_using_red_zone ());
11143 if (crtl->calls_eh_return)
11145 gcc_assert (!reload_completed);
11146 m->call_ms2sysv = false;
11147 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11150 else if (ix86_static_chain_on_stack)
11152 gcc_assert (!reload_completed);
11153 m->call_ms2sysv = false;
11154 warn_once_call_ms2sysv_xlogues ("static call chains");
11157 /* Finally, compute which registers the stub will manage. */
11158 else
11160 unsigned count = xlogue_layout::count_stub_managed_regs ();
11161 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11162 m->call_ms2sysv_pad_in = 0;
11166 frame->nregs = ix86_nsaved_regs ();
11167 frame->nsseregs = ix86_nsaved_sseregs ();
11169 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
11170 except for function prologues, leaf functions, and when the default
11171 incoming stack boundary is overridden at the command line or via the
11172 force_align_arg_pointer attribute. */
11173 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11174 && (!crtl->is_leaf || cfun->calls_alloca != 0
11175 || ix86_current_function_calls_tls_descriptor
11176 || ix86_incoming_stack_boundary < 128))
11178 crtl->preferred_stack_boundary = 128;
11179 crtl->stack_alignment_needed = 128;
11182 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11183 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11185 gcc_assert (!size || stack_alignment_needed);
11186 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11187 gcc_assert (preferred_alignment <= stack_alignment_needed);
11189 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11190 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11191 if (TARGET_64BIT && m->call_ms2sysv)
11193 gcc_assert (stack_alignment_needed >= 16);
11194 gcc_assert (!frame->nsseregs);
11197 /* For SEH we have to limit the amount of code movement into the prologue.
11198 At present we do this via a BLOCKAGE, at which point there's very little
11199 scheduling that can be done, which means that there's very little point
11200 in doing anything except PUSHs. */
11201 if (TARGET_SEH)
11202 m->use_fast_prologue_epilogue = false;
11203 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11205 int count = frame->nregs;
11206 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11208 /* The fast prologue uses move instead of push to save registers. This
11209 is significantly longer, but also executes faster as modern hardware
11210 can execute the moves in parallel, but can't do that for push/pop.
11212 Be careful about choosing which prologue to emit: when the function takes
11213 many instructions to execute we may as well use the slow version, and
11214 likewise when the function is known to be outside a hot spot (known with
11215 feedback only). Weight the size of the function by the number of registers
11216 to save, as it is cheap to use one or two push instructions but very
11217 slow to use many of them. */
11218 if (count)
11219 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11220 if (node->frequency < NODE_FREQUENCY_NORMAL
11221 || (flag_branch_probabilities
11222 && node->frequency < NODE_FREQUENCY_HOT))
11223 m->use_fast_prologue_epilogue = false;
11224 else
11225 m->use_fast_prologue_epilogue
11226 = !expensive_function_p (count);
11229 frame->save_regs_using_mov
11230 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11231 /* If static stack checking is enabled and done with probes,
11232 the registers need to be saved before allocating the frame. */
11233 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11235 /* Skip return address and error code in exception handler. */
11236 offset = INCOMING_FRAME_SP_OFFSET;
11238 /* Skip pushed static chain. */
11239 if (ix86_static_chain_on_stack)
11240 offset += UNITS_PER_WORD;
11242 /* Skip saved base pointer. */
11243 if (frame_pointer_needed)
11244 offset += UNITS_PER_WORD;
11245 frame->hfp_save_offset = offset;
11247 /* The traditional frame pointer location is at the top of the frame. */
11248 frame->hard_frame_pointer_offset = offset;
11250 /* Register save area */
11251 offset += frame->nregs * UNITS_PER_WORD;
11252 frame->reg_save_offset = offset;
11254 /* On SEH target, registers are pushed just before the frame pointer
11255 location. */
11256 if (TARGET_SEH)
11257 frame->hard_frame_pointer_offset = offset;
11259 /* Calculate the size of the va-arg area (not including padding, if any). */
11260 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11262 if (stack_realign_fp)
11264 /* We may need a 16-byte aligned stack for the remainder of the
11265 register save area, but the stack frame for the local function
11266 may require a greater alignment if using AVX/2/512. In order
11267 to avoid wasting space, we first calculate the space needed for
11268 the rest of the register saves, add that to the stack pointer,
11269 and then realign the stack to the boundary of the start of the
11270 frame for the local function. */
11271 HOST_WIDE_INT space_needed = 0;
11272 HOST_WIDE_INT sse_reg_space_needed = 0;
11274 if (TARGET_64BIT)
11276 if (m->call_ms2sysv)
11278 m->call_ms2sysv_pad_in = 0;
11279 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11282 else if (frame->nsseregs)
11283 /* The only ABI that has saved SSE registers (Win64) also has a
11284 16-byte aligned default stack. However, many programs violate
11285 the ABI, and Wine64 forces stack realignment to compensate. */
11286 space_needed = frame->nsseregs * 16;
11288 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11290 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11291 round up anyway to be pedantic. */
11292 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11294 else
11295 space_needed = frame->va_arg_size;
11297 /* Record the allocation size required prior to the realignment AND. */
11298 frame->stack_realign_allocate = space_needed;
11300 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11301 before this point are not directly comparable with values below
11302 this point. Use sp_valid_at to determine if the stack pointer is
11303 valid for a given offset, fp_valid_at for the frame pointer, or
11304 choose_baseaddr to have a base register chosen for you.
11306 Note that the result of (frame->stack_realign_offset
11307 & (stack_alignment_needed - 1)) may not equal zero. */
11308 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11309 frame->stack_realign_offset = offset - space_needed;
11310 frame->sse_reg_save_offset = frame->stack_realign_offset
11311 + sse_reg_space_needed;
11313 else
11315 frame->stack_realign_offset = offset;
11317 if (TARGET_64BIT && m->call_ms2sysv)
11319 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11320 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11323 /* Align and set SSE register save area. */
11324 else if (frame->nsseregs)
11326 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11327 required and the DRAP re-alignment boundary is at least 16 bytes,
11328 then we want the SSE register save area properly aligned. */
11329 if (ix86_incoming_stack_boundary >= 128
11330 || (stack_realign_drap && stack_alignment_needed >= 16))
11331 offset = ROUND_UP (offset, 16);
11332 offset += frame->nsseregs * 16;
11334 frame->sse_reg_save_offset = offset;
11335 offset += frame->va_arg_size;
11338 /* Align start of frame for local function. */
11339 if (m->call_ms2sysv
11340 || frame->va_arg_size != 0
11341 || size != 0
11342 || !crtl->is_leaf
11343 || cfun->calls_alloca
11344 || ix86_current_function_calls_tls_descriptor)
11345 offset = ROUND_UP (offset, stack_alignment_needed);
11347 /* Frame pointer points here. */
11348 frame->frame_pointer_offset = offset;
11350 offset += size;
11352 /* Add outgoing arguments area. Can be skipped if we eliminated
11353 all the function calls as dead code.
11354 Skipping is, however, impossible when the function calls alloca: the
11355 alloca expander assumes that the last crtl->outgoing_args_size bytes
11356 of the stack frame are unused. */
11357 if (ACCUMULATE_OUTGOING_ARGS
11358 && (!crtl->is_leaf || cfun->calls_alloca
11359 || ix86_current_function_calls_tls_descriptor))
11361 offset += crtl->outgoing_args_size;
11362 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11364 else
11365 frame->outgoing_arguments_size = 0;
11367 /* Align stack boundary. Only needed if we're calling another function
11368 or using alloca. */
11369 if (!crtl->is_leaf || cfun->calls_alloca
11370 || ix86_current_function_calls_tls_descriptor)
11371 offset = ROUND_UP (offset, preferred_alignment);
11373 /* We've reached end of stack frame. */
11374 frame->stack_pointer_offset = offset;
11376 /* Size prologue needs to allocate. */
11377 to_allocate = offset - frame->sse_reg_save_offset;
11379 if ((!to_allocate && frame->nregs <= 1)
11380 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
11381 frame->save_regs_using_mov = false;
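/* If the red zone is usable, record how much of the allocation can live
   in it; that amount is subtracted from stack_pointer_offset below so the
   prologue need not allocate it explicitly. */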
11383 if (ix86_using_red_zone ()
11384 && crtl->sp_is_unchanging
11385 && crtl->is_leaf
11386 && !ix86_pc_thunk_call_expanded
11387 && !ix86_current_function_calls_tls_descriptor)
11389 frame->red_zone_size = to_allocate;
11390 if (frame->save_regs_using_mov)
11391 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11392 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11393 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11395 else
11396 frame->red_zone_size = 0;
11397 frame->stack_pointer_offset -= frame->red_zone_size;
11399 /* The SEH frame pointer location is near the bottom of the frame.
11400 This is enforced by the fact that the difference between the
11401 stack pointer and the frame pointer is limited to 240 bytes in
11402 the unwind data structure. */
11403 if (TARGET_SEH)
11405 HOST_WIDE_INT diff;
11407 /* If we can leave the frame pointer where it is, do so; it also
11408 serves as the establisher frame for __builtin_frame_address (0). */
11409 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11410 if (diff <= SEH_MAX_FRAME_SIZE
11411 && (diff > 240 || (diff & 15) != 0)
11412 && !crtl->accesses_prior_frames)
11414 /* Ideally we'd determine what portion of the local stack frame
11415 (within the constraint of the lowest 240) is most heavily used.
11416 But without that complication, simply bias the frame pointer
11417 by 128 bytes so as to maximize the amount of the local stack
11418 frame that is addressable with 8-bit offsets. */
11419 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
11424 /* This is semi-inlined memory_address_length, but simplified
11425 since we know that we're always dealing with reg+offset, and
11426 to avoid having to create and discard all that rtl. */
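/* A rough sketch of the encoding costs involved, assuming the usual
   ModRM/SIB rules: a zero offset usually needs no displacement byte,
   except that (%ebp) and (%r13) still require a zero disp8; offsets in
   [-128, 127] take one byte and anything larger takes four; and a base
   of %esp or %r12 costs an extra SIB byte. */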
11428 static inline int
11429 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11431 int len = 4;
11433 if (offset == 0)
11435 /* EBP and R13 cannot be encoded without an offset. */
11436 len = (regno == BP_REG || regno == R13_REG);
11438 else if (IN_RANGE (offset, -128, 127))
11439 len = 1;
11441 /* ESP and R12 must be encoded with a SIB byte. */
11442 if (regno == SP_REG || regno == R12_REG)
11443 len++;
11445 return len;
11448 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11449 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11451 static bool
11452 sp_valid_at (HOST_WIDE_INT cfa_offset)
11454 const struct machine_frame_state &fs = cfun->machine->fs;
11455 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11457 /* Validate that the cfa_offset isn't in a "no-man's land". */
11458 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11459 return false;
11461 return fs.sp_valid;
11464 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11465 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11467 static inline bool
11468 fp_valid_at (HOST_WIDE_INT cfa_offset)
11470 const struct machine_frame_state &fs = cfun->machine->fs;
11471 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11473 /* Validate that the cfa_offset isn't in a "no-man's land". */
11474 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11475 return false;
11477 return fs.fp_valid;
11480 /* Choose a base register based upon alignment requested, speed and/or
11481 size. */
11483 static void
11484 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11485 HOST_WIDE_INT &base_offset,
11486 unsigned int align_requested, unsigned int *align)
11488 const struct machine_function *m = cfun->machine;
11489 unsigned int hfp_align;
11490 unsigned int drap_align;
11491 unsigned int sp_align;
11492 bool hfp_ok = fp_valid_at (cfa_offset);
11493 bool drap_ok = m->fs.drap_valid;
11494 bool sp_ok = sp_valid_at (cfa_offset);
11496 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11498 /* Filter out any registers that don't meet the requested alignment
11499 criteria. */
11500 if (align_requested)
11502 if (m->fs.realigned)
11503 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11504 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11505 notes (which we would need to use a realigned stack pointer),
11506 so disable on SEH targets. */
11507 else if (m->fs.sp_realigned)
11508 sp_align = crtl->stack_alignment_needed;
11510 hfp_ok = hfp_ok && hfp_align >= align_requested;
11511 drap_ok = drap_ok && drap_align >= align_requested;
11512 sp_ok = sp_ok && sp_align >= align_requested;
11515 if (m->use_fast_prologue_epilogue)
11517 /* Choose the base register most likely to allow the most scheduling
11518 opportunities. Generally FP is valid throughout the function,
11519 while DRAP must be reloaded within the epilogue. But choose either
11520 over the SP due to increased encoding size. */
11522 if (hfp_ok)
11524 base_reg = hard_frame_pointer_rtx;
11525 base_offset = m->fs.fp_offset - cfa_offset;
11527 else if (drap_ok)
11529 base_reg = crtl->drap_reg;
11530 base_offset = 0 - cfa_offset;
11532 else if (sp_ok)
11534 base_reg = stack_pointer_rtx;
11535 base_offset = m->fs.sp_offset - cfa_offset;
11538 else
11540 HOST_WIDE_INT toffset;
11541 int len = 16, tlen;
11543 /* Choose the base register with the smallest address encoding.
11544 With a tie, choose FP > DRAP > SP. */
11545 if (sp_ok)
11547 base_reg = stack_pointer_rtx;
11548 base_offset = m->fs.sp_offset - cfa_offset;
11549 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
11551 if (drap_ok)
11553 toffset = 0 - cfa_offset;
11554 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
11555 if (tlen <= len)
11557 base_reg = crtl->drap_reg;
11558 base_offset = toffset;
11559 len = tlen;
11562 if (hfp_ok)
11564 toffset = m->fs.fp_offset - cfa_offset;
11565 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
11566 if (tlen <= len)
11568 base_reg = hard_frame_pointer_rtx;
11569 base_offset = toffset;
11570 len = tlen;
11575 /* Set the align return value. */
11576 if (align)
11578 if (base_reg == stack_pointer_rtx)
11579 *align = sp_align;
11580 else if (base_reg == crtl->drap_reg)
11581 *align = drap_align;
11582 else if (base_reg == hard_frame_pointer_rtx)
11583 *align = hfp_align;
11587 /* Return an RTX that points to CFA_OFFSET within the stack frame and
11588 the alignment of address. If ALIGN is non-null, it should point to
11589 an alignment value (in bits) that is preferred or zero and will
11590 receive the alignment of the base register that was selected,
11591 irrespective of whether or not CFA_OFFSET is a multiple of that
11592 alignment value. If it is possible for the base register offset to be
11593 non-immediate then SCRATCH_REGNO should specify a scratch register to
11594 use.
11596 The valid base registers are taken from CFUN->MACHINE->FS. */
11598 static rtx
11599 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
11600 unsigned int scratch_regno = INVALID_REGNUM)
11602 rtx base_reg = NULL;
11603 HOST_WIDE_INT base_offset = 0;
11605 /* If a specific alignment is requested, try to get a base register
11606 with that alignment first. */
11607 if (align && *align)
11608 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
11610 if (!base_reg)
11611 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
11613 gcc_assert (base_reg != NULL);
11615 rtx base_offset_rtx = GEN_INT (base_offset);
11617 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
11619 gcc_assert (scratch_regno != INVALID_REGNUM);
11621 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11622 emit_move_insn (scratch_reg, base_offset_rtx);
11624 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
11627 return plus_constant (Pmode, base_reg, base_offset);
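/* A typical caller (e.g. ix86_emit_save_reg_using_mov below) passes the
   mode alignment in *ALIGN and then clamps it to the alignment actually
   provided by the chosen base register. */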
11630 /* Emit code to save registers in the prologue. */
11632 static void
11633 ix86_emit_save_regs (void)
11635 unsigned int regno;
11636 rtx_insn *insn;
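/* Walk the hard registers from the highest regno downwards, pushing
   each general register that ix86_save_reg says must be saved. */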
11638 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
11639 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11641 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
11642 RTX_FRAME_RELATED_P (insn) = 1;
11646 /* Emit a single register save at CFA - CFA_OFFSET. */
11648 static void
11649 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
11650 HOST_WIDE_INT cfa_offset)
11652 struct machine_function *m = cfun->machine;
11653 rtx reg = gen_rtx_REG (mode, regno);
11654 rtx mem, addr, base, insn;
11655 unsigned int align = GET_MODE_ALIGNMENT (mode);
11657 addr = choose_baseaddr (cfa_offset, &align);
11658 mem = gen_frame_mem (mode, addr);
11660 /* The location alignment depends upon the base register. */
11661 align = MIN (GET_MODE_ALIGNMENT (mode), align);
11662 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
11663 set_mem_align (mem, align);
11665 insn = emit_insn (gen_rtx_SET (mem, reg));
11666 RTX_FRAME_RELATED_P (insn) = 1;
11668 base = addr;
11669 if (GET_CODE (base) == PLUS)
11670 base = XEXP (base, 0);
11671 gcc_checking_assert (REG_P (base));
11673 /* When saving registers into a re-aligned local stack frame, avoid
11674 any tricky guessing by dwarf2out. */
11675 if (m->fs.realigned)
11677 gcc_checking_assert (stack_realign_drap);
11679 if (regno == REGNO (crtl->drap_reg))
11681 /* A bit of a hack. We force the DRAP register to be saved in
11682 the re-aligned stack frame, which provides us with a copy
11683 of the CFA that will last past the prologue. Install it. */
11684 gcc_checking_assert (cfun->machine->fs.fp_valid);
11685 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11686 cfun->machine->fs.fp_offset - cfa_offset);
11687 mem = gen_rtx_MEM (mode, addr);
11688 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
11690 else
11692 /* The frame pointer is a stable reference within the
11693 aligned frame. Use it. */
11694 gcc_checking_assert (cfun->machine->fs.fp_valid);
11695 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11696 cfun->machine->fs.fp_offset - cfa_offset);
11697 mem = gen_rtx_MEM (mode, addr);
11698 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11702 else if (base == stack_pointer_rtx && m->fs.sp_realigned
11703 && cfa_offset >= m->fs.sp_realigned_offset)
11705 gcc_checking_assert (stack_realign_fp);
11706 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11709 /* The memory may not be relative to the current CFA register,
11710 which means that we may need to generate a new pattern for
11711 use by the unwind info. */
11712 else if (base != m->fs.cfa_reg)
11714 addr = plus_constant (Pmode, m->fs.cfa_reg,
11715 m->fs.cfa_offset - cfa_offset);
11716 mem = gen_rtx_MEM (mode, addr);
11717 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
11721 /* Emit code to save registers using MOV insns.
11722 First register is stored at CFA - CFA_OFFSET. */
11723 static void
11724 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
11726 unsigned int regno;
11728 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11729 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11731 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
11732 cfa_offset -= UNITS_PER_WORD;
11736 /* Emit code to save SSE registers using MOV insns.
11737 First register is stored at CFA - CFA_OFFSET. */
11738 static void
11739 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
11741 unsigned int regno;
11743 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11744 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11746 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
11747 cfa_offset -= GET_MODE_SIZE (V4SFmode);
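/* Chain of REG_CFA_RESTORE notes queued by ix86_add_cfa_restore_note,
   to be attached to the next stack manipulation insn by
   ix86_add_queued_cfa_restore_notes. */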
11751 static GTY(()) rtx queued_cfa_restores;
11753 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
11754 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
11755 Don't add the note if the previously saved value will be left untouched
11756 within the stack red zone until return, as unwinders can find the same
11757 value in the register and on the stack. */
11759 static void
11760 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
11762 if (!crtl->shrink_wrapped
11763 && cfa_offset <= cfun->machine->fs.red_zone_offset)
11764 return;
11766 if (insn)
11768 add_reg_note (insn, REG_CFA_RESTORE, reg);
11769 RTX_FRAME_RELATED_P (insn) = 1;
11771 else
11772 queued_cfa_restores
11773 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
11776 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
11778 static void
11779 ix86_add_queued_cfa_restore_notes (rtx insn)
11781 rtx last;
11782 if (!queued_cfa_restores)
11783 return;
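/* Splice the queued notes onto the front of INSN's existing note list. */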
11784 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
11786 XEXP (last, 1) = REG_NOTES (insn);
11787 REG_NOTES (insn) = queued_cfa_restores;
11788 queued_cfa_restores = NULL_RTX;
11789 RTX_FRAME_RELATED_P (insn) = 1;
11792 /* Expand prologue or epilogue stack adjustment.
11793 The pattern exists to put a dependency on all ebp-based memory accesses.
11794 STYLE should be negative if instructions should be marked as frame related,
11795 zero if the %r11 register is live and cannot be freely used, and positive
11796 otherwise. */
11798 static rtx
11799 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
11800 int style, bool set_cfa)
11802 struct machine_function *m = cfun->machine;
11803 rtx insn;
11804 bool add_frame_related_expr = false;
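/* Three cases below: a plain SImode add, a DImode add when the offset
   fits in a signed 32-bit immediate, and otherwise materializing the
   offset in a temporary first (%r11, or the frame pointer when %r11 is
   live). */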
11806 if (Pmode == SImode)
11807 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
11808 else if (x86_64_immediate_operand (offset, DImode))
11809 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
11810 else
11812 rtx tmp;
11813 /* r11 is used by indirect sibcall return as well, set before the
11814 epilogue and used after the epilogue. */
11815 if (style)
11816 tmp = gen_rtx_REG (DImode, R11_REG);
11817 else
11819 gcc_assert (src != hard_frame_pointer_rtx
11820 && dest != hard_frame_pointer_rtx);
11821 tmp = hard_frame_pointer_rtx;
11823 insn = emit_insn (gen_rtx_SET (tmp, offset));
11824 if (style < 0)
11825 add_frame_related_expr = true;
11827 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
11830 insn = emit_insn (insn);
11831 if (style >= 0)
11832 ix86_add_queued_cfa_restore_notes (insn);
11834 if (set_cfa)
11836 rtx r;
11838 gcc_assert (m->fs.cfa_reg == src);
11839 m->fs.cfa_offset += INTVAL (offset);
11840 m->fs.cfa_reg = dest;
11842 r = gen_rtx_PLUS (Pmode, src, offset);
11843 r = gen_rtx_SET (dest, r);
11844 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
11845 RTX_FRAME_RELATED_P (insn) = 1;
11847 else if (style < 0)
11849 RTX_FRAME_RELATED_P (insn) = 1;
11850 if (add_frame_related_expr)
11852 rtx r = gen_rtx_PLUS (Pmode, src, offset);
11853 r = gen_rtx_SET (dest, r);
11854 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
11858 if (dest == stack_pointer_rtx)
11860 HOST_WIDE_INT ooffset = m->fs.sp_offset;
11861 bool valid = m->fs.sp_valid;
11862 bool realigned = m->fs.sp_realigned;
11864 if (src == hard_frame_pointer_rtx)
11866 valid = m->fs.fp_valid;
11867 realigned = false;
11868 ooffset = m->fs.fp_offset;
11870 else if (src == crtl->drap_reg)
11872 valid = m->fs.drap_valid;
11873 realigned = false;
11874 ooffset = 0;
11876 else
11878 /* Else there are two possibilities: SP itself, which we set
11879 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
11880 taken care of by hand along the eh_return path. */
11881 gcc_checking_assert (src == stack_pointer_rtx
11882 || offset == const0_rtx);
11885 m->fs.sp_offset = ooffset - INTVAL (offset);
11886 m->fs.sp_valid = valid;
11887 m->fs.sp_realigned = realigned;
11889 return insn;
11892 /* Find an available register to be used as dynamic realign argument
11893 pointer register. Such a register will be written in the prologue and
11894 used at the beginning of the body, so it must not be
11895 1. parameter passing register.
11896 2. GOT pointer.
11897 We reuse static-chain register if it is available. Otherwise, we
11898 use DI for i386 and R13 for x86-64. We chose R13 since it has
11899 shorter encoding.
11901 Return: the regno of chosen register. */
11903 static unsigned int
11904 find_drap_reg (void)
11906 tree decl = cfun->decl;
11908 /* Always use callee-saved register if there are no caller-saved
11909 registers. */
11910 if (TARGET_64BIT)
11912 /* Use R13 for a nested function or a function that needs a static chain.
11913 Since a function with a tail call may use any caller-saved
11914 register in the epilogue, DRAP must not use a caller-saved
11915 register in such a case. */
11916 if (DECL_STATIC_CHAIN (decl)
11917 || cfun->machine->no_caller_saved_registers
11918 || crtl->tail_call_emit)
11919 return R13_REG;
11921 return R10_REG;
11923 else
11925 /* Use DI for a nested function or a function that needs a static chain.
11926 Since a function with a tail call may use any caller-saved
11927 register in the epilogue, DRAP must not use a caller-saved
11928 register in such a case. */
11929 if (DECL_STATIC_CHAIN (decl)
11930 || cfun->machine->no_caller_saved_registers
11931 || crtl->tail_call_emit)
11932 return DI_REG;
11934 /* Reuse static chain register if it isn't used for parameter
11935 passing. */
11936 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
11938 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
11939 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
11940 return CX_REG;
11942 return DI_REG;
11946 /* Handle a "force_align_arg_pointer" attribute. */
11948 static tree
11949 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
11950 tree, int, bool *no_add_attrs)
11952 if (TREE_CODE (*node) != FUNCTION_TYPE
11953 && TREE_CODE (*node) != METHOD_TYPE
11954 && TREE_CODE (*node) != FIELD_DECL
11955 && TREE_CODE (*node) != TYPE_DECL)
11957 warning (OPT_Wattributes, "%qE attribute only applies to functions",
11958 name);
11959 *no_add_attrs = true;
11962 return NULL_TREE;
11965 /* Return minimum incoming stack alignment. */
11967 static unsigned int
11968 ix86_minimum_incoming_stack_boundary (bool sibcall)
11970 unsigned int incoming_stack_boundary;
11972 /* The stack of an interrupt handler is aligned to 128 bits in 64-bit mode. */
11973 if (cfun->machine->func_type != TYPE_NORMAL)
11974 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
11975 /* Prefer the one specified at command line. */
11976 else if (ix86_user_incoming_stack_boundary)
11977 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
11978 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
11979 if -mstackrealign is used, this isn't a sibcall check, and the
11980 estimated stack alignment is 128 bits. */
11981 else if (!sibcall
11982 && ix86_force_align_arg_pointer
11983 && crtl->stack_alignment_estimated == 128)
11984 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11985 else
11986 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
11988 /* Incoming stack alignment can be changed on individual functions
11989 via force_align_arg_pointer attribute. We use the smallest
11990 incoming stack boundary. */
11991 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
11992 && lookup_attribute (ix86_force_align_arg_pointer_string,
11993 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
11994 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11996 /* The incoming stack frame has to be aligned at least at
11997 parm_stack_boundary. */
11998 if (incoming_stack_boundary < crtl->parm_stack_boundary)
11999 incoming_stack_boundary = crtl->parm_stack_boundary;
12001 /* Stack at entrance of main is aligned by runtime. We use the
12002 smallest incoming stack boundary. */
12003 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12004 && DECL_NAME (current_function_decl)
12005 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12006 && DECL_FILE_SCOPE_P (current_function_decl))
12007 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12009 return incoming_stack_boundary;
12012 /* Update incoming stack boundary and estimated stack alignment. */
12014 static void
12015 ix86_update_stack_boundary (void)
12017 ix86_incoming_stack_boundary
12018 = ix86_minimum_incoming_stack_boundary (false);
12020 /* x86_64 varargs need a 16-byte aligned stack for the register save
12021 area. */
12022 if (TARGET_64BIT
12023 && cfun->stdarg
12024 && crtl->stack_alignment_estimated < 128)
12025 crtl->stack_alignment_estimated = 128;
12027 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12028 if (ix86_tls_descriptor_calls_expanded_in_cfun
12029 && crtl->preferred_stack_boundary < 128)
12030 crtl->preferred_stack_boundary = 128;
12033 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12034 needed or an rtx for DRAP otherwise. */
12036 static rtx
12037 ix86_get_drap_rtx (void)
12039 /* We must use DRAP if there are outgoing arguments on stack and
12040 ACCUMULATE_OUTGOING_ARGS is false. */
12041 if (ix86_force_drap
12042 || (cfun->machine->outgoing_args_on_stack
12043 && !ACCUMULATE_OUTGOING_ARGS))
12044 crtl->need_drap = true;
12046 if (stack_realign_drap)
12048 /* Assign DRAP to vDRAP and return vDRAP. */
12049 unsigned int regno = find_drap_reg ();
12050 rtx drap_vreg;
12051 rtx arg_ptr;
12052 rtx_insn *seq, *insn;
12054 arg_ptr = gen_rtx_REG (Pmode, regno);
12055 crtl->drap_reg = arg_ptr;
12057 start_sequence ();
12058 drap_vreg = copy_to_reg (arg_ptr);
12059 seq = get_insns ();
12060 end_sequence ();
12062 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12063 if (!optimize)
12065 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12066 RTX_FRAME_RELATED_P (insn) = 1;
12068 return drap_vreg;
12070 else
12071 return NULL;
12074 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12076 static rtx
12077 ix86_internal_arg_pointer (void)
12079 return virtual_incoming_args_rtx;
12082 struct scratch_reg {
12083 rtx reg;
12084 bool saved;
12087 /* Return a short-lived scratch register for use on function entry.
12088 In 32-bit mode, it is valid only after the registers are saved
12089 in the prologue. This register must be released by means of
12090 release_scratch_register_on_entry once it is dead. */
12092 static void
12093 get_scratch_register_on_entry (struct scratch_reg *sr)
12095 int regno;
12097 sr->saved = false;
12099 if (TARGET_64BIT)
12101 /* We always use R11 in 64-bit mode. */
12102 regno = R11_REG;
12104 else
12106 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12107 bool fastcall_p
12108 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12109 bool thiscall_p
12110 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12111 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12112 int regparm = ix86_function_regparm (fntype, decl);
12113 int drap_regno
12114 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12116 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12117 for the static chain register. */
12118 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12119 && drap_regno != AX_REG)
12120 regno = AX_REG;
12121 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12122 for the static chain register. */
12123 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12124 regno = AX_REG;
12125 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12126 regno = DX_REG;
12127 /* ecx is the static chain register. */
12128 else if (regparm < 3 && !fastcall_p && !thiscall_p
12129 && !static_chain_p
12130 && drap_regno != CX_REG)
12131 regno = CX_REG;
12132 else if (ix86_save_reg (BX_REG, true, false))
12133 regno = BX_REG;
12134 /* esi is the static chain register. */
12135 else if (!(regparm == 3 && static_chain_p)
12136 && ix86_save_reg (SI_REG, true, false))
12137 regno = SI_REG;
12138 else if (ix86_save_reg (DI_REG, true, false))
12139 regno = DI_REG;
12140 else
12142 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12143 sr->saved = true;
12147 sr->reg = gen_rtx_REG (Pmode, regno);
12148 if (sr->saved)
12150 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12151 RTX_FRAME_RELATED_P (insn) = 1;
12155 /* Release a scratch register obtained from the preceding function. */
12157 static void
12158 release_scratch_register_on_entry (struct scratch_reg *sr)
12160 if (sr->saved)
12162 struct machine_function *m = cfun->machine;
12163 rtx x, insn = emit_insn (gen_pop (sr->reg));
12165 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12166 RTX_FRAME_RELATED_P (insn) = 1;
12167 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12168 x = gen_rtx_SET (stack_pointer_rtx, x);
12169 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12170 m->fs.sp_offset -= UNITS_PER_WORD;
12174 /* Return the probing interval for -fstack-clash-protection. */
12176 static HOST_WIDE_INT
12177 get_probe_interval (void)
12179 if (flag_stack_clash_protection)
12180 return (HOST_WIDE_INT_1U
12181 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
12182 else
12183 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
12186 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12188 This differs from the next routine in that it tries hard to prevent
12189 attacks that jump the stack guard. Thus it is never allowed to allocate
12190 more than PROBE_INTERVAL bytes of stack space without a suitable
12191 probe. */
12193 static void
12194 ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
12196 struct machine_function *m = cfun->machine;
12198 /* If this function does not statically allocate stack space, then
12199 no probes are needed. */
12200 if (!size)
12202 /* However, the allocation of space via pushes for register
12203 saves could be viewed as allocating space, but without the
12204 need to probe. */
12205 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12206 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12207 else
12208 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12209 return;
12212 /* If we are a noreturn function, then we have to consider the
12213 possibility that we're called via a jump rather than a call.
12215 Thus we don't have the implicit probe generated by saving the
12216 return address into the stack at the call. Thus, the stack
12217 pointer could be anywhere in the guard page. The safe thing
12218 to do is emit a probe now.
12220 ?!? This should be revamped to work like aarch64 and s390 where
12221 we track the offset from the most recent probe. Normally that
12222 offset would be zero. For a noreturn function we would reset
12223 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12224 we just probe when we cross PROBE_INTERVAL. */
12225 if (TREE_THIS_VOLATILE (cfun->decl))
12227 /* We can safely use any register here since we're just going to push
12228 its value and immediately pop it back. But we do try and avoid
12229 argument passing registers so as not to introduce dependencies in
12230 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12231 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12232 rtx_insn *insn = emit_insn (gen_push (dummy_reg));
12233 RTX_FRAME_RELATED_P (insn) = 1;
12234 ix86_emit_restore_reg_using_pop (dummy_reg);
12235 emit_insn (gen_blockage ());
12238 /* If we allocate less than the size of the guard statically,
12239 then no probing is necessary, but we do need to allocate
12240 the stack. */
12241 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12243 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12244 GEN_INT (-size), -1,
12245 m->fs.cfa_reg == stack_pointer_rtx);
12246 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12247 return;
12250 /* We're allocating a large enough stack frame that we need to
12251 emit probes. Either emit them inline or in a loop depending
12252 on the size. */
12253 HOST_WIDE_INT probe_interval = get_probe_interval ();
12254 if (size <= 4 * probe_interval)
12256 HOST_WIDE_INT i;
12257 for (i = probe_interval; i <= size; i += probe_interval)
12259 /* Allocate PROBE_INTERVAL bytes. */
12260 rtx insn
12261 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12262 GEN_INT (-probe_interval), -1,
12263 m->fs.cfa_reg == stack_pointer_rtx);
12264 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12266 /* And probe at *sp. */
12267 emit_stack_probe (stack_pointer_rtx);
12268 emit_insn (gen_blockage ());
12271 /* We need to allocate space for the residual, but we do not need
12272 to probe the residual. */
12273 HOST_WIDE_INT residual = (i - probe_interval - size);
12274 if (residual)
12275 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12276 GEN_INT (residual), -1,
12277 m->fs.cfa_reg == stack_pointer_rtx);
12278 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12280 else
12282 struct scratch_reg sr;
12283 get_scratch_register_on_entry (&sr);
12285 /* Step 1: round SIZE down to a multiple of the interval. */
12286 HOST_WIDE_INT rounded_size = size & -probe_interval;
12288 /* Step 2: compute final value of the loop counter. Use lea if
12289 possible. */
12290 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12291 rtx insn;
12292 if (address_no_seg_operand (addr, Pmode))
12293 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12294 else
12296 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12297 insn = emit_insn (gen_rtx_SET (sr.reg,
12298 gen_rtx_PLUS (Pmode, sr.reg,
12299 stack_pointer_rtx)));
12301 if (m->fs.cfa_reg == stack_pointer_rtx)
12303 add_reg_note (insn, REG_CFA_DEF_CFA,
12304 plus_constant (Pmode, sr.reg,
12305 m->fs.cfa_offset + rounded_size));
12306 RTX_FRAME_RELATED_P (insn) = 1;
12309 /* Step 3: the loop. */
12310 rtx size_rtx = GEN_INT (rounded_size);
12311 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12312 size_rtx));
12313 if (m->fs.cfa_reg == stack_pointer_rtx)
12315 m->fs.cfa_offset += rounded_size;
12316 add_reg_note (insn, REG_CFA_DEF_CFA,
12317 plus_constant (Pmode, stack_pointer_rtx,
12318 m->fs.cfa_offset));
12319 RTX_FRAME_RELATED_P (insn) = 1;
12321 m->fs.sp_offset += rounded_size;
12322 emit_insn (gen_blockage ());
12324 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12325 is equal to ROUNDED_SIZE. */
12327 if (size != rounded_size)
12328 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12329 GEN_INT (rounded_size - size), -1,
12330 m->fs.cfa_reg == stack_pointer_rtx);
12331 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12333 release_scratch_register_on_entry (&sr);
12336 /* Make sure nothing is scheduled before we are done. */
12337 emit_insn (gen_blockage ());
12340 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
12342 static void
12343 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
12345 /* We skip the probe for the first interval + a small dope of 4 words and
12346 probe that many bytes past the specified size to maintain a protection
12347 area at the bottom of the stack. */
12348 const int dope = 4 * UNITS_PER_WORD;
12349 rtx size_rtx = GEN_INT (size), last;
12351 /* See if we have a constant small number of probes to generate. If so,
12352 that's the easy case. The run-time loop is made up of 9 insns in the
12353 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12354 for n # of intervals. */
12355 if (size <= 4 * get_probe_interval ())
12357 HOST_WIDE_INT i, adjust;
12358 bool first_probe = true;
12360 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12361 values of N from 1 until it exceeds SIZE. If only one probe is
12362 needed, this will not generate any code. Then adjust and probe
12363 to PROBE_INTERVAL + SIZE. */
12364 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12366 if (first_probe)
12368 adjust = 2 * get_probe_interval () + dope;
12369 first_probe = false;
12371 else
12372 adjust = get_probe_interval ();
12374 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12375 plus_constant (Pmode, stack_pointer_rtx,
12376 -adjust)));
12377 emit_stack_probe (stack_pointer_rtx);
12380 if (first_probe)
12381 adjust = size + get_probe_interval () + dope;
12382 else
12383 adjust = size + get_probe_interval () - i;
12385 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12386 plus_constant (Pmode, stack_pointer_rtx,
12387 -adjust)));
12388 emit_stack_probe (stack_pointer_rtx);
12390 /* Adjust back to account for the additional first interval. */
12391 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12392 plus_constant (Pmode, stack_pointer_rtx,
12393 (get_probe_interval ()
12394 + dope))));
12397 /* Otherwise, do the same as above, but in a loop. Note that we must be
12398 extra careful with variables wrapping around because we might be at
12399 the very top (or the very bottom) of the address space and we have
12400 to be able to handle this case properly; in particular, we use an
12401 equality test for the loop condition. */
12402 else
12404 HOST_WIDE_INT rounded_size;
12405 struct scratch_reg sr;
12407 get_scratch_register_on_entry (&sr);
12410 /* Step 1: round SIZE to the previous multiple of the interval. */
12412 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12415 /* Step 2: compute initial and final value of the loop counter. */
12417 /* SP = SP_0 + PROBE_INTERVAL. */
12418 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12419 plus_constant (Pmode, stack_pointer_rtx,
12420 - (get_probe_interval () + dope))));
12422 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12423 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12424 emit_insn (gen_rtx_SET (sr.reg,
12425 plus_constant (Pmode, stack_pointer_rtx,
12426 -rounded_size)));
12427 else
12429 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12430 emit_insn (gen_rtx_SET (sr.reg,
12431 gen_rtx_PLUS (Pmode, sr.reg,
12432 stack_pointer_rtx)));
12436 /* Step 3: the loop
12440 SP = SP + PROBE_INTERVAL
12441 probe at SP
12443 while (SP != LAST_ADDR)
12445 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12446 values of N from 1 until it is equal to ROUNDED_SIZE. */
12448 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12451 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12452 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12454 if (size != rounded_size)
12456 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12457 plus_constant (Pmode, stack_pointer_rtx,
12458 rounded_size - size)));
12459 emit_stack_probe (stack_pointer_rtx);
12462 /* Adjust back to account for the additional first interval. */
12463 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12464 plus_constant (Pmode, stack_pointer_rtx,
12465 (get_probe_interval ()
12466 + dope))));
12468 release_scratch_register_on_entry (&sr);
12471 /* Even if the stack pointer isn't the CFA register, we need to correctly
12472 describe the adjustments made to it, in particular differentiate the
12473 frame-related ones from the frame-unrelated ones. */
12474 if (size > 0)
12476 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12477 XVECEXP (expr, 0, 0)
12478 = gen_rtx_SET (stack_pointer_rtx,
12479 plus_constant (Pmode, stack_pointer_rtx, -size));
12480 XVECEXP (expr, 0, 1)
12481 = gen_rtx_SET (stack_pointer_rtx,
12482 plus_constant (Pmode, stack_pointer_rtx,
12483 get_probe_interval () + dope + size));
12484 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12485 RTX_FRAME_RELATED_P (last) = 1;
12487 cfun->machine->fs.sp_offset += size;
12490 /* Make sure nothing is scheduled before we are done. */
12491 emit_insn (gen_blockage ());
12494 /* Adjust the stack pointer up to REG while probing it. */
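/* Roughly, the emitted loop looks like this (AT&T syntax, 32-bit shown;
   operand sizes and register names follow the target word size):

	.LPSRL0:
		subl	$probe_interval, %esp
		orl	$0, (%esp)
		cmpl	<last_addr_reg>, %esp
		jne	.LPSRL0
*/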
12496 const char *
12497 output_adjust_stack_and_probe (rtx reg)
12499 static int labelno = 0;
12500 char loop_lab[32];
12501 rtx xops[2];
12503 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12505 /* Loop. */
12506 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12508 /* SP = SP + PROBE_INTERVAL. */
12509 xops[0] = stack_pointer_rtx;
12510 xops[1] = GEN_INT (get_probe_interval ());
12511 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12513 /* Probe at SP. */
12514 xops[1] = const0_rtx;
12515 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
12517 /* Test if SP == LAST_ADDR. */
12518 xops[0] = stack_pointer_rtx;
12519 xops[1] = reg;
12520 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12522 /* Branch. */
12523 fputs ("\tjne\t", asm_out_file);
12524 assemble_name_raw (asm_out_file, loop_lab);
12525 fputc ('\n', asm_out_file);
12527 return "";
12530 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
12531 inclusive. These are offsets from the current stack pointer. */
12533 static void
12534 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
12536 /* See if we have a constant small number of probes to generate. If so,
12537 that's the easy case. The run-time loop is made up of 6 insns in the
12538 generic case while the compile-time loop is made up of n insns for n #
12539 of intervals. */
12540 if (size <= 6 * get_probe_interval ())
12542 HOST_WIDE_INT i;
12544 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
12545 it exceeds SIZE. If only one probe is needed, this will not
12546 generate any code. Then probe at FIRST + SIZE. */
12547 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12548 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12549 -(first + i)));
12551 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12552 -(first + size)));
12555 /* Otherwise, do the same as above, but in a loop. Note that we must be
12556 extra careful with variables wrapping around because we might be at
12557 the very top (or the very bottom) of the address space and we have
12558 to be able to handle this case properly; in particular, we use an
12559 equality test for the loop condition. */
12560 else
12562 HOST_WIDE_INT rounded_size, last;
12563 struct scratch_reg sr;
12565 get_scratch_register_on_entry (&sr);
12568 /* Step 1: round SIZE to the previous multiple of the interval. */
12570 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12573 /* Step 2: compute initial and final value of the loop counter. */
12575 /* TEST_OFFSET = FIRST. */
12576 emit_move_insn (sr.reg, GEN_INT (-first));
12578 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
12579 last = first + rounded_size;
12582 /* Step 3: the loop
12586 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
12587 probe at TEST_ADDR
12589 while (TEST_ADDR != LAST_ADDR)
12591 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
12592 until it is equal to ROUNDED_SIZE. */
12594 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
12597 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
12598 that SIZE is equal to ROUNDED_SIZE. */
12600 if (size != rounded_size)
12601 emit_stack_probe (plus_constant (Pmode,
12602 gen_rtx_PLUS (Pmode,
12603 stack_pointer_rtx,
12604 sr.reg),
12605 rounded_size - size));
12607 release_scratch_register_on_entry (&sr);
12610 /* Make sure nothing is scheduled before we are done. */
12611 emit_insn (gen_blockage ());
12614 /* Probe a range of stack addresses from REG to END, inclusive. These are
12615 offsets from the current stack pointer. */
12617 const char *
12618 output_probe_stack_range (rtx reg, rtx end)
12620 static int labelno = 0;
12621 char loop_lab[32];
12622 rtx xops[3];
12624 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12626 /* Loop. */
12627 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12629 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
12630 xops[0] = reg;
12631 xops[1] = GEN_INT (get_probe_interval ());
12632 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12634 /* Probe at TEST_ADDR. */
12635 xops[0] = stack_pointer_rtx;
12636 xops[1] = reg;
12637 xops[2] = const0_rtx;
12638 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
12640 /* Test if TEST_ADDR == LAST_ADDR. */
12641 xops[0] = reg;
12642 xops[1] = end;
12643 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12645 /* Branch. */
12646 fputs ("\tjne\t", asm_out_file);
12647 assemble_name_raw (asm_out_file, loop_lab);
12648 fputc ('\n', asm_out_file);
12650 return "";
12653 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
12654 will guide prologue/epilogue to be generated in correct form. */
12656 static void
12657 ix86_finalize_stack_frame_flags (void)
12659 /* Check if stack realignment is really needed after reload, and
12660 store the result in cfun. */
12661 unsigned int incoming_stack_boundary
12662 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
12663 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
12664 unsigned int stack_alignment
12665 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
12666 ? crtl->max_used_stack_slot_alignment
12667 : crtl->stack_alignment_needed);
12668 unsigned int stack_realign
12669 = (incoming_stack_boundary < stack_alignment);
12670 bool recompute_frame_layout_p = false;
12672 if (crtl->stack_realign_finalized)
12674 /* After stack_realign_needed is finalized, we can no longer
12675 change it. */
12676 gcc_assert (crtl->stack_realign_needed == stack_realign);
12677 return;
12680 /* If the only reason for frame_pointer_needed is that we conservatively
12681 assumed stack realignment might be needed or -fno-omit-frame-pointer
12682 is used, but in the end nothing that needed the stack alignment was
12683 spilled and there was no stack access, clear frame_pointer_needed and
12684 say we don't need stack realignment. */
12685 if ((stack_realign || !flag_omit_frame_pointer)
12686 && frame_pointer_needed
12687 && crtl->is_leaf
12688 && crtl->sp_is_unchanging
12689 && !ix86_current_function_calls_tls_descriptor
12690 && !crtl->accesses_prior_frames
12691 && !cfun->calls_alloca
12692 && !crtl->calls_eh_return
12693 /* See ira_setup_eliminable_regset for the rationale. */
12694 && !(STACK_CHECK_MOVING_SP
12695 && flag_stack_check
12696 && flag_exceptions
12697 && cfun->can_throw_non_call_exceptions)
12698 && !ix86_frame_pointer_required ()
12699 && get_frame_size () == 0
12700 && ix86_nsaved_sseregs () == 0
12701 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
12703 HARD_REG_SET set_up_by_prologue, prologue_used;
12704 basic_block bb;
12706 CLEAR_HARD_REG_SET (prologue_used);
12707 CLEAR_HARD_REG_SET (set_up_by_prologue);
12708 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
12709 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
12710 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
12711 HARD_FRAME_POINTER_REGNUM);
12713 /* The preferred stack alignment is the minimum stack alignment. */
12714 if (stack_alignment > crtl->preferred_stack_boundary)
12715 stack_alignment = crtl->preferred_stack_boundary;
12717 bool require_stack_frame = false;
12719 FOR_EACH_BB_FN (bb, cfun)
12721 rtx_insn *insn;
12722 FOR_BB_INSNS (bb, insn)
12723 if (NONDEBUG_INSN_P (insn)
12724 && requires_stack_frame_p (insn, prologue_used,
12725 set_up_by_prologue))
12727 require_stack_frame = true;
12729 if (stack_realign)
12731 /* Find the maximum stack alignment. */
12732 subrtx_iterator::array_type array;
12733 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
12734 if (MEM_P (*iter)
12735 && (reg_mentioned_p (stack_pointer_rtx,
12736 *iter)
12737 || reg_mentioned_p (frame_pointer_rtx,
12738 *iter)))
12740 unsigned int alignment = MEM_ALIGN (*iter);
12741 if (alignment > stack_alignment)
12742 stack_alignment = alignment;
12748 if (require_stack_frame)
12750 /* Stack frame is required. If stack alignment needed is less
12751 than incoming stack boundary, don't realign stack. */
12752 stack_realign = incoming_stack_boundary < stack_alignment;
12753 if (!stack_realign)
12755 crtl->max_used_stack_slot_alignment
12756 = incoming_stack_boundary;
12757 crtl->stack_alignment_needed
12758 = incoming_stack_boundary;
12759 /* Also update preferred_stack_boundary for leaf
12760 functions. */
12761 crtl->preferred_stack_boundary
12762 = incoming_stack_boundary;
12765 else
12767 /* If drap has been set, but it actually isn't live at the
12768 start of the function, there is no reason to set it up. */
12769 if (crtl->drap_reg)
12771 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12772 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
12773 REGNO (crtl->drap_reg)))
12775 crtl->drap_reg = NULL_RTX;
12776 crtl->need_drap = false;
12779 else
12780 cfun->machine->no_drap_save_restore = true;
12782 frame_pointer_needed = false;
12783 stack_realign = false;
12784 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
12785 crtl->stack_alignment_needed = incoming_stack_boundary;
12786 crtl->stack_alignment_estimated = incoming_stack_boundary;
12787 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
12788 crtl->preferred_stack_boundary = incoming_stack_boundary;
12789 df_finish_pass (true);
12790 df_scan_alloc (NULL);
12791 df_scan_blocks ();
12792 df_compute_regs_ever_live (true);
12793 df_analyze ();
12795 if (flag_var_tracking)
12797 /* Since frame pointer is no longer available, replace it with
12798 stack pointer - UNITS_PER_WORD in debug insns. */
12799 df_ref ref, next;
12800 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
12801 ref; ref = next)
12803 next = DF_REF_NEXT_REG (ref);
12804 if (!DF_REF_INSN_INFO (ref))
12805 continue;
12807 /* Make sure the next ref is for a different instruction,
12808 so that we're not affected by the rescan. */
12809 rtx_insn *insn = DF_REF_INSN (ref);
12810 while (next && DF_REF_INSN (next) == insn)
12811 next = DF_REF_NEXT_REG (next);
12813 if (DEBUG_INSN_P (insn))
12815 bool changed = false;
12816 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
12818 rtx *loc = DF_REF_LOC (ref);
12819 if (*loc == hard_frame_pointer_rtx)
12821 *loc = plus_constant (Pmode,
12822 stack_pointer_rtx,
12823 -UNITS_PER_WORD);
12824 changed = true;
12827 if (changed)
12828 df_insn_rescan (insn);
12833 recompute_frame_layout_p = true;
12837 if (crtl->stack_realign_needed != stack_realign)
12838 recompute_frame_layout_p = true;
12839 crtl->stack_realign_needed = stack_realign;
12840 crtl->stack_realign_finalized = true;
12841 if (recompute_frame_layout_p)
12842 ix86_compute_frame_layout ();
12845 /* Delete SET_GOT right after entry block if it is allocated to reg. */
12847 static void
12848 ix86_elim_entry_set_got (rtx reg)
12850 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12851 rtx_insn *c_insn = BB_HEAD (bb);
12852 if (!NONDEBUG_INSN_P (c_insn))
12853 c_insn = next_nonnote_nondebug_insn (c_insn);
12854 if (c_insn && NONJUMP_INSN_P (c_insn))
12856 rtx pat = PATTERN (c_insn);
12857 if (GET_CODE (pat) == PARALLEL)
12859 rtx vec = XVECEXP (pat, 0, 0);
12860 if (GET_CODE (vec) == SET
12861 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
12862 && REGNO (XEXP (vec, 0)) == REGNO (reg))
12863 delete_insn (c_insn);
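/* Helpers that build the store (or load) SET of REG at OFFSET from
   FRAME_REG; used below when emitting the out-of-line ms2sysv register
   saves. */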
12868 static rtx
12869 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
12871 rtx addr, mem;
12873 if (offset)
12874 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
12875 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
12876 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
12879 static inline rtx
12880 gen_frame_load (rtx reg, rtx frame_reg, int offset)
12882 return gen_frame_set (reg, frame_reg, offset, false);
12885 static inline rtx
12886 gen_frame_store (rtx reg, rtx frame_reg, int offset)
12888 return gen_frame_set (reg, frame_reg, offset, true);
12891 static void
12892 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
12894 struct machine_function *m = cfun->machine;
12895 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
12896 + m->call_ms2sysv_extra_regs;
12897 rtvec v = rtvec_alloc (ncregs + 1);
12898 unsigned int align, i, vi = 0;
12899 rtx_insn *insn;
12900 rtx sym, addr;
12901 rtx rax = gen_rtx_REG (word_mode, AX_REG);
12902 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12904 /* AL should only be live with sysv_abi. */
12905 gcc_assert (!ix86_eax_live_at_start_p ());
12906 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
12908 /* Set up RAX as the stub's base pointer. We use stack_realign_offset so the
12909 address is correct whether or not we've actually realigned the stack. */
12910 align = GET_MODE_ALIGNMENT (V4SFmode);
12911 addr = choose_baseaddr (frame.stack_realign_offset
12912 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
12913 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
12915 emit_insn (gen_rtx_SET (rax, addr));
12917 /* Get the stub symbol. */
12918 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
12919 : XLOGUE_STUB_SAVE);
12920 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
12922 for (i = 0; i < ncregs; ++i)
12924 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
12925 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
12926 r.regno);
12927 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
12930 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
12932 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
12933 RTX_FRAME_RELATED_P (insn) = true;
12936 /* Expand the prologue into a bunch of separate insns. */
12938 void
12939 ix86_expand_prologue (void)
12941 struct machine_function *m = cfun->machine;
12942 rtx insn, t;
12943 struct ix86_frame frame;
12944 HOST_WIDE_INT allocate;
12945 bool int_registers_saved;
12946 bool sse_registers_saved;
12947 bool save_stub_call_needed;
12948 rtx static_chain = NULL_RTX;
12950 if (ix86_function_naked (current_function_decl))
12951 return;
12953 ix86_finalize_stack_frame_flags ();
12955 /* DRAP should not coexist with stack_realign_fp */
12956 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
12958 memset (&m->fs, 0, sizeof (m->fs));
12960 /* Initialize CFA state for before the prologue. */
12961 m->fs.cfa_reg = stack_pointer_rtx;
12962 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
12964 /* Track SP offset to the CFA. We continue tracking this after we've
12965 swapped the CFA register away from SP. In the case of re-alignment
12966 this is fudged; we're interested in offsets within the local frame. */
12967 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
12968 m->fs.sp_valid = true;
12969 m->fs.sp_realigned = false;
12971 frame = m->frame;
12973 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
12975 /* We should have already generated an error for any use of
12976 ms_hook on a nested function. */
12977 gcc_checking_assert (!ix86_static_chain_on_stack);
12979 /* Check if profiling is active and we shall use the profiling-before-
12980 prologue variant. If so, sorry. */
12981 if (crtl->profile && flag_fentry != 0)
12982 sorry ("ms_hook_prologue attribute isn%'t compatible "
12983 "with -mfentry for 32-bit");
12985 /* In ix86_asm_output_function_label we emitted:
12986 8b ff movl.s %edi,%edi
12987 55 push %ebp
12988 8b ec movl.s %esp,%ebp
12990 This matches the hookable function prologue in Win32 API
12991 functions in Microsoft Windows XP Service Pack 2 and newer.
12992 Wine uses this to enable Windows apps to hook the Win32 API
12993 functions provided by Wine.
12995 What that means is that we've already set up the frame pointer. */
12997 if (frame_pointer_needed
12998 && !(crtl->drap_reg && crtl->stack_realign_needed))
13000 rtx push, mov;
13002 /* We've decided to use the frame pointer already set up.
13003 Describe this to the unwinder by pretending that both
13004 push and mov insns happen right here.
13006 Putting the unwind info here at the end of the ms_hook
13007 is done so that we can make absolutely certain we get
13008 the required byte sequence at the start of the function,
13009 rather than relying on an assembler that can produce
13010 the exact encoding required.
13012 However it does mean (in the unpatched case) that we have
13013 a 1 insn window where the asynchronous unwind info is
13014 incorrect. However, if we placed the unwind info at
13015 its correct location we would have incorrect unwind info
13016 in the patched case. Which is probably all moot since
13017 I don't expect Wine generates dwarf2 unwind info for the
13018 system libraries that use this feature. */
13020 insn = emit_insn (gen_blockage ());
13022 push = gen_push (hard_frame_pointer_rtx);
13023 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13024 stack_pointer_rtx);
13025 RTX_FRAME_RELATED_P (push) = 1;
13026 RTX_FRAME_RELATED_P (mov) = 1;
13028 RTX_FRAME_RELATED_P (insn) = 1;
13029 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13030 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13032 /* Note that gen_push incremented m->fs.cfa_offset, even
13033 though we didn't emit the push insn here. */
13034 m->fs.cfa_reg = hard_frame_pointer_rtx;
13035 m->fs.fp_offset = m->fs.cfa_offset;
13036 m->fs.fp_valid = true;
13038 else
13040 /* The frame pointer is not needed so pop %ebp again.
13041 This leaves us with a pristine state. */
13042 emit_insn (gen_pop (hard_frame_pointer_rtx));
13046 /* The first insn of a function that accepts its static chain on the
13047 stack is to push the register that would be filled in by a direct
13048 call. This insn will be skipped by the trampoline. */
13049 else if (ix86_static_chain_on_stack)
13051 static_chain = ix86_static_chain (cfun->decl, false);
13052 insn = emit_insn (gen_push (static_chain));
13053 emit_insn (gen_blockage ());
13055 /* We don't want to interpret this push insn as a register save,
13056 only as a stack adjustment. The real copy of the register as
13057 a save will be done later, if needed. */
13058 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13059 t = gen_rtx_SET (stack_pointer_rtx, t);
13060 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13061 RTX_FRAME_RELATED_P (insn) = 1;
13064 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13065 DRAP is needed and stack realignment is really needed after reload. */
13066 if (stack_realign_drap)
13068 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13070 /* Can't use DRAP in interrupt function. */
13071 if (cfun->machine->func_type != TYPE_NORMAL)
13072 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13073 "in interrupt service routine. This may be worked "
13074 "around by avoiding functions with aggregate return.");
13076 /* Only need to push parameter pointer reg if it is caller saved. */
13077 if (!call_used_regs[REGNO (crtl->drap_reg)])
13079 /* Push arg pointer reg */
13080 insn = emit_insn (gen_push (crtl->drap_reg));
13081 RTX_FRAME_RELATED_P (insn) = 1;
13084 /* Grab the argument pointer. */
13085 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13086 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13087 RTX_FRAME_RELATED_P (insn) = 1;
13088 m->fs.cfa_reg = crtl->drap_reg;
13089 m->fs.cfa_offset = 0;
13091 /* Align the stack. */
13092 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13093 stack_pointer_rtx,
13094 GEN_INT (-align_bytes)));
13095 RTX_FRAME_RELATED_P (insn) = 1;
13097 /* Replicate the return address on the stack so that the return
13098 address can be reached via the (argp - 1) slot. This is needed
13099 to implement the macro RETURN_ADDR_RTX and the intrinsic function
13100 expand_builtin_return_addr, etc. */
13101 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13102 t = gen_frame_mem (word_mode, t);
13103 insn = emit_insn (gen_push (t));
13104 RTX_FRAME_RELATED_P (insn) = 1;
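/* The push above reads the original return address from one word below
   the DRAP register and places a copy at the top of the realigned frame,
   so the realigned frame begins with the usual return-address slot even
   though an unknown amount of alignment padding now separates it from
   the caller's frame.  */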
13106 /* For the purposes of frame and register save area addressing,
13107 we've started over with a new frame. */
13108 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13109 m->fs.realigned = true;
13111 if (static_chain)
13113 /* Replicate the static chain on the stack so that it can be
13114 reached via the (argp - 2) slot. This is needed for nested
13115 functions with stack realignment. */
13116 insn = emit_insn (gen_push (static_chain));
13117 RTX_FRAME_RELATED_P (insn) = 1;
13121 int_registers_saved = (frame.nregs == 0);
13122 sse_registers_saved = (frame.nsseregs == 0);
13123 save_stub_call_needed = (m->call_ms2sysv);
13124 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13126 if (frame_pointer_needed && !m->fs.fp_valid)
13128 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13129 slower on all targets. Also sdb didn't like it. */
13130 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13131 RTX_FRAME_RELATED_P (insn) = 1;
13133 /* Push registers now, before setting the frame pointer
13134 on SEH target. */
13135 if (!int_registers_saved
13136 && TARGET_SEH
13137 && !frame.save_regs_using_mov)
13139 ix86_emit_save_regs ();
13140 int_registers_saved = true;
13141 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13144 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13146 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13147 RTX_FRAME_RELATED_P (insn) = 1;
13149 if (m->fs.cfa_reg == stack_pointer_rtx)
13150 m->fs.cfa_reg = hard_frame_pointer_rtx;
13151 m->fs.fp_offset = m->fs.sp_offset;
13152 m->fs.fp_valid = true;
13156 if (!int_registers_saved)
13158 /* If saving registers via PUSH, do so now. */
13159 if (!frame.save_regs_using_mov)
13161 ix86_emit_save_regs ();
13162 int_registers_saved = true;
13163 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13166 /* When using the red zone we may start register saving before allocating
13167 the stack frame, saving one cycle of the prologue. However, avoid
13168 doing this if we have to probe the stack; at least on x86_64 the
13169 stack probe can turn into a call that clobbers a red zone location. */
13170 else if (ix86_using_red_zone ()
13171 && (! TARGET_STACK_PROBE
13172 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13174 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13175 int_registers_saved = true;
13179 if (stack_realign_fp)
13181 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13182 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13184 /* Record last valid frame pointer offset. */
13185 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13187 /* The computation of the size of the re-aligned stack frame means
13188 that we must allocate the size of the register save area before
13189 performing the actual alignment. Otherwise we cannot guarantee
13190 that there's enough storage above the realignment point. */
13191 allocate = frame.reg_save_offset - m->fs.sp_offset
13192 + frame.stack_realign_allocate;
13193 if (allocate)
13194 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13195 GEN_INT (-allocate), -1, false);
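/* Roughly speaking: everything up to reg_save_offset plus
   stack_realign_allocate extra bytes is allocated before the "and" below
   rounds the stack pointer down, so the rounding can only eat into the
   extra allocation and never into the register save area.  */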
13197 /* Align the stack. */
13198 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13199 stack_pointer_rtx,
13200 GEN_INT (-align_bytes)));
13201 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13202 m->fs.sp_realigned_offset = m->fs.sp_offset
13203 - frame.stack_realign_allocate;
13204 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13205 Beyond this point, stack access should be done via choose_baseaddr or
13206 by using sp_valid_at and fp_valid_at to determine the correct base
13207 register. Henceforth, any CFA offset should be thought of as logical
13208 and not physical. */
13209 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13210 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13211 m->fs.sp_realigned = true;
13213 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13214 is needed to describe where a register is saved using a realigned
13215 stack pointer, so we need to invalidate the stack pointer for that
13216 target. */
13217 if (TARGET_SEH)
13218 m->fs.sp_valid = false;
13220 /* If SP offset is non-immediate after allocation of the stack frame,
13221 then emit SSE saves or stub call prior to allocating the rest of the
13222 stack frame. This is less efficient for the out-of-line stub because
13223 we can't combine allocations across the call barrier, but it's better
13224 than using a scratch register. */
13225 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13226 - m->fs.sp_realigned_offset),
13227 Pmode))
13229 if (!sse_registers_saved)
13231 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13232 sse_registers_saved = true;
13234 else if (save_stub_call_needed)
13236 ix86_emit_outlined_ms2sysv_save (frame);
13237 save_stub_call_needed = false;
13242 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13244 if (flag_stack_usage_info)
13246 /* We start to count from ARG_POINTER. */
13247 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13249 /* If it was realigned, take into account the fake frame. */
13250 if (stack_realign_drap)
13252 if (ix86_static_chain_on_stack)
13253 stack_size += UNITS_PER_WORD;
13255 if (!call_used_regs[REGNO (crtl->drap_reg)])
13256 stack_size += UNITS_PER_WORD;
13258 /* This over-estimates by 1 minimal-stack-alignment-unit but
13259 mitigates that by counting in the new return address slot. */
13260 current_function_dynamic_stack_size
13261 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13264 current_function_static_stack_size = stack_size;
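/* Illustrative accounting (numbers are made up): with
   stack_pointer_offset = 88, -fstack-usage reports a static size of
   88 bytes measured from the argument pointer; if DRAP realignment with
   a 32-byte requirement is in use, up to 32 additional padding bytes are
   possible at run time and are counted as dynamic stack size above.  */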
13267 /* On SEH target with very large frame size, allocate an area to save
13268 SSE registers (as the very large allocation won't be described). */
13269 if (TARGET_SEH
13270 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13271 && !sse_registers_saved)
13273 HOST_WIDE_INT sse_size =
13274 frame.sse_reg_save_offset - frame.reg_save_offset;
13276 gcc_assert (int_registers_saved);
13278 /* No need to do stack checking as the area will be immediately
13279 written. */
13280 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13281 GEN_INT (-sse_size), -1,
13282 m->fs.cfa_reg == stack_pointer_rtx);
13283 allocate -= sse_size;
13284 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13285 sse_registers_saved = true;
13288 /* The stack has already been decremented by the instruction calling us
13289 so probe if the size is non-negative to preserve the protection area. */
13290 if (allocate >= 0
13291 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13292 || flag_stack_clash_protection))
13294 /* This assert wants to verify that integer registers were saved
13295 prior to probing. This is necessary when probing may be implemented
13296 as a function call (Windows). It is not necessary for stack clash
13297 protection probing. */
13298 if (!flag_stack_clash_protection)
13299 gcc_assert (int_registers_saved);
13301 if (flag_stack_clash_protection)
13303 ix86_adjust_stack_and_probe_stack_clash (allocate);
13304 allocate = 0;
13306 else if (STACK_CHECK_MOVING_SP)
13308 if (!(crtl->is_leaf && !cfun->calls_alloca
13309 && allocate <= get_probe_interval ()))
13311 ix86_adjust_stack_and_probe (allocate);
13312 allocate = 0;
13315 else
13317 HOST_WIDE_INT size = allocate;
13319 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13320 size = 0x80000000 - get_stack_check_protect () - 1;
13322 if (TARGET_STACK_PROBE)
13324 if (crtl->is_leaf && !cfun->calls_alloca)
13326 if (size > get_probe_interval ())
13327 ix86_emit_probe_stack_range (0, size);
13329 else
13330 ix86_emit_probe_stack_range (0,
13331 size + get_stack_check_protect ());
13333 else
13335 if (crtl->is_leaf && !cfun->calls_alloca)
13337 if (size > get_probe_interval ()
13338 && size > get_stack_check_protect ())
13339 ix86_emit_probe_stack_range (get_stack_check_protect (),
13340 size - get_stack_check_protect ());
13342 else
13343 ix86_emit_probe_stack_range (get_stack_check_protect (), size);
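/* Worked example for the !TARGET_STACK_PROBE path (illustrative values):
   with size = 0x40000 and get_stack_check_protect () = 0x3000, a leaf
   function probes offsets 0x3000 .. 0x40000 (the caller is assumed to
   have kept the first 0x3000 bytes valid), while a non-leaf function
   probes 0x3000 .. 0x43000 so the guard area stays valid for its own
   callees.  */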
13348 if (allocate == 0)
13350 else if (!ix86_target_stack_probe ()
13351 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13353 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13354 GEN_INT (-allocate), -1,
13355 m->fs.cfa_reg == stack_pointer_rtx);
13357 else
13359 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13360 rtx r10 = NULL;
13361 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13362 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13363 bool eax_live = ix86_eax_live_at_start_p ();
13364 bool r10_live = false;
13366 if (TARGET_64BIT)
13367 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13369 if (eax_live)
13371 insn = emit_insn (gen_push (eax));
13372 allocate -= UNITS_PER_WORD;
13373 /* Note that SEH directives need to continue tracking the stack
13374 pointer even after the frame pointer has been set up. */
13375 if (sp_is_cfa_reg || TARGET_SEH)
13377 if (sp_is_cfa_reg)
13378 m->fs.cfa_offset += UNITS_PER_WORD;
13379 RTX_FRAME_RELATED_P (insn) = 1;
13380 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13381 gen_rtx_SET (stack_pointer_rtx,
13382 plus_constant (Pmode, stack_pointer_rtx,
13383 -UNITS_PER_WORD)));
13387 if (r10_live)
13389 r10 = gen_rtx_REG (Pmode, R10_REG);
13390 insn = emit_insn (gen_push (r10));
13391 allocate -= UNITS_PER_WORD;
13392 if (sp_is_cfa_reg || TARGET_SEH)
13394 if (sp_is_cfa_reg)
13395 m->fs.cfa_offset += UNITS_PER_WORD;
13396 RTX_FRAME_RELATED_P (insn) = 1;
13397 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13398 gen_rtx_SET (stack_pointer_rtx,
13399 plus_constant (Pmode, stack_pointer_rtx,
13400 -UNITS_PER_WORD)));
13404 emit_move_insn (eax, GEN_INT (allocate));
13405 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13407 /* Use the fact that AX still contains ALLOCATE. */
13408 adjust_stack_insn = (Pmode == DImode
13409 ? gen_pro_epilogue_adjust_stack_di_sub
13410 : gen_pro_epilogue_adjust_stack_si_sub);
13412 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13413 stack_pointer_rtx, eax));
13415 if (sp_is_cfa_reg || TARGET_SEH)
13417 if (sp_is_cfa_reg)
13418 m->fs.cfa_offset += allocate;
13419 RTX_FRAME_RELATED_P (insn) = 1;
13420 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13421 gen_rtx_SET (stack_pointer_rtx,
13422 plus_constant (Pmode, stack_pointer_rtx,
13423 -allocate)));
13425 m->fs.sp_offset += allocate;
13427 /* Use stack_pointer_rtx for relative addressing so that code
13428 works for realigned stack, too. */
13429 if (r10_live && eax_live)
13431 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13432 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13433 gen_frame_mem (word_mode, t));
13434 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13435 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13436 gen_frame_mem (word_mode, t));
13438 else if (eax_live || r10_live)
13440 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13441 emit_move_insn (gen_rtx_REG (word_mode,
13442 (eax_live ? AX_REG : R10_REG)),
13443 gen_frame_mem (word_mode, t));
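/* At this point %rax still holds ALLOCATE, so the loads above find the
   saved copies just beyond the new allocation: %r10's save slot at
   sp + allocate and %rax's one word above it, matching the order in
   which they were pushed.  */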
13446 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13448 /* If we haven't already set up the frame pointer, do so now. */
13449 if (frame_pointer_needed && !m->fs.fp_valid)
13451 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13452 GEN_INT (frame.stack_pointer_offset
13453 - frame.hard_frame_pointer_offset));
13454 insn = emit_insn (insn);
13455 RTX_FRAME_RELATED_P (insn) = 1;
13456 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13458 if (m->fs.cfa_reg == stack_pointer_rtx)
13459 m->fs.cfa_reg = hard_frame_pointer_rtx;
13460 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13461 m->fs.fp_valid = true;
13464 if (!int_registers_saved)
13465 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13466 if (!sse_registers_saved)
13467 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13468 else if (save_stub_call_needed)
13469 ix86_emit_outlined_ms2sysv_save (frame);
13471 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
13472 in the prologue. */
13473 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13475 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13476 insn = emit_insn (gen_set_got (pic));
13477 RTX_FRAME_RELATED_P (insn) = 1;
13478 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13479 emit_insn (gen_prologue_use (pic));
13480 /* Delete the already emitted SET_GOT, if one exists and is allocated
13481 to REAL_PIC_OFFSET_TABLE_REGNUM. */
13482 ix86_elim_entry_set_got (pic);
13485 if (crtl->drap_reg && !crtl->stack_realign_needed)
13487 /* vDRAP is set up, but after reload it turns out stack realignment
13488 isn't necessary; here we emit the prologue to set up DRAP
13489 without the stack realignment adjustment. */
13490 t = choose_baseaddr (0, NULL);
13491 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13494 /* Prevent instructions from being scheduled into the register save push
13495 sequence when access to the red zone area is done through the frame
13496 pointer. The offset between the frame pointer and the stack pointer is
13497 calculated relative to the value of the stack pointer at the end of the
13498 function prologue, and moving instructions that access the red zone area
13499 via the frame pointer into the push sequence violates this assumption. */
13500 if (frame_pointer_needed && frame.red_zone_size)
13501 emit_insn (gen_memory_blockage ());
13503 /* SEH requires that the prologue end within 256 bytes of the start of
13504 the function. Prevent instruction schedules that would extend that.
13505 Further, prevent alloca modifications to the stack pointer from being
13506 combined with prologue modifications. */
13507 if (TARGET_SEH)
13508 emit_insn (gen_prologue_use (stack_pointer_rtx));
13511 /* Emit code to restore REG using a POP insn. */
13513 static void
13514 ix86_emit_restore_reg_using_pop (rtx reg)
13516 struct machine_function *m = cfun->machine;
13517 rtx_insn *insn = emit_insn (gen_pop (reg));
13519 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
13520 m->fs.sp_offset -= UNITS_PER_WORD;
13522 if (m->fs.cfa_reg == crtl->drap_reg
13523 && REGNO (reg) == REGNO (crtl->drap_reg))
13525 /* Previously we'd represented the CFA as an expression
13526 like *(%ebp - 8). We've just popped that value from
13527 the stack, which means we need to reset the CFA to
13528 the drap register. This will remain until we restore
13529 the stack pointer. */
13530 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13531 RTX_FRAME_RELATED_P (insn) = 1;
13533 /* This means that the DRAP register is valid for addressing too. */
13534 m->fs.drap_valid = true;
13535 return;
13538 if (m->fs.cfa_reg == stack_pointer_rtx)
13540 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
13541 x = gen_rtx_SET (stack_pointer_rtx, x);
13542 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
13543 RTX_FRAME_RELATED_P (insn) = 1;
13545 m->fs.cfa_offset -= UNITS_PER_WORD;
13548 /* When the frame pointer is the CFA, and we pop it, we are
13549 swapping back to the stack pointer as the CFA. This happens
13550 for stack frames that don't allocate other data, so we assume
13551 the stack pointer is now pointing at the return address, i.e.
13552 the function entry state, which makes the offset one word. */
13553 if (reg == hard_frame_pointer_rtx)
13555 m->fs.fp_valid = false;
13556 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13558 m->fs.cfa_reg = stack_pointer_rtx;
13559 m->fs.cfa_offset -= UNITS_PER_WORD;
13561 add_reg_note (insn, REG_CFA_DEF_CFA,
13562 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
13563 GEN_INT (m->fs.cfa_offset)));
13564 RTX_FRAME_RELATED_P (insn) = 1;
13569 /* Emit code to restore saved registers using POP insns. */
13571 static void
13572 ix86_emit_restore_regs_using_pop (void)
13574 unsigned int regno;
13576 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13577 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
13578 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
13581 /* Emit code and notes for the LEAVE instruction. If INSN is non-null,
13582 the emit is omitted and only the notes are attached. */
13584 static void
13585 ix86_emit_leave (rtx_insn *insn)
13587 struct machine_function *m = cfun->machine;
13588 if (!insn)
13589 insn = emit_insn (ix86_gen_leave ());
13591 ix86_add_queued_cfa_restore_notes (insn);
13593 gcc_assert (m->fs.fp_valid);
13594 m->fs.sp_valid = true;
13595 m->fs.sp_realigned = false;
13596 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
13597 m->fs.fp_valid = false;
13599 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13601 m->fs.cfa_reg = stack_pointer_rtx;
13602 m->fs.cfa_offset = m->fs.sp_offset;
13604 add_reg_note (insn, REG_CFA_DEF_CFA,
13605 plus_constant (Pmode, stack_pointer_rtx,
13606 m->fs.sp_offset));
13607 RTX_FRAME_RELATED_P (insn) = 1;
13609 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
13610 m->fs.fp_offset);
13613 /* Emit code to restore saved registers using MOV insns.
13614 The first register is restored from CFA - CFA_OFFSET. */
13615 static void
13616 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
13617 bool maybe_eh_return)
13619 struct machine_function *m = cfun->machine;
13620 unsigned int regno;
13622 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13623 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13625 rtx reg = gen_rtx_REG (word_mode, regno);
13626 rtx mem;
13627 rtx_insn *insn;
13629 mem = choose_baseaddr (cfa_offset, NULL);
13630 mem = gen_frame_mem (word_mode, mem);
13631 insn = emit_move_insn (reg, mem);
13633 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
13635 /* Previously we'd represented the CFA as an expression
13636 like *(%ebp - 8). We've just loaded that value from
13637 the stack, which means we need to reset the CFA to
13638 the drap register. This will remain until we restore
13639 the stack pointer. */
13640 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13641 RTX_FRAME_RELATED_P (insn) = 1;
13643 /* This means that the DRAP register is valid for addressing. */
13644 m->fs.drap_valid = true;
13646 else
13647 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13649 cfa_offset -= UNITS_PER_WORD;
13653 /* Emit code to restore saved SSE registers using MOV insns.
13654 The first register is restored from CFA - CFA_OFFSET. */
13655 static void
13656 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
13657 bool maybe_eh_return)
13659 unsigned int regno;
13661 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13662 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13664 rtx reg = gen_rtx_REG (V4SFmode, regno);
13665 rtx mem;
13666 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
13668 mem = choose_baseaddr (cfa_offset, &align);
13669 mem = gen_rtx_MEM (V4SFmode, mem);
13671 /* The location alignment depends upon the base register. */
13672 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
13673 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13674 set_mem_align (mem, align);
13675 emit_insn (gen_rtx_SET (reg, mem));
13677 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13679 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13683 static void
13684 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
13685 bool use_call, int style)
13687 struct machine_function *m = cfun->machine;
13688 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13689 + m->call_ms2sysv_extra_regs;
13690 rtvec v;
13691 unsigned int elems_needed, align, i, vi = 0;
13692 rtx_insn *insn;
13693 rtx sym, tmp;
13694 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
13695 rtx r10 = NULL_RTX;
13696 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13697 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
13698 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
13699 rtx rsi_frame_load = NULL_RTX;
13700 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
13701 enum xlogue_stub stub;
13703 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
13705 /* If using a realigned stack, we should never start with padding. */
13706 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
13708 /* Setup RSI as the stub's base pointer. */
13709 align = GET_MODE_ALIGNMENT (V4SFmode);
13710 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
13711 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13713 emit_insn (gen_rtx_SET (rsi, tmp));
13715 /* Get a symbol for the stub. */
13716 if (frame_pointer_needed)
13717 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
13718 : XLOGUE_STUB_RESTORE_HFP_TAIL;
13719 else
13720 stub = use_call ? XLOGUE_STUB_RESTORE
13721 : XLOGUE_STUB_RESTORE_TAIL;
13722 sym = xlogue.get_stub_rtx (stub);
13724 elems_needed = ncregs;
13725 if (use_call)
13726 elems_needed += 1;
13727 else
13728 elems_needed += frame_pointer_needed ? 5 : 3;
13729 v = rtvec_alloc (elems_needed);
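/* The element counts used above match what is filled in below: a call
   to the stub needs only the USE of the stub symbol (1 extra element),
   while a tail call needs the return rtx, the USE, and either the three
   elements that restore %rsp and %rbp through the frame pointer (5 in
   total) or the single SET of %rsp from %r10 (3 in total).  */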
13731 /* We call the epilogue stub when we need to pop incoming args or when we
13732 are doing a sibling call as the tail call. Otherwise, we emit a jmp to
13733 the epilogue stub, and the jmp to the stub is the tail call. */
13734 if (use_call)
13735 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13736 else
13738 RTVEC_ELT (v, vi++) = ret_rtx;
13739 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13740 if (frame_pointer_needed)
13742 rtx rbp = gen_rtx_REG (DImode, BP_REG);
13743 gcc_assert (m->fs.fp_valid);
13744 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
13746 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
13747 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
13748 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
13749 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
13750 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
13752 else
13754 /* If no hard frame pointer, we set R10 to the SP restore value. */
13755 gcc_assert (!m->fs.fp_valid);
13756 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
13757 gcc_assert (m->fs.sp_valid);
13759 r10 = gen_rtx_REG (DImode, R10_REG);
13760 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
13761 emit_insn (gen_rtx_SET (r10, tmp));
13763 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
13767 /* Generate frame load insns and restore notes. */
13768 for (i = 0; i < ncregs; ++i)
13770 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13771 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
13772 rtx reg, frame_load;
13774 reg = gen_rtx_REG (mode, r.regno);
13775 frame_load = gen_frame_load (reg, rsi, r.offset);
13777 /* Save RSI frame load insn & note to add last. */
13778 if (r.regno == SI_REG)
13780 gcc_assert (!rsi_frame_load);
13781 rsi_frame_load = frame_load;
13782 rsi_restore_offset = r.offset;
13784 else
13786 RTVEC_ELT (v, vi++) = frame_load;
13787 ix86_add_cfa_restore_note (NULL, reg, r.offset);
13791 /* Add RSI frame load & restore note at the end. */
13792 gcc_assert (rsi_frame_load);
13793 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
13794 RTVEC_ELT (v, vi++) = rsi_frame_load;
13795 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
13796 rsi_restore_offset);
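/* The stub addresses the save area through RSI, so RSI itself must be
   (and, in the stub, is) reloaded last; keeping its frame-load element
   and restore note last here mirrors that order.  */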
13798 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
13799 if (!use_call && !frame_pointer_needed)
13801 gcc_assert (m->fs.sp_valid);
13802 gcc_assert (!m->fs.sp_realigned);
13804 /* At this point, R10 should point to frame.stack_realign_offset. */
13805 if (m->fs.cfa_reg == stack_pointer_rtx)
13806 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
13807 m->fs.sp_offset = frame.stack_realign_offset;
13810 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
13811 tmp = gen_rtx_PARALLEL (VOIDmode, v);
13812 if (use_call)
13813 insn = emit_insn (tmp);
13814 else
13816 insn = emit_jump_insn (tmp);
13817 JUMP_LABEL (insn) = ret_rtx;
13819 if (frame_pointer_needed)
13820 ix86_emit_leave (insn);
13821 else
13823 /* Need CFA adjust note. */
13824 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
13825 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
13829 RTX_FRAME_RELATED_P (insn) = true;
13830 ix86_add_queued_cfa_restore_notes (insn);
13832 /* If we're not doing a tail-call, we need to adjust the stack. */
13833 if (use_call && m->fs.sp_valid)
13835 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
13836 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13837 GEN_INT (dealloc), style,
13838 m->fs.cfa_reg == stack_pointer_rtx);
13842 /* Restore function stack, frame, and registers. */
13844 void
13845 ix86_expand_epilogue (int style)
13847 struct machine_function *m = cfun->machine;
13848 struct machine_frame_state frame_state_save = m->fs;
13849 struct ix86_frame frame;
13850 bool restore_regs_via_mov;
13851 bool using_drap;
13852 bool restore_stub_is_tail = false;
13854 if (ix86_function_naked (current_function_decl))
13856 /* The program should not reach this point. */
13857 emit_insn (gen_ud2 ());
13858 return;
13861 ix86_finalize_stack_frame_flags ();
13862 frame = m->frame;
13864 m->fs.sp_realigned = stack_realign_fp;
13865 m->fs.sp_valid = stack_realign_fp
13866 || !frame_pointer_needed
13867 || crtl->sp_is_unchanging;
13868 gcc_assert (!m->fs.sp_valid
13869 || m->fs.sp_offset == frame.stack_pointer_offset);
13871 /* The FP must be valid if the frame pointer is present. */
13872 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
13873 gcc_assert (!m->fs.fp_valid
13874 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
13876 /* We must have *some* valid pointer to the stack frame. */
13877 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
13879 /* The DRAP is never valid at this point. */
13880 gcc_assert (!m->fs.drap_valid);
13882 /* See the comment about red zone and frame
13883 pointer usage in ix86_expand_prologue. */
13884 if (frame_pointer_needed && frame.red_zone_size)
13885 emit_insn (gen_memory_blockage ());
13887 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
13888 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
13890 /* Determine the CFA offset of the end of the red-zone. */
13891 m->fs.red_zone_offset = 0;
13892 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
13894 /* The red zone begins below the return address and, in an exception
13895 handler, below the error code as well. */
13896 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
13898 /* When the register save area is in the aligned portion of
13899 the stack, determine the maximum runtime displacement that
13900 matches up with the aligned frame. */
13901 if (stack_realign_drap)
13902 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
13903 + UNITS_PER_WORD);
13906 /* Special care must be taken for the normal return case of a function
13907 using eh_return: the eax and edx registers are marked as saved, but
13908 not restored along this path. Adjust the save location to match. */
13909 if (crtl->calls_eh_return && style != 2)
13910 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
13912 /* EH_RETURN requires the use of moves to function properly. */
13913 if (crtl->calls_eh_return)
13914 restore_regs_via_mov = true;
13915 /* SEH requires the use of pops to identify the epilogue. */
13916 else if (TARGET_SEH)
13917 restore_regs_via_mov = false;
13918 /* If we're only restoring one register and sp cannot be used then
13919 use a move instruction to restore the register, since it's
13920 less work than reloading sp and popping the register. */
13921 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
13922 restore_regs_via_mov = true;
13923 else if (TARGET_EPILOGUE_USING_MOVE
13924 && cfun->machine->use_fast_prologue_epilogue
13925 && (frame.nregs > 1
13926 || m->fs.sp_offset != frame.reg_save_offset))
13927 restore_regs_via_mov = true;
13928 else if (frame_pointer_needed
13929 && !frame.nregs
13930 && m->fs.sp_offset != frame.reg_save_offset)
13931 restore_regs_via_mov = true;
13932 else if (frame_pointer_needed
13933 && TARGET_USE_LEAVE
13934 && cfun->machine->use_fast_prologue_epilogue
13935 && frame.nregs == 1)
13936 restore_regs_via_mov = true;
13937 else
13938 restore_regs_via_mov = false;
13940 if (restore_regs_via_mov || frame.nsseregs)
13942 /* Ensure that the entire register save area is addressable via
13943 the stack pointer, if we will restore SSE regs via sp. */
13944 if (TARGET_64BIT
13945 && m->fs.sp_offset > 0x7fffffff
13946 && sp_valid_at (frame.stack_realign_offset + 1)
13947 && (frame.nsseregs + frame.nregs) != 0)
13949 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13950 GEN_INT (m->fs.sp_offset
13951 - frame.sse_reg_save_offset),
13952 style,
13953 m->fs.cfa_reg == stack_pointer_rtx);
13957 /* If there are any SSE registers to restore, then we have to do it
13958 via moves, since there's obviously no pop for SSE regs. */
13959 if (frame.nsseregs)
13960 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
13961 style == 2);
13963 if (m->call_ms2sysv)
13965 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
13967 /* We cannot use a tail-call for the stub if:
13968 1. We have to pop incoming args,
13969 2. We have additional int regs to restore, or
13970 3. A sibling call will be the tail-call, or
13971 4. We are emitting an eh_return_internal epilogue.
13973 TODO: Item 4 has not yet been tested!
13975 If any of the above are true, we will call the stub rather than
13976 jump to it. */
13977 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
13978 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
13981 /* If using an out-of-line stub that is a tail call, then... */
13982 if (m->call_ms2sysv && restore_stub_is_tail)
13984 /* TODO: paranoid tests. (remove eventually) */
13985 gcc_assert (m->fs.sp_valid);
13986 gcc_assert (!m->fs.sp_realigned);
13987 gcc_assert (!m->fs.fp_valid);
13988 gcc_assert (!m->fs.realigned);
13989 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
13990 gcc_assert (!crtl->drap_reg);
13991 gcc_assert (!frame.nregs);
13993 else if (restore_regs_via_mov)
13995 rtx t;
13997 if (frame.nregs)
13998 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
14000 /* eh_return epilogues need %ecx added to the stack pointer. */
14001 if (style == 2)
14003 rtx sa = EH_RETURN_STACKADJ_RTX;
14004 rtx_insn *insn;
14006 /* %ecx can't be used for both DRAP register and eh_return. */
14007 if (crtl->drap_reg)
14008 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14010 /* regparm nested functions don't work with eh_return. */
14011 gcc_assert (!ix86_static_chain_on_stack);
14013 if (frame_pointer_needed)
14015 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14016 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14017 emit_insn (gen_rtx_SET (sa, t));
14019 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14020 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14022 /* Note that we use SA as a temporary CFA, as the return
14023 address is at the proper place relative to it. We
14024 pretend this happens at the FP restore insn because
14025 prior to this insn the FP would be stored at the wrong
14026 offset relative to SA, and after this insn we have no
14027 other reasonable register to use for the CFA. We don't
14028 bother resetting the CFA to the SP for the duration of
14029 the return insn, unless the control flow instrumentation
14030 is done. In this case the SP is used later and we have
14031 to reset CFA to SP. */
14032 add_reg_note (insn, REG_CFA_DEF_CFA,
14033 plus_constant (Pmode, sa, UNITS_PER_WORD));
14034 ix86_add_queued_cfa_restore_notes (insn);
14035 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14036 RTX_FRAME_RELATED_P (insn) = 1;
14038 m->fs.cfa_reg = sa;
14039 m->fs.cfa_offset = UNITS_PER_WORD;
14040 m->fs.fp_valid = false;
14042 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14043 const0_rtx, style,
14044 flag_cf_protection);
14046 else
14048 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14049 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14050 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14051 ix86_add_queued_cfa_restore_notes (insn);
14053 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14054 if (m->fs.cfa_offset != UNITS_PER_WORD)
14056 m->fs.cfa_offset = UNITS_PER_WORD;
14057 add_reg_note (insn, REG_CFA_DEF_CFA,
14058 plus_constant (Pmode, stack_pointer_rtx,
14059 UNITS_PER_WORD));
14060 RTX_FRAME_RELATED_P (insn) = 1;
14063 m->fs.sp_offset = UNITS_PER_WORD;
14064 m->fs.sp_valid = true;
14065 m->fs.sp_realigned = false;
14068 else
14070 /* SEH requires that the function end with (1) a stack adjustment
14071 if necessary, (2) a sequence of pops, and (3) a return or
14072 jump instruction. Prevent insns from the function body from
14073 being scheduled into this sequence. */
14074 if (TARGET_SEH)
14076 /* Prevent a catch region from being adjacent to the standard
14077 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda
14078 nor several other flags that would be interesting to test
14079 are set up yet. */
14080 if (flag_non_call_exceptions)
14081 emit_insn (gen_nops (const1_rtx));
14082 else
14083 emit_insn (gen_blockage ());
14086 /* First step is to deallocate the stack frame so that we can
14087 pop the registers. If the stack pointer was realigned, it needs
14088 to be restored now. Also do it on SEH target for very large
14089 frame as the emitted instructions aren't allowed by the ABI
14090 in epilogues. */
14091 if (!m->fs.sp_valid || m->fs.sp_realigned
14092 || (TARGET_SEH
14093 && (m->fs.sp_offset - frame.reg_save_offset
14094 >= SEH_MAX_FRAME_SIZE)))
14096 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14097 GEN_INT (m->fs.fp_offset
14098 - frame.reg_save_offset),
14099 style, false);
14101 else if (m->fs.sp_offset != frame.reg_save_offset)
14103 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14104 GEN_INT (m->fs.sp_offset
14105 - frame.reg_save_offset),
14106 style,
14107 m->fs.cfa_reg == stack_pointer_rtx);
14110 ix86_emit_restore_regs_using_pop ();
14113 /* If we used a frame pointer and haven't already got rid of it,
14114 then do so now. */
14115 if (m->fs.fp_valid)
14117 /* If the stack pointer is valid and pointing at the frame
14118 pointer store address, then we only need a pop. */
14119 if (sp_valid_at (frame.hfp_save_offset)
14120 && m->fs.sp_offset == frame.hfp_save_offset)
14121 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14122 /* The leave insn results in shorter dependency chains on CPUs that are
14123 able to grok it fast. */
14124 else if (TARGET_USE_LEAVE
14125 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14126 || !cfun->machine->use_fast_prologue_epilogue)
14127 ix86_emit_leave (NULL);
14128 else
14130 pro_epilogue_adjust_stack (stack_pointer_rtx,
14131 hard_frame_pointer_rtx,
14132 const0_rtx, style, !using_drap);
14133 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14137 if (using_drap)
14139 int param_ptr_offset = UNITS_PER_WORD;
14140 rtx_insn *insn;
14142 gcc_assert (stack_realign_drap);
14144 if (ix86_static_chain_on_stack)
14145 param_ptr_offset += UNITS_PER_WORD;
14146 if (!call_used_regs[REGNO (crtl->drap_reg)])
14147 param_ptr_offset += UNITS_PER_WORD;
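/* param_ptr_offset is now one, two or three words: one for the return
   address, one more if the static chain was passed on the stack, and one
   more if the DRAP register is call-saved and was therefore pushed by
   the prologue.  */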
14149 insn = emit_insn (gen_rtx_SET
14150 (stack_pointer_rtx,
14151 gen_rtx_PLUS (Pmode,
14152 crtl->drap_reg,
14153 GEN_INT (-param_ptr_offset))));
14154 m->fs.cfa_reg = stack_pointer_rtx;
14155 m->fs.cfa_offset = param_ptr_offset;
14156 m->fs.sp_offset = param_ptr_offset;
14157 m->fs.realigned = false;
14159 add_reg_note (insn, REG_CFA_DEF_CFA,
14160 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14161 GEN_INT (param_ptr_offset)));
14162 RTX_FRAME_RELATED_P (insn) = 1;
14164 if (!call_used_regs[REGNO (crtl->drap_reg)])
14165 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14168 /* At this point the stack pointer must be valid, and we must have
14169 restored all of the registers. We may not have deallocated the
14170 entire stack frame. We've delayed this until now because it may
14171 be possible to merge the local stack deallocation with the
14172 deallocation forced by ix86_static_chain_on_stack. */
14173 gcc_assert (m->fs.sp_valid);
14174 gcc_assert (!m->fs.sp_realigned);
14175 gcc_assert (!m->fs.fp_valid);
14176 gcc_assert (!m->fs.realigned);
14177 if (m->fs.sp_offset != UNITS_PER_WORD)
14179 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14180 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14181 style, true);
14183 else
14184 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14186 /* Sibcall epilogues don't want a return instruction. */
14187 if (style == 0)
14189 m->fs = frame_state_save;
14190 return;
14193 if (cfun->machine->func_type != TYPE_NORMAL)
14194 emit_jump_insn (gen_interrupt_return ());
14195 else if (crtl->args.pops_args && crtl->args.size)
14197 rtx popc = GEN_INT (crtl->args.pops_args);
14199 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
14200 address, do an explicit add, and jump indirectly to the caller. */
14202 if (crtl->args.pops_args >= 65536)
14204 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14205 rtx_insn *insn;
14207 /* There is no "pascal" calling convention in any 64bit ABI. */
14208 gcc_assert (!TARGET_64BIT);
14210 insn = emit_insn (gen_pop (ecx));
14211 m->fs.cfa_offset -= UNITS_PER_WORD;
14212 m->fs.sp_offset -= UNITS_PER_WORD;
14214 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14215 x = gen_rtx_SET (stack_pointer_rtx, x);
14216 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14217 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14218 RTX_FRAME_RELATED_P (insn) = 1;
14220 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14221 popc, -1, true);
14222 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14224 else
14225 emit_jump_insn (gen_simple_return_pop_internal (popc));
14227 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14229 /* In the case of a return from EH a simple return cannot be used,
14230 as the return address will be compared with a shadow stack
14231 return address. Use an indirect jump instead. */
14232 if (style == 2 && flag_cf_protection)
14234 /* Register used in indirect jump must be in word_mode. But
14235 Pmode may not be the same as word_mode for x32. */
14236 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14237 rtx_insn *insn;
14239 insn = emit_insn (gen_pop (ecx));
14240 m->fs.cfa_offset -= UNITS_PER_WORD;
14241 m->fs.sp_offset -= UNITS_PER_WORD;
14243 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14244 x = gen_rtx_SET (stack_pointer_rtx, x);
14245 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14246 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14247 RTX_FRAME_RELATED_P (insn) = 1;
14249 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14251 else
14252 emit_jump_insn (gen_simple_return_internal ());
14255 /* Restore the state back to the state from the prologue,
14256 so that it's correct for the next epilogue. */
14257 m->fs = frame_state_save;
14260 /* Reset from the function's potential modifications. */
14262 static void
14263 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14265 if (pic_offset_table_rtx
14266 && !ix86_use_pseudo_pic_reg ())
14267 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14269 if (TARGET_MACHO)
14271 rtx_insn *insn = get_last_insn ();
14272 rtx_insn *deleted_debug_label = NULL;
14274 /* Mach-O doesn't support labels at the end of objects, so if
14275 it looks like we might want one, take special action.
14276 First, collect any sequence of deleted debug labels. */
14277 while (insn
14278 && NOTE_P (insn)
14279 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14281 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14282 notes; instead set their CODE_LABEL_NUMBER to -1.
14283 Otherwise there would be code generation differences
14284 between -g and -g0. */
14285 if (NOTE_P (insn) && NOTE_KIND (insn)
14286 == NOTE_INSN_DELETED_DEBUG_LABEL)
14287 deleted_debug_label = insn;
14288 insn = PREV_INSN (insn);
14291 /* If we have:
14292 label:
14293 barrier
14294 then this needs to be detected, so skip past the barrier. */
14296 if (insn && BARRIER_P (insn))
14297 insn = PREV_INSN (insn);
14299 /* Up to now we've only seen notes or barriers. */
14300 if (insn)
14302 if (LABEL_P (insn)
14303 || (NOTE_P (insn)
14304 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14305 /* Trailing label. */
14306 fputs ("\tnop\n", file);
14307 else if (cfun && ! cfun->is_thunk)
14309 /* See if we have a completely empty function body, skipping
14310 the special case of the picbase thunk emitted as asm. */
14311 while (insn && ! INSN_P (insn))
14312 insn = PREV_INSN (insn);
14313 /* If we don't find any insns, we've got an empty function body;
14314 i.e. completely empty - without a return or branch. This is
14315 taken as the case where a function body has been removed
14316 because it contains an inline __builtin_unreachable(). GCC
14317 declares that reaching __builtin_unreachable() means UB so
14318 we're not obliged to do anything special; however, we want
14319 non-zero-sized function bodies. To meet this, and help the
14320 user out, let's trap the case. */
14321 if (insn == NULL)
14322 fputs ("\tud2\n", file);
14325 else if (deleted_debug_label)
14326 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14327 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14328 CODE_LABEL_NUMBER (insn) = -1;
14332 /* Return a scratch register to use in the split stack prologue. The
14333 split stack prologue is used for -fsplit-stack. It consists of the
14334 first instructions in the function, even before the regular prologue.
14335 The scratch register can be any caller-saved register which is not
14336 used for parameters or for the static chain. */
14338 static unsigned int
14339 split_stack_prologue_scratch_regno (void)
14341 if (TARGET_64BIT)
14342 return R11_REG;
14343 else
14345 bool is_fastcall, is_thiscall;
14346 int regparm;
14348 is_fastcall = (lookup_attribute ("fastcall",
14349 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14350 != NULL);
14351 is_thiscall = (lookup_attribute ("thiscall",
14352 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14353 != NULL);
14354 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14356 if (is_fastcall)
14358 if (DECL_STATIC_CHAIN (cfun->decl))
14360 sorry ("-fsplit-stack does not support fastcall with "
14361 "nested function");
14362 return INVALID_REGNUM;
14364 return AX_REG;
14366 else if (is_thiscall)
14368 if (!DECL_STATIC_CHAIN (cfun->decl))
14369 return DX_REG;
14370 return AX_REG;
14372 else if (regparm < 3)
14374 if (!DECL_STATIC_CHAIN (cfun->decl))
14375 return CX_REG;
14376 else
14378 if (regparm >= 2)
14380 sorry ("-fsplit-stack does not support 2 register "
14381 "parameters for a nested function");
14382 return INVALID_REGNUM;
14384 return DX_REG;
14387 else
14389 /* FIXME: We could make this work by pushing a register
14390 around the addition and comparison. */
14391 sorry ("-fsplit-stack does not support 3 register parameters");
14392 return INVALID_REGNUM;
14397 /* A SYMBOL_REF for the function which allocates new stack space for
14398 -fsplit-stack. */
14400 static GTY(()) rtx split_stack_fn;
14402 /* A SYMBOL_REF for the __morestack function when using the large
14403 model. */
14405 static GTY(()) rtx split_stack_fn_large;
14407 /* Return location of the stack guard value in the TLS block. */
14409 static rtx
14410 ix86_split_stack_guard (void)
14412 int offset;
14413 addr_space_t as = DEFAULT_TLS_SEG_REG;
14414 rtx r;
14416 gcc_assert (flag_split_stack);
14418 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14419 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14420 #else
14421 gcc_unreachable ();
14422 #endif
14424 r = GEN_INT (offset);
14425 r = gen_const_mem (Pmode, r);
14426 set_mem_addr_space (r, as);
14428 return r;
14431 /* Handle -fsplit-stack. These are the first instructions in the
14432 function, even before the regular prologue. */
14434 void
14435 ix86_expand_split_stack_prologue (void)
14437 HOST_WIDE_INT allocate;
14438 unsigned HOST_WIDE_INT args_size;
14439 rtx_code_label *label;
14440 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14441 rtx scratch_reg = NULL_RTX;
14442 rtx_code_label *varargs_label = NULL;
14443 rtx fn;
14445 gcc_assert (flag_split_stack && reload_completed);
14447 ix86_finalize_stack_frame_flags ();
14448 struct ix86_frame &frame = cfun->machine->frame;
14449 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14451 /* This is the label we will branch to if we have enough stack
14452 space. We expect the basic block reordering pass to reverse this
14453 branch if optimizing, so that we branch in the unlikely case. */
14454 label = gen_label_rtx ();
14456 /* We need to compare the stack pointer minus the frame size with
14457 the stack boundary in the TCB. The stack boundary always gives
14458 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14459 can compare directly. Otherwise we need to do an addition. */
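/* Illustrative shape of the emitted check (register choice and syntax are
   examples only):
       lea  -FRAMESIZE(%rsp), %r11   # only when FRAMESIZE is large
       cmp  %fs:GUARD_OFFSET, %r11   # stack boundary from the TCB
       jae  .Lenough_stack           # expected to be taken
   When the frame fits in SPLIT_STACK_AVAILABLE the lea is skipped and
   %rsp is compared directly.  */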
14461 limit = ix86_split_stack_guard ();
14463 if (allocate < SPLIT_STACK_AVAILABLE)
14464 current = stack_pointer_rtx;
14465 else
14467 unsigned int scratch_regno;
14468 rtx offset;
14470 /* We need a scratch register to hold the stack pointer minus
14471 the required frame size. Since this is the very start of the
14472 function, the scratch register can be any caller-saved
14473 register which is not used for parameters. */
14474 offset = GEN_INT (- allocate);
14475 scratch_regno = split_stack_prologue_scratch_regno ();
14476 if (scratch_regno == INVALID_REGNUM)
14477 return;
14478 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14479 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14481 /* We don't use ix86_gen_add3 in this case because it will
14482 want to split to lea, but when not optimizing the insn
14483 will not be split after this point. */
14484 emit_insn (gen_rtx_SET (scratch_reg,
14485 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14486 offset)));
14488 else
14490 emit_move_insn (scratch_reg, offset);
14491 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14492 stack_pointer_rtx));
14494 current = scratch_reg;
14497 ix86_expand_branch (GEU, current, limit, label);
14498 rtx_insn *jump_insn = get_last_insn ();
14499 JUMP_LABEL (jump_insn) = label;
14501 /* Mark the jump as very likely to be taken. */
14502 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
14504 if (split_stack_fn == NULL_RTX)
14506 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14507 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14509 fn = split_stack_fn;
14511 /* Get more stack space. We pass in the desired stack space and the
14512 size of the arguments to copy to the new stack. In 32-bit mode
14513 we push the parameters; __morestack will return on a new stack
14514 anyhow. In 64-bit mode we pass the parameters in r10 and
14515 r11. */
14516 allocate_rtx = GEN_INT (allocate);
14517 args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
14518 call_fusage = NULL_RTX;
14519 rtx pop = NULL_RTX;
14520 if (TARGET_64BIT)
14522 rtx reg10, reg11;
14524 reg10 = gen_rtx_REG (Pmode, R10_REG);
14525 reg11 = gen_rtx_REG (Pmode, R11_REG);
14527 /* If this function uses a static chain, it will be in %r10.
14528 Preserve it across the call to __morestack. */
14529 if (DECL_STATIC_CHAIN (cfun->decl))
14531 rtx rax;
14533 rax = gen_rtx_REG (word_mode, AX_REG);
14534 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14535 use_reg (&call_fusage, rax);
14538 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14539 && !TARGET_PECOFF)
14541 HOST_WIDE_INT argval;
14543 gcc_assert (Pmode == DImode);
14544 /* When using the large model we need to load the address
14545 into a register, and we've run out of registers. So we
14546 switch to a different calling convention, and we call a
14547 different function: __morestack_large_model. We pass the
14548 argument size in the upper 32 bits of r10 and pass the
14549 frame size in the lower 32 bits. */
14550 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14551 gcc_assert ((args_size & 0xffffffff) == args_size);
14553 if (split_stack_fn_large == NULL_RTX)
14555 split_stack_fn_large =
14556 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14557 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14559 if (ix86_cmodel == CM_LARGE_PIC)
14561 rtx_code_label *label;
14562 rtx x;
14564 label = gen_label_rtx ();
14565 emit_label (label);
14566 LABEL_PRESERVE_P (label) = 1;
14567 emit_insn (gen_set_rip_rex64 (reg10, label));
14568 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14569 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14570 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14571 UNSPEC_GOT);
14572 x = gen_rtx_CONST (Pmode, x);
14573 emit_move_insn (reg11, x);
14574 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14575 x = gen_const_mem (Pmode, x);
14576 emit_move_insn (reg11, x);
14578 else
14579 emit_move_insn (reg11, split_stack_fn_large);
14581 fn = reg11;
14583 argval = ((args_size << 16) << 16) + allocate;
14584 emit_move_insn (reg10, GEN_INT (argval));
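/* For example (illustrative values): with args_size = 0x20 and
   allocate = 0x1000, argval is 0x0000002000001000, so the stub finds the
   argument size in the upper 32 bits of %r10 and the frame size in the
   lower 32 bits.  */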
14586 else
14588 emit_move_insn (reg10, allocate_rtx);
14589 emit_move_insn (reg11, GEN_INT (args_size));
14590 use_reg (&call_fusage, reg11);
14593 use_reg (&call_fusage, reg10);
14595 else
14597 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
14598 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
14599 insn = emit_insn (gen_push (allocate_rtx));
14600 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
14601 pop = GEN_INT (2 * UNITS_PER_WORD);
14603 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14604 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14605 pop, false);
14606 add_function_usage_to (call_insn, call_fusage);
14607 if (!TARGET_64BIT)
14608 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
14609 /* Indicate that this function can't jump to non-local gotos. */
14610 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
14612 /* In order to make call/return prediction work right, we now need
14613 to execute a return instruction. See
14614 libgcc/config/i386/morestack.S for the details on how this works.
14616 For flow purposes gcc must not see this as a return
14617 instruction--we need control flow to continue at the subsequent
14618 label. Therefore, we use an unspec. */
14619 gcc_assert (crtl->args.pops_args < 65536);
14620 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14622 /* If we are in 64-bit mode and this function uses a static chain,
14623 we saved %r10 in %rax before calling __morestack. */
14624 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14625 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14626 gen_rtx_REG (word_mode, AX_REG));
14628 /* If this function calls va_start, we need to store a pointer to
14629 the arguments on the old stack, because they may not all have been
14630 copied to the new stack. At this point the old stack can be
14631 found at the frame pointer value used by __morestack, because
14632 __morestack has set that up before calling back to us. Here we
14633 store that pointer in a scratch register, and in
14634 ix86_expand_prologue we store the scratch register in a stack
14635 slot. */
14636 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14638 unsigned int scratch_regno;
14639 rtx frame_reg;
14640 int words;
14642 scratch_regno = split_stack_prologue_scratch_regno ();
14643 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14644 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14646 /* 64-bit:
14647 fp -> old fp value
14648 return address within this function
14649 return address of caller of this function
14650 stack arguments
14651 So we add three words to get to the stack arguments.
14653 32-bit:
14654 fp -> old fp value
14655 return address within this function
14656 first argument to __morestack
14657 second argument to __morestack
14658 return address of caller of this function
14659 stack arguments
14660 So we add five words to get to the stack arguments. */
14662 words = TARGET_64BIT ? 3 : 5;
14663 emit_insn (gen_rtx_SET (scratch_reg,
14664 gen_rtx_PLUS (Pmode, frame_reg,
14665 GEN_INT (words * UNITS_PER_WORD))));
14667 varargs_label = gen_label_rtx ();
14668 emit_jump_insn (gen_jump (varargs_label));
14669 JUMP_LABEL (get_last_insn ()) = varargs_label;
14671 emit_barrier ();
14674 emit_label (label);
14675 LABEL_NUSES (label) = 1;
14677 /* If this function calls va_start, we now have to set the scratch
14678 register for the case where we do not call __morestack. In this
14679 case we need to set it based on the stack pointer. */
14680 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14682 emit_insn (gen_rtx_SET (scratch_reg,
14683 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14684 GEN_INT (UNITS_PER_WORD))));
14686 emit_label (varargs_label);
14687 LABEL_NUSES (varargs_label) = 1;
14691 /* We may have to tell the dataflow pass that the split stack prologue
14692 is initializing a scratch register. */
14694 static void
14695 ix86_live_on_entry (bitmap regs)
14697 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14699 gcc_assert (flag_split_stack);
14700 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14704 /* Extract the parts of an RTL expression that is a valid memory address
14705 for an instruction. Return 0 if the structure of the address is
14706 grossly off. Return -1 if the address contains ASHIFT, so it is not
14707 strictly valid, but is still used for computing the length of the lea
instruction. */
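/* For example, an address of the form BASE + INDEX * 4 + 8 typically
   arrives here as
       (plus (plus (mult (reg INDEX) (const_int 4)) (reg BASE))
             (const_int 8))
   and is decomposed into out->base = BASE, out->index = INDEX,
   out->scale = 4 and out->disp = (const_int 8); the names BASE and INDEX
   are placeholders.  */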
14709 int
14710 ix86_decompose_address (rtx addr, struct ix86_address *out)
14712 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14713 rtx base_reg, index_reg;
14714 HOST_WIDE_INT scale = 1;
14715 rtx scale_rtx = NULL_RTX;
14716 rtx tmp;
14717 int retval = 1;
14718 addr_space_t seg = ADDR_SPACE_GENERIC;
14720 /* Allow zero-extended SImode addresses;
14721 they will be emitted with the addr32 prefix. */
14722 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14724 if (GET_CODE (addr) == ZERO_EXTEND
14725 && GET_MODE (XEXP (addr, 0)) == SImode)
14727 addr = XEXP (addr, 0);
14728 if (CONST_INT_P (addr))
14729 return 0;
14731 else if (GET_CODE (addr) == AND
14732 && const_32bit_mask (XEXP (addr, 1), DImode))
14734 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14735 if (addr == NULL_RTX)
14736 return 0;
14738 if (CONST_INT_P (addr))
14739 return 0;
14743 /* Allow SImode subregs of DImode addresses;
14744 they will be emitted with the addr32 prefix. */
14745 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14747 if (SUBREG_P (addr)
14748 && GET_MODE (SUBREG_REG (addr)) == DImode)
14750 addr = SUBREG_REG (addr);
14751 if (CONST_INT_P (addr))
14752 return 0;
14756 if (REG_P (addr))
14757 base = addr;
14758 else if (SUBREG_P (addr))
14760 if (REG_P (SUBREG_REG (addr)))
14761 base = addr;
14762 else
14763 return 0;
14765 else if (GET_CODE (addr) == PLUS)
14767 rtx addends[4], op;
14768 int n = 0, i;
14770 op = addr;
14773 if (n >= 4)
14774 return 0;
14775 addends[n++] = XEXP (op, 1);
14776 op = XEXP (op, 0);
14778 while (GET_CODE (op) == PLUS);
14779 if (n >= 4)
14780 return 0;
14781 addends[n] = op;
14783 for (i = n; i >= 0; --i)
14785 op = addends[i];
14786 switch (GET_CODE (op))
14788 case MULT:
14789 if (index)
14790 return 0;
14791 index = XEXP (op, 0);
14792 scale_rtx = XEXP (op, 1);
14793 break;
14795 case ASHIFT:
14796 if (index)
14797 return 0;
14798 index = XEXP (op, 0);
14799 tmp = XEXP (op, 1);
14800 if (!CONST_INT_P (tmp))
14801 return 0;
14802 scale = INTVAL (tmp);
14803 if ((unsigned HOST_WIDE_INT) scale > 3)
14804 return 0;
14805 scale = 1 << scale;
14806 break;
14808 case ZERO_EXTEND:
14809 op = XEXP (op, 0);
14810 if (GET_CODE (op) != UNSPEC)
14811 return 0;
14812 /* FALLTHRU */
14814 case UNSPEC:
14815 if (XINT (op, 1) == UNSPEC_TP
14816 && TARGET_TLS_DIRECT_SEG_REFS
14817 && seg == ADDR_SPACE_GENERIC)
14818 seg = DEFAULT_TLS_SEG_REG;
14819 else
14820 return 0;
14821 break;
14823 case SUBREG:
14824 if (!REG_P (SUBREG_REG (op)))
14825 return 0;
14826 /* FALLTHRU */
14828 case REG:
14829 if (!base)
14830 base = op;
14831 else if (!index)
14832 index = op;
14833 else
14834 return 0;
14835 break;
14837 case CONST:
14838 case CONST_INT:
14839 case SYMBOL_REF:
14840 case LABEL_REF:
14841 if (disp)
14842 return 0;
14843 disp = op;
14844 break;
14846 default:
14847 return 0;
14851 else if (GET_CODE (addr) == MULT)
14853 index = XEXP (addr, 0); /* index*scale */
14854 scale_rtx = XEXP (addr, 1);
14856 else if (GET_CODE (addr) == ASHIFT)
14858 /* We're called for lea too, which implements ashift on occasion. */
14859 index = XEXP (addr, 0);
14860 tmp = XEXP (addr, 1);
14861 if (!CONST_INT_P (tmp))
14862 return 0;
14863 scale = INTVAL (tmp);
14864 if ((unsigned HOST_WIDE_INT) scale > 3)
14865 return 0;
14866 scale = 1 << scale;
14867 retval = -1;
14869 else
14870 disp = addr; /* displacement */
14872 if (index)
14874 if (REG_P (index))
14876 else if (SUBREG_P (index)
14877 && REG_P (SUBREG_REG (index)))
14879 else
14880 return 0;
14883 /* Extract the integral value of scale. */
14884 if (scale_rtx)
14886 if (!CONST_INT_P (scale_rtx))
14887 return 0;
14888 scale = INTVAL (scale_rtx);
14891 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
14892 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
14894 /* Avoid useless 0 displacement. */
14895 if (disp == const0_rtx && (base || index))
14896 disp = NULL_RTX;
14898 /* Allow arg pointer and stack pointer as index if there is no scaling. */
14899 if (base_reg && index_reg && scale == 1
14900 && (REGNO (index_reg) == ARG_POINTER_REGNUM
14901 || REGNO (index_reg) == FRAME_POINTER_REGNUM
14902 || REGNO (index_reg) == SP_REG))
14904 std::swap (base, index);
14905 std::swap (base_reg, index_reg);
14908 /* Special case: %ebp cannot be encoded as a base without a displacement.
14909 Similarly %r13. */
14910 if (!disp && base_reg
14911 && (REGNO (base_reg) == ARG_POINTER_REGNUM
14912 || REGNO (base_reg) == FRAME_POINTER_REGNUM
14913 || REGNO (base_reg) == BP_REG
14914 || REGNO (base_reg) == R13_REG))
14915 disp = const0_rtx;
14917 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
14918 Avoid this by transforming to [%esi+0].
14919 Reload calls address legitimization without cfun defined, so we need
14920 to test cfun for being non-NULL. */
14921 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
14922 && base_reg && !index_reg && !disp
14923 && REGNO (base_reg) == SI_REG)
14924 disp = const0_rtx;
14926 /* Special case: encode reg+reg instead of reg*2. */
14927 if (!base && index && scale == 2)
14928 base = index, base_reg = index_reg, scale = 1;
14930 /* Special case: scaling cannot be encoded without base or displacement. */
14931 if (!base && !disp && index && scale != 1)
14932 disp = const0_rtx;
14934 out->base = base;
14935 out->index = index;
14936 out->disp = disp;
14937 out->scale = scale;
14938 out->seg = seg;
14940 return retval;
14943 /* Return cost of the memory address x.
14944 For i386, it is better to use a complex address than let gcc copy
14945 the address into a reg and make a new pseudo. But not if the address
14946 requires two regs - that would mean more pseudos with longer
14947 lifetimes. */
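/* A rough sketch of the resulting costs, assuming neither operand is
   pic_offset_table_rtx: an address whose base is a single pseudo register
   costs 2 (the initial 1 plus 1 for the pseudo), one that also uses an
   index pseudo costs 3, and on K6 the affected ModR/M forms described
   below add a further 10.  */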
14948 static int
14949 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
14951 struct ix86_address parts;
14952 int cost = 1;
14953 int ok = ix86_decompose_address (x, &parts);
14955 gcc_assert (ok);
14957 if (parts.base && SUBREG_P (parts.base))
14958 parts.base = SUBREG_REG (parts.base);
14959 if (parts.index && SUBREG_P (parts.index))
14960 parts.index = SUBREG_REG (parts.index);
14962 /* Attempt to minimize the number of registers in the address by increasing
14963 the address cost for each register used. We don't increase the address cost
14964 for "pic_offset_table_rtx". When a memop with "pic_offset_table_rtx"
14965 is not invariant itself, it most likely means that base or index is not
14966 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
14967 which is not profitable for x86. */
14968 if (parts.base
14969 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
14970 && (current_pass->type == GIMPLE_PASS
14971 || !pic_offset_table_rtx
14972 || !REG_P (parts.base)
14973 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
14974 cost++;
14976 if (parts.index
14977 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
14978 && (current_pass->type == GIMPLE_PASS
14979 || !pic_offset_table_rtx
14980 || !REG_P (parts.index)
14981 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
14982 cost++;
14984 /* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
14985 since its predecode logic can't detect the length of instructions
14986 and they degenerate to vector decoding. Increase the cost of such
14987 addresses here. The penalty is at least 2 cycles. It may be worthwhile
14988 to split such addresses or even refuse such addresses at all.
14990 The following addressing modes are affected:
14991 [base+scale*index]
14992 [scale*index+disp]
14993 [base+index]
14995 The first and last cases may be avoidable by explicitly coding the zero into
14996 the memory address, but I don't have an AMD-K6 machine handy to check this
14997 theory. */
14999 if (TARGET_K6
15000 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15001 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15002 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15003 cost += 10;
15005 return cost;
15008 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15009 this is used to form addresses to local data when -fPIC is in
15010 use. */
15012 static bool
15013 darwin_local_data_pic (rtx disp)
15015 return (GET_CODE (disp) == UNSPEC
15016 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15019 /* True if operand X should be loaded from GOT. */
15021 bool
15022 ix86_force_load_from_GOT_p (rtx x)
15024 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15025 && !TARGET_PECOFF && !TARGET_MACHO
15026 && !flag_plt && !flag_pic
15027 && ix86_cmodel != CM_LARGE
15028 && GET_CODE (x) == SYMBOL_REF
15029 && SYMBOL_REF_FUNCTION_P (x)
15030 && !SYMBOL_REF_LOCAL_P (x));
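/* For example, when compiling 64-bit ELF code with -fno-plt and without
   -fpic, a call to an external (non-local) function is expected to satisfy
   this predicate, so its address is loaded from the GOT slot instead of
   going through the PLT (assuming the default code model, not
   -mcmodel=large).  */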
15033 /* Determine if a given RTX is a valid constant. We already know this
15034 satisfies CONSTANT_P. */
15036 static bool
15037 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15039 /* Pointer bounds constants are not valid. */
15040 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15041 return false;
15043 switch (GET_CODE (x))
15045 case CONST:
15046 x = XEXP (x, 0);
15048 if (GET_CODE (x) == PLUS)
15050 if (!CONST_INT_P (XEXP (x, 1)))
15051 return false;
15052 x = XEXP (x, 0);
15055 if (TARGET_MACHO && darwin_local_data_pic (x))
15056 return true;
15058 /* Only some unspecs are valid as "constants". */
15059 if (GET_CODE (x) == UNSPEC)
15060 switch (XINT (x, 1))
15062 case UNSPEC_GOT:
15063 case UNSPEC_GOTOFF:
15064 case UNSPEC_PLTOFF:
15065 return TARGET_64BIT;
15066 case UNSPEC_TPOFF:
15067 case UNSPEC_NTPOFF:
15068 x = XVECEXP (x, 0, 0);
15069 return (GET_CODE (x) == SYMBOL_REF
15070 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15071 case UNSPEC_DTPOFF:
15072 x = XVECEXP (x, 0, 0);
15073 return (GET_CODE (x) == SYMBOL_REF
15074 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15075 default:
15076 return false;
15079 /* We must have drilled down to a symbol. */
15080 if (GET_CODE (x) == LABEL_REF)
15081 return true;
15082 if (GET_CODE (x) != SYMBOL_REF)
15083 return false;
15084 /* FALLTHRU */
15086 case SYMBOL_REF:
15087 /* TLS symbols are never valid. */
15088 if (SYMBOL_REF_TLS_MODEL (x))
15089 return false;
15091 /* DLLIMPORT symbols are never valid. */
15092 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15093 && SYMBOL_REF_DLLIMPORT_P (x))
15094 return false;
15096 #if TARGET_MACHO
15097 /* mdynamic-no-pic */
15098 if (MACHO_DYNAMIC_NO_PIC_P)
15099 return machopic_symbol_defined_p (x);
15100 #endif
15102 /* External function address should be loaded
15103 via the GOT slot to avoid PLT. */
15104 if (ix86_force_load_from_GOT_p (x))
15105 return false;
15107 break;
15109 CASE_CONST_SCALAR_INT:
15110 switch (mode)
15112 case E_TImode:
15113 if (TARGET_64BIT)
15114 return true;
15115 /* FALLTHRU */
15116 case E_OImode:
15117 case E_XImode:
15118 if (!standard_sse_constant_p (x, mode))
15119 return false;
15120 default:
15121 break;
15123 break;
15125 case CONST_VECTOR:
15126 if (!standard_sse_constant_p (x, mode))
15127 return false;
15129 default:
15130 break;
15133 /* Otherwise we handle everything else in the move patterns. */
15134 return true;
15137 /* Determine if it's legal to put X into the constant pool. This
15138 is not possible for the address of thread-local symbols, which
15139 is checked above. */
15141 static bool
15142 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15144 /* We can put any immediate constant in memory. */
15145 switch (GET_CODE (x))
15147 CASE_CONST_ANY:
15148 return false;
15150 default:
15151 break;
15154 return !ix86_legitimate_constant_p (mode, x);
15157 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
15158 otherwise zero. */
15160 static bool
15161 is_imported_p (rtx x)
15163 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15164 || GET_CODE (x) != SYMBOL_REF)
15165 return false;
15167 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15171 /* Nonzero if the constant value X is a legitimate general operand
15172 when generating PIC code. It is given that flag_pic is on and
15173 that X satisfies CONSTANT_P. */
15175 bool
15176 legitimate_pic_operand_p (rtx x)
15178 rtx inner;
15180 switch (GET_CODE (x))
15182 case CONST:
15183 inner = XEXP (x, 0);
15184 if (GET_CODE (inner) == PLUS
15185 && CONST_INT_P (XEXP (inner, 1)))
15186 inner = XEXP (inner, 0);
15188 /* Only some unspecs are valid as "constants". */
15189 if (GET_CODE (inner) == UNSPEC)
15190 switch (XINT (inner, 1))
15192 case UNSPEC_GOT:
15193 case UNSPEC_GOTOFF:
15194 case UNSPEC_PLTOFF:
15195 return TARGET_64BIT;
15196 case UNSPEC_TPOFF:
15197 x = XVECEXP (inner, 0, 0);
15198 return (GET_CODE (x) == SYMBOL_REF
15199 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15200 case UNSPEC_MACHOPIC_OFFSET:
15201 return legitimate_pic_address_disp_p (x);
15202 default:
15203 return false;
15205 /* FALLTHRU */
15207 case SYMBOL_REF:
15208 case LABEL_REF:
15209 return legitimate_pic_address_disp_p (x);
15211 default:
15212 return true;
15216 /* Determine if a given CONST RTX is a valid memory displacement
15217 in PIC mode. */
15219 bool
15220 legitimate_pic_address_disp_p (rtx disp)
15222 bool saw_plus;
15224 /* In 64bit mode we can allow direct addresses of symbols and labels
15225 when they are not dynamic symbols. */
15226 if (TARGET_64BIT)
15228 rtx op0 = disp, op1;
15230 switch (GET_CODE (disp))
15232 case LABEL_REF:
15233 return true;
15235 case CONST:
15236 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15237 break;
15238 op0 = XEXP (XEXP (disp, 0), 0);
15239 op1 = XEXP (XEXP (disp, 0), 1);
15240 if (!CONST_INT_P (op1))
15241 break;
15242 if (GET_CODE (op0) == UNSPEC
15243 && (XINT (op0, 1) == UNSPEC_DTPOFF
15244 || XINT (op0, 1) == UNSPEC_NTPOFF)
15245 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15246 return true;
15247 if (INTVAL (op1) >= 16*1024*1024
15248 || INTVAL (op1) < -16*1024*1024)
15249 break;
15250 if (GET_CODE (op0) == LABEL_REF)
15251 return true;
15252 if (GET_CODE (op0) == CONST
15253 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15254 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15255 return true;
15256 if (GET_CODE (op0) == UNSPEC
15257 && XINT (op0, 1) == UNSPEC_PCREL)
15258 return true;
15259 if (GET_CODE (op0) != SYMBOL_REF)
15260 break;
15261 /* FALLTHRU */
15263 case SYMBOL_REF:
15264 /* TLS references should always be enclosed in UNSPEC.
15265 The dllimported symbol always needs to be resolved. */
15266 if (SYMBOL_REF_TLS_MODEL (op0)
15267 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15268 return false;
15270 if (TARGET_PECOFF)
15272 if (is_imported_p (op0))
15273 return true;
15275 if (SYMBOL_REF_FAR_ADDR_P (op0)
15276 || !SYMBOL_REF_LOCAL_P (op0))
15277 break;
15279 /* Function symbols need to be resolved only for
15280 the large model.
15281 For the small model we don't need to resolve anything
15282 here. */
15283 if ((ix86_cmodel != CM_LARGE_PIC
15284 && SYMBOL_REF_FUNCTION_P (op0))
15285 || ix86_cmodel == CM_SMALL_PIC)
15286 return true;
15287 /* Non-external symbols don't need to be resolved for
15288 the large and medium models. */
15289 if ((ix86_cmodel == CM_LARGE_PIC
15290 || ix86_cmodel == CM_MEDIUM_PIC)
15291 && !SYMBOL_REF_EXTERNAL_P (op0))
15292 return true;
15294 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15295 && (SYMBOL_REF_LOCAL_P (op0)
15296 || (HAVE_LD_PIE_COPYRELOC
15297 && flag_pie
15298 && !SYMBOL_REF_WEAK (op0)
15299 && !SYMBOL_REF_FUNCTION_P (op0)))
15300 && ix86_cmodel != CM_LARGE_PIC)
15301 return true;
15302 break;
15304 default:
15305 break;
15308 if (GET_CODE (disp) != CONST)
15309 return false;
15310 disp = XEXP (disp, 0);
15312 if (TARGET_64BIT)
15314 /* It is unsafe to allow PLUS expressions here; this limits the allowed
15315 distance to GOT tables. We should not need these anyway. */
15316 if (GET_CODE (disp) != UNSPEC
15317 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15318 && XINT (disp, 1) != UNSPEC_GOTOFF
15319 && XINT (disp, 1) != UNSPEC_PCREL
15320 && XINT (disp, 1) != UNSPEC_PLTOFF))
15321 return false;
15323 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15324 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15325 return false;
15326 return true;
15329 saw_plus = false;
15330 if (GET_CODE (disp) == PLUS)
15332 if (!CONST_INT_P (XEXP (disp, 1)))
15333 return false;
15334 disp = XEXP (disp, 0);
15335 saw_plus = true;
15338 if (TARGET_MACHO && darwin_local_data_pic (disp))
15339 return true;
15341 if (GET_CODE (disp) != UNSPEC)
15342 return false;
15344 switch (XINT (disp, 1))
15346 case UNSPEC_GOT:
15347 if (saw_plus)
15348 return false;
15349 /* We need to check for both symbols and labels because VxWorks loads
15350 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15351 details. */
15352 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15353 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15354 case UNSPEC_GOTOFF:
15355 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15356 While the ABI also specifies a 32bit relocation, we don't produce it in the
15357 small PIC model at all. */
15358 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15359 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15360 && !TARGET_64BIT)
15361 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15362 return false;
15363 case UNSPEC_GOTTPOFF:
15364 case UNSPEC_GOTNTPOFF:
15365 case UNSPEC_INDNTPOFF:
15366 if (saw_plus)
15367 return false;
15368 disp = XVECEXP (disp, 0, 0);
15369 return (GET_CODE (disp) == SYMBOL_REF
15370 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15371 case UNSPEC_NTPOFF:
15372 disp = XVECEXP (disp, 0, 0);
15373 return (GET_CODE (disp) == SYMBOL_REF
15374 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15375 case UNSPEC_DTPOFF:
15376 disp = XVECEXP (disp, 0, 0);
15377 return (GET_CODE (disp) == SYMBOL_REF
15378 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15381 return false;
15384 /* Determine if op is suitable RTX for an address register.
15385 Return naked register if a register or a register subreg is
15386 found, otherwise return NULL_RTX. */
15388 static rtx
15389 ix86_validate_address_register (rtx op)
15391 machine_mode mode = GET_MODE (op);
15393 /* Only SImode or DImode registers can form the address. */
15394 if (mode != SImode && mode != DImode)
15395 return NULL_RTX;
15397 if (REG_P (op))
15398 return op;
15399 else if (SUBREG_P (op))
15401 rtx reg = SUBREG_REG (op);
15403 if (!REG_P (reg))
15404 return NULL_RTX;
15406 mode = GET_MODE (reg);
15408 /* Don't allow SUBREGs that span more than a word. It can
15409 lead to spill failures when the register is one word out
15410 of a two word structure. */
15411 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15412 return NULL_RTX;
15414 /* Allow only SUBREGs of non-eliminable hard registers. */
15415 if (register_no_elim_operand (reg, mode))
15416 return reg;
15419 /* Op is not a register. */
15420 return NULL_RTX;
15423 /* Recognizes RTL expressions that are valid memory addresses for an
15424 instruction. The MODE argument is the machine mode for the MEM
15425 expression that wants to use this address.
15427 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15428 convert common non-canonical forms to canonical form so that they will
15429 be recognized. */
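/* As a concrete sketch (register operands A and B are placeholders):
   (plus (reg:DI A) (mult (reg:DI B) (const_int 4))) is accepted in
   64-bit mode as base A plus index B with scale 4, whereas a scale of 3,
   a scale without an index, or a base and index of different modes is
   rejected.  */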
15431 static bool
15432 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15434 struct ix86_address parts;
15435 rtx base, index, disp;
15436 HOST_WIDE_INT scale;
15437 addr_space_t seg;
15439 if (ix86_decompose_address (addr, &parts) <= 0)
15440 /* Decomposition failed. */
15441 return false;
15443 base = parts.base;
15444 index = parts.index;
15445 disp = parts.disp;
15446 scale = parts.scale;
15447 seg = parts.seg;
15449 /* Validate base register. */
15450 if (base)
15452 rtx reg = ix86_validate_address_register (base);
15454 if (reg == NULL_RTX)
15455 return false;
15457 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15458 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15459 /* Base is not valid. */
15460 return false;
15463 /* Validate index register. */
15464 if (index)
15466 rtx reg = ix86_validate_address_register (index);
15468 if (reg == NULL_RTX)
15469 return false;
15471 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15472 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15473 /* Index is not valid. */
15474 return false;
15477 /* Index and base should have the same mode. */
15478 if (base && index
15479 && GET_MODE (base) != GET_MODE (index))
15480 return false;
15482 /* Address override works only on the (%reg) part of %fs:(%reg). */
15483 if (seg != ADDR_SPACE_GENERIC
15484 && ((base && GET_MODE (base) != word_mode)
15485 || (index && GET_MODE (index) != word_mode)))
15486 return false;
15488 /* Validate scale factor. */
15489 if (scale != 1)
15491 if (!index)
15492 /* Scale without index. */
15493 return false;
15495 if (scale != 2 && scale != 4 && scale != 8)
15496 /* Scale is not a valid multiplier. */
15497 return false;
15500 /* Validate displacement. */
15501 if (disp)
15503 if (GET_CODE (disp) == CONST
15504 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15505 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15506 switch (XINT (XEXP (disp, 0), 1))
15508 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15509 when used. While the ABI also specifies 32bit relocations, we
15510 don't produce them at all and use IP-relative addressing instead.
15511 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15512 should be loaded via the GOT. */
15513 case UNSPEC_GOT:
15514 if (!TARGET_64BIT
15515 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15516 goto is_legitimate_pic;
15517 /* FALLTHRU */
15518 case UNSPEC_GOTOFF:
15519 gcc_assert (flag_pic);
15520 if (!TARGET_64BIT)
15521 goto is_legitimate_pic;
15523 /* 64bit address unspec. */
15524 return false;
15526 case UNSPEC_GOTPCREL:
15527 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15528 goto is_legitimate_pic;
15529 /* FALLTHRU */
15530 case UNSPEC_PCREL:
15531 gcc_assert (flag_pic);
15532 goto is_legitimate_pic;
15534 case UNSPEC_GOTTPOFF:
15535 case UNSPEC_GOTNTPOFF:
15536 case UNSPEC_INDNTPOFF:
15537 case UNSPEC_NTPOFF:
15538 case UNSPEC_DTPOFF:
15539 break;
15541 default:
15542 /* Invalid address unspec. */
15543 return false;
15546 else if (SYMBOLIC_CONST (disp)
15547 && (flag_pic
15548 || (TARGET_MACHO
15549 #if TARGET_MACHO
15550 && MACHOPIC_INDIRECT
15551 && !machopic_operand_p (disp)
15552 #endif
15556 is_legitimate_pic:
15557 if (TARGET_64BIT && (index || base))
15559 /* foo@dtpoff(%rX) is ok. */
15560 if (GET_CODE (disp) != CONST
15561 || GET_CODE (XEXP (disp, 0)) != PLUS
15562 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15563 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15564 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15565 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15566 /* Non-constant pic memory reference. */
15567 return false;
15569 else if ((!TARGET_MACHO || flag_pic)
15570 && ! legitimate_pic_address_disp_p (disp))
15571 /* Displacement is an invalid pic construct. */
15572 return false;
15573 #if TARGET_MACHO
15574 else if (MACHO_DYNAMIC_NO_PIC_P
15575 && !ix86_legitimate_constant_p (Pmode, disp))
15576 /* displacement must be referenced via non_lazy_pointer */
15577 return false;
15578 #endif
15580 /* This code used to verify that a symbolic pic displacement
15581 includes the pic_offset_table_rtx register.
15583 While this is a good idea, unfortunately these constructs may
15584 be created by the "adds using lea" optimization for incorrect
15585 code like:
15587 int a;
15588 int foo(int i)
15590 return *(&a+i);
15593 This code is nonsensical, but results in addressing the
15594 GOT table with pic_offset_table_rtx as base. We can't
15595 just refuse it easily, since it gets matched by the
15596 "addsi3" pattern, which later gets split to lea when the
15597 output register differs from the input. While this
15598 could be handled by a separate addsi pattern for this case
15599 that never results in lea, disabling this test seems to be
15600 the easier and correct fix for the crash. */
15602 else if (GET_CODE (disp) != LABEL_REF
15603 && !CONST_INT_P (disp)
15604 && (GET_CODE (disp) != CONST
15605 || !ix86_legitimate_constant_p (Pmode, disp))
15606 && (GET_CODE (disp) != SYMBOL_REF
15607 || !ix86_legitimate_constant_p (Pmode, disp)))
15608 /* Displacement is not constant. */
15609 return false;
15610 else if (TARGET_64BIT
15611 && !x86_64_immediate_operand (disp, VOIDmode))
15612 /* Displacement is out of range. */
15613 return false;
15614 /* In x32 mode, constant addresses are sign extended to 64bit, so
15615 we have to reject addresses in the range 0x80000000 to 0xffffffff. */
15616 else if (TARGET_X32 && !(index || base)
15617 && CONST_INT_P (disp)
15618 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15619 return false;
15622 /* Everything looks valid. */
15623 return true;
15626 /* Determine if a given RTX is a valid constant address. */
15628 bool
15629 constant_address_p (rtx x)
15631 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15634 /* Return a unique alias set for the GOT. */
15636 static alias_set_type
15637 ix86_GOT_alias_set (void)
15639 static alias_set_type set = -1;
15640 if (set == -1)
15641 set = new_alias_set ();
15642 return set;
15645 /* Return a legitimate reference for ORIG (an address) using the
15646 register REG. If REG is 0, a new pseudo is generated.
15648 There are two types of references that must be handled:
15650 1. Global data references must load the address from the GOT, via
15651 the PIC reg. An insn is emitted to do this load, and the reg is
15652 returned.
15654 2. Static data references, constant pool addresses, and code labels
15655 compute the address as an offset from the GOT, whose base is in
15656 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15657 differentiate them from global data objects. The returned
15658 address is the PIC reg + an unspec constant.
15660 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15661 reg also appears in the address. */
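/* Illustrative forms of the result in 32-bit mode (symbol S is a
   placeholder):

	global data:  (mem (plus pic_offset_table_rtx
				 (const (unspec [S] UNSPEC_GOT))))
	local data:   (plus pic_offset_table_rtx
			    (const (unspec [S] UNSPEC_GOTOFF)))

   while 64-bit small-PIC global references use a @GOTPCREL memory
   operand instead.  */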
15663 static rtx
15664 legitimize_pic_address (rtx orig, rtx reg)
15666 rtx addr = orig;
15667 rtx new_rtx = orig;
15669 #if TARGET_MACHO
15670 if (TARGET_MACHO && !TARGET_64BIT)
15672 if (reg == 0)
15673 reg = gen_reg_rtx (Pmode);
15674 /* Use the generic Mach-O PIC machinery. */
15675 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15677 #endif
15679 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15681 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15682 if (tmp)
15683 return tmp;
15686 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15687 new_rtx = addr;
15688 else if ((!TARGET_64BIT
15689 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15690 && !TARGET_PECOFF
15691 && gotoff_operand (addr, Pmode))
15693 /* This symbol may be referenced via a displacement
15694 from the PIC base address (@GOTOFF). */
15695 if (GET_CODE (addr) == CONST)
15696 addr = XEXP (addr, 0);
15698 if (GET_CODE (addr) == PLUS)
15700 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15701 UNSPEC_GOTOFF);
15702 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15704 else
15705 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15707 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15709 if (TARGET_64BIT)
15710 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15712 if (reg != 0)
15714 gcc_assert (REG_P (reg));
15715 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15716 new_rtx, reg, 1, OPTAB_DIRECT);
15718 else
15719 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15721 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15722 /* We can't use @GOTOFF for text labels
15723 on VxWorks, see gotoff_operand. */
15724 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15726 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15727 if (tmp)
15728 return tmp;
15730 /* For x64 PE-COFF there is no GOT table,
15731 so we use the address directly. */
15732 if (TARGET_64BIT && TARGET_PECOFF)
15734 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15735 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15737 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15739 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15740 UNSPEC_GOTPCREL);
15741 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15742 new_rtx = gen_const_mem (Pmode, new_rtx);
15743 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15745 else
15747 /* This symbol must be referenced via a load
15748 from the Global Offset Table (@GOT). */
15749 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15750 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15751 if (TARGET_64BIT)
15752 new_rtx = force_reg (Pmode, new_rtx);
15753 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15754 new_rtx = gen_const_mem (Pmode, new_rtx);
15755 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15758 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15760 else
15762 if (CONST_INT_P (addr)
15763 && !x86_64_immediate_operand (addr, VOIDmode))
15764 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
15765 else if (GET_CODE (addr) == CONST)
15767 addr = XEXP (addr, 0);
15769 /* We must match stuff we generated before. Assume the only
15770 unspecs that can get here are ours. Not that we could do
15771 anything with them anyway.... */
15772 if (GET_CODE (addr) == UNSPEC
15773 || (GET_CODE (addr) == PLUS
15774 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
15775 return orig;
15776 gcc_assert (GET_CODE (addr) == PLUS);
15779 if (GET_CODE (addr) == PLUS)
15781 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
15783 /* Check first to see if this is a constant
15784 offset from a @GOTOFF symbol reference. */
15785 if (!TARGET_PECOFF
15786 && gotoff_operand (op0, Pmode)
15787 && CONST_INT_P (op1))
15789 if (!TARGET_64BIT)
15791 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
15792 UNSPEC_GOTOFF);
15793 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
15794 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15796 if (reg != 0)
15798 gcc_assert (REG_P (reg));
15799 new_rtx = expand_simple_binop (Pmode, PLUS,
15800 pic_offset_table_rtx,
15801 new_rtx, reg, 1,
15802 OPTAB_DIRECT);
15804 else
15805 new_rtx
15806 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15808 else
15810 if (INTVAL (op1) < -16*1024*1024
15811 || INTVAL (op1) >= 16*1024*1024)
15813 if (!x86_64_immediate_operand (op1, Pmode))
15814 op1 = force_reg (Pmode, op1);
15816 new_rtx
15817 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
15821 else
15823 rtx base = legitimize_pic_address (op0, reg);
15824 machine_mode mode = GET_MODE (base);
15825 new_rtx
15826 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
15828 if (CONST_INT_P (new_rtx))
15830 if (INTVAL (new_rtx) < -16*1024*1024
15831 || INTVAL (new_rtx) >= 16*1024*1024)
15833 if (!x86_64_immediate_operand (new_rtx, mode))
15834 new_rtx = force_reg (mode, new_rtx);
15836 new_rtx
15837 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
15839 else
15840 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
15842 else
15844 /* For %rip addressing, we have to use
15845 just disp32, neither base nor index. */
15846 if (TARGET_64BIT
15847 && (GET_CODE (base) == SYMBOL_REF
15848 || GET_CODE (base) == LABEL_REF))
15849 base = force_reg (mode, base);
15850 if (GET_CODE (new_rtx) == PLUS
15851 && CONSTANT_P (XEXP (new_rtx, 1)))
15853 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
15854 new_rtx = XEXP (new_rtx, 1);
15856 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
15861 return new_rtx;
15864 /* Load the thread pointer. If TO_REG is true, force it into a register. */
15866 static rtx
15867 get_thread_pointer (machine_mode tp_mode, bool to_reg)
15869 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
15871 if (GET_MODE (tp) != tp_mode)
15873 gcc_assert (GET_MODE (tp) == SImode);
15874 gcc_assert (tp_mode == DImode);
15876 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
15879 if (to_reg)
15880 tp = copy_to_mode_reg (tp_mode, tp);
15882 return tp;
15885 /* Construct the SYMBOL_REF for the tls_get_addr function. */
15887 static GTY(()) rtx ix86_tls_symbol;
15889 static rtx
15890 ix86_tls_get_addr (void)
15892 if (!ix86_tls_symbol)
15894 const char *sym
15895 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
15896 ? "___tls_get_addr" : "__tls_get_addr");
15898 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
15901 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
15903 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
15904 UNSPEC_PLTOFF);
15905 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
15906 gen_rtx_CONST (Pmode, unspec));
15909 return ix86_tls_symbol;
15912 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
15914 static GTY(()) rtx ix86_tls_module_base_symbol;
15917 ix86_tls_module_base (void)
15919 if (!ix86_tls_module_base_symbol)
15921 ix86_tls_module_base_symbol
15922 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
15924 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
15925 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
15928 return ix86_tls_module_base_symbol;
15931 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
15932 false if we expect this to be used for a memory address and true if
15933 we expect to load the address into a register. */
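/* A sketch of the local-exec case under GNU TLS (symbol S is a
   placeholder): the result is

	(plus (tp) (const (unspec [S] UNSPEC_NTPOFF)))

   where (tp) is the thread pointer from get_thread_pointer, which the
   move/address patterns later turn into a %fs- or %gs-relative
   access.  */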
15935 static rtx
15936 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
15938 rtx dest, base, off;
15939 rtx pic = NULL_RTX, tp = NULL_RTX;
15940 machine_mode tp_mode = Pmode;
15941 int type;
15943 /* Fall back to the global dynamic model if the toolchain cannot support
15944 local dynamic. */
15945 if (TARGET_SUN_TLS && !TARGET_64BIT
15946 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
15947 && model == TLS_MODEL_LOCAL_DYNAMIC)
15948 model = TLS_MODEL_GLOBAL_DYNAMIC;
15950 switch (model)
15952 case TLS_MODEL_GLOBAL_DYNAMIC:
15953 dest = gen_reg_rtx (Pmode);
15955 if (!TARGET_64BIT)
15957 if (flag_pic && !TARGET_PECOFF)
15958 pic = pic_offset_table_rtx;
15959 else
15961 pic = gen_reg_rtx (Pmode);
15962 emit_insn (gen_set_got (pic));
15966 if (TARGET_GNU2_TLS)
15968 if (TARGET_64BIT)
15969 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
15970 else
15971 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
15973 tp = get_thread_pointer (Pmode, true);
15974 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
15976 if (GET_MODE (x) != Pmode)
15977 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15979 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
15981 else
15983 rtx caddr = ix86_tls_get_addr ();
15985 if (TARGET_64BIT)
15987 rtx rax = gen_rtx_REG (Pmode, AX_REG);
15988 rtx_insn *insns;
15990 start_sequence ();
15991 emit_call_insn
15992 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
15993 insns = get_insns ();
15994 end_sequence ();
15996 if (GET_MODE (x) != Pmode)
15997 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15999 RTL_CONST_CALL_P (insns) = 1;
16000 emit_libcall_block (insns, dest, rax, x);
16002 else
16003 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16005 break;
16007 case TLS_MODEL_LOCAL_DYNAMIC:
16008 base = gen_reg_rtx (Pmode);
16010 if (!TARGET_64BIT)
16012 if (flag_pic)
16013 pic = pic_offset_table_rtx;
16014 else
16016 pic = gen_reg_rtx (Pmode);
16017 emit_insn (gen_set_got (pic));
16021 if (TARGET_GNU2_TLS)
16023 rtx tmp = ix86_tls_module_base ();
16025 if (TARGET_64BIT)
16026 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16027 else
16028 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16030 tp = get_thread_pointer (Pmode, true);
16031 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16032 gen_rtx_MINUS (Pmode, tmp, tp));
16034 else
16036 rtx caddr = ix86_tls_get_addr ();
16038 if (TARGET_64BIT)
16040 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16041 rtx_insn *insns;
16042 rtx eqv;
16044 start_sequence ();
16045 emit_call_insn
16046 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16047 insns = get_insns ();
16048 end_sequence ();
16050 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16051 share the LD_BASE result with other LD model accesses. */
16052 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16053 UNSPEC_TLS_LD_BASE);
16055 RTL_CONST_CALL_P (insns) = 1;
16056 emit_libcall_block (insns, base, rax, eqv);
16058 else
16059 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16062 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16063 off = gen_rtx_CONST (Pmode, off);
16065 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16067 if (TARGET_GNU2_TLS)
16069 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16071 if (GET_MODE (x) != Pmode)
16072 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16074 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16076 break;
16078 case TLS_MODEL_INITIAL_EXEC:
16079 if (TARGET_64BIT)
16081 if (TARGET_SUN_TLS && !TARGET_X32)
16083 /* The Sun linker took the AMD64 TLS spec literally
16084 and can only handle %rax as the destination of the
16085 initial executable code sequence. */
16087 dest = gen_reg_rtx (DImode);
16088 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16089 return dest;
16092 /* Generate DImode references to avoid %fs:(%reg32)
16093 problems and the linker IE->LE relaxation bug. */
16094 tp_mode = DImode;
16095 pic = NULL;
16096 type = UNSPEC_GOTNTPOFF;
16098 else if (flag_pic)
16100 pic = pic_offset_table_rtx;
16101 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16103 else if (!TARGET_ANY_GNU_TLS)
16105 pic = gen_reg_rtx (Pmode);
16106 emit_insn (gen_set_got (pic));
16107 type = UNSPEC_GOTTPOFF;
16109 else
16111 pic = NULL;
16112 type = UNSPEC_INDNTPOFF;
16115 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16116 off = gen_rtx_CONST (tp_mode, off);
16117 if (pic)
16118 off = gen_rtx_PLUS (tp_mode, pic, off);
16119 off = gen_const_mem (tp_mode, off);
16120 set_mem_alias_set (off, ix86_GOT_alias_set ());
16122 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16124 base = get_thread_pointer (tp_mode,
16125 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16126 off = force_reg (tp_mode, off);
16127 dest = gen_rtx_PLUS (tp_mode, base, off);
16128 if (tp_mode != Pmode)
16129 dest = convert_to_mode (Pmode, dest, 1);
16131 else
16133 base = get_thread_pointer (Pmode, true);
16134 dest = gen_reg_rtx (Pmode);
16135 emit_insn (ix86_gen_sub3 (dest, base, off));
16137 break;
16139 case TLS_MODEL_LOCAL_EXEC:
16140 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16141 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16142 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16143 off = gen_rtx_CONST (Pmode, off);
16145 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16147 base = get_thread_pointer (Pmode,
16148 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16149 return gen_rtx_PLUS (Pmode, base, off);
16151 else
16153 base = get_thread_pointer (Pmode, true);
16154 dest = gen_reg_rtx (Pmode);
16155 emit_insn (ix86_gen_sub3 (dest, base, off));
16157 break;
16159 default:
16160 gcc_unreachable ();
16163 return dest;
16166 /* Return true if OP refers to a TLS address. */
16167 bool
16168 ix86_tls_address_pattern_p (rtx op)
16170 subrtx_var_iterator::array_type array;
16171 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16173 rtx op = *iter;
16174 if (MEM_P (op))
16176 rtx *x = &XEXP (op, 0);
16177 while (GET_CODE (*x) == PLUS)
16179 int i;
16180 for (i = 0; i < 2; i++)
16182 rtx u = XEXP (*x, i);
16183 if (GET_CODE (u) == ZERO_EXTEND)
16184 u = XEXP (u, 0);
16185 if (GET_CODE (u) == UNSPEC
16186 && XINT (u, 1) == UNSPEC_TP)
16187 return true;
16189 x = &XEXP (*x, 0);
16192 iter.skip_subrtxes ();
16196 return false;
16199 /* Rewrite *LOC so that it refers to a default TLS address space. */
16200 void
16201 ix86_rewrite_tls_address_1 (rtx *loc)
16203 subrtx_ptr_iterator::array_type array;
16204 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16206 rtx *loc = *iter;
16207 if (MEM_P (*loc))
16209 rtx addr = XEXP (*loc, 0);
16210 rtx *x = &addr;
16211 while (GET_CODE (*x) == PLUS)
16213 int i;
16214 for (i = 0; i < 2; i++)
16216 rtx u = XEXP (*x, i);
16217 if (GET_CODE (u) == ZERO_EXTEND)
16218 u = XEXP (u, 0);
16219 if (GET_CODE (u) == UNSPEC
16220 && XINT (u, 1) == UNSPEC_TP)
16222 addr_space_t as = DEFAULT_TLS_SEG_REG;
16224 *x = XEXP (*x, 1 - i);
16226 *loc = replace_equiv_address_nv (*loc, addr, true);
16227 set_mem_addr_space (*loc, as);
16228 return;
16231 x = &XEXP (*x, 0);
16234 iter.skip_subrtxes ();
16239 /* Rewrite an instruction pattern involving a TLS address
16240 so that it refers to the default TLS address space. */
16242 ix86_rewrite_tls_address (rtx pattern)
16244 pattern = copy_insn (pattern);
16245 ix86_rewrite_tls_address_1 (&pattern);
16246 return pattern;
16249 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16250 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16251 unique refptr-DECL symbol corresponding to symbol DECL. */
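/* For example (assuming the default empty user_label_prefix), a dllimport
   reference to DECL "foo" resolves through the symbol "__imp_foo", while
   the refptr case uses ".refptr.foo"; targets with a non-empty user label
   prefix get the "__imp__" / "refptr." spellings instead.  */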
16253 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16255 static inline hashval_t hash (tree_map *m) { return m->hash; }
16256 static inline bool
16257 equal (tree_map *a, tree_map *b)
16259 return a->base.from == b->base.from;
16262 static int
16263 keep_cache_entry (tree_map *&m)
16265 return ggc_marked_p (m->base.from);
16269 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16271 static tree
16272 get_dllimport_decl (tree decl, bool beimport)
16274 struct tree_map *h, in;
16275 const char *name;
16276 const char *prefix;
16277 size_t namelen, prefixlen;
16278 char *imp_name;
16279 tree to;
16280 rtx rtl;
16282 if (!dllimport_map)
16283 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16285 in.hash = htab_hash_pointer (decl);
16286 in.base.from = decl;
16287 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16288 h = *loc;
16289 if (h)
16290 return h->to;
16292 *loc = h = ggc_alloc<tree_map> ();
16293 h->hash = in.hash;
16294 h->base.from = decl;
16295 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16296 VAR_DECL, NULL, ptr_type_node);
16297 DECL_ARTIFICIAL (to) = 1;
16298 DECL_IGNORED_P (to) = 1;
16299 DECL_EXTERNAL (to) = 1;
16300 TREE_READONLY (to) = 1;
16302 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16303 name = targetm.strip_name_encoding (name);
16304 if (beimport)
16305 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16306 ? "*__imp_" : "*__imp__";
16307 else
16308 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16309 namelen = strlen (name);
16310 prefixlen = strlen (prefix);
16311 imp_name = (char *) alloca (namelen + prefixlen + 1);
16312 memcpy (imp_name, prefix, prefixlen);
16313 memcpy (imp_name + prefixlen, name, namelen + 1);
16315 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16316 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16317 SET_SYMBOL_REF_DECL (rtl, to);
16318 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16319 if (!beimport)
16321 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16322 #ifdef SUB_TARGET_RECORD_STUB
16323 SUB_TARGET_RECORD_STUB (name);
16324 #endif
16327 rtl = gen_const_mem (Pmode, rtl);
16328 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16330 SET_DECL_RTL (to, rtl);
16331 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16333 return to;
16336 /* Expand SYMBOL into its corresponding far-address symbol.
16337 WANT_REG is true if we require the result be a register. */
16339 static rtx
16340 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16342 tree imp_decl;
16343 rtx x;
16345 gcc_assert (SYMBOL_REF_DECL (symbol));
16346 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16348 x = DECL_RTL (imp_decl);
16349 if (want_reg)
16350 x = force_reg (Pmode, x);
16351 return x;
16354 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16355 true if we require the result be a register. */
16357 static rtx
16358 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16360 tree imp_decl;
16361 rtx x;
16363 gcc_assert (SYMBOL_REF_DECL (symbol));
16364 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16366 x = DECL_RTL (imp_decl);
16367 if (want_reg)
16368 x = force_reg (Pmode, x);
16369 return x;
16372 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16373 is true if we require the result be a register. */
16375 static rtx
16376 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16378 if (!TARGET_PECOFF)
16379 return NULL_RTX;
16381 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16383 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16384 return legitimize_dllimport_symbol (addr, inreg);
16385 if (GET_CODE (addr) == CONST
16386 && GET_CODE (XEXP (addr, 0)) == PLUS
16387 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16388 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16390 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16391 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16395 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16396 return NULL_RTX;
16397 if (GET_CODE (addr) == SYMBOL_REF
16398 && !is_imported_p (addr)
16399 && SYMBOL_REF_EXTERNAL_P (addr)
16400 && SYMBOL_REF_DECL (addr))
16401 return legitimize_pe_coff_extern_decl (addr, inreg);
16403 if (GET_CODE (addr) == CONST
16404 && GET_CODE (XEXP (addr, 0)) == PLUS
16405 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16406 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16407 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16408 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16410 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16411 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16413 return NULL_RTX;
16416 /* Try machine-dependent ways of modifying an illegitimate address
16417 to be legitimate. If we find one, return the new, valid address.
16418 This macro is used in only one place: `memory_address' in explow.c.
16420 OLDX is the address as it was before break_out_memory_refs was called.
16421 In some cases it is useful to look at this to decide what needs to be done.
16423 It is always safe for this macro to do nothing. It exists to recognize
16424 opportunities to optimize the output.
16426 For the 80386, we handle X+REG by loading X into a register R and
16427 using R+REG. R will go in a general reg and indexing will be used.
16428 However, if REG is a broken-out memory address or multiplication,
16429 nothing needs to be done because REG can certainly go in a general reg.
16431 When -fpic is used, special handling is needed for symbolic references.
16432 See comments by legitimize_pic_address in i386.c for details. */
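/* A small sketch of the canonicalizations below (register R is a
   placeholder): an address such as

	(plus (ashift (reg R) (const_int 2)) (symbol_ref "x"))

   is rewritten so that the shift becomes (mult (reg R) (const_int 4)),
   which ix86_decompose_address and ix86_legitimate_address_p can then
   accept as an index with scale 4.  */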
16434 static rtx
16435 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16437 bool changed = false;
16438 unsigned log;
16440 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16441 if (log)
16442 return legitimize_tls_address (x, (enum tls_model) log, false);
16443 if (GET_CODE (x) == CONST
16444 && GET_CODE (XEXP (x, 0)) == PLUS
16445 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16446 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16448 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16449 (enum tls_model) log, false);
16450 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16453 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16455 rtx tmp = legitimize_pe_coff_symbol (x, true);
16456 if (tmp)
16457 return tmp;
16460 if (flag_pic && SYMBOLIC_CONST (x))
16461 return legitimize_pic_address (x, 0);
16463 #if TARGET_MACHO
16464 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16465 return machopic_indirect_data_reference (x, 0);
16466 #endif
16468 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
16469 if (GET_CODE (x) == ASHIFT
16470 && CONST_INT_P (XEXP (x, 1))
16471 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16473 changed = true;
16474 log = INTVAL (XEXP (x, 1));
16475 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16476 GEN_INT (1 << log));
16479 if (GET_CODE (x) == PLUS)
16481 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16483 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16484 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16485 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16487 changed = true;
16488 log = INTVAL (XEXP (XEXP (x, 0), 1));
16489 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16490 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16491 GEN_INT (1 << log));
16494 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16495 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16496 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16498 changed = true;
16499 log = INTVAL (XEXP (XEXP (x, 1), 1));
16500 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16501 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16502 GEN_INT (1 << log));
16505 /* Put multiply first if it isn't already. */
16506 if (GET_CODE (XEXP (x, 1)) == MULT)
16508 std::swap (XEXP (x, 0), XEXP (x, 1));
16509 changed = true;
16512 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16513 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16514 created by virtual register instantiation, register elimination, and
16515 similar optimizations. */
16516 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16518 changed = true;
16519 x = gen_rtx_PLUS (Pmode,
16520 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16521 XEXP (XEXP (x, 1), 0)),
16522 XEXP (XEXP (x, 1), 1));
16525 /* Canonicalize
16526 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16527 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16528 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16529 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16530 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16531 && CONSTANT_P (XEXP (x, 1)))
16533 rtx constant;
16534 rtx other = NULL_RTX;
16536 if (CONST_INT_P (XEXP (x, 1)))
16538 constant = XEXP (x, 1);
16539 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16541 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16543 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16544 other = XEXP (x, 1);
16546 else
16547 constant = 0;
16549 if (constant)
16551 changed = true;
16552 x = gen_rtx_PLUS (Pmode,
16553 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16554 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16555 plus_constant (Pmode, other,
16556 INTVAL (constant)));
16560 if (changed && ix86_legitimate_address_p (mode, x, false))
16561 return x;
16563 if (GET_CODE (XEXP (x, 0)) == MULT)
16565 changed = true;
16566 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16569 if (GET_CODE (XEXP (x, 1)) == MULT)
16571 changed = true;
16572 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16575 if (changed
16576 && REG_P (XEXP (x, 1))
16577 && REG_P (XEXP (x, 0)))
16578 return x;
16580 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16582 changed = true;
16583 x = legitimize_pic_address (x, 0);
16586 if (changed && ix86_legitimate_address_p (mode, x, false))
16587 return x;
16589 if (REG_P (XEXP (x, 0)))
16591 rtx temp = gen_reg_rtx (Pmode);
16592 rtx val = force_operand (XEXP (x, 1), temp);
16593 if (val != temp)
16595 val = convert_to_mode (Pmode, val, 1);
16596 emit_move_insn (temp, val);
16599 XEXP (x, 1) = temp;
16600 return x;
16603 else if (REG_P (XEXP (x, 1)))
16605 rtx temp = gen_reg_rtx (Pmode);
16606 rtx val = force_operand (XEXP (x, 0), temp);
16607 if (val != temp)
16609 val = convert_to_mode (Pmode, val, 1);
16610 emit_move_insn (temp, val);
16613 XEXP (x, 0) = temp;
16614 return x;
16618 return x;
16621 /* Print an integer constant expression in assembler syntax. Addition
16622 and subtraction are the only arithmetic that may appear in these
16623 expressions. FILE is the stdio stream to write to, X is the rtx, and
16624 CODE is the operand print code from the output string. */
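/* For instance (symbol S is a placeholder), an operand of the form
   (const (unspec [S] UNSPEC_GOTOFF)) is printed as "S@GOTOFF", and
   (const (unspec [S] UNSPEC_GOTPCREL)) as "S@GOTPCREL(%rip)" in AT&T
   syntax or "S@GOTPCREL[rip]" in Intel syntax.  */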
16626 static void
16627 output_pic_addr_const (FILE *file, rtx x, int code)
16629 char buf[256];
16631 switch (GET_CODE (x))
16633 case PC:
16634 gcc_assert (flag_pic);
16635 putc ('.', file);
16636 break;
16638 case SYMBOL_REF:
16639 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16640 output_addr_const (file, x);
16641 else
16643 const char *name = XSTR (x, 0);
16645 /* Mark the decl as referenced so that cgraph will
16646 output the function. */
16647 if (SYMBOL_REF_DECL (x))
16648 mark_decl_referenced (SYMBOL_REF_DECL (x));
16650 #if TARGET_MACHO
16651 if (MACHOPIC_INDIRECT
16652 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16653 name = machopic_indirection_name (x, /*stub_p=*/true);
16654 #endif
16655 assemble_name (file, name);
16657 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16658 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16659 fputs ("@PLT", file);
16660 break;
16662 case LABEL_REF:
16663 x = XEXP (x, 0);
16664 /* FALLTHRU */
16665 case CODE_LABEL:
16666 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16667 assemble_name (asm_out_file, buf);
16668 break;
16670 case CONST_INT:
16671 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16672 break;
16674 case CONST:
16675 /* This used to output parentheses around the expression,
16676 but that does not work on the 386 (either ATT or BSD assembler). */
16677 output_pic_addr_const (file, XEXP (x, 0), code);
16678 break;
16680 case CONST_DOUBLE:
16681 /* We can't handle floating point constants;
16682 TARGET_PRINT_OPERAND must handle them. */
16683 output_operand_lossage ("floating constant misused");
16684 break;
16686 case PLUS:
16687 /* Some assemblers need integer constants to appear first. */
16688 if (CONST_INT_P (XEXP (x, 0)))
16690 output_pic_addr_const (file, XEXP (x, 0), code);
16691 putc ('+', file);
16692 output_pic_addr_const (file, XEXP (x, 1), code);
16694 else
16696 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16697 output_pic_addr_const (file, XEXP (x, 1), code);
16698 putc ('+', file);
16699 output_pic_addr_const (file, XEXP (x, 0), code);
16701 break;
16703 case MINUS:
16704 if (!TARGET_MACHO)
16705 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16706 output_pic_addr_const (file, XEXP (x, 0), code);
16707 putc ('-', file);
16708 output_pic_addr_const (file, XEXP (x, 1), code);
16709 if (!TARGET_MACHO)
16710 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16711 break;
16713 case UNSPEC:
16714 gcc_assert (XVECLEN (x, 0) == 1);
16715 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16716 switch (XINT (x, 1))
16718 case UNSPEC_GOT:
16719 fputs ("@GOT", file);
16720 break;
16721 case UNSPEC_GOTOFF:
16722 fputs ("@GOTOFF", file);
16723 break;
16724 case UNSPEC_PLTOFF:
16725 fputs ("@PLTOFF", file);
16726 break;
16727 case UNSPEC_PCREL:
16728 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16729 "(%rip)" : "[rip]", file);
16730 break;
16731 case UNSPEC_GOTPCREL:
16732 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16733 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16734 break;
16735 case UNSPEC_GOTTPOFF:
16736 /* FIXME: This might be @TPOFF in Sun ld too. */
16737 fputs ("@gottpoff", file);
16738 break;
16739 case UNSPEC_TPOFF:
16740 fputs ("@tpoff", file);
16741 break;
16742 case UNSPEC_NTPOFF:
16743 if (TARGET_64BIT)
16744 fputs ("@tpoff", file);
16745 else
16746 fputs ("@ntpoff", file);
16747 break;
16748 case UNSPEC_DTPOFF:
16749 fputs ("@dtpoff", file);
16750 break;
16751 case UNSPEC_GOTNTPOFF:
16752 if (TARGET_64BIT)
16753 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16754 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16755 else
16756 fputs ("@gotntpoff", file);
16757 break;
16758 case UNSPEC_INDNTPOFF:
16759 fputs ("@indntpoff", file);
16760 break;
16761 #if TARGET_MACHO
16762 case UNSPEC_MACHOPIC_OFFSET:
16763 putc ('-', file);
16764 machopic_output_function_base_name (file);
16765 break;
16766 #endif
16767 default:
16768 output_operand_lossage ("invalid UNSPEC as operand");
16769 break;
16771 break;
16773 default:
16774 output_operand_lossage ("invalid expression as operand");
16778 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16779 We need to emit DTP-relative relocations. */
16781 static void ATTRIBUTE_UNUSED
16782 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16784 fputs (ASM_LONG, file);
16785 output_addr_const (file, x);
16786 fputs ("@dtpoff", file);
16787 switch (size)
16789 case 4:
16790 break;
16791 case 8:
16792 fputs (", 0", file);
16793 break;
16794 default:
16795 gcc_unreachable ();
16799 /* Return true if X is a representation of the PIC register. This copes
16800 with calls from ix86_find_base_term, where the register might have
16801 been replaced by a cselib value. */
16803 static bool
16804 ix86_pic_register_p (rtx x)
16806 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16807 return (pic_offset_table_rtx
16808 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16809 else if (!REG_P (x))
16810 return false;
16811 else if (pic_offset_table_rtx)
16813 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16814 return true;
16815 if (HARD_REGISTER_P (x)
16816 && !HARD_REGISTER_P (pic_offset_table_rtx)
16817 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16818 return true;
16819 return false;
16821 else
16822 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16825 /* Helper function for ix86_delegitimize_address.
16826 Attempt to delegitimize TLS local-exec accesses. */
16828 static rtx
16829 ix86_delegitimize_tls_address (rtx orig_x)
16831 rtx x = orig_x, unspec;
16832 struct ix86_address addr;
16834 if (!TARGET_TLS_DIRECT_SEG_REFS)
16835 return orig_x;
16836 if (MEM_P (x))
16837 x = XEXP (x, 0);
16838 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
16839 return orig_x;
16840 if (ix86_decompose_address (x, &addr) == 0
16841 || addr.seg != DEFAULT_TLS_SEG_REG
16842 || addr.disp == NULL_RTX
16843 || GET_CODE (addr.disp) != CONST)
16844 return orig_x;
16845 unspec = XEXP (addr.disp, 0);
16846 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
16847 unspec = XEXP (unspec, 0);
16848 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
16849 return orig_x;
16850 x = XVECEXP (unspec, 0, 0);
16851 gcc_assert (GET_CODE (x) == SYMBOL_REF);
16852 if (unspec != XEXP (addr.disp, 0))
16853 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
16854 if (addr.index)
16856 rtx idx = addr.index;
16857 if (addr.scale != 1)
16858 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
16859 x = gen_rtx_PLUS (Pmode, idx, x);
16861 if (addr.base)
16862 x = gen_rtx_PLUS (Pmode, addr.base, x);
16863 if (MEM_P (orig_x))
16864 x = replace_equiv_address_nv (orig_x, x);
16865 return x;
16868 /* In the name of slightly smaller debug output, and to cater to
16869 general assembler lossage, recognize PIC+GOTOFF and turn it back
16870 into a direct symbol reference.
16872 On Darwin, this is necessary to avoid a crash, because Darwin
16873 has a different PIC label for each routine but the DWARF debugging
16874 information is not associated with any particular routine, so it's
16875 necessary to remove references to the PIC label from RTL stored by
16876 the DWARF output code.
16878 This helper is used in the normal ix86_delegitimize_address
16879 entrypoint (e.g. used in the target delegitimization hook) and
16880 in ix86_find_base_term. As a compile-time memory optimization, we
16881 avoid allocating rtxes that will not change anything on the outcome
16882 of the callers (find_base_value and find_base_term). */
16884 static inline rtx
16885 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
16887 rtx orig_x = delegitimize_mem_from_attrs (x);
16888 /* addend is NULL or some rtx if x is something+GOTOFF where
16889 something doesn't include the PIC register. */
16890 rtx addend = NULL_RTX;
16891 /* reg_addend is NULL or a multiple of some register. */
16892 rtx reg_addend = NULL_RTX;
16893 /* const_addend is NULL or a const_int. */
16894 rtx const_addend = NULL_RTX;
16895 /* This is the result, or NULL. */
16896 rtx result = NULL_RTX;
16898 x = orig_x;
16900 if (MEM_P (x))
16901 x = XEXP (x, 0);
16903 if (TARGET_64BIT)
16905 if (GET_CODE (x) == CONST
16906 && GET_CODE (XEXP (x, 0)) == PLUS
16907 && GET_MODE (XEXP (x, 0)) == Pmode
16908 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16909 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
16910 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
16912 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
16913 base. A CONST can't be arg_pointer_rtx based. */
16914 if (base_term_p && MEM_P (orig_x))
16915 return orig_x;
16916 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
16917 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
16918 if (MEM_P (orig_x))
16919 x = replace_equiv_address_nv (orig_x, x);
16920 return x;
16923 if (GET_CODE (x) == CONST
16924 && GET_CODE (XEXP (x, 0)) == UNSPEC
16925 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
16926 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
16927 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
16929 x = XVECEXP (XEXP (x, 0), 0, 0);
16930 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
16932 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
16933 if (x == NULL_RTX)
16934 return orig_x;
16936 return x;
16939 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
16940 return ix86_delegitimize_tls_address (orig_x);
16942 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
16943 and -mcmodel=medium -fpic. */
16946 if (GET_CODE (x) != PLUS
16947 || GET_CODE (XEXP (x, 1)) != CONST)
16948 return ix86_delegitimize_tls_address (orig_x);
16950 if (ix86_pic_register_p (XEXP (x, 0)))
16951 /* %ebx + GOT/GOTOFF */
16953 else if (GET_CODE (XEXP (x, 0)) == PLUS)
16955 /* %ebx + %reg * scale + GOT/GOTOFF */
16956 reg_addend = XEXP (x, 0);
16957 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
16958 reg_addend = XEXP (reg_addend, 1);
16959 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
16960 reg_addend = XEXP (reg_addend, 0);
16961 else
16963 reg_addend = NULL_RTX;
16964 addend = XEXP (x, 0);
16967 else
16968 addend = XEXP (x, 0);
16970 x = XEXP (XEXP (x, 1), 0);
16971 if (GET_CODE (x) == PLUS
16972 && CONST_INT_P (XEXP (x, 1)))
16974 const_addend = XEXP (x, 1);
16975 x = XEXP (x, 0);
16978 if (GET_CODE (x) == UNSPEC
16979 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
16980 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
16981 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
16982 && !MEM_P (orig_x) && !addend)))
16983 result = XVECEXP (x, 0, 0);
16985 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
16986 && !MEM_P (orig_x))
16987 result = XVECEXP (x, 0, 0);
16989 if (! result)
16990 return ix86_delegitimize_tls_address (orig_x);
16992 /* For (PLUS something CONST_INT) both find_base_{value,term} just
16993 recurse on the first operand. */
16994 if (const_addend && !base_term_p)
16995 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
16996 if (reg_addend)
16997 result = gen_rtx_PLUS (Pmode, reg_addend, result);
16998 if (addend)
17000 /* If the rest of original X doesn't involve the PIC register, add
17001 addend and subtract pic_offset_table_rtx. This can happen e.g.
17002 for code like:
17003 leal (%ebx, %ecx, 4), %ecx
17005 movl foo@GOTOFF(%ecx), %edx
17006 in which case we return (%ecx - %ebx) + foo
17007 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17008 and reload has completed. Don't do the latter for debug,
17009 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17010 if (pic_offset_table_rtx
17011 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17012 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17013 pic_offset_table_rtx),
17014 result);
17015 else if (base_term_p
17016 && pic_offset_table_rtx
17017 && !TARGET_MACHO
17018 && !TARGET_VXWORKS_RTP)
17020 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17021 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17022 result = gen_rtx_PLUS (Pmode, tmp, result);
17024 else
17025 return orig_x;
17027 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17029 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17030 if (result == NULL_RTX)
17031 return orig_x;
17033 return result;
17036 /* The normal instantiation of the above template. */
17038 static rtx
17039 ix86_delegitimize_address (rtx x)
17041 return ix86_delegitimize_address_1 (x, false);
17044 /* If X is a machine specific address (i.e. a symbol or label being
17045 referenced as a displacement from the GOT implemented using an
17046 UNSPEC), then return the base term. Otherwise return X. */
17049 ix86_find_base_term (rtx x)
17051 rtx term;
17053 if (TARGET_64BIT)
17055 if (GET_CODE (x) != CONST)
17056 return x;
17057 term = XEXP (x, 0);
17058 if (GET_CODE (term) == PLUS
17059 && CONST_INT_P (XEXP (term, 1)))
17060 term = XEXP (term, 0);
17061 if (GET_CODE (term) != UNSPEC
17062 || (XINT (term, 1) != UNSPEC_GOTPCREL
17063 && XINT (term, 1) != UNSPEC_PCREL))
17064 return x;
17066 return XVECEXP (term, 0, 0);
17069 return ix86_delegitimize_address_1 (x, true);
17072 /* Return true if X shouldn't be emitted into the debug info.
17073 Disallow UNSPECs other than @gotoff - we can't easily emit the
17074 _GLOBAL_OFFSET_TABLE_ symbol into the .debug_info section, so we
17075 do not delegitimize it, but instead assemble it as @gotoff.
17076 Disallow a _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17077 assembles that as the _GLOBAL_OFFSET_TABLE_-. expression. */
17079 static bool
17080 ix86_const_not_ok_for_debug_p (rtx x)
17082 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17083 return true;
17085 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17086 return true;
17088 return false;
17091 static void
17092 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17093 bool fp, FILE *file)
17095 const char *suffix;
17097 if (mode == CCFPmode)
17099 code = ix86_fp_compare_code_to_integer (code);
17100 mode = CCmode;
17102 if (reverse)
17103 code = reverse_condition (code);
17105 switch (code)
17107 case EQ:
17108 gcc_assert (mode != CCGZmode);
17109 switch (mode)
17111 case E_CCAmode:
17112 suffix = "a";
17113 break;
17114 case E_CCCmode:
17115 suffix = "c";
17116 break;
17117 case E_CCOmode:
17118 suffix = "o";
17119 break;
17120 case E_CCPmode:
17121 suffix = "p";
17122 break;
17123 case E_CCSmode:
17124 suffix = "s";
17125 break;
17126 default:
17127 suffix = "e";
17128 break;
17130 break;
17131 case NE:
17132 gcc_assert (mode != CCGZmode);
17133 switch (mode)
17135 case E_CCAmode:
17136 suffix = "na";
17137 break;
17138 case E_CCCmode:
17139 suffix = "nc";
17140 break;
17141 case E_CCOmode:
17142 suffix = "no";
17143 break;
17144 case E_CCPmode:
17145 suffix = "np";
17146 break;
17147 case E_CCSmode:
17148 suffix = "ns";
17149 break;
17150 default:
17151 suffix = "ne";
17152 break;
17154 break;
17155 case GT:
17156 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17157 suffix = "g";
17158 break;
17159 case GTU:
17160 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17161 Those same assemblers have the same but opposite lossage on cmov. */
17162 if (mode == CCmode)
17163 suffix = fp ? "nbe" : "a";
17164 else
17165 gcc_unreachable ();
17166 break;
17167 case LT:
17168 switch (mode)
17170 case E_CCNOmode:
17171 case E_CCGOCmode:
17172 suffix = "s";
17173 break;
17175 case E_CCmode:
17176 case E_CCGCmode:
17177 case E_CCGZmode:
17178 suffix = "l";
17179 break;
17181 default:
17182 gcc_unreachable ();
17184 break;
17185 case LTU:
17186 if (mode == CCmode || mode == CCGZmode)
17187 suffix = "b";
17188 else if (mode == CCCmode)
17189 suffix = fp ? "b" : "c";
17190 else
17191 gcc_unreachable ();
17192 break;
17193 case GE:
17194 switch (mode)
17196 case E_CCNOmode:
17197 case E_CCGOCmode:
17198 suffix = "ns";
17199 break;
17201 case E_CCmode:
17202 case E_CCGCmode:
17203 case E_CCGZmode:
17204 suffix = "ge";
17205 break;
17207 default:
17208 gcc_unreachable ();
17210 break;
17211 case GEU:
17212 if (mode == CCmode || mode == CCGZmode)
17213 suffix = "nb";
17214 else if (mode == CCCmode)
17215 suffix = fp ? "nb" : "nc";
17216 else
17217 gcc_unreachable ();
17218 break;
17219 case LE:
17220 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17221 suffix = "le";
17222 break;
17223 case LEU:
17224 if (mode == CCmode)
17225 suffix = "be";
17226 else
17227 gcc_unreachable ();
17228 break;
17229 case UNORDERED:
17230 suffix = fp ? "u" : "p";
17231 break;
17232 case ORDERED:
17233 suffix = fp ? "nu" : "np";
17234 break;
17235 default:
17236 gcc_unreachable ();
17238 fputs (suffix, file);
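/* For instance, (GT, CCGCmode) yields "g", (LTU, CCmode) yields "b", and
   with REVERSE an EQ in plain CCmode comes out as "ne"; the FP forms pick
   the "nbe"/"nb" spellings noted in the GTU/GEU cases above.  */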
17241 /* Print the name of register X to FILE based on its machine mode and number.
17242 If CODE is 'w', pretend the mode is HImode.
17243 If CODE is 'b', pretend the mode is QImode.
17244 If CODE is 'k', pretend the mode is SImode.
17245 If CODE is 'q', pretend the mode is DImode.
17246 If CODE is 'x', pretend the mode is V4SFmode.
17247 If CODE is 't', pretend the mode is V8SFmode.
17248 If CODE is 'g', pretend the mode is V16SFmode.
17249 If CODE is 'h', pretend the reg is the 'high' byte register.
17250 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
17251 If CODE is 'd', duplicate the operand for AVX instruction.
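/* As a rough illustration: for hard register %eax the codes 'b', 'w', 'k'
   and 'h' print "al", "ax", "eax" and "ah", and 'q' prints "rax" in 64-bit
   mode; for an SSE register the codes 't' and 'g' rename "xmm5" to "ymm5"
   and "zmm5" respectively.  */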
17254 void
17255 print_reg (rtx x, int code, FILE *file)
17257 const char *reg;
17258 int msize;
17259 unsigned int regno;
17260 bool duplicated;
17262 if (ASSEMBLER_DIALECT == ASM_ATT)
17263 putc ('%', file);
17265 if (x == pc_rtx)
17267 gcc_assert (TARGET_64BIT);
17268 fputs ("rip", file);
17269 return;
17272 if (code == 'y' && STACK_TOP_P (x))
17274 fputs ("st(0)", file);
17275 return;
17278 if (code == 'w')
17279 msize = 2;
17280 else if (code == 'b')
17281 msize = 1;
17282 else if (code == 'k')
17283 msize = 4;
17284 else if (code == 'q')
17285 msize = 8;
17286 else if (code == 'h')
17287 msize = 0;
17288 else if (code == 'x')
17289 msize = 16;
17290 else if (code == 't')
17291 msize = 32;
17292 else if (code == 'g')
17293 msize = 64;
17294 else
17295 msize = GET_MODE_SIZE (GET_MODE (x));
17297 regno = REGNO (x);
17299 if (regno == ARG_POINTER_REGNUM
17300 || regno == FRAME_POINTER_REGNUM
17301 || regno == FPSR_REG
17302 || regno == FPCR_REG)
17304 output_operand_lossage
17305 ("invalid use of register '%s'", reg_names[regno]);
17306 return;
17308 else if (regno == FLAGS_REG)
17310 output_operand_lossage ("invalid use of asm flag output");
17311 return;
17314 duplicated = code == 'd' && TARGET_AVX;
17316 switch (msize)
17318 case 16:
17319 case 12:
17320 case 8:
17321 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17322 warning (0, "unsupported size for integer register");
17323 /* FALLTHRU */
17324 case 4:
17325 if (LEGACY_INT_REGNO_P (regno))
17326 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17327 /* FALLTHRU */
17328 case 2:
17329 normal:
17330 reg = hi_reg_name[regno];
17331 break;
17332 case 1:
17333 if (regno >= ARRAY_SIZE (qi_reg_name))
17334 goto normal;
17335 if (!ANY_QI_REGNO_P (regno))
17336 error ("unsupported size for integer register");
17337 reg = qi_reg_name[regno];
17338 break;
17339 case 0:
17340 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17341 goto normal;
17342 reg = qi_high_reg_name[regno];
17343 break;
17344 case 32:
17345 case 64:
17346 if (SSE_REGNO_P (regno))
17348 gcc_assert (!duplicated);
17349 putc (msize == 32 ? 'y' : 'z', file);
17350 reg = hi_reg_name[regno] + 1;
17351 break;
17353 goto normal;
17354 default:
17355 gcc_unreachable ();
17358 fputs (reg, file);
17360 /* Irritatingly, AMD extended registers use a
17361 different naming convention: "r%d[bwd]". */
17362 if (REX_INT_REGNO_P (regno))
17364 gcc_assert (TARGET_64BIT);
17365 switch (msize)
17367 case 0:
17368 error ("extended registers have no high halves");
17369 break;
17370 case 1:
17371 putc ('b', file);
17372 break;
17373 case 2:
17374 putc ('w', file);
17375 break;
17376 case 4:
17377 putc ('d', file);
17378 break;
17379 case 8:
17380 /* no suffix */
17381 break;
17382 default:
17383 error ("unsupported operand size for extended register");
17384 break;
17386 return;
17389 if (duplicated)
17391 if (ASSEMBLER_DIALECT == ASM_ATT)
17392 fprintf (file, ", %%%s", reg);
17393 else
17394 fprintf (file, ", %s", reg);
17398 /* Meaning of CODE:
17399 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17400 C -- print opcode suffix for set/cmov insn.
17401 c -- like C, but print reversed condition
17402 F,f -- likewise, but for floating-point.
17403 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17404 otherwise nothing
17405 R -- print embedded rounding and sae.
17406 r -- print only sae.
17407 z -- print the opcode suffix for the size of the current operand.
17408 Z -- likewise, with special suffixes for x87 instructions.
17409 * -- print a star (in certain assembler syntax)
17410 A -- print an absolute memory reference.
17411 E -- print address with DImode register names if TARGET_64BIT.
17412 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17413 s -- print a shift double count, followed by the assembler's argument
17414 delimiter.
17415 b -- print the QImode name of the register for the indicated operand.
17416 %b0 would print %al if operands[0] is reg 0.
17417 w -- likewise, print the HImode name of the register.
17418 k -- likewise, print the SImode name of the register.
17419 q -- likewise, print the DImode name of the register.
17420 x -- likewise, print the V4SFmode name of the register.
17421 t -- likewise, print the V8SFmode name of the register.
17422 g -- likewise, print the V16SFmode name of the register.
17423 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17424 y -- print "st(0)" instead of "st" as a register.
17425 d -- print duplicated register operand for AVX instruction.
17426 D -- print condition for SSE cmp instruction.
17427 P -- if PIC, print an @PLT suffix.
17428 p -- print raw symbol name.
17429 X -- don't print any sort of PIC '@' suffix for a symbol.
17430 & -- print some in-use local-dynamic symbol name.
17431 H -- print a memory address offset by 8; used for sse high-parts
17432 Y -- print condition for XOP pcom* instruction.
17433 + -- print a branch hint as 'cs' or 'ds' prefix
17434 ; -- print a semicolon (after prefixes due to bug in older gas).
17435 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17436 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17437 ! -- print MPX prefix for jxx/call/ret instructions if required.
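/* As a rough example: in a hypothetical template "add%z0\t{%2, %0|%0, %2}"
   the %z0 expands to the "l" suffix when operands[0] is SImode, while %b1
   would print "%al" if operands[1] is register 0; under Intel syntax a MEM
   operand printed with 'k' gets a "DWORD PTR" size prefix.  */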
17440 void
17441 ix86_print_operand (FILE *file, rtx x, int code)
17443 if (code)
17445 switch (code)
17447 case 'A':
17448 switch (ASSEMBLER_DIALECT)
17450 case ASM_ATT:
17451 putc ('*', file);
17452 break;
17454 case ASM_INTEL:
17455 /* Intel syntax. For absolute addresses, registers should not
17456 be surrounded by brackets. */
17457 if (!REG_P (x))
17459 putc ('[', file);
17460 ix86_print_operand (file, x, 0);
17461 putc (']', file);
17462 return;
17464 break;
17466 default:
17467 gcc_unreachable ();
17470 ix86_print_operand (file, x, 0);
17471 return;
17473 case 'E':
17474 /* Wrap address in an UNSPEC to declare special handling. */
17475 if (TARGET_64BIT)
17476 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17478 output_address (VOIDmode, x);
17479 return;
17481 case 'L':
17482 if (ASSEMBLER_DIALECT == ASM_ATT)
17483 putc ('l', file);
17484 return;
17486 case 'W':
17487 if (ASSEMBLER_DIALECT == ASM_ATT)
17488 putc ('w', file);
17489 return;
17491 case 'B':
17492 if (ASSEMBLER_DIALECT == ASM_ATT)
17493 putc ('b', file);
17494 return;
17496 case 'Q':
17497 if (ASSEMBLER_DIALECT == ASM_ATT)
17498 putc ('l', file);
17499 return;
17501 case 'S':
17502 if (ASSEMBLER_DIALECT == ASM_ATT)
17503 putc ('s', file);
17504 return;
17506 case 'T':
17507 if (ASSEMBLER_DIALECT == ASM_ATT)
17508 putc ('t', file);
17509 return;
17511 case 'O':
17512 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17513 if (ASSEMBLER_DIALECT != ASM_ATT)
17514 return;
17516 switch (GET_MODE_SIZE (GET_MODE (x)))
17518 case 2:
17519 putc ('w', file);
17520 break;
17522 case 4:
17523 putc ('l', file);
17524 break;
17526 case 8:
17527 putc ('q', file);
17528 break;
17530 default:
17531 output_operand_lossage ("invalid operand size for operand "
17532 "code 'O'");
17533 return;
17536 putc ('.', file);
17537 #endif
17538 return;
17540 case 'z':
17541 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17543 /* Opcodes don't get size suffixes when using Intel syntax. */
17544 if (ASSEMBLER_DIALECT == ASM_INTEL)
17545 return;
17547 switch (GET_MODE_SIZE (GET_MODE (x)))
17549 case 1:
17550 putc ('b', file);
17551 return;
17553 case 2:
17554 putc ('w', file);
17555 return;
17557 case 4:
17558 putc ('l', file);
17559 return;
17561 case 8:
17562 putc ('q', file);
17563 return;
17565 default:
17566 output_operand_lossage ("invalid operand size for operand "
17567 "code 'z'");
17568 return;
17572 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17573 warning (0, "non-integer operand used with operand code 'z'");
17574 /* FALLTHRU */
17576 case 'Z':
17577 /* 387 opcodes don't get size suffixes when using Intel syntax. */
17578 if (ASSEMBLER_DIALECT == ASM_INTEL)
17579 return;
17581 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17583 switch (GET_MODE_SIZE (GET_MODE (x)))
17585 case 2:
17586 #ifdef HAVE_AS_IX86_FILDS
17587 putc ('s', file);
17588 #endif
17589 return;
17591 case 4:
17592 putc ('l', file);
17593 return;
17595 case 8:
17596 #ifdef HAVE_AS_IX86_FILDQ
17597 putc ('q', file);
17598 #else
17599 fputs ("ll", file);
17600 #endif
17601 return;
17603 default:
17604 break;
17607 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17609 /* 387 opcodes don't get size suffixes
17610 if the operands are registers. */
17611 if (STACK_REG_P (x))
17612 return;
17614 switch (GET_MODE_SIZE (GET_MODE (x)))
17616 case 4:
17617 putc ('s', file);
17618 return;
17620 case 8:
17621 putc ('l', file);
17622 return;
17624 case 12:
17625 case 16:
17626 putc ('t', file);
17627 return;
17629 default:
17630 break;
17633 else
17635 output_operand_lossage ("invalid operand type used with "
17636 "operand code 'Z'");
17637 return;
17640 output_operand_lossage ("invalid operand size for operand code 'Z'");
17641 return;
17643 case 'd':
17644 case 'b':
17645 case 'w':
17646 case 'k':
17647 case 'q':
17648 case 'h':
17649 case 't':
17650 case 'g':
17651 case 'y':
17652 case 'x':
17653 case 'X':
17654 case 'P':
17655 case 'p':
17656 break;
17658 case 's':
17659 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17661 ix86_print_operand (file, x, 0);
17662 fputs (", ", file);
17664 return;
17666 case 'Y':
17667 switch (GET_CODE (x))
17669 case NE:
17670 fputs ("neq", file);
17671 break;
17672 case EQ:
17673 fputs ("eq", file);
17674 break;
17675 case GE:
17676 case GEU:
17677 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17678 break;
17679 case GT:
17680 case GTU:
17681 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17682 break;
17683 case LE:
17684 case LEU:
17685 fputs ("le", file);
17686 break;
17687 case LT:
17688 case LTU:
17689 fputs ("lt", file);
17690 break;
17691 case UNORDERED:
17692 fputs ("unord", file);
17693 break;
17694 case ORDERED:
17695 fputs ("ord", file);
17696 break;
17697 case UNEQ:
17698 fputs ("ueq", file);
17699 break;
17700 case UNGE:
17701 fputs ("nlt", file);
17702 break;
17703 case UNGT:
17704 fputs ("nle", file);
17705 break;
17706 case UNLE:
17707 fputs ("ule", file);
17708 break;
17709 case UNLT:
17710 fputs ("ult", file);
17711 break;
17712 case LTGT:
17713 fputs ("une", file);
17714 break;
17715 default:
17716 output_operand_lossage ("operand is not a condition code, "
17717 "invalid operand code 'Y'");
17718 return;
17720 return;
17722 case 'D':
17723 /* A little bit of brain damage here. The SSE compare instructions
17724 use completely different names for the comparisons than the
17725 fp conditional moves do. */
17726 switch (GET_CODE (x))
17728 case UNEQ:
17729 if (TARGET_AVX)
17731 fputs ("eq_us", file);
17732 break;
17734 /* FALLTHRU */
17735 case EQ:
17736 fputs ("eq", file);
17737 break;
17738 case UNLT:
17739 if (TARGET_AVX)
17741 fputs ("nge", file);
17742 break;
17744 /* FALLTHRU */
17745 case LT:
17746 fputs ("lt", file);
17747 break;
17748 case UNLE:
17749 if (TARGET_AVX)
17751 fputs ("ngt", file);
17752 break;
17754 /* FALLTHRU */
17755 case LE:
17756 fputs ("le", file);
17757 break;
17758 case UNORDERED:
17759 fputs ("unord", file);
17760 break;
17761 case LTGT:
17762 if (TARGET_AVX)
17764 fputs ("neq_oq", file);
17765 break;
17767 /* FALLTHRU */
17768 case NE:
17769 fputs ("neq", file);
17770 break;
17771 case GE:
17772 if (TARGET_AVX)
17774 fputs ("ge", file);
17775 break;
17777 /* FALLTHRU */
17778 case UNGE:
17779 fputs ("nlt", file);
17780 break;
17781 case GT:
17782 if (TARGET_AVX)
17784 fputs ("gt", file);
17785 break;
17787 /* FALLTHRU */
17788 case UNGT:
17789 fputs ("nle", file);
17790 break;
17791 case ORDERED:
17792 fputs ("ord", file);
17793 break;
17794 default:
17795 output_operand_lossage ("operand is not a condition code, "
17796 "invalid operand code 'D'");
17797 return;
17799 return;
17801 case 'F':
17802 case 'f':
17803 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17804 if (ASSEMBLER_DIALECT == ASM_ATT)
17805 putc ('.', file);
17806 gcc_fallthrough ();
17807 #endif
17809 case 'C':
17810 case 'c':
17811 if (!COMPARISON_P (x))
17813 output_operand_lossage ("operand is not a condition code, "
17814 "invalid operand code '%c'", code);
17815 return;
17817 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17818 code == 'c' || code == 'f',
17819 code == 'F' || code == 'f',
17820 file);
17821 return;
17823 case 'H':
17824 if (!offsettable_memref_p (x))
17826 output_operand_lossage ("operand is not an offsettable memory "
17827 "reference, invalid operand code 'H'");
17828 return;
17830 /* It doesn't actually matter what mode we use here, as we're
17831 only going to use this for printing. */
17832 x = adjust_address_nv (x, DImode, 8);
17833 /* Output 'qword ptr' for intel assembler dialect. */
17834 if (ASSEMBLER_DIALECT == ASM_INTEL)
17835 code = 'q';
17836 break;
17838 case 'K':
17839 if (!CONST_INT_P (x))
17841 output_operand_lossage ("operand is not an integer, invalid "
17842 "operand code 'K'");
17843 return;
17846 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17847 #ifdef HAVE_AS_IX86_HLE
17848 fputs ("xacquire ", file);
17849 #else
17850 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17851 #endif
17852 else if (INTVAL (x) & IX86_HLE_RELEASE)
17853 #ifdef HAVE_AS_IX86_HLE
17854 fputs ("xrelease ", file);
17855 #else
17856 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17857 #endif
17858 /* We do not want to print the value of the operand. */
17859 return;
17861 case 'N':
17862 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17863 fputs ("{z}", file);
17864 return;
17866 case 'r':
17867 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
17869 output_operand_lossage ("operand is not a specific integer, "
17870 "invalid operand code 'r'");
17871 return;
17874 if (ASSEMBLER_DIALECT == ASM_INTEL)
17875 fputs (", ", file);
17877 fputs ("{sae}", file);
17879 if (ASSEMBLER_DIALECT == ASM_ATT)
17880 fputs (", ", file);
17882 return;
17884 case 'R':
17885 if (!CONST_INT_P (x))
17887 output_operand_lossage ("operand is not an integer, invalid "
17888 "operand code 'R'");
17889 return;
17892 if (ASSEMBLER_DIALECT == ASM_INTEL)
17893 fputs (", ", file);
17895 switch (INTVAL (x))
17897 case ROUND_NEAREST_INT | ROUND_SAE:
17898 fputs ("{rn-sae}", file);
17899 break;
17900 case ROUND_NEG_INF | ROUND_SAE:
17901 fputs ("{rd-sae}", file);
17902 break;
17903 case ROUND_POS_INF | ROUND_SAE:
17904 fputs ("{ru-sae}", file);
17905 break;
17906 case ROUND_ZERO | ROUND_SAE:
17907 fputs ("{rz-sae}", file);
17908 break;
17909 default:
17910 output_operand_lossage ("operand is not a specific integer, "
17911 "invalid operand code 'R'");
17914 if (ASSEMBLER_DIALECT == ASM_ATT)
17915 fputs (", ", file);
17917 return;
17919 case '*':
17920 if (ASSEMBLER_DIALECT == ASM_ATT)
17921 putc ('*', file);
17922 return;
17924 case '&':
17926 const char *name = get_some_local_dynamic_name ();
17927 if (name == NULL)
17928 output_operand_lossage ("'%%&' used without any "
17929 "local dynamic TLS references");
17930 else
17931 assemble_name (file, name);
17932 return;
17935 case '+':
17937 rtx x;
17939 if (!optimize
17940 || optimize_function_for_size_p (cfun)
17941 || !TARGET_BRANCH_PREDICTION_HINTS)
17942 return;
17944 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
17945 if (x)
17947 int pred_val = profile_probability::from_reg_br_prob_note
17948 (XINT (x, 0)).to_reg_br_prob_base ();
17950 if (pred_val < REG_BR_PROB_BASE * 45 / 100
17951 || pred_val > REG_BR_PROB_BASE * 55 / 100)
17953 bool taken = pred_val > REG_BR_PROB_BASE / 2;
17954 bool cputaken
17955 = final_forward_branch_p (current_output_insn) == 0;
17957 /* Emit hints only in cases where the default branch prediction
17958 heuristics would fail. */
17959 if (taken != cputaken)
17961 /* We use 3e (DS) prefix for taken branches and
17962 2e (CS) prefix for not taken branches. */
17963 if (taken)
17964 fputs ("ds ; ", file);
17965 else
17966 fputs ("cs ; ", file);
17970 return;
17973 case ';':
17974 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
17975 putc (';', file);
17976 #endif
17977 return;
17979 case '~':
17980 putc (TARGET_AVX2 ? 'i' : 'f', file);
17981 return;
17983 case '^':
17984 if (TARGET_64BIT && Pmode != word_mode)
17985 fputs ("addr32 ", file);
17986 return;
17988 case '!':
17989 if (ix86_bnd_prefixed_insn_p (current_output_insn))
17990 fputs ("bnd ", file);
17991 if (ix86_notrack_prefixed_insn_p (current_output_insn))
17992 fputs ("notrack ", file);
17993 return;
17995 default:
17996 output_operand_lossage ("invalid operand code '%c'", code);
18000 if (REG_P (x))
18001 print_reg (x, code, file);
18003 else if (MEM_P (x))
18005 rtx addr = XEXP (x, 0);
18007 /* No `byte ptr' prefix for call instructions ... */
18008 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18010 machine_mode mode = GET_MODE (x);
18011 const char *size;
18013 /* Check for explicit size override codes. */
18014 if (code == 'b')
18015 size = "BYTE";
18016 else if (code == 'w')
18017 size = "WORD";
18018 else if (code == 'k')
18019 size = "DWORD";
18020 else if (code == 'q')
18021 size = "QWORD";
18022 else if (code == 'x')
18023 size = "XMMWORD";
18024 else if (code == 't')
18025 size = "YMMWORD";
18026 else if (code == 'g')
18027 size = "ZMMWORD";
18028 else if (mode == BLKmode)
18029 /* ... or BLKmode operands, when not overridden. */
18030 size = NULL;
18031 else
18032 switch (GET_MODE_SIZE (mode))
18034 case 1: size = "BYTE"; break;
18035 case 2: size = "WORD"; break;
18036 case 4: size = "DWORD"; break;
18037 case 8: size = "QWORD"; break;
18038 case 12: size = "TBYTE"; break;
18039 case 16:
18040 if (mode == XFmode)
18041 size = "TBYTE";
18042 else
18043 size = "XMMWORD";
18044 break;
18045 case 32: size = "YMMWORD"; break;
18046 case 64: size = "ZMMWORD"; break;
18047 default:
18048 gcc_unreachable ();
18050 if (size)
18052 fputs (size, file);
18053 fputs (" PTR ", file);
18057 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18058 output_operand_lossage ("invalid constraints for operand");
18059 else
18060 ix86_print_operand_address_as
18061 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18064 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18066 long l;
18068 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18070 if (ASSEMBLER_DIALECT == ASM_ATT)
18071 putc ('$', file);
18072 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18073 if (code == 'q')
18074 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18075 (unsigned long long) (int) l);
18076 else
18077 fprintf (file, "0x%08x", (unsigned int) l);
18080 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18082 long l[2];
18084 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18086 if (ASSEMBLER_DIALECT == ASM_ATT)
18087 putc ('$', file);
18088 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18091 /* These float cases don't actually occur as immediate operands. */
18092 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18094 char dstr[30];
18096 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18097 fputs (dstr, file);
18100 else
18102 /* We have patterns that allow zero sets of memory, for instance.
18103 In 64-bit mode, we should probably support all 8-byte vectors,
18104 since we can in fact encode that into an immediate. */
18105 if (GET_CODE (x) == CONST_VECTOR)
18107 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18108 x = const0_rtx;
18111 if (code != 'P' && code != 'p')
18113 if (CONST_INT_P (x))
18115 if (ASSEMBLER_DIALECT == ASM_ATT)
18116 putc ('$', file);
18118 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18119 || GET_CODE (x) == LABEL_REF)
18121 if (ASSEMBLER_DIALECT == ASM_ATT)
18122 putc ('$', file);
18123 else
18124 fputs ("OFFSET FLAT:", file);
18127 if (CONST_INT_P (x))
18128 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18129 else if (flag_pic || MACHOPIC_INDIRECT)
18130 output_pic_addr_const (file, x, code);
18131 else
18132 output_addr_const (file, x);
18136 static bool
18137 ix86_print_operand_punct_valid_p (unsigned char code)
18139 return (code == '*' || code == '+' || code == '&' || code == ';'
18140 || code == '~' || code == '^' || code == '!');
18143 /* Print a memory operand whose address is ADDR. */
18145 static void
18146 ix86_print_operand_address_as (FILE *file, rtx addr,
18147 addr_space_t as, bool no_rip)
18149 struct ix86_address parts;
18150 rtx base, index, disp;
18151 int scale;
18152 int ok;
18153 bool vsib = false;
18154 int code = 0;
18156 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18158 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18159 gcc_assert (parts.index == NULL_RTX);
18160 parts.index = XVECEXP (addr, 0, 1);
18161 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18162 addr = XVECEXP (addr, 0, 0);
18163 vsib = true;
18165 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18167 gcc_assert (TARGET_64BIT);
18168 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18169 code = 'q';
18171 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18173 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18174 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18175 if (parts.base != NULL_RTX)
18177 parts.index = parts.base;
18178 parts.scale = 1;
18180 parts.base = XVECEXP (addr, 0, 0);
18181 addr = XVECEXP (addr, 0, 0);
18183 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18185 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18186 gcc_assert (parts.index == NULL_RTX);
18187 parts.index = XVECEXP (addr, 0, 1);
18188 addr = XVECEXP (addr, 0, 0);
18190 else
18191 ok = ix86_decompose_address (addr, &parts);
18193 gcc_assert (ok);
18195 base = parts.base;
18196 index = parts.index;
18197 disp = parts.disp;
18198 scale = parts.scale;
18200 if (ADDR_SPACE_GENERIC_P (as))
18201 as = parts.seg;
18202 else
18203 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18205 if (!ADDR_SPACE_GENERIC_P (as))
18207 const char *string;
18209 if (as == ADDR_SPACE_SEG_FS)
18210 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18211 else if (as == ADDR_SPACE_SEG_GS)
18212 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18213 else
18214 gcc_unreachable ();
18215 fputs (string, file);
18218 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18219 if (TARGET_64BIT && !base && !index && !no_rip)
18221 rtx symbol = disp;
18223 if (GET_CODE (disp) == CONST
18224 && GET_CODE (XEXP (disp, 0)) == PLUS
18225 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18226 symbol = XEXP (XEXP (disp, 0), 0);
18228 if (GET_CODE (symbol) == LABEL_REF
18229 || (GET_CODE (symbol) == SYMBOL_REF
18230 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18231 base = pc_rtx;
18234 if (!base && !index)
18237 /* A displacement-only address requires special attention. */
18237 if (CONST_INT_P (disp))
18239 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18240 fputs ("ds:", file);
18241 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18243 /* Load the external function address via the GOT slot to avoid PLT. */
18244 else if (GET_CODE (disp) == CONST
18245 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18246 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18247 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18248 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18249 output_pic_addr_const (file, disp, 0);
18250 else if (flag_pic)
18251 output_pic_addr_const (file, disp, 0);
18252 else
18253 output_addr_const (file, disp);
18255 else
18257 /* Print SImode register names to force addr32 prefix. */
18258 if (SImode_address_operand (addr, VOIDmode))
18260 if (flag_checking)
18262 gcc_assert (TARGET_64BIT);
18263 switch (GET_CODE (addr))
18265 case SUBREG:
18266 gcc_assert (GET_MODE (addr) == SImode);
18267 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18268 break;
18269 case ZERO_EXTEND:
18270 case AND:
18271 gcc_assert (GET_MODE (addr) == DImode);
18272 break;
18273 default:
18274 gcc_unreachable ();
18277 gcc_assert (!code);
18278 code = 'k';
18280 else if (code == 0
18281 && TARGET_X32
18282 && disp
18283 && CONST_INT_P (disp)
18284 && INTVAL (disp) < -16*1024*1024)
18286 /* X32 runs in 64-bit mode, where displacement, DISP, in
18287 address DISP(%r64), is encoded as 32-bit immediate sign-
18288 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18289 address is %r64 + 0xffffffffbffffd00. When %r64 <
18290 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18291 which is invalid for x32. The correct address is %r64
18292 - 0x40000300 == 0xf7ffdd64. To properly encode
18293 -0x40000300(%r64) for x32, we zero-extend negative
18294 displacement by forcing addr32 prefix which truncates
18295 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18296 zero-extend all negative displacements, including -1(%rsp).
18297 However, for small negative displacements, sign-extension
18298 won't cause overflow. We only zero-extend negative
18299 displacements if they are < -16*1024*1024, which is also used
18300 to check legitimate address displacements for PIC. */
18301 code = 'k';
18304 /* Since the upper 32 bits of RSP are always zero for x32,
18305 we can encode %esp as %rsp to avoid 0x67 prefix if
18306 there is no index register. */
18307 if (TARGET_X32 && Pmode == SImode
18308 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18309 code = 'q';
18311 if (ASSEMBLER_DIALECT == ASM_ATT)
18313 if (disp)
18315 if (flag_pic)
18316 output_pic_addr_const (file, disp, 0);
18317 else if (GET_CODE (disp) == LABEL_REF)
18318 output_asm_label (disp);
18319 else
18320 output_addr_const (file, disp);
18323 putc ('(', file);
18324 if (base)
18325 print_reg (base, code, file);
18326 if (index)
18328 putc (',', file);
18329 print_reg (index, vsib ? 0 : code, file);
18330 if (scale != 1 || vsib)
18331 fprintf (file, ",%d", scale);
18333 putc (')', file);
18335 else
18337 rtx offset = NULL_RTX;
18339 if (disp)
18341 /* Pull out the offset of a symbol; print any symbol itself. */
18342 if (GET_CODE (disp) == CONST
18343 && GET_CODE (XEXP (disp, 0)) == PLUS
18344 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18346 offset = XEXP (XEXP (disp, 0), 1);
18347 disp = gen_rtx_CONST (VOIDmode,
18348 XEXP (XEXP (disp, 0), 0));
18351 if (flag_pic)
18352 output_pic_addr_const (file, disp, 0);
18353 else if (GET_CODE (disp) == LABEL_REF)
18354 output_asm_label (disp);
18355 else if (CONST_INT_P (disp))
18356 offset = disp;
18357 else
18358 output_addr_const (file, disp);
18361 putc ('[', file);
18362 if (base)
18364 print_reg (base, code, file);
18365 if (offset)
18367 if (INTVAL (offset) >= 0)
18368 putc ('+', file);
18369 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18372 else if (offset)
18373 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18374 else
18375 putc ('0', file);
18377 if (index)
18379 putc ('+', file);
18380 print_reg (index, vsib ? 0 : code, file);
18381 if (scale != 1 || vsib)
18382 fprintf (file, "*%d", scale);
18384 putc (']', file);
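/* For example, the same address is printed as "-8(%rbp,%rax,4)" in AT&T
   syntax and roughly as "[rbp-8+rax*4]" in Intel syntax, with a segment
   override such as "%gs:" / "gs:" emitted first when a non-generic
   address space is in use.  */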
18389 static void
18390 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18392 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18395 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18397 static bool
18398 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18400 rtx op;
18402 if (GET_CODE (x) != UNSPEC)
18403 return false;
18405 op = XVECEXP (x, 0, 0);
18406 switch (XINT (x, 1))
18408 case UNSPEC_GOTOFF:
18409 output_addr_const (file, op);
18410 fputs ("@gotoff", file);
18411 break;
18412 case UNSPEC_GOTTPOFF:
18413 output_addr_const (file, op);
18414 /* FIXME: This might be @TPOFF in Sun ld. */
18415 fputs ("@gottpoff", file);
18416 break;
18417 case UNSPEC_TPOFF:
18418 output_addr_const (file, op);
18419 fputs ("@tpoff", file);
18420 break;
18421 case UNSPEC_NTPOFF:
18422 output_addr_const (file, op);
18423 if (TARGET_64BIT)
18424 fputs ("@tpoff", file);
18425 else
18426 fputs ("@ntpoff", file);
18427 break;
18428 case UNSPEC_DTPOFF:
18429 output_addr_const (file, op);
18430 fputs ("@dtpoff", file);
18431 break;
18432 case UNSPEC_GOTNTPOFF:
18433 output_addr_const (file, op);
18434 if (TARGET_64BIT)
18435 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18436 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18437 else
18438 fputs ("@gotntpoff", file);
18439 break;
18440 case UNSPEC_INDNTPOFF:
18441 output_addr_const (file, op);
18442 fputs ("@indntpoff", file);
18443 break;
18444 #if TARGET_MACHO
18445 case UNSPEC_MACHOPIC_OFFSET:
18446 output_addr_const (file, op);
18447 putc ('-', file);
18448 machopic_output_function_base_name (file);
18449 break;
18450 #endif
18452 default:
18453 return false;
18456 return true;
18459 /* Split one or more double-mode RTL references into pairs of half-mode
18460 references. The RTL can be REG, offsettable MEM, integer constant, or
18461 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18462 split and "num" is its length. lo_half and hi_half are output arrays
18463 that parallel "operands". */
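/* For example, splitting a DImode operand on 32-bit x86 yields two SImode
   halves: a MEM is split into the same address and address+4, while a REG
   or constant goes through simplify_gen_subreg at byte offsets 0 and 4
   (0 and 8 for the TImode -> DImode case).  */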
18465 void
18466 split_double_mode (machine_mode mode, rtx operands[],
18467 int num, rtx lo_half[], rtx hi_half[])
18469 machine_mode half_mode;
18470 unsigned int byte;
18472 switch (mode)
18474 case E_TImode:
18475 half_mode = DImode;
18476 break;
18477 case E_DImode:
18478 half_mode = SImode;
18479 break;
18480 default:
18481 gcc_unreachable ();
18484 byte = GET_MODE_SIZE (half_mode);
18486 while (num--)
18488 rtx op = operands[num];
18490 /* simplify_subreg refuses to split volatile memory addresses,
18491 but we still have to handle them. */
18492 if (MEM_P (op))
18494 lo_half[num] = adjust_address (op, half_mode, 0);
18495 hi_half[num] = adjust_address (op, half_mode, byte);
18497 else
18499 lo_half[num] = simplify_gen_subreg (half_mode, op,
18500 GET_MODE (op) == VOIDmode
18501 ? mode : GET_MODE (op), 0);
18502 hi_half[num] = simplify_gen_subreg (half_mode, op,
18503 GET_MODE (op) == VOIDmode
18504 ? mode : GET_MODE (op), byte);
18509 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18510 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18511 is the expression of the binary operation. The output may either be
18512 emitted here, or returned to the caller, like all output_* functions.
18514 There is no guarantee that the operands are the same mode, as they
18515 might be within FLOAT or FLOAT_EXTEND expressions. */
18517 #ifndef SYSV386_COMPAT
18518 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18519 wants to fix the assemblers because that causes incompatibility
18520 with gcc. No-one wants to fix gcc because that causes
18521 incompatibility with assemblers... You can use the option of
18522 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18523 #define SYSV386_COMPAT 1
18524 #endif
18526 const char *
18527 output_387_binary_op (rtx_insn *insn, rtx *operands)
18529 static char buf[40];
18530 const char *p;
18531 bool is_sse
18532 = (SSE_REG_P (operands[0])
18533 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
18535 if (is_sse)
18536 p = "%v";
18537 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18538 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18539 p = "fi";
18540 else
18541 p = "f";
18543 strcpy (buf, p);
18545 switch (GET_CODE (operands[3]))
18547 case PLUS:
18548 p = "add"; break;
18549 case MINUS:
18550 p = "sub"; break;
18551 case MULT:
18552 p = "mul"; break;
18553 case DIV:
18554 p = "div"; break;
18555 default:
18556 gcc_unreachable ();
18559 strcat (buf, p);
18561 if (is_sse)
18563 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
18564 strcat (buf, p);
18566 if (TARGET_AVX)
18567 p = "\t{%2, %1, %0|%0, %1, %2}";
18568 else
18569 p = "\t{%2, %0|%0, %2}";
18571 strcat (buf, p);
18572 return buf;
18575 /* Even if we do not want to check the inputs, this documents the input
18576 constraints, which helps in understanding the following code. */
18577 if (flag_checking)
18579 if (STACK_REG_P (operands[0])
18580 && ((REG_P (operands[1])
18581 && REGNO (operands[0]) == REGNO (operands[1])
18582 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18583 || (REG_P (operands[2])
18584 && REGNO (operands[0]) == REGNO (operands[2])
18585 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18586 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18587 ; /* ok */
18588 else
18589 gcc_unreachable ();
18592 switch (GET_CODE (operands[3]))
18594 case MULT:
18595 case PLUS:
18596 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18597 std::swap (operands[1], operands[2]);
18599 /* We know operands[0] == operands[1]. */
18601 if (MEM_P (operands[2]))
18603 p = "%Z2\t%2";
18604 break;
18607 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18609 if (STACK_TOP_P (operands[0]))
18610 /* How is it that we are storing to a dead operand[2]?
18611 Well, presumably operands[1] is dead too. We can't
18612 store the result to st(0) as st(0) gets popped on this
18613 instruction. Instead store to operands[2] (which I
18614 think has to be st(1)). st(1) will be popped later.
18615 gcc <= 2.8.1 didn't have this check and generated
18616 assembly code that the Unixware assembler rejected. */
18617 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18618 else
18619 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18620 break;
18623 if (STACK_TOP_P (operands[0]))
18624 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18625 else
18626 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18627 break;
18629 case MINUS:
18630 case DIV:
18631 if (MEM_P (operands[1]))
18633 p = "r%Z1\t%1";
18634 break;
18637 if (MEM_P (operands[2]))
18639 p = "%Z2\t%2";
18640 break;
18643 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18645 #if SYSV386_COMPAT
18646 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18647 derived assemblers, confusingly reverse the direction of
18648 the operation for fsub{r} and fdiv{r} when the
18649 destination register is not st(0). The Intel assembler
18650 doesn't have this brain damage. Read !SYSV386_COMPAT to
18651 figure out what the hardware really does. */
18652 if (STACK_TOP_P (operands[0]))
18653 p = "{p\t%0, %2|rp\t%2, %0}";
18654 else
18655 p = "{rp\t%2, %0|p\t%0, %2}";
18656 #else
18657 if (STACK_TOP_P (operands[0]))
18658 /* As above for fmul/fadd, we can't store to st(0). */
18659 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18660 else
18661 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18662 #endif
18663 break;
18666 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18668 #if SYSV386_COMPAT
18669 if (STACK_TOP_P (operands[0]))
18670 p = "{rp\t%0, %1|p\t%1, %0}";
18671 else
18672 p = "{p\t%1, %0|rp\t%0, %1}";
18673 #else
18674 if (STACK_TOP_P (operands[0]))
18675 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18676 else
18677 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18678 #endif
18679 break;
18682 if (STACK_TOP_P (operands[0]))
18684 if (STACK_TOP_P (operands[1]))
18685 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18686 else
18687 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18688 break;
18690 else if (STACK_TOP_P (operands[1]))
18692 #if SYSV386_COMPAT
18693 p = "{\t%1, %0|r\t%0, %1}";
18694 #else
18695 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18696 #endif
18698 else
18700 #if SYSV386_COMPAT
18701 p = "{r\t%2, %0|\t%0, %2}";
18702 #else
18703 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18704 #endif
18706 break;
18708 default:
18709 gcc_unreachable ();
18712 strcat (buf, p);
18713 return buf;
18716 /* Return needed mode for entity in optimize_mode_switching pass. */
18718 static int
18719 ix86_dirflag_mode_needed (rtx_insn *insn)
18721 if (CALL_P (insn))
18723 if (cfun->machine->func_type == TYPE_NORMAL)
18724 return X86_DIRFLAG_ANY;
18725 else
18726 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18727 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18730 if (recog_memoized (insn) < 0)
18731 return X86_DIRFLAG_ANY;
18733 if (get_attr_type (insn) == TYPE_STR)
18735 /* Emit cld instruction if stringops are used in the function. */
18736 if (cfun->machine->func_type == TYPE_NORMAL)
18737 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18738 else
18739 return X86_DIRFLAG_RESET;
18742 return X86_DIRFLAG_ANY;
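/* For example, in an interrupt or exception handler a string instruction
   (attribute type "str") requests X86_DIRFLAG_RESET, so the mode-switching
   pass inserts a "cld" before it, since the DF state at handler entry is
   unknown; normal functions only do this under -mcld and otherwise rely on
   the ABI keeping DF cleared.  */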
18745 /* Check if a 256bit or 512bit AVX register is referenced inside of EXP. */
18747 static bool
18748 ix86_check_avx_upper_register (const_rtx exp)
18750 if (SUBREG_P (exp))
18751 exp = SUBREG_REG (exp);
18753 return (REG_P (exp)
18754 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
18755 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
18758 /* Return needed mode for entity in optimize_mode_switching pass. */
18760 static int
18761 ix86_avx_u128_mode_needed (rtx_insn *insn)
18763 if (CALL_P (insn))
18765 rtx link;
18767 /* Needed mode is set to AVX_U128_CLEAN if there are
18768 no 256bit or 512bit modes used in function arguments. */
18769 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18770 link;
18771 link = XEXP (link, 1))
18773 if (GET_CODE (XEXP (link, 0)) == USE)
18775 rtx arg = XEXP (XEXP (link, 0), 0);
18777 if (ix86_check_avx_upper_register (arg))
18778 return AVX_U128_DIRTY;
18782 return AVX_U128_CLEAN;
18785 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
18786 Hardware changes state only when a 256bit register is written to,
18787 but we need to prevent the compiler from moving the optimal insertion
18788 point above an eventual read from a 256bit or 512bit register. */
18789 subrtx_iterator::array_type array;
18790 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18791 if (ix86_check_avx_upper_register (*iter))
18792 return AVX_U128_DIRTY;
18794 return AVX_U128_ANY;
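/* For example, after a loop that writes %ymm registers, a call whose
   arguments are all scalar or 128-bit vectors has needed mode
   AVX_U128_CLEAN, so the pass emits a vzeroupper before the call; this is
   what avoids the SSE/AVX transition penalty when the callee uses legacy
   SSE code.  */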
18797 /* Return mode that i387 must be switched into
18798 prior to the execution of insn. */
18800 static int
18801 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18803 enum attr_i387_cw mode;
18805 /* The mode UNINITIALIZED is used to store the control word after a
18806 function call or ASM pattern. The mode ANY specifies that the function
18807 has no requirements on the control word and makes no changes in the
18808 bits we are interested in. */
18810 if (CALL_P (insn)
18811 || (NONJUMP_INSN_P (insn)
18812 && (asm_noperands (PATTERN (insn)) >= 0
18813 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18814 return I387_CW_UNINITIALIZED;
18816 if (recog_memoized (insn) < 0)
18817 return I387_CW_ANY;
18819 mode = get_attr_i387_cw (insn);
18821 switch (entity)
18823 case I387_TRUNC:
18824 if (mode == I387_CW_TRUNC)
18825 return mode;
18826 break;
18828 case I387_FLOOR:
18829 if (mode == I387_CW_FLOOR)
18830 return mode;
18831 break;
18833 case I387_CEIL:
18834 if (mode == I387_CW_CEIL)
18835 return mode;
18836 break;
18838 case I387_MASK_PM:
18839 if (mode == I387_CW_MASK_PM)
18840 return mode;
18841 break;
18843 default:
18844 gcc_unreachable ();
18847 return I387_CW_ANY;
18850 /* Return mode that entity must be switched into
18851 prior to the execution of insn. */
18853 static int
18854 ix86_mode_needed (int entity, rtx_insn *insn)
18856 switch (entity)
18858 case X86_DIRFLAG:
18859 return ix86_dirflag_mode_needed (insn);
18860 case AVX_U128:
18861 return ix86_avx_u128_mode_needed (insn);
18862 case I387_TRUNC:
18863 case I387_FLOOR:
18864 case I387_CEIL:
18865 case I387_MASK_PM:
18866 return ix86_i387_mode_needed (entity, insn);
18867 default:
18868 gcc_unreachable ();
18870 return 0;
18873 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
18875 static void
18876 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
18878 if (ix86_check_avx_upper_register (dest))
18880 bool *used = (bool *) data;
18881 *used = true;
18885 /* Calculate mode of upper 128bit AVX registers after the insn. */
18887 static int
18888 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
18890 rtx pat = PATTERN (insn);
18892 if (vzeroupper_operation (pat, VOIDmode)
18893 || vzeroall_operation (pat, VOIDmode))
18894 return AVX_U128_CLEAN;
18896 /* We know that the state is clean after a CALL insn if no 256bit
18897 or 512bit register is used for the function return value. */
18898 if (CALL_P (insn))
18900 bool avx_upper_reg_found = false;
18901 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
18903 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
18906 /* Otherwise, return current mode. Remember that if insn
18907 references AVX 256bit or 512bit registers, the mode was already
18908 changed to DIRTY from MODE_NEEDED. */
18909 return mode;
18912 /* Return the mode that an insn results in. */
18914 static int
18915 ix86_mode_after (int entity, int mode, rtx_insn *insn)
18917 switch (entity)
18919 case X86_DIRFLAG:
18920 return mode;
18921 case AVX_U128:
18922 return ix86_avx_u128_mode_after (mode, insn);
18923 case I387_TRUNC:
18924 case I387_FLOOR:
18925 case I387_CEIL:
18926 case I387_MASK_PM:
18927 return mode;
18928 default:
18929 gcc_unreachable ();
18933 static int
18934 ix86_dirflag_mode_entry (void)
18936 /* For TARGET_CLD or in the interrupt handler we can't assume
18937 direction flag state at function entry. */
18938 if (TARGET_CLD
18939 || cfun->machine->func_type != TYPE_NORMAL)
18940 return X86_DIRFLAG_ANY;
18942 return X86_DIRFLAG_RESET;
18945 static int
18946 ix86_avx_u128_mode_entry (void)
18948 tree arg;
18950 /* Entry mode is set to AVX_U128_DIRTY if there are
18951 256bit or 512bit modes used in function arguments. */
18952 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
18953 arg = TREE_CHAIN (arg))
18955 rtx incoming = DECL_INCOMING_RTL (arg);
18957 if (incoming && ix86_check_avx_upper_register (incoming))
18958 return AVX_U128_DIRTY;
18961 return AVX_U128_CLEAN;
18964 /* Return a mode that ENTITY is assumed to be
18965 switched to at function entry. */
18967 static int
18968 ix86_mode_entry (int entity)
18970 switch (entity)
18972 case X86_DIRFLAG:
18973 return ix86_dirflag_mode_entry ();
18974 case AVX_U128:
18975 return ix86_avx_u128_mode_entry ();
18976 case I387_TRUNC:
18977 case I387_FLOOR:
18978 case I387_CEIL:
18979 case I387_MASK_PM:
18980 return I387_CW_ANY;
18981 default:
18982 gcc_unreachable ();
18986 static int
18987 ix86_avx_u128_mode_exit (void)
18989 rtx reg = crtl->return_rtx;
18991 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
18992 or 512bit modes used in the function return register. */
18993 if (reg && ix86_check_avx_upper_register (reg))
18994 return AVX_U128_DIRTY;
18996 return AVX_U128_CLEAN;
18999 /* Return a mode that ENTITY is assumed to be
19000 switched to at function exit. */
19002 static int
19003 ix86_mode_exit (int entity)
19005 switch (entity)
19007 case X86_DIRFLAG:
19008 return X86_DIRFLAG_ANY;
19009 case AVX_U128:
19010 return ix86_avx_u128_mode_exit ();
19011 case I387_TRUNC:
19012 case I387_FLOOR:
19013 case I387_CEIL:
19014 case I387_MASK_PM:
19015 return I387_CW_ANY;
19016 default:
19017 gcc_unreachable ();
19021 static int
19022 ix86_mode_priority (int, int n)
19024 return n;
19027 /* Output code to initialize control word copies used by trunc?f?i and
19028 rounding patterns. CURRENT_MODE is set to current control word,
19029 while NEW_MODE is set to new control word. */
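/* Concretely, bits 11:10 of the control word (mask 0x0c00) hold the
   rounding control: 00 = to nearest, 01 = down (floor), 10 = up (ceil),
   11 = toward zero (trunc), which is why the code below ORs in 0x0400,
   0x0800 or 0x0c00 (or inserts the 0x4/0x8/0xc nibble); bit 5 (mask
   0x0020) masks the precision exception for nearbyint.  */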
19031 static void
19032 emit_i387_cw_initialization (int mode)
19034 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19035 rtx new_mode;
19037 enum ix86_stack_slot slot;
19039 rtx reg = gen_reg_rtx (HImode);
19041 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19042 emit_move_insn (reg, copy_rtx (stored_mode));
19044 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19045 || optimize_insn_for_size_p ())
19047 switch (mode)
19049 case I387_CW_TRUNC:
19050 /* round toward zero (truncate) */
19051 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19052 slot = SLOT_CW_TRUNC;
19053 break;
19055 case I387_CW_FLOOR:
19056 /* round down toward -oo */
19057 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19058 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19059 slot = SLOT_CW_FLOOR;
19060 break;
19062 case I387_CW_CEIL:
19063 /* round up toward +oo */
19064 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19065 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19066 slot = SLOT_CW_CEIL;
19067 break;
19069 case I387_CW_MASK_PM:
19070 /* mask precision exception for nearbyint() */
19071 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19072 slot = SLOT_CW_MASK_PM;
19073 break;
19075 default:
19076 gcc_unreachable ();
19079 else
19081 switch (mode)
19083 case I387_CW_TRUNC:
19084 /* round toward zero (truncate) */
19085 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19086 slot = SLOT_CW_TRUNC;
19087 break;
19089 case I387_CW_FLOOR:
19090 /* round down toward -oo */
19091 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19092 slot = SLOT_CW_FLOOR;
19093 break;
19095 case I387_CW_CEIL:
19096 /* round up toward +oo */
19097 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19098 slot = SLOT_CW_CEIL;
19099 break;
19101 case I387_CW_MASK_PM:
19102 /* mask precision exception for nearbyint() */
19103 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19104 slot = SLOT_CW_MASK_PM;
19105 break;
19107 default:
19108 gcc_unreachable ();
19112 gcc_assert (slot < MAX_386_STACK_LOCALS);
19114 new_mode = assign_386_stack_local (HImode, slot);
19115 emit_move_insn (new_mode, reg);
19118 /* Emit vzeroupper. */
19120 void
19121 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19123 int i;
19125 /* Cancel automatic vzeroupper insertion if there are
19126 live call-saved SSE registers at the insertion point. */
19128 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19129 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19130 return;
19132 if (TARGET_64BIT)
19133 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19134 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19135 return;
19137 emit_insn (gen_avx_vzeroupper ());
19142 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19143 is the set of hard registers live at the point where the insn(s)
19144 are to be inserted. */
19146 static void
19147 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19148 HARD_REG_SET regs_live)
19150 switch (entity)
19152 case X86_DIRFLAG:
19153 if (mode == X86_DIRFLAG_RESET)
19154 emit_insn (gen_cld ());
19155 break;
19156 case AVX_U128:
19157 if (mode == AVX_U128_CLEAN)
19158 ix86_avx_emit_vzeroupper (regs_live);
19159 break;
19160 case I387_TRUNC:
19161 case I387_FLOOR:
19162 case I387_CEIL:
19163 case I387_MASK_PM:
19164 if (mode != I387_CW_ANY
19165 && mode != I387_CW_UNINITIALIZED)
19166 emit_i387_cw_initialization (mode);
19167 break;
19168 default:
19169 gcc_unreachable ();
19173 /* Output code for INSN to convert a float to a signed int. OPERANDS
19174 are the insn operands. The output may be [HSD]Imode and the input
19175 operand may be [SDX]Fmode. */
19177 const char *
19178 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19180 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19181 bool dimode_p = GET_MODE (operands[0]) == DImode;
19182 int round_mode = get_attr_i387_cw (insn);
19184 static char buf[40];
19185 const char *p;
19187 /* Jump through a hoop or two for DImode, since the hardware has no
19188 non-popping instruction. We used to do this a different way, but
19189 that was somewhat fragile and broke with post-reload splitters. */
19190 if ((dimode_p || fisttp) && !stack_top_dies)
19191 output_asm_insn ("fld\t%y1", operands);
19193 gcc_assert (STACK_TOP_P (operands[1]));
19194 gcc_assert (MEM_P (operands[0]));
19195 gcc_assert (GET_MODE (operands[1]) != TFmode);
19197 if (fisttp)
19198 return "fisttp%Z0\t%0";
19200 strcpy (buf, "fist");
19202 if (round_mode != I387_CW_ANY)
19203 output_asm_insn ("fldcw\t%3", operands);
19205 p = "p%Z0\t%0";
19206 strcat (buf, p + !(stack_top_dies || dimode_p));
19208 output_asm_insn (buf, operands);
19210 if (round_mode != I387_CW_ANY)
19211 output_asm_insn ("fldcw\t%2", operands);
19213 return "";
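/* As an illustration of output_fix_trunc: for a DImode destination whose
   x87 stack top does not die, the output is roughly
       fld    %y1        ; duplicate st(0), since only a popping store exists
       fldcw  %3         ; switch to the truncating control word (if needed)
       fistp  %0         ; with the mode suffix added by %Z0
       fldcw  %2         ; restore the original control word
   whereas the SSE3 fisttp form needs no control-word switching at all.  */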
19216 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19217 have the values zero or one, indicates the ffreep insn's operand
19218 from the OPERANDS array. */
19220 static const char *
19221 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19223 if (TARGET_USE_FFREEP)
19224 #ifdef HAVE_AS_IX86_FFREEP
19225 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19226 #else
19228 static char retval[32];
19229 int regno = REGNO (operands[opno]);
19231 gcc_assert (STACK_REGNO_P (regno));
19233 regno -= FIRST_STACK_REG;
19235 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19236 return retval;
19238 #endif
19240 return opno ? "fstp\t%y1" : "fstp\t%y0";
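/* Note on the fallback above: ffreep %st(N) encodes as the two bytes
   DF C0+N, so ASM_SHORT "0xc%ddf" emits exactly that opcode (the 16-bit
   word is stored little-endian) for assemblers lacking the mnemonic.  */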
19244 /* Output code for INSN to compare OPERANDS. EFLAGS_P is true when fcomi
19245 should be used. UNORDERED_P is true when fucom should be used. */
19247 const char *
19248 output_fp_compare (rtx_insn *insn, rtx *operands,
19249 bool eflags_p, bool unordered_p)
19251 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19252 bool stack_top_dies;
19254 static char buf[40];
19255 const char *p;
19257 gcc_assert (STACK_TOP_P (xops[0]));
19259 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19261 if (eflags_p)
19263 p = unordered_p ? "fucomi" : "fcomi";
19264 strcpy (buf, p);
19266 p = "p\t{%y1, %0|%0, %y1}";
19267 strcat (buf, p + !stack_top_dies);
19269 return buf;
19272 if (STACK_REG_P (xops[1])
19273 && stack_top_dies
19274 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19276 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19278 /* If the top of the 387 stack dies, and the other operand
19279 is also a stack register that dies, then this must be a
19280 `fcompp' float compare. */
19281 p = unordered_p ? "fucompp" : "fcompp";
19282 strcpy (buf, p);
19284 else if (const0_operand (xops[1], VOIDmode))
19286 gcc_assert (!unordered_p);
19287 strcpy (buf, "ftst");
19289 else
19291 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19293 gcc_assert (!unordered_p);
19294 p = "ficom";
19296 else
19297 p = unordered_p ? "fucom" : "fcom";
19299 strcpy (buf, p);
19301 p = "p%Z2\t%y2";
19302 strcat (buf, p + !stack_top_dies);
19305 output_asm_insn (buf, operands);
19306 return "fnstsw\t%0";
19309 void
19310 ix86_output_addr_vec_elt (FILE *file, int value)
19312 const char *directive = ASM_LONG;
19314 #ifdef ASM_QUAD
19315 if (TARGET_LP64)
19316 directive = ASM_QUAD;
19317 #else
19318 gcc_assert (!TARGET_64BIT);
19319 #endif
19321 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19324 void
19325 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19327 const char *directive = ASM_LONG;
19329 #ifdef ASM_QUAD
19330 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19331 directive = ASM_QUAD;
19332 #else
19333 gcc_assert (!TARGET_64BIT);
19334 #endif
19335 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19336 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19337 fprintf (file, "%s%s%d-%s%d\n",
19338 directive, LPREFIX, value, LPREFIX, rel);
19339 else if (HAVE_AS_GOTOFF_IN_DATA)
19340 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19341 #if TARGET_MACHO
19342 else if (TARGET_MACHO)
19344 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19345 machopic_output_function_base_name (file);
19346 putc ('\n', file);
19348 #endif
19349 else
19350 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19351 GOT_SYMBOL_NAME, LPREFIX, value);
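/* Example outputs of the routine above (assuming the usual ".L" local
   label prefix): ".quad .L5-.L2" for 64-bit DImode case vectors,
   ".long .L5@GOTOFF" when the assembler supports @GOTOFF in data, and a
   _GLOBAL_OFFSET_TABLE_-relative expression otherwise.  */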
19354 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19355 for the target. */
19357 void
19358 ix86_expand_clear (rtx dest)
19360 rtx tmp;
19362 /* We play register width games, which are only valid after reload. */
19363 gcc_assert (reload_completed);
19365 /* Avoid HImode and its attendant prefix byte. */
19366 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19367 dest = gen_rtx_REG (SImode, REGNO (dest));
19368 tmp = gen_rtx_SET (dest, const0_rtx);
19370 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19372 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19373 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19376 emit_insn (tmp);
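/* Sketch of the result: unless TARGET_USE_MOV0 is set and we are not
   optimizing for size, the clear is emitted as an "xor reg,reg" style
   insn, so the SET is wrapped in a PARALLEL with a FLAGS_REG clobber
   (xor updates the flags); otherwise a plain "mov $0,reg" is used.  */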
19379 void
19380 ix86_expand_move (machine_mode mode, rtx operands[])
19382 rtx op0, op1;
19383 rtx tmp, addend = NULL_RTX;
19384 enum tls_model model;
19386 op0 = operands[0];
19387 op1 = operands[1];
19389 switch (GET_CODE (op1))
19391 case CONST:
19392 tmp = XEXP (op1, 0);
19394 if (GET_CODE (tmp) != PLUS
19395 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19396 break;
19398 op1 = XEXP (tmp, 0);
19399 addend = XEXP (tmp, 1);
19400 /* FALLTHRU */
19402 case SYMBOL_REF:
19403 model = SYMBOL_REF_TLS_MODEL (op1);
19405 if (model)
19406 op1 = legitimize_tls_address (op1, model, true);
19407 else if (ix86_force_load_from_GOT_p (op1))
19409 /* Load the external function address via GOT slot to avoid PLT. */
19410 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19411 (TARGET_64BIT
19412 ? UNSPEC_GOTPCREL
19413 : UNSPEC_GOT));
19414 op1 = gen_rtx_CONST (Pmode, op1);
19415 op1 = gen_const_mem (Pmode, op1);
19416 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19418 else
19420 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19421 if (tmp)
19423 op1 = tmp;
19424 if (!addend)
19425 break;
19427 else
19429 op1 = operands[1];
19430 break;
19434 if (addend)
19436 op1 = force_operand (op1, NULL_RTX);
19437 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19438 op0, 1, OPTAB_DIRECT);
19440 else
19441 op1 = force_operand (op1, op0);
19443 if (op1 == op0)
19444 return;
19446 op1 = convert_to_mode (mode, op1, 1);
19448 default:
19449 break;
19452 if ((flag_pic || MACHOPIC_INDIRECT)
19453 && symbolic_operand (op1, mode))
19455 if (TARGET_MACHO && !TARGET_64BIT)
19457 #if TARGET_MACHO
19458 /* dynamic-no-pic */
19459 if (MACHOPIC_INDIRECT)
19461 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19462 ? op0 : gen_reg_rtx (Pmode);
19463 op1 = machopic_indirect_data_reference (op1, temp);
19464 if (MACHOPIC_PURE)
19465 op1 = machopic_legitimize_pic_address (op1, mode,
19466 temp == op1 ? 0 : temp);
19468 if (op0 != op1 && GET_CODE (op0) != MEM)
19470 rtx insn = gen_rtx_SET (op0, op1);
19471 emit_insn (insn);
19472 return;
19474 if (GET_CODE (op0) == MEM)
19475 op1 = force_reg (Pmode, op1);
19476 else
19478 rtx temp = op0;
19479 if (GET_CODE (temp) != REG)
19480 temp = gen_reg_rtx (Pmode);
19481 temp = legitimize_pic_address (op1, temp);
19482 if (temp == op0)
19483 return;
19484 op1 = temp;
19486 /* dynamic-no-pic */
19487 #endif
19489 else
19491 if (MEM_P (op0))
19492 op1 = force_reg (mode, op1);
19493 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19495 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19496 op1 = legitimize_pic_address (op1, reg);
19497 if (op0 == op1)
19498 return;
19499 op1 = convert_to_mode (mode, op1, 1);
19503 else
19505 if (MEM_P (op0)
19506 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19507 || !push_operand (op0, mode))
19508 && MEM_P (op1))
19509 op1 = force_reg (mode, op1);
19511 if (push_operand (op0, mode)
19512 && ! general_no_elim_operand (op1, mode))
19513 op1 = copy_to_mode_reg (mode, op1);
19515 /* Force large constants in 64bit compilation into a register
19516 to get them CSEd. */
19517 if (can_create_pseudo_p ()
19518 && (mode == DImode) && TARGET_64BIT
19519 && immediate_operand (op1, mode)
19520 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19521 && !register_operand (op0, mode)
19522 && optimize)
19523 op1 = copy_to_mode_reg (mode, op1);
19525 if (can_create_pseudo_p ()
19526 && CONST_DOUBLE_P (op1))
19528 /* If we are loading a floating point constant to a register,
19529 force the value to memory now, since we'll get better code
19530 out the back end. */
19532 op1 = validize_mem (force_const_mem (mode, op1));
19533 if (!register_operand (op0, mode))
19535 rtx temp = gen_reg_rtx (mode);
19536 emit_insn (gen_rtx_SET (temp, op1));
19537 emit_move_insn (op0, temp);
19538 return;
19543 emit_insn (gen_rtx_SET (op0, op1));
19546 void
19547 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19549 rtx op0 = operands[0], op1 = operands[1];
19550 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
19551 psABI since the biggest alignment is 4 byte for IA MCU psABI. */
19552 unsigned int align = (TARGET_IAMCU
19553 ? GET_MODE_BITSIZE (mode)
19554 : GET_MODE_ALIGNMENT (mode));
19556 if (push_operand (op0, VOIDmode))
19557 op0 = emit_move_resolve_push (mode, op0);
19559 /* Force constants other than zero into memory. We do not know how
19560 the instructions used to build constants modify the upper 64 bits
19561 of the register; once we have that information we may be able
19562 to handle some of them more efficiently. */
19563 if (can_create_pseudo_p ()
19564 && (CONSTANT_P (op1)
19565 || (SUBREG_P (op1)
19566 && CONSTANT_P (SUBREG_REG (op1))))
19567 && ((register_operand (op0, mode)
19568 && !standard_sse_constant_p (op1, mode))
19569 /* ix86_expand_vector_move_misalign() does not like constants. */
19570 || (SSE_REG_MODE_P (mode)
19571 && MEM_P (op0)
19572 && MEM_ALIGN (op0) < align)))
19574 if (SUBREG_P (op1))
19576 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19577 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19578 if (r)
19579 r = validize_mem (r);
19580 else
19581 r = force_reg (imode, SUBREG_REG (op1));
19582 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19584 else
19585 op1 = validize_mem (force_const_mem (mode, op1));
19588 /* We need to check memory alignment for SSE mode since attributes
19589 can make operands unaligned. */
19590 if (can_create_pseudo_p ()
19591 && SSE_REG_MODE_P (mode)
19592 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19593 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19595 rtx tmp[2];
19597 /* ix86_expand_vector_move_misalign() does not like both
19598 arguments in memory. */
19599 if (!register_operand (op0, mode)
19600 && !register_operand (op1, mode))
19601 op1 = force_reg (mode, op1);
19603 tmp[0] = op0; tmp[1] = op1;
19604 ix86_expand_vector_move_misalign (mode, tmp);
19605 return;
19608 /* Make operand1 a register if it isn't already. */
19609 if (can_create_pseudo_p ()
19610 && !register_operand (op0, mode)
19611 && !register_operand (op1, mode))
19613 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19614 return;
19617 emit_insn (gen_rtx_SET (op0, op1));
19620 /* Split 32-byte AVX unaligned load and store if needed. */
19622 static void
19623 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19625 rtx m;
19626 rtx (*extract) (rtx, rtx, rtx);
19627 machine_mode mode;
19629 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19630 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19632 emit_insn (gen_rtx_SET (op0, op1));
19633 return;
19636 rtx orig_op0 = NULL_RTX;
19637 mode = GET_MODE (op0);
19638 switch (GET_MODE_CLASS (mode))
19640 case MODE_VECTOR_INT:
19641 case MODE_INT:
19642 if (mode != V32QImode)
19644 if (!MEM_P (op0))
19646 orig_op0 = op0;
19647 op0 = gen_reg_rtx (V32QImode);
19649 else
19650 op0 = gen_lowpart (V32QImode, op0);
19651 op1 = gen_lowpart (V32QImode, op1);
19652 mode = V32QImode;
19654 break;
19655 case MODE_VECTOR_FLOAT:
19656 break;
19657 default:
19658 gcc_unreachable ();
19661 switch (mode)
19663 default:
19664 gcc_unreachable ();
19665 case E_V32QImode:
19666 extract = gen_avx_vextractf128v32qi;
19667 mode = V16QImode;
19668 break;
19669 case E_V8SFmode:
19670 extract = gen_avx_vextractf128v8sf;
19671 mode = V4SFmode;
19672 break;
19673 case E_V4DFmode:
19674 extract = gen_avx_vextractf128v4df;
19675 mode = V2DFmode;
19676 break;
19679 if (MEM_P (op1))
19681 rtx r = gen_reg_rtx (mode);
19682 m = adjust_address (op1, mode, 0);
19683 emit_move_insn (r, m);
19684 m = adjust_address (op1, mode, 16);
19685 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19686 emit_move_insn (op0, r);
19688 else if (MEM_P (op0))
19690 m = adjust_address (op0, mode, 0);
19691 emit_insn (extract (m, op1, const0_rtx));
19692 m = adjust_address (op0, mode, 16);
19693 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19695 else
19696 gcc_unreachable ();
19698 if (orig_op0)
19699 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
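/* For example, with the 256-bit split enabled an unaligned 32-byte load
   becomes a 16-byte load of the low half followed by a VEC_CONCAT with
   the high 16 bytes (normally matching vinsertf128), and an unaligned
   store becomes two vextractf128 stores of 16 bytes each.  */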
19702 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19703 straight to ix86_expand_vector_move. */
19704 /* Code generation for scalar reg-reg moves of single and double precision data:
19705 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19706 movaps reg, reg
19707 else
19708 movss reg, reg
19709 if (x86_sse_partial_reg_dependency == true)
19710 movapd reg, reg
19711 else
19712 movsd reg, reg
19714 Code generation for scalar loads of double precision data:
19715 if (x86_sse_split_regs == true)
19716 movlpd mem, reg (gas syntax)
19717 else
19718 movsd mem, reg
19720 Code generation for unaligned packed loads of single precision data
19721 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19722 if (x86_sse_unaligned_move_optimal)
19723 movups mem, reg
19725 if (x86_sse_partial_reg_dependency == true)
19727 xorps reg, reg
19728 movlps mem, reg
19729 movhps mem+8, reg
19731 else
19733 movlps mem, reg
19734 movhps mem+8, reg
19737 Code generation for unaligned packed loads of double precision data
19738 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19739 if (x86_sse_unaligned_move_optimal)
19740 movupd mem, reg
19742 if (x86_sse_split_regs == true)
19744 movlpd mem, reg
19745 movhpd mem+8, reg
19747 else
19749 movsd mem, reg
19750 movhpd mem+8, reg
19754 void
19755 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19757 rtx op0, op1, m;
19759 op0 = operands[0];
19760 op1 = operands[1];
19762 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19763 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19765 emit_insn (gen_rtx_SET (op0, op1));
19766 return;
19769 if (TARGET_AVX)
19771 if (GET_MODE_SIZE (mode) == 32)
19772 ix86_avx256_split_vector_move_misalign (op0, op1);
19773 else
19774 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19775 emit_insn (gen_rtx_SET (op0, op1));
19776 return;
19779 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19780 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19782 emit_insn (gen_rtx_SET (op0, op1));
19783 return;
19786 /* ??? If we have typed data, then it would appear that using
19787 movdqu is the only way to get unaligned data loaded with
19788 integer type. */
19789 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19791 emit_insn (gen_rtx_SET (op0, op1));
19792 return;
19795 if (MEM_P (op1))
19797 if (TARGET_SSE2 && mode == V2DFmode)
19799 rtx zero;
19801 /* When SSE registers are split into halves, we can avoid
19802 writing to the top half twice. */
19803 if (TARGET_SSE_SPLIT_REGS)
19805 emit_clobber (op0);
19806 zero = op0;
19808 else
19810 /* ??? Not sure about the best option for the Intel chips.
19811 The following would seem to satisfy; the register is
19812 entirely cleared, breaking the dependency chain. We
19813 then store to the upper half, with a dependency depth
19814 of one. A rumor has it that Intel recommends two movsd
19815 followed by an unpacklpd, but this is unconfirmed. And
19816 given that the dependency depth of the unpacklpd would
19817 still be one, I'm not sure why this would be better. */
19818 zero = CONST0_RTX (V2DFmode);
19821 m = adjust_address (op1, DFmode, 0);
19822 emit_insn (gen_sse2_loadlpd (op0, zero, m));
19823 m = adjust_address (op1, DFmode, 8);
19824 emit_insn (gen_sse2_loadhpd (op0, op0, m));
19826 else
19828 rtx t;
19830 if (mode != V4SFmode)
19831 t = gen_reg_rtx (V4SFmode);
19832 else
19833 t = op0;
19835 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
19836 emit_move_insn (t, CONST0_RTX (V4SFmode));
19837 else
19838 emit_clobber (t);
19840 m = adjust_address (op1, V2SFmode, 0);
19841 emit_insn (gen_sse_loadlps (t, t, m));
19842 m = adjust_address (op1, V2SFmode, 8);
19843 emit_insn (gen_sse_loadhps (t, t, m));
19844 if (mode != V4SFmode)
19845 emit_move_insn (op0, gen_lowpart (mode, t));
19848 else if (MEM_P (op0))
19850 if (TARGET_SSE2 && mode == V2DFmode)
19852 m = adjust_address (op0, DFmode, 0);
19853 emit_insn (gen_sse2_storelpd (m, op1));
19854 m = adjust_address (op0, DFmode, 8);
19855 emit_insn (gen_sse2_storehpd (m, op1));
19857 else
19859 if (mode != V4SFmode)
19860 op1 = gen_lowpart (V4SFmode, op1);
19862 m = adjust_address (op0, V2SFmode, 0);
19863 emit_insn (gen_sse_storelps (m, op1));
19864 m = adjust_address (op0, V2SFmode, 8);
19865 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
19868 else
19869 gcc_unreachable ();
19872 /* Helper function of ix86_fixup_binary_operands to canonicalize
19873 operand order. Returns true if the operands should be swapped. */
19875 static bool
19876 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
19877 rtx operands[])
19879 rtx dst = operands[0];
19880 rtx src1 = operands[1];
19881 rtx src2 = operands[2];
19883 /* If the operation is not commutative, we can't do anything. */
19884 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
19885 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
19886 return false;
19888 /* Highest priority is that src1 should match dst. */
19889 if (rtx_equal_p (dst, src1))
19890 return false;
19891 if (rtx_equal_p (dst, src2))
19892 return true;
19894 /* Next highest priority is that immediate constants come second. */
19895 if (immediate_operand (src2, mode))
19896 return false;
19897 if (immediate_operand (src1, mode))
19898 return true;
19900 /* Lowest priority is that memory references should come second. */
19901 if (MEM_P (src2))
19902 return false;
19903 if (MEM_P (src1))
19904 return true;
19906 return false;
19910 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
19911 destination to use for the operation. If different from the true
19912 destination in operands[0], a copy operation will be required. */
19915 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
19916 rtx operands[])
19918 rtx dst = operands[0];
19919 rtx src1 = operands[1];
19920 rtx src2 = operands[2];
19922 /* Canonicalize operand order. */
19923 if (ix86_swap_binary_operands_p (code, mode, operands))
19925 /* It is invalid to swap operands of different modes. */
19926 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
19928 std::swap (src1, src2);
19931 /* Both source operands cannot be in memory. */
19932 if (MEM_P (src1) && MEM_P (src2))
19934 /* Optimization: Only read from memory once. */
19935 if (rtx_equal_p (src1, src2))
19937 src2 = force_reg (mode, src2);
19938 src1 = src2;
19940 else if (rtx_equal_p (dst, src1))
19941 src2 = force_reg (mode, src2);
19942 else
19943 src1 = force_reg (mode, src1);
19946 /* If the destination is memory, and we do not have matching source
19947 operands, do things in registers. */
19948 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
19949 dst = gen_reg_rtx (mode);
19951 /* Source 1 cannot be a constant. */
19952 if (CONSTANT_P (src1))
19953 src1 = force_reg (mode, src1);
19955 /* Source 1 cannot be a non-matching memory. */
19956 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
19957 src1 = force_reg (mode, src1);
19959 /* Improve address combine. */
19960 if (code == PLUS
19961 && GET_MODE_CLASS (mode) == MODE_INT
19962 && MEM_P (src2))
19963 src2 = force_reg (mode, src2);
19965 operands[1] = src1;
19966 operands[2] = src2;
19967 return dst;
19970 /* Similarly, but assume that the destination has already been
19971 set up properly. */
19973 void
19974 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
19975 machine_mode mode, rtx operands[])
19977 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
19978 gcc_assert (dst == operands[0]);
19981 /* Attempt to expand a binary operator. Make the expansion closer to the
19982 actual machine than just general_operand, which would allow 3 separate
19983 memory references (one output, two input) in a single insn. */
19985 void
19986 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
19987 rtx operands[])
19989 rtx src1, src2, dst, op, clob;
19991 dst = ix86_fixup_binary_operands (code, mode, operands);
19992 src1 = operands[1];
19993 src2 = operands[2];
19995 /* Emit the instruction. */
19997 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
19999 if (reload_completed
20000 && code == PLUS
20001 && !rtx_equal_p (dst, src1))
20003 /* This is going to be an LEA; avoid splitting it later. */
20004 emit_insn (op);
20006 else
20008 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20009 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20012 /* Fix up the destination if needed. */
20013 if (dst != operands[0])
20014 emit_move_insn (operands[0], dst);
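/* Note: the FLAGS_REG clobber reflects that x86 ALU instructions update
   EFLAGS.  The one exception handled above is the post-reload PLUS with a
   non-matching destination, which will be emitted as lea and therefore
   must not carry a flags clobber.  */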
20017 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20018 the given OPERANDS. */
20020 void
20021 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20022 rtx operands[])
20024 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20025 if (SUBREG_P (operands[1]))
20027 op1 = operands[1];
20028 op2 = operands[2];
20030 else if (SUBREG_P (operands[2]))
20032 op1 = operands[2];
20033 op2 = operands[1];
20035 /* Optimize (__m128i) d | (__m128i) e and similar code
20036 when d and e are float vectors into a float vector logical
20037 insn. In C/C++ without using intrinsics there is no other way
20038 to express a vector logical operation on float vectors than
20039 to cast them temporarily to integer vectors. */
20040 if (op1
20041 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20042 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20043 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20044 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20045 && SUBREG_BYTE (op1) == 0
20046 && (GET_CODE (op2) == CONST_VECTOR
20047 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20048 && SUBREG_BYTE (op2) == 0))
20049 && can_create_pseudo_p ())
20051 rtx dst;
20052 switch (GET_MODE (SUBREG_REG (op1)))
20054 case E_V4SFmode:
20055 case E_V8SFmode:
20056 case E_V16SFmode:
20057 case E_V2DFmode:
20058 case E_V4DFmode:
20059 case E_V8DFmode:
20060 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20061 if (GET_CODE (op2) == CONST_VECTOR)
20063 op2 = gen_lowpart (GET_MODE (dst), op2);
20064 op2 = force_reg (GET_MODE (dst), op2);
20066 else
20068 op1 = operands[1];
20069 op2 = SUBREG_REG (operands[2]);
20070 if (!vector_operand (op2, GET_MODE (dst)))
20071 op2 = force_reg (GET_MODE (dst), op2);
20073 op1 = SUBREG_REG (op1);
20074 if (!vector_operand (op1, GET_MODE (dst)))
20075 op1 = force_reg (GET_MODE (dst), op1);
20076 emit_insn (gen_rtx_SET (dst,
20077 gen_rtx_fmt_ee (code, GET_MODE (dst),
20078 op1, op2)));
20079 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20080 return;
20081 default:
20082 break;
20085 if (!vector_operand (operands[1], mode))
20086 operands[1] = force_reg (mode, operands[1]);
20087 if (!vector_operand (operands[2], mode))
20088 operands[2] = force_reg (mode, operands[2]);
20089 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20090 emit_insn (gen_rtx_SET (operands[0],
20091 gen_rtx_fmt_ee (code, mode, operands[1],
20092 operands[2])));
20095 /* Return TRUE or FALSE depending on whether the binary operator meets the
20096 appropriate constraints. */
20098 bool
20099 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20100 rtx operands[3])
20102 rtx dst = operands[0];
20103 rtx src1 = operands[1];
20104 rtx src2 = operands[2];
20106 /* Both source operands cannot be in memory. */
20107 if (MEM_P (src1) && MEM_P (src2))
20108 return false;
20110 /* Canonicalize operand order for commutative operators. */
20111 if (ix86_swap_binary_operands_p (code, mode, operands))
20112 std::swap (src1, src2);
20114 /* If the destination is memory, we must have a matching source operand. */
20115 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20116 return false;
20118 /* Source 1 cannot be a constant. */
20119 if (CONSTANT_P (src1))
20120 return false;
20122 /* Source 1 cannot be a non-matching memory. */
20123 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20124 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20125 return (code == AND
20126 && (mode == HImode
20127 || mode == SImode
20128 || (TARGET_64BIT && mode == DImode))
20129 && satisfies_constraint_L (src2));
20131 return true;
20134 /* Attempt to expand a unary operator. Make the expansion closer to the
20135 actual machine than just general_operand, which would allow 2 separate
20136 memory references (one output, one input) in a single insn. */
20138 void
20139 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20140 rtx operands[])
20142 bool matching_memory = false;
20143 rtx src, dst, op, clob;
20145 dst = operands[0];
20146 src = operands[1];
20148 /* If the destination is memory, and we do not have matching source
20149 operands, do things in registers. */
20150 if (MEM_P (dst))
20152 if (rtx_equal_p (dst, src))
20153 matching_memory = true;
20154 else
20155 dst = gen_reg_rtx (mode);
20158 /* When source operand is memory, destination must match. */
20159 if (MEM_P (src) && !matching_memory)
20160 src = force_reg (mode, src);
20162 /* Emit the instruction. */
20164 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20166 if (code == NOT)
20167 emit_insn (op);
20168 else
20170 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20171 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20174 /* Fix up the destination if needed. */
20175 if (dst != operands[0])
20176 emit_move_insn (operands[0], dst);
20179 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20180 divisor are within the range [0-255]. */
20182 void
20183 ix86_split_idivmod (machine_mode mode, rtx operands[],
20184 bool signed_p)
20186 rtx_code_label *end_label, *qimode_label;
20187 rtx div, mod;
20188 rtx_insn *insn;
20189 rtx scratch, tmp0, tmp1, tmp2;
20190 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20191 rtx (*gen_zero_extend) (rtx, rtx);
20192 rtx (*gen_test_ccno_1) (rtx, rtx);
20194 switch (mode)
20196 case E_SImode:
20197 if (GET_MODE (operands[0]) == SImode)
20199 if (GET_MODE (operands[1]) == SImode)
20200 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20201 else
20202 gen_divmod4_1
20203 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20204 gen_zero_extend = gen_zero_extendqisi2;
20206 else
20208 gen_divmod4_1
20209 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20210 gen_zero_extend = gen_zero_extendqidi2;
20212 gen_test_ccno_1 = gen_testsi_ccno_1;
20213 break;
20214 case E_DImode:
20215 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20216 gen_test_ccno_1 = gen_testdi_ccno_1;
20217 gen_zero_extend = gen_zero_extendqidi2;
20218 break;
20219 default:
20220 gcc_unreachable ();
20223 end_label = gen_label_rtx ();
20224 qimode_label = gen_label_rtx ();
20226 scratch = gen_reg_rtx (mode);
20228 /* Use 8bit unsigned divmod if dividend and divisor are within
20229 the range [0-255]. */
20230 emit_move_insn (scratch, operands[2]);
20231 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20232 scratch, 1, OPTAB_DIRECT);
20233 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20234 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20235 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20236 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20237 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20238 pc_rtx);
20239 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20240 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20241 JUMP_LABEL (insn) = qimode_label;
20243 /* Generate original signed/unsigned divmod. */
20244 div = gen_divmod4_1 (operands[0], operands[1],
20245 operands[2], operands[3]);
20246 emit_insn (div);
20248 /* Branch to the end. */
20249 emit_jump_insn (gen_jump (end_label));
20250 emit_barrier ();
20252 /* Generate 8bit unsigned divide. */
20253 emit_label (qimode_label);
20254 /* Don't use operands[0] for result of 8bit divide since not all
20255 registers support QImode ZERO_EXTRACT. */
20256 tmp0 = lowpart_subreg (HImode, scratch, mode);
20257 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20258 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20259 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20261 if (signed_p)
20263 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20264 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20266 else
20268 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20269 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20271 if (mode == SImode)
20273 if (GET_MODE (operands[0]) != SImode)
20274 div = gen_rtx_ZERO_EXTEND (DImode, div);
20275 if (GET_MODE (operands[1]) != SImode)
20276 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20279 /* Extract remainder from AH. */
20280 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20281 tmp0, GEN_INT (8), GEN_INT (8));
20282 if (REG_P (operands[1]))
20283 insn = emit_move_insn (operands[1], tmp1);
20284 else
20286 /* Need a new scratch register since the old one has the result
20287 of the 8bit divide. */
20288 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20289 emit_move_insn (scratch, tmp1);
20290 insn = emit_move_insn (operands[1], scratch);
20292 set_unique_reg_note (insn, REG_EQUAL, mod);
20294 /* Zero extend quotient from AL. */
20295 tmp1 = gen_lowpart (QImode, tmp0);
20296 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20297 set_unique_reg_note (insn, REG_EQUAL, div);
20299 emit_label (end_label);
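/* A sketch of the code emitted by ix86_split_idivmod (register names are
   only illustrative):
       mov    dividend, scratch
       or     divisor, scratch
       test   $-0x100, scratch
       je     .Lqimode            ; both values fit in 8 bits
       <full-width signed/unsigned divmod>
       jmp    .Lend
   .Lqimode:
       <8-bit unsigned divide: AL = quotient, AH = remainder>
   .Lend:                                                               */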
20302 #define LEA_MAX_STALL (3)
20303 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20305 /* Increase given DISTANCE in half-cycles according to
20306 dependencies between PREV and NEXT instructions.
20307 Add 1 half-cycle if there is no dependency and
20308 go to the next cycle if there is some dependency. */
20310 static unsigned int
20311 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20313 df_ref def, use;
20315 if (!prev || !next)
20316 return distance + (distance & 1) + 2;
20318 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20319 return distance + 1;
20321 FOR_EACH_INSN_USE (use, next)
20322 FOR_EACH_INSN_DEF (def, prev)
20323 if (!DF_REF_IS_ARTIFICIAL (def)
20324 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20325 return distance + (distance & 1) + 2;
20327 return distance + 1;
20330 /* Function checks if instruction INSN defines register number
20331 REGNO1 or REGNO2. */
20333 static bool
20334 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20335 rtx_insn *insn)
20337 df_ref def;
20339 FOR_EACH_INSN_DEF (def, insn)
20340 if (DF_REF_REG_DEF_P (def)
20341 && !DF_REF_IS_ARTIFICIAL (def)
20342 && (regno1 == DF_REF_REGNO (def)
20343 || regno2 == DF_REF_REGNO (def)))
20344 return true;
20346 return false;
20349 /* Function checks if instruction INSN uses register number
20350 REGNO as part of an address expression. */
20352 static bool
20353 insn_uses_reg_mem (unsigned int regno, rtx insn)
20355 df_ref use;
20357 FOR_EACH_INSN_USE (use, insn)
20358 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20359 return true;
20361 return false;
20364 /* Search backward for non-agu definition of register number REGNO1
20365 or register number REGNO2 in basic block starting from instruction
20366 START up to head of basic block or instruction INSN.
20368 Function puts true value into *FOUND var if definition was found
20369 and false otherwise.
20371 Distance in half-cycles between START and found instruction or head
20372 of BB is added to DISTANCE and returned. */
20374 static int
20375 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20376 rtx_insn *insn, int distance,
20377 rtx_insn *start, bool *found)
20379 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20380 rtx_insn *prev = start;
20381 rtx_insn *next = NULL;
20383 *found = false;
20385 while (prev
20386 && prev != insn
20387 && distance < LEA_SEARCH_THRESHOLD)
20389 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20391 distance = increase_distance (prev, next, distance);
20392 if (insn_defines_reg (regno1, regno2, prev))
20394 if (recog_memoized (prev) < 0
20395 || get_attr_type (prev) != TYPE_LEA)
20397 *found = true;
20398 return distance;
20402 next = prev;
20404 if (prev == BB_HEAD (bb))
20405 break;
20407 prev = PREV_INSN (prev);
20410 return distance;
20413 /* Search backward for a non-AGU definition of register number REGNO1
20414 or register number REGNO2 in INSN's basic block until we
20415 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20416 2. Reach the neighbor BB's boundary, or
20417 3. Reach an AGU definition.
20418 Returns the distance between the non-AGU definition point and INSN.
20419 If there is no definition point, returns -1. */
20421 static int
20422 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20423 rtx_insn *insn)
20425 basic_block bb = BLOCK_FOR_INSN (insn);
20426 int distance = 0;
20427 bool found = false;
20429 if (insn != BB_HEAD (bb))
20430 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20431 distance, PREV_INSN (insn),
20432 &found);
20434 if (!found && distance < LEA_SEARCH_THRESHOLD)
20436 edge e;
20437 edge_iterator ei;
20438 bool simple_loop = false;
20440 FOR_EACH_EDGE (e, ei, bb->preds)
20441 if (e->src == bb)
20443 simple_loop = true;
20444 break;
20447 if (simple_loop)
20448 distance = distance_non_agu_define_in_bb (regno1, regno2,
20449 insn, distance,
20450 BB_END (bb), &found);
20451 else
20453 int shortest_dist = -1;
20454 bool found_in_bb = false;
20456 FOR_EACH_EDGE (e, ei, bb->preds)
20458 int bb_dist
20459 = distance_non_agu_define_in_bb (regno1, regno2,
20460 insn, distance,
20461 BB_END (e->src),
20462 &found_in_bb);
20463 if (found_in_bb)
20465 if (shortest_dist < 0)
20466 shortest_dist = bb_dist;
20467 else if (bb_dist > 0)
20468 shortest_dist = MIN (bb_dist, shortest_dist);
20470 found = true;
20474 distance = shortest_dist;
20478 /* get_attr_type may modify recog data. We want to make sure
20479 that recog data is valid for instruction INSN, on which
20480 distance_non_agu_define is called. INSN is unchanged here. */
20481 extract_insn_cached (insn);
20483 if (!found)
20484 return -1;
20486 return distance >> 1;
20489 /* Return the distance in half-cycles between INSN and the next
20490 insn that uses register number REGNO in a memory address, added
20491 to DISTANCE. Return -1 if REGNO is set.
20493 Put true value into *FOUND if register usage was found and
20494 false otherwise.
20495 Put true value into *REDEFINED if register redefinition was
20496 found and false otherwise. */
20498 static int
20499 distance_agu_use_in_bb (unsigned int regno,
20500 rtx_insn *insn, int distance, rtx_insn *start,
20501 bool *found, bool *redefined)
20503 basic_block bb = NULL;
20504 rtx_insn *next = start;
20505 rtx_insn *prev = NULL;
20507 *found = false;
20508 *redefined = false;
20510 if (start != NULL_RTX)
20512 bb = BLOCK_FOR_INSN (start);
20513 if (start != BB_HEAD (bb))
20514 /* If insn and start belong to the same bb, set prev to insn,
20515 so the call to increase_distance will increase the distance
20516 between insns by 1. */
20517 prev = insn;
20520 while (next
20521 && next != insn
20522 && distance < LEA_SEARCH_THRESHOLD)
20524 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20526 distance = increase_distance(prev, next, distance);
20527 if (insn_uses_reg_mem (regno, next))
20529 /* Return DISTANCE if OP0 is used in memory
20530 address in NEXT. */
20531 *found = true;
20532 return distance;
20535 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20537 /* Return -1 if OP0 is set in NEXT. */
20538 *redefined = true;
20539 return -1;
20542 prev = next;
20545 if (next == BB_END (bb))
20546 break;
20548 next = NEXT_INSN (next);
20551 return distance;
20554 /* Return the distance between INSN and the next insn that uses
20555 register number REGNO0 in a memory address. Return -1 if no such
20556 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20558 static int
20559 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20561 basic_block bb = BLOCK_FOR_INSN (insn);
20562 int distance = 0;
20563 bool found = false;
20564 bool redefined = false;
20566 if (insn != BB_END (bb))
20567 distance = distance_agu_use_in_bb (regno0, insn, distance,
20568 NEXT_INSN (insn),
20569 &found, &redefined);
20571 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20573 edge e;
20574 edge_iterator ei;
20575 bool simple_loop = false;
20577 FOR_EACH_EDGE (e, ei, bb->succs)
20578 if (e->dest == bb)
20580 simple_loop = true;
20581 break;
20584 if (simple_loop)
20585 distance = distance_agu_use_in_bb (regno0, insn,
20586 distance, BB_HEAD (bb),
20587 &found, &redefined);
20588 else
20590 int shortest_dist = -1;
20591 bool found_in_bb = false;
20592 bool redefined_in_bb = false;
20594 FOR_EACH_EDGE (e, ei, bb->succs)
20596 int bb_dist
20597 = distance_agu_use_in_bb (regno0, insn,
20598 distance, BB_HEAD (e->dest),
20599 &found_in_bb, &redefined_in_bb);
20600 if (found_in_bb)
20602 if (shortest_dist < 0)
20603 shortest_dist = bb_dist;
20604 else if (bb_dist > 0)
20605 shortest_dist = MIN (bb_dist, shortest_dist);
20607 found = true;
20611 distance = shortest_dist;
20615 if (!found || redefined)
20616 return -1;
20618 return distance >> 1;
20621 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20622 there is a dilemma of choosing LEA or ADD.
20623 Negative value: ADD is preferred over LEA.
20624 Zero: Neutral.
20625 Positive value: LEA is preferred over ADD. */
20626 #define IX86_LEA_PRIORITY 0
20628 /* Return true if use of the lea INSN has a performance advantage
20629 over a sequence of instructions. The instruction sequence has
20630 SPLIT_COST cycles higher latency than the lea latency. */
20632 static bool
20633 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20634 unsigned int regno2, int split_cost, bool has_scale)
20636 int dist_define, dist_use;
20638 /* For Silvermont, if using a 2-source or 3-source LEA for
20639 non-destructive destination purposes, or due to wanting the
20640 ability to use SCALE, the use of LEA is justified. */
20641 if (TARGET_SILVERMONT || TARGET_INTEL)
20643 if (has_scale)
20644 return true;
20645 if (split_cost < 1)
20646 return false;
20647 if (regno0 == regno1 || regno0 == regno2)
20648 return false;
20649 return true;
20652 dist_define = distance_non_agu_define (regno1, regno2, insn);
20653 dist_use = distance_agu_use (regno0, insn);
20655 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20657 /* If there is no non-AGU operand definition, no AGU
20658 operand usage and the split cost is 0, then both the lea
20659 and non-lea variants have the same priority. Currently
20660 we prefer lea for 64-bit code and non-lea on 32-bit
20661 code. */
20662 if (dist_use < 0 && split_cost == 0)
20663 return TARGET_64BIT || IX86_LEA_PRIORITY;
20664 else
20665 return true;
20668 /* With a longer definition distance, lea is preferable.
20669 Here we adjust the distance to take into account the splitting
20670 cost and the lea priority. */
20671 dist_define += split_cost + IX86_LEA_PRIORITY;
20673 /* If there is no use in a memory address then we just check
20674 that the split cost exceeds the AGU stall. */
20675 if (dist_use < 0)
20676 return dist_define > LEA_MAX_STALL;
20678 /* If this insn has both backward non-agu dependence and forward
20679 agu dependence, the one with short distance takes effect. */
20680 return dist_define >= dist_use;
20683 /* Return true if it is legal to clobber flags by INSN and
20684 false otherwise. */
20686 static bool
20687 ix86_ok_to_clobber_flags (rtx_insn *insn)
20689 basic_block bb = BLOCK_FOR_INSN (insn);
20690 df_ref use;
20691 bitmap live;
20693 while (insn)
20695 if (NONDEBUG_INSN_P (insn))
20697 FOR_EACH_INSN_USE (use, insn)
20698 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20699 return false;
20701 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20702 return true;
20705 if (insn == BB_END (bb))
20706 break;
20708 insn = NEXT_INSN (insn);
20711 live = df_get_live_out(bb);
20712 return !REGNO_REG_SET_P (live, FLAGS_REG);
20715 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20716 move and add to avoid AGU stalls. */
20718 bool
20719 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20721 unsigned int regno0, regno1, regno2;
20723 /* Check if we need to optimize. */
20724 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20725 return false;
20727 /* Check it is correct to split here. */
20728 if (!ix86_ok_to_clobber_flags(insn))
20729 return false;
20731 regno0 = true_regnum (operands[0]);
20732 regno1 = true_regnum (operands[1]);
20733 regno2 = true_regnum (operands[2]);
20735 /* We need to split only adds with a non-destructive
20736 destination operand. */
20737 if (regno0 == regno1 || regno0 == regno2)
20738 return false;
20739 else
20740 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20743 /* Return true if we should emit lea instruction instead of mov
20744 instruction. */
20746 bool
20747 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20749 unsigned int regno0, regno1;
20751 /* Check if we need to optimize. */
20752 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20753 return false;
20755 /* Use lea for reg to reg moves only. */
20756 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20757 return false;
20759 regno0 = true_regnum (operands[0]);
20760 regno1 = true_regnum (operands[1]);
20762 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20765 /* Return true if we need to split lea into a sequence of
20766 instructions to avoid AGU stalls. */
20768 bool
20769 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20771 unsigned int regno0, regno1, regno2;
20772 int split_cost;
20773 struct ix86_address parts;
20774 int ok;
20776 /* Check we need to optimize. */
20777 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20778 return false;
20780 /* The "at least two components" test below might not catch simple
20781 move or zero extension insns if parts.base is non-NULL and parts.disp
20782 is const0_rtx as the only components in the address, e.g. if the
20783 register is %rbp or %r13. As this test is much cheaper and moves or
20784 zero extensions are the common case, do this check first. */
20785 if (REG_P (operands[1])
20786 || (SImode_address_operand (operands[1], VOIDmode)
20787 && REG_P (XEXP (operands[1], 0))))
20788 return false;
20790 /* Check if it is OK to split here. */
20791 if (!ix86_ok_to_clobber_flags (insn))
20792 return false;
20794 ok = ix86_decompose_address (operands[1], &parts);
20795 gcc_assert (ok);
20797 /* There should be at least two components in the address. */
20798 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20799 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20800 return false;
20802 /* We should not split into add if a non-legitimate PIC
20803 operand is used as the displacement. */
20804 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20805 return false;
20807 regno0 = true_regnum (operands[0]) ;
20808 regno1 = INVALID_REGNUM;
20809 regno2 = INVALID_REGNUM;
20811 if (parts.base)
20812 regno1 = true_regnum (parts.base);
20813 if (parts.index)
20814 regno2 = true_regnum (parts.index);
20816 split_cost = 0;
20818 /* Compute how many cycles we will add to the execution time
20819 if we split the lea into a sequence of instructions. */
20820 if (parts.base || parts.index)
20822 /* Have to use a mov instruction if the non-destructive
20823 destination form is used. */
20824 if (regno1 != regno0 && regno2 != regno0)
20825 split_cost += 1;
20827 /* Have to add index to base if both exist. */
20828 if (parts.base && parts.index)
20829 split_cost += 1;
20831 /* Have to use shift and adds if scale is 2 or greater. */
20832 if (parts.scale > 1)
20834 if (regno0 != regno1)
20835 split_cost += 1;
20836 else if (regno2 == regno0)
20837 split_cost += 4;
20838 else
20839 split_cost += parts.scale;
20842 /* Have to use an add instruction with an immediate if
20843 disp is nonzero. */
20844 if (parts.disp && parts.disp != const0_rtx)
20845 split_cost += 1;
20847 /* Subtract the price of lea. */
20848 split_cost -= 1;
20851 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
20852 parts.scale > 1);
20855 /* Emit x86 binary operator CODE in mode MODE, where the first operand
20856 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
20858 static void
20859 ix86_emit_binop (enum rtx_code code, machine_mode mode,
20860 rtx dst, rtx src)
20862 rtx op, clob;
20864 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
20865 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20867 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20870 /* Return true if regno1 def is nearest to the insn. */
20872 static bool
20873 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
20875 rtx_insn *prev = insn;
20876 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
20878 if (insn == start)
20879 return false;
20880 while (prev && prev != start)
20882 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
20884 prev = PREV_INSN (prev);
20885 continue;
20887 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
20888 return true;
20889 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
20890 return false;
20891 prev = PREV_INSN (prev);
20894 /* None of the regs is defined in the bb. */
20895 return false;
20898 /* Split lea instructions into a sequence of instructions
20899 which are executed on the ALU to avoid AGU stalls.
20900 It is assumed that it is allowed to clobber the flags register
20901 at the lea position. */
20903 void
20904 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
20906 unsigned int regno0, regno1, regno2;
20907 struct ix86_address parts;
20908 rtx target, tmp;
20909 int ok, adds;
20911 ok = ix86_decompose_address (operands[1], &parts);
20912 gcc_assert (ok);
20914 target = gen_lowpart (mode, operands[0]);
20916 regno0 = true_regnum (target);
20917 regno1 = INVALID_REGNUM;
20918 regno2 = INVALID_REGNUM;
20920 if (parts.base)
20922 parts.base = gen_lowpart (mode, parts.base);
20923 regno1 = true_regnum (parts.base);
20926 if (parts.index)
20928 parts.index = gen_lowpart (mode, parts.index);
20929 regno2 = true_regnum (parts.index);
20932 if (parts.disp)
20933 parts.disp = gen_lowpart (mode, parts.disp);
20935 if (parts.scale > 1)
20937 /* Case r1 = r1 + ... */
20938 if (regno1 == regno0)
20940 /* If we have the case r1 = r1 + C * r2 then we
20941 would have to use multiplication, which is very
20942 expensive. Assume the cost model is wrong if we
20943 have such a case here. */
20944 gcc_assert (regno2 != regno0);
20946 for (adds = parts.scale; adds > 0; adds--)
20947 ix86_emit_binop (PLUS, mode, target, parts.index);
20949 else
20951 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
20952 if (regno0 != regno2)
20953 emit_insn (gen_rtx_SET (target, parts.index));
20955 /* Use shift for scaling. */
20956 ix86_emit_binop (ASHIFT, mode, target,
20957 GEN_INT (exact_log2 (parts.scale)));
20959 if (parts.base)
20960 ix86_emit_binop (PLUS, mode, target, parts.base);
20962 if (parts.disp && parts.disp != const0_rtx)
20963 ix86_emit_binop (PLUS, mode, target, parts.disp);
20966 else if (!parts.base && !parts.index)
20968 gcc_assert(parts.disp);
20969 emit_insn (gen_rtx_SET (target, parts.disp));
20971 else
20973 if (!parts.base)
20975 if (regno0 != regno2)
20976 emit_insn (gen_rtx_SET (target, parts.index));
20978 else if (!parts.index)
20980 if (regno0 != regno1)
20981 emit_insn (gen_rtx_SET (target, parts.base));
20983 else
20985 if (regno0 == regno1)
20986 tmp = parts.index;
20987 else if (regno0 == regno2)
20988 tmp = parts.base;
20989 else
20991 rtx tmp1;
20993 /* Find the better operand for the SET instruction, depending
20994 on which definition is farther from the insn. */
20995 if (find_nearest_reg_def (insn, regno1, regno2))
20996 tmp = parts.index, tmp1 = parts.base;
20997 else
20998 tmp = parts.base, tmp1 = parts.index;
21000 emit_insn (gen_rtx_SET (target, tmp));
21002 if (parts.disp && parts.disp != const0_rtx)
21003 ix86_emit_binop (PLUS, mode, target, parts.disp);
21005 ix86_emit_binop (PLUS, mode, target, tmp1);
21006 return;
21009 ix86_emit_binop (PLUS, mode, target, tmp);
21012 if (parts.disp && parts.disp != const0_rtx)
21013 ix86_emit_binop (PLUS, mode, target, parts.disp);
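/* As an example of the splitting above, an lea computing 4(%rbx,%rcx,2)
   into %rax would typically become
       mov  %rcx, %rax
       shl  $1, %rax
       add  %rbx, %rax
       add  $4, %rax
   i.e. a mov, a shift for the scale, an add of the base and an add of
   the displacement, executed on the ALU rather than the AGU.  */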
21017 /* Return true if it is ok to optimize an ADD operation to an LEA
21018 operation to avoid flag register consumption. For most processors,
21019 ADD is faster than LEA. For processors like BONNELL, if the
21020 destination register of the LEA holds an actual address which will be
21021 used soon, LEA is better; otherwise ADD is better. */
21023 bool
21024 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21026 unsigned int regno0 = true_regnum (operands[0]);
21027 unsigned int regno1 = true_regnum (operands[1]);
21028 unsigned int regno2 = true_regnum (operands[2]);
21030 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21031 if (regno0 != regno1 && regno0 != regno2)
21032 return true;
21034 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21035 return false;
21037 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21040 /* Return true if destination reg of SET_BODY is shift count of
21041 USE_BODY. */
21043 static bool
21044 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21046 rtx set_dest;
21047 rtx shift_rtx;
21048 int i;
21050 /* Retrieve destination of SET_BODY. */
21051 switch (GET_CODE (set_body))
21053 case SET:
21054 set_dest = SET_DEST (set_body);
21055 if (!set_dest || !REG_P (set_dest))
21056 return false;
21057 break;
21058 case PARALLEL:
21059 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21060 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21061 use_body))
21062 return true;
21063 /* FALLTHROUGH */
21064 default:
21065 return false;
21068 /* Retrieve shift count of USE_BODY. */
21069 switch (GET_CODE (use_body))
21071 case SET:
21072 shift_rtx = XEXP (use_body, 1);
21073 break;
21074 case PARALLEL:
21075 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21076 if (ix86_dep_by_shift_count_body (set_body,
21077 XVECEXP (use_body, 0, i)))
21078 return true;
21079 /* FALLTHROUGH */
21080 default:
21081 return false;
21084 if (shift_rtx
21085 && (GET_CODE (shift_rtx) == ASHIFT
21086 || GET_CODE (shift_rtx) == LSHIFTRT
21087 || GET_CODE (shift_rtx) == ASHIFTRT
21088 || GET_CODE (shift_rtx) == ROTATE
21089 || GET_CODE (shift_rtx) == ROTATERT))
21091 rtx shift_count = XEXP (shift_rtx, 1);
21093 /* Return true if shift count is dest of SET_BODY. */
21094 if (REG_P (shift_count))
21096 /* Add a check since it can be invoked before register
21097 allocation in the pre-reload scheduler. */
21098 if (reload_completed
21099 && true_regnum (set_dest) == true_regnum (shift_count))
21100 return true;
21101 else if (REGNO(set_dest) == REGNO(shift_count))
21102 return true;
21106 return false;
21109 /* Return true if destination reg of SET_INSN is shift count of
21110 USE_INSN. */
21112 bool
21113 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21115 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21116 PATTERN (use_insn));
21119 /* Return TRUE or FALSE depending on whether the unary operator meets the
21120 appropriate constraints. */
21122 bool
21123 ix86_unary_operator_ok (enum rtx_code,
21124 machine_mode,
21125 rtx operands[2])
21127 /* If one of the operands is memory, the source and destination must match. */
21128 if ((MEM_P (operands[0])
21129 || MEM_P (operands[1]))
21130 && ! rtx_equal_p (operands[0], operands[1]))
21131 return false;
21132 return true;
21135 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21136 are ok, keeping in mind the possible movddup alternative. */
21138 bool
21139 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21141 if (MEM_P (operands[0]))
21142 return rtx_equal_p (operands[0], operands[1 + high]);
21143 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21144 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21145 return true;
21148 /* Post-reload splitter for converting an SF or DFmode value in an
21149 SSE register into an unsigned SImode. */
21151 void
21152 ix86_split_convert_uns_si_sse (rtx operands[])
21154 machine_mode vecmode;
21155 rtx value, large, zero_or_two31, input, two31, x;
21157 large = operands[1];
21158 zero_or_two31 = operands[2];
21159 input = operands[3];
21160 two31 = operands[4];
21161 vecmode = GET_MODE (large);
21162 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21164 /* Load up the value into the low element. We must ensure that the other
21165 elements are valid floats -- zero is the easiest such value. */
21166 if (MEM_P (input))
21168 if (vecmode == V4SFmode)
21169 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21170 else
21171 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21173 else
21175 input = gen_rtx_REG (vecmode, REGNO (input));
21176 emit_move_insn (value, CONST0_RTX (vecmode));
21177 if (vecmode == V4SFmode)
21178 emit_insn (gen_sse_movss (value, value, input));
21179 else
21180 emit_insn (gen_sse2_movsd (value, value, input));
21183 emit_move_insn (large, two31);
21184 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21186 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21187 emit_insn (gen_rtx_SET (large, x));
21189 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21190 emit_insn (gen_rtx_SET (zero_or_two31, x));
21192 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21193 emit_insn (gen_rtx_SET (value, x));
21195 large = gen_rtx_REG (V4SImode, REGNO (large));
21196 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21198 x = gen_rtx_REG (V4SImode, REGNO (value));
21199 if (vecmode == V4SFmode)
21200 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21201 else
21202 emit_insn (gen_sse2_cvttpd2dq (x, value));
21203 value = x;
21205 emit_insn (gen_xorv4si3 (value, value, large));
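/* In scalar terms, the sequence above computes: if value < 2**31 convert
   directly; otherwise subtract 2**31, do the signed truncating conversion,
   and xor the sign bit back in (the comparison mask shifted left by 31),
   yielding the unsigned SImode result.  */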
21208 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21209 Expects the 64-bit DImode to be supplied in a pair of integral
21210 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21211 -mfpmath=sse, !optimize_size only. */
21213 void
21214 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21216 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21217 rtx int_xmm, fp_xmm;
21218 rtx biases, exponents;
21219 rtx x;
21221 int_xmm = gen_reg_rtx (V4SImode);
21222 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21223 emit_insn (gen_movdi_to_sse (int_xmm, input));
21224 else if (TARGET_SSE_SPLIT_REGS)
21226 emit_clobber (int_xmm);
21227 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21229 else
21231 x = gen_reg_rtx (V2DImode);
21232 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21233 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21236 x = gen_rtx_CONST_VECTOR (V4SImode,
21237 gen_rtvec (4, GEN_INT (0x43300000UL),
21238 GEN_INT (0x45300000UL),
21239 const0_rtx, const0_rtx));
21240 exponents = validize_mem (force_const_mem (V4SImode, x));
21242 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21243 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21245 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21246 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21247 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21248 (0x1.0p84 + double(fp_value_hi_xmm)).
21249 Note these exponents differ by 32. */
21251 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21253 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21254 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21255 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21256 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21257 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21258 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21259 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21260 biases = validize_mem (force_const_mem (V2DFmode, biases));
21261 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21263 /* Add the upper and lower DFmode values together. */
21264 if (TARGET_SSE3)
21265 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21266 else
21268 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21269 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21270 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21273 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21276 /* Not used, but eases macroization of patterns. */
21277 void
21278 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21280 gcc_unreachable ();
21283 /* Convert an unsigned SImode value into a DFmode. Only currently used
21284 for SSE, but applicable anywhere. */
21286 void
21287 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21289 REAL_VALUE_TYPE TWO31r;
21290 rtx x, fp;
21292 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21293 NULL, 1, OPTAB_DIRECT);
21295 fp = gen_reg_rtx (DFmode);
21296 emit_insn (gen_floatsidf2 (fp, x));
21298 real_ldexp (&TWO31r, &dconst1, 31);
21299 x = const_double_from_real_value (TWO31r, DFmode);
21301 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21302 if (x != target)
21303 emit_move_insn (target, x);
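/* A rough numeric sketch of the sequence above (assuming 32-bit wraparound
   arithmetic): for input 0xffffffff the PLUS gives the signed value
   0x7fffffff, floatsidf produces 2147483647.0, and adding 0x1.0p31 restores
   4294967295.0; for input 7 it gives -2147483641.0 + 0x1.0p31 = 7.0.
   Both results are exact since the magnitudes stay below 2^53.  */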
21306 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21307 32-bit mode; otherwise we have a direct convert instruction. */
21309 void
21310 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21312 REAL_VALUE_TYPE TWO32r;
21313 rtx fp_lo, fp_hi, x;
21315 fp_lo = gen_reg_rtx (DFmode);
21316 fp_hi = gen_reg_rtx (DFmode);
21318 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21320 real_ldexp (&TWO32r, &dconst1, 32);
21321 x = const_double_from_real_value (TWO32r, DFmode);
21322 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21324 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21326 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21327 0, OPTAB_DIRECT);
21328 if (x != target)
21329 emit_move_insn (target, x);
21332 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21333 For x86_32, -mfpmath=sse, !optimize_size only. */
21334 void
21335 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21337 REAL_VALUE_TYPE ONE16r;
21338 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21340 real_ldexp (&ONE16r, &dconst1, 16);
21341 x = const_double_from_real_value (ONE16r, SFmode);
21342 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21343 NULL, 0, OPTAB_DIRECT);
21344 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21345 NULL, 0, OPTAB_DIRECT);
21346 fp_hi = gen_reg_rtx (SFmode);
21347 fp_lo = gen_reg_rtx (SFmode);
21348 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21349 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21350 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21351 0, OPTAB_DIRECT);
21352 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21353 0, OPTAB_DIRECT);
21354 if (!rtx_equal_p (target, fp_hi))
21355 emit_move_insn (target, fp_hi);
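/* The scalar equivalent of the split above is roughly (a sketch, not the
   emitted code):
     res = (float) (x >> 16) * 65536.0f + (float) (x & 0xffff);
   Each 16-bit half converts to SFmode exactly and the multiply by 2^16 is
   exact, so the only rounding happens in the final addition.  */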
21358 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21359 a vector of unsigned ints VAL to vector of floats TARGET. */
21361 void
21362 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21364 rtx tmp[8];
21365 REAL_VALUE_TYPE TWO16r;
21366 machine_mode intmode = GET_MODE (val);
21367 machine_mode fltmode = GET_MODE (target);
21368 rtx (*cvt) (rtx, rtx);
21370 if (intmode == V4SImode)
21371 cvt = gen_floatv4siv4sf2;
21372 else
21373 cvt = gen_floatv8siv8sf2;
21374 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21375 tmp[0] = force_reg (intmode, tmp[0]);
21376 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21377 OPTAB_DIRECT);
21378 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21379 NULL_RTX, 1, OPTAB_DIRECT);
21380 tmp[3] = gen_reg_rtx (fltmode);
21381 emit_insn (cvt (tmp[3], tmp[1]));
21382 tmp[4] = gen_reg_rtx (fltmode);
21383 emit_insn (cvt (tmp[4], tmp[2]));
21384 real_ldexp (&TWO16r, &dconst1, 16);
21385 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21386 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21387 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21388 OPTAB_DIRECT);
21389 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21390 OPTAB_DIRECT);
21391 if (tmp[7] != target)
21392 emit_move_insn (target, tmp[7]);
21395 /* Adjust a V*SFmode/V*DFmode value VAL so that the *sfix_trunc* resp. fix_trunc*
21396 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21397 This is done by doing just a signed conversion if the value is < 0x1p31, and
21398 otherwise by subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21401 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21403 REAL_VALUE_TYPE TWO31r;
21404 rtx two31r, tmp[4];
21405 machine_mode mode = GET_MODE (val);
21406 machine_mode scalarmode = GET_MODE_INNER (mode);
21407 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21408 rtx (*cmp) (rtx, rtx, rtx, rtx);
21409 int i;
21411 for (i = 0; i < 3; i++)
21412 tmp[i] = gen_reg_rtx (mode);
21413 real_ldexp (&TWO31r, &dconst1, 31);
21414 two31r = const_double_from_real_value (TWO31r, scalarmode);
21415 two31r = ix86_build_const_vector (mode, 1, two31r);
21416 two31r = force_reg (mode, two31r);
21417 switch (mode)
21419 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21420 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21421 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21422 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21423 default: gcc_unreachable ();
21425 tmp[3] = gen_rtx_LE (mode, two31r, val);
21426 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21427 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21428 0, OPTAB_DIRECT);
21429 if (intmode == V4SImode || TARGET_AVX2)
21430 *xorp = expand_simple_binop (intmode, ASHIFT,
21431 gen_lowpart (intmode, tmp[0]),
21432 GEN_INT (31), NULL_RTX, 0,
21433 OPTAB_DIRECT);
21434 else
21436 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21437 two31 = ix86_build_const_vector (intmode, 1, two31);
21438 *xorp = expand_simple_binop (intmode, AND,
21439 gen_lowpart (intmode, tmp[0]),
21440 two31, NULL_RTX, 0,
21441 OPTAB_DIRECT);
21443 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21444 0, OPTAB_DIRECT);
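/* A numeric sketch of the adjustment (values chosen for illustration): for
   a lane holding 3000000000.0 (>= 0x1.0p31) the compare mask is all-ones,
   so the lane becomes 3000000000.0 - 2147483648.0 = 852516352.0 and the
   matching *XORP lane is 0x80000000; the later signed fix_trunc gives
   852516352, and xoring in 0x80000000 restores 3000000000.  A lane below
   0x1.0p31 is left untouched and xored with 0.  */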
21447 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21448 then replicate the value for all elements of the vector
21449 register. */
21452 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21454 int i, n_elt;
21455 rtvec v;
21456 machine_mode scalar_mode;
21458 switch (mode)
21460 case E_V64QImode:
21461 case E_V32QImode:
21462 case E_V16QImode:
21463 case E_V32HImode:
21464 case E_V16HImode:
21465 case E_V8HImode:
21466 case E_V16SImode:
21467 case E_V8SImode:
21468 case E_V4SImode:
21469 case E_V8DImode:
21470 case E_V4DImode:
21471 case E_V2DImode:
21472 gcc_assert (vect);
21473 /* FALLTHRU */
21474 case E_V16SFmode:
21475 case E_V8SFmode:
21476 case E_V4SFmode:
21477 case E_V8DFmode:
21478 case E_V4DFmode:
21479 case E_V2DFmode:
21480 n_elt = GET_MODE_NUNITS (mode);
21481 v = rtvec_alloc (n_elt);
21482 scalar_mode = GET_MODE_INNER (mode);
21484 RTVEC_ELT (v, 0) = value;
21486 for (i = 1; i < n_elt; ++i)
21487 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21489 return gen_rtx_CONST_VECTOR (mode, v);
21491 default:
21492 gcc_unreachable ();
21496 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21497 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21498 for an SSE register. If VECT is true, then replicate the mask for
21499 all elements of the vector register. If INVERT is true, then create
21500 a mask excluding the sign bit. */
21503 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21505 machine_mode vec_mode, imode;
21506 wide_int w;
21507 rtx mask, v;
21509 switch (mode)
21511 case E_V16SImode:
21512 case E_V16SFmode:
21513 case E_V8SImode:
21514 case E_V4SImode:
21515 case E_V8SFmode:
21516 case E_V4SFmode:
21517 vec_mode = mode;
21518 imode = SImode;
21519 break;
21521 case E_V8DImode:
21522 case E_V4DImode:
21523 case E_V2DImode:
21524 case E_V8DFmode:
21525 case E_V4DFmode:
21526 case E_V2DFmode:
21527 vec_mode = mode;
21528 imode = DImode;
21529 break;
21531 case E_TImode:
21532 case E_TFmode:
21533 vec_mode = VOIDmode;
21534 imode = TImode;
21535 break;
21537 default:
21538 gcc_unreachable ();
21541 machine_mode inner_mode = GET_MODE_INNER (mode);
21542 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21543 GET_MODE_BITSIZE (inner_mode));
21544 if (invert)
21545 w = wi::bit_not (w);
21547 /* Force this value into the low part of a fp vector constant. */
21548 mask = immed_wide_int_const (w, imode);
21549 mask = gen_lowpart (inner_mode, mask);
21551 if (vec_mode == VOIDmode)
21552 return force_reg (inner_mode, mask);
21554 v = ix86_build_const_vector (vec_mode, vect, mask);
21555 return force_reg (vec_mode, v);
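/* For reference, the masks produced here are the usual IEEE sign-bit
   patterns per element: 0x80000000 for SFmode/SImode elements and
   0x8000000000000000 for DFmode/DImode elements, or their complements
   (0x7fffffff / 0x7fffffffffffffff) when INVERT is true.  */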
21558 /* Generate code for floating point ABS or NEG. */
21560 void
21561 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21562 rtx operands[])
21564 rtx mask, set, dst, src;
21565 bool use_sse = false;
21566 bool vector_mode = VECTOR_MODE_P (mode);
21567 machine_mode vmode = mode;
21569 if (vector_mode)
21570 use_sse = true;
21571 else if (mode == TFmode)
21572 use_sse = true;
21573 else if (TARGET_SSE_MATH)
21575 use_sse = SSE_FLOAT_MODE_P (mode);
21576 if (mode == SFmode)
21577 vmode = V4SFmode;
21578 else if (mode == DFmode)
21579 vmode = V2DFmode;
21582 /* NEG and ABS performed with SSE use bitwise mask operations.
21583 Create the appropriate mask now. */
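/* The identities used are the standard bit tricks, assuming IEEE
   sign-magnitude layout: NEG is "x ^ sign_mask" and ABS is
   "x & ~sign_mask", which is why ABS asks for the inverted mask.  */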
21584 if (use_sse)
21585 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21586 else
21587 mask = NULL_RTX;
21589 dst = operands[0];
21590 src = operands[1];
21592 set = gen_rtx_fmt_e (code, mode, src);
21593 set = gen_rtx_SET (dst, set);
21595 if (mask)
21597 rtx use, clob;
21598 rtvec par;
21600 use = gen_rtx_USE (VOIDmode, mask);
21601 if (vector_mode)
21602 par = gen_rtvec (2, set, use);
21603 else
21605 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21606 par = gen_rtvec (3, set, use, clob);
21608 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21610 else
21611 emit_insn (set);
21614 /* Expand a copysign operation. Special case operand 0 being a constant. */
21616 void
21617 ix86_expand_copysign (rtx operands[])
21619 machine_mode mode, vmode;
21620 rtx dest, op0, op1, mask, nmask;
21622 dest = operands[0];
21623 op0 = operands[1];
21624 op1 = operands[2];
21626 mode = GET_MODE (dest);
21628 if (mode == SFmode)
21629 vmode = V4SFmode;
21630 else if (mode == DFmode)
21631 vmode = V2DFmode;
21632 else
21633 vmode = mode;
21635 if (CONST_DOUBLE_P (op0))
21637 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21639 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21640 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21642 if (mode == SFmode || mode == DFmode)
21644 if (op0 == CONST0_RTX (mode))
21645 op0 = CONST0_RTX (vmode);
21646 else
21648 rtx v = ix86_build_const_vector (vmode, false, op0);
21650 op0 = force_reg (vmode, v);
21653 else if (op0 != CONST0_RTX (mode))
21654 op0 = force_reg (mode, op0);
21656 mask = ix86_build_signbit_mask (vmode, 0, 0);
21658 if (mode == SFmode)
21659 copysign_insn = gen_copysignsf3_const;
21660 else if (mode == DFmode)
21661 copysign_insn = gen_copysigndf3_const;
21662 else
21663 copysign_insn = gen_copysigntf3_const;
21665 emit_insn (copysign_insn (dest, op0, op1, mask));
21667 else
21669 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21671 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21672 mask = ix86_build_signbit_mask (vmode, 0, 0);
21674 if (mode == SFmode)
21675 copysign_insn = gen_copysignsf3_var;
21676 else if (mode == DFmode)
21677 copysign_insn = gen_copysigndf3_var;
21678 else
21679 copysign_insn = gen_copysigntf3_var;
21681 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
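/* Both variants implement the usual bitwise identity, assuming IEEE
   sign-magnitude layout:
     copysign (x, y) = (x & ~sign_mask) | (y & sign_mask)
   In the constant case the (x & ~sign_mask) half has already been folded
   into a vector constant; the variable case needs both masks at run time.  */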
21685 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21686 be a constant, and so has already been expanded into a vector constant. */
21688 void
21689 ix86_split_copysign_const (rtx operands[])
21691 machine_mode mode, vmode;
21692 rtx dest, op0, mask, x;
21694 dest = operands[0];
21695 op0 = operands[1];
21696 mask = operands[3];
21698 mode = GET_MODE (dest);
21699 vmode = GET_MODE (mask);
21701 dest = lowpart_subreg (vmode, dest, mode);
21702 x = gen_rtx_AND (vmode, dest, mask);
21703 emit_insn (gen_rtx_SET (dest, x));
21705 if (op0 != CONST0_RTX (vmode))
21707 x = gen_rtx_IOR (vmode, dest, op0);
21708 emit_insn (gen_rtx_SET (dest, x));
21712 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21713 so we have to do two masks. */
21715 void
21716 ix86_split_copysign_var (rtx operands[])
21718 machine_mode mode, vmode;
21719 rtx dest, scratch, op0, op1, mask, nmask, x;
21721 dest = operands[0];
21722 scratch = operands[1];
21723 op0 = operands[2];
21724 op1 = operands[3];
21725 nmask = operands[4];
21726 mask = operands[5];
21728 mode = GET_MODE (dest);
21729 vmode = GET_MODE (mask);
21731 if (rtx_equal_p (op0, op1))
21733 /* Shouldn't happen often (it's useless, obviously), but when it does
21734 we'd generate incorrect code if we continue below. */
21735 emit_move_insn (dest, op0);
21736 return;
21739 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21741 gcc_assert (REGNO (op1) == REGNO (scratch));
21743 x = gen_rtx_AND (vmode, scratch, mask);
21744 emit_insn (gen_rtx_SET (scratch, x));
21746 dest = mask;
21747 op0 = lowpart_subreg (vmode, op0, mode);
21748 x = gen_rtx_NOT (vmode, dest);
21749 x = gen_rtx_AND (vmode, x, op0);
21750 emit_insn (gen_rtx_SET (dest, x));
21752 else
21754 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21756 x = gen_rtx_AND (vmode, scratch, mask);
21758 else /* alternative 2,4 */
21760 gcc_assert (REGNO (mask) == REGNO (scratch));
21761 op1 = lowpart_subreg (vmode, op1, mode);
21762 x = gen_rtx_AND (vmode, scratch, op1);
21764 emit_insn (gen_rtx_SET (scratch, x));
21766 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21768 dest = lowpart_subreg (vmode, op0, mode);
21769 x = gen_rtx_AND (vmode, dest, nmask);
21771 else /* alternative 3,4 */
21773 gcc_assert (REGNO (nmask) == REGNO (dest));
21774 dest = nmask;
21775 op0 = lowpart_subreg (vmode, op0, mode);
21776 x = gen_rtx_AND (vmode, dest, op0);
21778 emit_insn (gen_rtx_SET (dest, x));
21781 x = gen_rtx_IOR (vmode, dest, scratch);
21782 emit_insn (gen_rtx_SET (dest, x));
21785 /* Return TRUE or FALSE depending on whether the first SET in INSN
21786 has source and destination with matching CC modes, and whether the
21787 CC mode is at least as constrained as REQ_MODE. */
21789 bool
21790 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21792 rtx set;
21793 machine_mode set_mode;
21795 set = PATTERN (insn);
21796 if (GET_CODE (set) == PARALLEL)
21797 set = XVECEXP (set, 0, 0);
21798 gcc_assert (GET_CODE (set) == SET);
21799 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21801 set_mode = GET_MODE (SET_DEST (set));
21802 switch (set_mode)
21804 case E_CCNOmode:
21805 if (req_mode != CCNOmode
21806 && (req_mode != CCmode
21807 || XEXP (SET_SRC (set), 1) != const0_rtx))
21808 return false;
21809 break;
21810 case E_CCmode:
21811 if (req_mode == CCGCmode)
21812 return false;
21813 /* FALLTHRU */
21814 case E_CCGCmode:
21815 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21816 return false;
21817 /* FALLTHRU */
21818 case E_CCGOCmode:
21819 if (req_mode == CCZmode)
21820 return false;
21821 /* FALLTHRU */
21822 case E_CCZmode:
21823 break;
21825 case E_CCGZmode:
21827 case E_CCAmode:
21828 case E_CCCmode:
21829 case E_CCOmode:
21830 case E_CCPmode:
21831 case E_CCSmode:
21832 if (set_mode != req_mode)
21833 return false;
21834 break;
21836 default:
21837 gcc_unreachable ();
21840 return GET_MODE (SET_SRC (set)) == set_mode;
21843 /* Generate insn patterns to do an integer compare of OPERANDS. */
21845 static rtx
21846 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
21848 machine_mode cmpmode;
21849 rtx tmp, flags;
21851 cmpmode = SELECT_CC_MODE (code, op0, op1);
21852 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
21854 /* This is very simple, but making the interface the same as in the
21855 FP case makes the rest of the code easier. */
21856 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
21857 emit_insn (gen_rtx_SET (flags, tmp));
21859 /* Return the test that should be put into the flags user, i.e.
21860 the bcc, scc, or cmov instruction. */
21861 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
21864 /* Figure out whether to use unordered fp comparisons. */
21866 static bool
21867 ix86_unordered_fp_compare (enum rtx_code code)
21869 if (!TARGET_IEEE_FP)
21870 return false;
21872 switch (code)
21874 case GT:
21875 case GE:
21876 case LT:
21877 case LE:
21878 return false;
21880 case EQ:
21881 case NE:
21883 case LTGT:
21884 case UNORDERED:
21885 case ORDERED:
21886 case UNLT:
21887 case UNLE:
21888 case UNGT:
21889 case UNGE:
21890 case UNEQ:
21891 return true;
21893 default:
21894 gcc_unreachable ();
21898 machine_mode
21899 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
21901 machine_mode mode = GET_MODE (op0);
21903 if (SCALAR_FLOAT_MODE_P (mode))
21905 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
21906 return CCFPmode;
21909 switch (code)
21911 /* Only zero flag is needed. */
21912 case EQ: /* ZF=0 */
21913 case NE: /* ZF!=0 */
21914 return CCZmode;
21915 /* Codes needing carry flag. */
21916 case GEU: /* CF=0 */
21917 case LTU: /* CF=1 */
21918 /* Detect overflow checks. They need just the carry flag. */
21919 if (GET_CODE (op0) == PLUS
21920 && (rtx_equal_p (op1, XEXP (op0, 0))
21921 || rtx_equal_p (op1, XEXP (op0, 1))))
21922 return CCCmode;
21923 else
21924 return CCmode;
21925 case GTU: /* CF=0 & ZF=0 */
21926 case LEU: /* CF=1 | ZF=1 */
21927 return CCmode;
21928 /* Codes possibly doable only with the sign flag when
21929 comparing against zero. */
21930 case GE: /* SF=OF or SF=0 */
21931 case LT: /* SF<>OF or SF=1 */
21932 if (op1 == const0_rtx)
21933 return CCGOCmode;
21934 else
21935 /* For other cases the carry flag is not required. */
21936 return CCGCmode;
21937 /* Codes doable only with the sign flag when comparing
21938 against zero, but for which we lack a jump instruction,
21939 so we need to use relational tests against overflow,
21940 which thus needs to be zero. */
21941 case GT: /* ZF=0 & SF=OF */
21942 case LE: /* ZF=1 | SF<>OF */
21943 if (op1 == const0_rtx)
21944 return CCNOmode;
21945 else
21946 return CCGCmode;
21947 /* The strcmp pattern does (use flags) and combine may ask us for the
21948 proper mode. */
21949 case USE:
21950 return CCmode;
21951 default:
21952 gcc_unreachable ();
21956 /* Return the fixed registers used for condition codes. */
21958 static bool
21959 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
21961 *p1 = FLAGS_REG;
21962 *p2 = FPSR_REG;
21963 return true;
21966 /* If two condition code modes are compatible, return a condition code
21967 mode which is compatible with both. Otherwise, return
21968 VOIDmode. */
21970 static machine_mode
21971 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
21973 if (m1 == m2)
21974 return m1;
21976 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
21977 return VOIDmode;
21979 if ((m1 == CCGCmode && m2 == CCGOCmode)
21980 || (m1 == CCGOCmode && m2 == CCGCmode))
21981 return CCGCmode;
21983 if ((m1 == CCNOmode && m2 == CCGOCmode)
21984 || (m1 == CCGOCmode && m2 == CCNOmode))
21985 return CCNOmode;
21987 if (m1 == CCZmode
21988 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
21989 return m2;
21990 else if (m2 == CCZmode
21991 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
21992 return m1;
21994 switch (m1)
21996 default:
21997 gcc_unreachable ();
21999 case E_CCmode:
22000 case E_CCGCmode:
22001 case E_CCGOCmode:
22002 case E_CCNOmode:
22003 case E_CCAmode:
22004 case E_CCCmode:
22005 case E_CCOmode:
22006 case E_CCPmode:
22007 case E_CCSmode:
22008 case E_CCZmode:
22009 switch (m2)
22011 default:
22012 return VOIDmode;
22014 case E_CCmode:
22015 case E_CCGCmode:
22016 case E_CCGOCmode:
22017 case E_CCNOmode:
22018 case E_CCAmode:
22019 case E_CCCmode:
22020 case E_CCOmode:
22021 case E_CCPmode:
22022 case E_CCSmode:
22023 case E_CCZmode:
22024 return CCmode;
22027 case E_CCFPmode:
22028 /* These are only compatible with themselves, which we already
22029 checked above. */
22030 return VOIDmode;
22035 /* Return a comparison we can do that is equivalent to
22036 swap_condition (code), except possibly for orderedness.
22037 But never change orderedness if TARGET_IEEE_FP, returning
22038 UNKNOWN in that case if necessary. */
22040 static enum rtx_code
22041 ix86_fp_swap_condition (enum rtx_code code)
22043 switch (code)
22045 case GT: /* GTU - CF=0 & ZF=0 */
22046 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22047 case GE: /* GEU - CF=0 */
22048 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22049 case UNLT: /* LTU - CF=1 */
22050 return TARGET_IEEE_FP ? UNKNOWN : GT;
22051 case UNLE: /* LEU - CF=1 | ZF=1 */
22052 return TARGET_IEEE_FP ? UNKNOWN : GE;
22053 default:
22054 return swap_condition (code);
22058 /* Return the cost of comparison CODE using the best strategy for performance.
22059 All of the following functions use the number of instructions as the cost metric.
22060 In the future this should be tweaked to compute bytes for optimize_size and
22061 take into account the performance of various instructions on various CPUs. */
22063 static int
22064 ix86_fp_comparison_cost (enum rtx_code code)
22066 int arith_cost;
22068 /* The cost of code using bit-twiddling on %ah. */
22069 switch (code)
22071 case UNLE:
22072 case UNLT:
22073 case LTGT:
22074 case GT:
22075 case GE:
22076 case UNORDERED:
22077 case ORDERED:
22078 case UNEQ:
22079 arith_cost = 4;
22080 break;
22081 case LT:
22082 case NE:
22083 case EQ:
22084 case UNGE:
22085 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22086 break;
22087 case LE:
22088 case UNGT:
22089 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22090 break;
22091 default:
22092 gcc_unreachable ();
22095 switch (ix86_fp_comparison_strategy (code))
22097 case IX86_FPCMP_COMI:
22098 return arith_cost > 4 ? 3 : 2;
22099 case IX86_FPCMP_SAHF:
22100 return arith_cost > 4 ? 4 : 3;
22101 default:
22102 return arith_cost;
22106 /* Return the strategy to use for floating-point comparisons. We assume that
22107 fcomi is always preferable where available, since that is also true when looking
22108 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22110 enum ix86_fpcmp_strategy
22111 ix86_fp_comparison_strategy (enum rtx_code)
22113 /* Do fcomi/sahf based test when profitable. */
22115 if (TARGET_CMOVE)
22116 return IX86_FPCMP_COMI;
22118 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22119 return IX86_FPCMP_SAHF;
22121 return IX86_FPCMP_ARITH;
22124 /* Swap, force into registers, or otherwise massage the two operands
22125 to a fp comparison. The operands are updated in place; the new
22126 comparison code is returned. */
22128 static enum rtx_code
22129 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22131 bool unordered_compare = ix86_unordered_fp_compare (code);
22132 rtx op0 = *pop0, op1 = *pop1;
22133 machine_mode op_mode = GET_MODE (op0);
22134 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22136 /* All of the unordered compare instructions only work on registers.
22137 The same is true of the fcomi compare instructions. The XFmode
22138 compare instructions require registers except when comparing
22139 against zero or when converting operand 1 from fixed point to
22140 floating point. */
22142 if (!is_sse
22143 && (unordered_compare
22144 || (op_mode == XFmode
22145 && ! (standard_80387_constant_p (op0) == 1
22146 || standard_80387_constant_p (op1) == 1)
22147 && GET_CODE (op1) != FLOAT)
22148 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22150 op0 = force_reg (op_mode, op0);
22151 op1 = force_reg (op_mode, op1);
22153 else
22155 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22156 things around if they appear profitable, otherwise force op0
22157 into a register. */
22159 if (standard_80387_constant_p (op0) == 0
22160 || (MEM_P (op0)
22161 && ! (standard_80387_constant_p (op1) == 0
22162 || MEM_P (op1))))
22164 enum rtx_code new_code = ix86_fp_swap_condition (code);
22165 if (new_code != UNKNOWN)
22167 std::swap (op0, op1);
22168 code = new_code;
22172 if (!REG_P (op0))
22173 op0 = force_reg (op_mode, op0);
22175 if (CONSTANT_P (op1))
22177 int tmp = standard_80387_constant_p (op1);
22178 if (tmp == 0)
22179 op1 = validize_mem (force_const_mem (op_mode, op1));
22180 else if (tmp == 1)
22182 if (TARGET_CMOVE)
22183 op1 = force_reg (op_mode, op1);
22185 else
22186 op1 = force_reg (op_mode, op1);
22190 /* Try to rearrange the comparison to make it cheaper. */
22191 if (ix86_fp_comparison_cost (code)
22192 > ix86_fp_comparison_cost (swap_condition (code))
22193 && (REG_P (op1) || can_create_pseudo_p ()))
22195 std::swap (op0, op1);
22196 code = swap_condition (code);
22197 if (!REG_P (op0))
22198 op0 = force_reg (op_mode, op0);
22201 *pop0 = op0;
22202 *pop1 = op1;
22203 return code;
22206 /* Convert the comparison codes we use to represent an FP comparison to the
22207 integer code that will result in a proper branch. Return UNKNOWN if no
22208 such code is available. */
22210 enum rtx_code
22211 ix86_fp_compare_code_to_integer (enum rtx_code code)
22213 switch (code)
22215 case GT:
22216 return GTU;
22217 case GE:
22218 return GEU;
22219 case ORDERED:
22220 case UNORDERED:
22221 return code;
22222 case UNEQ:
22223 return EQ;
22224 case UNLT:
22225 return LTU;
22226 case UNLE:
22227 return LEU;
22228 case LTGT:
22229 return NE;
22230 default:
22231 return UNKNOWN;
22235 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22237 static rtx
22238 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22240 bool unordered_compare = ix86_unordered_fp_compare (code);
22241 machine_mode intcmp_mode;
22242 rtx tmp, tmp2;
22244 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22246 /* Do fcomi/sahf based test when profitable. */
22247 switch (ix86_fp_comparison_strategy (code))
22249 case IX86_FPCMP_COMI:
22250 intcmp_mode = CCFPmode;
22251 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22252 if (unordered_compare)
22253 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22254 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22255 break;
22257 case IX86_FPCMP_SAHF:
22258 intcmp_mode = CCFPmode;
22259 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22260 if (unordered_compare)
22261 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22262 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22263 if (!scratch)
22264 scratch = gen_reg_rtx (HImode);
22265 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22266 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22267 break;
22269 case IX86_FPCMP_ARITH:
22270 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22271 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22272 if (unordered_compare)
22273 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22274 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22275 if (!scratch)
22276 scratch = gen_reg_rtx (HImode);
22277 emit_insn (gen_rtx_SET (scratch, tmp));
22279 /* In the unordered case, we have to check C2 for NaN's, which
22280 doesn't happen to work out to anything nice combination-wise.
22281 So do some bit twiddling on the value we've got in AH to come
22282 up with an appropriate set of condition codes. */
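/* For reference: after fnstsw the x87 condition bits land in %ah as
   C0 = 0x01, C2 = 0x04 and C3 = 0x40, and fcom sets them to
   "greater" = 000, "less" = C0, "equal" = C3, "unordered" = C0|C2|C3.
   That is where the 0x45, 0x44, 0x05, 0x40, 0x04 and 0x01 masks below
   come from.  */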
22284 intcmp_mode = CCNOmode;
22285 switch (code)
22287 case GT:
22288 case UNGT:
22289 if (code == GT || !TARGET_IEEE_FP)
22291 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22292 code = EQ;
22294 else
22296 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22297 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22298 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22299 intcmp_mode = CCmode;
22300 code = GEU;
22302 break;
22303 case LT:
22304 case UNLT:
22305 if (code == LT && TARGET_IEEE_FP)
22307 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22308 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22309 intcmp_mode = CCmode;
22310 code = EQ;
22312 else
22314 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22315 code = NE;
22317 break;
22318 case GE:
22319 case UNGE:
22320 if (code == GE || !TARGET_IEEE_FP)
22322 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22323 code = EQ;
22325 else
22327 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22328 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22329 code = NE;
22331 break;
22332 case LE:
22333 case UNLE:
22334 if (code == LE && TARGET_IEEE_FP)
22336 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22337 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22338 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22339 intcmp_mode = CCmode;
22340 code = LTU;
22342 else
22344 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22345 code = NE;
22347 break;
22348 case EQ:
22349 case UNEQ:
22350 if (code == EQ && TARGET_IEEE_FP)
22352 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22353 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22354 intcmp_mode = CCmode;
22355 code = EQ;
22357 else
22359 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22360 code = NE;
22362 break;
22363 case NE:
22364 case LTGT:
22365 if (code == NE && TARGET_IEEE_FP)
22367 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22368 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22369 GEN_INT (0x40)));
22370 code = NE;
22372 else
22374 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22375 code = EQ;
22377 break;
22379 case UNORDERED:
22380 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22381 code = NE;
22382 break;
22383 case ORDERED:
22384 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22385 code = EQ;
22386 break;
22388 default:
22389 gcc_unreachable ();
22391 break;
22393 default:
22394 gcc_unreachable();
22397 /* Return the test that should be put into the flags user, i.e.
22398 the bcc, scc, or cmov instruction. */
22399 return gen_rtx_fmt_ee (code, VOIDmode,
22400 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22401 const0_rtx);
22404 static rtx
22405 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22407 rtx ret;
22409 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22410 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22412 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22414 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22415 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22417 else
22418 ret = ix86_expand_int_compare (code, op0, op1);
22420 return ret;
22423 void
22424 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22426 machine_mode mode = GET_MODE (op0);
22427 rtx tmp;
22429 /* Handle the special case of a vector comparison with a boolean result;
22430 transform it using the ptest instruction. */
22431 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22433 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22434 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22436 gcc_assert (code == EQ || code == NE);
22437 /* Generate XOR since we can't check that one operand is zero vector. */
22438 tmp = gen_reg_rtx (mode);
22439 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22440 tmp = gen_lowpart (p_mode, tmp);
22441 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22442 gen_rtx_UNSPEC (CCmode,
22443 gen_rtvec (2, tmp, tmp),
22444 UNSPEC_PTEST)));
22445 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22446 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22447 gen_rtx_LABEL_REF (VOIDmode, label),
22448 pc_rtx);
22449 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22450 return;
22453 switch (mode)
22455 case E_SFmode:
22456 case E_DFmode:
22457 case E_XFmode:
22458 case E_QImode:
22459 case E_HImode:
22460 case E_SImode:
22461 simple:
22462 tmp = ix86_expand_compare (code, op0, op1);
22463 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22464 gen_rtx_LABEL_REF (VOIDmode, label),
22465 pc_rtx);
22466 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22467 return;
22469 case E_DImode:
22470 if (TARGET_64BIT)
22471 goto simple;
22472 /* For a 32-bit target, a DImode comparison may be performed in
22473 SSE registers. To allow this we must avoid splitting into
22474 SImode, which is achieved by doing the xor in DImode
22475 and then comparing with zero (which is recognized by the
22476 STV pass). We don't compare using xor when optimizing
22477 for size. */
22478 if (!optimize_insn_for_size_p ()
22479 && TARGET_STV
22480 && (code == EQ || code == NE))
22482 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22483 op1 = const0_rtx;
22485 /* FALLTHRU */
22486 case E_TImode:
22487 /* Expand a double-word (DImode/TImode) branch into multiple compare+branch. */
22489 rtx lo[2], hi[2];
22490 rtx_code_label *label2;
22491 enum rtx_code code1, code2, code3;
22492 machine_mode submode;
22494 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22496 std::swap (op0, op1);
22497 code = swap_condition (code);
22500 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22501 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22503 submode = mode == DImode ? SImode : DImode;
22505 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22506 avoid two branches. This costs one extra insn, so disable when
22507 optimizing for size. */
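/* I.e. a single test of the form
     ((hi0 ^ hi1) | (lo0 ^ lo1)) == 0
   which is zero exactly when both halves match, so one compare-with-zero
   branch replaces two compare+branch pairs.  */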
22509 if ((code == EQ || code == NE)
22510 && (!optimize_insn_for_size_p ()
22511 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22513 rtx xor0, xor1;
22515 xor1 = hi[0];
22516 if (hi[1] != const0_rtx)
22517 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22518 NULL_RTX, 0, OPTAB_WIDEN);
22520 xor0 = lo[0];
22521 if (lo[1] != const0_rtx)
22522 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22523 NULL_RTX, 0, OPTAB_WIDEN);
22525 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22526 NULL_RTX, 0, OPTAB_WIDEN);
22528 ix86_expand_branch (code, tmp, const0_rtx, label);
22529 return;
22532 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
22533 op1 is a constant and its low word is zero, then we can just
22534 examine the high word. Similarly for a low word of -1 and
22535 less-or-equal or greater-than. */
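/* For example (an illustration only): the unsigned test a < 0x500000000
   has a zero low word and is equivalent to hi(a) < 5 -- if the high words
   differ the low words are irrelevant, and if they are equal the strict
   comparison is false anyway.  */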
22537 if (CONST_INT_P (hi[1]))
22538 switch (code)
22540 case LT: case LTU: case GE: case GEU:
22541 if (lo[1] == const0_rtx)
22543 ix86_expand_branch (code, hi[0], hi[1], label);
22544 return;
22546 break;
22547 case LE: case LEU: case GT: case GTU:
22548 if (lo[1] == constm1_rtx)
22550 ix86_expand_branch (code, hi[0], hi[1], label);
22551 return;
22553 break;
22554 default:
22555 break;
22558 /* Emulate comparisons that do not depend on Zero flag with
22559 double-word subtraction. Note that only Overflow, Sign
22560 and Carry flags are valid, so swap arguments and condition
22561 of comparisons that would otherwise test Zero flag. */
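/* The resulting sequence is a compare of the low words followed by a
   subtract-with-borrow of the high words into a scratch; the flags of that
   final subtraction describe the full double-word difference, so a plain
   signed/unsigned "less than" test on them implements the comparison.
   Only OF, SF and CF are meaningful there, which is why the codes that
   would use the Zero flag are first swapped into LT/LTU/GE/GEU.  */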
22563 switch (code)
22565 case LE: case LEU: case GT: case GTU:
22566 std::swap (lo[0], lo[1]);
22567 std::swap (hi[0], hi[1]);
22568 code = swap_condition (code);
22569 /* FALLTHRU */
22571 case LT: case LTU: case GE: case GEU:
22573 rtx (*cmp_insn) (rtx, rtx);
22574 rtx (*sbb_insn) (rtx, rtx, rtx);
22575 bool uns = (code == LTU || code == GEU);
22577 if (TARGET_64BIT)
22579 cmp_insn = gen_cmpdi_1;
22580 sbb_insn
22581 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
22583 else
22585 cmp_insn = gen_cmpsi_1;
22586 sbb_insn
22587 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
22590 if (!nonimmediate_operand (lo[0], submode))
22591 lo[0] = force_reg (submode, lo[0]);
22592 if (!x86_64_general_operand (lo[1], submode))
22593 lo[1] = force_reg (submode, lo[1]);
22595 if (!register_operand (hi[0], submode))
22596 hi[0] = force_reg (submode, hi[0]);
22597 if ((uns && !nonimmediate_operand (hi[1], submode))
22598 || (!uns && !x86_64_general_operand (hi[1], submode)))
22599 hi[1] = force_reg (submode, hi[1]);
22601 emit_insn (cmp_insn (lo[0], lo[1]));
22602 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
22604 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
22606 ix86_expand_branch (code, tmp, const0_rtx, label);
22607 return;
22610 default:
22611 break;
22614 /* Otherwise, we need two or three jumps. */
22616 label2 = gen_label_rtx ();
22618 code1 = code;
22619 code2 = swap_condition (code);
22620 code3 = unsigned_condition (code);
22622 switch (code)
22624 case LT: case GT: case LTU: case GTU:
22625 break;
22627 case LE: code1 = LT; code2 = GT; break;
22628 case GE: code1 = GT; code2 = LT; break;
22629 case LEU: code1 = LTU; code2 = GTU; break;
22630 case GEU: code1 = GTU; code2 = LTU; break;
22632 case EQ: code1 = UNKNOWN; code2 = NE; break;
22633 case NE: code2 = UNKNOWN; break;
22635 default:
22636 gcc_unreachable ();
22640 * a < b =>
22641 * if (hi(a) < hi(b)) goto true;
22642 * if (hi(a) > hi(b)) goto false;
22643 * if (lo(a) < lo(b)) goto true;
22644 * false:
22647 if (code1 != UNKNOWN)
22648 ix86_expand_branch (code1, hi[0], hi[1], label);
22649 if (code2 != UNKNOWN)
22650 ix86_expand_branch (code2, hi[0], hi[1], label2);
22652 ix86_expand_branch (code3, lo[0], lo[1], label);
22654 if (code2 != UNKNOWN)
22655 emit_label (label2);
22656 return;
22659 default:
22660 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22661 goto simple;
22665 void
22666 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22668 rtx ret;
22670 gcc_assert (GET_MODE (dest) == QImode);
22672 ret = ix86_expand_compare (code, op0, op1);
22673 PUT_MODE (ret, QImode);
22674 emit_insn (gen_rtx_SET (dest, ret));
22677 /* Expand a comparison setting or clearing the carry flag. Return true when
22678 successful and set *POP to the comparison for the operation. */
22679 static bool
22680 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22682 machine_mode mode =
22683 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22685 /* Do not handle double-mode compares that go through the special path. */
22686 if (mode == (TARGET_64BIT ? TImode : DImode))
22687 return false;
22689 if (SCALAR_FLOAT_MODE_P (mode))
22691 rtx compare_op;
22692 rtx_insn *compare_seq;
22694 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22696 /* Shortcut: the following common codes never translate
22697 into carry flag compares. */
22698 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22699 || code == ORDERED || code == UNORDERED)
22700 return false;
22702 /* These comparisons require the zero flag; swap operands so they won't. */
22703 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22704 && !TARGET_IEEE_FP)
22706 std::swap (op0, op1);
22707 code = swap_condition (code);
22710 /* Try to expand the comparison and verify that we end up with a
22711 carry-flag-based comparison. This fails only when we decide to
22712 expand the comparison using arithmetic, which is not a very
22713 common scenario. */
22714 start_sequence ();
22715 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22716 compare_seq = get_insns ();
22717 end_sequence ();
22719 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
22720 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22721 else
22722 code = GET_CODE (compare_op);
22724 if (code != LTU && code != GEU)
22725 return false;
22727 emit_insn (compare_seq);
22728 *pop = compare_op;
22729 return true;
22732 if (!INTEGRAL_MODE_P (mode))
22733 return false;
22735 switch (code)
22737 case LTU:
22738 case GEU:
22739 break;
22741 /* Convert a==0 into (unsigned)a<1. */
22742 case EQ:
22743 case NE:
22744 if (op1 != const0_rtx)
22745 return false;
22746 op1 = const1_rtx;
22747 code = (code == EQ ? LTU : GEU);
22748 break;
22750 /* Convert a>b into b<a or a>=b+1. */
22751 case GTU:
22752 case LEU:
22753 if (CONST_INT_P (op1))
22755 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22756 /* Bail out on overflow. We could still swap the operands, but that
22757 would force loading of the constant into a register. */
22758 if (op1 == const0_rtx
22759 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22760 return false;
22761 code = (code == GTU ? GEU : LTU);
22763 else
22765 std::swap (op0, op1);
22766 code = (code == GTU ? LTU : GEU);
22768 break;
22770 /* Convert a>=0 into (unsigned)a<0x80000000. */
22771 case LT:
22772 case GE:
22773 if (mode == DImode || op1 != const0_rtx)
22774 return false;
22775 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22776 code = (code == LT ? GEU : LTU);
22777 break;
22778 case LE:
22779 case GT:
22780 if (mode == DImode || op1 != constm1_rtx)
22781 return false;
22782 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22783 code = (code == LE ? GEU : LTU);
22784 break;
22786 default:
22787 return false;
22789 /* Swapping operands may cause a constant to appear as the first operand. */
22790 if (!nonimmediate_operand (op0, VOIDmode))
22792 if (!can_create_pseudo_p ())
22793 return false;
22794 op0 = force_reg (mode, op0);
22796 *pop = ix86_expand_compare (code, op0, op1);
22797 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22798 return true;
22801 bool
22802 ix86_expand_int_movcc (rtx operands[])
22804 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22805 rtx_insn *compare_seq;
22806 rtx compare_op;
22807 machine_mode mode = GET_MODE (operands[0]);
22808 bool sign_bit_compare_p = false;
22809 rtx op0 = XEXP (operands[1], 0);
22810 rtx op1 = XEXP (operands[1], 1);
22812 if (GET_MODE (op0) == TImode
22813 || (GET_MODE (op0) == DImode
22814 && !TARGET_64BIT))
22815 return false;
22817 start_sequence ();
22818 compare_op = ix86_expand_compare (code, op0, op1);
22819 compare_seq = get_insns ();
22820 end_sequence ();
22822 compare_code = GET_CODE (compare_op);
22824 if ((op1 == const0_rtx && (code == GE || code == LT))
22825 || (op1 == constm1_rtx && (code == GT || code == LE)))
22826 sign_bit_compare_p = true;
22828 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22829 HImode insns, we'd be swallowed in word prefix ops. */
22831 if ((mode != HImode || TARGET_FAST_PREFIX)
22832 && (mode != (TARGET_64BIT ? TImode : DImode))
22833 && CONST_INT_P (operands[2])
22834 && CONST_INT_P (operands[3]))
22836 rtx out = operands[0];
22837 HOST_WIDE_INT ct = INTVAL (operands[2]);
22838 HOST_WIDE_INT cf = INTVAL (operands[3]);
22839 HOST_WIDE_INT diff;
22841 diff = ct - cf;
22842 /* Sign bit compares are better done using shifts than by using
22843 sbb. */
22844 if (sign_bit_compare_p
22845 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22847 /* Detect overlap between destination and compare sources. */
22848 rtx tmp = out;
22850 if (!sign_bit_compare_p)
22852 rtx flags;
22853 bool fpcmp = false;
22855 compare_code = GET_CODE (compare_op);
22857 flags = XEXP (compare_op, 0);
22859 if (GET_MODE (flags) == CCFPmode)
22861 fpcmp = true;
22862 compare_code
22863 = ix86_fp_compare_code_to_integer (compare_code);
22866 /* To simplify the rest of the code, restrict to the GEU case. */
22867 if (compare_code == LTU)
22869 std::swap (ct, cf);
22870 compare_code = reverse_condition (compare_code);
22871 code = reverse_condition (code);
22873 else
22875 if (fpcmp)
22876 PUT_CODE (compare_op,
22877 reverse_condition_maybe_unordered
22878 (GET_CODE (compare_op)));
22879 else
22880 PUT_CODE (compare_op,
22881 reverse_condition (GET_CODE (compare_op)));
22883 diff = ct - cf;
22885 if (reg_overlap_mentioned_p (out, op0)
22886 || reg_overlap_mentioned_p (out, op1))
22887 tmp = gen_reg_rtx (mode);
22889 if (mode == DImode)
22890 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
22891 else
22892 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
22893 flags, compare_op));
22895 else
22897 if (code == GT || code == GE)
22898 code = reverse_condition (code);
22899 else
22901 std::swap (ct, cf);
22902 diff = ct - cf;
22904 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
22907 if (diff == 1)
22910 * cmpl op0,op1
22911 * sbbl dest,dest
22912 * [addl dest, ct]
22914 * Size 5 - 8.
22916 if (ct)
22917 tmp = expand_simple_binop (mode, PLUS,
22918 tmp, GEN_INT (ct),
22919 copy_rtx (tmp), 1, OPTAB_DIRECT);
22921 else if (cf == -1)
22924 * cmpl op0,op1
22925 * sbbl dest,dest
22926 * orl $ct, dest
22928 * Size 8.
22930 tmp = expand_simple_binop (mode, IOR,
22931 tmp, GEN_INT (ct),
22932 copy_rtx (tmp), 1, OPTAB_DIRECT);
22934 else if (diff == -1 && ct)
22937 * cmpl op0,op1
22938 * sbbl dest,dest
22939 * notl dest
22940 * [addl dest, cf]
22942 * Size 8 - 11.
22944 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22945 if (cf)
22946 tmp = expand_simple_binop (mode, PLUS,
22947 copy_rtx (tmp), GEN_INT (cf),
22948 copy_rtx (tmp), 1, OPTAB_DIRECT);
22950 else
22953 * cmpl op0,op1
22954 * sbbl dest,dest
22955 * [notl dest]
22956 * andl cf - ct, dest
22957 * [addl dest, ct]
22959 * Size 8 - 11.
22962 if (cf == 0)
22964 cf = ct;
22965 ct = 0;
22966 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22969 tmp = expand_simple_binop (mode, AND,
22970 copy_rtx (tmp),
22971 gen_int_mode (cf - ct, mode),
22972 copy_rtx (tmp), 1, OPTAB_DIRECT);
22973 if (ct)
22974 tmp = expand_simple_binop (mode, PLUS,
22975 copy_rtx (tmp), GEN_INT (ct),
22976 copy_rtx (tmp), 1, OPTAB_DIRECT);
22979 if (!rtx_equal_p (tmp, out))
22980 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
22982 return true;
22985 if (diff < 0)
22987 machine_mode cmp_mode = GET_MODE (op0);
22988 enum rtx_code new_code;
22990 if (SCALAR_FLOAT_MODE_P (cmp_mode))
22992 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
22994 /* We may be reversing an unordered compare to a normal compare, which
22995 is not valid in general (we may convert a non-trapping condition
22996 into a trapping one); however, on i386 we currently emit all
22997 comparisons unordered. */
22998 new_code = reverse_condition_maybe_unordered (code);
23000 else
23001 new_code = ix86_reverse_condition (code, cmp_mode);
23002 if (new_code != UNKNOWN)
23004 std::swap (ct, cf);
23005 diff = -diff;
23006 code = new_code;
23010 compare_code = UNKNOWN;
23011 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23012 && CONST_INT_P (op1))
23014 if (op1 == const0_rtx
23015 && (code == LT || code == GE))
23016 compare_code = code;
23017 else if (op1 == constm1_rtx)
23019 if (code == LE)
23020 compare_code = LT;
23021 else if (code == GT)
23022 compare_code = GE;
23026 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23027 if (compare_code != UNKNOWN
23028 && GET_MODE (op0) == GET_MODE (out)
23029 && (cf == -1 || ct == -1))
23031 /* If lea code below could be used, only optimize
23032 if it results in a 2 insn sequence. */
23034 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23035 || diff == 3 || diff == 5 || diff == 9)
23036 || (compare_code == LT && ct == -1)
23037 || (compare_code == GE && cf == -1))
23040 * notl op1 (if necessary)
23041 * sarl $31, op1
23042 * orl cf, op1
23044 if (ct != -1)
23046 cf = ct;
23047 ct = -1;
23048 code = reverse_condition (code);
23051 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23053 out = expand_simple_binop (mode, IOR,
23054 out, GEN_INT (cf),
23055 out, 1, OPTAB_DIRECT);
23056 if (out != operands[0])
23057 emit_move_insn (operands[0], out);
23059 return true;
23064 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23065 || diff == 3 || diff == 5 || diff == 9)
23066 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23067 && (mode != DImode
23068 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23071 * xorl dest,dest
23072 * cmpl op1,op2
23073 * setcc dest
23074 * lea cf(dest*(ct-cf)),dest
23076 * Size 14.
23078 * This also catches the degenerate setcc-only case.
23081 rtx tmp;
23082 int nops;
23084 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23086 nops = 0;
23087 /* On x86_64 the lea instruction operates on Pmode, so we need
23088 the arithmetic done in the proper mode to match. */
23089 if (diff == 1)
23090 tmp = copy_rtx (out);
23091 else
23093 rtx out1;
23094 out1 = copy_rtx (out);
23095 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23096 nops++;
23097 if (diff & 1)
23099 tmp = gen_rtx_PLUS (mode, tmp, out1);
23100 nops++;
23103 if (cf != 0)
23105 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23106 nops++;
23108 if (!rtx_equal_p (tmp, out))
23110 if (nops == 1)
23111 out = force_operand (tmp, copy_rtx (out));
23112 else
23113 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23115 if (!rtx_equal_p (out, operands[0]))
23116 emit_move_insn (operands[0], copy_rtx (out));
23118 return true;
23122 * General case: Jumpful:
23123 * xorl dest,dest cmpl op1, op2
23124 * cmpl op1, op2 movl ct, dest
23125 * setcc dest jcc 1f
23126 * decl dest movl cf, dest
23127 * andl (cf-ct),dest 1:
23128 * addl ct,dest
23130 * Size 20. Size 14.
23132 * This is reasonably steep, but branch mispredict costs are
23133 * high on modern CPUs, so consider failing only if optimizing
23134 * for space.
23137 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23138 && BRANCH_COST (optimize_insn_for_speed_p (),
23139 false) >= 2)
23141 if (cf == 0)
23143 machine_mode cmp_mode = GET_MODE (op0);
23144 enum rtx_code new_code;
23146 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23148 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23150 /* We may be reversing an unordered compare to a normal compare,
23151 which is not valid in general (we may convert a non-trapping
23152 condition into a trapping one); however, on i386 we currently
23153 emit all comparisons unordered. */
23154 new_code = reverse_condition_maybe_unordered (code);
23156 else
23158 new_code = ix86_reverse_condition (code, cmp_mode);
23159 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23160 compare_code = reverse_condition (compare_code);
23163 if (new_code != UNKNOWN)
23165 cf = ct;
23166 ct = 0;
23167 code = new_code;
23171 if (compare_code != UNKNOWN)
23173 /* notl op1 (if needed)
23174 sarl $31, op1
23175 andl (cf-ct), op1
23176 addl ct, op1
23178 For x < 0 (resp. x <= -1) there will be no notl,
23179 so if possible swap the constants to get rid of the
23180 complement.
23181 True/false will be -1/0 while code below (store flag
23182 followed by decrement) is 0/-1, so the constants need
23183 to be exchanged once more. */
23185 if (compare_code == GE || !cf)
23187 code = reverse_condition (code);
23188 compare_code = LT;
23190 else
23191 std::swap (ct, cf);
23193 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23195 else
23197 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23199 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23200 constm1_rtx,
23201 copy_rtx (out), 1, OPTAB_DIRECT);
23204 out = expand_simple_binop (mode, AND, copy_rtx (out),
23205 gen_int_mode (cf - ct, mode),
23206 copy_rtx (out), 1, OPTAB_DIRECT);
23207 if (ct)
23208 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23209 copy_rtx (out), 1, OPTAB_DIRECT);
23210 if (!rtx_equal_p (out, operands[0]))
23211 emit_move_insn (operands[0], copy_rtx (out));
23213 return true;
23217 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23219 /* Try a few things more with specific constants and a variable. */
23221 optab op;
23222 rtx var, orig_out, out, tmp;
23224 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23225 return false;
23227 /* If one of the two operands is an interesting constant, load a
23228 constant with the above and mask it in with a logical operation. */
23230 if (CONST_INT_P (operands[2]))
23232 var = operands[3];
23233 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23234 operands[3] = constm1_rtx, op = and_optab;
23235 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23236 operands[3] = const0_rtx, op = ior_optab;
23237 else
23238 return false;
23240 else if (CONST_INT_P (operands[3]))
23242 var = operands[2];
23243 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23244 operands[2] = constm1_rtx, op = and_optab;
23245 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23246 operands[2] = const0_rtx, op = ior_optab;
23247 else
23248 return false;
23250 else
23251 return false;
23253 orig_out = operands[0];
23254 tmp = gen_reg_rtx (mode);
23255 operands[0] = tmp;
23257 /* Recurse to get the constant loaded. */
23258 if (!ix86_expand_int_movcc (operands))
23259 return false;
23261 /* Mask in the interesting variable. */
23262 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23263 OPTAB_WIDEN);
23264 if (!rtx_equal_p (out, orig_out))
23265 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23267 return true;
23271 * For comparison with above,
23273 * movl cf,dest
23274 * movl ct,tmp
23275 * cmpl op1,op2
23276 * cmovcc tmp,dest
23278 * Size 15.
23281 if (! nonimmediate_operand (operands[2], mode))
23282 operands[2] = force_reg (mode, operands[2]);
23283 if (! nonimmediate_operand (operands[3], mode))
23284 operands[3] = force_reg (mode, operands[3]);
23286 if (! register_operand (operands[2], VOIDmode)
23287 && (mode == QImode
23288 || ! register_operand (operands[3], VOIDmode)))
23289 operands[2] = force_reg (mode, operands[2]);
23291 if (mode == QImode
23292 && ! register_operand (operands[3], VOIDmode))
23293 operands[3] = force_reg (mode, operands[3]);
23295 emit_insn (compare_seq);
23296 emit_insn (gen_rtx_SET (operands[0],
23297 gen_rtx_IF_THEN_ELSE (mode,
23298 compare_op, operands[2],
23299 operands[3])));
23300 return true;
23303 /* Swap, force into registers, or otherwise massage the two operands
23304 to an SSE comparison with a mask result. Thus we differ a bit from
23305 ix86_prepare_fp_compare_args, which expects to produce a flags result.
23307 The DEST operand exists to help determine whether to commute commutative
23308 operators. The POP0/POP1 operands are updated in place. The new
23309 comparison code is returned, or UNKNOWN if not implementable. */
23311 static enum rtx_code
23312 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23313 rtx *pop0, rtx *pop1)
23315 switch (code)
23317 case LTGT:
23318 case UNEQ:
23319 /* AVX supports all the needed comparisons. */
23320 if (TARGET_AVX)
23321 break;
23322 /* We have no LTGT as an operator. We could implement it with
23323 NE & ORDERED, but this requires an extra temporary. It's
23324 not clear that it's worth it. */
23325 return UNKNOWN;
23327 case LT:
23328 case LE:
23329 case UNGT:
23330 case UNGE:
23331 /* These are supported directly. */
23332 break;
23334 case EQ:
23335 case NE:
23336 case UNORDERED:
23337 case ORDERED:
23338 /* AVX has 3 operand comparisons, no need to swap anything. */
23339 if (TARGET_AVX)
23340 break;
23341 /* For commutative operators, try to canonicalize the destination
23342 operand to be first in the comparison - this helps reload to
23343 avoid extra moves. */
23344 if (!dest || !rtx_equal_p (dest, *pop1))
23345 break;
23346 /* FALLTHRU */
23348 case GE:
23349 case GT:
23350 case UNLE:
23351 case UNLT:
23352 /* These are not supported directly before AVX, and furthermore
23353 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23354 comparison operands to transform into something that is
23355 supported. */
23356 std::swap (*pop0, *pop1);
23357 code = swap_condition (code);
23358 break;
23360 default:
23361 gcc_unreachable ();
23364 return code;
23367 /* Detect conditional moves that exactly match min/max operational
23368 semantics. Note that this is IEEE safe, as long as we don't
23369 interchange the operands.
23371 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23372 and TRUE if the operation is successful and instructions are emitted. */
23374 static bool
23375 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23376 rtx cmp_op1, rtx if_true, rtx if_false)
23378 machine_mode mode;
23379 bool is_min;
23380 rtx tmp;
23382 if (code == LT)
23384 else if (code == UNGE)
23385 std::swap (if_true, if_false);
23386 else
23387 return false;
23389 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23390 is_min = true;
23391 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23392 is_min = false;
23393 else
23394 return false;
23396 mode = GET_MODE (dest);
23398 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23399 but MODE may be a vector mode and thus not appropriate. */
23400 if (!flag_finite_math_only || flag_signed_zeros)
23402 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23403 rtvec v;
23405 if_true = force_reg (mode, if_true);
23406 v = gen_rtvec (2, if_true, if_false);
23407 tmp = gen_rtx_UNSPEC (mode, v, u);
23409 else
23411 code = is_min ? SMIN : SMAX;
23412 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23415 emit_insn (gen_rtx_SET (dest, tmp));
23416 return true;
23419 /* Expand an sse vector comparison. Return the register with the result. */
23421 static rtx
23422 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23423 rtx op_true, rtx op_false)
23425 machine_mode mode = GET_MODE (dest);
23426 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23428 /* In the general case the result of a comparison can differ from the operands' mode. */
23429 machine_mode cmp_mode;
23431 /* In AVX512F the result of comparison is an integer mask. */
23432 bool maskcmp = false;
23433 rtx x;
23435 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23437 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23438 cmp_mode = int_mode_for_size (nbits, 0).require ();
23439 maskcmp = true;
23441 else
23442 cmp_mode = cmp_ops_mode;
23445 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23446 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23447 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
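/* Compute into a fresh pseudo when optimizing, when DEST overlaps one of the arms, or when the mask result needs a different mode; callers use the returned register rather than DEST itself. */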
23449 if (optimize
23450 || (maskcmp && cmp_mode != mode)
23451 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23452 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23453 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23455 /* Compare patterns for int modes are unspec in AVX512F only. */
23456 if (maskcmp && (code == GT || code == EQ))
23458 rtx (*gen)(rtx, rtx, rtx);
23460 switch (cmp_ops_mode)
23462 case E_V64QImode:
23463 gcc_assert (TARGET_AVX512BW);
23464 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23465 break;
23466 case E_V32HImode:
23467 gcc_assert (TARGET_AVX512BW);
23468 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23469 break;
23470 case E_V16SImode:
23471 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23472 break;
23473 case E_V8DImode:
23474 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23475 break;
23476 default:
23477 gen = NULL;
23480 if (gen)
23482 emit_insn (gen (dest, cmp_op0, cmp_op1));
23483 return dest;
23486 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23488 if (cmp_mode != mode && !maskcmp)
23490 x = force_reg (cmp_ops_mode, x);
23491 convert_move (dest, x, false);
23493 else
23494 emit_insn (gen_rtx_SET (dest, x));
23496 return dest;
23499 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23500 operations. This is used for both scalar and vector conditional moves. */
23502 void
23503 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23505 machine_mode mode = GET_MODE (dest);
23506 machine_mode cmpmode = GET_MODE (cmp);
23508 /* In AVX512F the result of comparison is an integer mask. */
23509 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23511 rtx t2, t3, x;
23513 /* If we have an integer mask and an FP value then we need
23514 to cast the mask to the FP mode. */
23515 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23517 cmp = force_reg (cmpmode, cmp);
23518 cmp = gen_rtx_SUBREG (mode, cmp, 0);
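/* A vector comparison yields all-ones where true and all-zeros where false, so when the arms are exactly -1 and 0 (and CMP is a normal vector rather than an AVX512 mask register) the comparison result is already the answer. */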
23521 if (vector_all_ones_operand (op_true, mode)
23522 && rtx_equal_p (op_false, CONST0_RTX (mode))
23523 && !maskcmp)
23525 emit_insn (gen_rtx_SET (dest, cmp));
23527 else if (op_false == CONST0_RTX (mode)
23528 && !maskcmp)
23530 op_true = force_reg (mode, op_true);
23531 x = gen_rtx_AND (mode, cmp, op_true);
23532 emit_insn (gen_rtx_SET (dest, x));
23534 else if (op_true == CONST0_RTX (mode)
23535 && !maskcmp)
23537 op_false = force_reg (mode, op_false);
23538 x = gen_rtx_NOT (mode, cmp);
23539 x = gen_rtx_AND (mode, x, op_false);
23540 emit_insn (gen_rtx_SET (dest, x));
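/* When the true arm is all-ones, OR-ing the false arm into the mask yields the desired blend directly. */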
23542 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23543 && !maskcmp)
23545 op_false = force_reg (mode, op_false);
23546 x = gen_rtx_IOR (mode, cmp, op_false);
23547 emit_insn (gen_rtx_SET (dest, x));
23549 else if (TARGET_XOP
23550 && !maskcmp)
23552 op_true = force_reg (mode, op_true);
23554 if (!nonimmediate_operand (op_false, mode))
23555 op_false = force_reg (mode, op_false);
23557 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23558 op_true,
23559 op_false)));
23561 else
23563 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23564 rtx d = dest;
23566 if (!nonimmediate_operand (op_true, mode))
23567 op_true = force_reg (mode, op_true);
23569 op_false = force_reg (mode, op_false);
23571 switch (mode)
23573 case E_V4SFmode:
23574 if (TARGET_SSE4_1)
23575 gen = gen_sse4_1_blendvps;
23576 break;
23577 case E_V2DFmode:
23578 if (TARGET_SSE4_1)
23579 gen = gen_sse4_1_blendvpd;
23580 break;
23581 case E_V16QImode:
23582 case E_V8HImode:
23583 case E_V4SImode:
23584 case E_V2DImode:
23585 if (TARGET_SSE4_1)
23587 gen = gen_sse4_1_pblendvb;
23588 if (mode != V16QImode)
23589 d = gen_reg_rtx (V16QImode);
23590 op_false = gen_lowpart (V16QImode, op_false);
23591 op_true = gen_lowpart (V16QImode, op_true);
23592 cmp = gen_lowpart (V16QImode, cmp);
23594 break;
23595 case E_V8SFmode:
23596 if (TARGET_AVX)
23597 gen = gen_avx_blendvps256;
23598 break;
23599 case E_V4DFmode:
23600 if (TARGET_AVX)
23601 gen = gen_avx_blendvpd256;
23602 break;
23603 case E_V32QImode:
23604 case E_V16HImode:
23605 case E_V8SImode:
23606 case E_V4DImode:
23607 if (TARGET_AVX2)
23609 gen = gen_avx2_pblendvb;
23610 if (mode != V32QImode)
23611 d = gen_reg_rtx (V32QImode);
23612 op_false = gen_lowpart (V32QImode, op_false);
23613 op_true = gen_lowpart (V32QImode, op_true);
23614 cmp = gen_lowpart (V32QImode, cmp);
23616 break;
23618 case E_V64QImode:
23619 gen = gen_avx512bw_blendmv64qi;
23620 break;
23621 case E_V32HImode:
23622 gen = gen_avx512bw_blendmv32hi;
23623 break;
23624 case E_V16SImode:
23625 gen = gen_avx512f_blendmv16si;
23626 break;
23627 case E_V8DImode:
23628 gen = gen_avx512f_blendmv8di;
23629 break;
23630 case E_V8DFmode:
23631 gen = gen_avx512f_blendmv8df;
23632 break;
23633 case E_V16SFmode:
23634 gen = gen_avx512f_blendmv16sf;
23635 break;
23637 default:
23638 break;
23641 if (gen != NULL)
23643 emit_insn (gen (d, op_false, op_true, cmp));
23644 if (d != dest)
23645 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23647 else
23649 op_true = force_reg (mode, op_true);
23651 t2 = gen_reg_rtx (mode);
23652 if (optimize)
23653 t3 = gen_reg_rtx (mode);
23654 else
23655 t3 = dest;
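/* No blend instruction is available here, so synthesize dest = (cmp & op_true) | (~cmp & op_false) with three logic operations. */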
23657 x = gen_rtx_AND (mode, op_true, cmp);
23658 emit_insn (gen_rtx_SET (t2, x));
23660 x = gen_rtx_NOT (mode, cmp);
23661 x = gen_rtx_AND (mode, x, op_false);
23662 emit_insn (gen_rtx_SET (t3, x));
23664 x = gen_rtx_IOR (mode, t3, t2);
23665 emit_insn (gen_rtx_SET (dest, x));
23670 /* Expand a floating-point conditional move. Return true if successful. */
23672 bool
23673 ix86_expand_fp_movcc (rtx operands[])
23675 machine_mode mode = GET_MODE (operands[0]);
23676 enum rtx_code code = GET_CODE (operands[1]);
23677 rtx tmp, compare_op;
23678 rtx op0 = XEXP (operands[1], 0);
23679 rtx op1 = XEXP (operands[1], 1);
23681 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23683 machine_mode cmode;
23685 /* Since we've no cmove for sse registers, don't force bad register
23686 allocation just to gain access to it. Deny movcc when the
23687 comparison mode doesn't match the move mode. */
23688 cmode = GET_MODE (op0);
23689 if (cmode == VOIDmode)
23690 cmode = GET_MODE (op1);
23691 if (cmode != mode)
23692 return false;
23694 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23695 if (code == UNKNOWN)
23696 return false;
23698 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23699 operands[2], operands[3]))
23700 return true;
23702 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23703 operands[2], operands[3]);
23704 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23705 return true;
23708 if (GET_MODE (op0) == TImode
23709 || (GET_MODE (op0) == DImode
23710 && !TARGET_64BIT))
23711 return false;
23713 /* The floating point conditional move instructions don't directly
23714 support conditions resulting from a signed integer comparison. */
23716 compare_op = ix86_expand_compare (code, op0, op1);
23717 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23719 tmp = gen_reg_rtx (QImode);
23720 ix86_expand_setcc (tmp, code, op0, op1);
23722 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23725 emit_insn (gen_rtx_SET (operands[0],
23726 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23727 operands[2], operands[3])));
23729 return true;
23732 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
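/* The values below follow the VPCMP/VPCMPU predicate-immediate encoding: 0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT (GE), 6 = NLE (GT). */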
23734 static int
23735 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23737 switch (code)
23739 case EQ:
23740 return 0;
23741 case LT:
23742 case LTU:
23743 return 1;
23744 case LE:
23745 case LEU:
23746 return 2;
23747 case NE:
23748 return 4;
23749 case GE:
23750 case GEU:
23751 return 5;
23752 case GT:
23753 case GTU:
23754 return 6;
23755 default:
23756 gcc_unreachable ();
23760 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
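/* The values below are the VCMPPS/VCMPPD predicate immediates, e.g. 0x00 = EQ_OQ, 0x03 = UNORD_Q, 0x0e = GT_OS, 0x18 = EQ_US. */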
23762 static int
23763 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23765 switch (code)
23767 case EQ:
23768 return 0x00;
23769 case NE:
23770 return 0x04;
23771 case GT:
23772 return 0x0e;
23773 case LE:
23774 return 0x02;
23775 case GE:
23776 return 0x0d;
23777 case LT:
23778 return 0x01;
23779 case UNLE:
23780 return 0x0a;
23781 case UNLT:
23782 return 0x09;
23783 case UNGE:
23784 return 0x05;
23785 case UNGT:
23786 return 0x06;
23787 case UNEQ:
23788 return 0x18;
23789 case LTGT:
23790 return 0x0c;
23791 case ORDERED:
23792 return 0x07;
23793 case UNORDERED:
23794 return 0x03;
23795 default:
23796 gcc_unreachable ();
23800 /* Return immediate value to be used in UNSPEC_PCMP
23801 for comparison CODE in MODE. */
23803 static int
23804 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23806 if (FLOAT_MODE_P (mode))
23807 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23808 return ix86_int_cmp_code_to_pcmp_immediate (code);
23811 /* Expand AVX-512 vector comparison. */
23813 bool
23814 ix86_expand_mask_vec_cmp (rtx operands[])
23816 machine_mode mask_mode = GET_MODE (operands[0]);
23817 machine_mode cmp_mode = GET_MODE (operands[2]);
23818 enum rtx_code code = GET_CODE (operands[1]);
23819 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23820 int unspec_code;
23821 rtx unspec;
23823 switch (code)
23825 case LEU:
23826 case GTU:
23827 case GEU:
23828 case LTU:
23829 unspec_code = UNSPEC_UNSIGNED_PCMP;
23830 break;
23832 default:
23833 unspec_code = UNSPEC_PCMP;
23836 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23837 operands[3], imm),
23838 unspec_code);
23839 emit_insn (gen_rtx_SET (operands[0], unspec));
23841 return true;
23844 /* Expand fp vector comparison. */
23846 bool
23847 ix86_expand_fp_vec_cmp (rtx operands[])
23849 enum rtx_code code = GET_CODE (operands[1]);
23850 rtx cmp;
23852 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23853 &operands[2], &operands[3]);
23854 if (code == UNKNOWN)
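/* LTGT and UNEQ are not directly expressible here, so build them as ORDERED & NE and UNORDERED | EQ respectively. */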
23856 rtx temp;
23857 switch (GET_CODE (operands[1]))
23859 case LTGT:
23860 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23861 operands[3], NULL, NULL);
23862 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23863 operands[3], NULL, NULL);
23864 code = AND;
23865 break;
23866 case UNEQ:
23867 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23868 operands[3], NULL, NULL);
23869 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23870 operands[3], NULL, NULL);
23871 code = IOR;
23872 break;
23873 default:
23874 gcc_unreachable ();
23876 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23877 OPTAB_DIRECT);
23879 else
23880 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
23881 operands[1], operands[2]);
23883 if (operands[0] != cmp)
23884 emit_move_insn (operands[0], cmp);
23886 return true;
23889 static rtx
23890 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
23891 rtx op_true, rtx op_false, bool *negate)
23893 machine_mode data_mode = GET_MODE (dest);
23894 machine_mode mode = GET_MODE (cop0);
23895 rtx x;
23897 *negate = false;
23899 /* XOP supports all of the comparisons on all 128-bit vector int types. */
23900 if (TARGET_XOP
23901 && (mode == V16QImode || mode == V8HImode
23902 || mode == V4SImode || mode == V2DImode))
23904 else
23906 /* Canonicalize the comparison to EQ, GT, GTU. */
23907 switch (code)
23909 case EQ:
23910 case GT:
23911 case GTU:
23912 break;
23914 case NE:
23915 case LE:
23916 case LEU:
23917 code = reverse_condition (code);
23918 *negate = true;
23919 break;
23921 case GE:
23922 case GEU:
23923 code = reverse_condition (code);
23924 *negate = true;
23925 /* FALLTHRU */
23927 case LT:
23928 case LTU:
23929 std::swap (cop0, cop1);
23930 code = swap_condition (code);
23931 break;
23933 default:
23934 gcc_unreachable ();
23937 /* Only SSE4.1/SSE4.2 supports V2DImode. */
23938 if (mode == V2DImode)
23940 switch (code)
23942 case EQ:
23943 /* SSE4.1 supports EQ. */
23944 if (!TARGET_SSE4_1)
23945 return NULL;
23946 break;
23948 case GT:
23949 case GTU:
23950 /* SSE4.2 supports GT/GTU. */
23951 if (!TARGET_SSE4_2)
23952 return NULL;
23953 break;
23955 default:
23956 gcc_unreachable ();
23960 /* Unsigned parallel compare is not supported by the hardware.
23961 Play some tricks to turn this into a signed comparison
23962 against 0. */
23963 if (code == GTU)
23965 cop0 = force_reg (mode, cop0);
23967 switch (mode)
23969 case E_V16SImode:
23970 case E_V8DImode:
23971 case E_V8SImode:
23972 case E_V4DImode:
23973 case E_V4SImode:
23974 case E_V2DImode:
23976 rtx t1, t2, mask;
23977 rtx (*gen_sub3) (rtx, rtx, rtx);
23979 switch (mode)
23981 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
23982 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
23983 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
23984 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
23985 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
23986 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
23987 default:
23988 gcc_unreachable ();
23990 /* Subtract (-(INT MAX) - 1) from both operands to make
23991 them signed. */
23992 mask = ix86_build_signbit_mask (mode, true, false);
23993 t1 = gen_reg_rtx (mode);
23994 emit_insn (gen_sub3 (t1, cop0, mask));
23996 t2 = gen_reg_rtx (mode);
23997 emit_insn (gen_sub3 (t2, cop1, mask));
23999 cop0 = t1;
24000 cop1 = t2;
24001 code = GT;
24003 break;
24005 case E_V64QImode:
24006 case E_V32HImode:
24007 case E_V32QImode:
24008 case E_V16HImode:
24009 case E_V16QImode:
24010 case E_V8HImode:
24011 /* Perform a parallel unsigned saturating subtraction. */
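/* The saturating difference is zero exactly when cop0 <= cop1 (unsigned), so GTU becomes an equality test against zero with the result negated. */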
24012 x = gen_reg_rtx (mode);
24013 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24014 cop1)));
24016 cop0 = x;
24017 cop1 = CONST0_RTX (mode);
24018 code = EQ;
24019 *negate = !*negate;
24020 break;
24022 default:
24023 gcc_unreachable ();
24028 if (*negate)
24029 std::swap (op_true, op_false);
24031 /* Allow the comparison to be done in one mode, but the movcc to
24032 happen in another mode. */
24033 if (data_mode == mode)
24035 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24036 op_true, op_false);
24038 else
24040 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24041 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24042 op_true, op_false);
24043 if (GET_MODE (x) == mode)
24044 x = gen_lowpart (data_mode, x);
24047 return x;
24050 /* Expand integer vector comparison. */
24052 bool
24053 ix86_expand_int_vec_cmp (rtx operands[])
24055 rtx_code code = GET_CODE (operands[1]);
24056 bool negate = false;
24057 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24058 operands[3], NULL, NULL, &negate);
24060 if (!cmp)
24061 return false;
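/* A negated comparison is realized by comparing the all-ones/zero result against zero for equality, i.e. a vector NOT. */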
24063 if (negate)
24064 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24065 CONST0_RTX (GET_MODE (cmp)),
24066 NULL, NULL, &negate);
24068 gcc_assert (!negate);
24070 if (operands[0] != cmp)
24071 emit_move_insn (operands[0], cmp);
24073 return true;
24076 /* Expand a floating-point vector conditional move; a vcond operation
24077 rather than a movcc operation. */
24079 bool
24080 ix86_expand_fp_vcond (rtx operands[])
24082 enum rtx_code code = GET_CODE (operands[3]);
24083 rtx cmp;
24085 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24086 &operands[4], &operands[5]);
24087 if (code == UNKNOWN)
24089 rtx temp;
24090 switch (GET_CODE (operands[3]))
24092 case LTGT:
24093 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24094 operands[5], operands[0], operands[0]);
24095 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24096 operands[5], operands[1], operands[2]);
24097 code = AND;
24098 break;
24099 case UNEQ:
24100 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24101 operands[5], operands[0], operands[0]);
24102 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24103 operands[5], operands[1], operands[2]);
24104 code = IOR;
24105 break;
24106 default:
24107 gcc_unreachable ();
24109 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24110 OPTAB_DIRECT);
24111 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24112 return true;
24115 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24116 operands[5], operands[1], operands[2]))
24117 return true;
24119 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24120 operands[1], operands[2]);
24121 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24122 return true;
24125 /* Expand a signed/unsigned integral vector conditional move. */
24127 bool
24128 ix86_expand_int_vcond (rtx operands[])
24130 machine_mode data_mode = GET_MODE (operands[0]);
24131 machine_mode mode = GET_MODE (operands[4]);
24132 enum rtx_code code = GET_CODE (operands[3]);
24133 bool negate = false;
24134 rtx x, cop0, cop1;
24136 cop0 = operands[4];
24137 cop1 = operands[5];
24139 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24140 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24141 if ((code == LT || code == GE)
24142 && data_mode == mode
24143 && cop1 == CONST0_RTX (mode)
24144 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24145 && GET_MODE_UNIT_SIZE (data_mode) > 1
24146 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24147 && (GET_MODE_SIZE (data_mode) == 16
24148 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
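/* Both cases reduce to x < 0 ? NEGOP : 0; for LT the zero arm is operands[2] and NEGOP is operands[1], for GE it is the other way round. */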
24150 rtx negop = operands[2 - (code == LT)];
24151 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24152 if (negop == CONST1_RTX (data_mode))
24154 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24155 operands[0], 1, OPTAB_DIRECT);
24156 if (res != operands[0])
24157 emit_move_insn (operands[0], res);
24158 return true;
24160 else if (GET_MODE_INNER (data_mode) != DImode
24161 && vector_all_ones_operand (negop, data_mode))
24163 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24164 operands[0], 0, OPTAB_DIRECT);
24165 if (res != operands[0])
24166 emit_move_insn (operands[0], res);
24167 return true;
24171 if (!nonimmediate_operand (cop1, mode))
24172 cop1 = force_reg (mode, cop1);
24173 if (!general_operand (operands[1], data_mode))
24174 operands[1] = force_reg (data_mode, operands[1]);
24175 if (!general_operand (operands[2], data_mode))
24176 operands[2] = force_reg (data_mode, operands[2]);
24178 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24179 operands[1], operands[2], &negate);
24181 if (!x)
24182 return false;
24184 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24185 operands[2-negate]);
24186 return true;
24189 /* AVX512F does support 64-byte integer vector operations,
24190 thus the longest vector we are faced with is V64QImode. */
24191 #define MAX_VECT_LEN 64
24193 struct expand_vec_perm_d
24195 rtx target, op0, op1;
24196 unsigned char perm[MAX_VECT_LEN];
24197 machine_mode vmode;
24198 unsigned char nelt;
24199 bool one_operand_p;
24200 bool testing_p;
24203 static bool
24204 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24205 struct expand_vec_perm_d *d)
24207 /* ix86_expand_vec_perm_vpermt2 is called from both the const and non-const
24208 expanders, so the args are either in d, or in op0, op1, etc. */
24209 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24210 machine_mode maskmode = mode;
24211 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24213 switch (mode)
24215 case E_V8HImode:
24216 if (TARGET_AVX512VL && TARGET_AVX512BW)
24217 gen = gen_avx512vl_vpermt2varv8hi3;
24218 break;
24219 case E_V16HImode:
24220 if (TARGET_AVX512VL && TARGET_AVX512BW)
24221 gen = gen_avx512vl_vpermt2varv16hi3;
24222 break;
24223 case E_V64QImode:
24224 if (TARGET_AVX512VBMI)
24225 gen = gen_avx512bw_vpermt2varv64qi3;
24226 break;
24227 case E_V32HImode:
24228 if (TARGET_AVX512BW)
24229 gen = gen_avx512bw_vpermt2varv32hi3;
24230 break;
24231 case E_V4SImode:
24232 if (TARGET_AVX512VL)
24233 gen = gen_avx512vl_vpermt2varv4si3;
24234 break;
24235 case E_V8SImode:
24236 if (TARGET_AVX512VL)
24237 gen = gen_avx512vl_vpermt2varv8si3;
24238 break;
24239 case E_V16SImode:
24240 if (TARGET_AVX512F)
24241 gen = gen_avx512f_vpermt2varv16si3;
24242 break;
24243 case E_V4SFmode:
24244 if (TARGET_AVX512VL)
24246 gen = gen_avx512vl_vpermt2varv4sf3;
24247 maskmode = V4SImode;
24249 break;
24250 case E_V8SFmode:
24251 if (TARGET_AVX512VL)
24253 gen = gen_avx512vl_vpermt2varv8sf3;
24254 maskmode = V8SImode;
24256 break;
24257 case E_V16SFmode:
24258 if (TARGET_AVX512F)
24260 gen = gen_avx512f_vpermt2varv16sf3;
24261 maskmode = V16SImode;
24263 break;
24264 case E_V2DImode:
24265 if (TARGET_AVX512VL)
24266 gen = gen_avx512vl_vpermt2varv2di3;
24267 break;
24268 case E_V4DImode:
24269 if (TARGET_AVX512VL)
24270 gen = gen_avx512vl_vpermt2varv4di3;
24271 break;
24272 case E_V8DImode:
24273 if (TARGET_AVX512F)
24274 gen = gen_avx512f_vpermt2varv8di3;
24275 break;
24276 case E_V2DFmode:
24277 if (TARGET_AVX512VL)
24279 gen = gen_avx512vl_vpermt2varv2df3;
24280 maskmode = V2DImode;
24282 break;
24283 case E_V4DFmode:
24284 if (TARGET_AVX512VL)
24286 gen = gen_avx512vl_vpermt2varv4df3;
24287 maskmode = V4DImode;
24289 break;
24290 case E_V8DFmode:
24291 if (TARGET_AVX512F)
24293 gen = gen_avx512f_vpermt2varv8df3;
24294 maskmode = V8DImode;
24296 break;
24297 default:
24298 break;
24301 if (gen == NULL)
24302 return false;
24304 /* ix86_expand_vec_perm_vpermt2 is called from both the const and non-const
24305 expanders, so the args are either in d, or in op0, op1, etc. */
24306 if (d)
24308 rtx vec[64];
24309 target = d->target;
24310 op0 = d->op0;
24311 op1 = d->op1;
24312 for (int i = 0; i < d->nelt; ++i)
24313 vec[i] = GEN_INT (d->perm[i]);
24314 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24317 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24318 return true;
24321 /* Expand a variable vector permutation. */
24323 void
24324 ix86_expand_vec_perm (rtx operands[])
24326 rtx target = operands[0];
24327 rtx op0 = operands[1];
24328 rtx op1 = operands[2];
24329 rtx mask = operands[3];
24330 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24331 machine_mode mode = GET_MODE (op0);
24332 machine_mode maskmode = GET_MODE (mask);
24333 int w, e, i;
24334 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24336 /* Number of elements in the vector. */
24337 w = GET_MODE_NUNITS (mode);
24338 e = GET_MODE_UNIT_SIZE (mode);
24339 gcc_assert (w <= 64);
24341 if (TARGET_AVX512F && one_operand_shuffle)
24343 rtx (*gen) (rtx, rtx, rtx) = NULL;
24344 switch (mode)
24346 case E_V16SImode:
24347 gen = gen_avx512f_permvarv16si;
24348 break;
24349 case E_V16SFmode:
24350 gen = gen_avx512f_permvarv16sf;
24351 break;
24352 case E_V8DImode:
24353 gen = gen_avx512f_permvarv8di;
24354 break;
24355 case E_V8DFmode:
24356 gen = gen_avx512f_permvarv8df;
24357 break;
24358 default:
24359 break;
24361 if (gen != NULL)
24363 emit_insn (gen (target, op0, mask));
24364 return;
24368 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24369 return;
24371 if (TARGET_AVX2)
24373 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24375 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24376 a constant shuffle operand. With a tiny bit of effort we can
24377 use VPERMD instead. A re-interpretation stall for V4DFmode is
24378 unfortunate but there's no avoiding it.
24379 Similarly for V16HImode we don't have instructions for variable
24380 shuffling, while for V32QImode we can, after preparing suitable
24381 masks, use vpshufb; vpshufb; vpermq; vpor. */
24383 if (mode == V16HImode)
24385 maskmode = mode = V32QImode;
24386 w = 32;
24387 e = 1;
24389 else
24391 maskmode = mode = V8SImode;
24392 w = 8;
24393 e = 4;
24395 t1 = gen_reg_rtx (maskmode);
24397 /* Replicate the low bits of the V4DImode mask into V8SImode:
24398 mask = { A B C D }
24399 t1 = { A A B B C C D D }. */
24400 for (i = 0; i < w / 2; ++i)
24401 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24402 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24403 vt = force_reg (maskmode, vt);
24404 mask = gen_lowpart (maskmode, mask);
24405 if (maskmode == V8SImode)
24406 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24407 else
24408 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24410 /* Multiply the shuffle indices by two. */
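/* Adding T1 to itself doubles each index; this avoids needing a vector shift, which does not exist for the V32QImode (byte) case. */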
24411 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24412 OPTAB_DIRECT);
24414 /* Add one to the odd shuffle indices:
24415 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24416 for (i = 0; i < w / 2; ++i)
24418 vec[i * 2] = const0_rtx;
24419 vec[i * 2 + 1] = const1_rtx;
24421 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24422 vt = validize_mem (force_const_mem (maskmode, vt));
24423 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24424 OPTAB_DIRECT);
24426 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24427 operands[3] = mask = t1;
24428 target = gen_reg_rtx (mode);
24429 op0 = gen_lowpart (mode, op0);
24430 op1 = gen_lowpart (mode, op1);
24433 switch (mode)
24435 case E_V8SImode:
24436 /* The VPERMD and VPERMPS instructions already properly ignore
24437 the high bits of the shuffle elements. No need for us to
24438 perform an AND ourselves. */
24439 if (one_operand_shuffle)
24441 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24442 if (target != operands[0])
24443 emit_move_insn (operands[0],
24444 gen_lowpart (GET_MODE (operands[0]), target));
24446 else
24448 t1 = gen_reg_rtx (V8SImode);
24449 t2 = gen_reg_rtx (V8SImode);
24450 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24451 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24452 goto merge_two;
24454 return;
24456 case E_V8SFmode:
24457 mask = gen_lowpart (V8SImode, mask);
24458 if (one_operand_shuffle)
24459 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24460 else
24462 t1 = gen_reg_rtx (V8SFmode);
24463 t2 = gen_reg_rtx (V8SFmode);
24464 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24465 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24466 goto merge_two;
24468 return;
24470 case E_V4SImode:
24471 /* By combining the two 128-bit input vectors into one 256-bit
24472 input vector, we can use VPERMD and VPERMPS for the full
24473 two-operand shuffle. */
24474 t1 = gen_reg_rtx (V8SImode);
24475 t2 = gen_reg_rtx (V8SImode);
24476 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24477 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24478 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24479 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24480 return;
24482 case E_V4SFmode:
24483 t1 = gen_reg_rtx (V8SFmode);
24484 t2 = gen_reg_rtx (V8SImode);
24485 mask = gen_lowpart (V4SImode, mask);
24486 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24487 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24488 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24489 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24490 return;
24492 case E_V32QImode:
24493 t1 = gen_reg_rtx (V32QImode);
24494 t2 = gen_reg_rtx (V32QImode);
24495 t3 = gen_reg_rtx (V32QImode);
24496 vt2 = GEN_INT (-128);
24497 vt = gen_const_vec_duplicate (V32QImode, vt2);
24498 vt = force_reg (V32QImode, vt);
24499 for (i = 0; i < 32; i++)
24500 vec[i] = i < 16 ? vt2 : const0_rtx;
24501 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24502 vt2 = force_reg (V32QImode, vt2);
24503 /* From mask create two adjusted masks, which contain the same
24504 bits as mask in the low 7 bits of each vector element.
24505 The first mask will have the most significant bit clear
24506 if it requests element from the same 128-bit lane
24507 and MSB set if it requests element from the other 128-bit lane.
24508 The second mask will have the opposite values of the MSB,
24509 and additionally will have its 128-bit lanes swapped.
24510 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24511 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24512 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24513 stands for the other 12 bytes. */
24514 /* The bit that tells whether an element comes from the same lane or from
24515 the other lane is bit 4, so shift it up by 3 to the MSB position. */
24516 t5 = gen_reg_rtx (V4DImode);
24517 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24518 GEN_INT (3)));
24519 /* Clear MSB bits from the mask just in case it had them set. */
24520 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24521 /* After this t1 will have MSB set for elements from other lane. */
24522 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24523 /* Clear bits other than MSB. */
24524 emit_insn (gen_andv32qi3 (t1, t1, vt));
24525 /* Or in the lower bits from mask into t3. */
24526 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24527 /* And invert MSB bits in t1, so MSB is set for elements from the same
24528 lane. */
24529 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24530 /* Swap 128-bit lanes in t3. */
24531 t6 = gen_reg_rtx (V4DImode);
24532 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24533 const2_rtx, GEN_INT (3),
24534 const0_rtx, const1_rtx));
24535 /* And or in the lower bits from mask into t1. */
24536 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24537 if (one_operand_shuffle)
24539 /* Each of these shuffles will put 0s in places where
24540 element from the other 128-bit lane is needed, otherwise
24541 will shuffle in the requested value. */
24542 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24543 gen_lowpart (V32QImode, t6)));
24544 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24545 /* For t3 the 128-bit lanes are swapped again. */
24546 t7 = gen_reg_rtx (V4DImode);
24547 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24548 const2_rtx, GEN_INT (3),
24549 const0_rtx, const1_rtx));
24550 /* And oring both together leads to the result. */
24551 emit_insn (gen_iorv32qi3 (target, t1,
24552 gen_lowpart (V32QImode, t7)));
24553 if (target != operands[0])
24554 emit_move_insn (operands[0],
24555 gen_lowpart (GET_MODE (operands[0]), target));
24556 return;
24559 t4 = gen_reg_rtx (V32QImode);
24560 /* Similar to the one_operand_shuffle code above, just repeated once
24561 for each of the two operands. The merge_two: code below will merge
24562 the two results together. */
24563 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24564 gen_lowpart (V32QImode, t6)));
24565 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24566 gen_lowpart (V32QImode, t6)));
24567 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24568 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24569 t7 = gen_reg_rtx (V4DImode);
24570 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24571 const2_rtx, GEN_INT (3),
24572 const0_rtx, const1_rtx));
24573 t8 = gen_reg_rtx (V4DImode);
24574 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24575 const2_rtx, GEN_INT (3),
24576 const0_rtx, const1_rtx));
24577 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24578 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24579 t1 = t4;
24580 t2 = t3;
24581 goto merge_two;
24583 default:
24584 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24585 break;
24589 if (TARGET_XOP)
24591 /* The XOP VPPERM insn supports three inputs. By ignoring the
24592 one_operand_shuffle special case, we avoid creating another
24593 set of constant vectors in memory. */
24594 one_operand_shuffle = false;
24596 /* mask = mask & {2*w-1, ...} */
24597 vt = GEN_INT (2*w - 1);
24599 else
24601 /* mask = mask & {w-1, ...} */
24602 vt = GEN_INT (w - 1);
24605 vt = gen_const_vec_duplicate (maskmode, vt);
24606 mask = expand_simple_binop (maskmode, AND, mask, vt,
24607 NULL_RTX, 0, OPTAB_DIRECT);
24609 /* For non-QImode operations, convert the word permutation control
24610 into a byte permutation control. */
24611 if (mode != V16QImode)
24613 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24614 GEN_INT (exact_log2 (e)),
24615 NULL_RTX, 0, OPTAB_DIRECT);
24617 /* Convert mask to vector of chars. */
24618 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24620 /* Replicate each of the input bytes into byte positions:
24621 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24622 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24623 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24624 for (i = 0; i < 16; ++i)
24625 vec[i] = GEN_INT (i/e * e);
24626 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24627 vt = validize_mem (force_const_mem (V16QImode, vt));
24628 if (TARGET_XOP)
24629 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24630 else
24631 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24633 /* Convert it into the byte positions by doing
24634 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24635 for (i = 0; i < 16; ++i)
24636 vec[i] = GEN_INT (i % e);
24637 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24638 vt = validize_mem (force_const_mem (V16QImode, vt));
24639 emit_insn (gen_addv16qi3 (mask, mask, vt));
24642 /* The actual shuffle operations all operate on V16QImode. */
24643 op0 = gen_lowpart (V16QImode, op0);
24644 op1 = gen_lowpart (V16QImode, op1);
24646 if (TARGET_XOP)
24648 if (GET_MODE (target) != V16QImode)
24649 target = gen_reg_rtx (V16QImode);
24650 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24651 if (target != operands[0])
24652 emit_move_insn (operands[0],
24653 gen_lowpart (GET_MODE (operands[0]), target));
24655 else if (one_operand_shuffle)
24657 if (GET_MODE (target) != V16QImode)
24658 target = gen_reg_rtx (V16QImode);
24659 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24660 if (target != operands[0])
24661 emit_move_insn (operands[0],
24662 gen_lowpart (GET_MODE (operands[0]), target));
24664 else
24666 rtx xops[6];
24667 bool ok;
24669 /* Shuffle the two input vectors independently. */
24670 t1 = gen_reg_rtx (V16QImode);
24671 t2 = gen_reg_rtx (V16QImode);
24672 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24673 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24675 merge_two:
24676 /* Then merge them together. The key is whether any given control
24677 element contained a bit set that indicates the second word. */
24678 mask = operands[3];
24679 vt = GEN_INT (w);
24680 if (maskmode == V2DImode && !TARGET_SSE4_1)
24682 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24683 more shuffle to convert the V2DI input mask into a V4SI
24684 input mask, at which point the masking done by
24685 ix86_expand_int_vcond will work as desired. */
24686 rtx t3 = gen_reg_rtx (V4SImode);
24687 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24688 const0_rtx, const0_rtx,
24689 const2_rtx, const2_rtx));
24690 mask = t3;
24691 maskmode = V4SImode;
24692 e = w = 4;
24695 vt = gen_const_vec_duplicate (maskmode, vt);
24696 vt = force_reg (maskmode, vt);
24697 mask = expand_simple_binop (maskmode, AND, mask, vt,
24698 NULL_RTX, 0, OPTAB_DIRECT);
24700 if (GET_MODE (target) != mode)
24701 target = gen_reg_rtx (mode);
24702 xops[0] = target;
24703 xops[1] = gen_lowpart (mode, t2);
24704 xops[2] = gen_lowpart (mode, t1);
24705 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24706 xops[4] = mask;
24707 xops[5] = vt;
24708 ok = ix86_expand_int_vcond (xops);
24709 gcc_assert (ok);
24710 if (target != operands[0])
24711 emit_move_insn (operands[0],
24712 gen_lowpart (GET_MODE (operands[0]), target));
24716 /* Unpack SRC into the next wider integer vector type, storing the result
24717 in DEST. UNSIGNED_P is true if we should do zero extension, else sign
24718 extension. HIGH_P is true if we want the N/2 high elements, else the low elements. */
24720 void
24721 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24723 machine_mode imode = GET_MODE (src);
24724 rtx tmp;
24726 if (TARGET_SSE4_1)
24728 rtx (*unpack)(rtx, rtx);
24729 rtx (*extract)(rtx, rtx) = NULL;
24730 machine_mode halfmode = BLKmode;
24732 switch (imode)
24734 case E_V64QImode:
24735 if (unsigned_p)
24736 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24737 else
24738 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24739 halfmode = V32QImode;
24740 extract
24741 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24742 break;
24743 case E_V32QImode:
24744 if (unsigned_p)
24745 unpack = gen_avx2_zero_extendv16qiv16hi2;
24746 else
24747 unpack = gen_avx2_sign_extendv16qiv16hi2;
24748 halfmode = V16QImode;
24749 extract
24750 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24751 break;
24752 case E_V32HImode:
24753 if (unsigned_p)
24754 unpack = gen_avx512f_zero_extendv16hiv16si2;
24755 else
24756 unpack = gen_avx512f_sign_extendv16hiv16si2;
24757 halfmode = V16HImode;
24758 extract
24759 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24760 break;
24761 case E_V16HImode:
24762 if (unsigned_p)
24763 unpack = gen_avx2_zero_extendv8hiv8si2;
24764 else
24765 unpack = gen_avx2_sign_extendv8hiv8si2;
24766 halfmode = V8HImode;
24767 extract
24768 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24769 break;
24770 case E_V16SImode:
24771 if (unsigned_p)
24772 unpack = gen_avx512f_zero_extendv8siv8di2;
24773 else
24774 unpack = gen_avx512f_sign_extendv8siv8di2;
24775 halfmode = V8SImode;
24776 extract
24777 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24778 break;
24779 case E_V8SImode:
24780 if (unsigned_p)
24781 unpack = gen_avx2_zero_extendv4siv4di2;
24782 else
24783 unpack = gen_avx2_sign_extendv4siv4di2;
24784 halfmode = V4SImode;
24785 extract
24786 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24787 break;
24788 case E_V16QImode:
24789 if (unsigned_p)
24790 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24791 else
24792 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24793 break;
24794 case E_V8HImode:
24795 if (unsigned_p)
24796 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24797 else
24798 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24799 break;
24800 case E_V4SImode:
24801 if (unsigned_p)
24802 unpack = gen_sse4_1_zero_extendv2siv2di2;
24803 else
24804 unpack = gen_sse4_1_sign_extendv2siv2di2;
24805 break;
24806 default:
24807 gcc_unreachable ();
24810 if (GET_MODE_SIZE (imode) >= 32)
24812 tmp = gen_reg_rtx (halfmode);
24813 emit_insn (extract (tmp, src));
24815 else if (high_p)
24817 /* Shift higher 8 bytes to lower 8 bytes. */
24818 tmp = gen_reg_rtx (V1TImode);
24819 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24820 GEN_INT (64)));
24821 tmp = gen_lowpart (imode, tmp);
24823 else
24824 tmp = src;
24826 emit_insn (unpack (dest, tmp));
24828 else
24830 rtx (*unpack)(rtx, rtx, rtx);
24832 switch (imode)
24834 case E_V16QImode:
24835 if (high_p)
24836 unpack = gen_vec_interleave_highv16qi;
24837 else
24838 unpack = gen_vec_interleave_lowv16qi;
24839 break;
24840 case E_V8HImode:
24841 if (high_p)
24842 unpack = gen_vec_interleave_highv8hi;
24843 else
24844 unpack = gen_vec_interleave_lowv8hi;
24845 break;
24846 case E_V4SImode:
24847 if (high_p)
24848 unpack = gen_vec_interleave_highv4si;
24849 else
24850 unpack = gen_vec_interleave_lowv4si;
24851 break;
24852 default:
24853 gcc_unreachable ();
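/* The vector interleaved with SRC supplies the high half of each widened element: zero for zero extension, or the sign mask (0 > SRC) for sign extension. */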
24856 if (unsigned_p)
24857 tmp = force_reg (imode, CONST0_RTX (imode));
24858 else
24859 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24860 src, pc_rtx, pc_rtx);
24862 rtx tmp2 = gen_reg_rtx (imode);
24863 emit_insn (unpack (tmp2, src, tmp));
24864 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
24868 /* Expand conditional increment or decrement using adc/sbb instructions.
24869 The default case using setcc followed by the conditional move can be
24870 done by generic code. */
24871 bool
24872 ix86_expand_int_addcc (rtx operands[])
24874 enum rtx_code code = GET_CODE (operands[1]);
24875 rtx flags;
24876 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
24877 rtx compare_op;
24878 rtx val = const0_rtx;
24879 bool fpcmp = false;
24880 machine_mode mode;
24881 rtx op0 = XEXP (operands[1], 0);
24882 rtx op1 = XEXP (operands[1], 1);
24884 if (operands[3] != const1_rtx
24885 && operands[3] != constm1_rtx)
24886 return false;
24887 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24888 return false;
24889 code = GET_CODE (compare_op);
24891 flags = XEXP (compare_op, 0);
24893 if (GET_MODE (flags) == CCFPmode)
24895 fpcmp = true;
24896 code = ix86_fp_compare_code_to_integer (code);
24899 if (code != LTU)
24901 val = constm1_rtx;
24902 if (fpcmp)
24903 PUT_CODE (compare_op,
24904 reverse_condition_maybe_unordered
24905 (GET_CODE (compare_op)));
24906 else
24907 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
24910 mode = GET_MODE (operands[0]);
24912 /* Construct either adc or sbb insn. */
24913 if ((code == LTU) == (operands[3] == constm1_rtx))
24915 switch (mode)
24917 case E_QImode:
24918 insn = gen_subqi3_carry;
24919 break;
24920 case E_HImode:
24921 insn = gen_subhi3_carry;
24922 break;
24923 case E_SImode:
24924 insn = gen_subsi3_carry;
24925 break;
24926 case E_DImode:
24927 insn = gen_subdi3_carry;
24928 break;
24929 default:
24930 gcc_unreachable ();
24933 else
24935 switch (mode)
24937 case E_QImode:
24938 insn = gen_addqi3_carry;
24939 break;
24940 case E_HImode:
24941 insn = gen_addhi3_carry;
24942 break;
24943 case E_SImode:
24944 insn = gen_addsi3_carry;
24945 break;
24946 case E_DImode:
24947 insn = gen_adddi3_carry;
24948 break;
24949 default:
24950 gcc_unreachable ();
24953 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
24955 return true;
24959 /* Split OPERAND of mode MODE into parts, stored in PARTS, and return the
24960 number of parts (at most four). Similar to split_double_mode, but works
24961 for floating point parameters and non-offsettable memories. For pushes,
24962 it returns just stack offsets; the values will be saved in the right order. */
24964 static int
24965 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
24967 int size;
24969 if (!TARGET_64BIT)
24970 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
24971 else
24972 size = (GET_MODE_SIZE (mode) + 4) / 8;
24974 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
24975 gcc_assert (size >= 2 && size <= 4);
24977 /* Optimize constant pool references to immediates. This is used by fp
24978 moves that force all constants to memory to allow combining. */
24979 if (MEM_P (operand) && MEM_READONLY_P (operand))
24980 operand = avoid_constant_pool_reference (operand);
24982 if (MEM_P (operand) && !offsettable_memref_p (operand))
24984 /* The only non-offsettable memories we handle are pushes. */
24985 int ok = push_operand (operand, VOIDmode);
24987 gcc_assert (ok);
24989 operand = copy_rtx (operand);
24990 PUT_MODE (operand, word_mode);
24991 parts[0] = parts[1] = parts[2] = parts[3] = operand;
24992 return size;
24995 if (GET_CODE (operand) == CONST_VECTOR)
24997 scalar_int_mode imode = int_mode_for_mode (mode).require ();
24998 /* Caution: if we looked through a constant pool memory above,
24999 the operand may actually have a different mode now. That's
25000 ok, since we want to pun this all the way back to an integer. */
25001 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25002 gcc_assert (operand != NULL);
25003 mode = imode;
25006 if (!TARGET_64BIT)
25008 if (mode == DImode)
25009 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25010 else
25012 int i;
25014 if (REG_P (operand))
25016 gcc_assert (reload_completed);
25017 for (i = 0; i < size; i++)
25018 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25020 else if (offsettable_memref_p (operand))
25022 operand = adjust_address (operand, SImode, 0);
25023 parts[0] = operand;
25024 for (i = 1; i < size; i++)
25025 parts[i] = adjust_address (operand, SImode, 4 * i);
25027 else if (CONST_DOUBLE_P (operand))
25029 const REAL_VALUE_TYPE *r;
25030 long l[4];
25032 r = CONST_DOUBLE_REAL_VALUE (operand);
25033 switch (mode)
25035 case E_TFmode:
25036 real_to_target (l, r, mode);
25037 parts[3] = gen_int_mode (l[3], SImode);
25038 parts[2] = gen_int_mode (l[2], SImode);
25039 break;
25040 case E_XFmode:
25041 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25042 long double may not be 80-bit. */
25043 real_to_target (l, r, mode);
25044 parts[2] = gen_int_mode (l[2], SImode);
25045 break;
25046 case E_DFmode:
25047 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25048 break;
25049 default:
25050 gcc_unreachable ();
25052 parts[1] = gen_int_mode (l[1], SImode);
25053 parts[0] = gen_int_mode (l[0], SImode);
25055 else
25056 gcc_unreachable ();
25059 else
25061 if (mode == TImode)
25062 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25063 if (mode == XFmode || mode == TFmode)
25065 machine_mode upper_mode = mode == XFmode ? SImode : DImode;
25066 if (REG_P (operand))
25068 gcc_assert (reload_completed);
25069 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25070 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25072 else if (offsettable_memref_p (operand))
25074 operand = adjust_address (operand, DImode, 0);
25075 parts[0] = operand;
25076 parts[1] = adjust_address (operand, upper_mode, 8);
25078 else if (CONST_DOUBLE_P (operand))
25080 long l[4];
25082 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25084 /* real_to_target puts 32-bit pieces in each long. */
25085 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25086 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25087 << 32), DImode);
25089 if (upper_mode == SImode)
25090 parts[1] = gen_int_mode (l[2], SImode);
25091 else
25092 parts[1]
25093 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25094 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25095 << 32), DImode);
25097 else
25098 gcc_unreachable ();
25102 return size;
25105 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25106 The operands array is used as scratch: the destination parts are placed
25107 in operands[2..5] and the source parts in operands[6..9] before the
25108 moves are emitted. */
25110 void
25111 ix86_split_long_move (rtx operands[])
25113 rtx part[2][4];
25114 int nparts, i, j;
25115 int push = 0;
25116 int collisions = 0;
25117 machine_mode mode = GET_MODE (operands[0]);
25118 bool collisionparts[4];
25120 /* The DFmode expanders may ask us to move double.
25121 For a 64-bit target this is a single move. By hiding the fact
25122 here we simplify the i386.md splitters. */
25123 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25125 /* Optimize constant pool references to immediates. This is used by
25126 fp moves that force all constants to memory to allow combining. */
25128 if (MEM_P (operands[1])
25129 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25130 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25131 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25132 if (push_operand (operands[0], VOIDmode))
25134 operands[0] = copy_rtx (operands[0]);
25135 PUT_MODE (operands[0], word_mode);
25137 else
25138 operands[0] = gen_lowpart (DImode, operands[0]);
25139 operands[1] = gen_lowpart (DImode, operands[1]);
25140 emit_move_insn (operands[0], operands[1]);
25141 return;
25144 /* The only non-offsettable memory we handle is push. */
25145 if (push_operand (operands[0], VOIDmode))
25146 push = 1;
25147 else
25148 gcc_assert (!MEM_P (operands[0])
25149 || offsettable_memref_p (operands[0]));
25151 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25152 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25154 /* When emitting push, take care for source operands on the stack. */
25155 if (push && MEM_P (operands[1])
25156 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25158 rtx src_base = XEXP (part[1][nparts - 1], 0);
25160 /* Compensate for the stack decrement by 4. */
25161 if (!TARGET_64BIT && nparts == 3
25162 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25163 src_base = plus_constant (Pmode, src_base, 4);
25165 /* src_base refers to the stack pointer and is
25166 automatically decreased by emitted push. */
25167 for (i = 0; i < nparts; i++)
25168 part[1][i] = change_address (part[1][i],
25169 GET_MODE (part[1][i]), src_base);
25172 /* We need to do copy in the right order in case an address register
25173 of the source overlaps the destination. */
25174 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25176 rtx tmp;
25178 for (i = 0; i < nparts; i++)
25180 collisionparts[i]
25181 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25182 if (collisionparts[i])
25183 collisions++;
25186 /* Collision in the middle part can be handled by reordering. */
25187 if (collisions == 1 && nparts == 3 && collisionparts [1])
25189 std::swap (part[0][1], part[0][2]);
25190 std::swap (part[1][1], part[1][2]);
25192 else if (collisions == 1
25193 && nparts == 4
25194 && (collisionparts [1] || collisionparts [2]))
25196 if (collisionparts [1])
25198 std::swap (part[0][1], part[0][2]);
25199 std::swap (part[1][1], part[1][2]);
25201 else
25203 std::swap (part[0][2], part[0][3]);
25204 std::swap (part[1][2], part[1][3]);
25208 /* If there are more collisions, we can't handle it by reordering.
25209 Do an lea to the last part and use only one colliding move. */
25210 else if (collisions > 1)
25212 rtx base, addr;
25214 collisions = 1;
25216 base = part[0][nparts - 1];
25218 /* Handle the case when the last part isn't valid for lea.
25219 Happens in 64-bit mode storing the 12-byte XFmode. */
25220 if (GET_MODE (base) != Pmode)
25221 base = gen_rtx_REG (Pmode, REGNO (base));
25223 addr = XEXP (part[1][0], 0);
25224 if (TARGET_TLS_DIRECT_SEG_REFS)
25226 struct ix86_address parts;
25227 int ok = ix86_decompose_address (addr, &parts);
25228 gcc_assert (ok);
25229 /* It is not valid to use %gs: or %fs: in lea. */
25230 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25232 emit_insn (gen_rtx_SET (base, addr));
25233 part[1][0] = replace_equiv_address (part[1][0], base);
25234 for (i = 1; i < nparts; i++)
25236 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25237 part[1][i] = replace_equiv_address (part[1][i], tmp);
25242 if (push)
25244 if (!TARGET_64BIT)
25246 if (nparts == 3)
25248 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25249 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25250 stack_pointer_rtx, GEN_INT (-4)));
25251 emit_move_insn (part[0][2], part[1][2]);
25253 else if (nparts == 4)
25255 emit_move_insn (part[0][3], part[1][3]);
25256 emit_move_insn (part[0][2], part[1][2]);
25259 else
25261 /* In 64-bit mode we don't have a 32-bit push available. If this is a
25262 register, that is OK - we will just use the larger counterpart. We also
25263 retype memory - these come from an attempt to avoid a REX prefix on
25264 moving the second half of a TFmode value. */
25265 if (GET_MODE (part[1][1]) == SImode)
25267 switch (GET_CODE (part[1][1]))
25269 case MEM:
25270 part[1][1] = adjust_address (part[1][1], DImode, 0);
25271 break;
25273 case REG:
25274 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25275 break;
25277 default:
25278 gcc_unreachable ();
25281 if (GET_MODE (part[1][0]) == SImode)
25282 part[1][0] = part[1][1];
25285 emit_move_insn (part[0][1], part[1][1]);
25286 emit_move_insn (part[0][0], part[1][0]);
25287 return;
25290 /* Choose correct order to not overwrite the source before it is copied. */
25291 if ((REG_P (part[0][0])
25292 && REG_P (part[1][1])
25293 && (REGNO (part[0][0]) == REGNO (part[1][1])
25294 || (nparts == 3
25295 && REGNO (part[0][0]) == REGNO (part[1][2]))
25296 || (nparts == 4
25297 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25298 || (collisions > 0
25299 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25301 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25303 operands[2 + i] = part[0][j];
25304 operands[6 + i] = part[1][j];
25307 else
25309 for (i = 0; i < nparts; i++)
25311 operands[2 + i] = part[0][i];
25312 operands[6 + i] = part[1][i];
25316 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25317 if (optimize_insn_for_size_p ())
25319 for (j = 0; j < nparts - 1; j++)
25320 if (CONST_INT_P (operands[6 + j])
25321 && operands[6 + j] != const0_rtx
25322 && REG_P (operands[2 + j]))
25323 for (i = j; i < nparts - 1; i++)
25324 if (CONST_INT_P (operands[7 + i])
25325 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25326 operands[7 + i] = operands[2 + j];
25329 for (i = 0; i < nparts; i++)
25330 emit_move_insn (operands[2 + i], operands[6 + i]);
25332 return;
25335 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25336 left shift by a constant, either using a single shift or
25337 a sequence of add instructions. */
25339 static void
25340 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25342 rtx (*insn)(rtx, rtx, rtx);
25344 if (count == 1
25345 || (count * ix86_cost->add <= ix86_cost->shift_const
25346 && !optimize_insn_for_size_p ()))
25348 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25349 while (count-- > 0)
25350 emit_insn (insn (operand, operand, operand));
25352 else
25354 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25355 emit_insn (insn (operand, operand, GEN_INT (count)));
25359 void
25360 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25362 rtx (*gen_ashl3)(rtx, rtx, rtx);
25363 rtx (*gen_shld)(rtx, rtx, rtx);
25364 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25366 rtx low[2], high[2];
25367 int count;
25369 if (CONST_INT_P (operands[2]))
25371 split_double_mode (mode, operands, 2, low, high);
25372 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25374 if (count >= half_width)
25376 emit_move_insn (high[0], low[1]);
25377 emit_move_insn (low[0], const0_rtx);
25379 if (count > half_width)
25380 ix86_expand_ashl_const (high[0], count - half_width, mode);
25382 else
25384 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25386 if (!rtx_equal_p (operands[0], operands[1]))
25387 emit_move_insn (operands[0], operands[1]);
25389 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25390 ix86_expand_ashl_const (low[0], count, mode);
25392 return;
25395 split_double_mode (mode, operands, 1, low, high);
25397 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25399 if (operands[1] == const1_rtx)
25401 /* Assuming we've chosen QImode-capable registers, then 1 << N
25402 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25403 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25405 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25407 ix86_expand_clear (low[0]);
25408 ix86_expand_clear (high[0]);
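/* Test the half-width bit of the shift count (bit 5 for DImode, bit 6 for TImode); it selects whether the single set bit lands in the low or the high half, since the shifts below only use the count modulo the half width. */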
25409 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25411 d = gen_lowpart (QImode, low[0]);
25412 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25413 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25414 emit_insn (gen_rtx_SET (d, s));
25416 d = gen_lowpart (QImode, high[0]);
25417 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25418 s = gen_rtx_NE (QImode, flags, const0_rtx);
25419 emit_insn (gen_rtx_SET (d, s));
25422 /* Otherwise, we can get the same results by manually performing
25423 a bit extract operation on bit 5/6, and then performing the two
25424 shifts. The two methods of getting 0/1 into low/high are exactly
25425 the same size. Avoiding the shift in the bit extract case helps
25426 pentium4 a bit; no one else seems to care much either way. */
25427 else
25429 machine_mode half_mode;
25430 rtx (*gen_lshr3)(rtx, rtx, rtx);
25431 rtx (*gen_and3)(rtx, rtx, rtx);
25432 rtx (*gen_xor3)(rtx, rtx, rtx);
25433 HOST_WIDE_INT bits;
25434 rtx x;
25436 if (mode == DImode)
25438 half_mode = SImode;
25439 gen_lshr3 = gen_lshrsi3;
25440 gen_and3 = gen_andsi3;
25441 gen_xor3 = gen_xorsi3;
25442 bits = 5;
25444 else
25446 half_mode = DImode;
25447 gen_lshr3 = gen_lshrdi3;
25448 gen_and3 = gen_anddi3;
25449 gen_xor3 = gen_xordi3;
25450 bits = 6;
25453 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25454 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25455 else
25456 x = gen_lowpart (half_mode, operands[2]);
25457 emit_insn (gen_rtx_SET (high[0], x));
25459 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25460 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25461 emit_move_insn (low[0], high[0]);
25462 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25465 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25466 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25467 return;
25470 if (operands[1] == constm1_rtx)
25472 /* For -1 << N, we can avoid the shld instruction, because we
25473 know that we're shifting 0...31/63 ones into a -1. */
25474 emit_move_insn (low[0], constm1_rtx);
25475 if (optimize_insn_for_size_p ())
25476 emit_move_insn (high[0], low[0]);
25477 else
25478 emit_move_insn (high[0], constm1_rtx);
25480 else
25482 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25484 if (!rtx_equal_p (operands[0], operands[1]))
25485 emit_move_insn (operands[0], operands[1]);
25487 split_double_mode (mode, operands, 1, low, high);
25488 emit_insn (gen_shld (high[0], low[0], operands[2]));
25491 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25493 if (TARGET_CMOVE && scratch)
25495 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25496 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25498 ix86_expand_clear (scratch);
25499 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25501 else
25503 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25504 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25506 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25510 void
25511 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25513 rtx (*gen_ashr3)(rtx, rtx, rtx)
25514 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25515 rtx (*gen_shrd)(rtx, rtx, rtx);
25516 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25518 rtx low[2], high[2];
25519 int count;
25521 if (CONST_INT_P (operands[2]))
25523 split_double_mode (mode, operands, 2, low, high);
25524 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25526 if (count == GET_MODE_BITSIZE (mode) - 1)
25528 emit_move_insn (high[0], high[1]);
25529 emit_insn (gen_ashr3 (high[0], high[0],
25530 GEN_INT (half_width - 1)));
25531 emit_move_insn (low[0], high[0]);
25534 else if (count >= half_width)
25536 emit_move_insn (low[0], high[1]);
25537 emit_move_insn (high[0], low[0]);
25538 emit_insn (gen_ashr3 (high[0], high[0],
25539 GEN_INT (half_width - 1)));
25541 if (count > half_width)
25542 emit_insn (gen_ashr3 (low[0], low[0],
25543 GEN_INT (count - half_width)));
25545 else
25547 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25549 if (!rtx_equal_p (operands[0], operands[1]))
25550 emit_move_insn (operands[0], operands[1]);
25552 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25553 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25556 else
25558 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25560 if (!rtx_equal_p (operands[0], operands[1]))
25561 emit_move_insn (operands[0], operands[1]);
25563 split_double_mode (mode, operands, 1, low, high);
25565 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25566 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25568 if (TARGET_CMOVE && scratch)
25570 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25571 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25573 emit_move_insn (scratch, high[0]);
25574 emit_insn (gen_ashr3 (scratch, scratch,
25575 GEN_INT (half_width - 1)));
25576 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25577 scratch));
25579 else
25581 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25582 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25584 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25589 void
25590 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25592 rtx (*gen_lshr3)(rtx, rtx, rtx)
25593 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25594 rtx (*gen_shrd)(rtx, rtx, rtx);
25595 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25597 rtx low[2], high[2];
25598 int count;
25600 if (CONST_INT_P (operands[2]))
25602 split_double_mode (mode, operands, 2, low, high);
25603 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25605 if (count >= half_width)
25607 emit_move_insn (low[0], high[1]);
25608 ix86_expand_clear (high[0]);
25610 if (count > half_width)
25611 emit_insn (gen_lshr3 (low[0], low[0],
25612 GEN_INT (count - half_width)));
25614 else
25616 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25618 if (!rtx_equal_p (operands[0], operands[1]))
25619 emit_move_insn (operands[0], operands[1]);
25621 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25622 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25625 else
25627 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25629 if (!rtx_equal_p (operands[0], operands[1]))
25630 emit_move_insn (operands[0], operands[1]);
25632 split_double_mode (mode, operands, 1, low, high);
25634 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25635 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25637 if (TARGET_CMOVE && scratch)
25639 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25640 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25642 ix86_expand_clear (scratch);
25643 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25644 scratch));
25646 else
25648 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25649 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25651 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25656 /* Predict just emitted jump instruction to be taken with probability PROB. */
25657 static void
25658 predict_jump (int prob)
25660 rtx_insn *insn = get_last_insn ();
25661 gcc_assert (JUMP_P (insn));
25662 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
25665 /* Helper function for the string operations below. Test whether VARIABLE
25666 is aligned to VALUE bytes. If true, jump to the label. */
25667 static rtx_code_label *
25668 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25670 rtx_code_label *label = gen_label_rtx ();
25671 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25672 if (GET_MODE (variable) == DImode)
25673 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25674 else
25675 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25676 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25677 1, label);
25678 if (epilogue)
25679 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25680 else
25681 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25682 return label;
25685 /* Adjust COUNTER by the VALUE. */
25686 static void
25687 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25689 rtx (*gen_add)(rtx, rtx, rtx)
25690 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25692 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25695 /* Zero extend possibly SImode EXP to Pmode register. */
25696 static rtx
25697 ix86_zero_extend_to_Pmode (rtx exp)
25699 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25702 /* Divide COUNTREG by SCALE. */
25703 static rtx
25704 scale_counter (rtx countreg, int scale)
25706 rtx sc;
25708 if (scale == 1)
25709 return countreg;
25710 if (CONST_INT_P (countreg))
25711 return GEN_INT (INTVAL (countreg) / scale);
25712 gcc_assert (REG_P (countreg));
25714 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25715 GEN_INT (exact_log2 (scale)),
25716 NULL, 1, OPTAB_DIRECT);
25717 return sc;
25720 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25721 DImode for constant loop counts. */
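/* For illustration (assumed behaviour, not compiled): a constant count that
   fits in 32 bits keeps the loop counter in SImode even on 64-bit targets:

       counter_mode (GEN_INT (0x1000))                  -> SImode
       counter_mode (GEN_INT (HOST_WIDE_INT_1 << 32))   -> DImode  (TARGET_64BIT)

   while a count that already carries a mode (e.g. a register) just uses
   that mode. */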
25723 static machine_mode
25724 counter_mode (rtx count_exp)
25726 if (GET_MODE (count_exp) != VOIDmode)
25727 return GET_MODE (count_exp);
25728 if (!CONST_INT_P (count_exp))
25729 return Pmode;
25730 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25731 return DImode;
25732 return SImode;
25735 /* Copy the address to a Pmode register. This is used for x32 to
25736 truncate DImode TLS address to a SImode register. */
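/* Sketch of the x32 case handled below (hypothetical address): with
   Pmode == SImode, a DImode address is copied into a DImode register and
   then referenced as (subreg:SI reg 0), so subsequent pointer arithmetic is
   done in SImode while the full 64-bit value stays in the register. */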
25738 static rtx
25739 ix86_copy_addr_to_reg (rtx addr)
25741 rtx reg;
25742 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25744 reg = copy_addr_to_reg (addr);
25745 REG_POINTER (reg) = 1;
25746 return reg;
25748 else
25750 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25751 reg = copy_to_mode_reg (DImode, addr);
25752 REG_POINTER (reg) = 1;
25753 return gen_rtx_SUBREG (SImode, reg, 0);
25757 /* When ISSETMEM is FALSE, output a simple loop moving memory pointed to by SRCPTR
25758 to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size is COUNT,
25759 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
25760 memory to VALUE (supposed to be in MODE).
25762 The size is rounded down to a whole number of chunks moved at once.
25763 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
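/* Roughly, the generated code corresponds to this sketch (illustration only,
   memcpy case, chunk = GET_MODE_SIZE (MODE) * UNROLL):

       size = count & ~(chunk - 1);
       if (size != 0)                 -- guard skipped when chunk == 1
         {
           iter = 0;
           do
             {
               copy chunk bytes from src + iter to dest + iter;
               iter += chunk;
             }
           while (iter < size);
           dest += iter;
           src += iter;
         }
       -- the remaining count % chunk bytes are left to the caller's epilogue.

   For setmem the loads are replaced by stores of VALUE. */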
25766 static void
25767 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25768 rtx destptr, rtx srcptr, rtx value,
25769 rtx count, machine_mode mode, int unroll,
25770 int expected_size, bool issetmem)
25772 rtx_code_label *out_label, *top_label;
25773 rtx iter, tmp;
25774 machine_mode iter_mode = counter_mode (count);
25775 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25776 rtx piece_size = GEN_INT (piece_size_n);
25777 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25778 rtx size;
25779 int i;
25781 top_label = gen_label_rtx ();
25782 out_label = gen_label_rtx ();
25783 iter = gen_reg_rtx (iter_mode);
25785 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25786 NULL, 1, OPTAB_DIRECT);
25787 /* Those two should combine. */
25788 if (piece_size == const1_rtx)
25790 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25791 true, out_label);
25792 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25794 emit_move_insn (iter, const0_rtx);
25796 emit_label (top_label);
25798 tmp = convert_modes (Pmode, iter_mode, iter, true);
25800 /* This assert could be relaxed - in this case we'll need to compute
25801 smallest power of two, containing in PIECE_SIZE_N and pass it to
25802 offset_address. */
25803 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25804 destmem = offset_address (destmem, tmp, piece_size_n);
25805 destmem = adjust_address (destmem, mode, 0);
25807 if (!issetmem)
25809 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25810 srcmem = adjust_address (srcmem, mode, 0);
25812 /* When unrolling for chips that reorder memory reads and writes,
25813 we can save registers by using a single temporary.
25814 Also, using 4 temporaries is overkill in 32-bit mode. */
25815 if (!TARGET_64BIT && 0)
25817 for (i = 0; i < unroll; i++)
25819 if (i)
25821 destmem =
25822 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25823 srcmem =
25824 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25826 emit_move_insn (destmem, srcmem);
25829 else
25831 rtx tmpreg[4];
25832 gcc_assert (unroll <= 4);
25833 for (i = 0; i < unroll; i++)
25835 tmpreg[i] = gen_reg_rtx (mode);
25836 if (i)
25838 srcmem =
25839 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25841 emit_move_insn (tmpreg[i], srcmem);
25843 for (i = 0; i < unroll; i++)
25845 if (i)
25847 destmem =
25848 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25850 emit_move_insn (destmem, tmpreg[i]);
25854 else
25855 for (i = 0; i < unroll; i++)
25857 if (i)
25858 destmem =
25859 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25860 emit_move_insn (destmem, value);
25863 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
25864 true, OPTAB_LIB_WIDEN);
25865 if (tmp != iter)
25866 emit_move_insn (iter, tmp);
25868 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
25869 true, top_label);
25870 if (expected_size != -1)
25872 expected_size /= GET_MODE_SIZE (mode) * unroll;
25873 if (expected_size == 0)
25874 predict_jump (0);
25875 else if (expected_size > REG_BR_PROB_BASE)
25876 predict_jump (REG_BR_PROB_BASE - 1);
25877 else
25878 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
25880 else
25881 predict_jump (REG_BR_PROB_BASE * 80 / 100);
25882 iter = ix86_zero_extend_to_Pmode (iter);
25883 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
25884 true, OPTAB_LIB_WIDEN);
25885 if (tmp != destptr)
25886 emit_move_insn (destptr, tmp);
25887 if (!issetmem)
25889 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
25890 true, OPTAB_LIB_WIDEN);
25891 if (tmp != srcptr)
25892 emit_move_insn (srcptr, tmp);
25894 emit_label (out_label);
25897 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
25898 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
25899 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
25900 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
25901 ORIG_VALUE is the original value passed to memset to fill the memory with.
25902 Other arguments have the same meaning as for the previous function. */
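/* Example of what this expands to (illustration only, Intel syntax): a
   memcpy whose count is a known multiple of 4 is promoted to MODE == SImode
   and becomes roughly

       mov  ecx, count/4        ; countreg = scale_counter (count, 4)
       rep  movsd               ; destexp/srcexp describe dest/src + ecx*4

   while a zeroing memset similarly becomes "rep stosd" with the promoted
   value in eax. */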
25904 static void
25905 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
25906 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
25907 rtx count,
25908 machine_mode mode, bool issetmem)
25910 rtx destexp;
25911 rtx srcexp;
25912 rtx countreg;
25913 HOST_WIDE_INT rounded_count;
25915 /* If possible, it is shorter to use rep movs.
25916 TODO: Maybe it is better to move this logic to decide_alg. */
25917 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
25918 && (!issetmem || orig_value == const0_rtx))
25919 mode = SImode;
25921 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
25922 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
25924 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
25925 GET_MODE_SIZE (mode)));
25926 if (mode != QImode)
25928 destexp = gen_rtx_ASHIFT (Pmode, countreg,
25929 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25930 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
25932 else
25933 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
25934 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
25936 rounded_count
25937 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25938 destmem = shallow_copy_rtx (destmem);
25939 set_mem_size (destmem, rounded_count);
25941 else if (MEM_SIZE_KNOWN_P (destmem))
25942 clear_mem_size (destmem);
25944 if (issetmem)
25946 value = force_reg (mode, gen_lowpart (mode, value));
25947 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
25949 else
25951 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
25952 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
25953 if (mode != QImode)
25955 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
25956 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25957 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
25959 else
25960 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
25961 if (CONST_INT_P (count))
25963 rounded_count
25964 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25965 srcmem = shallow_copy_rtx (srcmem);
25966 set_mem_size (srcmem, rounded_count);
25968 else
25970 if (MEM_SIZE_KNOWN_P (srcmem))
25971 clear_mem_size (srcmem);
25973 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
25974 destexp, srcexp));
25978 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
25979 DESTMEM.
25980 SRCMEM is passed by pointer so it can be updated on return.
25981 The return value is the updated DESTMEM. */
25982 static rtx
25983 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
25984 HOST_WIDE_INT size_to_move)
25986 rtx dst = destmem, src = *srcmem, adjust, tempreg;
25987 enum insn_code code;
25988 machine_mode move_mode;
25989 int piece_size, i;
25991 /* Find the widest mode in which we could perform moves.
25992 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
25993 it until move of such size is supported. */
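/* For instance (assumed target behaviour): with SIZE_TO_MOVE == 16 on a
   64-bit target whose widest supported integer move is DImode, piece_size
   starts at 16, the TImode move is rejected by optab_handler, piece_size is
   halved to 8, and the loop below then emits two DImode load/store pairs. */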
25994 piece_size = 1 << floor_log2 (size_to_move);
25995 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
25996 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
25998 gcc_assert (piece_size > 1);
25999 piece_size >>= 1;
26002 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26003 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26004 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26006 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26007 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
26008 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26010 move_mode = word_mode;
26011 piece_size = GET_MODE_SIZE (move_mode);
26012 code = optab_handler (mov_optab, move_mode);
26015 gcc_assert (code != CODE_FOR_nothing);
26017 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26018 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26020 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26021 gcc_assert (size_to_move % piece_size == 0);
26022 adjust = GEN_INT (piece_size);
26023 for (i = 0; i < size_to_move; i += piece_size)
26025 /* We move from memory to memory, so we'll need to do it via
26026 a temporary register. */
26027 tempreg = gen_reg_rtx (move_mode);
26028 emit_insn (GEN_FCN (code) (tempreg, src));
26029 emit_insn (GEN_FCN (code) (dst, tempreg));
26031 emit_move_insn (destptr,
26032 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26033 emit_move_insn (srcptr,
26034 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26036 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26037 piece_size);
26038 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26039 piece_size);
26042 /* Update DST and SRC rtx. */
26043 *srcmem = src;
26044 return dst;
26047 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
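/* Worked example (constant count, illustration only): with COUNT == 13 and
   MAX_SIZE == 16, the epilogue size is 13 % 16 == 13 == 0b1101, so the loop
   below emits an 8-byte, a 4-byte and a 1-byte copy via emit_memmov, with
   no branches at all. */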
26048 static void
26049 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26050 rtx destptr, rtx srcptr, rtx count, int max_size)
26052 rtx src, dest;
26053 if (CONST_INT_P (count))
26055 HOST_WIDE_INT countval = INTVAL (count);
26056 HOST_WIDE_INT epilogue_size = countval % max_size;
26057 int i;
26059 /* For now MAX_SIZE should be a power of 2. This assert could be
26060 relaxed, but it'll require a bit more complicated epilogue
26061 expanding. */
26062 gcc_assert ((max_size & (max_size - 1)) == 0);
26063 for (i = max_size; i >= 1; i >>= 1)
26065 if (epilogue_size & i)
26066 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26068 return;
26070 if (max_size > 8)
26072 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26073 count, 1, OPTAB_DIRECT);
26074 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26075 count, QImode, 1, 4, false);
26076 return;
26079 /* When single stringop instructions are available, we can cheaply increase dest
26080 and src pointers. Otherwise we save code size by maintaining an offset (zero is
26081 readily available from the preceding rep operation) and using x86 addressing modes. */
26083 if (TARGET_SINGLE_STRINGOP)
26085 if (max_size > 4)
26087 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26088 src = change_address (srcmem, SImode, srcptr);
26089 dest = change_address (destmem, SImode, destptr);
26090 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26091 emit_label (label);
26092 LABEL_NUSES (label) = 1;
26094 if (max_size > 2)
26096 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26097 src = change_address (srcmem, HImode, srcptr);
26098 dest = change_address (destmem, HImode, destptr);
26099 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26100 emit_label (label);
26101 LABEL_NUSES (label) = 1;
26103 if (max_size > 1)
26105 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26106 src = change_address (srcmem, QImode, srcptr);
26107 dest = change_address (destmem, QImode, destptr);
26108 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26109 emit_label (label);
26110 LABEL_NUSES (label) = 1;
26113 else
26115 rtx offset = force_reg (Pmode, const0_rtx);
26116 rtx tmp;
26118 if (max_size > 4)
26120 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26121 src = change_address (srcmem, SImode, srcptr);
26122 dest = change_address (destmem, SImode, destptr);
26123 emit_move_insn (dest, src);
26124 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26125 true, OPTAB_LIB_WIDEN);
26126 if (tmp != offset)
26127 emit_move_insn (offset, tmp);
26128 emit_label (label);
26129 LABEL_NUSES (label) = 1;
26131 if (max_size > 2)
26133 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26134 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26135 src = change_address (srcmem, HImode, tmp);
26136 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26137 dest = change_address (destmem, HImode, tmp);
26138 emit_move_insn (dest, src);
26139 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26140 true, OPTAB_LIB_WIDEN);
26141 if (tmp != offset)
26142 emit_move_insn (offset, tmp);
26143 emit_label (label);
26144 LABEL_NUSES (label) = 1;
26146 if (max_size > 1)
26148 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26149 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26150 src = change_address (srcmem, QImode, tmp);
26151 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26152 dest = change_address (destmem, QImode, tmp);
26153 emit_move_insn (dest, src);
26154 emit_label (label);
26155 LABEL_NUSES (label) = 1;
26160 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26161 with value PROMOTED_VAL.
26162 Unlike emit_memmov, there is no source operand to update here.
26163 The return value is the updated DESTMEM. */
26164 static rtx
26165 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26166 HOST_WIDE_INT size_to_move)
26168 rtx dst = destmem, adjust;
26169 enum insn_code code;
26170 machine_mode move_mode;
26171 int piece_size, i;
26173 /* Find the widest mode in which we could perform moves.
26174 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26175 it until move of such size is supported. */
26176 move_mode = GET_MODE (promoted_val);
26177 if (move_mode == VOIDmode)
26178 move_mode = QImode;
26179 if (size_to_move < GET_MODE_SIZE (move_mode))
26181 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26182 move_mode = int_mode_for_size (move_bits, 0).require ();
26183 promoted_val = gen_lowpart (move_mode, promoted_val);
26185 piece_size = GET_MODE_SIZE (move_mode);
26186 code = optab_handler (mov_optab, move_mode);
26187 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26189 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26191 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26192 gcc_assert (size_to_move % piece_size == 0);
26193 adjust = GEN_INT (piece_size);
26194 for (i = 0; i < size_to_move; i += piece_size)
26196 if (piece_size <= GET_MODE_SIZE (word_mode))
26198 emit_insn (gen_strset (destptr, dst, promoted_val));
26199 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26200 piece_size);
26201 continue;
26204 emit_insn (GEN_FCN (code) (dst, promoted_val));
26206 emit_move_insn (destptr,
26207 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26209 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26210 piece_size);
26213 /* Update DST rtx. */
26214 return dst;
26216 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26217 static void
26218 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26219 rtx count, int max_size)
26221 count =
26222 expand_simple_binop (counter_mode (count), AND, count,
26223 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26224 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26225 gen_lowpart (QImode, value), count, QImode,
26226 1, max_size / 2, true);
26229 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26230 static void
26231 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26232 rtx count, int max_size)
26234 rtx dest;
26236 if (CONST_INT_P (count))
26238 HOST_WIDE_INT countval = INTVAL (count);
26239 HOST_WIDE_INT epilogue_size = countval % max_size;
26240 int i;
26242 /* For now MAX_SIZE should be a power of 2. This assert could be
26243 relaxed, but it'll require a bit more complicated epilogue
26244 expanding. */
26245 gcc_assert ((max_size & (max_size - 1)) == 0);
26246 for (i = max_size; i >= 1; i >>= 1)
26248 if (epilogue_size & i)
26250 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26251 destmem = emit_memset (destmem, destptr, vec_value, i);
26252 else
26253 destmem = emit_memset (destmem, destptr, value, i);
26256 return;
26258 if (max_size > 32)
26260 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26261 return;
26263 if (max_size > 16)
26265 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26266 if (TARGET_64BIT)
26268 dest = change_address (destmem, DImode, destptr);
26269 emit_insn (gen_strset (destptr, dest, value));
26270 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26271 emit_insn (gen_strset (destptr, dest, value));
26273 else
26275 dest = change_address (destmem, SImode, destptr);
26276 emit_insn (gen_strset (destptr, dest, value));
26277 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26278 emit_insn (gen_strset (destptr, dest, value));
26279 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26280 emit_insn (gen_strset (destptr, dest, value));
26281 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26282 emit_insn (gen_strset (destptr, dest, value));
26284 emit_label (label);
26285 LABEL_NUSES (label) = 1;
26287 if (max_size > 8)
26289 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26290 if (TARGET_64BIT)
26292 dest = change_address (destmem, DImode, destptr);
26293 emit_insn (gen_strset (destptr, dest, value));
26295 else
26297 dest = change_address (destmem, SImode, destptr);
26298 emit_insn (gen_strset (destptr, dest, value));
26299 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26300 emit_insn (gen_strset (destptr, dest, value));
26302 emit_label (label);
26303 LABEL_NUSES (label) = 1;
26305 if (max_size > 4)
26307 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26308 dest = change_address (destmem, SImode, destptr);
26309 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26310 emit_label (label);
26311 LABEL_NUSES (label) = 1;
26313 if (max_size > 2)
26315 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26316 dest = change_address (destmem, HImode, destptr);
26317 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26318 emit_label (label);
26319 LABEL_NUSES (label) = 1;
26321 if (max_size > 1)
26323 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26324 dest = change_address (destmem, QImode, destptr);
26325 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26326 emit_label (label);
26327 LABEL_NUSES (label) = 1;
26331 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or set enough
26332 bytes of DESTMEM, to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26333 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26334 ignored.
26335 The return value is the updated DESTMEM. */
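/* Worked example (illustration only): with ALIGN == 1 and DESIRED_ALIGNMENT
   == 8, the loop below emits three tests on DESTPTR (against bits 1, 2 and
   4), each one guarding a copy of 1, 2 and 4 bytes respectively, so at most
   7 bytes are copied before the destination is 8-byte aligned; COUNT is
   decreased by the number of bytes actually copied. */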
26336 static rtx
26337 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26338 rtx destptr, rtx srcptr, rtx value,
26339 rtx vec_value, rtx count, int align,
26340 int desired_alignment, bool issetmem)
26342 int i;
26343 for (i = 1; i < desired_alignment; i <<= 1)
26345 if (align <= i)
26347 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26348 if (issetmem)
26350 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26351 destmem = emit_memset (destmem, destptr, vec_value, i);
26352 else
26353 destmem = emit_memset (destmem, destptr, value, i);
26355 else
26356 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26357 ix86_adjust_counter (count, i);
26358 emit_label (label);
26359 LABEL_NUSES (label) = 1;
26360 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26363 return destmem;
26366 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
26367 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26368 and jump to DONE_LABEL. */
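/* Worked example (memcpy case, illustration only): with SIZE == 4 and a
   runtime COUNT of 6, the COUNT & 4 test succeeds, one SImode move copies
   bytes 0..3 of the block and a second copies bytes COUNT-4..COUNT-1
   (here 2..5), so all 6 bytes are covered by two possibly overlapping
   moves before jumping to DONE_LABEL. */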
26369 static void
26370 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26371 rtx destptr, rtx srcptr,
26372 rtx value, rtx vec_value,
26373 rtx count, int size,
26374 rtx done_label, bool issetmem)
26376 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26377 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26378 rtx modesize;
26379 int n;
26381 /* If we do not have a vector value to copy, we must reduce the size. */
26382 if (issetmem)
26384 if (!vec_value)
26386 if (GET_MODE (value) == VOIDmode && size > 8)
26387 mode = Pmode;
26388 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26389 mode = GET_MODE (value);
26391 else
26392 mode = GET_MODE (vec_value), value = vec_value;
26394 else
26396 /* Choose appropriate vector mode. */
26397 if (size >= 32)
26398 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26399 else if (size >= 16)
26400 mode = TARGET_SSE ? V16QImode : DImode;
26401 srcmem = change_address (srcmem, mode, srcptr);
26403 destmem = change_address (destmem, mode, destptr);
26404 modesize = GEN_INT (GET_MODE_SIZE (mode));
26405 gcc_assert (GET_MODE_SIZE (mode) <= size);
26406 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26408 if (issetmem)
26409 emit_move_insn (destmem, gen_lowpart (mode, value));
26410 else
26412 emit_move_insn (destmem, srcmem);
26413 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26415 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26418 destmem = offset_address (destmem, count, 1);
26419 destmem = offset_address (destmem, GEN_INT (-2 * size),
26420 GET_MODE_SIZE (mode));
26421 if (!issetmem)
26423 srcmem = offset_address (srcmem, count, 1);
26424 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26425 GET_MODE_SIZE (mode));
26427 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26429 if (issetmem)
26430 emit_move_insn (destmem, gen_lowpart (mode, value));
26431 else
26433 emit_move_insn (destmem, srcmem);
26434 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26436 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26438 emit_jump_insn (gen_jump (done_label));
26439 emit_barrier ();
26441 emit_label (label);
26442 LABEL_NUSES (label) = 1;
26445 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
26446 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26447 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
26448 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26449 DONE_LABEL is a label after the whole copying sequence. The label is created
26450 on demand if *DONE_LABEL is NULL.
26451 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
26452 bounds after the initial copies.
26454 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26455 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26456 we will dispatch to a library call for large blocks.
26458 In pseudocode we do:
26460 if (COUNT < SIZE)
26462 Assume that SIZE is 4. Bigger sizes are handled analogously
26463 if (COUNT & 4)
26465 copy 4 bytes from SRCPTR to DESTPTR
26466 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26467 goto done_label
26469 if (!COUNT)
26470 goto done_label;
26471 copy 1 byte from SRCPTR to DESTPTR
26472 if (COUNT & 2)
26474 copy 2 bytes from SRCPTR to DESTPTR
26475 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26478 else
26480 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26481 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26483 OLD_DESTPTR = DESTPTR;
26484 Align DESTPTR up to DESIRED_ALIGN
26485 SRCPTR += DESTPTR - OLD_DESTPTR
26486 COUNT -= DESTPTR - OLD_DESTPTR
26487 if (DYNAMIC_CHECK)
26488 Round COUNT down to multiple of SIZE
26489 << optional caller supplied zero size guard is here >>
26490 << optional caller supplied dynamic check is here >>
26491 << caller supplied main copy loop is here >>
26493 done_label:
26495 static void
26496 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26497 rtx *destptr, rtx *srcptr,
26498 machine_mode mode,
26499 rtx value, rtx vec_value,
26500 rtx *count,
26501 rtx_code_label **done_label,
26502 int size,
26503 int desired_align,
26504 int align,
26505 unsigned HOST_WIDE_INT *min_size,
26506 bool dynamic_check,
26507 bool issetmem)
26509 rtx_code_label *loop_label = NULL, *label;
26510 int n;
26511 rtx modesize;
26512 int prolog_size = 0;
26513 rtx mode_value;
26515 /* Choose the proper value to copy. */
26516 if (issetmem && VECTOR_MODE_P (mode))
26517 mode_value = vec_value;
26518 else
26519 mode_value = value;
26520 gcc_assert (GET_MODE_SIZE (mode) <= size);
26522 /* See if block is big or small, handle small blocks. */
26523 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26525 int size2 = size;
26526 loop_label = gen_label_rtx ();
26528 if (!*done_label)
26529 *done_label = gen_label_rtx ();
26531 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26532 1, loop_label);
26533 size2 >>= 1;
26535 /* Handle sizes > 3. */
26536 for (;size2 > 2; size2 >>= 1)
26537 expand_small_movmem_or_setmem (destmem, srcmem,
26538 *destptr, *srcptr,
26539 value, vec_value,
26540 *count,
26541 size2, *done_label, issetmem);
26542 /* Nothing to copy? Jump to DONE_LABEL if so */
26543 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26544 1, *done_label);
26546 /* Do a byte copy. */
26547 destmem = change_address (destmem, QImode, *destptr);
26548 if (issetmem)
26549 emit_move_insn (destmem, gen_lowpart (QImode, value));
26550 else
26552 srcmem = change_address (srcmem, QImode, *srcptr);
26553 emit_move_insn (destmem, srcmem);
26556 /* Handle sizes 2 and 3. */
26557 label = ix86_expand_aligntest (*count, 2, false);
26558 destmem = change_address (destmem, HImode, *destptr);
26559 destmem = offset_address (destmem, *count, 1);
26560 destmem = offset_address (destmem, GEN_INT (-2), 2);
26561 if (issetmem)
26562 emit_move_insn (destmem, gen_lowpart (HImode, value));
26563 else
26565 srcmem = change_address (srcmem, HImode, *srcptr);
26566 srcmem = offset_address (srcmem, *count, 1);
26567 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26568 emit_move_insn (destmem, srcmem);
26571 emit_label (label);
26572 LABEL_NUSES (label) = 1;
26573 emit_jump_insn (gen_jump (*done_label));
26574 emit_barrier ();
26576 else
26577 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26578 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26580 /* Start memcpy for COUNT >= SIZE. */
26581 if (loop_label)
26583 emit_label (loop_label);
26584 LABEL_NUSES (loop_label) = 1;
26587 /* Copy first desired_align bytes. */
26588 if (!issetmem)
26589 srcmem = change_address (srcmem, mode, *srcptr);
26590 destmem = change_address (destmem, mode, *destptr);
26591 modesize = GEN_INT (GET_MODE_SIZE (mode));
26592 for (n = 0; prolog_size < desired_align - align; n++)
26594 if (issetmem)
26595 emit_move_insn (destmem, mode_value);
26596 else
26598 emit_move_insn (destmem, srcmem);
26599 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26601 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26602 prolog_size += GET_MODE_SIZE (mode);
26606 /* Copy last SIZE bytes. */
26607 destmem = offset_address (destmem, *count, 1);
26608 destmem = offset_address (destmem,
26609 GEN_INT (-size - prolog_size),
26611 if (issetmem)
26612 emit_move_insn (destmem, mode_value);
26613 else
26615 srcmem = offset_address (srcmem, *count, 1);
26616 srcmem = offset_address (srcmem,
26617 GEN_INT (-size - prolog_size),
26619 emit_move_insn (destmem, srcmem);
26621 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26623 destmem = offset_address (destmem, modesize, 1);
26624 if (issetmem)
26625 emit_move_insn (destmem, mode_value);
26626 else
26628 srcmem = offset_address (srcmem, modesize, 1);
26629 emit_move_insn (destmem, srcmem);
26633 /* Align destination. */
26634 if (desired_align > 1 && desired_align > align)
26636 rtx saveddest = *destptr;
26638 gcc_assert (desired_align <= size);
26639 /* Align destptr up, place it to new register. */
26640 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26641 GEN_INT (prolog_size),
26642 NULL_RTX, 1, OPTAB_DIRECT);
26643 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26644 REG_POINTER (*destptr) = 1;
26645 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26646 GEN_INT (-desired_align),
26647 *destptr, 1, OPTAB_DIRECT);
26648 /* See how many bytes we skipped. */
26649 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26650 *destptr,
26651 saveddest, 1, OPTAB_DIRECT);
26652 /* Adjust srcptr and count. */
26653 if (!issetmem)
26654 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26655 saveddest, *srcptr, 1, OPTAB_DIRECT);
26656 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26657 saveddest, *count, 1, OPTAB_DIRECT);
26658 /* We copied at most size + prolog_size. */
26659 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26660 *min_size
26661 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26662 else
26663 *min_size = 0;
26665 /* Our loops always round down the block size, but for dispatch to
26666 library we need precise value. */
26667 if (dynamic_check)
26668 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26669 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26671 else
26673 gcc_assert (prolog_size == 0);
26674 /* Decrease count, so we won't end up copying last word twice. */
26675 if (!CONST_INT_P (*count))
26676 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26677 constm1_rtx, *count, 1, OPTAB_DIRECT);
26678 else
26679 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26680 (unsigned HOST_WIDE_INT)size));
26681 if (*min_size)
26682 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26687 /* This function is like the previous one, except here we know how many bytes
26688 need to be copied. That allows us to update alignment not only of DST, which
26689 is returned, but also of SRC, which is passed as a pointer for that
26690 reason. */
26691 static rtx
26692 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26693 rtx srcreg, rtx value, rtx vec_value,
26694 int desired_align, int align_bytes,
26695 bool issetmem)
26697 rtx src = NULL;
26698 rtx orig_dst = dst;
26699 rtx orig_src = NULL;
26700 int piece_size = 1;
26701 int copied_bytes = 0;
26703 if (!issetmem)
26705 gcc_assert (srcp != NULL);
26706 src = *srcp;
26707 orig_src = src;
26710 for (piece_size = 1;
26711 piece_size <= desired_align && copied_bytes < align_bytes;
26712 piece_size <<= 1)
26714 if (align_bytes & piece_size)
26716 if (issetmem)
26718 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26719 dst = emit_memset (dst, destreg, vec_value, piece_size);
26720 else
26721 dst = emit_memset (dst, destreg, value, piece_size);
26723 else
26724 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26725 copied_bytes += piece_size;
26728 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26729 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26730 if (MEM_SIZE_KNOWN_P (orig_dst))
26731 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26733 if (!issetmem)
26735 int src_align_bytes = get_mem_align_offset (src, desired_align
26736 * BITS_PER_UNIT);
26737 if (src_align_bytes >= 0)
26738 src_align_bytes = desired_align - src_align_bytes;
26739 if (src_align_bytes >= 0)
26741 unsigned int src_align;
26742 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26744 if ((src_align_bytes & (src_align - 1))
26745 == (align_bytes & (src_align - 1)))
26746 break;
26748 if (src_align > (unsigned int) desired_align)
26749 src_align = desired_align;
26750 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26751 set_mem_align (src, src_align * BITS_PER_UNIT);
26753 if (MEM_SIZE_KNOWN_P (orig_src))
26754 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26755 *srcp = src;
26758 return dst;
26761 /* Return true if ALG can be used in current context.
26762 Assume we expand memset if MEMSET is true. */
26763 static bool
26764 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26766 if (alg == no_stringop)
26767 return false;
26768 if (alg == vector_loop)
26769 return TARGET_SSE || TARGET_AVX;
26770 /* Algorithms using the rep prefix want at least edi and ecx;
26771 additionally, memset wants eax and memcpy wants esi. Don't
26772 consider such algorithms if the user has appropriated those
26773 registers for their own purposes, or if we have a non-default
26774 address space, since some string insns cannot override the segment. */
26775 if (alg == rep_prefix_1_byte
26776 || alg == rep_prefix_4_byte
26777 || alg == rep_prefix_8_byte)
26779 if (have_as)
26780 return false;
26781 if (fixed_regs[CX_REG]
26782 || fixed_regs[DI_REG]
26783 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26784 return false;
26786 return true;
26789 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
26790 static enum stringop_alg
26791 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26792 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26793 bool memset, bool zero_memset, bool have_as,
26794 int *dynamic_check, bool *noalign, bool recur)
26796 const struct stringop_algs *algs;
26797 bool optimize_for_speed;
26798 int max = 0;
26799 const struct processor_costs *cost;
26800 int i;
26801 bool any_alg_usable_p = false;
26803 *noalign = false;
26804 *dynamic_check = -1;
26806 /* Even if the string operation call is cold, we still might spend a lot
26807 of time processing large blocks. */
26808 if (optimize_function_for_size_p (cfun)
26809 || (optimize_insn_for_size_p ()
26810 && (max_size < 256
26811 || (expected_size != -1 && expected_size < 256))))
26812 optimize_for_speed = false;
26813 else
26814 optimize_for_speed = true;
26816 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26817 if (memset)
26818 algs = &cost->memset[TARGET_64BIT != 0];
26819 else
26820 algs = &cost->memcpy[TARGET_64BIT != 0];
26822 /* See maximal size for user defined algorithm. */
26823 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26825 enum stringop_alg candidate = algs->size[i].alg;
26826 bool usable = alg_usable_p (candidate, memset, have_as);
26827 any_alg_usable_p |= usable;
26829 if (candidate != libcall && candidate && usable)
26830 max = algs->size[i].max;
26833 /* If expected size is not known but max size is small enough
26834 so inline version is a win, set expected size into
26835 the range. */
26836 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
26837 && expected_size == -1)
26838 expected_size = min_size / 2 + max_size / 2;
26840 /* If user specified the algorithm, honor it if possible. */
26841 if (ix86_stringop_alg != no_stringop
26842 && alg_usable_p (ix86_stringop_alg, memset, have_as))
26843 return ix86_stringop_alg;
26844 /* rep; movq or rep; movl is the smallest variant. */
26845 else if (!optimize_for_speed)
26847 *noalign = true;
26848 if (!count || (count & 3) || (memset && !zero_memset))
26849 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
26850 ? rep_prefix_1_byte : loop_1_byte;
26851 else
26852 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
26853 ? rep_prefix_4_byte : loop;
26855 /* Very tiny blocks are best handled via the loop, REP is expensive to
26856 setup. */
26857 else if (expected_size != -1 && expected_size < 4)
26858 return loop_1_byte;
26859 else if (expected_size != -1)
26861 enum stringop_alg alg = libcall;
26862 bool alg_noalign = false;
26863 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26865 /* We get here if the algorithms that were not libcall-based
26866 were rep-prefix based and we are unable to use rep prefixes
26867 based on global register usage. Break out of the loop and
26868 use the heuristic below. */
26869 if (algs->size[i].max == 0)
26870 break;
26871 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
26873 enum stringop_alg candidate = algs->size[i].alg;
26875 if (candidate != libcall
26876 && alg_usable_p (candidate, memset, have_as))
26878 alg = candidate;
26879 alg_noalign = algs->size[i].noalign;
26881 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
26882 last non-libcall inline algorithm. */
26883 if (TARGET_INLINE_ALL_STRINGOPS)
26885 /* When the current size is best to be copied by a libcall,
26886 but we are still forced to inline, run the heuristic below
26887 that will pick code for medium sized blocks. */
26888 if (alg != libcall)
26890 *noalign = alg_noalign;
26891 return alg;
26893 else if (!any_alg_usable_p)
26894 break;
26896 else if (alg_usable_p (candidate, memset, have_as))
26898 *noalign = algs->size[i].noalign;
26899 return candidate;
26904 /* When asked to inline the call anyway, try to pick a meaningful choice.
26905 We look for the maximal size of block that is faster to copy by hand and
26906 take blocks of at most that size, guessing that the average size will
26907 be roughly half of the block.
26909 If this turns out to be bad, we might simply specify the preferred
26910 choice in ix86_costs. */
26911 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26912 && (algs->unknown_size == libcall
26913 || !alg_usable_p (algs->unknown_size, memset, have_as)))
26915 enum stringop_alg alg;
26916 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
26918 /* If there aren't any usable algorithms or if recursing already,
26919 then recursing on smaller sizes or same size isn't going to
26920 find anything. Just return the simple byte-at-a-time copy loop. */
26921 if (!any_alg_usable_p || recur)
26923 /* Pick something reasonable. */
26924 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
26925 *dynamic_check = 128;
26926 return loop_1_byte;
26928 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
26929 zero_memset, have_as, dynamic_check, noalign, true);
26930 gcc_assert (*dynamic_check == -1);
26931 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26932 *dynamic_check = max;
26933 else
26934 gcc_assert (alg != libcall);
26935 return alg;
26937 return (alg_usable_p (algs->unknown_size, memset, have_as)
26938 ? algs->unknown_size : libcall);
26941 /* Decide on alignment. We know that the operand is already aligned to ALIGN
26942 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
26943 static int
26944 decide_alignment (int align,
26945 enum stringop_alg alg,
26946 int expected_size,
26947 machine_mode move_mode)
26949 int desired_align = 0;
26951 gcc_assert (alg != no_stringop);
26953 if (alg == libcall)
26954 return 0;
26955 if (move_mode == VOIDmode)
26956 return 0;
26958 desired_align = GET_MODE_SIZE (move_mode);
26959 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
26960 copying a whole cache line at once. */
26961 if (TARGET_PENTIUMPRO
26962 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
26963 desired_align = 8;
26965 if (optimize_size)
26966 desired_align = 1;
26967 if (desired_align < align)
26968 desired_align = align;
26969 if (expected_size != -1 && expected_size < 4)
26970 desired_align = align;
26972 return desired_align;
26976 /* Helper function for memset. For QImode value 0xXY produce
26977 0xXYXYXYXY of the width specified by MODE. This is essentially
26978 a * 0x01010101, but we can do slightly better than
26979 synth_mult by unwinding the sequence by hand on CPUs with
26980 slow multiply. */
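/* Worked example (illustration only): promoting the QImode constant 0x5A
   to SImode yields 0x5A5A5A5A via

       v  = 0x5A;
       v |= v << 8;         -> 0x5A5A
       v |= v << 16;        -> 0x5A5A5A5A

   the non-constant path below builds the same value with insv, with a
   shift/or sequence, or with a multiply by the promoted constant 1 when
   multiplication is cheap enough. */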
26981 static rtx
26982 promote_duplicated_reg (machine_mode mode, rtx val)
26984 machine_mode valmode = GET_MODE (val);
26985 rtx tmp;
26986 int nops = mode == DImode ? 3 : 2;
26988 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
26989 if (val == const0_rtx)
26990 return copy_to_mode_reg (mode, CONST0_RTX (mode));
26991 if (CONST_INT_P (val))
26993 HOST_WIDE_INT v = INTVAL (val) & 255;
26995 v |= v << 8;
26996 v |= v << 16;
26997 if (mode == DImode)
26998 v |= (v << 16) << 16;
26999 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27002 if (valmode == VOIDmode)
27003 valmode = QImode;
27004 if (valmode != QImode)
27005 val = gen_lowpart (QImode, val);
27006 if (mode == QImode)
27007 return val;
27008 if (!TARGET_PARTIAL_REG_STALL)
27009 nops--;
27010 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27011 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27012 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27013 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27015 rtx reg = convert_modes (mode, QImode, val, true);
27016 tmp = promote_duplicated_reg (mode, const1_rtx);
27017 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27018 OPTAB_DIRECT);
27020 else
27022 rtx reg = convert_modes (mode, QImode, val, true);
27024 if (!TARGET_PARTIAL_REG_STALL)
27025 if (mode == SImode)
27026 emit_insn (gen_insvsi_1 (reg, reg));
27027 else
27028 emit_insn (gen_insvdi_1 (reg, reg));
27029 else
27031 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27032 NULL, 1, OPTAB_DIRECT);
27033 reg =
27034 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27036 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27037 NULL, 1, OPTAB_DIRECT);
27038 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27039 if (mode == SImode)
27040 return reg;
27041 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27042 NULL, 1, OPTAB_DIRECT);
27043 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27044 return reg;
27048 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that will
27049 be needed by the main loop copying SIZE_NEEDED chunks and the prologue getting
27050 alignment from ALIGN to DESIRED_ALIGN. */
27051 static rtx
27052 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27053 int align)
27055 rtx promoted_val;
27057 if (TARGET_64BIT
27058 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27059 promoted_val = promote_duplicated_reg (DImode, val);
27060 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27061 promoted_val = promote_duplicated_reg (SImode, val);
27062 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27063 promoted_val = promote_duplicated_reg (HImode, val);
27064 else
27065 promoted_val = val;
27067 return promoted_val;
27070 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27071 operations when profitable. The code depends upon architecture, block size
27072 and alignment, but always has one of the following overall structures:
27074 Aligned move sequence:
27076 1) Prologue guard: Conditional that jumps up to epilogues for small
27077 blocks that can be handled by epilogue alone. This is faster
27078 but also needed for correctness, since the prologue assumes the block
27079 is larger than the desired alignment.
27081 Optional dynamic check for size and libcall for large
27082 blocks is emitted here too, with -minline-stringops-dynamically.
27084 2) Prologue: copy first few bytes in order to get destination
27085 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27086 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27087 copied. We emit either a jump tree on power of two sized
27088 blocks, or a byte loop.
27090 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27091 with specified algorithm.
27093 4) Epilogue: code copying tail of the block that is too small to be
27094 handled by main body (or up to size guarded by prologue guard).
27096 Misaligned move sequence
27098 1) misaligned move prologue/epilogue containing:
27099 a) Prologue handling small memory blocks and jumping to done_label
27100 (skipped if blocks are known to be large enough)
27101 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
27102 needed by single possibly misaligned move
27103 (skipped if alignment is not needed)
27104 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27106 2) Zero size guard dispatching to done_label, if needed
27108 3) dispatch to library call, if needed,
27110 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27111 with specified algorithm. */
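/* As a concrete illustration (not normative): a memcpy of unknown size
   using the unrolled_loop algorithm on a 64-bit target typically expands to
   a guard jumping to the epilogue when the count is below 32 bytes
   (4 * DImode), a prologue aligning the destination, a main loop moving 32
   bytes per iteration, and an epilogue finishing the tail with the bit-test
   copies of expand_movmem_epilogue. */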
27112 bool
27113 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27114 rtx align_exp, rtx expected_align_exp,
27115 rtx expected_size_exp, rtx min_size_exp,
27116 rtx max_size_exp, rtx probable_max_size_exp,
27117 bool issetmem)
27119 rtx destreg;
27120 rtx srcreg = NULL;
27121 rtx_code_label *label = NULL;
27122 rtx tmp;
27123 rtx_code_label *jump_around_label = NULL;
27124 HOST_WIDE_INT align = 1;
27125 unsigned HOST_WIDE_INT count = 0;
27126 HOST_WIDE_INT expected_size = -1;
27127 int size_needed = 0, epilogue_size_needed;
27128 int desired_align = 0, align_bytes = 0;
27129 enum stringop_alg alg;
27130 rtx promoted_val = NULL;
27131 rtx vec_promoted_val = NULL;
27132 bool force_loopy_epilogue = false;
27133 int dynamic_check;
27134 bool need_zero_guard = false;
27135 bool noalign;
27136 machine_mode move_mode = VOIDmode;
27137 machine_mode wider_mode;
27138 int unroll_factor = 1;
27139 /* TODO: Once value ranges are available, fill in proper data. */
27140 unsigned HOST_WIDE_INT min_size = 0;
27141 unsigned HOST_WIDE_INT max_size = -1;
27142 unsigned HOST_WIDE_INT probable_max_size = -1;
27143 bool misaligned_prologue_used = false;
27144 bool have_as;
27146 if (CONST_INT_P (align_exp))
27147 align = INTVAL (align_exp);
27148 /* i386 can do misaligned access at reasonably increased cost. */
27149 if (CONST_INT_P (expected_align_exp)
27150 && INTVAL (expected_align_exp) > align)
27151 align = INTVAL (expected_align_exp);
27152 /* ALIGN is the minimum of destination and source alignment, but we care here
27153 just about destination alignment. */
27154 else if (!issetmem
27155 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27156 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27158 if (CONST_INT_P (count_exp))
27160 min_size = max_size = probable_max_size = count = expected_size
27161 = INTVAL (count_exp);
27162 /* When COUNT is 0, there is nothing to do. */
27163 if (!count)
27164 return true;
27166 else
27168 if (min_size_exp)
27169 min_size = INTVAL (min_size_exp);
27170 if (max_size_exp)
27171 max_size = INTVAL (max_size_exp);
27172 if (probable_max_size_exp)
27173 probable_max_size = INTVAL (probable_max_size_exp);
27174 if (CONST_INT_P (expected_size_exp))
27175 expected_size = INTVAL (expected_size_exp);
27178 /* Make sure we don't need to care about overflow later on. */
27179 if (count > (HOST_WIDE_INT_1U << 30))
27180 return false;
27182 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27183 if (!issetmem)
27184 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27186 /* Step 0: Decide on preferred algorithm, desired alignment and
27187 size of chunks to be copied by main loop. */
27188 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27189 issetmem,
27190 issetmem && val_exp == const0_rtx, have_as,
27191 &dynamic_check, &noalign, false);
27192 if (alg == libcall)
27193 return false;
27194 gcc_assert (alg != no_stringop);
27196 /* For now the vector version of memset is generated only for memory zeroing, as
27197 creating the promoted vector value is very cheap in this case. */
27198 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27199 alg = unrolled_loop;
27201 if (!count)
27202 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27203 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27204 if (!issetmem)
27205 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27207 unroll_factor = 1;
27208 move_mode = word_mode;
27209 switch (alg)
27211 case libcall:
27212 case no_stringop:
27213 case last_alg:
27214 gcc_unreachable ();
27215 case loop_1_byte:
27216 need_zero_guard = true;
27217 move_mode = QImode;
27218 break;
27219 case loop:
27220 need_zero_guard = true;
27221 break;
27222 case unrolled_loop:
27223 need_zero_guard = true;
27224 unroll_factor = (TARGET_64BIT ? 4 : 2);
27225 break;
27226 case vector_loop:
27227 need_zero_guard = true;
27228 unroll_factor = 4;
27229 /* Find the widest supported mode. */
27230 move_mode = word_mode;
27231 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27232 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27233 move_mode = wider_mode;
27235 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27236 move_mode = TImode;
27238 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27239 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27240 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27242 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27243 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27244 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27245 move_mode = word_mode;
27247 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27248 break;
27249 case rep_prefix_8_byte:
27250 move_mode = DImode;
27251 break;
27252 case rep_prefix_4_byte:
27253 move_mode = SImode;
27254 break;
27255 case rep_prefix_1_byte:
27256 move_mode = QImode;
27257 break;
27259 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27260 epilogue_size_needed = size_needed;
27262 /* If we are going to emit any library calls conditionally, make sure any
27263 pending stack adjustments happen before the first conditional branch;
27264 otherwise they will be emitted only on the library call path and won't
27265 happen on the other branches. */
27266 if (dynamic_check != -1)
27267 do_pending_stack_adjust ();
27269 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27270 if (!TARGET_ALIGN_STRINGOPS || noalign)
27271 align = desired_align;
27273 /* Step 1: Prologue guard. */
27275 /* Alignment code needs count to be in register. */
27276 if (CONST_INT_P (count_exp) && desired_align > align)
27278 if (INTVAL (count_exp) > desired_align
27279 && INTVAL (count_exp) > size_needed)
27281 align_bytes
27282 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27283 if (align_bytes <= 0)
27284 align_bytes = 0;
27285 else
27286 align_bytes = desired_align - align_bytes;
27288 if (align_bytes == 0)
27289 count_exp = force_reg (counter_mode (count_exp), count_exp);
27291 gcc_assert (desired_align >= 1 && align >= 1);
27293 /* Misaligned move sequences handle both prologue and epilogue at once.
27294 Default code generation results in smaller code for large alignments
27295 and also avoids redundant work when sizes are known precisely. */
27296 misaligned_prologue_used
27297 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27298 && MAX (desired_align, epilogue_size_needed) <= 32
27299 && desired_align <= epilogue_size_needed
27300 && ((desired_align > align && !align_bytes)
27301 || (!count && epilogue_size_needed > 1)));
27303 /* Do the cheap promotion to allow better CSE across the
27304 main loop and epilogue (i.e. one load of the big constant in
27305 front of all the code).
27306 For now the misaligned move sequences do not have a fast path
27307 without broadcasting. */
27308 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27310 if (alg == vector_loop)
27312 gcc_assert (val_exp == const0_rtx);
27313 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27314 promoted_val = promote_duplicated_reg_to_size (val_exp,
27315 GET_MODE_SIZE (word_mode),
27316 desired_align, align);
27318 else
27320 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27321 desired_align, align);
27324 /* Misaligned move sequences handle both prologues and epilogues at once.
27325 Default code generation results in smaller code for large alignments and
27326 also avoids redundant work when sizes are known precisely. */
27327 if (misaligned_prologue_used)
27329 /* The misaligned move prologue handles small blocks by itself. */
27330 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27331 (dst, src, &destreg, &srcreg,
27332 move_mode, promoted_val, vec_promoted_val,
27333 &count_exp,
27334 &jump_around_label,
27335 desired_align < align
27336 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27337 desired_align, align, &min_size, dynamic_check, issetmem);
27338 if (!issetmem)
27339 src = change_address (src, BLKmode, srcreg);
27340 dst = change_address (dst, BLKmode, destreg);
27341 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27342 epilogue_size_needed = 0;
27343 if (need_zero_guard
27344 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27346 /* It is possible that we copied enough so the main loop will not
27347 execute. */
27348 gcc_assert (size_needed > 1);
27349 if (jump_around_label == NULL_RTX)
27350 jump_around_label = gen_label_rtx ();
27351 emit_cmp_and_jump_insns (count_exp,
27352 GEN_INT (size_needed),
27353 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27354 if (expected_size == -1
27355 || expected_size < (desired_align - align) / 2 + size_needed)
27356 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27357 else
27358 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27361 /* Ensure that alignment prologue won't copy past end of block. */
27362 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27364 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27365 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27366 Make sure it is power of 2. */
27367 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27369 /* To improve performance for small blocks, we jump around the VAL
27370 promoting code. This means that if the promoted VAL is not a constant,
27371 we might not use it in the epilogue and have to use the byte
27372 loop variant. */
27373 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27374 force_loopy_epilogue = true;
27375 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27376 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27378 /* If main algorithm works on QImode, no epilogue is needed.
27379 For small sizes just don't align anything. */
27380 if (size_needed == 1)
27381 desired_align = align;
27382 else
27383 goto epilogue;
27385 else if (!count
27386 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27388 label = gen_label_rtx ();
27389 emit_cmp_and_jump_insns (count_exp,
27390 GEN_INT (epilogue_size_needed),
27391 LTU, 0, counter_mode (count_exp), 1, label);
27392 if (expected_size == -1 || expected_size < epilogue_size_needed)
27393 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27394 else
27395 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27399 /* Emit code to decide at runtime whether a library call or inline code
27400 should be used. */
27401 if (dynamic_check != -1)
27403 if (!issetmem && CONST_INT_P (count_exp))
27405 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27407 emit_block_copy_via_libcall (dst, src, count_exp);
27408 count_exp = const0_rtx;
27409 goto epilogue;
27412 else
27414 rtx_code_label *hot_label = gen_label_rtx ();
27415 if (jump_around_label == NULL_RTX)
27416 jump_around_label = gen_label_rtx ();
27417 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27418 LEU, 0, counter_mode (count_exp),
27419 1, hot_label);
27420 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27421 if (issetmem)
27422 set_storage_via_libcall (dst, count_exp, val_exp);
27423 else
27424 emit_block_copy_via_libcall (dst, src, count_exp);
27425 emit_jump (jump_around_label);
27426 emit_label (hot_label);
27430 /* Step 2: Alignment prologue. */
27431 /* Do the expensive promotion once we branched off the small blocks. */
27432 if (issetmem && !promoted_val)
27433 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27434 desired_align, align);
27436 if (desired_align > align && !misaligned_prologue_used)
27438 if (align_bytes == 0)
27440 /* Except for the first move in the prologue, we no longer know
27441 the constant offset in the aliasing info. It doesn't seem worth
27442 the pain to maintain it for the first move, so throw away
27443 the info early. */
27444 dst = change_address (dst, BLKmode, destreg);
27445 if (!issetmem)
27446 src = change_address (src, BLKmode, srcreg);
27447 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27448 promoted_val, vec_promoted_val,
27449 count_exp, align, desired_align,
27450 issetmem);
27451 /* At most desired_align - align bytes are copied. */
27452 if (min_size < (unsigned)(desired_align - align))
27453 min_size = 0;
27454 else
27455 min_size -= desired_align - align;
27457 else
27459 /* If we know how many bytes need to be stored before dst is
27460 sufficiently aligned, maintain aliasing info accurately. */
27461 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27462 srcreg,
27463 promoted_val,
27464 vec_promoted_val,
27465 desired_align,
27466 align_bytes,
27467 issetmem);
27469 count_exp = plus_constant (counter_mode (count_exp),
27470 count_exp, -align_bytes);
27471 count -= align_bytes;
27472 min_size -= align_bytes;
27473 max_size -= align_bytes;
27475 if (need_zero_guard
27476 && min_size < (unsigned HOST_WIDE_INT) size_needed
27477 && (count < (unsigned HOST_WIDE_INT) size_needed
27478 || (align_bytes == 0
27479 && count < ((unsigned HOST_WIDE_INT) size_needed
27480 + desired_align - align))))
27482 /* It is possible that we copied enough so the main loop will not
27483 execute. */
27484 gcc_assert (size_needed > 1);
27485 if (label == NULL_RTX)
27486 label = gen_label_rtx ();
27487 emit_cmp_and_jump_insns (count_exp,
27488 GEN_INT (size_needed),
27489 LTU, 0, counter_mode (count_exp), 1, label);
27490 if (expected_size == -1
27491 || expected_size < (desired_align - align) / 2 + size_needed)
27492 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27493 else
27494 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27497 if (label && size_needed == 1)
27499 emit_label (label);
27500 LABEL_NUSES (label) = 1;
27501 label = NULL;
27502 epilogue_size_needed = 1;
27503 if (issetmem)
27504 promoted_val = val_exp;
27506 else if (label == NULL_RTX && !misaligned_prologue_used)
27507 epilogue_size_needed = size_needed;
27509 /* Step 3: Main loop. */
27511 switch (alg)
27513 case libcall:
27514 case no_stringop:
27515 case last_alg:
27516 gcc_unreachable ();
27517 case loop_1_byte:
27518 case loop:
27519 case unrolled_loop:
27520 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27521 count_exp, move_mode, unroll_factor,
27522 expected_size, issetmem);
27523 break;
27524 case vector_loop:
27525 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27526 vec_promoted_val, count_exp, move_mode,
27527 unroll_factor, expected_size, issetmem);
27528 break;
27529 case rep_prefix_8_byte:
27530 case rep_prefix_4_byte:
27531 case rep_prefix_1_byte:
27532 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27533 val_exp, count_exp, move_mode, issetmem);
27534 break;
27536 /* Properly adjust the offset of src and dest memory for aliasing. */
27537 if (CONST_INT_P (count_exp))
27539 if (!issetmem)
27540 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27541 (count / size_needed) * size_needed);
27542 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27543 (count / size_needed) * size_needed);
27545 else
27547 if (!issetmem)
27548 src = change_address (src, BLKmode, srcreg);
27549 dst = change_address (dst, BLKmode, destreg);
27552 /* Step 4: Epilogue to copy the remaining bytes. */
27553 epilogue:
27554 if (label)
27556 /* When the main loop is done, COUNT_EXP might hold original count,
27557 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27558 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27559 bytes. Compensate if needed. */
27561 if (size_needed < epilogue_size_needed)
27563 tmp =
27564 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27565 GEN_INT (size_needed - 1), count_exp, 1,
27566 OPTAB_DIRECT);
27567 if (tmp != count_exp)
27568 emit_move_insn (count_exp, tmp);
27570 emit_label (label);
27571 LABEL_NUSES (label) = 1;
27574 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27576 if (force_loopy_epilogue)
27577 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27578 epilogue_size_needed);
27579 else
27581 if (issetmem)
27582 expand_setmem_epilogue (dst, destreg, promoted_val,
27583 vec_promoted_val, count_exp,
27584 epilogue_size_needed);
27585 else
27586 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27587 epilogue_size_needed);
27590 if (jump_around_label)
27591 emit_label (jump_around_label);
27592 return true;
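/* For reference, the movmem<mode> and setmem<mode> expander patterns in
   i386.md invoke the routine above roughly as follows (a sketch only; the
   exact operand numbering lives in the .md file):

     if (ix86_expand_set_or_movmem (operands[0], operands[1],
                                    operands[2], NULL, operands[3],
                                    operands[4], operands[5],
                                    operands[6], operands[7],
                                    operands[8], false))
       DONE;
     else
       FAIL;

   with the last argument being true and a fill-value operand passed in place
   of NULL for the setmem case.  */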
27596 /* Expand the appropriate insns for doing strlen if not just doing
27597 repnz; scasb
27599 out = result, initialized with the start address
27600 align_rtx = alignment of the address.
27601 scratch = scratch register, initialized with the start address when
27602 not aligned, otherwise undefined
27604 This is just the body. It needs the initializations mentioned above and
27605 some address computation at the end. These things are done in i386.md. */
27607 static void
27608 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27610 int align;
27611 rtx tmp;
27612 rtx_code_label *align_2_label = NULL;
27613 rtx_code_label *align_3_label = NULL;
27614 rtx_code_label *align_4_label = gen_label_rtx ();
27615 rtx_code_label *end_0_label = gen_label_rtx ();
27616 rtx mem;
27617 rtx tmpreg = gen_reg_rtx (SImode);
27618 rtx scratch = gen_reg_rtx (SImode);
27619 rtx cmp;
27621 align = 0;
27622 if (CONST_INT_P (align_rtx))
27623 align = INTVAL (align_rtx);
27625 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27627 /* Is there a known alignment and is it less than 4? */
27628 if (align < 4)
27630 rtx scratch1 = gen_reg_rtx (Pmode);
27631 emit_move_insn (scratch1, out);
27632 /* Is there a known alignment and is it not 2? */
27633 if (align != 2)
27635 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27636 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27638 /* Leave just the 3 lower bits. */
27639 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27640 NULL_RTX, 0, OPTAB_WIDEN);
27642 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27643 Pmode, 1, align_4_label);
27644 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27645 Pmode, 1, align_2_label);
27646 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27647 Pmode, 1, align_3_label);
27649 else
27651 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27652 check if it is aligned to a 4-byte boundary. */
27654 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27655 NULL_RTX, 0, OPTAB_WIDEN);
27657 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27658 Pmode, 1, align_4_label);
27661 mem = change_address (src, QImode, out);
27663 /* Now compare the bytes. */
27665 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
27666 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27667 QImode, 1, end_0_label);
27669 /* Increment the address. */
27670 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27672 /* Not needed with an alignment of 2 */
27673 if (align != 2)
27675 emit_label (align_2_label);
27677 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27678 end_0_label);
27680 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27682 emit_label (align_3_label);
27685 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27686 end_0_label);
27688 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27691 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27692 align this loop; it only makes programs bigger and does not help to
27693 speed them up. */
27694 emit_label (align_4_label);
27696 mem = change_address (src, SImode, out);
27697 emit_move_insn (scratch, mem);
27698 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27700 /* This formula yields a nonzero result iff one of the bytes is zero.
27701 This saves three branches inside the loop and many cycles. */
27703 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27704 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27705 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27706 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27707 gen_int_mode (0x80808080, SImode)));
27708 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27709 align_4_label);
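/* Illustrative arithmetic for the formula above, which leaves
   (scratch - 0x01010101) & ~scratch & 0x80808080 in tmpreg:
   with scratch = 0x12340078 (a zero in byte 1),
     scratch - 0x01010101 = 0x1132ff77
     ~scratch             = 0xedcbff87
     AND of the two       = 0x0102ff07
     ... & 0x80808080     = 0x00008000  (nonzero => some byte is zero),
   while a value with no zero byte, e.g. 0x01010101 or 0xffffffff,
   yields 0 and the loop above is taken again.  */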
27711 if (TARGET_CMOVE)
27713 rtx reg = gen_reg_rtx (SImode);
27714 rtx reg2 = gen_reg_rtx (Pmode);
27715 emit_move_insn (reg, tmpreg);
27716 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27718 /* If zero is not in the first two bytes, move two bytes forward. */
27719 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27720 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27721 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27722 emit_insn (gen_rtx_SET (tmpreg,
27723 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27724 reg,
27725 tmpreg)));
27726 /* Emit lea manually to avoid clobbering of flags. */
27727 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27729 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27730 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27731 emit_insn (gen_rtx_SET (out,
27732 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27733 reg2,
27734 out)));
27736 else
27738 rtx_code_label *end_2_label = gen_label_rtx ();
27739 /* Is zero in the first two bytes? */
27741 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27742 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27743 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27744 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27745 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27746 pc_rtx);
27747 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27748 JUMP_LABEL (tmp) = end_2_label;
27750 /* Not in the first two. Move two bytes forward. */
27751 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27752 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27754 emit_label (end_2_label);
27758 /* Avoid branch in fixing the byte. */
27759 tmpreg = gen_lowpart (QImode, tmpreg);
27760 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27761 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27762 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27763 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27765 emit_label (end_0_label);
27768 /* Expand strlen. */
27770 bool
27771 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27773 rtx addr, scratch1, scratch2, scratch3, scratch4;
27775 /* The generic case of the strlen expander is long. Avoid
27776 expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
27778 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27779 && !TARGET_INLINE_ALL_STRINGOPS
27780 && !optimize_insn_for_size_p ()
27781 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27782 return false;
27784 addr = force_reg (Pmode, XEXP (src, 0));
27785 scratch1 = gen_reg_rtx (Pmode);
27787 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27788 && !optimize_insn_for_size_p ())
27790 /* Well, it seems that some optimizer does not combine a call like
27791 foo(strlen(bar), strlen(bar));
27792 when the move and the subtraction are done here. It does calculate
27793 the length just once when these instructions are done inside of
27794 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
27795 often used and I use one fewer register for the lifetime of
27796 output_strlen_unroll(), this is better. */
27798 emit_move_insn (out, addr);
27800 ix86_expand_strlensi_unroll_1 (out, src, align);
27802 /* strlensi_unroll_1 returns the address of the zero at the end of
27803 the string, like memchr(), so compute the length by subtracting
27804 the start address. */
27805 emit_insn (ix86_gen_sub3 (out, out, addr));
27807 else
27809 rtx unspec;
27811 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27812 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27813 return false;
27814 /* Can't use this for non-default address spaces. */
27815 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27816 return false;
27818 scratch2 = gen_reg_rtx (Pmode);
27819 scratch3 = gen_reg_rtx (Pmode);
27820 scratch4 = force_reg (Pmode, constm1_rtx);
27822 emit_move_insn (scratch3, addr);
27823 eoschar = force_reg (QImode, eoschar);
27825 src = replace_equiv_address_nv (src, scratch3);
27827 /* If .md starts supporting :P, this can be done in .md. */
27828 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27829 scratch4), UNSPEC_SCAS);
27830 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27831 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27832 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
27834 return true;
27837 /* For a given symbol (function), construct code to compute the address of its
27838 PLT entry in the large x86-64 PIC model. */
27839 static rtx
27840 construct_plt_address (rtx symbol)
27842 rtx tmp, unspec;
27844 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
27845 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
27846 gcc_assert (Pmode == DImode);
27848 tmp = gen_reg_rtx (Pmode);
27849 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
27851 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
27852 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
27853 return tmp;
27857 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
27858 rtx callarg2,
27859 rtx pop, bool sibcall)
27861 rtx vec[3];
27862 rtx use = NULL, call;
27863 unsigned int vec_len = 0;
27864 tree fndecl;
27866 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27868 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
27869 if (fndecl
27870 && (lookup_attribute ("interrupt",
27871 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
27872 error ("interrupt service routine can't be called directly");
27874 else
27875 fndecl = NULL_TREE;
27877 if (pop == const0_rtx)
27878 pop = NULL;
27879 gcc_assert (!TARGET_64BIT || !pop);
27881 if (TARGET_MACHO && !TARGET_64BIT)
27883 #if TARGET_MACHO
27884 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27885 fnaddr = machopic_indirect_call_target (fnaddr);
27886 #endif
27888 else
27890 /* Static functions and indirect calls don't need the pic register. Also,
27891 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
27892 attribute, making it an indirect call. */
27893 rtx addr = XEXP (fnaddr, 0);
27894 if (flag_pic
27895 && GET_CODE (addr) == SYMBOL_REF
27896 && !SYMBOL_REF_LOCAL_P (addr))
27898 if (flag_plt
27899 && (SYMBOL_REF_DECL (addr) == NULL_TREE
27900 || !lookup_attribute ("noplt",
27901 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
27903 if (!TARGET_64BIT
27904 || (ix86_cmodel == CM_LARGE_PIC
27905 && DEFAULT_ABI != MS_ABI))
27907 use_reg (&use, gen_rtx_REG (Pmode,
27908 REAL_PIC_OFFSET_TABLE_REGNUM));
27909 if (ix86_use_pseudo_pic_reg ())
27910 emit_move_insn (gen_rtx_REG (Pmode,
27911 REAL_PIC_OFFSET_TABLE_REGNUM),
27912 pic_offset_table_rtx);
27915 else if (!TARGET_PECOFF && !TARGET_MACHO)
27917 if (TARGET_64BIT)
27919 fnaddr = gen_rtx_UNSPEC (Pmode,
27920 gen_rtvec (1, addr),
27921 UNSPEC_GOTPCREL);
27922 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27924 else
27926 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
27927 UNSPEC_GOT);
27928 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27929 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
27930 fnaddr);
27932 fnaddr = gen_const_mem (Pmode, fnaddr);
27933 /* Pmode may not be the same as word_mode for x32, which
27934 doesn't support indirect branch via 32-bit memory slot.
27935 Since x32 GOT slot is 64 bit with zero upper 32 bits,
27936 indirect branch via x32 GOT slot is OK. */
27937 if (GET_MODE (fnaddr) != word_mode)
27938 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
27939 fnaddr = gen_rtx_MEM (QImode, fnaddr);
27944 /* Skip setting up RAX register for -mskip-rax-setup when there are no
27945 parameters passed in vector registers. */
27946 if (TARGET_64BIT
27947 && (INTVAL (callarg2) > 0
27948 || (INTVAL (callarg2) == 0
27949 && (TARGET_SSE || !flag_skip_rax_setup))))
27951 rtx al = gen_rtx_REG (QImode, AX_REG);
27952 emit_move_insn (al, callarg2);
27953 use_reg (&use, al);
27956 if (ix86_cmodel == CM_LARGE_PIC
27957 && !TARGET_PECOFF
27958 && MEM_P (fnaddr)
27959 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
27960 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
27961 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
27962 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
27963 branch via x32 GOT slot is OK. */
27964 else if (!(TARGET_X32
27965 && MEM_P (fnaddr)
27966 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
27967 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
27968 && (sibcall
27969 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
27970 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
27972 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
27973 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
27976 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
27978 if (retval)
27980 /* We should add bound registers as destinations in case
27981 a pointer with bounds may be returned. */
27982 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
27984 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
27985 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
27986 if (GET_CODE (retval) == PARALLEL)
27988 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
27989 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
27990 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
27991 retval = chkp_join_splitted_slot (retval, par);
27993 else
27995 retval = gen_rtx_PARALLEL (VOIDmode,
27996 gen_rtvec (3, retval, b0, b1));
27997 chkp_put_regs_to_expr_list (retval);
28001 call = gen_rtx_SET (retval, call);
28003 vec[vec_len++] = call;
28005 if (pop)
28007 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28008 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28009 vec[vec_len++] = pop;
28012 if (cfun->machine->no_caller_saved_registers
28013 && (!fndecl
28014 || (!TREE_THIS_VOLATILE (fndecl)
28015 && !lookup_attribute ("no_caller_saved_registers",
28016 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28018 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28019 bool is_64bit_ms_abi = (TARGET_64BIT
28020 && ix86_function_abi (fndecl) == MS_ABI);
28021 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28023 /* If there are no caller-saved registers, add all registers
28024 that are clobbered by the call which returns. */
28025 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28026 if (!fixed_regs[i]
28027 && (ix86_call_used_regs[i] == 1
28028 || (ix86_call_used_regs[i] & c_mask))
28029 && !STACK_REGNO_P (i)
28030 && !MMX_REGNO_P (i))
28031 clobber_reg (&use,
28032 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28034 else if (TARGET_64BIT_MS_ABI
28035 && (!callarg2 || INTVAL (callarg2) != -2))
28037 unsigned i;
28039 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28041 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28042 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28044 clobber_reg (&use, gen_rtx_REG (mode, regno));
28047 /* Set here, but it may get cleared later. */
28048 if (TARGET_CALL_MS2SYSV_XLOGUES)
28050 if (!TARGET_SSE)
28053 /* Don't break hot-patched functions. */
28054 else if (ix86_function_ms_hook_prologue (current_function_decl))
28057 /* TODO: Cases not yet examined. */
28058 else if (flag_split_stack)
28059 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28061 else
28063 gcc_assert (!reload_completed);
28064 cfun->machine->call_ms2sysv = true;
28069 if (vec_len > 1)
28070 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28071 call = emit_call_insn (call);
28072 if (use)
28073 CALL_INSN_FUNCTION_USAGE (call) = use;
28075 return call;
28078 /* Return true if the function being called was marked with attribute
28079 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28080 to handle the non-PIC case in the backend because there is no easy
28081 interface for the front-end to force non-PLT calls to use the GOT.
28082 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28083 to call the function marked "noplt" indirectly. */
28085 static bool
28086 ix86_nopic_noplt_attribute_p (rtx call_op)
28088 if (flag_pic || ix86_cmodel == CM_LARGE
28089 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28090 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28091 || SYMBOL_REF_LOCAL_P (call_op))
28092 return false;
28094 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28096 if (!flag_plt
28097 || (symbol_decl != NULL_TREE
28098 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28099 return true;
28101 return false;
28104 /* Output the assembly for a call instruction. */
28106 const char *
28107 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28109 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28110 bool seh_nop_p = false;
28111 const char *xasm;
28113 if (SIBLING_CALL_P (insn))
28115 if (direct_p)
28117 if (ix86_nopic_noplt_attribute_p (call_op))
28119 if (TARGET_64BIT)
28120 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28121 else
28122 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28124 else
28125 xasm = "%!jmp\t%P0";
28127 /* SEH epilogue detection requires the indirect branch case
28128 to include REX.W. */
28129 else if (TARGET_SEH)
28130 xasm = "%!rex.W jmp\t%A0";
28131 else
28132 xasm = "%!jmp\t%A0";
28134 output_asm_insn (xasm, &call_op);
28135 return "";
28138 /* SEH unwinding can require an extra nop to be emitted in several
28139 circumstances. Determine if we have one of those. */
28140 if (TARGET_SEH)
28142 rtx_insn *i;
28144 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28146 /* If we get to another real insn, we don't need the nop. */
28147 if (INSN_P (i))
28148 break;
28150 /* If we get to the epilogue note, prevent a catch region from
28151 being adjacent to the standard epilogue sequence. If non-
28152 call-exceptions, we'll have done this during epilogue emission. */
28153 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28154 && !flag_non_call_exceptions
28155 && !can_throw_internal (insn))
28157 seh_nop_p = true;
28158 break;
28162 /* If we didn't find a real insn following the call, prevent the
28163 unwinder from looking into the next function. */
28164 if (i == NULL)
28165 seh_nop_p = true;
28168 if (direct_p)
28170 if (ix86_nopic_noplt_attribute_p (call_op))
28172 if (TARGET_64BIT)
28173 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28174 else
28175 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28177 else
28178 xasm = "%!call\t%P0";
28180 else
28181 xasm = "%!call\t%A0";
28183 output_asm_insn (xasm, &call_op);
28185 if (seh_nop_p)
28186 return "nop";
28188 return "";
28191 /* Clear stack slot assignments remembered from previous functions.
28192 This is called from INIT_EXPANDERS once before RTL is emitted for each
28193 function. */
28195 static struct machine_function *
28196 ix86_init_machine_status (void)
28198 struct machine_function *f;
28200 f = ggc_cleared_alloc<machine_function> ();
28201 f->call_abi = ix86_abi;
28203 return f;
28206 /* Return a MEM corresponding to a stack slot with mode MODE.
28207 Allocate a new slot if necessary.
28209 The RTL for a function can have several slots available: N is
28210 which slot to use. */
28213 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28215 struct stack_local_entry *s;
28217 gcc_assert (n < MAX_386_STACK_LOCALS);
28219 for (s = ix86_stack_locals; s; s = s->next)
28220 if (s->mode == mode && s->n == n)
28221 return validize_mem (copy_rtx (s->rtl));
28223 s = ggc_alloc<stack_local_entry> ();
28224 s->n = n;
28225 s->mode = mode;
28226 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28228 s->next = ix86_stack_locals;
28229 ix86_stack_locals = s;
28230 return validize_mem (copy_rtx (s->rtl));
28233 static void
28234 ix86_instantiate_decls (void)
28236 struct stack_local_entry *s;
28238 for (s = ix86_stack_locals; s; s = s->next)
28239 if (s->rtl != NULL_RTX)
28240 instantiate_decl_rtl (s->rtl);
28243 /* Return the number used for encoding REG, in the range 0..7. */
28245 static int
28246 reg_encoded_number (rtx reg)
28248 unsigned regno = REGNO (reg);
28249 switch (regno)
28251 case AX_REG:
28252 return 0;
28253 case CX_REG:
28254 return 1;
28255 case DX_REG:
28256 return 2;
28257 case BX_REG:
28258 return 3;
28259 case SP_REG:
28260 return 4;
28261 case BP_REG:
28262 return 5;
28263 case SI_REG:
28264 return 6;
28265 case DI_REG:
28266 return 7;
28267 default:
28268 break;
28270 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28271 return regno - FIRST_STACK_REG;
28272 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28273 return regno - FIRST_SSE_REG;
28274 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28275 return regno - FIRST_MMX_REG;
28276 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28277 return regno - FIRST_REX_SSE_REG;
28278 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28279 return regno - FIRST_REX_INT_REG;
28280 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28281 return regno - FIRST_MASK_REG;
28282 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28283 return regno - FIRST_BND_REG;
28284 return -1;
28287 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28288 in its encoding if it could be relevant for ROP mitigation, otherwise
28289 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28290 used for calculating it into them. */
28292 static int
28293 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28294 int *popno0 = 0, int *popno1 = 0)
28296 if (asm_noperands (PATTERN (insn)) >= 0)
28297 return -1;
28298 int has_modrm = get_attr_modrm (insn);
28299 if (!has_modrm)
28300 return -1;
28301 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28302 rtx op0, op1;
28303 switch (cls)
28305 case MODRM_CLASS_OP02:
28306 gcc_assert (noperands >= 3);
28307 if (popno0)
28309 *popno0 = 0;
28310 *popno1 = 2;
28312 op0 = operands[0];
28313 op1 = operands[2];
28314 break;
28315 case MODRM_CLASS_OP01:
28316 gcc_assert (noperands >= 2);
28317 if (popno0)
28319 *popno0 = 0;
28320 *popno1 = 1;
28322 op0 = operands[0];
28323 op1 = operands[1];
28324 break;
28325 default:
28326 return -1;
28328 if (REG_P (op0) && REG_P (op1))
28330 int enc0 = reg_encoded_number (op0);
28331 int enc1 = reg_encoded_number (op1);
28332 return 0xc0 + (enc1 << 3) + enc0;
28334 return -1;
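/* For instance (illustrative only), if op0 is %eax (encoding 0) and op1 is
   %ecx (encoding 1), the value computed above is 0xc0 + (1 << 3) + 0 = 0xc8,
   i.e. a modrm byte with mod = 11, reg = 001 and r/m = 000.  */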
28337 /* Check whether x86 address PARTS is a pc-relative address. */
28339 bool
28340 ix86_rip_relative_addr_p (struct ix86_address *parts)
28342 rtx base, index, disp;
28344 base = parts->base;
28345 index = parts->index;
28346 disp = parts->disp;
28348 if (disp && !base && !index)
28350 if (TARGET_64BIT)
28352 rtx symbol = disp;
28354 if (GET_CODE (disp) == CONST)
28355 symbol = XEXP (disp, 0);
28356 if (GET_CODE (symbol) == PLUS
28357 && CONST_INT_P (XEXP (symbol, 1)))
28358 symbol = XEXP (symbol, 0);
28360 if (GET_CODE (symbol) == LABEL_REF
28361 || (GET_CODE (symbol) == SYMBOL_REF
28362 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28363 || (GET_CODE (symbol) == UNSPEC
28364 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28365 || XINT (symbol, 1) == UNSPEC_PCREL
28366 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28367 return true;
28370 return false;
28373 /* Calculate the length of the memory address in the instruction encoding.
28374 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28375 or other prefixes. We never generate addr32 prefix for LEA insn. */
28378 memory_address_length (rtx addr, bool lea)
28380 struct ix86_address parts;
28381 rtx base, index, disp;
28382 int len;
28383 int ok;
28385 if (GET_CODE (addr) == PRE_DEC
28386 || GET_CODE (addr) == POST_INC
28387 || GET_CODE (addr) == PRE_MODIFY
28388 || GET_CODE (addr) == POST_MODIFY)
28389 return 0;
28391 ok = ix86_decompose_address (addr, &parts);
28392 gcc_assert (ok);
28394 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28396 /* If this is not LEA instruction, add the length of addr32 prefix. */
28397 if (TARGET_64BIT && !lea
28398 && (SImode_address_operand (addr, VOIDmode)
28399 || (parts.base && GET_MODE (parts.base) == SImode)
28400 || (parts.index && GET_MODE (parts.index) == SImode)))
28401 len++;
28403 base = parts.base;
28404 index = parts.index;
28405 disp = parts.disp;
28407 if (base && SUBREG_P (base))
28408 base = SUBREG_REG (base);
28409 if (index && SUBREG_P (index))
28410 index = SUBREG_REG (index);
28412 gcc_assert (base == NULL_RTX || REG_P (base));
28413 gcc_assert (index == NULL_RTX || REG_P (index));
28415 /* Rule of thumb:
28416 - esp as the base always wants an index,
28417 - ebp as the base always wants a displacement,
28418 - r12 as the base always wants an index,
28419 - r13 as the base always wants a displacement. */
28421 /* Register Indirect. */
28422 if (base && !index && !disp)
28424 /* esp (for its index) and ebp (for its displacement) need
28425 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28426 code. */
28427 if (base == arg_pointer_rtx
28428 || base == frame_pointer_rtx
28429 || REGNO (base) == SP_REG
28430 || REGNO (base) == BP_REG
28431 || REGNO (base) == R12_REG
28432 || REGNO (base) == R13_REG)
28433 len++;
28436 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28437 is not disp32, but disp32(%rip), so for disp32
28438 SIB byte is needed, unless print_operand_address
28439 optimizes it into disp32(%rip) or (%rip) is implied
28440 by UNSPEC. */
28441 else if (disp && !base && !index)
28443 len += 4;
28444 if (!ix86_rip_relative_addr_p (&parts))
28445 len++;
28447 else
28449 /* Find the length of the displacement constant. */
28450 if (disp)
28452 if (base && satisfies_constraint_K (disp))
28453 len += 1;
28454 else
28455 len += 4;
28457 /* ebp always wants a displacement. Similarly r13. */
28458 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28459 len++;
28461 /* An index requires the two-byte modrm form.... */
28462 if (index
28463 /* ...like esp (or r12), which always wants an index. */
28464 || base == arg_pointer_rtx
28465 || base == frame_pointer_rtx
28466 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28467 len++;
28470 return len;
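/* Two illustrative traces of the function above (lengths exclude the
   modrm byte, the opcode and other prefixes):

     (%ebp)   -> register indirect with ebp as base: needs a disp8 of zero,
                 so the result is 1.
     4(%esp)  -> base esp with a small displacement: 1 byte for the disp8
                 plus 1 byte for the SIB byte that esp requires, so 2.  */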
28473 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28474 is set, expect that the insn has an 8-bit immediate alternative. */
28476 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28478 int len = 0;
28479 int i;
28480 extract_insn_cached (insn);
28481 for (i = recog_data.n_operands - 1; i >= 0; --i)
28482 if (CONSTANT_P (recog_data.operand[i]))
28484 enum attr_mode mode = get_attr_mode (insn);
28486 gcc_assert (!len);
28487 if (shortform && CONST_INT_P (recog_data.operand[i]))
28489 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28490 switch (mode)
28492 case MODE_QI:
28493 len = 1;
28494 continue;
28495 case MODE_HI:
28496 ival = trunc_int_for_mode (ival, HImode);
28497 break;
28498 case MODE_SI:
28499 ival = trunc_int_for_mode (ival, SImode);
28500 break;
28501 default:
28502 break;
28504 if (IN_RANGE (ival, -128, 127))
28506 len = 1;
28507 continue;
28510 switch (mode)
28512 case MODE_QI:
28513 len = 1;
28514 break;
28515 case MODE_HI:
28516 len = 2;
28517 break;
28518 case MODE_SI:
28519 len = 4;
28520 break;
28521 /* Immediates for DImode instructions are encoded
28522 as 32bit sign extended values. */
28523 case MODE_DI:
28524 len = 4;
28525 break;
28526 default:
28527 fatal_insn ("unknown insn mode", insn);
28530 return len;
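/* Illustrative values returned by the function above: for "add $3, %eax"
   in a pattern with an 8-bit immediate alternative (SHORTFORM set) the
   immediate length is 1; for "add $1000, %eax" it is 4; and a DImode
   immediate is also 4, since it is encoded as a 32-bit sign-extended
   value.  */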
28533 /* Compute default value for "length_address" attribute. */
28535 ix86_attr_length_address_default (rtx_insn *insn)
28537 int i;
28539 if (get_attr_type (insn) == TYPE_LEA)
28541 rtx set = PATTERN (insn), addr;
28543 if (GET_CODE (set) == PARALLEL)
28544 set = XVECEXP (set, 0, 0);
28546 gcc_assert (GET_CODE (set) == SET);
28548 addr = SET_SRC (set);
28550 return memory_address_length (addr, true);
28553 extract_insn_cached (insn);
28554 for (i = recog_data.n_operands - 1; i >= 0; --i)
28556 rtx op = recog_data.operand[i];
28557 if (MEM_P (op))
28559 constrain_operands_cached (insn, reload_completed);
28560 if (which_alternative != -1)
28562 const char *constraints = recog_data.constraints[i];
28563 int alt = which_alternative;
28565 while (*constraints == '=' || *constraints == '+')
28566 constraints++;
28567 while (alt-- > 0)
28568 while (*constraints++ != ',')
28570 /* Skip ignored operands. */
28571 if (*constraints == 'X')
28572 continue;
28575 int len = memory_address_length (XEXP (op, 0), false);
28577 /* Account for segment prefix for non-default addr spaces. */
28578 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28579 len++;
28581 return len;
28584 return 0;
28587 /* Compute default value for "length_vex" attribute. It includes
28588 2 or 3 byte VEX prefix and 1 opcode byte. */
28591 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28592 bool has_vex_w)
28594 int i;
28596 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
28597 requires the 3-byte VEX prefix. */
28598 if (!has_0f_opcode || has_vex_w)
28599 return 3 + 1;
28601 /* We can always use 2 byte VEX prefix in 32bit. */
28602 if (!TARGET_64BIT)
28603 return 2 + 1;
28605 extract_insn_cached (insn);
28607 for (i = recog_data.n_operands - 1; i >= 0; --i)
28608 if (REG_P (recog_data.operand[i]))
28610 /* REX.W bit uses 3 byte VEX prefix. */
28611 if (GET_MODE (recog_data.operand[i]) == DImode
28612 && GENERAL_REG_P (recog_data.operand[i]))
28613 return 3 + 1;
28615 else
28617 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28618 if (MEM_P (recog_data.operand[i])
28619 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28620 return 3 + 1;
28623 return 2 + 1;
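/* Illustrative results of the function above: outside the 0f opcode map, or
   with VEX.W set, the answer is always 3 + 1 = 4; in 32-bit mode it is
   always 2 + 1 = 3; otherwise it is 3 + 1 = 4 when a DImode general
   register operand (REX.W) or a memory operand mentioning the extended
   registers (REX.X/REX.B) is found, and 2 + 1 = 3 otherwise.  */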
28627 static bool
28628 ix86_class_likely_spilled_p (reg_class_t);
28630 /* Return true if the lhs of INSN is a HW function argument register; set
28631 IS_SPILLED to true if it is a likely-spilled HW register. */
28632 static bool
28633 insn_is_function_arg (rtx insn, bool* is_spilled)
28635 rtx dst;
28637 if (!NONDEBUG_INSN_P (insn))
28638 return false;
28639 /* Call instructions are not movable; ignore them. */
28640 if (CALL_P (insn))
28641 return false;
28642 insn = PATTERN (insn);
28643 if (GET_CODE (insn) == PARALLEL)
28644 insn = XVECEXP (insn, 0, 0);
28645 if (GET_CODE (insn) != SET)
28646 return false;
28647 dst = SET_DEST (insn);
28648 if (REG_P (dst) && HARD_REGISTER_P (dst)
28649 && ix86_function_arg_regno_p (REGNO (dst)))
28651 /* Is it a likely-spilled HW register? */
28652 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
28653 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
28654 *is_spilled = true;
28655 return true;
28657 return false;
28660 /* Add output dependencies for a chain of adjacent function arguments, but
28661 only if there is a move to a likely-spilled HW register. Return the first
28662 argument if at least one dependence was added, or NULL otherwise. */
28663 static rtx_insn *
28664 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
28666 rtx_insn *insn;
28667 rtx_insn *last = call;
28668 rtx_insn *first_arg = NULL;
28669 bool is_spilled = false;
28671 head = PREV_INSN (head);
28673 /* Find the argument-passing instruction nearest to the call. */
28674 while (true)
28676 last = PREV_INSN (last);
28677 if (last == head)
28678 return NULL;
28679 if (!NONDEBUG_INSN_P (last))
28680 continue;
28681 if (insn_is_function_arg (last, &is_spilled))
28682 break;
28683 return NULL;
28686 first_arg = last;
28687 while (true)
28689 insn = PREV_INSN (last);
28690 if (!INSN_P (insn))
28691 break;
28692 if (insn == head)
28693 break;
28694 if (!NONDEBUG_INSN_P (insn))
28696 last = insn;
28697 continue;
28699 if (insn_is_function_arg (insn, &is_spilled))
28701 /* Add an output dependence between two function arguments if the chain
28702 of output arguments contains likely-spilled HW registers. */
28703 if (is_spilled)
28704 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28705 first_arg = last = insn;
28707 else
28708 break;
28710 if (!is_spilled)
28711 return NULL;
28712 return first_arg;
28715 /* Add an output or anti dependency from INSN to FIRST_ARG to restrict its
28716 code motion. */
28717 static void
28718 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
28720 rtx set;
28721 rtx tmp;
28723 /* Add anti dependencies for bounds stores. */
28724 if (INSN_P (insn)
28725 && GET_CODE (PATTERN (insn)) == PARALLEL
28726 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
28727 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
28729 add_dependence (first_arg, insn, REG_DEP_ANTI);
28730 return;
28733 set = single_set (insn);
28734 if (!set)
28735 return;
28736 tmp = SET_DEST (set);
28737 if (REG_P (tmp))
28739 /* Add output dependency to the first function argument. */
28740 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28741 return;
28743 /* Add anti dependency. */
28744 add_dependence (first_arg, insn, REG_DEP_ANTI);
28747 /* Avoid cross-block motion of a function argument by adding a dependency
28748 from the first non-jump instruction in bb. */
28749 static void
28750 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
28752 rtx_insn *insn = BB_END (bb);
28754 while (insn)
28756 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
28758 rtx set = single_set (insn);
28759 if (set)
28761 avoid_func_arg_motion (arg, insn);
28762 return;
28765 if (insn == BB_HEAD (bb))
28766 return;
28767 insn = PREV_INSN (insn);
28771 /* Hook for pre-reload schedule - avoid motion of function arguments
28772 passed in likely spilled HW registers. */
28773 static void
28774 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
28776 rtx_insn *insn;
28777 rtx_insn *first_arg = NULL;
28778 if (reload_completed)
28779 return;
28780 while (head != tail && DEBUG_INSN_P (head))
28781 head = NEXT_INSN (head);
28782 for (insn = tail; insn != head; insn = PREV_INSN (insn))
28783 if (INSN_P (insn) && CALL_P (insn))
28785 first_arg = add_parameter_dependencies (insn, head);
28786 if (first_arg)
28788 /* Add a dependee for the first argument to predecessors, but only if
28789 the region contains more than one block. */
28790 basic_block bb = BLOCK_FOR_INSN (insn);
28791 int rgn = CONTAINING_RGN (bb->index);
28792 int nr_blks = RGN_NR_BLOCKS (rgn);
28793 /* Skip trivial regions and region head blocks that can have
28794 predecessors outside of region. */
28795 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
28797 edge e;
28798 edge_iterator ei;
28800 /* Regions are SCCs with the exception of selective
28801 scheduling with pipelining of outer blocks enabled.
28802 So also check that immediate predecessors of a non-head
28803 block are in the same region. */
28804 FOR_EACH_EDGE (e, ei, bb->preds)
28806 /* Avoid creating loop-carried dependencies by using
28807 the topological ordering in the region. */
28808 if (rgn == CONTAINING_RGN (e->src->index)
28809 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
28810 add_dependee_for_func_arg (first_arg, e->src);
28813 insn = first_arg;
28814 if (insn == head)
28815 break;
28818 else if (first_arg)
28819 avoid_func_arg_motion (first_arg, insn);
28822 /* Hook for pre-reload schedule - set priority of moves from likely-spilled
28823 HW registers to the maximum, to schedule them as soon as possible. These are
28824 moves from function argument registers at the top of the function entry
28825 and moves from function return value registers after call. */
28826 static int
28827 ix86_adjust_priority (rtx_insn *insn, int priority)
28829 rtx set;
28831 if (reload_completed)
28832 return priority;
28834 if (!NONDEBUG_INSN_P (insn))
28835 return priority;
28837 set = single_set (insn);
28838 if (set)
28840 rtx tmp = SET_SRC (set);
28841 if (REG_P (tmp)
28842 && HARD_REGISTER_P (tmp)
28843 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
28844 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
28845 return current_sched_info->sched_max_insns_priority;
28848 return priority;
28851 /* Prepare for scheduling pass. */
28852 static void
28853 ix86_sched_init_global (FILE *, int, int)
28855 /* Install scheduling hooks for current CPU. Some of these hooks are used
28856 in time-critical parts of the scheduler, so we only set them up when
28857 they are actually used. */
28858 switch (ix86_tune)
28860 case PROCESSOR_CORE2:
28861 case PROCESSOR_NEHALEM:
28862 case PROCESSOR_SANDYBRIDGE:
28863 case PROCESSOR_HASWELL:
28864 case PROCESSOR_GENERIC:
28865 /* Do not perform multipass scheduling for pre-reload schedule
28866 to save compile time. */
28867 if (reload_completed)
28869 ix86_core2i7_init_hooks ();
28870 break;
28872 /* Fall through. */
28873 default:
28874 targetm.sched.dfa_post_advance_cycle = NULL;
28875 targetm.sched.first_cycle_multipass_init = NULL;
28876 targetm.sched.first_cycle_multipass_begin = NULL;
28877 targetm.sched.first_cycle_multipass_issue = NULL;
28878 targetm.sched.first_cycle_multipass_backtrack = NULL;
28879 targetm.sched.first_cycle_multipass_end = NULL;
28880 targetm.sched.first_cycle_multipass_fini = NULL;
28881 break;
28886 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
28888 static HOST_WIDE_INT
28889 ix86_static_rtx_alignment (machine_mode mode)
28891 if (mode == DFmode)
28892 return 64;
28893 if (ALIGN_MODE_128 (mode))
28894 return MAX (128, GET_MODE_ALIGNMENT (mode));
28895 return GET_MODE_ALIGNMENT (mode);
28898 /* Implement TARGET_CONSTANT_ALIGNMENT. */
28900 static HOST_WIDE_INT
28901 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
28903 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
28904 || TREE_CODE (exp) == INTEGER_CST)
28906 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
28907 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
28908 return MAX (mode_align, align);
28910 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
28911 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
28912 return BITS_PER_WORD;
28914 return align;
28917 /* Implement TARGET_EMPTY_RECORD_P. */
28919 static bool
28920 ix86_is_empty_record (const_tree type)
28922 if (!TARGET_64BIT)
28923 return false;
28924 return default_is_empty_record (type);
28927 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
28929 static void
28930 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
28932 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
28934 if (!cum->warn_empty)
28935 return;
28937 if (!TYPE_EMPTY_P (type))
28938 return;
28940 const_tree ctx = get_ultimate_context (cum->decl);
28941 if (ctx != NULL_TREE
28942 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
28943 return;
28945 /* If the actual size of the type is zero, then there is no change
28946 in how objects of this size are passed. */
28947 if (int_size_in_bytes (type) == 0)
28948 return;
28950 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
28951 "changes in -fabi-version=12 (GCC 8)", type);
28953 /* Only warn once. */
28954 cum->warn_empty = false;
28957 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
28958 the data type, and ALIGN is the alignment that the object would
28959 ordinarily have. */
28961 static int
28962 iamcu_alignment (tree type, int align)
28964 machine_mode mode;
28966 if (align < 32 || TYPE_USER_ALIGN (type))
28967 return align;
28969 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
28970 bytes. */
28971 mode = TYPE_MODE (strip_array_types (type));
28972 switch (GET_MODE_CLASS (mode))
28974 case MODE_INT:
28975 case MODE_COMPLEX_INT:
28976 case MODE_COMPLEX_FLOAT:
28977 case MODE_FLOAT:
28978 case MODE_DECIMAL_FLOAT:
28979 return 32;
28980 default:
28981 return align;
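/* For example, if a scalar such as a "double" would otherwise have been
   given a 64-bit alignment, the function above caps it back to 32 bits,
   as the Intel MCU psABI requires; aggregates (BLKmode), user-aligned
   types and anything whose alignment is already below 32 bits are
   returned unchanged.  */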
28985 /* Compute the alignment for a static variable.
28986 TYPE is the data type, and ALIGN is the alignment that
28987 the object would ordinarily have. The value of this function is used
28988 instead of that alignment to align the object. */
28991 ix86_data_alignment (tree type, int align, bool opt)
28993 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
28994 for symbols from other compilation units or symbols that don't need
28995 to bind locally. In order to preserve some ABI compatibility with
28996 those compilers, ensure we don't decrease alignment from what we
28997 used to assume. */
28999 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
29001 /* A data structure whose size is equal to or greater than the size of a
29002 cache line (64 bytes in the Pentium 4 and other recent Intel processors,
29003 including processors based on the Intel Core microarchitecture) should be
29004 aligned so that its base address is a multiple of the cache line size. */
29006 int max_align
29007 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
29009 if (max_align < BITS_PER_WORD)
29010 max_align = BITS_PER_WORD;
29012 switch (ix86_align_data_type)
29014 case ix86_align_data_type_abi: opt = false; break;
29015 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
29016 case ix86_align_data_type_cacheline: break;
29019 if (TARGET_IAMCU)
29020 align = iamcu_alignment (type, align);
29022 if (opt
29023 && AGGREGATE_TYPE_P (type)
29024 && TYPE_SIZE (type)
29025 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
29027 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
29028 && align < max_align_compat)
29029 align = max_align_compat;
29030 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
29031 && align < max_align)
29032 align = max_align;
29035 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
29036 to a 16-byte boundary. */
29037 if (TARGET_64BIT)
29039 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
29040 && TYPE_SIZE (type)
29041 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29042 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29043 && align < 128)
29044 return 128;
29047 if (!opt)
29048 return align;
29050 if (TREE_CODE (type) == ARRAY_TYPE)
29052 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29053 return 64;
29054 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29055 return 128;
29057 else if (TREE_CODE (type) == COMPLEX_TYPE)
29060 if (TYPE_MODE (type) == DCmode && align < 64)
29061 return 64;
29062 if ((TYPE_MODE (type) == XCmode
29063 || TYPE_MODE (type) == TCmode) && align < 128)
29064 return 128;
29066 else if ((TREE_CODE (type) == RECORD_TYPE
29067 || TREE_CODE (type) == UNION_TYPE
29068 || TREE_CODE (type) == QUAL_UNION_TYPE)
29069 && TYPE_FIELDS (type))
29071 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29072 return 64;
29073 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29074 return 128;
29076 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29077 || TREE_CODE (type) == INTEGER_TYPE)
29079 if (TYPE_MODE (type) == DFmode && align < 64)
29080 return 64;
29081 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29082 return 128;
29085 return align;
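/* A worked example for the function above, assuming the default
   -malign-data setting and a prefetch_block of 64 bytes in the active tune
   costs: a 4 kB global char array compiled with optimization (opt set) is
   an aggregate larger than both thresholds, so it is aligned to
   max_align = 64 * 8 = 512 bits (64 bytes); without optimization
   (opt clear) only the 128-bit x86-64 ABI rule for arrays of at least
   16 bytes applies.  */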
29088 /* Compute the alignment for a local variable or a stack slot. EXP is
29089 the data type or decl itself, MODE is the widest mode available and
29090 ALIGN is the alignment that the object would ordinarily have. The
29091 value of this macro is used instead of that alignment to align the
29092 object. */
29094 unsigned int
29095 ix86_local_alignment (tree exp, machine_mode mode,
29096 unsigned int align)
29098 tree type, decl;
29100 if (exp && DECL_P (exp))
29102 type = TREE_TYPE (exp);
29103 decl = exp;
29105 else
29107 type = exp;
29108 decl = NULL;
29111 /* Don't do dynamic stack realignment for long long objects with
29112 -mpreferred-stack-boundary=2. */
29113 if (!TARGET_64BIT
29114 && align == 64
29115 && ix86_preferred_stack_boundary < 64
29116 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
29117 && (!type || !TYPE_USER_ALIGN (type))
29118 && (!decl || !DECL_USER_ALIGN (decl)))
29119 align = 32;
29121 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
29122 register in MODE. We will return the larger of the XF and DF
29123 alignments. */
29124 if (!type)
29126 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
29127 align = GET_MODE_ALIGNMENT (DFmode);
29128 return align;
29131 /* Don't increase alignment for Intel MCU psABI. */
29132 if (TARGET_IAMCU)
29133 return align;
29135 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
29136 to a 16-byte boundary. The exact wording is:
29138 An array uses the same alignment as its elements, except that a local or
29139 global array variable of length at least 16 bytes or
29140 a C99 variable-length array variable always has alignment of at least 16 bytes.
29142 This was added to allow the use of aligned SSE instructions on arrays. The
29143 rule is meant for static storage (where the compiler cannot do the analysis
29144 by itself). We follow it for automatic variables only when convenient.
29145 We fully control everything in the function being compiled, and functions
29146 from other units cannot rely on the alignment.
29148 Exclude the va_list type. It is the common case of a local array where
29149 we cannot benefit from the alignment.
29151 TODO: Probably one should optimize for size only when the variable does not escape. */
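/* For illustration only: under -m64 -msse, when optimizing for speed, a
   local "double buf[4]" (32 bytes, not a va_list) satisfies the aggregate
   check below and is therefore given 128-bit (16-byte) alignment, assuming
   its natural alignment was smaller.  */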
29152 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
29153 && TARGET_SSE)
29155 if (AGGREGATE_TYPE_P (type)
29156 && (va_list_type_node == NULL_TREE
29157 || (TYPE_MAIN_VARIANT (type)
29158 != TYPE_MAIN_VARIANT (va_list_type_node)))
29159 && TYPE_SIZE (type)
29160 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29161 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29162 && align < 128)
29163 return 128;
29165 if (TREE_CODE (type) == ARRAY_TYPE)
29167 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29168 return 64;
29169 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29170 return 128;
29172 else if (TREE_CODE (type) == COMPLEX_TYPE)
29174 if (TYPE_MODE (type) == DCmode && align < 64)
29175 return 64;
29176 if ((TYPE_MODE (type) == XCmode
29177 || TYPE_MODE (type) == TCmode) && align < 128)
29178 return 128;
29180 else if ((TREE_CODE (type) == RECORD_TYPE
29181 || TREE_CODE (type) == UNION_TYPE
29182 || TREE_CODE (type) == QUAL_UNION_TYPE)
29183 && TYPE_FIELDS (type))
29185 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29186 return 64;
29187 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29188 return 128;
29190 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29191 || TREE_CODE (type) == INTEGER_TYPE)
29194 if (TYPE_MODE (type) == DFmode && align < 64)
29195 return 64;
29196 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29197 return 128;
29199 return align;
29202 /* Compute the minimum required alignment for dynamic stack realignment
29203 purposes for a local variable, parameter or a stack slot. EXP is
29204 the data type or decl itself, MODE is its mode and ALIGN is the
29205 alignment that the object would ordinarily have. */
29207 unsigned int
29208 ix86_minimum_alignment (tree exp, machine_mode mode,
29209 unsigned int align)
29211 tree type, decl;
29213 if (exp && DECL_P (exp))
29215 type = TREE_TYPE (exp);
29216 decl = exp;
29218 else
29220 type = exp;
29221 decl = NULL;
29224 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
29225 return align;
29227 /* Don't do dynamic stack realignment for long long objects with
29228 -mpreferred-stack-boundary=2. */
29229 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
29230 && (!type || !TYPE_USER_ALIGN (type))
29231 && (!decl || !DECL_USER_ALIGN (decl)))
29233 gcc_checking_assert (!TARGET_STV);
29234 return 32;
29237 return align;
29240 /* Find a location for the static chain incoming to a nested function.
29241 This is a register, unless all free registers are used by arguments. */
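/* In short, as the cases below show: 64-bit code always uses R10; 32-bit
   code defaults to ECX, uses EAX for fastcall/thiscall functions, and for
   regparm-3 functions (no free call-clobbered register) keeps the chain on
   the stack, with ESI used at the alternate entry point.  */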
29243 static rtx
29244 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
29246 unsigned regno;
29248 if (TARGET_64BIT)
29250 /* We always use R10 in 64-bit mode. */
29251 regno = R10_REG;
29253 else
29255 const_tree fntype, fndecl;
29256 unsigned int ccvt;
29258 /* By default in 32-bit mode we use ECX to pass the static chain. */
29259 regno = CX_REG;
29261 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
29263 fntype = TREE_TYPE (fndecl_or_type);
29264 fndecl = fndecl_or_type;
29266 else
29268 fntype = fndecl_or_type;
29269 fndecl = NULL;
29272 ccvt = ix86_get_callcvt (fntype);
29273 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
29275 /* Fastcall functions use ecx/edx for arguments, which leaves
29276 us with EAX for the static chain.
29277 Thiscall functions use ecx for arguments, which also
29278 leaves us with EAX for the static chain. */
29279 regno = AX_REG;
29281 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
29283 /* Thiscall functions use ecx for arguments, which leaves
29284 us with EAX and EDX for the static chain.
29285 For ABI compatibility we use EAX. */
29286 regno = AX_REG;
29288 else if (ix86_function_regparm (fntype, fndecl) == 3)
29290 /* For regparm 3, we have no free call-clobbered registers in
29291 which to store the static chain. In order to implement this,
29292 we have the trampoline push the static chain to the stack.
29293 However, we can't push a value below the return address when
29294 we call the nested function directly, so we have to use an
29295 alternate entry point. For this we use ESI, and have the
29296 alternate entry point push ESI, so that things appear the
29297 same once we're executing the nested function. */
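/* Either way the chain value ends up in the word just below the return
   address, which is where the arg_pointer - 8 frame access below reads it
   from (4-byte return address plus the 4-byte push).  */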
29298 if (incoming_p)
29300 if (fndecl == current_function_decl
29301 && !ix86_static_chain_on_stack)
29303 gcc_assert (!reload_completed);
29304 ix86_static_chain_on_stack = true;
29306 return gen_frame_mem (SImode,
29307 plus_constant (Pmode,
29308 arg_pointer_rtx, -8));
29310 regno = SI_REG;
29314 return gen_rtx_REG (Pmode, regno);
29317 /* Emit RTL insns to initialize the variable parts of a trampoline.
29318 FNDECL is the decl of the target address; M_TRAMP is a MEM for
29319 the trampoline, and CHAIN_VALUE is an RTX for the static chain
29320 to be passed to the target function. */
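/* A rough sketch of the bytes emitted by the code below:

     64-bit:  49 bb <imm64>   movabs $fnaddr, %r11
              49 ba <imm64>   movabs $chain,  %r10
              49 ff e3 90     jmp *%r11; nop
              (the shorter movl forms 41 bb / 41 ba <imm32> are used
               when the code below decides they suffice)

     32-bit:  b8/b9 <imm32>   mov $chain, %eax/%ecx   (or 68 <imm32>, push,
                                                       when the chain goes
                                                       on the stack)
              e9 <rel32>      jmp to the target function

   This is only a summary; the exact encodings are chosen case by case
   below.  */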
29322 static void
29323 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
29325 rtx mem, fnaddr;
29326 int opcode;
29327 int offset = 0;
29329 fnaddr = XEXP (DECL_RTL (fndecl), 0);
29331 if (TARGET_64BIT)
29333 int size;
29335 /* Load the function address into r11. Try to load the address using
29336 the shorter movl instead of movabs. We may want to support
29337 movq for kernel mode, but the kernel does not use trampolines at
29338 the moment. FNADDR is a 32-bit address and may not be in
29339 DImode when ptr_mode == SImode. Always use movl in this
29340 case. */
29341 if (ptr_mode == SImode
29342 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
29344 fnaddr = copy_addr_to_reg (fnaddr);
29346 mem = adjust_address (m_tramp, HImode, offset);
29347 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
29349 mem = adjust_address (m_tramp, SImode, offset + 2);
29350 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
29351 offset += 6;
29353 else
29355 mem = adjust_address (m_tramp, HImode, offset);
29356 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
29358 mem = adjust_address (m_tramp, DImode, offset + 2);
29359 emit_move_insn (mem, fnaddr);
29360 offset += 10;
29363 /* Load the static chain into r10 using movabs. Use the shorter movl
29364 instead of movabs when ptr_mode == SImode. */
29365 if (ptr_mode == SImode)
29367 opcode = 0xba41;
29368 size = 6;
29370 else
29372 opcode = 0xba49;
29373 size = 10;
29376 mem = adjust_address (m_tramp, HImode, offset);
29377 emit_move_insn (mem, gen_int_mode (opcode, HImode));
29379 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
29380 emit_move_insn (mem, chain_value);
29381 offset += size;
29383 /* Jump to r11; the last (unused) byte is a nop, only there to
29384 pad the write out to a single 32-bit store. */
29385 mem = adjust_address (m_tramp, SImode, offset);
29386 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
29387 offset += 4;
29389 else
29391 rtx disp, chain;
29393 /* Depending on the static chain location, either load a register
29394 with a constant, or push the constant to the stack. All of the
29395 instructions are the same size. */
29396 chain = ix86_static_chain (fndecl, true);
29397 if (REG_P (chain))
29399 switch (REGNO (chain))
29401 case AX_REG:
29402 opcode = 0xb8; break;
29403 case CX_REG:
29404 opcode = 0xb9; break;
29405 default:
29406 gcc_unreachable ();
29409 else
29410 opcode = 0x68;
29412 mem = adjust_address (m_tramp, QImode, offset);
29413 emit_move_insn (mem, gen_int_mode (opcode, QImode));
29415 mem = adjust_address (m_tramp, SImode, offset + 1);
29416 emit_move_insn (mem, chain_value);
29417 offset += 5;
29419 mem = adjust_address (m_tramp, QImode, offset);
29420 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
29422 mem = adjust_address (m_tramp, SImode, offset + 1);
29424 /* Compute offset from the end of the jmp to the target function.
29425 In the case in which the trampoline stores the static chain on
29426 the stack, we need to skip the first insn which pushes the
29427 (call-saved) register static chain; this push is 1 byte. */
29428 offset += 5;
29429 disp = expand_binop (SImode, sub_optab, fnaddr,
29430 plus_constant (Pmode, XEXP (m_tramp, 0),
29431 offset - (MEM_P (chain) ? 1 : 0)),
29432 NULL_RTX, 1, OPTAB_DIRECT);
29433 emit_move_insn (mem, disp);
29436 gcc_assert (offset <= TRAMPOLINE_SIZE);
29438 #ifdef HAVE_ENABLE_EXECUTE_STACK
29439 #ifdef CHECK_EXECUTE_STACK_ENABLED
29440 if (CHECK_EXECUTE_STACK_ENABLED)
29441 #endif
29442 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
29443 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
29444 #endif
29447 static bool
29448 ix86_allocate_stack_slots_for_args (void)
29450 /* Naked functions should not allocate stack slots for arguments. */
29451 return !ix86_function_naked (current_function_decl);
29454 static bool
29455 ix86_warn_func_return (tree decl)
29457 /* Naked functions are implemented entirely in assembly, including the
29458 return sequence, so suppress warnings about this. */
29459 return !ix86_function_naked (decl);
29462 /* The following file contains several enumerations and data structures
29463 built from the definitions in i386-builtin-types.def. */
29465 #include "i386-builtin-types.inc"
29467 /* Table for the ix86 builtin non-function types. */
29468 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
29470 /* Retrieve an element from the above table, building some of
29471 the types lazily. */
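/* A short summary of the layout the index arithmetic below relies on:
   entries up to IX86_BT_LAST_PRIM are primitive types (expected to have
   been entered elsewhere), entries up to IX86_BT_LAST_VECT are vector
   types built from a base element type and a machine mode, and the
   remaining entries are pointer types -- unqualified up to
   IX86_BT_LAST_PTR, const-qualified after that.  */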
29473 static tree
29474 ix86_get_builtin_type (enum ix86_builtin_type tcode)
29476 unsigned int index;
29477 tree type, itype;
29479 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
29481 type = ix86_builtin_type_tab[(int) tcode];
29482 if (type != NULL)
29483 return type;
29485 gcc_assert (tcode > IX86_BT_LAST_PRIM);
29486 if (tcode <= IX86_BT_LAST_VECT)
29488 machine_mode mode;
29490 index = tcode - IX86_BT_LAST_PRIM - 1;
29491 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
29492 mode = ix86_builtin_type_vect_mode[index];
29494 type = build_vector_type_for_mode (itype, mode);
29496 else
29498 int quals;
29500 index = tcode - IX86_BT_LAST_VECT - 1;
29501 if (tcode <= IX86_BT_LAST_PTR)
29502 quals = TYPE_UNQUALIFIED;
29503 else
29504 quals = TYPE_QUAL_CONST;
29506 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
29507 if (quals != TYPE_UNQUALIFIED)
29508 itype = build_qualified_type (itype, quals);
29510 type = build_pointer_type (itype);
29513 ix86_builtin_type_tab[(int) tcode] = type;
29514 return type;
29517 /* Table for the ix86 builtin function types. */
29518 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
29520 /* Retrieve an element from the above table, building some of
29521 the types lazily. */
29523 static tree
29524 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
29526 tree type;
29528 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
29530 type = ix86_builtin_func_type_tab[(int) tcode];
29531 if (type != NULL)
29532 return type;
29534 if (tcode <= IX86_BT_LAST_FUNC)
29536 unsigned start = ix86_builtin_func_start[(int) tcode];
29537 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
29538 tree rtype, atype, args = void_list_node;
29539 unsigned i;
29541 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
29542 for (i = after - 1; i > start; --i)
29544 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
29545 args = tree_cons (NULL, atype, args);
29548 type = build_function_type (rtype, args);
29550 else
29552 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
29553 enum ix86_builtin_func_type icode;
29555 icode = ix86_builtin_func_alias_base[index];
29556 type = ix86_get_builtin_func_type (icode);
29559 ix86_builtin_func_type_tab[(int) tcode] = type;
29560 return type;
29564 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
29565 bdesc_* arrays below should come first, then builtins for each bdesc_*
29566 array in ascending order, so that we can use direct array accesses. */
29567 enum ix86_builtins
29569 IX86_BUILTIN_MASKMOVQ,
29570 IX86_BUILTIN_LDMXCSR,
29571 IX86_BUILTIN_STMXCSR,
29572 IX86_BUILTIN_MASKMOVDQU,
29573 IX86_BUILTIN_PSLLDQ128,
29574 IX86_BUILTIN_CLFLUSH,
29575 IX86_BUILTIN_MONITOR,
29576 IX86_BUILTIN_MWAIT,
29577 IX86_BUILTIN_CLZERO,
29578 IX86_BUILTIN_VEC_INIT_V2SI,
29579 IX86_BUILTIN_VEC_INIT_V4HI,
29580 IX86_BUILTIN_VEC_INIT_V8QI,
29581 IX86_BUILTIN_VEC_EXT_V2DF,
29582 IX86_BUILTIN_VEC_EXT_V2DI,
29583 IX86_BUILTIN_VEC_EXT_V4SF,
29584 IX86_BUILTIN_VEC_EXT_V4SI,
29585 IX86_BUILTIN_VEC_EXT_V8HI,
29586 IX86_BUILTIN_VEC_EXT_V2SI,
29587 IX86_BUILTIN_VEC_EXT_V4HI,
29588 IX86_BUILTIN_VEC_EXT_V16QI,
29589 IX86_BUILTIN_VEC_SET_V2DI,
29590 IX86_BUILTIN_VEC_SET_V4SF,
29591 IX86_BUILTIN_VEC_SET_V4SI,
29592 IX86_BUILTIN_VEC_SET_V8HI,
29593 IX86_BUILTIN_VEC_SET_V4HI,
29594 IX86_BUILTIN_VEC_SET_V16QI,
29595 IX86_BUILTIN_GATHERSIV2DF,
29596 IX86_BUILTIN_GATHERSIV4DF,
29597 IX86_BUILTIN_GATHERDIV2DF,
29598 IX86_BUILTIN_GATHERDIV4DF,
29599 IX86_BUILTIN_GATHERSIV4SF,
29600 IX86_BUILTIN_GATHERSIV8SF,
29601 IX86_BUILTIN_GATHERDIV4SF,
29602 IX86_BUILTIN_GATHERDIV8SF,
29603 IX86_BUILTIN_GATHERSIV2DI,
29604 IX86_BUILTIN_GATHERSIV4DI,
29605 IX86_BUILTIN_GATHERDIV2DI,
29606 IX86_BUILTIN_GATHERDIV4DI,
29607 IX86_BUILTIN_GATHERSIV4SI,
29608 IX86_BUILTIN_GATHERSIV8SI,
29609 IX86_BUILTIN_GATHERDIV4SI,
29610 IX86_BUILTIN_GATHERDIV8SI,
29611 IX86_BUILTIN_VFMSUBSD3_MASK3,
29612 IX86_BUILTIN_VFMSUBSS3_MASK3,
29613 IX86_BUILTIN_GATHER3SIV8SF,
29614 IX86_BUILTIN_GATHER3SIV4SF,
29615 IX86_BUILTIN_GATHER3SIV4DF,
29616 IX86_BUILTIN_GATHER3SIV2DF,
29617 IX86_BUILTIN_GATHER3DIV8SF,
29618 IX86_BUILTIN_GATHER3DIV4SF,
29619 IX86_BUILTIN_GATHER3DIV4DF,
29620 IX86_BUILTIN_GATHER3DIV2DF,
29621 IX86_BUILTIN_GATHER3SIV8SI,
29622 IX86_BUILTIN_GATHER3SIV4SI,
29623 IX86_BUILTIN_GATHER3SIV4DI,
29624 IX86_BUILTIN_GATHER3SIV2DI,
29625 IX86_BUILTIN_GATHER3DIV8SI,
29626 IX86_BUILTIN_GATHER3DIV4SI,
29627 IX86_BUILTIN_GATHER3DIV4DI,
29628 IX86_BUILTIN_GATHER3DIV2DI,
29629 IX86_BUILTIN_SCATTERSIV8SF,
29630 IX86_BUILTIN_SCATTERSIV4SF,
29631 IX86_BUILTIN_SCATTERSIV4DF,
29632 IX86_BUILTIN_SCATTERSIV2DF,
29633 IX86_BUILTIN_SCATTERDIV8SF,
29634 IX86_BUILTIN_SCATTERDIV4SF,
29635 IX86_BUILTIN_SCATTERDIV4DF,
29636 IX86_BUILTIN_SCATTERDIV2DF,
29637 IX86_BUILTIN_SCATTERSIV8SI,
29638 IX86_BUILTIN_SCATTERSIV4SI,
29639 IX86_BUILTIN_SCATTERSIV4DI,
29640 IX86_BUILTIN_SCATTERSIV2DI,
29641 IX86_BUILTIN_SCATTERDIV8SI,
29642 IX86_BUILTIN_SCATTERDIV4SI,
29643 IX86_BUILTIN_SCATTERDIV4DI,
29644 IX86_BUILTIN_SCATTERDIV2DI,
29645 /* Alternate 4 and 8 element gather/scatter for the vectorizer
29646 where all operands are 32-byte or 64-byte wide respectively. */
29647 IX86_BUILTIN_GATHERALTSIV4DF,
29648 IX86_BUILTIN_GATHERALTDIV8SF,
29649 IX86_BUILTIN_GATHERALTSIV4DI,
29650 IX86_BUILTIN_GATHERALTDIV8SI,
29651 IX86_BUILTIN_GATHER3ALTDIV16SF,
29652 IX86_BUILTIN_GATHER3ALTDIV16SI,
29653 IX86_BUILTIN_GATHER3ALTSIV4DF,
29654 IX86_BUILTIN_GATHER3ALTDIV8SF,
29655 IX86_BUILTIN_GATHER3ALTSIV4DI,
29656 IX86_BUILTIN_GATHER3ALTDIV8SI,
29657 IX86_BUILTIN_GATHER3ALTSIV8DF,
29658 IX86_BUILTIN_GATHER3ALTSIV8DI,
29659 IX86_BUILTIN_GATHER3DIV16SF,
29660 IX86_BUILTIN_GATHER3DIV16SI,
29661 IX86_BUILTIN_GATHER3DIV8DF,
29662 IX86_BUILTIN_GATHER3DIV8DI,
29663 IX86_BUILTIN_GATHER3SIV16SF,
29664 IX86_BUILTIN_GATHER3SIV16SI,
29665 IX86_BUILTIN_GATHER3SIV8DF,
29666 IX86_BUILTIN_GATHER3SIV8DI,
29667 IX86_BUILTIN_SCATTERALTSIV8DF,
29668 IX86_BUILTIN_SCATTERALTDIV16SF,
29669 IX86_BUILTIN_SCATTERALTSIV8DI,
29670 IX86_BUILTIN_SCATTERALTDIV16SI,
29671 IX86_BUILTIN_SCATTERDIV16SF,
29672 IX86_BUILTIN_SCATTERDIV16SI,
29673 IX86_BUILTIN_SCATTERDIV8DF,
29674 IX86_BUILTIN_SCATTERDIV8DI,
29675 IX86_BUILTIN_SCATTERSIV16SF,
29676 IX86_BUILTIN_SCATTERSIV16SI,
29677 IX86_BUILTIN_SCATTERSIV8DF,
29678 IX86_BUILTIN_SCATTERSIV8DI,
29679 IX86_BUILTIN_GATHERPFQPD,
29680 IX86_BUILTIN_GATHERPFDPS,
29681 IX86_BUILTIN_GATHERPFDPD,
29682 IX86_BUILTIN_GATHERPFQPS,
29683 IX86_BUILTIN_SCATTERPFDPD,
29684 IX86_BUILTIN_SCATTERPFDPS,
29685 IX86_BUILTIN_SCATTERPFQPD,
29686 IX86_BUILTIN_SCATTERPFQPS,
29687 IX86_BUILTIN_CLWB,
29688 IX86_BUILTIN_CLFLUSHOPT,
29689 IX86_BUILTIN_INFQ,
29690 IX86_BUILTIN_HUGE_VALQ,
29691 IX86_BUILTIN_NANQ,
29692 IX86_BUILTIN_NANSQ,
29693 IX86_BUILTIN_XABORT,
29694 IX86_BUILTIN_ADDCARRYX32,
29695 IX86_BUILTIN_ADDCARRYX64,
29696 IX86_BUILTIN_SBB32,
29697 IX86_BUILTIN_SBB64,
29698 IX86_BUILTIN_RDRAND16_STEP,
29699 IX86_BUILTIN_RDRAND32_STEP,
29700 IX86_BUILTIN_RDRAND64_STEP,
29701 IX86_BUILTIN_RDSEED16_STEP,
29702 IX86_BUILTIN_RDSEED32_STEP,
29703 IX86_BUILTIN_RDSEED64_STEP,
29704 IX86_BUILTIN_MONITORX,
29705 IX86_BUILTIN_MWAITX,
29706 IX86_BUILTIN_CFSTRING,
29707 IX86_BUILTIN_CPU_INIT,
29708 IX86_BUILTIN_CPU_IS,
29709 IX86_BUILTIN_CPU_SUPPORTS,
29710 IX86_BUILTIN_READ_FLAGS,
29711 IX86_BUILTIN_WRITE_FLAGS,
29713 /* All the remaining builtins are tracked in bdesc_* arrays in
29714 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
29715 this point. */
29716 #define BDESC(mask, icode, name, code, comparison, flag) \
29717 code,
29718 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
29719 code, \
29720 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
29721 #define BDESC_END(kind, next_kind)
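/* For example, a BDESC_FIRST (args, ARGS, ...) entry in i386-builtin.def
   expands here to its CODE enumerator followed by
   IX86_BUILTIN__BDESC_ARGS_FIRST = CODE, while a plain BDESC entry expands
   to just its CODE enumerator.  */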
29723 #include "i386-builtin.def"
29725 #undef BDESC
29726 #undef BDESC_FIRST
29727 #undef BDESC_END
29729 IX86_BUILTIN_MAX,
29731 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
29733 /* Now just the aliases for bdesc_* start/end. */
29734 #define BDESC(mask, icode, name, code, comparison, flag)
29735 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
29736 #define BDESC_END(kind, next_kind) \
29737 IX86_BUILTIN__BDESC_##kind##_LAST \
29738 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
29740 #include "i386-builtin.def"
29742 #undef BDESC
29743 #undef BDESC_FIRST
29744 #undef BDESC_END
29746 /* Just to make sure there is no comma after the last enumerator. */
29747 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
29750 /* Table for the ix86 builtin decls. */
29751 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
29753 /* Table of all the builtin functions that are possible with different ISAs
29754 but are waiting to be built until a function is declared to use that
29755 ISA. */
29756 struct builtin_isa {
29757 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
29758 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
29759 const char *name; /* function name */
29760 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
29761 unsigned char const_p:1; /* true if the declaration is constant */
29762 unsigned char pure_p:1; /* true if the declaration has pure attribute */
29763 bool leaf_p; /* true if the declaration has leaf attribute */
29764 bool nothrow_p; /* true if the declaration has nothrow attribute */
29765 bool set_and_not_built_p; /* true if the builtin was recorded here but its decl has not been built yet */
29768 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
29770 /* Bits that can still enable the inclusion of a deferred builtin. */
29771 static HOST_WIDE_INT deferred_isa_values = 0;
29772 static HOST_WIDE_INT deferred_isa_values2 = 0;
29774 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save MASK,
29775 the isa_flags this builtin is defined for, in the ix86_builtins_isa array.
29776 Store the function decl in the ix86_builtins array. Return the function
29777 decl, or NULL_TREE if the builtin was not added.
29779 If the front end has a special hook for builtin functions, delay adding
29780 builtin functions that aren't in the current ISA until the ISA is changed
29781 with function specific optimization. Doing so can save about 300K for the
29782 default compiler. When the builtin is expanded, check at that time whether
29783 it is valid.
29785 If the front end doesn't have a special hook, record all builtins, even if
29786 their instruction set is not in the current ISA, in case the user uses
29787 function specific options for a different ISA; that way we don't get scope
29788 errors if a builtin is added in the middle of a function scope. */
29790 static inline tree
29791 def_builtin (HOST_WIDE_INT mask, const char *name,
29792 enum ix86_builtin_func_type tcode,
29793 enum ix86_builtins code)
29795 tree decl = NULL_TREE;
29797 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
29799 ix86_builtins_isa[(int) code].isa = mask;
29801 /* OPTION_MASK_ISA_AVX512{F,VL,BW} have a special meaning. Unlike the
29802 generic case, where any set bit means that the built-in is enabled, these
29803 bits must be *and-ed* with another one. E.g.:
29804 OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
29805 means that *both* cpuid bits must be set for the built-in to
29806 be available. Handle this here. */
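/* For example, assuming -mavx512vl is enabled in ix86_isa_flags: for a
   builtin whose MASK is OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL,
   the first test below strips the VL bit, so availability is then decided
   by the remaining DQ bit alone.  */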
29807 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
29808 && mask != OPTION_MASK_ISA_AVX512VL)
29809 mask &= ~OPTION_MASK_ISA_AVX512VL;
29810 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
29811 && mask != OPTION_MASK_ISA_AVX512BW)
29812 mask &= ~OPTION_MASK_ISA_AVX512BW;
29813 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512F)
29814 && mask != OPTION_MASK_ISA_AVX512F)
29815 mask &= ~OPTION_MASK_ISA_AVX512F;
29817 mask &= ~OPTION_MASK_ISA_64BIT;
29818 if (mask == 0
29819 || (mask & ix86_isa_flags) != 0
29820 || (lang_hooks.builtin_function
29821 == lang_hooks.builtin_function_ext_scope))
29824 tree type = ix86_get_builtin_func_type (tcode);
29825 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29826 NULL, NULL_TREE);
29827 ix86_builtins[(int) code] = decl;
29828 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29830 else
29832 /* Only a MASK recorded with set_and_not_built_p == true can potentially
29833 still enable this builtin later, so remember it. */
29834 deferred_isa_values |= mask;
29835 ix86_builtins[(int) code] = NULL_TREE;
29836 ix86_builtins_isa[(int) code].tcode = tcode;
29837 ix86_builtins_isa[(int) code].name = name;
29838 ix86_builtins_isa[(int) code].leaf_p = false;
29839 ix86_builtins_isa[(int) code].nothrow_p = false;
29840 ix86_builtins_isa[(int) code].const_p = false;
29841 ix86_builtins_isa[(int) code].pure_p = false;
29842 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29846 return decl;
29849 /* Like def_builtin, but also marks the function decl "const". */
29851 static inline tree
29852 def_builtin_const (HOST_WIDE_INT mask, const char *name,
29853 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29855 tree decl = def_builtin (mask, name, tcode, code);
29856 if (decl)
29857 TREE_READONLY (decl) = 1;
29858 else
29859 ix86_builtins_isa[(int) code].const_p = true;
29861 return decl;
29864 /* Like def_builtin, but also marks the function decl "pure". */
29866 static inline tree
29867 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
29868 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29870 tree decl = def_builtin (mask, name, tcode, code);
29871 if (decl)
29872 DECL_PURE_P (decl) = 1;
29873 else
29874 ix86_builtins_isa[(int) code].pure_p = true;
29876 return decl;
29879 /* Like def_builtin, but for additional isa2 flags. */
29881 static inline tree
29882 def_builtin2 (HOST_WIDE_INT mask, const char *name,
29883 enum ix86_builtin_func_type tcode,
29884 enum ix86_builtins code)
29886 tree decl = NULL_TREE;
29888 ix86_builtins_isa[(int) code].isa2 = mask;
29890 if (mask == 0
29891 || (mask & ix86_isa_flags2) != 0
29892 || (lang_hooks.builtin_function
29893 == lang_hooks.builtin_function_ext_scope))
29896 tree type = ix86_get_builtin_func_type (tcode);
29897 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29898 NULL, NULL_TREE);
29899 ix86_builtins[(int) code] = decl;
29900 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29902 else
29904 /* Only a MASK recorded with set_and_not_built_p == true can potentially
29905 still enable this builtin later, so remember it. */
29906 deferred_isa_values2 |= mask;
29907 ix86_builtins[(int) code] = NULL_TREE;
29908 ix86_builtins_isa[(int) code].tcode = tcode;
29909 ix86_builtins_isa[(int) code].name = name;
29910 ix86_builtins_isa[(int) code].leaf_p = false;
29911 ix86_builtins_isa[(int) code].nothrow_p = false;
29912 ix86_builtins_isa[(int) code].const_p = false;
29913 ix86_builtins_isa[(int) code].pure_p = false;
29914 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29917 return decl;
29920 /* Like def_builtin, but also marks the function decl "const". */
29922 static inline tree
29923 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
29924 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29926 tree decl = def_builtin2 (mask, name, tcode, code);
29927 if (decl)
29928 TREE_READONLY (decl) = 1;
29929 else
29930 ix86_builtins_isa[(int) code].const_p = true;
29932 return decl;
29935 /* Like def_builtin, but also marks the function decl "pure". */
29937 static inline tree
29938 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
29939 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29941 tree decl = def_builtin2 (mask, name, tcode, code);
29942 if (decl)
29943 DECL_PURE_P (decl) = 1;
29944 else
29945 ix86_builtins_isa[(int) code].pure_p = true;
29947 return decl;
29950 /* Add any new builtin functions for a given ISA that may not have been
29951 declared yet. This saves a bit of space compared to adding all of the
29952 declarations to the tree up front, whether or not they are used. */
29954 static void
29955 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
29957 if ((isa & deferred_isa_values) == 0
29958 && (isa2 & deferred_isa_values2) == 0)
29959 return;
29961 /* The bits in ISA and ISA2 can now be removed from the deferred (potential) isa values. */
29962 deferred_isa_values &= ~isa;
29963 deferred_isa_values2 &= ~isa2;
29965 int i;
29966 tree saved_current_target_pragma = current_target_pragma;
29967 current_target_pragma = NULL_TREE;
29969 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
29971 if (((ix86_builtins_isa[i].isa & isa) != 0
29972 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
29973 && ix86_builtins_isa[i].set_and_not_built_p)
29975 tree decl, type;
29977 /* Don't define the builtin again. */
29978 ix86_builtins_isa[i].set_and_not_built_p = false;
29980 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
29981 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
29982 type, i, BUILT_IN_MD, NULL,
29983 NULL_TREE);
29985 ix86_builtins[i] = decl;
29986 if (ix86_builtins_isa[i].const_p)
29987 TREE_READONLY (decl) = 1;
29988 if (ix86_builtins_isa[i].pure_p)
29989 DECL_PURE_P (decl) = 1;
29990 if (ix86_builtins_isa[i].leaf_p)
29991 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
29992 NULL_TREE);
29993 if (ix86_builtins_isa[i].nothrow_p)
29994 TREE_NOTHROW (decl) = 1;
29998 current_target_pragma = saved_current_target_pragma;
30001 /* Bits for builtin_description.flag. */
30003 /* Set when we don't support the comparison natively, and should
30004 swap the comparison operands in order to support it. */
30005 #define BUILTIN_DESC_SWAP_OPERANDS 1
30007 struct builtin_description
30009 const HOST_WIDE_INT mask;
30010 const enum insn_code icode;
30011 const char *const name;
30012 const enum ix86_builtins code;
30013 const enum rtx_code comparison;
30014 const int flag;
30017 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30018 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30019 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30020 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30021 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30022 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30023 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30024 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30025 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30026 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30027 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30028 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30029 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30030 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30031 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30032 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30033 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30034 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30035 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30036 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30037 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30038 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30039 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30040 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30041 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30042 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30043 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30044 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30045 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30046 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30047 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30048 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30049 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30050 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30051 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30052 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30053 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
30054 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
30055 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
30056 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
30057 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
30058 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
30059 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
30060 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
30061 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
30062 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
30063 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
30064 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
30065 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
30066 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
30067 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
30068 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
30070 #define BDESC(mask, icode, name, code, comparison, flag) \
30071 { mask, icode, name, code, comparison, flag },
30072 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30073 static const struct builtin_description bdesc_##kind[] = \
30075 BDESC (mask, icode, name, code, comparison, flag)
30076 #define BDESC_END(kind, next_kind) \
30079 #include "i386-builtin.def"
30081 #undef BDESC
30082 #undef BDESC_FIRST
30083 #undef BDESC_END
30085 /* TM vector builtins. */
30087 /* Reuse the existing x86-specific `struct builtin_description' because
30088 we're lazy. Add casts to make them fit. */
30089 static const struct builtin_description bdesc_tm[] =
30091 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30092 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30093 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30094 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30095 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30096 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30097 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30099 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30100 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30101 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30102 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30103 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30104 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30105 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30107 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30108 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30109 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30110 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30111 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30112 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30113 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30115 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
30116 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
30117 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
30120 /* Initialize the transactional memory vector load/store builtins. */
30122 static void
30123 ix86_init_tm_builtins (void)
30125 enum ix86_builtin_func_type ftype;
30126 const struct builtin_description *d;
30127 size_t i;
30128 tree decl;
30129 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
30130 tree attrs_log, attrs_type_log;
30132 if (!flag_tm)
30133 return;
30135 /* If there are no builtins defined, we must be compiling in a
30136 language without trans-mem support. */
30137 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
30138 return;
30140 /* Use whatever attributes a normal TM load has. */
30141 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
30142 attrs_load = DECL_ATTRIBUTES (decl);
30143 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30144 /* Use whatever attributes a normal TM store has. */
30145 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
30146 attrs_store = DECL_ATTRIBUTES (decl);
30147 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30148 /* Use whatever attributes a normal TM log has. */
30149 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
30150 attrs_log = DECL_ATTRIBUTES (decl);
30151 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30153 for (i = 0, d = bdesc_tm;
30154 i < ARRAY_SIZE (bdesc_tm);
30155 i++, d++)
30157 if ((d->mask & ix86_isa_flags) != 0
30158 || (lang_hooks.builtin_function
30159 == lang_hooks.builtin_function_ext_scope))
30161 tree type, attrs, attrs_type;
30162 enum built_in_function code = (enum built_in_function) d->code;
30164 ftype = (enum ix86_builtin_func_type) d->flag;
30165 type = ix86_get_builtin_func_type (ftype);
30167 if (BUILTIN_TM_LOAD_P (code))
30169 attrs = attrs_load;
30170 attrs_type = attrs_type_load;
30172 else if (BUILTIN_TM_STORE_P (code))
30174 attrs = attrs_store;
30175 attrs_type = attrs_type_store;
30177 else
30179 attrs = attrs_log;
30180 attrs_type = attrs_type_log;
30182 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
30183 /* The builtin without the prefix for
30184 calling it directly. */
30185 d->name + strlen ("__builtin_"),
30186 attrs);
30187 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
30188 set the TYPE_ATTRIBUTES. */
30189 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
30191 set_builtin_decl (code, decl, false);
30196 /* Macros for verification of enum ix86_builtins order. */
30197 #define BDESC_VERIFY(x, y, z) \
30198 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
30199 #define BDESC_VERIFYS(x, y, z) \
30200 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
30202 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30203 IX86_BUILTIN__BDESC_COMI_LAST, 1);
30204 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30205 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
30206 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30207 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
30208 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
30209 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
30210 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30211 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
30212 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
30213 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
30214 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
30215 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
30216 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30217 IX86_BUILTIN__BDESC_MPX_LAST, 1);
30218 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30219 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
30220 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
30221 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
30222 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30223 IX86_BUILTIN__BDESC_CET_LAST, 1);
30224 BDESC_VERIFYS (IX86_BUILTIN_MAX,
30225 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
30227 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
30228 in the current target ISA, to allow the user to compile particular modules
30229 with target specific options that differ from the command line
30230 options. */
30231 static void
30232 ix86_init_mmx_sse_builtins (void)
30234 const struct builtin_description * d;
30235 enum ix86_builtin_func_type ftype;
30236 size_t i;
30238 /* Add all special builtins with variable number of operands. */
30239 for (i = 0, d = bdesc_special_args;
30240 i < ARRAY_SIZE (bdesc_special_args);
30241 i++, d++)
30243 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
30244 if (d->name == 0)
30245 continue;
30247 ftype = (enum ix86_builtin_func_type) d->flag;
30248 def_builtin (d->mask, d->name, ftype, d->code);
30250 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
30251 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30252 ARRAY_SIZE (bdesc_special_args) - 1);
30254 /* Add all builtins with variable number of operands. */
30255 for (i = 0, d = bdesc_args;
30256 i < ARRAY_SIZE (bdesc_args);
30257 i++, d++)
30259 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
30260 if (d->name == 0)
30261 continue;
30263 ftype = (enum ix86_builtin_func_type) d->flag;
30264 def_builtin_const (d->mask, d->name, ftype, d->code);
30266 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
30267 IX86_BUILTIN__BDESC_ARGS_FIRST,
30268 ARRAY_SIZE (bdesc_args) - 1);
30270 /* Add all builtins with variable number of operands. */
30271 for (i = 0, d = bdesc_args2;
30272 i < ARRAY_SIZE (bdesc_args2);
30273 i++, d++)
30275 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
30276 if (d->name == 0)
30277 continue;
30279 ftype = (enum ix86_builtin_func_type) d->flag;
30280 def_builtin_const2 (d->mask, d->name, ftype, d->code);
30282 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
30283 IX86_BUILTIN__BDESC_ARGS2_FIRST,
30284 ARRAY_SIZE (bdesc_args2) - 1);
30286 /* Add all builtins with rounding. */
30287 for (i = 0, d = bdesc_round_args;
30288 i < ARRAY_SIZE (bdesc_round_args);
30289 i++, d++)
30291 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
30292 if (d->name == 0)
30293 continue;
30295 ftype = (enum ix86_builtin_func_type) d->flag;
30296 def_builtin_const (d->mask, d->name, ftype, d->code);
30298 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
30299 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30300 ARRAY_SIZE (bdesc_round_args) - 1);
30302 /* pcmpestr[im] insns. */
30303 for (i = 0, d = bdesc_pcmpestr;
30304 i < ARRAY_SIZE (bdesc_pcmpestr);
30305 i++, d++)
30307 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
30308 if (d->code == IX86_BUILTIN_PCMPESTRM128)
30309 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
30310 else
30311 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
30312 def_builtin_const (d->mask, d->name, ftype, d->code);
30314 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
30315 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30316 ARRAY_SIZE (bdesc_pcmpestr) - 1);
30318 /* pcmpistr[im] insns. */
30319 for (i = 0, d = bdesc_pcmpistr;
30320 i < ARRAY_SIZE (bdesc_pcmpistr);
30321 i++, d++)
30323 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
30324 if (d->code == IX86_BUILTIN_PCMPISTRM128)
30325 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
30326 else
30327 ftype = INT_FTYPE_V16QI_V16QI_INT;
30328 def_builtin_const (d->mask, d->name, ftype, d->code);
30330 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
30331 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30332 ARRAY_SIZE (bdesc_pcmpistr) - 1);
30334 /* comi/ucomi insns. */
30335 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30337 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
30338 if (d->mask == OPTION_MASK_ISA_SSE2)
30339 ftype = INT_FTYPE_V2DF_V2DF;
30340 else
30341 ftype = INT_FTYPE_V4SF_V4SF;
30342 def_builtin_const (d->mask, d->name, ftype, d->code);
30344 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
30345 IX86_BUILTIN__BDESC_COMI_FIRST,
30346 ARRAY_SIZE (bdesc_comi) - 1);
30348 /* SSE */
30349 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
30350 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
30351 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
30352 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
30354 /* SSE or 3DNow!A */
30355 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30356 /* As it uses V4HImode, we have to require -mmmx too. */
30357 | OPTION_MASK_ISA_MMX,
30358 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
30359 IX86_BUILTIN_MASKMOVQ);
30361 /* SSE2 */
30362 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
30363 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
30365 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
30366 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
30367 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
30368 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
30370 /* SSE3. */
30371 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
30372 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
30373 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
30374 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
30376 /* AES */
30377 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
30378 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
30379 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
30380 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
30381 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
30382 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
30383 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
30384 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
30385 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
30386 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
30387 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
30388 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
30390 /* PCLMUL */
30391 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
30392 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
30394 /* RDRND */
30395 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
30396 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
30397 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
30398 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
30399 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
30400 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
30401 IX86_BUILTIN_RDRAND64_STEP);
30403 /* AVX2 */
30404 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
30405 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
30406 IX86_BUILTIN_GATHERSIV2DF);
30408 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
30409 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
30410 IX86_BUILTIN_GATHERSIV4DF);
30412 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
30413 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
30414 IX86_BUILTIN_GATHERDIV2DF);
30416 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
30417 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
30418 IX86_BUILTIN_GATHERDIV4DF);
30420 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
30421 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
30422 IX86_BUILTIN_GATHERSIV4SF);
30424 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
30425 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
30426 IX86_BUILTIN_GATHERSIV8SF);
30428 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
30429 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
30430 IX86_BUILTIN_GATHERDIV4SF);
30432 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
30433 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
30434 IX86_BUILTIN_GATHERDIV8SF);
30436 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
30437 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
30438 IX86_BUILTIN_GATHERSIV2DI);
30440 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
30441 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
30442 IX86_BUILTIN_GATHERSIV4DI);
30444 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
30445 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
30446 IX86_BUILTIN_GATHERDIV2DI);
30448 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
30449 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
30450 IX86_BUILTIN_GATHERDIV4DI);
30452 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
30453 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
30454 IX86_BUILTIN_GATHERSIV4SI);
30456 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
30457 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
30458 IX86_BUILTIN_GATHERSIV8SI);
30460 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
30461 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
30462 IX86_BUILTIN_GATHERDIV4SI);
30464 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
30465 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
30466 IX86_BUILTIN_GATHERDIV8SI);
30468 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
30469 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
30470 IX86_BUILTIN_GATHERALTSIV4DF);
30472 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
30473 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
30474 IX86_BUILTIN_GATHERALTDIV8SF);
30476 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
30477 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
30478 IX86_BUILTIN_GATHERALTSIV4DI);
30480 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
30481 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
30482 IX86_BUILTIN_GATHERALTDIV8SI);
30484 /* AVX512F */
30485 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
30486 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
30487 IX86_BUILTIN_GATHER3SIV16SF);
30489 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
30490 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
30491 IX86_BUILTIN_GATHER3SIV8DF);
30493 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
30494 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
30495 IX86_BUILTIN_GATHER3DIV16SF);
30497 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
30498 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
30499 IX86_BUILTIN_GATHER3DIV8DF);
30501 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
30502 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
30503 IX86_BUILTIN_GATHER3SIV16SI);
30505 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
30506 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
30507 IX86_BUILTIN_GATHER3SIV8DI);
30509 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
30510 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
30511 IX86_BUILTIN_GATHER3DIV16SI);
30513 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
30514 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
30515 IX86_BUILTIN_GATHER3DIV8DI);
30517 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
30518 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
30519 IX86_BUILTIN_GATHER3ALTSIV8DF);
30521 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
30522 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
30523 IX86_BUILTIN_GATHER3ALTDIV16SF);
30525 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
30526 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
30527 IX86_BUILTIN_GATHER3ALTSIV8DI);
30529 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
30530 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
30531 IX86_BUILTIN_GATHER3ALTDIV16SI);
30533 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
30534 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
30535 IX86_BUILTIN_SCATTERSIV16SF);
30537 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
30538 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
30539 IX86_BUILTIN_SCATTERSIV8DF);
30541 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
30542 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
30543 IX86_BUILTIN_SCATTERDIV16SF);
30545 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
30546 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
30547 IX86_BUILTIN_SCATTERDIV8DF);
30549 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
30550 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
30551 IX86_BUILTIN_SCATTERSIV16SI);
30553 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
30554 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
30555 IX86_BUILTIN_SCATTERSIV8DI);
30557 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
30558 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
30559 IX86_BUILTIN_SCATTERDIV16SI);
30561 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
30562 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
30563 IX86_BUILTIN_SCATTERDIV8DI);
30565 /* AVX512VL */
30566 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
30567 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
30568 IX86_BUILTIN_GATHER3SIV2DF);
30570 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
30571 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
30572 IX86_BUILTIN_GATHER3SIV4DF);
30574 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
30575 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
30576 IX86_BUILTIN_GATHER3DIV2DF);
30578 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
30579 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
30580 IX86_BUILTIN_GATHER3DIV4DF);
30582 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
30583 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
30584 IX86_BUILTIN_GATHER3SIV4SF);
30586 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
30587 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
30588 IX86_BUILTIN_GATHER3SIV8SF);
30590 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
30591 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
30592 IX86_BUILTIN_GATHER3DIV4SF);
30594 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
30595 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
30596 IX86_BUILTIN_GATHER3DIV8SF);
30598 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
30599 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
30600 IX86_BUILTIN_GATHER3SIV2DI);
30602 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
30603 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
30604 IX86_BUILTIN_GATHER3SIV4DI);
30606 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
30607 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
30608 IX86_BUILTIN_GATHER3DIV2DI);
30610 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
30611 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
30612 IX86_BUILTIN_GATHER3DIV4DI);
30614 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
30615 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
30616 IX86_BUILTIN_GATHER3SIV4SI);
30618 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
30619 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
30620 IX86_BUILTIN_GATHER3SIV8SI);
30622 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
30623 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
30624 IX86_BUILTIN_GATHER3DIV4SI);
30626 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
30627 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
30628 IX86_BUILTIN_GATHER3DIV8SI);
30630 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
30631 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
30632 IX86_BUILTIN_GATHER3ALTSIV4DF);
30634 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
30635 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
30636 IX86_BUILTIN_GATHER3ALTDIV8SF);
30638 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
30639 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
30640 IX86_BUILTIN_GATHER3ALTSIV4DI);
30642 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
30643 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
30644 IX86_BUILTIN_GATHER3ALTDIV8SI);
30646 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
30647 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
30648 IX86_BUILTIN_SCATTERSIV8SF);
30650 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
30651 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
30652 IX86_BUILTIN_SCATTERSIV4SF);
30654 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
30655 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
30656 IX86_BUILTIN_SCATTERSIV4DF);
30658 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
30659 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
30660 IX86_BUILTIN_SCATTERSIV2DF);
30662 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
30663 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
30664 IX86_BUILTIN_SCATTERDIV8SF);
30666 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
30667 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
30668 IX86_BUILTIN_SCATTERDIV4SF);
30670 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
30671 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
30672 IX86_BUILTIN_SCATTERDIV4DF);
30674 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
30675 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
30676 IX86_BUILTIN_SCATTERDIV2DF);
30678 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
30679 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
30680 IX86_BUILTIN_SCATTERSIV8SI);
30682 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
30683 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
30684 IX86_BUILTIN_SCATTERSIV4SI);
30686 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
30687 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
30688 IX86_BUILTIN_SCATTERSIV4DI);
30690 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
30691 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
30692 IX86_BUILTIN_SCATTERSIV2DI);
30694 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
30695 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
30696 IX86_BUILTIN_SCATTERDIV8SI);
30698 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
30699 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
30700 IX86_BUILTIN_SCATTERDIV4SI);
30702 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
30703 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
30704 IX86_BUILTIN_SCATTERDIV4DI);
30706 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
30707 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
30708 IX86_BUILTIN_SCATTERDIV2DI);
30709 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
30710 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
30711 IX86_BUILTIN_SCATTERALTSIV8DF);
30713 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
30714 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
30715 IX86_BUILTIN_SCATTERALTDIV16SF);
30717 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
30718 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
30719 IX86_BUILTIN_SCATTERALTSIV8DI);
30721 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
30722 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
30723 IX86_BUILTIN_SCATTERALTDIV16SI);
30725 /* AVX512PF */
30726 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
30727 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30728 IX86_BUILTIN_GATHERPFDPD);
30729 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
30730 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30731 IX86_BUILTIN_GATHERPFDPS);
30732 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
30733 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30734 IX86_BUILTIN_GATHERPFQPD);
30735 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
30736 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30737 IX86_BUILTIN_GATHERPFQPS);
30738 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
30739 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30740 IX86_BUILTIN_SCATTERPFDPD);
30741 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
30742 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30743 IX86_BUILTIN_SCATTERPFDPS);
30744 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
30745 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30746 IX86_BUILTIN_SCATTERPFQPD);
30747 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
30748 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30749 IX86_BUILTIN_SCATTERPFQPS);
30751 /* SHA */
30752 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
30753 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
30754 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
30755 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
30756 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
30757 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
30758 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
30759 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
30760 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
30761 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
30762 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
30763 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
30764 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
30765 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
30767 /* RTM. */
30768 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
30769 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
30771 /* MMX access to the vec_init patterns. */
30772 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
30773 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
30775 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
30776 V4HI_FTYPE_HI_HI_HI_HI,
30777 IX86_BUILTIN_VEC_INIT_V4HI);
30779 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
30780 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
30781 IX86_BUILTIN_VEC_INIT_V8QI);
30783 /* Access to the vec_extract patterns. */
30784 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
30785 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
30786 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
30787 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
30788 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
30789 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
30790 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
30791 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
30792 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
30793 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
30795 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30796 /* As it uses V4HImode, we have to require -mmmx too. */
30797 | OPTION_MASK_ISA_MMX,
30798 "__builtin_ia32_vec_ext_v4hi",
30799 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
30801 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
30802 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
30804 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
30805 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
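/* Illustrative sketch (not part of the original file): the vec_ext
   builtins above are what the intrinsic headers use to read a single
   element out of a vector, for instance extracting the low float of a
   __m128 (variable names here are hypothetical):

     __m128 v = ...;
     float lo = __builtin_ia32_vec_ext_v4sf ((__v4sf) v, 0);

   The second argument selects the element and must be a constant.  */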
30807 /* Access to the vec_set patterns. */
30808 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
30809 "__builtin_ia32_vec_set_v2di",
30810 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
30812 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
30813 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
30815 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
30816 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
30818 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
30819 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
30821 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30822 /* As it uses V4HImode, we have to require -mmmx too. */
30823 | OPTION_MASK_ISA_MMX,
30824 "__builtin_ia32_vec_set_v4hi",
30825 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
30827 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
30828 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
30830 /* RDSEED */
30831 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
30832 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
30833 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
30834 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
30835 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
30836 "__builtin_ia32_rdseed_di_step",
30837 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
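/* Illustrative sketch (not part of the original file): the RDSEED step
   builtins defined above return nonzero on success and store the random
   value through the pointer argument, so callers normally retry until a
   value is delivered.  The helper name is hypothetical:

     static unsigned int
     get_hw_seed32 (void)
     {
       unsigned int seed = 0;
       while (!__builtin_ia32_rdseed_si_step (&seed))
         ;
       return seed;
     }
*/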
30839 /* ADCX */
30840 def_builtin (0, "__builtin_ia32_addcarryx_u32",
30841 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
30842 def_builtin (OPTION_MASK_ISA_64BIT,
30843 "__builtin_ia32_addcarryx_u64",
30844 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30845 IX86_BUILTIN_ADDCARRYX64);
30847 /* SBB */
30848 def_builtin (0, "__builtin_ia32_sbb_u32",
30849 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
30850 def_builtin (OPTION_MASK_ISA_64BIT,
30851 "__builtin_ia32_sbb_u64",
30852 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30853 IX86_BUILTIN_SBB64);
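/* Illustrative sketch (not part of the original file): the add-with-carry
   builtins above take a carry-in, two operands and a pointer for the sum,
   and return the carry-out, which chains naturally for multi-word
   arithmetic.  The helper name is hypothetical:

     static void
     add64_via_u32 (unsigned int a_lo, unsigned int a_hi,
                    unsigned int b_lo, unsigned int b_hi,
                    unsigned int *r_lo, unsigned int *r_hi)
     {
       unsigned char c
         = __builtin_ia32_addcarryx_u32 (0, a_lo, b_lo, r_lo);
       (void) __builtin_ia32_addcarryx_u32 (c, a_hi, b_hi, r_hi);
     }
*/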
30855 /* Read/write FLAGS. */
30856 def_builtin (0, "__builtin_ia32_readeflags_u32",
30857 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30858 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
30859 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30860 def_builtin (0, "__builtin_ia32_writeeflags_u32",
30861 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
30862 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
30863 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
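/* Illustrative sketch (not part of the original file): the FLAGS builtins
   above read or replace the whole flags register, e.g. to save and restore
   it around a sequence that clobbers the flags (64-bit forms shown):

     unsigned long long saved = __builtin_ia32_readeflags_u64 ();
     ...code that may change EFLAGS/RFLAGS...
     __builtin_ia32_writeeflags_u64 (saved);
*/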
30865 /* CLFLUSHOPT. */
30866 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
30867 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
30869 /* CLWB. */
30870 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
30871 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
30873 /* MONITORX and MWAITX. */
30874 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
30875 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
30876 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
30877 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
30879 /* CLZERO. */
30880 def_builtin2 (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
30881 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
30883 /* Add FMA4 multi-arg instructions.  */
30884 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30886 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
30887 if (d->name == 0)
30888 continue;
30890 ftype = (enum ix86_builtin_func_type) d->flag;
30891 def_builtin_const (d->mask, d->name, ftype, d->code);
30893 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
30894 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30895 ARRAY_SIZE (bdesc_multi_arg) - 1);
30897 /* Add CET intrinsics.  */
30898 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
30900 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
30901 if (d->name == 0)
30902 continue;
30904 ftype = (enum ix86_builtin_func_type) d->flag;
30905 def_builtin (d->mask, d->name, ftype, d->code);
30907 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
30908 IX86_BUILTIN__BDESC_CET_FIRST,
30909 ARRAY_SIZE (bdesc_cet) - 1);
30911 for (i = 0, d = bdesc_cet_rdssp;
30912 i < ARRAY_SIZE (bdesc_cet_rdssp);
30913 i++, d++)
30915 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
30916 if (d->name == 0)
30917 continue;
30919 ftype = (enum ix86_builtin_func_type) d->flag;
30920 def_builtin (d->mask, d->name, ftype, d->code);
30922 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
30923 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30924 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
30927 static void
30928 ix86_init_mpx_builtins ()
30930 const struct builtin_description * d;
30931 enum ix86_builtin_func_type ftype;
30932 tree decl;
30933 size_t i;
30935 for (i = 0, d = bdesc_mpx;
30936 i < ARRAY_SIZE (bdesc_mpx);
30937 i++, d++)
30939 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
30940 if (d->name == 0)
30941 continue;
30943 ftype = (enum ix86_builtin_func_type) d->flag;
30944 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
30946 /* Without the leaf and nothrow flags on MPX builtins,
30947 abnormal edges may follow their calls when setjmp
30948 is present in the function.  Since there may be a lot
30949 of MPX builtin calls, this causes lots of useless
30950 edges and enormous PHI nodes.  To avoid this we mark
30951 MPX builtins as leaf and nothrow.  */
30952 if (decl)
30954 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30955 NULL_TREE);
30956 TREE_NOTHROW (decl) = 1;
30958 else
30960 ix86_builtins_isa[(int)d->code].leaf_p = true;
30961 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30964 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
30965 IX86_BUILTIN__BDESC_MPX_FIRST,
30966 ARRAY_SIZE (bdesc_mpx) - 1);
30968 for (i = 0, d = bdesc_mpx_const;
30969 i < ARRAY_SIZE (bdesc_mpx_const);
30970 i++, d++)
30972 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
30973 if (d->name == 0)
30974 continue;
30976 ftype = (enum ix86_builtin_func_type) d->flag;
30977 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
30979 if (decl)
30981 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30982 NULL_TREE);
30983 TREE_NOTHROW (decl) = 1;
30985 else
30987 ix86_builtins_isa[(int)d->code].leaf_p = true;
30988 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30991 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
30992 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30993 ARRAY_SIZE (bdesc_mpx_const) - 1);
30995 #undef BDESC_VERIFY
30996 #undef BDESC_VERIFYS
30998 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
30999 to return a pointer to VERSION_DECL if the outcome of the expression
31000 formed by PREDICATE_CHAIN is true. This function will be called during
31001 version dispatch to decide which function version to execute. It returns
31002 the basic block at the end, to which more conditions can be added. */
31004 static basic_block
31005 add_condition_to_bb (tree function_decl, tree version_decl,
31006 tree predicate_chain, basic_block new_bb)
31008 gimple *return_stmt;
31009 tree convert_expr, result_var;
31010 gimple *convert_stmt;
31011 gimple *call_cond_stmt;
31012 gimple *if_else_stmt;
31014 basic_block bb1, bb2, bb3;
31015 edge e12, e23;
31017 tree cond_var, and_expr_var = NULL_TREE;
31018 gimple_seq gseq;
31020 tree predicate_decl, predicate_arg;
31022 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31024 gcc_assert (new_bb != NULL);
31025 gseq = bb_seq (new_bb);
31028 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31029 build_fold_addr_expr (version_decl));
31030 result_var = create_tmp_var (ptr_type_node);
31031 convert_stmt = gimple_build_assign (result_var, convert_expr);
31032 return_stmt = gimple_build_return (result_var);
31034 if (predicate_chain == NULL_TREE)
31036 gimple_seq_add_stmt (&gseq, convert_stmt);
31037 gimple_seq_add_stmt (&gseq, return_stmt);
31038 set_bb_seq (new_bb, gseq);
31039 gimple_set_bb (convert_stmt, new_bb);
31040 gimple_set_bb (return_stmt, new_bb);
31041 pop_cfun ();
31042 return new_bb;
31045 while (predicate_chain != NULL)
31047 cond_var = create_tmp_var (integer_type_node);
31048 predicate_decl = TREE_PURPOSE (predicate_chain);
31049 predicate_arg = TREE_VALUE (predicate_chain);
31050 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31051 gimple_call_set_lhs (call_cond_stmt, cond_var);
31053 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31054 gimple_set_bb (call_cond_stmt, new_bb);
31055 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31057 predicate_chain = TREE_CHAIN (predicate_chain);
31059 if (and_expr_var == NULL)
31060 and_expr_var = cond_var;
31061 else
31063 gimple *assign_stmt;
31064 /* Use MIN_EXPR to check whether any integer is zero:
31065 and_expr_var = min_expr <cond_var, and_expr_var>.  */
31066 assign_stmt = gimple_build_assign (and_expr_var,
31067 build2 (MIN_EXPR, integer_type_node,
31068 cond_var, and_expr_var));
31070 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31071 gimple_set_bb (assign_stmt, new_bb);
31072 gimple_seq_add_stmt (&gseq, assign_stmt);
31076 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31077 integer_zero_node,
31078 NULL_TREE, NULL_TREE);
31079 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31080 gimple_set_bb (if_else_stmt, new_bb);
31081 gimple_seq_add_stmt (&gseq, if_else_stmt);
31083 gimple_seq_add_stmt (&gseq, convert_stmt);
31084 gimple_seq_add_stmt (&gseq, return_stmt);
31085 set_bb_seq (new_bb, gseq);
31087 bb1 = new_bb;
31088 e12 = split_block (bb1, if_else_stmt);
31089 bb2 = e12->dest;
31090 e12->flags &= ~EDGE_FALLTHRU;
31091 e12->flags |= EDGE_TRUE_VALUE;
31093 e23 = split_block (bb2, return_stmt);
31095 gimple_set_bb (convert_stmt, bb2);
31096 gimple_set_bb (return_stmt, bb2);
31098 bb3 = e23->dest;
31099 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31101 remove_edge (e23);
31102 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
31104 pop_cfun ();
31106 return bb3;
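/* Illustrative sketch (not part of the original file) of the code shape
   add_condition_to_bb appends for one version: each entry of
   PREDICATE_CHAIN becomes a call, the results are combined with MIN_EXPR
   (zero iff any predicate failed), and the true edge returns the
   version's address.  In source-level pseudo C, for a two-entry chain:

     c1 = pred1 (arg1);
     c2 = pred2 (arg2);
     c  = MIN (c1, c2);
     if (c > 0)
       return (void *) &version_decl;
     ...fall through to the returned block, where the next
        version's condition is appended...
*/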
31109 /* This parses the attribute arguments to target in DECL and determines
31110 the right builtin to use to match the platform specification.
31111 It returns the priority value for this version decl. If PREDICATE_LIST
31112 is not NULL, it stores the list of cpu features that need to be checked
31113 before dispatching this function. */
31115 static unsigned int
31116 get_builtin_code_for_version (tree decl, tree *predicate_list)
31118 tree attrs;
31119 struct cl_target_option cur_target;
31120 tree target_node;
31121 struct cl_target_option *new_target;
31122 const char *arg_str = NULL;
31123 const char *attrs_str = NULL;
31124 char *tok_str = NULL;
31125 char *token;
31127 /* Priority of i386 features, greater value is higher priority. This is
31128 used to decide the order in which function dispatch must happen. For
31129 instance, a version specialized for SSE4.2 should be checked for dispatch
31130 before a version for SSE3, as SSE4.2 implies SSE3. */
31131 enum feature_priority
31133 P_ZERO = 0,
31134 P_MMX,
31135 P_SSE,
31136 P_SSE2,
31137 P_SSE3,
31138 P_SSSE3,
31139 P_PROC_SSSE3,
31140 P_SSE4_A,
31141 P_PROC_SSE4_A,
31142 P_SSE4_1,
31143 P_SSE4_2,
31144 P_PROC_SSE4_2,
31145 P_POPCNT,
31146 P_AES,
31147 P_PCLMUL,
31148 P_AVX,
31149 P_PROC_AVX,
31150 P_BMI,
31151 P_PROC_BMI,
31152 P_FMA4,
31153 P_XOP,
31154 P_PROC_XOP,
31155 P_FMA,
31156 P_PROC_FMA,
31157 P_BMI2,
31158 P_AVX2,
31159 P_PROC_AVX2,
31160 P_AVX512F,
31161 P_PROC_AVX512F
31164 enum feature_priority priority = P_ZERO;
31166 /* These are the target attribute strings for which a dispatcher is
31167 available, from fold_builtin_cpu. */
31169 static struct _feature_list
31171 const char *const name;
31172 const enum feature_priority priority;
31174 const feature_list[] =
31176 {"mmx", P_MMX},
31177 {"sse", P_SSE},
31178 {"sse2", P_SSE2},
31179 {"sse3", P_SSE3},
31180 {"sse4a", P_SSE4_A},
31181 {"ssse3", P_SSSE3},
31182 {"sse4.1", P_SSE4_1},
31183 {"sse4.2", P_SSE4_2},
31184 {"popcnt", P_POPCNT},
31185 {"aes", P_AES},
31186 {"pclmul", P_PCLMUL},
31187 {"avx", P_AVX},
31188 {"bmi", P_BMI},
31189 {"fma4", P_FMA4},
31190 {"xop", P_XOP},
31191 {"fma", P_FMA},
31192 {"bmi2", P_BMI2},
31193 {"avx2", P_AVX2},
31194 {"avx512f", P_AVX512F}
31198 static unsigned int NUM_FEATURES
31199 = sizeof (feature_list) / sizeof (struct _feature_list);
31201 unsigned int i;
31203 tree predicate_chain = NULL_TREE;
31204 tree predicate_decl, predicate_arg;
31206 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31207 gcc_assert (attrs != NULL);
31209 attrs = TREE_VALUE (TREE_VALUE (attrs));
31211 gcc_assert (TREE_CODE (attrs) == STRING_CST);
31212 attrs_str = TREE_STRING_POINTER (attrs);
31214 /* Return priority zero for default function. */
31215 if (strcmp (attrs_str, "default") == 0)
31216 return 0;
31218 /* Handle arch= if specified. For priority, set it to be 1 more than
31219 the best instruction set the processor can handle. For instance, if
31220 there is a version for atom and a version for ssse3 (the highest ISA
31221 priority for atom), the atom version must be checked for dispatch
31222 before the ssse3 version. */
31223 if (strstr (attrs_str, "arch=") != NULL)
31225 cl_target_option_save (&cur_target, &global_options);
31226 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
31227 &global_options_set);
31229 gcc_assert (target_node);
31230 new_target = TREE_TARGET_OPTION (target_node);
31231 gcc_assert (new_target);
31233 if (new_target->arch_specified && new_target->arch > 0)
31235 switch (new_target->arch)
31237 case PROCESSOR_CORE2:
31238 arg_str = "core2";
31239 priority = P_PROC_SSSE3;
31240 break;
31241 case PROCESSOR_NEHALEM:
31242 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
31244 arg_str = "westmere";
31245 priority = P_AES;
31247 else
31249 /* We translate "arch=corei7" and "arch=nehalem" to
31250 "corei7" so that it will be mapped to M_INTEL_COREI7
31251 as cpu type to cover all M_INTEL_COREI7_XXXs. */
31252 arg_str = "corei7";
31253 priority = P_PROC_SSE4_2;
31255 break;
31256 case PROCESSOR_SANDYBRIDGE:
31257 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
31258 arg_str = "ivybridge";
31259 else
31260 arg_str = "sandybridge";
31261 priority = P_PROC_AVX;
31262 break;
31263 case PROCESSOR_HASWELL:
31264 case PROCESSOR_SKYLAKE_AVX512:
31265 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VBMI)
31266 arg_str = "cannonlake";
31267 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31268 arg_str = "skylake-avx512";
31269 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
31270 arg_str = "skylake";
31271 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
31272 arg_str = "broadwell";
31273 else
31274 arg_str = "haswell";
31275 priority = P_PROC_AVX2;
31276 break;
31277 case PROCESSOR_BONNELL:
31278 arg_str = "bonnell";
31279 priority = P_PROC_SSSE3;
31280 break;
31281 case PROCESSOR_KNL:
31282 arg_str = "knl";
31283 priority = P_PROC_AVX512F;
31284 break;
31285 case PROCESSOR_KNM:
31286 arg_str = "knm";
31287 priority = P_PROC_AVX512F;
31288 break;
31289 case PROCESSOR_SILVERMONT:
31290 arg_str = "silvermont";
31291 priority = P_PROC_SSE4_2;
31292 break;
31293 case PROCESSOR_AMDFAM10:
31294 arg_str = "amdfam10h";
31295 priority = P_PROC_SSE4_A;
31296 break;
31297 case PROCESSOR_BTVER1:
31298 arg_str = "btver1";
31299 priority = P_PROC_SSE4_A;
31300 break;
31301 case PROCESSOR_BTVER2:
31302 arg_str = "btver2";
31303 priority = P_PROC_BMI;
31304 break;
31305 case PROCESSOR_BDVER1:
31306 arg_str = "bdver1";
31307 priority = P_PROC_XOP;
31308 break;
31309 case PROCESSOR_BDVER2:
31310 arg_str = "bdver2";
31311 priority = P_PROC_FMA;
31312 break;
31313 case PROCESSOR_BDVER3:
31314 arg_str = "bdver3";
31315 priority = P_PROC_FMA;
31316 break;
31317 case PROCESSOR_BDVER4:
31318 arg_str = "bdver4";
31319 priority = P_PROC_AVX2;
31320 break;
31321 case PROCESSOR_ZNVER1:
31322 arg_str = "znver1";
31323 priority = P_PROC_AVX2;
31324 break;
31328 cl_target_option_restore (&global_options, &cur_target);
31330 if (predicate_list && arg_str == NULL)
31332 error_at (DECL_SOURCE_LOCATION (decl),
31333 "No dispatcher found for the versioning attributes");
31334 return 0;
31337 if (predicate_list)
31339 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
31340 /* For a C string literal the length includes the trailing NULL. */
31341 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
31342 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31343 predicate_chain);
31347 /* Process feature name. */
31348 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
31349 strcpy (tok_str, attrs_str);
31350 token = strtok (tok_str, ",");
31351 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
31353 while (token != NULL)
31355 /* Do not process "arch=" */
31356 if (strncmp (token, "arch=", 5) == 0)
31358 token = strtok (NULL, ",");
31359 continue;
31361 for (i = 0; i < NUM_FEATURES; ++i)
31363 if (strcmp (token, feature_list[i].name) == 0)
31365 if (predicate_list)
31367 predicate_arg = build_string_literal (
31368 strlen (feature_list[i].name) + 1,
31369 feature_list[i].name);
31370 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31371 predicate_chain);
31373 /* Find the maximum priority feature. */
31374 if (feature_list[i].priority > priority)
31375 priority = feature_list[i].priority;
31377 break;
31380 if (predicate_list && i == NUM_FEATURES)
31382 error_at (DECL_SOURCE_LOCATION (decl),
31383 "No dispatcher found for %s", token);
31384 return 0;
31386 token = strtok (NULL, ",");
31388 free (tok_str);
31390 if (predicate_list && predicate_chain == NULL_TREE)
31392 error_at (DECL_SOURCE_LOCATION (decl),
31393 "No dispatcher found for the versioning attributes : %s",
31394 attrs_str);
31395 return 0;
31397 else if (predicate_list)
31399 predicate_chain = nreverse (predicate_chain);
31400 *predicate_list = predicate_chain;
31403 return priority;
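/* Illustrative example (not part of the original file): for a version
   declared as

     __attribute__ ((target ("arch=haswell,avx2"))) int foo (void);

   this routine returns priority P_PROC_AVX2, and when PREDICATE_LIST is
   non-NULL it records predicates equivalent to checking
   __builtin_cpu_is ("haswell") and __builtin_cpu_supports ("avx2").  */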
31406 /* This compares the priority of target features in function DECL1
31407 and DECL2.  It returns a positive value if DECL1 has higher priority,
31408 a negative value if DECL2 has higher priority, and 0 if they are the
31409 same. */
31411 static int
31412 ix86_compare_version_priority (tree decl1, tree decl2)
31414 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
31415 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
31417 return (int)priority1 - (int)priority2;
31420 /* V1 and V2 point to function versions with different priorities
31421 based on the target ISA. This function compares their priorities. */
31423 static int
31424 feature_compare (const void *v1, const void *v2)
31426 typedef struct _function_version_info
31428 tree version_decl;
31429 tree predicate_chain;
31430 unsigned int dispatch_priority;
31431 } function_version_info;
31433 const function_version_info c1 = *(const function_version_info *)v1;
31434 const function_version_info c2 = *(const function_version_info *)v2;
31435 return (c2.dispatch_priority - c1.dispatch_priority);
31438 /* This function generates the dispatch function for
31439 multi-versioned functions. DISPATCH_DECL is the function which will
31440 contain the dispatch logic.  FNDECLS holds the function choices for
31441 dispatch and is passed as a vector.  EMPTY_BB is the basic block pointer
31442 in DISPATCH_DECL in which the dispatch code is generated. */
31444 static int
31445 dispatch_function_versions (tree dispatch_decl,
31446 void *fndecls_p,
31447 basic_block *empty_bb)
31449 tree default_decl;
31450 gimple *ifunc_cpu_init_stmt;
31451 gimple_seq gseq;
31452 int ix;
31453 tree ele;
31454 vec<tree> *fndecls;
31455 unsigned int num_versions = 0;
31456 unsigned int actual_versions = 0;
31457 unsigned int i;
31459 struct _function_version_info
31461 tree version_decl;
31462 tree predicate_chain;
31463 unsigned int dispatch_priority;
31464 }*function_version_info;
31466 gcc_assert (dispatch_decl != NULL
31467 && fndecls_p != NULL
31468 && empty_bb != NULL);
31470 /* fndecls_p is actually a vector.  */
31471 fndecls = static_cast<vec<tree> *> (fndecls_p);
31473 /* At least one more version other than the default. */
31474 num_versions = fndecls->length ();
31475 gcc_assert (num_versions >= 2);
31477 function_version_info = (struct _function_version_info *)
31478 XNEWVEC (struct _function_version_info, (num_versions - 1));
31480 /* The first version in the vector is the default decl. */
31481 default_decl = (*fndecls)[0];
31483 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
31485 gseq = bb_seq (*empty_bb);
31486 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
31487 constructors, so explicitly call __builtin_cpu_init here.  */
31488 ifunc_cpu_init_stmt = gimple_build_call_vec (
31489 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
31490 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
31491 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
31492 set_bb_seq (*empty_bb, gseq);
31494 pop_cfun ();
31497 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
31499 tree version_decl = ele;
31500 tree predicate_chain = NULL_TREE;
31501 unsigned int priority;
31502 /* Get attribute string, parse it and find the right predicate decl.
31503 The predicate function could be a lengthy combination of many
31504 features, like arch-type and various isa-variants. */
31505 priority = get_builtin_code_for_version (version_decl,
31506 &predicate_chain);
31508 if (predicate_chain == NULL_TREE)
31509 continue;
31511 function_version_info [actual_versions].version_decl = version_decl;
31512 function_version_info [actual_versions].predicate_chain
31513 = predicate_chain;
31514 function_version_info [actual_versions].dispatch_priority = priority;
31515 actual_versions++;
31518 /* Sort the versions according to descending order of dispatch priority. The
31519 priority is based on the ISA. This is not a perfect solution. There
31520 could still be ambiguity. If more than one function version is suitable
31521 to execute, which one should be dispatched? In future, allow the user
31522 to specify a dispatch priority next to the version. */
31523 qsort (function_version_info, actual_versions,
31524 sizeof (struct _function_version_info), feature_compare);
31526 for (i = 0; i < actual_versions; ++i)
31527 *empty_bb = add_condition_to_bb (dispatch_decl,
31528 function_version_info[i].version_decl,
31529 function_version_info[i].predicate_chain,
31530 *empty_bb);
31532 /* Dispatch the default version at the end.  */
31533 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
31534 NULL, *empty_bb);
31536 free (function_version_info);
31537 return 0;
31540 /* This function changes the assembler name for functions that are
31541 versions. If DECL is a function version and has a "target"
31542 attribute, it appends the attribute string to its assembler name. */
31544 static tree
31545 ix86_mangle_function_version_assembler_name (tree decl, tree id)
31547 tree version_attr;
31548 const char *orig_name, *version_string;
31549 char *attr_str, *assembler_name;
31551 if (DECL_DECLARED_INLINE_P (decl)
31552 && lookup_attribute ("gnu_inline",
31553 DECL_ATTRIBUTES (decl)))
31554 error_at (DECL_SOURCE_LOCATION (decl),
31555 "Function versions cannot be marked as gnu_inline,"
31556 " bodies have to be generated");
31558 if (DECL_VIRTUAL_P (decl)
31559 || DECL_VINDEX (decl))
31560 sorry ("Virtual function multiversioning not supported");
31562 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31564 /* target attribute string cannot be NULL. */
31565 gcc_assert (version_attr != NULL_TREE);
31567 orig_name = IDENTIFIER_POINTER (id);
31568 version_string
31569 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
31571 if (strcmp (version_string, "default") == 0)
31572 return id;
31574 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
31575 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
31577 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
31579 /* Allow assembler name to be modified if already set. */
31580 if (DECL_ASSEMBLER_NAME_SET_P (decl))
31581 SET_DECL_RTL (decl, NULL);
31583 tree ret = get_identifier (assembler_name);
31584 XDELETEVEC (attr_str);
31585 XDELETEVEC (assembler_name);
31586 return ret;
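/* Illustrative example (not part of the original file): a C version
   declared as

     __attribute__ ((target ("avx2"))) int foo (void);

   gets the assembler name "foo.avx2" here, while the version declared
   with target ("default") keeps the plain name "foo".  */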
31590 static tree
31591 ix86_mangle_decl_assembler_name (tree decl, tree id)
31593 /* For function version, add the target suffix to the assembler name. */
31594 if (TREE_CODE (decl) == FUNCTION_DECL
31595 && DECL_FUNCTION_VERSIONED (decl))
31596 id = ix86_mangle_function_version_assembler_name (decl, id);
31597 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
31598 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
31599 #endif
31601 return id;
31604 /* Make a dispatcher declaration for the multi-versioned function DECL.
31605 Calls to DECL will be replaced with calls to the dispatcher
31606 by the front-end. Returns the decl of the dispatcher function. */
31608 static tree
31609 ix86_get_function_versions_dispatcher (void *decl)
31611 tree fn = (tree) decl;
31612 struct cgraph_node *node = NULL;
31613 struct cgraph_node *default_node = NULL;
31614 struct cgraph_function_version_info *node_v = NULL;
31615 struct cgraph_function_version_info *first_v = NULL;
31617 tree dispatch_decl = NULL;
31619 struct cgraph_function_version_info *default_version_info = NULL;
31621 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
31623 node = cgraph_node::get (fn);
31624 gcc_assert (node != NULL);
31626 node_v = node->function_version ();
31627 gcc_assert (node_v != NULL);
31629 if (node_v->dispatcher_resolver != NULL)
31630 return node_v->dispatcher_resolver;
31632 /* Find the default version and make it the first node. */
31633 first_v = node_v;
31634 /* Go to the beginning of the chain. */
31635 while (first_v->prev != NULL)
31636 first_v = first_v->prev;
31637 default_version_info = first_v;
31638 while (default_version_info != NULL)
31640 if (is_function_default_version
31641 (default_version_info->this_node->decl))
31642 break;
31643 default_version_info = default_version_info->next;
31646 /* If there is no default node, just return NULL. */
31647 if (default_version_info == NULL)
31648 return NULL;
31650 /* Make default info the first node. */
31651 if (first_v != default_version_info)
31653 default_version_info->prev->next = default_version_info->next;
31654 if (default_version_info->next)
31655 default_version_info->next->prev = default_version_info->prev;
31656 first_v->prev = default_version_info;
31657 default_version_info->next = first_v;
31658 default_version_info->prev = NULL;
31661 default_node = default_version_info->this_node;
31663 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
31664 if (targetm.has_ifunc_p ())
31666 struct cgraph_function_version_info *it_v = NULL;
31667 struct cgraph_node *dispatcher_node = NULL;
31668 struct cgraph_function_version_info *dispatcher_version_info = NULL;
31670 /* Right now, the dispatching is done via ifunc. */
31671 dispatch_decl = make_dispatcher_decl (default_node->decl);
31673 dispatcher_node = cgraph_node::get_create (dispatch_decl);
31674 gcc_assert (dispatcher_node != NULL);
31675 dispatcher_node->dispatcher_function = 1;
31676 dispatcher_version_info
31677 = dispatcher_node->insert_new_function_version ();
31678 dispatcher_version_info->next = default_version_info;
31679 dispatcher_node->definition = 1;
31681 /* Set the dispatcher for all the versions. */
31682 it_v = default_version_info;
31683 while (it_v != NULL)
31685 it_v->dispatcher_resolver = dispatch_decl;
31686 it_v = it_v->next;
31689 else
31690 #endif
31692 error_at (DECL_SOURCE_LOCATION (default_node->decl),
31693 "multiversioning needs ifunc which is not supported "
31694 "on this target");
31697 return dispatch_decl;
31700 /* Make the resolver function decl to dispatch the versions of
31701 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
31702 the ifunc alias that will point to the created resolver.  Create an
31703 empty basic block in the resolver and store the pointer in
31704 EMPTY_BB. Return the decl of the resolver function. */
31706 static tree
31707 make_resolver_func (const tree default_decl,
31708 const tree ifunc_alias_decl,
31709 basic_block *empty_bb)
31711 char *resolver_name;
31712 tree decl, type, decl_name, t;
31714 /* IFUNCs have to be globally visible.  So, if the default_decl is
31715 not, then the name of the IFUNC should be made unique. */
31716 if (TREE_PUBLIC (default_decl) == 0)
31718 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
31719 symtab->change_decl_assembler_name (ifunc_alias_decl,
31720 get_identifier (ifunc_name));
31721 XDELETEVEC (ifunc_name);
31724 resolver_name = make_unique_name (default_decl, "resolver", false);
31726 /* The resolver function should return a (void *). */
31727 type = build_function_type_list (ptr_type_node, NULL_TREE);
31729 decl = build_fn_decl (resolver_name, type);
31730 decl_name = get_identifier (resolver_name);
31731 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
31733 DECL_NAME (decl) = decl_name;
31734 TREE_USED (decl) = 1;
31735 DECL_ARTIFICIAL (decl) = 1;
31736 DECL_IGNORED_P (decl) = 1;
31737 TREE_PUBLIC (decl) = 0;
31738 DECL_UNINLINABLE (decl) = 1;
31740 /* Resolver is not external, body is generated. */
31741 DECL_EXTERNAL (decl) = 0;
31742 DECL_EXTERNAL (ifunc_alias_decl) = 0;
31744 DECL_CONTEXT (decl) = NULL_TREE;
31745 DECL_INITIAL (decl) = make_node (BLOCK);
31746 DECL_STATIC_CONSTRUCTOR (decl) = 0;
31748 if (DECL_COMDAT_GROUP (default_decl)
31749 || TREE_PUBLIC (default_decl))
31751 /* In this case, each translation unit with a call to this
31752 versioned function will put out a resolver. Ensure it
31753 is comdat to keep just one copy. */
31754 DECL_COMDAT (decl) = 1;
31755 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
31757 /* Build result decl and add to function_decl. */
31758 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
31759 DECL_ARTIFICIAL (t) = 1;
31760 DECL_IGNORED_P (t) = 1;
31761 DECL_RESULT (decl) = t;
31763 gimplify_function_tree (decl);
31764 push_cfun (DECL_STRUCT_FUNCTION (decl));
31765 *empty_bb = init_lowered_empty_function (decl, false,
31766 profile_count::uninitialized ());
31768 cgraph_node::add_new_function (decl, true);
31769 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
31771 pop_cfun ();
31773 gcc_assert (ifunc_alias_decl != NULL);
31774 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
31775 DECL_ATTRIBUTES (ifunc_alias_decl)
31776 = make_attribute ("ifunc", resolver_name,
31777 DECL_ATTRIBUTES (ifunc_alias_decl));
31779 /* Create the alias for dispatch to resolver here. */
31780 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
31781 XDELETEVEC (resolver_name);
31782 return decl;
31785 /* Generate the dispatching code body to dispatch multi-versioned function
31786 DECL. The target hook is called to process the "target" attributes and
31787 provide the code to dispatch the right function at run-time. NODE points
31788 to the dispatcher decl whose body will be created. */
31790 static tree
31791 ix86_generate_version_dispatcher_body (void *node_p)
31793 tree resolver_decl;
31794 basic_block empty_bb;
31795 tree default_ver_decl;
31796 struct cgraph_node *versn;
31797 struct cgraph_node *node;
31799 struct cgraph_function_version_info *node_version_info = NULL;
31800 struct cgraph_function_version_info *versn_info = NULL;
31802 node = (cgraph_node *)node_p;
31804 node_version_info = node->function_version ();
31805 gcc_assert (node->dispatcher_function
31806 && node_version_info != NULL);
31808 if (node_version_info->dispatcher_resolver)
31809 return node_version_info->dispatcher_resolver;
31811 /* The first version in the chain corresponds to the default version. */
31812 default_ver_decl = node_version_info->next->this_node->decl;
31814 /* node is going to be an alias, so remove the finalized bit. */
31815 node->definition = false;
31817 resolver_decl = make_resolver_func (default_ver_decl,
31818 node->decl, &empty_bb);
31820 node_version_info->dispatcher_resolver = resolver_decl;
31822 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
31824 auto_vec<tree, 2> fn_ver_vec;
31826 for (versn_info = node_version_info->next; versn_info;
31827 versn_info = versn_info->next)
31829 versn = versn_info->this_node;
31830 /* Check for virtual functions here again, as by this time it should
31831 have been determined if this function needs a vtable index or
31832 not. This happens for methods in derived classes that override
31833 virtual methods in base classes but are not explicitly marked as
31834 virtual. */
31835 if (DECL_VINDEX (versn->decl))
31836 sorry ("Virtual function multiversioning not supported");
31838 fn_ver_vec.safe_push (versn->decl);
31841 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
31842 cgraph_edge::rebuild_edges ();
31843 pop_cfun ();
31844 return resolver_decl;
31846 /* This builds the processor_model struct type defined in
31847 libgcc/config/i386/cpuinfo.c */
31849 static tree
31850 build_processor_model_struct (void)
31852 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
31853 "__cpu_features"};
31854 tree field = NULL_TREE, field_chain = NULL_TREE;
31855 int i;
31856 tree type = make_node (RECORD_TYPE);
31858 /* The first 3 fields are unsigned int. */
31859 for (i = 0; i < 3; ++i)
31861 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31862 get_identifier (field_name[i]), unsigned_type_node);
31863 if (field_chain != NULL_TREE)
31864 DECL_CHAIN (field) = field_chain;
31865 field_chain = field;
31868 /* The last field is an array of unsigned integers of size one. */
31869 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31870 get_identifier (field_name[3]),
31871 build_array_type (unsigned_type_node,
31872 build_index_type (size_one_node)));
31873 if (field_chain != NULL_TREE)
31874 DECL_CHAIN (field) = field_chain;
31875 field_chain = field;
31877 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
31878 return type;
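/* For reference (editor's sketch, not part of the original file), the
   layout built above corresponds to this C declaration from
   libgcc/config/i386/cpuinfo.c:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/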
31881 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME.  */
31883 static tree
31884 make_var_decl (tree type, const char *name)
31886 tree new_decl;
31888 new_decl = build_decl (UNKNOWN_LOCATION,
31889 VAR_DECL,
31890 get_identifier (name),
31891 type);
31893 DECL_EXTERNAL (new_decl) = 1;
31894 TREE_STATIC (new_decl) = 1;
31895 TREE_PUBLIC (new_decl) = 1;
31896 DECL_INITIAL (new_decl) = 0;
31897 DECL_ARTIFICIAL (new_decl) = 0;
31898 DECL_PRESERVE_P (new_decl) = 1;
31900 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
31901 assemble_variable (new_decl, 0, 0, 0);
31903 return new_decl;
31906 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
31907 into an integer defined in libgcc/config/i386/cpuinfo.c */
31909 static tree
31910 fold_builtin_cpu (tree fndecl, tree *args)
31912 unsigned int i;
31913 enum ix86_builtins fn_code = (enum ix86_builtins)
31914 DECL_FUNCTION_CODE (fndecl);
31915 tree param_string_cst = NULL;
31917 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
31918 enum processor_features
31920 F_CMOV = 0,
31921 F_MMX,
31922 F_POPCNT,
31923 F_SSE,
31924 F_SSE2,
31925 F_SSE3,
31926 F_SSSE3,
31927 F_SSE4_1,
31928 F_SSE4_2,
31929 F_AVX,
31930 F_AVX2,
31931 F_SSE4_A,
31932 F_FMA4,
31933 F_XOP,
31934 F_FMA,
31935 F_AVX512F,
31936 F_BMI,
31937 F_BMI2,
31938 F_AES,
31939 F_PCLMUL,
31940 F_AVX512VL,
31941 F_AVX512BW,
31942 F_AVX512DQ,
31943 F_AVX512CD,
31944 F_AVX512ER,
31945 F_AVX512PF,
31946 F_AVX512VBMI,
31947 F_AVX512IFMA,
31948 F_AVX5124VNNIW,
31949 F_AVX5124FMAPS,
31950 F_AVX512VPOPCNTDQ,
31951 F_MAX
31954 /* These are the values for vendor types and cpu types and subtypes
31955 in cpuinfo.c.  Cpu type and subtype values should have the
31956 corresponding start value subtracted from them.  */
31957 enum processor_model
31959 M_INTEL = 1,
31960 M_AMD,
31961 M_CPU_TYPE_START,
31962 M_INTEL_BONNELL,
31963 M_INTEL_CORE2,
31964 M_INTEL_COREI7,
31965 M_AMDFAM10H,
31966 M_AMDFAM15H,
31967 M_INTEL_SILVERMONT,
31968 M_INTEL_KNL,
31969 M_AMD_BTVER1,
31970 M_AMD_BTVER2,
31971 M_AMDFAM17H,
31972 M_INTEL_KNM,
31973 M_CPU_SUBTYPE_START,
31974 M_INTEL_COREI7_NEHALEM,
31975 M_INTEL_COREI7_WESTMERE,
31976 M_INTEL_COREI7_SANDYBRIDGE,
31977 M_AMDFAM10H_BARCELONA,
31978 M_AMDFAM10H_SHANGHAI,
31979 M_AMDFAM10H_ISTANBUL,
31980 M_AMDFAM15H_BDVER1,
31981 M_AMDFAM15H_BDVER2,
31982 M_AMDFAM15H_BDVER3,
31983 M_AMDFAM15H_BDVER4,
31984 M_AMDFAM17H_ZNVER1,
31985 M_INTEL_COREI7_IVYBRIDGE,
31986 M_INTEL_COREI7_HASWELL,
31987 M_INTEL_COREI7_BROADWELL,
31988 M_INTEL_COREI7_SKYLAKE,
31989 M_INTEL_COREI7_SKYLAKE_AVX512,
31990 M_INTEL_COREI7_CANNONLAKE
31993 static struct _arch_names_table
31995 const char *const name;
31996 const enum processor_model model;
31998 const arch_names_table[] =
32000 {"amd", M_AMD},
32001 {"intel", M_INTEL},
32002 {"atom", M_INTEL_BONNELL},
32003 {"slm", M_INTEL_SILVERMONT},
32004 {"core2", M_INTEL_CORE2},
32005 {"corei7", M_INTEL_COREI7},
32006 {"nehalem", M_INTEL_COREI7_NEHALEM},
32007 {"westmere", M_INTEL_COREI7_WESTMERE},
32008 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
32009 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
32010 {"haswell", M_INTEL_COREI7_HASWELL},
32011 {"broadwell", M_INTEL_COREI7_BROADWELL},
32012 {"skylake", M_INTEL_COREI7_SKYLAKE},
32013 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
32014 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
32015 {"bonnell", M_INTEL_BONNELL},
32016 {"silvermont", M_INTEL_SILVERMONT},
32017 {"knl", M_INTEL_KNL},
32018 {"knm", M_INTEL_KNM},
32019 {"amdfam10h", M_AMDFAM10H},
32020 {"barcelona", M_AMDFAM10H_BARCELONA},
32021 {"shanghai", M_AMDFAM10H_SHANGHAI},
32022 {"istanbul", M_AMDFAM10H_ISTANBUL},
32023 {"btver1", M_AMD_BTVER1},
32024 {"amdfam15h", M_AMDFAM15H},
32025 {"bdver1", M_AMDFAM15H_BDVER1},
32026 {"bdver2", M_AMDFAM15H_BDVER2},
32027 {"bdver3", M_AMDFAM15H_BDVER3},
32028 {"bdver4", M_AMDFAM15H_BDVER4},
32029 {"btver2", M_AMD_BTVER2},
32030 {"amdfam17h", M_AMDFAM17H},
32031 {"znver1", M_AMDFAM17H_ZNVER1},
32034 static struct _isa_names_table
32036 const char *const name;
32037 const enum processor_features feature;
32039 const isa_names_table[] =
32041 {"cmov", F_CMOV},
32042 {"mmx", F_MMX},
32043 {"popcnt", F_POPCNT},
32044 {"sse", F_SSE},
32045 {"sse2", F_SSE2},
32046 {"sse3", F_SSE3},
32047 {"ssse3", F_SSSE3},
32048 {"sse4a", F_SSE4_A},
32049 {"sse4.1", F_SSE4_1},
32050 {"sse4.2", F_SSE4_2},
32051 {"avx", F_AVX},
32052 {"fma4", F_FMA4},
32053 {"xop", F_XOP},
32054 {"fma", F_FMA},
32055 {"avx2", F_AVX2},
32056 {"avx512f", F_AVX512F},
32057 {"bmi", F_BMI},
32058 {"bmi2", F_BMI2},
32059 {"aes", F_AES},
32060 {"pclmul", F_PCLMUL},
32061 {"avx512vl",F_AVX512VL},
32062 {"avx512bw",F_AVX512BW},
32063 {"avx512dq",F_AVX512DQ},
32064 {"avx512cd",F_AVX512CD},
32065 {"avx512er",F_AVX512ER},
32066 {"avx512pf",F_AVX512PF},
32067 {"avx512vbmi",F_AVX512VBMI},
32068 {"avx512ifma",F_AVX512IFMA},
32069 {"avx5124vnniw",F_AVX5124VNNIW},
32070 {"avx5124fmaps",F_AVX5124FMAPS},
32071 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
32074 tree __processor_model_type = build_processor_model_struct ();
32075 tree __cpu_model_var = make_var_decl (__processor_model_type,
32076 "__cpu_model");
32079 varpool_node::add (__cpu_model_var);
32081 gcc_assert ((args != NULL) && (*args != NULL));
32083 param_string_cst = *args;
32084 while (param_string_cst
32085 && TREE_CODE (param_string_cst) != STRING_CST)
32087 /* *args must be an expr that can contain other EXPRS leading to a
32088 STRING_CST. */
32089 if (!EXPR_P (param_string_cst))
32091 error ("Parameter to builtin must be a string constant or literal");
32092 return integer_zero_node;
32094 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
32097 gcc_assert (param_string_cst);
32099 if (fn_code == IX86_BUILTIN_CPU_IS)
32101 tree ref;
32102 tree field;
32103 tree final;
32105 unsigned int field_val = 0;
32106 unsigned int NUM_ARCH_NAMES
32107 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
32109 for (i = 0; i < NUM_ARCH_NAMES; i++)
32110 if (strcmp (arch_names_table[i].name,
32111 TREE_STRING_POINTER (param_string_cst)) == 0)
32112 break;
32114 if (i == NUM_ARCH_NAMES)
32116 error ("Parameter to builtin not valid: %s",
32117 TREE_STRING_POINTER (param_string_cst));
32118 return integer_zero_node;
32121 field = TYPE_FIELDS (__processor_model_type);
32122 field_val = arch_names_table[i].model;
32124 /* CPU types are stored in the next field. */
32125 if (field_val > M_CPU_TYPE_START
32126 && field_val < M_CPU_SUBTYPE_START)
32128 field = DECL_CHAIN (field);
32129 field_val -= M_CPU_TYPE_START;
32132 /* CPU subtypes are stored in the next field. */
32133 if (field_val > M_CPU_SUBTYPE_START)
32135 field = DECL_CHAIN (DECL_CHAIN (field));
32136 field_val -= M_CPU_SUBTYPE_START;
32139 /* Get the appropriate field in __cpu_model. */
32140 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32141 field, NULL_TREE);
32143 /* Check the value. */
32144 final = build2 (EQ_EXPR, unsigned_type_node, ref,
32145 build_int_cstu (unsigned_type_node, field_val));
32146 return build1 (CONVERT_EXPR, integer_type_node, final);
32148 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
32150 tree ref;
32151 tree array_elt;
32152 tree field;
32153 tree final;
32155 unsigned int field_val = 0;
32156 unsigned int NUM_ISA_NAMES
32157 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
32159 for (i = 0; i < NUM_ISA_NAMES; i++)
32160 if (strcmp (isa_names_table[i].name,
32161 TREE_STRING_POINTER (param_string_cst)) == 0)
32162 break;
32164 if (i == NUM_ISA_NAMES)
32166 error ("Parameter to builtin not valid: %s",
32167 TREE_STRING_POINTER (param_string_cst));
32168 return integer_zero_node;
32171 field = TYPE_FIELDS (__processor_model_type);
32172 /* Get the last field, which is __cpu_features. */
32173 while (DECL_CHAIN (field))
32174 field = DECL_CHAIN (field);
32176 /* Get the appropriate field: __cpu_model.__cpu_features */
32177 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32178 field, NULL_TREE);
32180 /* Access the 0th element of __cpu_features array. */
32181 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
32182 integer_zero_node, NULL_TREE, NULL_TREE);
32184 field_val = (1 << isa_names_table[i].feature);
32185 /* Return __cpu_model.__cpu_features[0] & field_val */
32186 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
32187 build_int_cstu (unsigned_type_node, field_val));
32188 return build1 (CONVERT_EXPR, integer_type_node, final);
32190 gcc_unreachable ();
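/* Illustrative examples of the folded forms produced above (editor's
   sketch, not part of the original file):

     __builtin_cpu_is ("amd")
       ->  (int) (__cpu_model.__cpu_vendor == M_AMD)
     __builtin_cpu_is ("haswell")
       ->  (int) (__cpu_model.__cpu_subtype
                    == M_INTEL_COREI7_HASWELL - M_CPU_SUBTYPE_START)
     __builtin_cpu_supports ("avx2")
       ->  (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2))
*/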
32193 static tree
32194 ix86_fold_builtin (tree fndecl, int n_args,
32195 tree *args, bool ignore ATTRIBUTE_UNUSED)
32197 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32199 enum ix86_builtins fn_code = (enum ix86_builtins)
32200 DECL_FUNCTION_CODE (fndecl);
32201 switch (fn_code)
32203 case IX86_BUILTIN_CPU_IS:
32204 case IX86_BUILTIN_CPU_SUPPORTS:
32205 gcc_assert (n_args == 1);
32206 return fold_builtin_cpu (fndecl, args);
32208 case IX86_BUILTIN_NANQ:
32209 case IX86_BUILTIN_NANSQ:
32211 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32212 const char *str = c_getstr (*args);
32213 int quiet = fn_code == IX86_BUILTIN_NANQ;
32214 REAL_VALUE_TYPE real;
32216 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
32217 return build_real (type, real);
32218 return NULL_TREE;
32221 case IX86_BUILTIN_INFQ:
32222 case IX86_BUILTIN_HUGE_VALQ:
32224 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32225 REAL_VALUE_TYPE inf;
32226 real_inf (&inf);
32227 return build_real (type, inf);
32230 case IX86_BUILTIN_TZCNT16:
32231 case IX86_BUILTIN_CTZS:
32232 case IX86_BUILTIN_TZCNT32:
32233 case IX86_BUILTIN_TZCNT64:
32234 gcc_assert (n_args == 1);
32235 if (TREE_CODE (args[0]) == INTEGER_CST)
32237 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32238 tree arg = args[0];
32239 if (fn_code == IX86_BUILTIN_TZCNT16
32240 || fn_code == IX86_BUILTIN_CTZS)
32241 arg = fold_convert (short_unsigned_type_node, arg);
32242 if (integer_zerop (arg))
32243 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32244 else
32245 return fold_const_call (CFN_CTZ, type, arg);
32247 break;
32249 case IX86_BUILTIN_LZCNT16:
32250 case IX86_BUILTIN_CLZS:
32251 case IX86_BUILTIN_LZCNT32:
32252 case IX86_BUILTIN_LZCNT64:
32253 gcc_assert (n_args == 1);
32254 if (TREE_CODE (args[0]) == INTEGER_CST)
32256 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32257 tree arg = args[0];
32258 if (fn_code == IX86_BUILTIN_LZCNT16
32259 || fn_code == IX86_BUILTIN_CLZS)
32260 arg = fold_convert (short_unsigned_type_node, arg);
32261 if (integer_zerop (arg))
32262 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32263 else
32264 return fold_const_call (CFN_CLZ, type, arg);
32266 break;
32268 case IX86_BUILTIN_BEXTR32:
32269 case IX86_BUILTIN_BEXTR64:
32270 case IX86_BUILTIN_BEXTRI32:
32271 case IX86_BUILTIN_BEXTRI64:
32272 gcc_assert (n_args == 2);
32273 if (tree_fits_uhwi_p (args[1]))
32275 unsigned HOST_WIDE_INT res = 0;
32276 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
32277 unsigned int start = tree_to_uhwi (args[1]);
32278 unsigned int len = (start & 0xff00) >> 8;
32279 start &= 0xff;
32280 if (start >= prec || len == 0)
32281 res = 0;
32282 else if (!tree_fits_uhwi_p (args[0]))
32283 break;
32284 else
32285 res = tree_to_uhwi (args[0]) >> start;
32286 if (len > prec)
32287 len = prec;
32288 if (len < HOST_BITS_PER_WIDE_INT)
32289 res &= (HOST_WIDE_INT_1U << len) - 1;
32290 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32292 break;
32294 case IX86_BUILTIN_BZHI32:
32295 case IX86_BUILTIN_BZHI64:
32296 gcc_assert (n_args == 2);
32297 if (tree_fits_uhwi_p (args[1]))
32299 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
32300 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
32301 return args[0];
32302 if (!tree_fits_uhwi_p (args[0]))
32303 break;
32304 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
32305 res &= ~(HOST_WIDE_INT_M1U << idx);
32306 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32308 break;
32310 case IX86_BUILTIN_PDEP32:
32311 case IX86_BUILTIN_PDEP64:
32312 gcc_assert (n_args == 2);
32313 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32315 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32316 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32317 unsigned HOST_WIDE_INT res = 0;
32318 unsigned HOST_WIDE_INT m, k = 1;
32319 for (m = 1; m; m <<= 1)
32320 if ((mask & m) != 0)
32322 if ((src & k) != 0)
32323 res |= m;
32324 k <<= 1;
32326 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32328 break;
32330 case IX86_BUILTIN_PEXT32:
32331 case IX86_BUILTIN_PEXT64:
32332 gcc_assert (n_args == 2);
32333 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32335 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32336 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32337 unsigned HOST_WIDE_INT res = 0;
32338 unsigned HOST_WIDE_INT m, k = 1;
32339 for (m = 1; m; m <<= 1)
32340 if ((mask & m) != 0)
32342 if ((src & m) != 0)
32343 res |= k;
32344 k <<= 1;
32346 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32348 break;
32350 default:
32351 break;
32355 #ifdef SUBTARGET_FOLD_BUILTIN
32356 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
32357 #endif
32359 return NULL_TREE;
32362 /* Fold an MD builtin (use ix86_fold_builtin for folding into
32363 constant) in GIMPLE. */
32365 bool
32366 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
32368 gimple *stmt = gsi_stmt (*gsi);
32369 tree fndecl = gimple_call_fndecl (stmt);
32370 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
32371 int n_args = gimple_call_num_args (stmt);
32372 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
32373 tree decl = NULL_TREE;
32374 tree arg0, arg1;
32376 switch (fn_code)
32378 case IX86_BUILTIN_TZCNT32:
32379 decl = builtin_decl_implicit (BUILT_IN_CTZ);
32380 goto fold_tzcnt_lzcnt;
32382 case IX86_BUILTIN_TZCNT64:
32383 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
32384 goto fold_tzcnt_lzcnt;
32386 case IX86_BUILTIN_LZCNT32:
32387 decl = builtin_decl_implicit (BUILT_IN_CLZ);
32388 goto fold_tzcnt_lzcnt;
32390 case IX86_BUILTIN_LZCNT64:
32391 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
32392 goto fold_tzcnt_lzcnt;
32394 fold_tzcnt_lzcnt:
32395 gcc_assert (n_args == 1);
32396 arg0 = gimple_call_arg (stmt, 0);
32397 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
32399 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
32400 /* If arg0 is provably non-zero, optimize into the generic
32401 __builtin_c[tl]z{,ll} functions, which the middle-end
32402 handles better.  */
32403 if (!expr_not_equal_to (arg0, wi::zero (prec)))
32404 return false;
32406 location_t loc = gimple_location (stmt);
32407 gimple *g = gimple_build_call (decl, 1, arg0);
32408 gimple_set_location (g, loc);
32409 tree lhs = make_ssa_name (integer_type_node);
32410 gimple_call_set_lhs (g, lhs);
32411 gsi_insert_before (gsi, g, GSI_SAME_STMT);
32412 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
32413 gimple_set_location (g, loc);
32414 gsi_replace (gsi, g, false);
32415 return true;
32417 break;
32419 case IX86_BUILTIN_BZHI32:
32420 case IX86_BUILTIN_BZHI64:
32421 gcc_assert (n_args == 2);
32422 arg1 = gimple_call_arg (stmt, 1);
32423 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
32425 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
32426 arg0 = gimple_call_arg (stmt, 0);
32427 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
32428 break;
32429 location_t loc = gimple_location (stmt);
32430 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32431 gimple_set_location (g, loc);
32432 gsi_replace (gsi, g, false);
32433 return true;
32435 break;
32437 case IX86_BUILTIN_PDEP32:
32438 case IX86_BUILTIN_PDEP64:
32439 case IX86_BUILTIN_PEXT32:
32440 case IX86_BUILTIN_PEXT64:
32441 gcc_assert (n_args == 2);
32442 arg1 = gimple_call_arg (stmt, 1);
32443 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
32445 location_t loc = gimple_location (stmt);
32446 arg0 = gimple_call_arg (stmt, 0);
32447 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32448 gimple_set_location (g, loc);
32449 gsi_replace (gsi, g, false);
32450 return true;
32452 break;
32454 default:
32455 break;
32458 return false;
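/* Illustrative example (not part of the original file): if X is known to
   be non-zero (e.g. from value-range information), a call to the
   IX86_BUILTIN_TZCNT32 builtin such as

     n = __builtin_ia32_tzcnt_u32 (x);

   is rewritten above into the generic

     n = __builtin_ctz (x);

   which later passes understand and optimize better.  */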
32461 /* Make builtins to detect cpu type and features supported. NAME is
32462 the builtin name, CODE is the builtin code, and FTYPE is the function
32463 type of the builtin. */
32465 static void
32466 make_cpu_type_builtin (const char* name, int code,
32467 enum ix86_builtin_func_type ftype, bool is_const)
32469 tree decl;
32470 tree type;
32472 type = ix86_get_builtin_func_type (ftype);
32473 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32474 NULL, NULL_TREE);
32475 gcc_assert (decl != NULL_TREE);
32476 ix86_builtins[(int) code] = decl;
32477 TREE_READONLY (decl) = is_const;
32480 /* Make builtins to get CPU type and features supported. The created
32481 builtins are:
32483 __builtin_cpu_init (), to detect cpu type and features,
32484 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
32485 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
32488 static void
32489 ix86_init_platform_type_builtins (void)
32491 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
32492 INT_FTYPE_VOID, false);
32493 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
32494 INT_FTYPE_PCCHAR, true);
32495 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
32496 INT_FTYPE_PCCHAR, true);
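/* Illustrative use of the builtins registered above:

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("amdfam10h"))
       ...
     if (__builtin_cpu_supports ("ssse3"))
       ...

   __builtin_cpu_is and __builtin_cpu_supports are registered as const,
   so repeated tests of the same string can be CSEd; __builtin_cpu_init
   is not.  */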
32499 /* Internal method for ix86_init_builtins. */
32501 static void
32502 ix86_init_builtins_va_builtins_abi (void)
32504 tree ms_va_ref, sysv_va_ref;
32505 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
32506 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
32507 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
32508 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
32510 if (!TARGET_64BIT)
32511 return;
32512 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
32513 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
32514 ms_va_ref = build_reference_type (ms_va_list_type_node);
32515 sysv_va_ref =
32516 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
32518 fnvoid_va_end_ms =
32519 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32520 fnvoid_va_start_ms =
32521 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32522 fnvoid_va_end_sysv =
32523 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
32524 fnvoid_va_start_sysv =
32525 build_varargs_function_type_list (void_type_node, sysv_va_ref,
32526 NULL_TREE);
32527 fnvoid_va_copy_ms =
32528 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
32529 NULL_TREE);
32530 fnvoid_va_copy_sysv =
32531 build_function_type_list (void_type_node, sysv_va_ref,
32532 sysv_va_ref, NULL_TREE);
32534 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
32535 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
32536 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
32537 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
32538 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
32539 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
32540 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
32541 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32542 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
32543 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32544 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
32545 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
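/* Illustrative use of the builtins registered above: a 64-bit function
   declared with the ms_abi attribute handles varargs through the MS
   variants, roughly

     void __attribute__ ((ms_abi)) f (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       ...
       __builtin_ms_va_end (ap);
     }

   while a sysv_abi function in MS-ABI code uses the __builtin_sysv_va_*
   counterparts.  */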
32548 static void
32549 ix86_init_builtin_types (void)
32551 tree float80_type_node, const_string_type_node;
32553 /* The __float80 type. */
32554 float80_type_node = long_double_type_node;
32555 if (TYPE_MODE (float80_type_node) != XFmode)
32557 if (float64x_type_node != NULL_TREE
32558 && TYPE_MODE (float64x_type_node) == XFmode)
32559 float80_type_node = float64x_type_node;
32560 else
32562 /* The __float80 type. */
32563 float80_type_node = make_node (REAL_TYPE);
32565 TYPE_PRECISION (float80_type_node) = 80;
32566 layout_type (float80_type_node);
32569 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
32571 /* The __float128 type. The node has already been created as
32572 _Float128, so we only need to register the __float128 name for
32573 it. */
32574 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
32576 const_string_type_node
32577 = build_pointer_type (build_qualified_type
32578 (char_type_node, TYPE_QUAL_CONST));
32580 /* This macro is built by i386-builtin-types.awk. */
32581 DEFINE_BUILTIN_PRIMITIVE_TYPES;
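/* Illustrative note: after the registrations above, user code can write
   for instance

     __float80 x = 1.0w;    (XFmode; normally the same type as long double)
     __float128 y = 1.0q;   (TFmode; the same node as _Float128)

   using the i386-specific 'w' and 'q' constant suffixes.  */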
32584 static void
32585 ix86_init_builtins (void)
32587 tree ftype, decl;
32589 ix86_init_builtin_types ();
32591 /* Builtins to get CPU type and features. */
32592 ix86_init_platform_type_builtins ();
32594 /* TFmode support builtins. */
32595 def_builtin_const (0, "__builtin_infq",
32596 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
32597 def_builtin_const (0, "__builtin_huge_valq",
32598 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
32600 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
32601 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
32602 BUILT_IN_MD, "nanq", NULL_TREE);
32603 TREE_READONLY (decl) = 1;
32604 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
32606 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
32607 BUILT_IN_MD, "nansq", NULL_TREE);
32608 TREE_READONLY (decl) = 1;
32609 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
32611 /* We will expand these to normal calls if SSE isn't available, since
32612 they are used by libgcc. */
32613 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
32614 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
32615 BUILT_IN_MD, "__fabstf2", NULL_TREE);
32616 TREE_READONLY (decl) = 1;
32617 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
32619 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
32620 decl = add_builtin_function ("__builtin_copysignq", ftype,
32621 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
32622 "__copysigntf3", NULL_TREE);
32623 TREE_READONLY (decl) = 1;
32624 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
32626 ix86_init_tm_builtins ();
32627 ix86_init_mmx_sse_builtins ();
32628 ix86_init_mpx_builtins ();
32630 if (TARGET_LP64)
32631 ix86_init_builtins_va_builtins_abi ();
32633 #ifdef SUBTARGET_INIT_BUILTINS
32634 SUBTARGET_INIT_BUILTINS;
32635 #endif
32638 /* Return the ix86 builtin for CODE. */
32640 static tree
32641 ix86_builtin_decl (unsigned code, bool)
32643 if (code >= IX86_BUILTIN_MAX)
32644 return error_mark_node;
32646 return ix86_builtins[code];
32649 /* Errors in the source file can cause expand_expr to return const0_rtx
32650 where we expect a vector. To avoid crashing, use one of the vector
32651 clear instructions. */
32652 static rtx
32653 safe_vector_operand (rtx x, machine_mode mode)
32655 if (x == const0_rtx)
32656 x = CONST0_RTX (mode);
32657 return x;
32660 /* Fixup modeless constants to fit required mode. */
32661 static rtx
32662 fixup_modeless_constant (rtx x, machine_mode mode)
32664 if (GET_MODE (x) == VOIDmode)
32665 x = convert_to_mode (mode, x, 1);
32666 return x;
32669 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
32671 static rtx
32672 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
32674 rtx pat;
32675 tree arg0 = CALL_EXPR_ARG (exp, 0);
32676 tree arg1 = CALL_EXPR_ARG (exp, 1);
32677 rtx op0 = expand_normal (arg0);
32678 rtx op1 = expand_normal (arg1);
32679 machine_mode tmode = insn_data[icode].operand[0].mode;
32680 machine_mode mode0 = insn_data[icode].operand[1].mode;
32681 machine_mode mode1 = insn_data[icode].operand[2].mode;
32683 if (VECTOR_MODE_P (mode0))
32684 op0 = safe_vector_operand (op0, mode0);
32685 if (VECTOR_MODE_P (mode1))
32686 op1 = safe_vector_operand (op1, mode1);
32688 if (optimize || !target
32689 || GET_MODE (target) != tmode
32690 || !insn_data[icode].operand[0].predicate (target, tmode))
32691 target = gen_reg_rtx (tmode);
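/* The instruction wants a TImode operand, but the builtin's argument
   expanded to an SImode value: load it into a V4SImode register with
   sse2_loadd and use the TImode lowpart of that register.  */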
32693 if (GET_MODE (op1) == SImode && mode1 == TImode)
32695 rtx x = gen_reg_rtx (V4SImode);
32696 emit_insn (gen_sse2_loadd (x, op1));
32697 op1 = gen_lowpart (TImode, x);
32700 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32701 op0 = copy_to_mode_reg (mode0, op0);
32702 if (!insn_data[icode].operand[2].predicate (op1, mode1))
32703 op1 = copy_to_mode_reg (mode1, op1);
32705 pat = GEN_FCN (icode) (target, op0, op1);
32706 if (! pat)
32707 return 0;
32709 emit_insn (pat);
32711 return target;
32714 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
32716 static rtx
32717 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
32718 enum ix86_builtin_func_type m_type,
32719 enum rtx_code sub_code)
32721 rtx pat;
32722 int i;
32723 int nargs;
32724 bool comparison_p = false;
32725 bool tf_p = false;
32726 bool last_arg_constant = false;
32727 int num_memory = 0;
32728 struct {
32729 rtx op;
32730 machine_mode mode;
32731 } args[4];
32733 machine_mode tmode = insn_data[icode].operand[0].mode;
32735 switch (m_type)
32737 case MULTI_ARG_4_DF2_DI_I:
32738 case MULTI_ARG_4_DF2_DI_I1:
32739 case MULTI_ARG_4_SF2_SI_I:
32740 case MULTI_ARG_4_SF2_SI_I1:
32741 nargs = 4;
32742 last_arg_constant = true;
32743 break;
32745 case MULTI_ARG_3_SF:
32746 case MULTI_ARG_3_DF:
32747 case MULTI_ARG_3_SF2:
32748 case MULTI_ARG_3_DF2:
32749 case MULTI_ARG_3_DI:
32750 case MULTI_ARG_3_SI:
32751 case MULTI_ARG_3_SI_DI:
32752 case MULTI_ARG_3_HI:
32753 case MULTI_ARG_3_HI_SI:
32754 case MULTI_ARG_3_QI:
32755 case MULTI_ARG_3_DI2:
32756 case MULTI_ARG_3_SI2:
32757 case MULTI_ARG_3_HI2:
32758 case MULTI_ARG_3_QI2:
32759 nargs = 3;
32760 break;
32762 case MULTI_ARG_2_SF:
32763 case MULTI_ARG_2_DF:
32764 case MULTI_ARG_2_DI:
32765 case MULTI_ARG_2_SI:
32766 case MULTI_ARG_2_HI:
32767 case MULTI_ARG_2_QI:
32768 nargs = 2;
32769 break;
32771 case MULTI_ARG_2_DI_IMM:
32772 case MULTI_ARG_2_SI_IMM:
32773 case MULTI_ARG_2_HI_IMM:
32774 case MULTI_ARG_2_QI_IMM:
32775 nargs = 2;
32776 last_arg_constant = true;
32777 break;
32779 case MULTI_ARG_1_SF:
32780 case MULTI_ARG_1_DF:
32781 case MULTI_ARG_1_SF2:
32782 case MULTI_ARG_1_DF2:
32783 case MULTI_ARG_1_DI:
32784 case MULTI_ARG_1_SI:
32785 case MULTI_ARG_1_HI:
32786 case MULTI_ARG_1_QI:
32787 case MULTI_ARG_1_SI_DI:
32788 case MULTI_ARG_1_HI_DI:
32789 case MULTI_ARG_1_HI_SI:
32790 case MULTI_ARG_1_QI_DI:
32791 case MULTI_ARG_1_QI_SI:
32792 case MULTI_ARG_1_QI_HI:
32793 nargs = 1;
32794 break;
32796 case MULTI_ARG_2_DI_CMP:
32797 case MULTI_ARG_2_SI_CMP:
32798 case MULTI_ARG_2_HI_CMP:
32799 case MULTI_ARG_2_QI_CMP:
32800 nargs = 2;
32801 comparison_p = true;
32802 break;
32804 case MULTI_ARG_2_SF_TF:
32805 case MULTI_ARG_2_DF_TF:
32806 case MULTI_ARG_2_DI_TF:
32807 case MULTI_ARG_2_SI_TF:
32808 case MULTI_ARG_2_HI_TF:
32809 case MULTI_ARG_2_QI_TF:
32810 nargs = 2;
32811 tf_p = true;
32812 break;
32814 default:
32815 gcc_unreachable ();
32818 if (optimize || !target
32819 || GET_MODE (target) != tmode
32820 || !insn_data[icode].operand[0].predicate (target, tmode))
32821 target = gen_reg_rtx (tmode);
32822 else if (memory_operand (target, tmode))
32823 num_memory++;
32825 gcc_assert (nargs <= 4);
32827 for (i = 0; i < nargs; i++)
32829 tree arg = CALL_EXPR_ARG (exp, i);
32830 rtx op = expand_normal (arg);
32831 int adjust = (comparison_p) ? 1 : 0;
32832 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
32834 if (last_arg_constant && i == nargs - 1)
32836 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
32838 enum insn_code new_icode = icode;
32839 switch (icode)
32841 case CODE_FOR_xop_vpermil2v2df3:
32842 case CODE_FOR_xop_vpermil2v4sf3:
32843 case CODE_FOR_xop_vpermil2v4df3:
32844 case CODE_FOR_xop_vpermil2v8sf3:
32845 error ("the last argument must be a 2-bit immediate");
32846 return gen_reg_rtx (tmode);
32847 case CODE_FOR_xop_rotlv2di3:
32848 new_icode = CODE_FOR_rotlv2di3;
32849 goto xop_rotl;
32850 case CODE_FOR_xop_rotlv4si3:
32851 new_icode = CODE_FOR_rotlv4si3;
32852 goto xop_rotl;
32853 case CODE_FOR_xop_rotlv8hi3:
32854 new_icode = CODE_FOR_rotlv8hi3;
32855 goto xop_rotl;
32856 case CODE_FOR_xop_rotlv16qi3:
32857 new_icode = CODE_FOR_rotlv16qi3;
32858 xop_rotl:
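/* For the XOP rotates an out-of-range constant count can simply be
   masked down to the element width.  A variable count is handled by
   switching to the corresponding generic rotate pattern and treating
   the operand as non-constant.  */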
32859 if (CONST_INT_P (op))
32861 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
32862 op = GEN_INT (INTVAL (op) & mask);
32863 gcc_checking_assert
32864 (insn_data[icode].operand[i + 1].predicate (op, mode));
32866 else
32868 gcc_checking_assert
32869 (nargs == 2
32870 && insn_data[new_icode].operand[0].mode == tmode
32871 && insn_data[new_icode].operand[1].mode == tmode
32872 && insn_data[new_icode].operand[2].mode == mode
32873 && insn_data[new_icode].operand[0].predicate
32874 == insn_data[icode].operand[0].predicate
32875 && insn_data[new_icode].operand[1].predicate
32876 == insn_data[icode].operand[1].predicate);
32877 icode = new_icode;
32878 goto non_constant;
32880 break;
32881 default:
32882 gcc_unreachable ();
32886 else
32888 non_constant:
32889 if (VECTOR_MODE_P (mode))
32890 op = safe_vector_operand (op, mode);
32892 /* If we aren't optimizing, only allow one memory operand to be
32893 generated. */
32894 if (memory_operand (op, mode))
32895 num_memory++;
32897 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
32899 if (optimize
32900 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
32901 || num_memory > 1)
32902 op = force_reg (mode, op);
32905 args[i].op = op;
32906 args[i].mode = mode;
32909 switch (nargs)
32911 case 1:
32912 pat = GEN_FCN (icode) (target, args[0].op);
32913 break;
32915 case 2:
32916 if (tf_p)
32917 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
32918 GEN_INT ((int)sub_code));
32919 else if (! comparison_p)
32920 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32921 else
32923 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
32924 args[0].op,
32925 args[1].op);
32927 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
32929 break;
32931 case 3:
32932 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32933 break;
32935 case 4:
32936 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
32937 break;
32939 default:
32940 gcc_unreachable ();
32943 if (! pat)
32944 return 0;
32946 emit_insn (pat);
32947 return target;
32950 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
32951 insns with vec_merge. */
32953 static rtx
32954 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
32955 rtx target)
32957 rtx pat;
32958 tree arg0 = CALL_EXPR_ARG (exp, 0);
32959 rtx op1, op0 = expand_normal (arg0);
32960 machine_mode tmode = insn_data[icode].operand[0].mode;
32961 machine_mode mode0 = insn_data[icode].operand[1].mode;
32963 if (optimize || !target
32964 || GET_MODE (target) != tmode
32965 || !insn_data[icode].operand[0].predicate (target, tmode))
32966 target = gen_reg_rtx (tmode);
32968 if (VECTOR_MODE_P (mode0))
32969 op0 = safe_vector_operand (op0, mode0);
32971 if ((optimize && !register_operand (op0, mode0))
32972 || !insn_data[icode].operand[1].predicate (op0, mode0))
32973 op0 = copy_to_mode_reg (mode0, op0);
32975 op1 = op0;
32976 if (!insn_data[icode].operand[2].predicate (op1, mode0))
32977 op1 = copy_to_mode_reg (mode0, op1);
32979 pat = GEN_FCN (icode) (target, op0, op1);
32980 if (! pat)
32981 return 0;
32982 emit_insn (pat);
32983 return target;
32986 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
32988 static rtx
32989 ix86_expand_sse_compare (const struct builtin_description *d,
32990 tree exp, rtx target, bool swap)
32992 rtx pat;
32993 tree arg0 = CALL_EXPR_ARG (exp, 0);
32994 tree arg1 = CALL_EXPR_ARG (exp, 1);
32995 rtx op0 = expand_normal (arg0);
32996 rtx op1 = expand_normal (arg1);
32997 rtx op2;
32998 machine_mode tmode = insn_data[d->icode].operand[0].mode;
32999 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33000 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33001 enum rtx_code comparison = d->comparison;
33003 if (VECTOR_MODE_P (mode0))
33004 op0 = safe_vector_operand (op0, mode0);
33005 if (VECTOR_MODE_P (mode1))
33006 op1 = safe_vector_operand (op1, mode1);
33008 /* Swap operands if we have a comparison that isn't available in
33009 hardware. */
33010 if (swap)
33011 std::swap (op0, op1);
33013 if (optimize || !target
33014 || GET_MODE (target) != tmode
33015 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33016 target = gen_reg_rtx (tmode);
33018 if ((optimize && !register_operand (op0, mode0))
33019 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33020 op0 = copy_to_mode_reg (mode0, op0);
33021 if ((optimize && !register_operand (op1, mode1))
33022 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33023 op1 = copy_to_mode_reg (mode1, op1);
33025 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33026 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33027 if (! pat)
33028 return 0;
33029 emit_insn (pat);
33030 return target;
33033 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
33035 static rtx
33036 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33037 rtx target)
33039 rtx pat;
33040 tree arg0 = CALL_EXPR_ARG (exp, 0);
33041 tree arg1 = CALL_EXPR_ARG (exp, 1);
33042 rtx op0 = expand_normal (arg0);
33043 rtx op1 = expand_normal (arg1);
33044 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33045 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33046 enum rtx_code comparison = d->comparison;
33048 if (VECTOR_MODE_P (mode0))
33049 op0 = safe_vector_operand (op0, mode0);
33050 if (VECTOR_MODE_P (mode1))
33051 op1 = safe_vector_operand (op1, mode1);
33053 /* Swap operands if we have a comparison that isn't available in
33054 hardware. */
33055 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33056 std::swap (op0, op1);
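/* The comparison instruction only sets the flags.  Materialize the
   boolean result by zeroing an SImode pseudo, storing the flag test
   into its low QImode part, and returning the SImode pseudo, so the
   value is already zero-extended.  */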
33058 target = gen_reg_rtx (SImode);
33059 emit_move_insn (target, const0_rtx);
33060 target = gen_rtx_SUBREG (QImode, target, 0);
33062 if ((optimize && !register_operand (op0, mode0))
33063 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33064 op0 = copy_to_mode_reg (mode0, op0);
33065 if ((optimize && !register_operand (op1, mode1))
33066 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33067 op1 = copy_to_mode_reg (mode1, op1);
33069 pat = GEN_FCN (d->icode) (op0, op1);
33070 if (! pat)
33071 return 0;
33072 emit_insn (pat);
33073 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33074 gen_rtx_fmt_ee (comparison, QImode,
33075 SET_DEST (pat),
33076 const0_rtx)));
33078 return SUBREG_REG (target);
33081 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
33083 static rtx
33084 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33085 rtx target)
33087 rtx pat;
33088 tree arg0 = CALL_EXPR_ARG (exp, 0);
33089 rtx op1, op0 = expand_normal (arg0);
33090 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33091 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33093 if (optimize || target == 0
33094 || GET_MODE (target) != tmode
33095 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33096 target = gen_reg_rtx (tmode);
33098 if (VECTOR_MODE_P (mode0))
33099 op0 = safe_vector_operand (op0, mode0);
33101 if ((optimize && !register_operand (op0, mode0))
33102 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33103 op0 = copy_to_mode_reg (mode0, op0);
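/* For the ROUND builtins the descriptor's comparison field is reused
   to carry the rounding-control immediate rather than an rtx code.  */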
33105 op1 = GEN_INT (d->comparison);
33107 pat = GEN_FCN (d->icode) (target, op0, op1);
33108 if (! pat)
33109 return 0;
33110 emit_insn (pat);
33111 return target;
33114 static rtx
33115 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
33116 tree exp, rtx target)
33118 rtx pat;
33119 tree arg0 = CALL_EXPR_ARG (exp, 0);
33120 tree arg1 = CALL_EXPR_ARG (exp, 1);
33121 rtx op0 = expand_normal (arg0);
33122 rtx op1 = expand_normal (arg1);
33123 rtx op2;
33124 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33125 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33126 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33128 if (optimize || target == 0
33129 || GET_MODE (target) != tmode
33130 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33131 target = gen_reg_rtx (tmode);
33133 op0 = safe_vector_operand (op0, mode0);
33134 op1 = safe_vector_operand (op1, mode1);
33136 if ((optimize && !register_operand (op0, mode0))
33137 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33138 op0 = copy_to_mode_reg (mode0, op0);
33139 if ((optimize && !register_operand (op1, mode1))
33140 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33141 op1 = copy_to_mode_reg (mode1, op1);
33143 op2 = GEN_INT (d->comparison);
33145 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33146 if (! pat)
33147 return 0;
33148 emit_insn (pat);
33149 return target;
33152 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
33154 static rtx
33155 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
33156 rtx target)
33158 rtx pat;
33159 tree arg0 = CALL_EXPR_ARG (exp, 0);
33160 tree arg1 = CALL_EXPR_ARG (exp, 1);
33161 rtx op0 = expand_normal (arg0);
33162 rtx op1 = expand_normal (arg1);
33163 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33164 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33165 enum rtx_code comparison = d->comparison;
33167 if (VECTOR_MODE_P (mode0))
33168 op0 = safe_vector_operand (op0, mode0);
33169 if (VECTOR_MODE_P (mode1))
33170 op1 = safe_vector_operand (op1, mode1);
33172 target = gen_reg_rtx (SImode);
33173 emit_move_insn (target, const0_rtx);
33174 target = gen_rtx_SUBREG (QImode, target, 0);
33176 if ((optimize && !register_operand (op0, mode0))
33177 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33178 op0 = copy_to_mode_reg (mode0, op0);
33179 if ((optimize && !register_operand (op1, mode1))
33180 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33181 op1 = copy_to_mode_reg (mode1, op1);
33183 pat = GEN_FCN (d->icode) (op0, op1);
33184 if (! pat)
33185 return 0;
33186 emit_insn (pat);
33187 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33188 gen_rtx_fmt_ee (comparison, QImode,
33189 SET_DEST (pat),
33190 const0_rtx)));
33192 return SUBREG_REG (target);
33195 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
33197 static rtx
33198 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
33199 tree exp, rtx target)
33201 rtx pat;
33202 tree arg0 = CALL_EXPR_ARG (exp, 0);
33203 tree arg1 = CALL_EXPR_ARG (exp, 1);
33204 tree arg2 = CALL_EXPR_ARG (exp, 2);
33205 tree arg3 = CALL_EXPR_ARG (exp, 3);
33206 tree arg4 = CALL_EXPR_ARG (exp, 4);
33207 rtx scratch0, scratch1;
33208 rtx op0 = expand_normal (arg0);
33209 rtx op1 = expand_normal (arg1);
33210 rtx op2 = expand_normal (arg2);
33211 rtx op3 = expand_normal (arg3);
33212 rtx op4 = expand_normal (arg4);
33213 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
33215 tmode0 = insn_data[d->icode].operand[0].mode;
33216 tmode1 = insn_data[d->icode].operand[1].mode;
33217 modev2 = insn_data[d->icode].operand[2].mode;
33218 modei3 = insn_data[d->icode].operand[3].mode;
33219 modev4 = insn_data[d->icode].operand[4].mode;
33220 modei5 = insn_data[d->icode].operand[5].mode;
33221 modeimm = insn_data[d->icode].operand[6].mode;
33223 if (VECTOR_MODE_P (modev2))
33224 op0 = safe_vector_operand (op0, modev2);
33225 if (VECTOR_MODE_P (modev4))
33226 op2 = safe_vector_operand (op2, modev4);
33228 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33229 op0 = copy_to_mode_reg (modev2, op0);
33230 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
33231 op1 = copy_to_mode_reg (modei3, op1);
33232 if ((optimize && !register_operand (op2, modev4))
33233 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
33234 op2 = copy_to_mode_reg (modev4, op2);
33235 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
33236 op3 = copy_to_mode_reg (modei5, op3);
33238 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
33240 error ("the fifth argument must be an 8-bit immediate");
33241 return const0_rtx;
33244 if (d->code == IX86_BUILTIN_PCMPESTRI128)
33246 if (optimize || !target
33247 || GET_MODE (target) != tmode0
33248 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33249 target = gen_reg_rtx (tmode0);
33251 scratch1 = gen_reg_rtx (tmode1);
33253 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
33255 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
33257 if (optimize || !target
33258 || GET_MODE (target) != tmode1
33259 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33260 target = gen_reg_rtx (tmode1);
33262 scratch0 = gen_reg_rtx (tmode0);
33264 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
33266 else
33268 gcc_assert (d->flag);
33270 scratch0 = gen_reg_rtx (tmode0);
33271 scratch1 = gen_reg_rtx (tmode1);
33273 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
33276 if (! pat)
33277 return 0;
33279 emit_insn (pat);
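/* A nonzero FLAG in the descriptor selects the flag-reading variants;
   FLAG gives the mode in which the flags register is compared against
   zero to extract the requested condition, and the zero-extended result
   of that test is returned.  */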
33281 if (d->flag)
33283 target = gen_reg_rtx (SImode);
33284 emit_move_insn (target, const0_rtx);
33285 target = gen_rtx_SUBREG (QImode, target, 0);
33287 emit_insn
33288 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33289 gen_rtx_fmt_ee (EQ, QImode,
33290 gen_rtx_REG ((machine_mode) d->flag,
33291 FLAGS_REG),
33292 const0_rtx)));
33293 return SUBREG_REG (target);
33295 else
33296 return target;
33300 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
33302 static rtx
33303 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
33304 tree exp, rtx target)
33306 rtx pat;
33307 tree arg0 = CALL_EXPR_ARG (exp, 0);
33308 tree arg1 = CALL_EXPR_ARG (exp, 1);
33309 tree arg2 = CALL_EXPR_ARG (exp, 2);
33310 rtx scratch0, scratch1;
33311 rtx op0 = expand_normal (arg0);
33312 rtx op1 = expand_normal (arg1);
33313 rtx op2 = expand_normal (arg2);
33314 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
33316 tmode0 = insn_data[d->icode].operand[0].mode;
33317 tmode1 = insn_data[d->icode].operand[1].mode;
33318 modev2 = insn_data[d->icode].operand[2].mode;
33319 modev3 = insn_data[d->icode].operand[3].mode;
33320 modeimm = insn_data[d->icode].operand[4].mode;
33322 if (VECTOR_MODE_P (modev2))
33323 op0 = safe_vector_operand (op0, modev2);
33324 if (VECTOR_MODE_P (modev3))
33325 op1 = safe_vector_operand (op1, modev3);
33327 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33328 op0 = copy_to_mode_reg (modev2, op0);
33329 if ((optimize && !register_operand (op1, modev3))
33330 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
33331 op1 = copy_to_mode_reg (modev3, op1);
33333 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
33335 error ("the third argument must be an 8-bit immediate");
33336 return const0_rtx;
33339 if (d->code == IX86_BUILTIN_PCMPISTRI128)
33341 if (optimize || !target
33342 || GET_MODE (target) != tmode0
33343 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33344 target = gen_reg_rtx (tmode0);
33346 scratch1 = gen_reg_rtx (tmode1);
33348 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
33350 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
33352 if (optimize || !target
33353 || GET_MODE (target) != tmode1
33354 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33355 target = gen_reg_rtx (tmode1);
33357 scratch0 = gen_reg_rtx (tmode0);
33359 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
33361 else
33363 gcc_assert (d->flag);
33365 scratch0 = gen_reg_rtx (tmode0);
33366 scratch1 = gen_reg_rtx (tmode1);
33368 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
33371 if (! pat)
33372 return 0;
33374 emit_insn (pat);
33376 if (d->flag)
33378 target = gen_reg_rtx (SImode);
33379 emit_move_insn (target, const0_rtx);
33380 target = gen_rtx_SUBREG (QImode, target, 0);
33382 emit_insn
33383 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33384 gen_rtx_fmt_ee (EQ, QImode,
33385 gen_rtx_REG ((machine_mode) d->flag,
33386 FLAGS_REG),
33387 const0_rtx)));
33388 return SUBREG_REG (target);
33390 else
33391 return target;
33394 /* Subroutine of ix86_expand_builtin to take care of insns with
33395 a variable number of operands. */
33397 static rtx
33398 ix86_expand_args_builtin (const struct builtin_description *d,
33399 tree exp, rtx target)
33401 rtx pat, real_target;
33402 unsigned int i, nargs;
33403 unsigned int nargs_constant = 0;
33404 unsigned int mask_pos = 0;
33405 int num_memory = 0;
33406 struct
33408 rtx op;
33409 machine_mode mode;
33410 } args[6];
33411 bool second_arg_count = false;
33412 enum insn_code icode = d->icode;
33413 const struct insn_data_d *insn_p = &insn_data[icode];
33414 machine_mode tmode = insn_p->operand[0].mode;
33415 machine_mode rmode = VOIDmode;
33416 bool swap = false;
33417 enum rtx_code comparison = d->comparison;
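/* The switch below classifies the builtin's function type.  Some types
   are dispatched to the specialized expanders above; the rest set
   NARGS, how many trailing arguments must be immediates
   (NARGS_CONSTANT), how many trailing merge/mask operands follow those
   immediates (MASK_POS), and whether the second argument is a shift
   count (SECOND_ARG_COUNT).  */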
33419 switch ((enum ix86_builtin_func_type) d->flag)
33421 case V2DF_FTYPE_V2DF_ROUND:
33422 case V4DF_FTYPE_V4DF_ROUND:
33423 case V8DF_FTYPE_V8DF_ROUND:
33424 case V4SF_FTYPE_V4SF_ROUND:
33425 case V8SF_FTYPE_V8SF_ROUND:
33426 case V16SF_FTYPE_V16SF_ROUND:
33427 case V4SI_FTYPE_V4SF_ROUND:
33428 case V8SI_FTYPE_V8SF_ROUND:
33429 case V16SI_FTYPE_V16SF_ROUND:
33430 return ix86_expand_sse_round (d, exp, target);
33431 case V4SI_FTYPE_V2DF_V2DF_ROUND:
33432 case V8SI_FTYPE_V4DF_V4DF_ROUND:
33433 case V16SI_FTYPE_V8DF_V8DF_ROUND:
33434 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
33435 case INT_FTYPE_V8SF_V8SF_PTEST:
33436 case INT_FTYPE_V4DI_V4DI_PTEST:
33437 case INT_FTYPE_V4DF_V4DF_PTEST:
33438 case INT_FTYPE_V4SF_V4SF_PTEST:
33439 case INT_FTYPE_V2DI_V2DI_PTEST:
33440 case INT_FTYPE_V2DF_V2DF_PTEST:
33441 return ix86_expand_sse_ptest (d, exp, target);
33442 case FLOAT128_FTYPE_FLOAT128:
33443 case FLOAT_FTYPE_FLOAT:
33444 case INT_FTYPE_INT:
33445 case UINT_FTYPE_UINT:
33446 case UINT16_FTYPE_UINT16:
33447 case UINT64_FTYPE_INT:
33448 case UINT64_FTYPE_UINT64:
33449 case INT64_FTYPE_INT64:
33450 case INT64_FTYPE_V4SF:
33451 case INT64_FTYPE_V2DF:
33452 case INT_FTYPE_V16QI:
33453 case INT_FTYPE_V8QI:
33454 case INT_FTYPE_V8SF:
33455 case INT_FTYPE_V4DF:
33456 case INT_FTYPE_V4SF:
33457 case INT_FTYPE_V2DF:
33458 case INT_FTYPE_V32QI:
33459 case V16QI_FTYPE_V16QI:
33460 case V8SI_FTYPE_V8SF:
33461 case V8SI_FTYPE_V4SI:
33462 case V8HI_FTYPE_V8HI:
33463 case V8HI_FTYPE_V16QI:
33464 case V8QI_FTYPE_V8QI:
33465 case V8SF_FTYPE_V8SF:
33466 case V8SF_FTYPE_V8SI:
33467 case V8SF_FTYPE_V4SF:
33468 case V8SF_FTYPE_V8HI:
33469 case V4SI_FTYPE_V4SI:
33470 case V4SI_FTYPE_V16QI:
33471 case V4SI_FTYPE_V4SF:
33472 case V4SI_FTYPE_V8SI:
33473 case V4SI_FTYPE_V8HI:
33474 case V4SI_FTYPE_V4DF:
33475 case V4SI_FTYPE_V2DF:
33476 case V4HI_FTYPE_V4HI:
33477 case V4DF_FTYPE_V4DF:
33478 case V4DF_FTYPE_V4SI:
33479 case V4DF_FTYPE_V4SF:
33480 case V4DF_FTYPE_V2DF:
33481 case V4SF_FTYPE_V4SF:
33482 case V4SF_FTYPE_V4SI:
33483 case V4SF_FTYPE_V8SF:
33484 case V4SF_FTYPE_V4DF:
33485 case V4SF_FTYPE_V8HI:
33486 case V4SF_FTYPE_V2DF:
33487 case V2DI_FTYPE_V2DI:
33488 case V2DI_FTYPE_V16QI:
33489 case V2DI_FTYPE_V8HI:
33490 case V2DI_FTYPE_V4SI:
33491 case V2DF_FTYPE_V2DF:
33492 case V2DF_FTYPE_V4SI:
33493 case V2DF_FTYPE_V4DF:
33494 case V2DF_FTYPE_V4SF:
33495 case V2DF_FTYPE_V2SI:
33496 case V2SI_FTYPE_V2SI:
33497 case V2SI_FTYPE_V4SF:
33498 case V2SI_FTYPE_V2SF:
33499 case V2SI_FTYPE_V2DF:
33500 case V2SF_FTYPE_V2SF:
33501 case V2SF_FTYPE_V2SI:
33502 case V32QI_FTYPE_V32QI:
33503 case V32QI_FTYPE_V16QI:
33504 case V16HI_FTYPE_V16HI:
33505 case V16HI_FTYPE_V8HI:
33506 case V8SI_FTYPE_V8SI:
33507 case V16HI_FTYPE_V16QI:
33508 case V8SI_FTYPE_V16QI:
33509 case V4DI_FTYPE_V16QI:
33510 case V8SI_FTYPE_V8HI:
33511 case V4DI_FTYPE_V8HI:
33512 case V4DI_FTYPE_V4SI:
33513 case V4DI_FTYPE_V2DI:
33514 case UQI_FTYPE_UQI:
33515 case UHI_FTYPE_UHI:
33516 case USI_FTYPE_USI:
33517 case USI_FTYPE_UQI:
33518 case USI_FTYPE_UHI:
33519 case UDI_FTYPE_UDI:
33520 case UHI_FTYPE_V16QI:
33521 case USI_FTYPE_V32QI:
33522 case UDI_FTYPE_V64QI:
33523 case V16QI_FTYPE_UHI:
33524 case V32QI_FTYPE_USI:
33525 case V64QI_FTYPE_UDI:
33526 case V8HI_FTYPE_UQI:
33527 case V16HI_FTYPE_UHI:
33528 case V32HI_FTYPE_USI:
33529 case V4SI_FTYPE_UQI:
33530 case V8SI_FTYPE_UQI:
33531 case V4SI_FTYPE_UHI:
33532 case V8SI_FTYPE_UHI:
33533 case UQI_FTYPE_V8HI:
33534 case UHI_FTYPE_V16HI:
33535 case USI_FTYPE_V32HI:
33536 case UQI_FTYPE_V4SI:
33537 case UQI_FTYPE_V8SI:
33538 case UHI_FTYPE_V16SI:
33539 case UQI_FTYPE_V2DI:
33540 case UQI_FTYPE_V4DI:
33541 case UQI_FTYPE_V8DI:
33542 case V16SI_FTYPE_UHI:
33543 case V2DI_FTYPE_UQI:
33544 case V4DI_FTYPE_UQI:
33545 case V16SI_FTYPE_INT:
33546 case V16SF_FTYPE_V8SF:
33547 case V16SI_FTYPE_V8SI:
33548 case V16SF_FTYPE_V4SF:
33549 case V16SI_FTYPE_V4SI:
33550 case V16SI_FTYPE_V16SF:
33551 case V16SI_FTYPE_V16SI:
33552 case V64QI_FTYPE_V64QI:
33553 case V32HI_FTYPE_V32HI:
33554 case V16SF_FTYPE_V16SF:
33555 case V8DI_FTYPE_UQI:
33556 case V8DI_FTYPE_V8DI:
33557 case V8DF_FTYPE_V4DF:
33558 case V8DF_FTYPE_V2DF:
33559 case V8DF_FTYPE_V8DF:
33560 case V4DI_FTYPE_V4DI:
33561 nargs = 1;
33562 break;
33563 case V4SF_FTYPE_V4SF_VEC_MERGE:
33564 case V2DF_FTYPE_V2DF_VEC_MERGE:
33565 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
33566 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
33567 case V16QI_FTYPE_V16QI_V16QI:
33568 case V16QI_FTYPE_V8HI_V8HI:
33569 case V16SF_FTYPE_V16SF_V16SF:
33570 case V8QI_FTYPE_V8QI_V8QI:
33571 case V8QI_FTYPE_V4HI_V4HI:
33572 case V8HI_FTYPE_V8HI_V8HI:
33573 case V8HI_FTYPE_V16QI_V16QI:
33574 case V8HI_FTYPE_V4SI_V4SI:
33575 case V8SF_FTYPE_V8SF_V8SF:
33576 case V8SF_FTYPE_V8SF_V8SI:
33577 case V8DF_FTYPE_V8DF_V8DF:
33578 case V4SI_FTYPE_V4SI_V4SI:
33579 case V4SI_FTYPE_V8HI_V8HI:
33580 case V4SI_FTYPE_V2DF_V2DF:
33581 case V4HI_FTYPE_V4HI_V4HI:
33582 case V4HI_FTYPE_V8QI_V8QI:
33583 case V4HI_FTYPE_V2SI_V2SI:
33584 case V4DF_FTYPE_V4DF_V4DF:
33585 case V4DF_FTYPE_V4DF_V4DI:
33586 case V4SF_FTYPE_V4SF_V4SF:
33587 case V4SF_FTYPE_V4SF_V4SI:
33588 case V4SF_FTYPE_V4SF_V2SI:
33589 case V4SF_FTYPE_V4SF_V2DF:
33590 case V4SF_FTYPE_V4SF_UINT:
33591 case V4SF_FTYPE_V4SF_DI:
33592 case V4SF_FTYPE_V4SF_SI:
33593 case V2DI_FTYPE_V2DI_V2DI:
33594 case V2DI_FTYPE_V16QI_V16QI:
33595 case V2DI_FTYPE_V4SI_V4SI:
33596 case V2DI_FTYPE_V2DI_V16QI:
33597 case V2SI_FTYPE_V2SI_V2SI:
33598 case V2SI_FTYPE_V4HI_V4HI:
33599 case V2SI_FTYPE_V2SF_V2SF:
33600 case V2DF_FTYPE_V2DF_V2DF:
33601 case V2DF_FTYPE_V2DF_V4SF:
33602 case V2DF_FTYPE_V2DF_V2DI:
33603 case V2DF_FTYPE_V2DF_DI:
33604 case V2DF_FTYPE_V2DF_SI:
33605 case V2DF_FTYPE_V2DF_UINT:
33606 case V2SF_FTYPE_V2SF_V2SF:
33607 case V1DI_FTYPE_V1DI_V1DI:
33608 case V1DI_FTYPE_V8QI_V8QI:
33609 case V1DI_FTYPE_V2SI_V2SI:
33610 case V32QI_FTYPE_V16HI_V16HI:
33611 case V16HI_FTYPE_V8SI_V8SI:
33612 case V64QI_FTYPE_V64QI_V64QI:
33613 case V32QI_FTYPE_V32QI_V32QI:
33614 case V16HI_FTYPE_V32QI_V32QI:
33615 case V16HI_FTYPE_V16HI_V16HI:
33616 case V8SI_FTYPE_V4DF_V4DF:
33617 case V8SI_FTYPE_V8SI_V8SI:
33618 case V8SI_FTYPE_V16HI_V16HI:
33619 case V4DI_FTYPE_V4DI_V4DI:
33620 case V4DI_FTYPE_V8SI_V8SI:
33621 case V8DI_FTYPE_V64QI_V64QI:
33622 if (comparison == UNKNOWN)
33623 return ix86_expand_binop_builtin (icode, exp, target);
33624 nargs = 2;
33625 break;
33626 case V4SF_FTYPE_V4SF_V4SF_SWAP:
33627 case V2DF_FTYPE_V2DF_V2DF_SWAP:
33628 gcc_assert (comparison != UNKNOWN);
33629 nargs = 2;
33630 swap = true;
33631 break;
33632 case V16HI_FTYPE_V16HI_V8HI_COUNT:
33633 case V16HI_FTYPE_V16HI_SI_COUNT:
33634 case V8SI_FTYPE_V8SI_V4SI_COUNT:
33635 case V8SI_FTYPE_V8SI_SI_COUNT:
33636 case V4DI_FTYPE_V4DI_V2DI_COUNT:
33637 case V4DI_FTYPE_V4DI_INT_COUNT:
33638 case V8HI_FTYPE_V8HI_V8HI_COUNT:
33639 case V8HI_FTYPE_V8HI_SI_COUNT:
33640 case V4SI_FTYPE_V4SI_V4SI_COUNT:
33641 case V4SI_FTYPE_V4SI_SI_COUNT:
33642 case V4HI_FTYPE_V4HI_V4HI_COUNT:
33643 case V4HI_FTYPE_V4HI_SI_COUNT:
33644 case V2DI_FTYPE_V2DI_V2DI_COUNT:
33645 case V2DI_FTYPE_V2DI_SI_COUNT:
33646 case V2SI_FTYPE_V2SI_V2SI_COUNT:
33647 case V2SI_FTYPE_V2SI_SI_COUNT:
33648 case V1DI_FTYPE_V1DI_V1DI_COUNT:
33649 case V1DI_FTYPE_V1DI_SI_COUNT:
33650 nargs = 2;
33651 second_arg_count = true;
33652 break;
33653 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
33654 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
33655 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
33656 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
33657 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
33658 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
33659 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
33660 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
33661 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
33662 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
33663 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
33664 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
33665 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
33666 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
33667 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
33668 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
33669 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
33670 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
33671 nargs = 4;
33672 second_arg_count = true;
33673 break;
33674 case UINT64_FTYPE_UINT64_UINT64:
33675 case UINT_FTYPE_UINT_UINT:
33676 case UINT_FTYPE_UINT_USHORT:
33677 case UINT_FTYPE_UINT_UCHAR:
33678 case UINT16_FTYPE_UINT16_INT:
33679 case UINT8_FTYPE_UINT8_INT:
33680 case UQI_FTYPE_UQI_UQI:
33681 case UHI_FTYPE_UHI_UHI:
33682 case USI_FTYPE_USI_USI:
33683 case UDI_FTYPE_UDI_UDI:
33684 case V16SI_FTYPE_V8DF_V8DF:
33685 nargs = 2;
33686 break;
33687 case V2DI_FTYPE_V2DI_INT_CONVERT:
33688 nargs = 2;
33689 rmode = V1TImode;
33690 nargs_constant = 1;
33691 break;
33692 case V4DI_FTYPE_V4DI_INT_CONVERT:
33693 nargs = 2;
33694 rmode = V2TImode;
33695 nargs_constant = 1;
33696 break;
33697 case V8DI_FTYPE_V8DI_INT_CONVERT:
33698 nargs = 2;
33699 rmode = V4TImode;
33700 nargs_constant = 1;
33701 break;
33702 case V8HI_FTYPE_V8HI_INT:
33703 case V8HI_FTYPE_V8SF_INT:
33704 case V16HI_FTYPE_V16SF_INT:
33705 case V8HI_FTYPE_V4SF_INT:
33706 case V8SF_FTYPE_V8SF_INT:
33707 case V4SF_FTYPE_V16SF_INT:
33708 case V16SF_FTYPE_V16SF_INT:
33709 case V4SI_FTYPE_V4SI_INT:
33710 case V4SI_FTYPE_V8SI_INT:
33711 case V4HI_FTYPE_V4HI_INT:
33712 case V4DF_FTYPE_V4DF_INT:
33713 case V4DF_FTYPE_V8DF_INT:
33714 case V4SF_FTYPE_V4SF_INT:
33715 case V4SF_FTYPE_V8SF_INT:
33716 case V2DI_FTYPE_V2DI_INT:
33717 case V2DF_FTYPE_V2DF_INT:
33718 case V2DF_FTYPE_V4DF_INT:
33719 case V16HI_FTYPE_V16HI_INT:
33720 case V8SI_FTYPE_V8SI_INT:
33721 case V16SI_FTYPE_V16SI_INT:
33722 case V4SI_FTYPE_V16SI_INT:
33723 case V4DI_FTYPE_V4DI_INT:
33724 case V2DI_FTYPE_V4DI_INT:
33725 case V4DI_FTYPE_V8DI_INT:
33726 case QI_FTYPE_V4SF_INT:
33727 case QI_FTYPE_V2DF_INT:
33728 case UQI_FTYPE_UQI_UQI_CONST:
33729 case UHI_FTYPE_UHI_UQI:
33730 case USI_FTYPE_USI_UQI:
33731 case UDI_FTYPE_UDI_UQI:
33732 nargs = 2;
33733 nargs_constant = 1;
33734 break;
33735 case V16QI_FTYPE_V16QI_V16QI_V16QI:
33736 case V8SF_FTYPE_V8SF_V8SF_V8SF:
33737 case V4DF_FTYPE_V4DF_V4DF_V4DF:
33738 case V4SF_FTYPE_V4SF_V4SF_V4SF:
33739 case V2DF_FTYPE_V2DF_V2DF_V2DF:
33740 case V32QI_FTYPE_V32QI_V32QI_V32QI:
33741 case UHI_FTYPE_V16SI_V16SI_UHI:
33742 case UQI_FTYPE_V8DI_V8DI_UQI:
33743 case V16HI_FTYPE_V16SI_V16HI_UHI:
33744 case V16QI_FTYPE_V16SI_V16QI_UHI:
33745 case V16QI_FTYPE_V8DI_V16QI_UQI:
33746 case V16SF_FTYPE_V16SF_V16SF_UHI:
33747 case V16SF_FTYPE_V4SF_V16SF_UHI:
33748 case V16SI_FTYPE_SI_V16SI_UHI:
33749 case V16SI_FTYPE_V16HI_V16SI_UHI:
33750 case V16SI_FTYPE_V16QI_V16SI_UHI:
33751 case V8SF_FTYPE_V4SF_V8SF_UQI:
33752 case V4DF_FTYPE_V2DF_V4DF_UQI:
33753 case V8SI_FTYPE_V4SI_V8SI_UQI:
33754 case V8SI_FTYPE_SI_V8SI_UQI:
33755 case V4SI_FTYPE_V4SI_V4SI_UQI:
33756 case V4SI_FTYPE_SI_V4SI_UQI:
33757 case V4DI_FTYPE_V2DI_V4DI_UQI:
33758 case V4DI_FTYPE_DI_V4DI_UQI:
33759 case V2DI_FTYPE_V2DI_V2DI_UQI:
33760 case V2DI_FTYPE_DI_V2DI_UQI:
33761 case V64QI_FTYPE_V64QI_V64QI_UDI:
33762 case V64QI_FTYPE_V16QI_V64QI_UDI:
33763 case V64QI_FTYPE_QI_V64QI_UDI:
33764 case V32QI_FTYPE_V32QI_V32QI_USI:
33765 case V32QI_FTYPE_V16QI_V32QI_USI:
33766 case V32QI_FTYPE_QI_V32QI_USI:
33767 case V16QI_FTYPE_V16QI_V16QI_UHI:
33768 case V16QI_FTYPE_QI_V16QI_UHI:
33769 case V32HI_FTYPE_V8HI_V32HI_USI:
33770 case V32HI_FTYPE_HI_V32HI_USI:
33771 case V16HI_FTYPE_V8HI_V16HI_UHI:
33772 case V16HI_FTYPE_HI_V16HI_UHI:
33773 case V8HI_FTYPE_V8HI_V8HI_UQI:
33774 case V8HI_FTYPE_HI_V8HI_UQI:
33775 case V8SF_FTYPE_V8HI_V8SF_UQI:
33776 case V4SF_FTYPE_V8HI_V4SF_UQI:
33777 case V8SI_FTYPE_V8SF_V8SI_UQI:
33778 case V4SI_FTYPE_V4SF_V4SI_UQI:
33779 case V4DI_FTYPE_V4SF_V4DI_UQI:
33780 case V2DI_FTYPE_V4SF_V2DI_UQI:
33781 case V4SF_FTYPE_V4DI_V4SF_UQI:
33782 case V4SF_FTYPE_V2DI_V4SF_UQI:
33783 case V4DF_FTYPE_V4DI_V4DF_UQI:
33784 case V2DF_FTYPE_V2DI_V2DF_UQI:
33785 case V16QI_FTYPE_V8HI_V16QI_UQI:
33786 case V16QI_FTYPE_V16HI_V16QI_UHI:
33787 case V16QI_FTYPE_V4SI_V16QI_UQI:
33788 case V16QI_FTYPE_V8SI_V16QI_UQI:
33789 case V8HI_FTYPE_V4SI_V8HI_UQI:
33790 case V8HI_FTYPE_V8SI_V8HI_UQI:
33791 case V16QI_FTYPE_V2DI_V16QI_UQI:
33792 case V16QI_FTYPE_V4DI_V16QI_UQI:
33793 case V8HI_FTYPE_V2DI_V8HI_UQI:
33794 case V8HI_FTYPE_V4DI_V8HI_UQI:
33795 case V4SI_FTYPE_V2DI_V4SI_UQI:
33796 case V4SI_FTYPE_V4DI_V4SI_UQI:
33797 case V32QI_FTYPE_V32HI_V32QI_USI:
33798 case UHI_FTYPE_V16QI_V16QI_UHI:
33799 case USI_FTYPE_V32QI_V32QI_USI:
33800 case UDI_FTYPE_V64QI_V64QI_UDI:
33801 case UQI_FTYPE_V8HI_V8HI_UQI:
33802 case UHI_FTYPE_V16HI_V16HI_UHI:
33803 case USI_FTYPE_V32HI_V32HI_USI:
33804 case UQI_FTYPE_V4SI_V4SI_UQI:
33805 case UQI_FTYPE_V8SI_V8SI_UQI:
33806 case UQI_FTYPE_V2DI_V2DI_UQI:
33807 case UQI_FTYPE_V4DI_V4DI_UQI:
33808 case V4SF_FTYPE_V2DF_V4SF_UQI:
33809 case V4SF_FTYPE_V4DF_V4SF_UQI:
33810 case V16SI_FTYPE_V16SI_V16SI_UHI:
33811 case V16SI_FTYPE_V4SI_V16SI_UHI:
33812 case V2DI_FTYPE_V4SI_V2DI_UQI:
33813 case V2DI_FTYPE_V8HI_V2DI_UQI:
33814 case V2DI_FTYPE_V16QI_V2DI_UQI:
33815 case V4DI_FTYPE_V4DI_V4DI_UQI:
33816 case V4DI_FTYPE_V4SI_V4DI_UQI:
33817 case V4DI_FTYPE_V8HI_V4DI_UQI:
33818 case V4DI_FTYPE_V16QI_V4DI_UQI:
33819 case V4DI_FTYPE_V4DF_V4DI_UQI:
33820 case V2DI_FTYPE_V2DF_V2DI_UQI:
33821 case V4SI_FTYPE_V4DF_V4SI_UQI:
33822 case V4SI_FTYPE_V2DF_V4SI_UQI:
33823 case V4SI_FTYPE_V8HI_V4SI_UQI:
33824 case V4SI_FTYPE_V16QI_V4SI_UQI:
33825 case V4DI_FTYPE_V4DI_V4DI_V4DI:
33826 case V8DF_FTYPE_V2DF_V8DF_UQI:
33827 case V8DF_FTYPE_V4DF_V8DF_UQI:
33828 case V8DF_FTYPE_V8DF_V8DF_UQI:
33829 case V8SF_FTYPE_V8SF_V8SF_UQI:
33830 case V8SF_FTYPE_V8SI_V8SF_UQI:
33831 case V4DF_FTYPE_V4DF_V4DF_UQI:
33832 case V4SF_FTYPE_V4SF_V4SF_UQI:
33833 case V2DF_FTYPE_V2DF_V2DF_UQI:
33834 case V2DF_FTYPE_V4SF_V2DF_UQI:
33835 case V2DF_FTYPE_V4SI_V2DF_UQI:
33836 case V4SF_FTYPE_V4SI_V4SF_UQI:
33837 case V4DF_FTYPE_V4SF_V4DF_UQI:
33838 case V4DF_FTYPE_V4SI_V4DF_UQI:
33839 case V8SI_FTYPE_V8SI_V8SI_UQI:
33840 case V8SI_FTYPE_V8HI_V8SI_UQI:
33841 case V8SI_FTYPE_V16QI_V8SI_UQI:
33842 case V8DF_FTYPE_V8SI_V8DF_UQI:
33843 case V8DI_FTYPE_DI_V8DI_UQI:
33844 case V16SF_FTYPE_V8SF_V16SF_UHI:
33845 case V16SI_FTYPE_V8SI_V16SI_UHI:
33846 case V16HI_FTYPE_V16HI_V16HI_UHI:
33847 case V8HI_FTYPE_V16QI_V8HI_UQI:
33848 case V16HI_FTYPE_V16QI_V16HI_UHI:
33849 case V32HI_FTYPE_V32HI_V32HI_USI:
33850 case V32HI_FTYPE_V32QI_V32HI_USI:
33851 case V8DI_FTYPE_V16QI_V8DI_UQI:
33852 case V8DI_FTYPE_V2DI_V8DI_UQI:
33853 case V8DI_FTYPE_V4DI_V8DI_UQI:
33854 case V8DI_FTYPE_V8DI_V8DI_UQI:
33855 case V8DI_FTYPE_V8HI_V8DI_UQI:
33856 case V8DI_FTYPE_V8SI_V8DI_UQI:
33857 case V8HI_FTYPE_V8DI_V8HI_UQI:
33858 case V8SI_FTYPE_V8DI_V8SI_UQI:
33859 case V4SI_FTYPE_V4SI_V4SI_V4SI:
33860 case V16SI_FTYPE_V16SI_V16SI_V16SI:
33861 case V8DI_FTYPE_V8DI_V8DI_V8DI:
33862 case V32HI_FTYPE_V32HI_V32HI_V32HI:
33863 case V2DI_FTYPE_V2DI_V2DI_V2DI:
33864 case V16HI_FTYPE_V16HI_V16HI_V16HI:
33865 case V8SI_FTYPE_V8SI_V8SI_V8SI:
33866 case V8HI_FTYPE_V8HI_V8HI_V8HI:
33867 nargs = 3;
33868 break;
33869 case V32QI_FTYPE_V32QI_V32QI_INT:
33870 case V16HI_FTYPE_V16HI_V16HI_INT:
33871 case V16QI_FTYPE_V16QI_V16QI_INT:
33872 case V4DI_FTYPE_V4DI_V4DI_INT:
33873 case V8HI_FTYPE_V8HI_V8HI_INT:
33874 case V8SI_FTYPE_V8SI_V8SI_INT:
33875 case V8SI_FTYPE_V8SI_V4SI_INT:
33876 case V8SF_FTYPE_V8SF_V8SF_INT:
33877 case V8SF_FTYPE_V8SF_V4SF_INT:
33878 case V4SI_FTYPE_V4SI_V4SI_INT:
33879 case V4DF_FTYPE_V4DF_V4DF_INT:
33880 case V16SF_FTYPE_V16SF_V16SF_INT:
33881 case V16SF_FTYPE_V16SF_V4SF_INT:
33882 case V16SI_FTYPE_V16SI_V4SI_INT:
33883 case V4DF_FTYPE_V4DF_V2DF_INT:
33884 case V4SF_FTYPE_V4SF_V4SF_INT:
33885 case V2DI_FTYPE_V2DI_V2DI_INT:
33886 case V4DI_FTYPE_V4DI_V2DI_INT:
33887 case V2DF_FTYPE_V2DF_V2DF_INT:
33888 case UQI_FTYPE_V8DI_V8UDI_INT:
33889 case UQI_FTYPE_V8DF_V8DF_INT:
33890 case UQI_FTYPE_V2DF_V2DF_INT:
33891 case UQI_FTYPE_V4SF_V4SF_INT:
33892 case UHI_FTYPE_V16SI_V16SI_INT:
33893 case UHI_FTYPE_V16SF_V16SF_INT:
33894 case V64QI_FTYPE_V64QI_V64QI_INT:
33895 case V32HI_FTYPE_V32HI_V32HI_INT:
33896 case V16SI_FTYPE_V16SI_V16SI_INT:
33897 case V8DI_FTYPE_V8DI_V8DI_INT:
33898 nargs = 3;
33899 nargs_constant = 1;
33900 break;
33901 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
33902 nargs = 3;
33903 rmode = V4DImode;
33904 nargs_constant = 1;
33905 break;
33906 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
33907 nargs = 3;
33908 rmode = V2DImode;
33909 nargs_constant = 1;
33910 break;
33911 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
33912 nargs = 3;
33913 rmode = DImode;
33914 nargs_constant = 1;
33915 break;
33916 case V2DI_FTYPE_V2DI_UINT_UINT:
33917 nargs = 3;
33918 nargs_constant = 2;
33919 break;
33920 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
33921 nargs = 3;
33922 rmode = V8DImode;
33923 nargs_constant = 1;
33924 break;
33925 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
33926 nargs = 5;
33927 rmode = V8DImode;
33928 mask_pos = 2;
33929 nargs_constant = 1;
33930 break;
33931 case QI_FTYPE_V8DF_INT_UQI:
33932 case QI_FTYPE_V4DF_INT_UQI:
33933 case QI_FTYPE_V2DF_INT_UQI:
33934 case HI_FTYPE_V16SF_INT_UHI:
33935 case QI_FTYPE_V8SF_INT_UQI:
33936 case QI_FTYPE_V4SF_INT_UQI:
33937 case UHI_FTYPE_V2DI_V2DI_UHI:
33938 case USI_FTYPE_V4DI_V4DI_USI:
33939 case V4SI_FTYPE_V4SI_V4SI_UHI:
33940 case V8SI_FTYPE_V8SI_V8SI_UHI:
33941 nargs = 3;
33942 mask_pos = 1;
33943 nargs_constant = 1;
33944 break;
33945 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
33946 nargs = 5;
33947 rmode = V4DImode;
33948 mask_pos = 2;
33949 nargs_constant = 1;
33950 break;
33951 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
33952 nargs = 5;
33953 rmode = V2DImode;
33954 mask_pos = 2;
33955 nargs_constant = 1;
33956 break;
33957 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
33958 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
33959 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
33960 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
33961 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
33962 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
33963 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
33964 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
33965 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
33966 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
33967 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
33968 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
33969 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
33970 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
33971 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
33972 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
33973 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
33974 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
33975 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
33976 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
33977 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
33978 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
33979 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
33980 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
33981 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
33982 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
33983 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
33984 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
33985 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
33986 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
33987 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
33988 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
33989 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
33990 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
33991 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
33992 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
33993 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
33994 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
33995 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
33996 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
33997 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
33998 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
33999 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
34000 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
34001 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
34002 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
34003 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
34004 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34005 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34006 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34007 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34008 nargs = 4;
34009 break;
34010 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34011 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34012 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34013 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34014 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34015 nargs = 4;
34016 nargs_constant = 1;
34017 break;
34018 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34019 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34020 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34021 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34022 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34023 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34024 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34025 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34026 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34027 case USI_FTYPE_V32QI_V32QI_INT_USI:
34028 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34029 case USI_FTYPE_V32HI_V32HI_INT_USI:
34030 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34031 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34032 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
34033 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
34034 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
34035 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
34036 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
34037 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
34038 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
34039 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
34040 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
34041 nargs = 4;
34042 mask_pos = 1;
34043 nargs_constant = 1;
34044 break;
34045 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34046 nargs = 4;
34047 nargs_constant = 2;
34048 break;
34049 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34050 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34051 nargs = 4;
34052 break;
34053 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34054 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34055 mask_pos = 1;
34056 nargs = 4;
34057 nargs_constant = 1;
34058 break;
34059 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34060 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34061 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34062 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34063 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34064 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34065 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34066 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34067 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34068 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34069 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34070 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34071 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34072 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34073 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34074 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34075 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34076 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34077 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34078 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34079 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34080 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34081 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34082 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34083 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34084 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34085 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34086 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34087 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34088 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34089 nargs = 4;
34090 mask_pos = 2;
34091 nargs_constant = 1;
34092 break;
34093 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34094 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34095 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34096 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34097 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34098 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34099 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34100 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34101 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34102 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34103 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34104 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34105 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34106 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34107 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34108 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34109 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34110 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34111 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34112 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34113 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34114 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34115 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34116 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34117 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34118 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34119 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34120 nargs = 5;
34121 mask_pos = 2;
34122 nargs_constant = 1;
34123 break;
34124 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34125 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34126 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34127 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34128 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34129 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34130 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34131 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34132 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34133 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34134 nargs = 5;
34135 mask_pos = 1;
34136 nargs_constant = 1;
34137 break;
34138 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
34139 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
34140 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
34141 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
34142 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
34143 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
34144 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
34145 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
34146 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
34147 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
34148 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
34149 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
34150 nargs = 5;
34151 mask_pos = 1;
34152 nargs_constant = 2;
34153 break;
34155 default:
34156 gcc_unreachable ();
34159 gcc_assert (nargs <= ARRAY_SIZE (args));
34161 if (comparison != UNKNOWN)
34163 gcc_assert (nargs == 2);
34164 return ix86_expand_sse_compare (d, exp, target, swap);
34167 if (rmode == VOIDmode || rmode == tmode)
34169 if (optimize
34170 || target == 0
34171 || GET_MODE (target) != tmode
34172 || !insn_p->operand[0].predicate (target, tmode))
34173 target = gen_reg_rtx (tmode);
34174 else if (memory_operand (target, tmode))
34175 num_memory++;
34176 real_target = target;
34178 else
34180 real_target = gen_reg_rtx (tmode);
34181 target = lowpart_subreg (rmode, real_target, tmode);
34184 for (i = 0; i < nargs; i++)
34186 tree arg = CALL_EXPR_ARG (exp, i);
34187 rtx op = expand_normal (arg);
34188 machine_mode mode = insn_p->operand[i + 1].mode;
34189 bool match = insn_p->operand[i + 1].predicate (op, mode);
34191 if (second_arg_count && i == 1)
34193 /* SIMD shift insns take either an 8-bit immediate or a
34194 register as the count, but the builtin functions take an
34195 int, so put the count in a register if it doesn't match.
34196 The instructions use a 64-bit count; if OP is only
34197 32-bit, zero-extend it, as negative shift counts are
34198 undefined behavior and zero-extension is more
34199 efficient. */
34200 if (!match)
34202 if (SCALAR_INT_MODE_P (GET_MODE (op)))
34203 op = convert_modes (mode, GET_MODE (op), op, 1);
34204 else
34205 op = lowpart_subreg (mode, op, GET_MODE (op));
34206 if (!insn_p->operand[i + 1].predicate (op, mode))
34207 op = copy_to_reg (op);
34210 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
34211 (!mask_pos && (nargs - i) <= nargs_constant))
34213 if (!match)
34214 switch (icode)
34216 case CODE_FOR_avx_vinsertf128v4di:
34217 case CODE_FOR_avx_vextractf128v4di:
34218 error ("the last argument must be an 1-bit immediate");
34219 return const0_rtx;
34221 case CODE_FOR_avx512f_cmpv8di3_mask:
34222 case CODE_FOR_avx512f_cmpv16si3_mask:
34223 case CODE_FOR_avx512f_ucmpv8di3_mask:
34224 case CODE_FOR_avx512f_ucmpv16si3_mask:
34225 case CODE_FOR_avx512vl_cmpv4di3_mask:
34226 case CODE_FOR_avx512vl_cmpv8si3_mask:
34227 case CODE_FOR_avx512vl_ucmpv4di3_mask:
34228 case CODE_FOR_avx512vl_ucmpv8si3_mask:
34229 case CODE_FOR_avx512vl_cmpv2di3_mask:
34230 case CODE_FOR_avx512vl_cmpv4si3_mask:
34231 case CODE_FOR_avx512vl_ucmpv2di3_mask:
34232 case CODE_FOR_avx512vl_ucmpv4si3_mask:
34233 error ("the last argument must be a 3-bit immediate");
34234 return const0_rtx;
34236 case CODE_FOR_sse4_1_roundsd:
34237 case CODE_FOR_sse4_1_roundss:
34239 case CODE_FOR_sse4_1_roundpd:
34240 case CODE_FOR_sse4_1_roundps:
34241 case CODE_FOR_avx_roundpd256:
34242 case CODE_FOR_avx_roundps256:
34244 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
34245 case CODE_FOR_sse4_1_roundps_sfix:
34246 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
34247 case CODE_FOR_avx_roundps_sfix256:
34249 case CODE_FOR_sse4_1_blendps:
34250 case CODE_FOR_avx_blendpd256:
34251 case CODE_FOR_avx_vpermilv4df:
34252 case CODE_FOR_avx_vpermilv4df_mask:
34253 case CODE_FOR_avx512f_getmantv8df_mask:
34254 case CODE_FOR_avx512f_getmantv16sf_mask:
34255 case CODE_FOR_avx512vl_getmantv8sf_mask:
34256 case CODE_FOR_avx512vl_getmantv4df_mask:
34257 case CODE_FOR_avx512vl_getmantv4sf_mask:
34258 case CODE_FOR_avx512vl_getmantv2df_mask:
34259 case CODE_FOR_avx512dq_rangepv8df_mask_round:
34260 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
34261 case CODE_FOR_avx512dq_rangepv4df_mask:
34262 case CODE_FOR_avx512dq_rangepv8sf_mask:
34263 case CODE_FOR_avx512dq_rangepv2df_mask:
34264 case CODE_FOR_avx512dq_rangepv4sf_mask:
34265 case CODE_FOR_avx_shufpd256_mask:
34266 error ("the last argument must be a 4-bit immediate");
34267 return const0_rtx;
34269 case CODE_FOR_sha1rnds4:
34270 case CODE_FOR_sse4_1_blendpd:
34271 case CODE_FOR_avx_vpermilv2df:
34272 case CODE_FOR_avx_vpermilv2df_mask:
34273 case CODE_FOR_xop_vpermil2v2df3:
34274 case CODE_FOR_xop_vpermil2v4sf3:
34275 case CODE_FOR_xop_vpermil2v4df3:
34276 case CODE_FOR_xop_vpermil2v8sf3:
34277 case CODE_FOR_avx512f_vinsertf32x4_mask:
34278 case CODE_FOR_avx512f_vinserti32x4_mask:
34279 case CODE_FOR_avx512f_vextractf32x4_mask:
34280 case CODE_FOR_avx512f_vextracti32x4_mask:
34281 case CODE_FOR_sse2_shufpd:
34282 case CODE_FOR_sse2_shufpd_mask:
34283 case CODE_FOR_avx512dq_shuf_f64x2_mask:
34284 case CODE_FOR_avx512dq_shuf_i64x2_mask:
34285 case CODE_FOR_avx512vl_shuf_i32x4_mask:
34286 case CODE_FOR_avx512vl_shuf_f32x4_mask:
34287 error ("the last argument must be a 2-bit immediate");
34288 return const0_rtx;
34290 case CODE_FOR_avx_vextractf128v4df:
34291 case CODE_FOR_avx_vextractf128v8sf:
34292 case CODE_FOR_avx_vextractf128v8si:
34293 case CODE_FOR_avx_vinsertf128v4df:
34294 case CODE_FOR_avx_vinsertf128v8sf:
34295 case CODE_FOR_avx_vinsertf128v8si:
34296 case CODE_FOR_avx512f_vinsertf64x4_mask:
34297 case CODE_FOR_avx512f_vinserti64x4_mask:
34298 case CODE_FOR_avx512f_vextractf64x4_mask:
34299 case CODE_FOR_avx512f_vextracti64x4_mask:
34300 case CODE_FOR_avx512dq_vinsertf32x8_mask:
34301 case CODE_FOR_avx512dq_vinserti32x8_mask:
34302 case CODE_FOR_avx512vl_vinsertv4df:
34303 case CODE_FOR_avx512vl_vinsertv4di:
34304 case CODE_FOR_avx512vl_vinsertv8sf:
34305 case CODE_FOR_avx512vl_vinsertv8si:
34306 error ("the last argument must be a 1-bit immediate");
34307 return const0_rtx;
34309 case CODE_FOR_avx_vmcmpv2df3:
34310 case CODE_FOR_avx_vmcmpv4sf3:
34311 case CODE_FOR_avx_cmpv2df3:
34312 case CODE_FOR_avx_cmpv4sf3:
34313 case CODE_FOR_avx_cmpv4df3:
34314 case CODE_FOR_avx_cmpv8sf3:
34315 case CODE_FOR_avx512f_cmpv8df3_mask:
34316 case CODE_FOR_avx512f_cmpv16sf3_mask:
34317 case CODE_FOR_avx512f_vmcmpv2df3_mask:
34318 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
34319 error ("the last argument must be a 5-bit immediate");
34320 return const0_rtx;
34322 default:
34323 switch (nargs_constant)
34325 case 2:
34326 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
34327 || (!mask_pos && (nargs - i) == nargs_constant))
34329 error ("the next to last argument must be an 8-bit immediate");
34330 break;
34332 /* FALLTHRU */
34333 case 1:
34334 error ("the last argument must be an 8-bit immediate");
34335 break;
34336 default:
34337 gcc_unreachable ();
34339 return const0_rtx;
34342 else
34344 if (VECTOR_MODE_P (mode))
34345 op = safe_vector_operand (op, mode);
34347 /* If we aren't optimizing, only allow one memory operand to
34348 be generated. */
34349 if (memory_operand (op, mode))
34350 num_memory++;
34352 op = fixup_modeless_constant (op, mode);
34354 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34356 if (optimize || !match || num_memory > 1)
34357 op = copy_to_mode_reg (mode, op);
34359 else
34361 op = copy_to_reg (op);
34362 op = lowpart_subreg (mode, op, GET_MODE (op));
34366 args[i].op = op;
34367 args[i].mode = mode;
34370 switch (nargs)
34372 case 1:
34373 pat = GEN_FCN (icode) (real_target, args[0].op);
34374 break;
34375 case 2:
34376 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
34377 break;
34378 case 3:
34379 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34380 args[2].op);
34381 break;
34382 case 4:
34383 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34384 args[2].op, args[3].op);
34385 break;
34386 case 5:
34387 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34388 args[2].op, args[3].op, args[4].op);
34389 break;
34390 case 6:
34391 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34392 args[2].op, args[3].op, args[4].op,
34393 args[5].op);
34394 break;
34395 default:
34396 gcc_unreachable ();
34399 if (! pat)
34400 return 0;
34402 emit_insn (pat);
34403 return target;
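/* A minimal caller-side sketch of the nargs_constant checking above,
   assuming the SSE2 _mm_shuffle_epi32 wrapper from <emmintrin.h> (its
   builtin, __builtin_ia32_pshufd, is assumed here to be expanded through
   this function with nargs_constant == 1):

     #include <emmintrin.h>

     __m128i reverse_lanes (__m128i v)
     {
       return _mm_shuffle_epi32 (v, 0x1B);   // selector must be a constant 8-bit immediate
     }

   Passing a non-constant selector reaches the "last argument must be an
   8-bit immediate" error path above and the expansion yields const0_rtx.  */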
34406 /* Transform a pattern of the following layout:
34407 (set A
34408 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
34410 into:
34411 (set A B)  */
34413 static rtx
34414 ix86_erase_embedded_rounding (rtx pat)
34416 if (GET_CODE (pat) == INSN)
34417 pat = PATTERN (pat);
34419 gcc_assert (GET_CODE (pat) == SET);
34420 rtx src = SET_SRC (pat);
34421 gcc_assert (XVECLEN (src, 0) == 2);
34422 rtx p0 = XVECEXP (src, 0, 0);
34423 gcc_assert (GET_CODE (src) == UNSPEC
34424 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
34425 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
34426 return res;
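/* A concrete, hypothetical instance of the transform above: a pattern of
   the shape

     (set (reg:V2DF 100)
          (unspec [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
                   (reg:SI 103)]
                  UNSPEC_EMBEDDED_ROUNDING))

   becomes

     (set (reg:V2DF 100)
          (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)))

   only operand 0 of the unspec survives; the rounding selector is dropped.  */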
34429 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
34430 with rounding. */
34431 static rtx
34432 ix86_expand_sse_comi_round (const struct builtin_description *d,
34433 tree exp, rtx target)
34435 rtx pat, set_dst;
34436 tree arg0 = CALL_EXPR_ARG (exp, 0);
34437 tree arg1 = CALL_EXPR_ARG (exp, 1);
34438 tree arg2 = CALL_EXPR_ARG (exp, 2);
34439 tree arg3 = CALL_EXPR_ARG (exp, 3);
34440 rtx op0 = expand_normal (arg0);
34441 rtx op1 = expand_normal (arg1);
34442 rtx op2 = expand_normal (arg2);
34443 rtx op3 = expand_normal (arg3);
34444 enum insn_code icode = d->icode;
34445 const struct insn_data_d *insn_p = &insn_data[icode];
34446 machine_mode mode0 = insn_p->operand[0].mode;
34447 machine_mode mode1 = insn_p->operand[1].mode;
34448 enum rtx_code comparison = UNEQ;
34449 bool need_ucomi = false;
34451 /* See avxintrin.h for values. */
34452 enum rtx_code comi_comparisons[32] =
34454 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
34455 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
34456 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
34458 bool need_ucomi_values[32] =
34460 true, false, false, true, true, false, false, true,
34461 true, false, false, true, true, false, false, true,
34462 false, true, true, false, false, true, true, false,
34463 false, true, true, false, false, true, true, false
34466 if (!CONST_INT_P (op2))
34468 error ("the third argument must be a comparison constant");
34469 return const0_rtx;
34471 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
34473 error ("incorrect comparison mode");
34474 return const0_rtx;
34477 if (!insn_p->operand[2].predicate (op3, SImode))
34479 error ("incorrect rounding operand");
34480 return const0_rtx;
34483 comparison = comi_comparisons[INTVAL (op2)];
34484 need_ucomi = need_ucomi_values[INTVAL (op2)];
34486 if (VECTOR_MODE_P (mode0))
34487 op0 = safe_vector_operand (op0, mode0);
34488 if (VECTOR_MODE_P (mode1))
34489 op1 = safe_vector_operand (op1, mode1);
34491 target = gen_reg_rtx (SImode);
34492 emit_move_insn (target, const0_rtx);
34493 target = gen_rtx_SUBREG (QImode, target, 0);
34495 if ((optimize && !register_operand (op0, mode0))
34496 || !insn_p->operand[0].predicate (op0, mode0))
34497 op0 = copy_to_mode_reg (mode0, op0);
34498 if ((optimize && !register_operand (op1, mode1))
34499 || !insn_p->operand[1].predicate (op1, mode1))
34500 op1 = copy_to_mode_reg (mode1, op1);
34502 if (need_ucomi)
34503 icode = icode == CODE_FOR_sse_comi_round
34504 ? CODE_FOR_sse_ucomi_round
34505 : CODE_FOR_sse2_ucomi_round;
34507 pat = GEN_FCN (icode) (op0, op1, op3);
34508 if (! pat)
34509 return 0;
34511 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
34512 if (INTVAL (op3) == NO_ROUND)
34514 pat = ix86_erase_embedded_rounding (pat);
34515 if (! pat)
34516 return 0;
34518 set_dst = SET_DEST (pat);
34520 else
34522 gcc_assert (GET_CODE (pat) == SET);
34523 set_dst = SET_DEST (pat);
34526 emit_insn (pat);
34527 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34528 gen_rtx_fmt_ee (comparison, QImode,
34529 set_dst,
34530 const0_rtx)));
34532 return SUBREG_REG (target);
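/* A minimal caller-side sketch of the comi-with-rounding path, assuming
   the AVX-512F _mm_comi_round_ss wrapper from <immintrin.h> (its
   INT_FTYPE_V4SF_V4SF_INT_INT builtin is dispatched to this function by
   ix86_expand_round_builtin below):

     #include <immintrin.h>

     int scalar_ge (__m128 a, __m128 b)
     {
       // The predicate indexes comi_comparisons[]; the last operand selects SAE.
       return _mm_comi_round_ss (a, b, _CMP_GE_OS, _MM_FROUND_NO_EXC);
     }

   A non-constant predicate, or one outside 0..31, hits the error paths above.  */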
34535 static rtx
34536 ix86_expand_round_builtin (const struct builtin_description *d,
34537 tree exp, rtx target)
34539 rtx pat;
34540 unsigned int i, nargs;
34541 struct
34543 rtx op;
34544 machine_mode mode;
34545 } args[6];
34546 enum insn_code icode = d->icode;
34547 const struct insn_data_d *insn_p = &insn_data[icode];
34548 machine_mode tmode = insn_p->operand[0].mode;
34549 unsigned int nargs_constant = 0;
34550 unsigned int redundant_embed_rnd = 0;
34552 switch ((enum ix86_builtin_func_type) d->flag)
34554 case UINT64_FTYPE_V2DF_INT:
34555 case UINT64_FTYPE_V4SF_INT:
34556 case UINT_FTYPE_V2DF_INT:
34557 case UINT_FTYPE_V4SF_INT:
34558 case INT64_FTYPE_V2DF_INT:
34559 case INT64_FTYPE_V4SF_INT:
34560 case INT_FTYPE_V2DF_INT:
34561 case INT_FTYPE_V4SF_INT:
34562 nargs = 2;
34563 break;
34564 case V4SF_FTYPE_V4SF_UINT_INT:
34565 case V4SF_FTYPE_V4SF_UINT64_INT:
34566 case V2DF_FTYPE_V2DF_UINT64_INT:
34567 case V4SF_FTYPE_V4SF_INT_INT:
34568 case V4SF_FTYPE_V4SF_INT64_INT:
34569 case V2DF_FTYPE_V2DF_INT64_INT:
34570 case V4SF_FTYPE_V4SF_V4SF_INT:
34571 case V2DF_FTYPE_V2DF_V2DF_INT:
34572 case V4SF_FTYPE_V4SF_V2DF_INT:
34573 case V2DF_FTYPE_V2DF_V4SF_INT:
34574 nargs = 3;
34575 break;
34576 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
34577 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
34578 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
34579 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
34580 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
34581 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
34582 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
34583 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
34584 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
34585 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
34586 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
34587 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
34588 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
34589 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
34590 nargs = 4;
34591 break;
34592 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
34593 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
34594 nargs_constant = 2;
34595 nargs = 4;
34596 break;
34597 case INT_FTYPE_V4SF_V4SF_INT_INT:
34598 case INT_FTYPE_V2DF_V2DF_INT_INT:
34599 return ix86_expand_sse_comi_round (d, exp, target);
34600 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
34601 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
34602 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
34603 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
34604 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
34605 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
34606 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
34607 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
34608 nargs = 5;
34609 break;
34610 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
34611 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
34612 nargs_constant = 4;
34613 nargs = 5;
34614 break;
34615 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
34616 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
34617 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
34618 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
34619 nargs_constant = 3;
34620 nargs = 5;
34621 break;
34622 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
34623 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
34624 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
34625 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
34626 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
34627 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
34628 nargs = 6;
34629 nargs_constant = 4;
34630 break;
34631 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
34632 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
34633 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
34634 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
34635 nargs = 6;
34636 nargs_constant = 3;
34637 break;
34638 default:
34639 gcc_unreachable ();
34641 gcc_assert (nargs <= ARRAY_SIZE (args));
34643 if (optimize
34644 || target == 0
34645 || GET_MODE (target) != tmode
34646 || !insn_p->operand[0].predicate (target, tmode))
34647 target = gen_reg_rtx (tmode);
34649 for (i = 0; i < nargs; i++)
34651 tree arg = CALL_EXPR_ARG (exp, i);
34652 rtx op = expand_normal (arg);
34653 machine_mode mode = insn_p->operand[i + 1].mode;
34654 bool match = insn_p->operand[i + 1].predicate (op, mode);
34656 if (i == nargs - nargs_constant)
34658 if (!match)
34660 switch (icode)
34662 case CODE_FOR_avx512f_getmantv8df_mask_round:
34663 case CODE_FOR_avx512f_getmantv16sf_mask_round:
34664 case CODE_FOR_avx512f_vgetmantv2df_round:
34665 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
34666 case CODE_FOR_avx512f_vgetmantv4sf_round:
34667 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
34668 error ("the immediate argument must be a 4-bit immediate");
34669 return const0_rtx;
34670 case CODE_FOR_avx512f_cmpv8df3_mask_round:
34671 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
34672 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
34673 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
34674 error ("the immediate argument must be a 5-bit immediate");
34675 return const0_rtx;
34676 default:
34677 error ("the immediate argument must be an 8-bit immediate");
34678 return const0_rtx;
34682 else if (i == nargs-1)
34684 if (!insn_p->operand[nargs].predicate (op, SImode))
34686 error ("incorrect rounding operand");
34687 return const0_rtx;
34690 /* If there is no rounding, use the normal version of the pattern.  */
34691 if (INTVAL (op) == NO_ROUND)
34692 redundant_embed_rnd = 1;
34694 else
34696 if (VECTOR_MODE_P (mode))
34697 op = safe_vector_operand (op, mode);
34699 op = fixup_modeless_constant (op, mode);
34701 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34703 if (optimize || !match)
34704 op = copy_to_mode_reg (mode, op);
34706 else
34708 op = copy_to_reg (op);
34709 op = lowpart_subreg (mode, op, GET_MODE (op));
34713 args[i].op = op;
34714 args[i].mode = mode;
34717 switch (nargs)
34719 case 1:
34720 pat = GEN_FCN (icode) (target, args[0].op);
34721 break;
34722 case 2:
34723 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34724 break;
34725 case 3:
34726 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34727 args[2].op);
34728 break;
34729 case 4:
34730 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34731 args[2].op, args[3].op);
34732 break;
34733 case 5:
34734 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34735 args[2].op, args[3].op, args[4].op);
34736 break;
34737 case 6:
34738 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34739 args[2].op, args[3].op, args[4].op,
34740 args[5].op);
34741 break;
34742 default:
34743 gcc_unreachable ();
34746 if (!pat)
34747 return 0;
34749 if (redundant_embed_rnd)
34750 pat = ix86_erase_embedded_rounding (pat);
34752 emit_insn (pat);
34753 return target;
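/* A minimal caller-side sketch of the rounding-operand handling above,
   assuming the AVX-512F _mm512_add_round_pd wrapper from <immintrin.h>
   and assuming _MM_FROUND_CUR_DIRECTION matches NO_ROUND:

     #include <immintrin.h>

     __m512d add_rne (__m512d a, __m512d b)
     {
       // An explicit rounding mode keeps the embedded-rounding form.
       return _mm512_add_round_pd (a, b,
                                   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
     }

     __m512d add_cur (__m512d a, __m512d b)
     {
       // _MM_FROUND_CUR_DIRECTION is treated as NO_ROUND, so the redundant
       // embedded-rounding wrapper is erased before the insn is emitted.
       return _mm512_add_round_pd (a, b, _MM_FROUND_CUR_DIRECTION);
     }
*/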
34756 /* Subroutine of ix86_expand_builtin to take care of special insns
34757 with variable number of operands. */
34759 static rtx
34760 ix86_expand_special_args_builtin (const struct builtin_description *d,
34761 tree exp, rtx target)
34763 tree arg;
34764 rtx pat, op;
34765 unsigned int i, nargs, arg_adjust, memory;
34766 bool aligned_mem = false;
34767 struct
34769 rtx op;
34770 machine_mode mode;
34771 } args[3];
34772 enum insn_code icode = d->icode;
34773 bool last_arg_constant = false;
34774 const struct insn_data_d *insn_p = &insn_data[icode];
34775 machine_mode tmode = insn_p->operand[0].mode;
34776 enum { load, store } klass;
34778 switch ((enum ix86_builtin_func_type) d->flag)
34780 case VOID_FTYPE_VOID:
34781 emit_insn (GEN_FCN (icode) (target));
34782 return 0;
34783 case VOID_FTYPE_UINT64:
34784 case VOID_FTYPE_UNSIGNED:
34785 nargs = 0;
34786 klass = store;
34787 memory = 0;
34788 break;
34790 case INT_FTYPE_VOID:
34791 case USHORT_FTYPE_VOID:
34792 case UINT64_FTYPE_VOID:
34793 case UNSIGNED_FTYPE_VOID:
34794 nargs = 0;
34795 klass = load;
34796 memory = 0;
34797 break;
34798 case UINT64_FTYPE_PUNSIGNED:
34799 case V2DI_FTYPE_PV2DI:
34800 case V4DI_FTYPE_PV4DI:
34801 case V32QI_FTYPE_PCCHAR:
34802 case V16QI_FTYPE_PCCHAR:
34803 case V8SF_FTYPE_PCV4SF:
34804 case V8SF_FTYPE_PCFLOAT:
34805 case V4SF_FTYPE_PCFLOAT:
34806 case V4DF_FTYPE_PCV2DF:
34807 case V4DF_FTYPE_PCDOUBLE:
34808 case V2DF_FTYPE_PCDOUBLE:
34809 case VOID_FTYPE_PVOID:
34810 case V8DI_FTYPE_PV8DI:
34811 nargs = 1;
34812 klass = load;
34813 memory = 0;
34814 switch (icode)
34816 case CODE_FOR_sse4_1_movntdqa:
34817 case CODE_FOR_avx2_movntdqa:
34818 case CODE_FOR_avx512f_movntdqa:
34819 aligned_mem = true;
34820 break;
34821 default:
34822 break;
34824 break;
34825 case VOID_FTYPE_PV2SF_V4SF:
34826 case VOID_FTYPE_PV8DI_V8DI:
34827 case VOID_FTYPE_PV4DI_V4DI:
34828 case VOID_FTYPE_PV2DI_V2DI:
34829 case VOID_FTYPE_PCHAR_V32QI:
34830 case VOID_FTYPE_PCHAR_V16QI:
34831 case VOID_FTYPE_PFLOAT_V16SF:
34832 case VOID_FTYPE_PFLOAT_V8SF:
34833 case VOID_FTYPE_PFLOAT_V4SF:
34834 case VOID_FTYPE_PDOUBLE_V8DF:
34835 case VOID_FTYPE_PDOUBLE_V4DF:
34836 case VOID_FTYPE_PDOUBLE_V2DF:
34837 case VOID_FTYPE_PLONGLONG_LONGLONG:
34838 case VOID_FTYPE_PULONGLONG_ULONGLONG:
34839 case VOID_FTYPE_PINT_INT:
34840 nargs = 1;
34841 klass = store;
34842 /* Reserve memory operand for target. */
34843 memory = ARRAY_SIZE (args);
34844 switch (icode)
34846 /* These builtins and instructions require the memory
34847 to be properly aligned. */
34848 case CODE_FOR_avx_movntv4di:
34849 case CODE_FOR_sse2_movntv2di:
34850 case CODE_FOR_avx_movntv8sf:
34851 case CODE_FOR_sse_movntv4sf:
34852 case CODE_FOR_sse4a_vmmovntv4sf:
34853 case CODE_FOR_avx_movntv4df:
34854 case CODE_FOR_sse2_movntv2df:
34855 case CODE_FOR_sse4a_vmmovntv2df:
34856 case CODE_FOR_sse2_movntidi:
34857 case CODE_FOR_sse_movntq:
34858 case CODE_FOR_sse2_movntisi:
34859 case CODE_FOR_avx512f_movntv16sf:
34860 case CODE_FOR_avx512f_movntv8df:
34861 case CODE_FOR_avx512f_movntv8di:
34862 aligned_mem = true;
34863 break;
34864 default:
34865 break;
34867 break;
34868 case V4SF_FTYPE_V4SF_PCV2SF:
34869 case V2DF_FTYPE_V2DF_PCDOUBLE:
34870 nargs = 2;
34871 klass = load;
34872 memory = 1;
34873 break;
34874 case V8SF_FTYPE_PCV8SF_V8SI:
34875 case V4DF_FTYPE_PCV4DF_V4DI:
34876 case V4SF_FTYPE_PCV4SF_V4SI:
34877 case V2DF_FTYPE_PCV2DF_V2DI:
34878 case V8SI_FTYPE_PCV8SI_V8SI:
34879 case V4DI_FTYPE_PCV4DI_V4DI:
34880 case V4SI_FTYPE_PCV4SI_V4SI:
34881 case V2DI_FTYPE_PCV2DI_V2DI:
34882 case VOID_FTYPE_INT_INT64:
34883 nargs = 2;
34884 klass = load;
34885 memory = 0;
34886 break;
34887 case VOID_FTYPE_PV8DF_V8DF_UQI:
34888 case VOID_FTYPE_PV4DF_V4DF_UQI:
34889 case VOID_FTYPE_PV2DF_V2DF_UQI:
34890 case VOID_FTYPE_PV16SF_V16SF_UHI:
34891 case VOID_FTYPE_PV8SF_V8SF_UQI:
34892 case VOID_FTYPE_PV4SF_V4SF_UQI:
34893 case VOID_FTYPE_PV8DI_V8DI_UQI:
34894 case VOID_FTYPE_PV4DI_V4DI_UQI:
34895 case VOID_FTYPE_PV2DI_V2DI_UQI:
34896 case VOID_FTYPE_PV16SI_V16SI_UHI:
34897 case VOID_FTYPE_PV8SI_V8SI_UQI:
34898 case VOID_FTYPE_PV4SI_V4SI_UQI:
34899 case VOID_FTYPE_PV64QI_V64QI_UDI:
34900 case VOID_FTYPE_PV32HI_V32HI_USI:
34901 case VOID_FTYPE_PV32QI_V32QI_USI:
34902 case VOID_FTYPE_PV16QI_V16QI_UHI:
34903 case VOID_FTYPE_PV16HI_V16HI_UHI:
34904 case VOID_FTYPE_PV8HI_V8HI_UQI:
34905 switch (icode)
34907 /* These builtins and instructions require the memory
34908 to be properly aligned. */
34909 case CODE_FOR_avx512f_storev16sf_mask:
34910 case CODE_FOR_avx512f_storev16si_mask:
34911 case CODE_FOR_avx512f_storev8df_mask:
34912 case CODE_FOR_avx512f_storev8di_mask:
34913 case CODE_FOR_avx512vl_storev8sf_mask:
34914 case CODE_FOR_avx512vl_storev8si_mask:
34915 case CODE_FOR_avx512vl_storev4df_mask:
34916 case CODE_FOR_avx512vl_storev4di_mask:
34917 case CODE_FOR_avx512vl_storev4sf_mask:
34918 case CODE_FOR_avx512vl_storev4si_mask:
34919 case CODE_FOR_avx512vl_storev2df_mask:
34920 case CODE_FOR_avx512vl_storev2di_mask:
34921 aligned_mem = true;
34922 break;
34923 default:
34924 break;
34926 /* FALLTHRU */
34927 case VOID_FTYPE_PV8SF_V8SI_V8SF:
34928 case VOID_FTYPE_PV4DF_V4DI_V4DF:
34929 case VOID_FTYPE_PV4SF_V4SI_V4SF:
34930 case VOID_FTYPE_PV2DF_V2DI_V2DF:
34931 case VOID_FTYPE_PV8SI_V8SI_V8SI:
34932 case VOID_FTYPE_PV4DI_V4DI_V4DI:
34933 case VOID_FTYPE_PV4SI_V4SI_V4SI:
34934 case VOID_FTYPE_PV2DI_V2DI_V2DI:
34935 case VOID_FTYPE_PV8SI_V8DI_UQI:
34936 case VOID_FTYPE_PV8HI_V8DI_UQI:
34937 case VOID_FTYPE_PV16HI_V16SI_UHI:
34938 case VOID_FTYPE_PV16QI_V8DI_UQI:
34939 case VOID_FTYPE_PV16QI_V16SI_UHI:
34940 case VOID_FTYPE_PV4SI_V4DI_UQI:
34941 case VOID_FTYPE_PV4SI_V2DI_UQI:
34942 case VOID_FTYPE_PV8HI_V4DI_UQI:
34943 case VOID_FTYPE_PV8HI_V2DI_UQI:
34944 case VOID_FTYPE_PV8HI_V8SI_UQI:
34945 case VOID_FTYPE_PV8HI_V4SI_UQI:
34946 case VOID_FTYPE_PV16QI_V4DI_UQI:
34947 case VOID_FTYPE_PV16QI_V2DI_UQI:
34948 case VOID_FTYPE_PV16QI_V8SI_UQI:
34949 case VOID_FTYPE_PV16QI_V4SI_UQI:
34950 case VOID_FTYPE_PCHAR_V64QI_UDI:
34951 case VOID_FTYPE_PCHAR_V32QI_USI:
34952 case VOID_FTYPE_PCHAR_V16QI_UHI:
34953 case VOID_FTYPE_PSHORT_V32HI_USI:
34954 case VOID_FTYPE_PSHORT_V16HI_UHI:
34955 case VOID_FTYPE_PSHORT_V8HI_UQI:
34956 case VOID_FTYPE_PINT_V16SI_UHI:
34957 case VOID_FTYPE_PINT_V8SI_UQI:
34958 case VOID_FTYPE_PINT_V4SI_UQI:
34959 case VOID_FTYPE_PINT64_V8DI_UQI:
34960 case VOID_FTYPE_PINT64_V4DI_UQI:
34961 case VOID_FTYPE_PINT64_V2DI_UQI:
34962 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
34963 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
34964 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
34965 case VOID_FTYPE_PFLOAT_V16SF_UHI:
34966 case VOID_FTYPE_PFLOAT_V8SF_UQI:
34967 case VOID_FTYPE_PFLOAT_V4SF_UQI:
34968 case VOID_FTYPE_PV32QI_V32HI_USI:
34969 case VOID_FTYPE_PV16QI_V16HI_UHI:
34970 case VOID_FTYPE_PV8QI_V8HI_UQI:
34971 nargs = 2;
34972 klass = store;
34973 /* Reserve memory operand for target. */
34974 memory = ARRAY_SIZE (args);
34975 break;
34976 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
34977 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
34978 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
34979 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
34980 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
34981 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
34982 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
34983 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
34984 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
34985 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
34986 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
34987 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
34988 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
34989 case V32HI_FTYPE_PCV32HI_V32HI_USI:
34990 case V32QI_FTYPE_PCV32QI_V32QI_USI:
34991 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
34992 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
34993 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
34994 switch (icode)
34996 /* These builtins and instructions require the memory
34997 to be properly aligned. */
34998 case CODE_FOR_avx512f_loadv16sf_mask:
34999 case CODE_FOR_avx512f_loadv16si_mask:
35000 case CODE_FOR_avx512f_loadv8df_mask:
35001 case CODE_FOR_avx512f_loadv8di_mask:
35002 case CODE_FOR_avx512vl_loadv8sf_mask:
35003 case CODE_FOR_avx512vl_loadv8si_mask:
35004 case CODE_FOR_avx512vl_loadv4df_mask:
35005 case CODE_FOR_avx512vl_loadv4di_mask:
35006 case CODE_FOR_avx512vl_loadv4sf_mask:
35007 case CODE_FOR_avx512vl_loadv4si_mask:
35008 case CODE_FOR_avx512vl_loadv2df_mask:
35009 case CODE_FOR_avx512vl_loadv2di_mask:
35010 case CODE_FOR_avx512bw_loadv64qi_mask:
35011 case CODE_FOR_avx512vl_loadv32qi_mask:
35012 case CODE_FOR_avx512vl_loadv16qi_mask:
35013 case CODE_FOR_avx512bw_loadv32hi_mask:
35014 case CODE_FOR_avx512vl_loadv16hi_mask:
35015 case CODE_FOR_avx512vl_loadv8hi_mask:
35016 aligned_mem = true;
35017 break;
35018 default:
35019 break;
35021 /* FALLTHRU */
35022 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35023 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35024 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35025 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35026 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35027 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35028 case V16SI_FTYPE_PCINT_V16SI_UHI:
35029 case V8SI_FTYPE_PCINT_V8SI_UQI:
35030 case V4SI_FTYPE_PCINT_V4SI_UQI:
35031 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35032 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35033 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35034 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35035 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35036 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35037 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35038 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35039 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35040 nargs = 3;
35041 klass = load;
35042 memory = 0;
35043 break;
35044 case VOID_FTYPE_UINT_UINT_UINT:
35045 case VOID_FTYPE_UINT64_UINT_UINT:
35046 case UCHAR_FTYPE_UINT_UINT_UINT:
35047 case UCHAR_FTYPE_UINT64_UINT_UINT:
35048 nargs = 3;
35049 klass = load;
35050 memory = ARRAY_SIZE (args);
35051 last_arg_constant = true;
35052 break;
35053 default:
35054 gcc_unreachable ();
35057 gcc_assert (nargs <= ARRAY_SIZE (args));
35059 if (klass == store)
35061 arg = CALL_EXPR_ARG (exp, 0);
35062 op = expand_normal (arg);
35063 gcc_assert (target == 0);
35064 if (memory)
35066 op = ix86_zero_extend_to_Pmode (op);
35067 target = gen_rtx_MEM (tmode, op);
35068 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35069 on it. Try to improve it using get_pointer_alignment,
35070 and if the special builtin is one that requires strict
35071 mode alignment, also from its GET_MODE_ALIGNMENT.
35072 Failure to do so could lead to ix86_legitimate_combined_insn
35073 rejecting all changes to such insns. */
35074 unsigned int align = get_pointer_alignment (arg);
35075 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35076 align = GET_MODE_ALIGNMENT (tmode);
35077 if (MEM_ALIGN (target) < align)
35078 set_mem_align (target, align);
35080 else
35081 target = force_reg (tmode, op);
35082 arg_adjust = 1;
35084 else
35086 arg_adjust = 0;
35087 if (optimize
35088 || target == 0
35089 || !register_operand (target, tmode)
35090 || GET_MODE (target) != tmode)
35091 target = gen_reg_rtx (tmode);
35094 for (i = 0; i < nargs; i++)
35096 machine_mode mode = insn_p->operand[i + 1].mode;
35097 bool match;
35099 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35100 op = expand_normal (arg);
35101 match = insn_p->operand[i + 1].predicate (op, mode);
35103 if (last_arg_constant && (i + 1) == nargs)
35105 if (!match)
35107 if (icode == CODE_FOR_lwp_lwpvalsi3
35108 || icode == CODE_FOR_lwp_lwpinssi3
35109 || icode == CODE_FOR_lwp_lwpvaldi3
35110 || icode == CODE_FOR_lwp_lwpinsdi3)
35111 error ("the last argument must be a 32-bit immediate");
35112 else
35113 error ("the last argument must be an 8-bit immediate");
35114 return const0_rtx;
35117 else
35119 if (i == memory)
35121 /* This must be the memory operand. */
35122 op = ix86_zero_extend_to_Pmode (op);
35123 op = gen_rtx_MEM (mode, op);
35124 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35125 on it. Try to improve it using get_pointer_alignment,
35126 and if the special builtin is one that requires strict
35127 mode alignment, also from its GET_MODE_ALIGNMENT.
35128 Failure to do so could lead to ix86_legitimate_combined_insn
35129 rejecting all changes to such insns. */
35130 unsigned int align = get_pointer_alignment (arg);
35131 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35132 align = GET_MODE_ALIGNMENT (mode);
35133 if (MEM_ALIGN (op) < align)
35134 set_mem_align (op, align);
35136 else
35138 /* This must be a register.  */
35139 if (VECTOR_MODE_P (mode))
35140 op = safe_vector_operand (op, mode);
35142 op = fixup_modeless_constant (op, mode);
35144 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35145 op = copy_to_mode_reg (mode, op);
35146 else
35148 op = copy_to_reg (op);
35149 op = lowpart_subreg (mode, op, GET_MODE (op));
35154 args[i].op = op;
35155 args[i].mode = mode;
35158 switch (nargs)
35160 case 0:
35161 pat = GEN_FCN (icode) (target);
35162 break;
35163 case 1:
35164 pat = GEN_FCN (icode) (target, args[0].op);
35165 break;
35166 case 2:
35167 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35168 break;
35169 case 3:
35170 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35171 break;
35172 default:
35173 gcc_unreachable ();
35176 if (! pat)
35177 return 0;
35178 emit_insn (pat);
35179 return klass == store ? 0 : target;
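/* A minimal caller-side sketch of a "store" klass special builtin that
   takes the aligned_mem path above, assuming the SSE2 _mm_stream_si128
   wrapper from <emmintrin.h> (VOID_FTYPE_PV2DI_V2DI with
   CODE_FOR_sse2_movntv2di, which is in the aligned-memory list):

     #include <emmintrin.h>

     void stream_store (__m128i *dst, __m128i v)
     {
       _mm_stream_si128 (dst, v);   // dst is expected to be 16-byte aligned
     }

   The expander builds the MEM for dst itself and raises MEM_ALIGN to
   GET_MODE_ALIGNMENT of the vector mode so later passes keep the insn valid.  */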
35182 /* Return the integer constant in ARG. Constrain it to be in the range
35183 of the subparts of VEC_TYPE; issue an error if not. */
35185 static int
35186 get_element_number (tree vec_type, tree arg)
35188 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
35190 if (!tree_fits_uhwi_p (arg)
35191 || (elt = tree_to_uhwi (arg), elt > max))
35193 error ("selector must be an integer constant in the range 0..%wi", max);
35194 return 0;
35197 return elt;
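/* For example (hypothetical caller), given a __v4sf value V,
   TYPE_VECTOR_SUBPARTS is 4, so the selector must be a constant in 0..3:

     float ok  = __builtin_ia32_vec_ext_v4sf (V, 2);   // accepted
     float bad = __builtin_ia32_vec_ext_v4sf (V, 7);   // "selector must be an
                                                       //  integer constant in
                                                       //  the range 0..3"
*/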
35200 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35201 ix86_expand_vector_init. We DO have language-level syntax for this, in
35202 the form of (type){ init-list }. Except that since we can't place emms
35203 instructions from inside the compiler, we can't allow the use of MMX
35204 registers unless the user explicitly asks for it. So we do *not* define
35205 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
35206 we have builtins invoked by mmintrin.h that give us license to emit
35207 these sorts of instructions. */
35209 static rtx
35210 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
35212 machine_mode tmode = TYPE_MODE (type);
35213 machine_mode inner_mode = GET_MODE_INNER (tmode);
35214 int i, n_elt = GET_MODE_NUNITS (tmode);
35215 rtvec v = rtvec_alloc (n_elt);
35217 gcc_assert (VECTOR_MODE_P (tmode));
35218 gcc_assert (call_expr_nargs (exp) == n_elt);
35220 for (i = 0; i < n_elt; ++i)
35222 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
35223 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
35226 if (!target || !register_operand (target, tmode))
35227 target = gen_reg_rtx (tmode);
35229 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
35230 return target;
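/* A minimal caller-side sketch, assuming GCC's mmintrin.h wrappers such as
   _mm_set_pi32, which are assumed here to funnel into
   __builtin_ia32_vec_init_v2si and hence into the expander above:

     #include <mmintrin.h>

     __m64 make_pair (int lo, int hi)
     {
       return _mm_set_pi32 (hi, lo);   // element 0 = lo, element 1 = hi
     }
*/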
35233 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35234 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
35235 had a language-level syntax for referencing vector elements. */
35237 static rtx
35238 ix86_expand_vec_ext_builtin (tree exp, rtx target)
35240 machine_mode tmode, mode0;
35241 tree arg0, arg1;
35242 int elt;
35243 rtx op0;
35245 arg0 = CALL_EXPR_ARG (exp, 0);
35246 arg1 = CALL_EXPR_ARG (exp, 1);
35248 op0 = expand_normal (arg0);
35249 elt = get_element_number (TREE_TYPE (arg0), arg1);
35251 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35252 mode0 = TYPE_MODE (TREE_TYPE (arg0));
35253 gcc_assert (VECTOR_MODE_P (mode0));
35255 op0 = force_reg (mode0, op0);
35257 if (optimize || !target || !register_operand (target, tmode))
35258 target = gen_reg_rtx (tmode);
35260 ix86_expand_vector_extract (true, target, op0, elt);
35262 return target;
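/* A minimal caller-side sketch, assuming the SSE2 _mm_extract_epi16 wrapper
   from <emmintrin.h>, which is assumed here to expand via
   __builtin_ia32_vec_ext_v8hi and therefore through the function above:

     #include <emmintrin.h>

     int lane3 (__m128i v)
     {
       return _mm_extract_epi16 (v, 3);   // selector must be a constant in 0..7
     }
*/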
35265 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35266 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
35267 a language-level syntax for referencing vector elements. */
35269 static rtx
35270 ix86_expand_vec_set_builtin (tree exp)
35272 machine_mode tmode, mode1;
35273 tree arg0, arg1, arg2;
35274 int elt;
35275 rtx op0, op1, target;
35277 arg0 = CALL_EXPR_ARG (exp, 0);
35278 arg1 = CALL_EXPR_ARG (exp, 1);
35279 arg2 = CALL_EXPR_ARG (exp, 2);
35281 tmode = TYPE_MODE (TREE_TYPE (arg0));
35282 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35283 gcc_assert (VECTOR_MODE_P (tmode));
35285 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
35286 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
35287 elt = get_element_number (TREE_TYPE (arg0), arg2);
35289 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
35290 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
35292 op0 = force_reg (tmode, op0);
35293 op1 = force_reg (mode1, op1);
35295 /* OP0 is the source of these builtin functions and shouldn't be
35296 modified. Create a copy, use it and return it as target. */
35297 target = gen_reg_rtx (tmode);
35298 emit_move_insn (target, op0);
35299 ix86_expand_vector_set (true, target, op1, elt);
35301 return target;
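/* A minimal caller-side sketch, assuming the SSE2 _mm_insert_epi16 wrapper
   from <emmintrin.h>, which is assumed here to expand via
   __builtin_ia32_vec_set_v8hi and therefore through the function above
   (note the input vector is copied first, so the caller's operand is never
   modified in place):

     #include <emmintrin.h>

     __m128i set_lane3 (__m128i v, int x)
     {
       return _mm_insert_epi16 (v, x, 3);
     }
*/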
35304 /* Emit conditional move of SRC to DST with condition
35305 OP1 CODE OP2. */
35306 static void
35307 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
35309 rtx t;
35311 if (TARGET_CMOVE)
35313 t = ix86_expand_compare (code, op1, op2);
35314 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
35315 src, dst)));
35317 else
35319 rtx_code_label *nomove = gen_label_rtx ();
35320 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
35321 const0_rtx, GET_MODE (op1), 1, nomove);
35322 emit_move_insn (dst, src);
35323 emit_label (nomove);
35327 /* Choose the max of DST and SRC and put it in DST.  */
35328 static void
35329 ix86_emit_move_max (rtx dst, rtx src)
35331 ix86_emit_cmove (dst, src, LTU, dst, src);
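/* In C terms, ix86_emit_move_max performs an unsigned maximum in place;
   a hypothetical scalar analogue:

     static inline unsigned long
     umax (unsigned long dst, unsigned long src)
     {
       return dst < src ? src : dst;   // LTU: DST is replaced when DST < SRC (unsigned)
     }

   The MPX expansions below rely on this both for lower bounds and for upper
   bounds, since upper bounds are stored in one's complement form.  */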
35334 /* Expand an expression EXP that calls a built-in function,
35335 with result going to TARGET if that's convenient
35336 (and in mode MODE if that's convenient).
35337 SUBTARGET may be used as the target for computing one of EXP's operands.
35338 IGNORE is nonzero if the value is to be ignored. */
35340 static rtx
35341 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
35342 machine_mode mode, int ignore)
35344 size_t i;
35345 enum insn_code icode, icode2;
35346 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
35347 tree arg0, arg1, arg2, arg3, arg4;
35348 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
35349 machine_mode mode0, mode1, mode2, mode3, mode4;
35350 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
35352 /* For CPU builtins that can be folded, fold first and expand the fold. */
35353 switch (fcode)
35355 case IX86_BUILTIN_CPU_INIT:
35357 /* Make it call __cpu_indicator_init in libgcc. */
35358 tree call_expr, fndecl, type;
35359 type = build_function_type_list (integer_type_node, NULL_TREE);
35360 fndecl = build_fn_decl ("__cpu_indicator_init", type);
35361 call_expr = build_call_expr (fndecl, 0);
35362 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
35364 case IX86_BUILTIN_CPU_IS:
35365 case IX86_BUILTIN_CPU_SUPPORTS:
35367 tree arg0 = CALL_EXPR_ARG (exp, 0);
35368 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
35369 gcc_assert (fold_expr != NULL_TREE);
35370 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
35374 /* Determine whether the builtin function is available under the current ISA.
35375 Originally the builtin was not created if it wasn't applicable to the
35376 current ISA based on the command line switches. With function specific
35377 options, we need to check in the context of the function making the call
35378 whether it is supported. Treat AVX512{VL,BW,F} and MMX specially. For
35379 other flags, if isa includes more than one ISA bit, treat those as
35380 requiring any of them. For AVX512VL, require both AVX512VL and the
35381 non-AVX512VL ISAs. Likewise for MMX, require both MMX and the non-MMX
35382 ISAs. Similarly for AVX512F and AVX512BW.
35383 Similarly for 64BIT, but we shouldn't be building such builtins
35384 at all, since -m64 is a whole-TU option.  */
35385 if (((ix86_builtins_isa[fcode].isa
35386 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35387 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_AVX512BW
35388 | OPTION_MASK_ISA_AVX512F))
35389 && !(ix86_builtins_isa[fcode].isa
35390 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35391 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_AVX512BW
35392 | OPTION_MASK_ISA_AVX512F)
35393 & ix86_isa_flags))
35394 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
35395 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
35396 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512BW)
35397 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512BW))
35398 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512F)
35399 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512F))
35400 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_MMX)
35401 && !(ix86_isa_flags & OPTION_MASK_ISA_MMX))
35402 || (ix86_builtins_isa[fcode].isa2
35403 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
35405 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
35406 ix86_builtins_isa[fcode].isa2, 0, 0,
35407 NULL, NULL, (enum fpmath_unit) 0,
35408 false);
35409 if (!opts)
35410 error ("%qE needs unknown isa option", fndecl);
35411 else
35413 gcc_assert (opts != NULL);
35414 error ("%qE needs isa option %s", fndecl, opts);
35415 free (opts);
35417 return expand_call (exp, target, ignore);
35420 switch (fcode)
35422 case IX86_BUILTIN_BNDMK:
35423 if (!target
35424 || GET_MODE (target) != BNDmode
35425 || !register_operand (target, BNDmode))
35426 target = gen_reg_rtx (BNDmode);
35428 arg0 = CALL_EXPR_ARG (exp, 0);
35429 arg1 = CALL_EXPR_ARG (exp, 1);
35431 op0 = expand_normal (arg0);
35432 op1 = expand_normal (arg1);
35434 if (!register_operand (op0, Pmode))
35435 op0 = ix86_zero_extend_to_Pmode (op0);
35436 if (!register_operand (op1, Pmode))
35437 op1 = ix86_zero_extend_to_Pmode (op1);
35439 /* Builtin arg1 is the size of the block, but instruction op1 should
35440 be (size - 1). */
35441 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
35442 NULL_RTX, 1, OPTAB_DIRECT);
35444 emit_insn (BNDmode == BND64mode
35445 ? gen_bnd64_mk (target, op0, op1)
35446 : gen_bnd32_mk (target, op0, op1));
35447 return target;
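/* Worked example (hypothetical value): for a requested block size of 64,
   op1 becomes 63, so the bounds created by bndmk cover [op0, op0 + 63],
   i.e. the last valid byte rather than the one-past-the-end address.  */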
35449 case IX86_BUILTIN_BNDSTX:
35450 arg0 = CALL_EXPR_ARG (exp, 0);
35451 arg1 = CALL_EXPR_ARG (exp, 1);
35452 arg2 = CALL_EXPR_ARG (exp, 2);
35454 op0 = expand_normal (arg0);
35455 op1 = expand_normal (arg1);
35456 op2 = expand_normal (arg2);
35458 if (!register_operand (op0, Pmode))
35459 op0 = ix86_zero_extend_to_Pmode (op0);
35460 if (!register_operand (op1, BNDmode))
35461 op1 = copy_to_mode_reg (BNDmode, op1);
35462 if (!register_operand (op2, Pmode))
35463 op2 = ix86_zero_extend_to_Pmode (op2);
35465 emit_insn (BNDmode == BND64mode
35466 ? gen_bnd64_stx (op2, op0, op1)
35467 : gen_bnd32_stx (op2, op0, op1));
35468 return 0;
35470 case IX86_BUILTIN_BNDLDX:
35471 if (!target
35472 || GET_MODE (target) != BNDmode
35473 || !register_operand (target, BNDmode))
35474 target = gen_reg_rtx (BNDmode);
35476 arg0 = CALL_EXPR_ARG (exp, 0);
35477 arg1 = CALL_EXPR_ARG (exp, 1);
35479 op0 = expand_normal (arg0);
35480 op1 = expand_normal (arg1);
35482 if (!register_operand (op0, Pmode))
35483 op0 = ix86_zero_extend_to_Pmode (op0);
35484 if (!register_operand (op1, Pmode))
35485 op1 = ix86_zero_extend_to_Pmode (op1);
35487 emit_insn (BNDmode == BND64mode
35488 ? gen_bnd64_ldx (target, op0, op1)
35489 : gen_bnd32_ldx (target, op0, op1));
35490 return target;
35492 case IX86_BUILTIN_BNDCL:
35493 arg0 = CALL_EXPR_ARG (exp, 0);
35494 arg1 = CALL_EXPR_ARG (exp, 1);
35496 op0 = expand_normal (arg0);
35497 op1 = expand_normal (arg1);
35499 if (!register_operand (op0, Pmode))
35500 op0 = ix86_zero_extend_to_Pmode (op0);
35501 if (!register_operand (op1, BNDmode))
35502 op1 = copy_to_mode_reg (BNDmode, op1);
35504 emit_insn (BNDmode == BND64mode
35505 ? gen_bnd64_cl (op1, op0)
35506 : gen_bnd32_cl (op1, op0));
35507 return 0;
35509 case IX86_BUILTIN_BNDCU:
35510 arg0 = CALL_EXPR_ARG (exp, 0);
35511 arg1 = CALL_EXPR_ARG (exp, 1);
35513 op0 = expand_normal (arg0);
35514 op1 = expand_normal (arg1);
35516 if (!register_operand (op0, Pmode))
35517 op0 = ix86_zero_extend_to_Pmode (op0);
35518 if (!register_operand (op1, BNDmode))
35519 op1 = copy_to_mode_reg (BNDmode, op1);
35521 emit_insn (BNDmode == BND64mode
35522 ? gen_bnd64_cu (op1, op0)
35523 : gen_bnd32_cu (op1, op0));
35524 return 0;
35526 case IX86_BUILTIN_BNDRET:
35527 arg0 = CALL_EXPR_ARG (exp, 0);
35528 target = chkp_get_rtl_bounds (arg0);
35530 /* If no bounds were specified for the returned value,
35531 then use INIT bounds.  This usually happens when
35532 some built-in function is expanded.  */
35533 if (!target)
35535 rtx t1 = gen_reg_rtx (Pmode);
35536 rtx t2 = gen_reg_rtx (Pmode);
35537 target = gen_reg_rtx (BNDmode);
35538 emit_move_insn (t1, const0_rtx);
35539 emit_move_insn (t2, constm1_rtx);
35540 emit_insn (BNDmode == BND64mode
35541 ? gen_bnd64_mk (target, t1, t2)
35542 : gen_bnd32_mk (target, t1, t2));
35545 gcc_assert (target && REG_P (target));
35546 return target;
35548 case IX86_BUILTIN_BNDNARROW:
35550 rtx m1, m1h1, m1h2, lb, ub, t1;
35552 /* Return value and lb. */
35553 arg0 = CALL_EXPR_ARG (exp, 0);
35554 /* Bounds. */
35555 arg1 = CALL_EXPR_ARG (exp, 1);
35556 /* Size. */
35557 arg2 = CALL_EXPR_ARG (exp, 2);
35559 lb = expand_normal (arg0);
35560 op1 = expand_normal (arg1);
35561 op2 = expand_normal (arg2);
35563 /* The size was passed, but we need to use (size - 1), just as for bndmk.  */
35564 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
35565 NULL_RTX, 1, OPTAB_DIRECT);
35567 /* Add LB to the size and invert to get UB.  */
35568 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
35569 op2, 1, OPTAB_DIRECT);
35570 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
35572 if (!register_operand (lb, Pmode))
35573 lb = ix86_zero_extend_to_Pmode (lb);
35574 if (!register_operand (ub, Pmode))
35575 ub = ix86_zero_extend_to_Pmode (ub);
35577 /* We need to move bounds to memory before any computations. */
35578 if (MEM_P (op1))
35579 m1 = op1;
35580 else
35582 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
35583 emit_move_insn (m1, op1);
35586 /* Generate mem expression to be used for access to LB and UB. */
35587 m1h1 = adjust_address (m1, Pmode, 0);
35588 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
35590 t1 = gen_reg_rtx (Pmode);
35592 /* Compute LB. */
35593 emit_move_insn (t1, m1h1);
35594 ix86_emit_move_max (t1, lb);
35595 emit_move_insn (m1h1, t1);
35597 /* Compute UB. UB is stored in 1's complement form. Therefore
35598 we also use max here. */
35599 emit_move_insn (t1, m1h2);
35600 ix86_emit_move_max (t1, ub);
35601 emit_move_insn (m1h2, t1);
35603 op2 = gen_reg_rtx (BNDmode);
35604 emit_move_insn (op2, m1);
35606 return chkp_join_splitted_slot (lb, op2);
35609 case IX86_BUILTIN_BNDINT:
35611 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
35613 if (!target
35614 || GET_MODE (target) != BNDmode
35615 || !register_operand (target, BNDmode))
35616 target = gen_reg_rtx (BNDmode);
35618 arg0 = CALL_EXPR_ARG (exp, 0);
35619 arg1 = CALL_EXPR_ARG (exp, 1);
35621 op0 = expand_normal (arg0);
35622 op1 = expand_normal (arg1);
35624 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
35625 rh1 = adjust_address (res, Pmode, 0);
35626 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
35628 /* Put the first bounds into temporaries.  */
35629 lb1 = gen_reg_rtx (Pmode);
35630 ub1 = gen_reg_rtx (Pmode);
35631 if (MEM_P (op0))
35633 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
35634 emit_move_insn (ub1, adjust_address (op0, Pmode,
35635 GET_MODE_SIZE (Pmode)));
35637 else
35639 emit_move_insn (res, op0);
35640 emit_move_insn (lb1, rh1);
35641 emit_move_insn (ub1, rh2);
35644 /* Put the second bounds into temporaries.  */
35645 lb2 = gen_reg_rtx (Pmode);
35646 ub2 = gen_reg_rtx (Pmode);
35647 if (MEM_P (op1))
35649 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
35650 emit_move_insn (ub2, adjust_address (op1, Pmode,
35651 GET_MODE_SIZE (Pmode)));
35653 else
35655 emit_move_insn (res, op1);
35656 emit_move_insn (lb2, rh1);
35657 emit_move_insn (ub2, rh2);
35660 /* Compute LB. */
35661 ix86_emit_move_max (lb1, lb2);
35662 emit_move_insn (rh1, lb1);
35664 /* Compute UB. UB is stored in 1's complement form. Therefore
35665 we also use max here. */
35666 ix86_emit_move_max (ub1, ub2);
35667 emit_move_insn (rh2, ub1);
35669 emit_move_insn (target, res);
35671 return target;
35674 case IX86_BUILTIN_SIZEOF:
35676 tree name;
35677 rtx symbol;
35679 if (!target
35680 || GET_MODE (target) != Pmode
35681 || !register_operand (target, Pmode))
35682 target = gen_reg_rtx (Pmode);
35684 arg0 = CALL_EXPR_ARG (exp, 0);
35685 gcc_assert (VAR_P (arg0));
35687 name = DECL_ASSEMBLER_NAME (arg0);
35688 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
35690 emit_insn (Pmode == SImode
35691 ? gen_move_size_reloc_si (target, symbol)
35692 : gen_move_size_reloc_di (target, symbol));
35694 return target;
35697 case IX86_BUILTIN_BNDLOWER:
35699 rtx mem, hmem;
35701 if (!target
35702 || GET_MODE (target) != Pmode
35703 || !register_operand (target, Pmode))
35704 target = gen_reg_rtx (Pmode);
35706 arg0 = CALL_EXPR_ARG (exp, 0);
35707 op0 = expand_normal (arg0);
35709 /* We need to move bounds to memory first. */
35710 if (MEM_P (op0))
35711 mem = op0;
35712 else
35714 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35715 emit_move_insn (mem, op0);
35718 /* Generate mem expression to access LB and load it. */
35719 hmem = adjust_address (mem, Pmode, 0);
35720 emit_move_insn (target, hmem);
35722 return target;
35725 case IX86_BUILTIN_BNDUPPER:
35727 rtx mem, hmem, res;
35729 if (!target
35730 || GET_MODE (target) != Pmode
35731 || !register_operand (target, Pmode))
35732 target = gen_reg_rtx (Pmode);
35734 arg0 = CALL_EXPR_ARG (exp, 0);
35735 op0 = expand_normal (arg0);
35737 /* We need to move bounds to memory first. */
35738 if (MEM_P (op0))
35739 mem = op0;
35740 else
35742 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35743 emit_move_insn (mem, op0);
35746 /* Generate mem expression to access UB. */
35747 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
35749 /* We need to invert all bits of UB.  */
35750 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
35752 if (res != target)
35753 emit_move_insn (target, res);
35755 return target;
35758 case IX86_BUILTIN_MASKMOVQ:
35759 case IX86_BUILTIN_MASKMOVDQU:
35760 icode = (fcode == IX86_BUILTIN_MASKMOVQ
35761 ? CODE_FOR_mmx_maskmovq
35762 : CODE_FOR_sse2_maskmovdqu);
35763 /* Note the arg order is different from the operand order. */
35764 arg1 = CALL_EXPR_ARG (exp, 0);
35765 arg2 = CALL_EXPR_ARG (exp, 1);
35766 arg0 = CALL_EXPR_ARG (exp, 2);
35767 op0 = expand_normal (arg0);
35768 op1 = expand_normal (arg1);
35769 op2 = expand_normal (arg2);
35770 mode0 = insn_data[icode].operand[0].mode;
35771 mode1 = insn_data[icode].operand[1].mode;
35772 mode2 = insn_data[icode].operand[2].mode;
35774 op0 = ix86_zero_extend_to_Pmode (op0);
35775 op0 = gen_rtx_MEM (mode1, op0);
35777 if (!insn_data[icode].operand[0].predicate (op0, mode0))
35778 op0 = copy_to_mode_reg (mode0, op0);
35779 if (!insn_data[icode].operand[1].predicate (op1, mode1))
35780 op1 = copy_to_mode_reg (mode1, op1);
35781 if (!insn_data[icode].operand[2].predicate (op2, mode2))
35782 op2 = copy_to_mode_reg (mode2, op2);
35783 pat = GEN_FCN (icode) (op0, op1, op2);
35784 if (! pat)
35785 return 0;
35786 emit_insn (pat);
35787 return 0;
35789 case IX86_BUILTIN_LDMXCSR:
35790 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
35791 target = assign_386_stack_local (SImode, SLOT_TEMP);
35792 emit_move_insn (target, op0);
35793 emit_insn (gen_sse_ldmxcsr (target));
35794 return 0;
35796 case IX86_BUILTIN_STMXCSR:
35797 target = assign_386_stack_local (SImode, SLOT_TEMP);
35798 emit_insn (gen_sse_stmxcsr (target));
35799 return copy_to_mode_reg (SImode, target);
35801 case IX86_BUILTIN_CLFLUSH:
35802 arg0 = CALL_EXPR_ARG (exp, 0);
35803 op0 = expand_normal (arg0);
35804 icode = CODE_FOR_sse2_clflush;
35805 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35806 op0 = ix86_zero_extend_to_Pmode (op0);
35808 emit_insn (gen_sse2_clflush (op0));
35809 return 0;
35811 case IX86_BUILTIN_CLWB:
35812 arg0 = CALL_EXPR_ARG (exp, 0);
35813 op0 = expand_normal (arg0);
35814 icode = CODE_FOR_clwb;
35815 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35816 op0 = ix86_zero_extend_to_Pmode (op0);
35818 emit_insn (gen_clwb (op0));
35819 return 0;
35821 case IX86_BUILTIN_CLFLUSHOPT:
35822 arg0 = CALL_EXPR_ARG (exp, 0);
35823 op0 = expand_normal (arg0);
35824 icode = CODE_FOR_clflushopt;
35825 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35826 op0 = ix86_zero_extend_to_Pmode (op0);
35828 emit_insn (gen_clflushopt (op0));
35829 return 0;
35831 case IX86_BUILTIN_MONITOR:
35832 case IX86_BUILTIN_MONITORX:
35833 arg0 = CALL_EXPR_ARG (exp, 0);
35834 arg1 = CALL_EXPR_ARG (exp, 1);
35835 arg2 = CALL_EXPR_ARG (exp, 2);
35836 op0 = expand_normal (arg0);
35837 op1 = expand_normal (arg1);
35838 op2 = expand_normal (arg2);
35839 if (!REG_P (op0))
35840 op0 = ix86_zero_extend_to_Pmode (op0);
35841 if (!REG_P (op1))
35842 op1 = copy_to_mode_reg (SImode, op1);
35843 if (!REG_P (op2))
35844 op2 = copy_to_mode_reg (SImode, op2);
35846 emit_insn (fcode == IX86_BUILTIN_MONITOR
35847 ? ix86_gen_monitor (op0, op1, op2)
35848 : ix86_gen_monitorx (op0, op1, op2));
35849 return 0;
35851 case IX86_BUILTIN_MWAIT:
35852 arg0 = CALL_EXPR_ARG (exp, 0);
35853 arg1 = CALL_EXPR_ARG (exp, 1);
35854 op0 = expand_normal (arg0);
35855 op1 = expand_normal (arg1);
35856 if (!REG_P (op0))
35857 op0 = copy_to_mode_reg (SImode, op0);
35858 if (!REG_P (op1))
35859 op1 = copy_to_mode_reg (SImode, op1);
35860 emit_insn (gen_sse3_mwait (op0, op1));
35861 return 0;
35863 case IX86_BUILTIN_MWAITX:
35864 arg0 = CALL_EXPR_ARG (exp, 0);
35865 arg1 = CALL_EXPR_ARG (exp, 1);
35866 arg2 = CALL_EXPR_ARG (exp, 2);
35867 op0 = expand_normal (arg0);
35868 op1 = expand_normal (arg1);
35869 op2 = expand_normal (arg2);
35870 if (!REG_P (op0))
35871 op0 = copy_to_mode_reg (SImode, op0);
35872 if (!REG_P (op1))
35873 op1 = copy_to_mode_reg (SImode, op1);
35874 if (!REG_P (op2))
35875 op2 = copy_to_mode_reg (SImode, op2);
35876 emit_insn (gen_mwaitx (op0, op1, op2));
35877 return 0;
35879 case IX86_BUILTIN_CLZERO:
35880 arg0 = CALL_EXPR_ARG (exp, 0);
35881 op0 = expand_normal (arg0);
35882 if (!REG_P (op0))
35883 op0 = ix86_zero_extend_to_Pmode (op0);
35884 emit_insn (ix86_gen_clzero (op0));
35885 return 0;
35887 case IX86_BUILTIN_VEC_INIT_V2SI:
35888 case IX86_BUILTIN_VEC_INIT_V4HI:
35889 case IX86_BUILTIN_VEC_INIT_V8QI:
35890 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
35892 case IX86_BUILTIN_VEC_EXT_V2DF:
35893 case IX86_BUILTIN_VEC_EXT_V2DI:
35894 case IX86_BUILTIN_VEC_EXT_V4SF:
35895 case IX86_BUILTIN_VEC_EXT_V4SI:
35896 case IX86_BUILTIN_VEC_EXT_V8HI:
35897 case IX86_BUILTIN_VEC_EXT_V2SI:
35898 case IX86_BUILTIN_VEC_EXT_V4HI:
35899 case IX86_BUILTIN_VEC_EXT_V16QI:
35900 return ix86_expand_vec_ext_builtin (exp, target);
35902 case IX86_BUILTIN_VEC_SET_V2DI:
35903 case IX86_BUILTIN_VEC_SET_V4SF:
35904 case IX86_BUILTIN_VEC_SET_V4SI:
35905 case IX86_BUILTIN_VEC_SET_V8HI:
35906 case IX86_BUILTIN_VEC_SET_V4HI:
35907 case IX86_BUILTIN_VEC_SET_V16QI:
35908 return ix86_expand_vec_set_builtin (exp);
35910 case IX86_BUILTIN_NANQ:
35911 case IX86_BUILTIN_NANSQ:
35912 return expand_call (exp, target, ignore);
35914 case IX86_BUILTIN_RDPMC:
35915 case IX86_BUILTIN_RDTSC:
35916 case IX86_BUILTIN_RDTSCP:
35917 case IX86_BUILTIN_XGETBV:
35919 op0 = gen_reg_rtx (DImode);
35920 op1 = gen_reg_rtx (DImode);
35922 if (fcode == IX86_BUILTIN_RDPMC)
35924 arg0 = CALL_EXPR_ARG (exp, 0);
35925 op2 = expand_normal (arg0);
35926 if (!register_operand (op2, SImode))
35927 op2 = copy_to_mode_reg (SImode, op2);
35929 insn = (TARGET_64BIT
35930 ? gen_rdpmc_rex64 (op0, op1, op2)
35931 : gen_rdpmc (op0, op2));
35932 emit_insn (insn);
35934 else if (fcode == IX86_BUILTIN_XGETBV)
35936 arg0 = CALL_EXPR_ARG (exp, 0);
35937 op2 = expand_normal (arg0);
35938 if (!register_operand (op2, SImode))
35939 op2 = copy_to_mode_reg (SImode, op2);
35941 insn = (TARGET_64BIT
35942 ? gen_xgetbv_rex64 (op0, op1, op2)
35943 : gen_xgetbv (op0, op2));
35944 emit_insn (insn);
35946 else if (fcode == IX86_BUILTIN_RDTSC)
35948 insn = (TARGET_64BIT
35949 ? gen_rdtsc_rex64 (op0, op1)
35950 : gen_rdtsc (op0));
35951 emit_insn (insn);
35953 else
35955 op2 = gen_reg_rtx (SImode);
35957 insn = (TARGET_64BIT
35958 ? gen_rdtscp_rex64 (op0, op1, op2)
35959 : gen_rdtscp (op0, op2));
35960 emit_insn (insn);
35962 arg0 = CALL_EXPR_ARG (exp, 0);
35963 op4 = expand_normal (arg0);
35964 if (!address_operand (op4, VOIDmode))
35966 op4 = convert_memory_address (Pmode, op4);
35967 op4 = copy_addr_to_reg (op4);
35969 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
35972 if (target == 0)
35974 /* mode is VOIDmode if __builtin_rd* has been called
35975 without an lhs.  */
35976 if (mode == VOIDmode)
35977 return target;
35978 target = gen_reg_rtx (mode);
35981 if (TARGET_64BIT)
35983 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
35984 op1, 1, OPTAB_DIRECT);
35985 op0 = expand_simple_binop (DImode, IOR, op0, op1,
35986 op0, 1, OPTAB_DIRECT);
35989 emit_move_insn (target, op0);
35990 return target;
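/* In C terms, the TARGET_64BIT combination above of the two DImode halves
   is simply (hypothetical scalar analogue of the EDX:EAX result pair):

     static inline unsigned long long
     combine_halves (unsigned long long lo, unsigned long long hi)
     {
       return lo | (hi << 32);
     }
*/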
35992 case IX86_BUILTIN_FXSAVE:
35993 case IX86_BUILTIN_FXRSTOR:
35994 case IX86_BUILTIN_FXSAVE64:
35995 case IX86_BUILTIN_FXRSTOR64:
35996 case IX86_BUILTIN_FNSTENV:
35997 case IX86_BUILTIN_FLDENV:
35998 mode0 = BLKmode;
35999 switch (fcode)
36001 case IX86_BUILTIN_FXSAVE:
36002 icode = CODE_FOR_fxsave;
36003 break;
36004 case IX86_BUILTIN_FXRSTOR:
36005 icode = CODE_FOR_fxrstor;
36006 break;
36007 case IX86_BUILTIN_FXSAVE64:
36008 icode = CODE_FOR_fxsave64;
36009 break;
36010 case IX86_BUILTIN_FXRSTOR64:
36011 icode = CODE_FOR_fxrstor64;
36012 break;
36013 case IX86_BUILTIN_FNSTENV:
36014 icode = CODE_FOR_fnstenv;
36015 break;
36016 case IX86_BUILTIN_FLDENV:
36017 icode = CODE_FOR_fldenv;
36018 break;
36019 default:
36020 gcc_unreachable ();
36023 arg0 = CALL_EXPR_ARG (exp, 0);
36024 op0 = expand_normal (arg0);
36026 if (!address_operand (op0, VOIDmode))
36028 op0 = convert_memory_address (Pmode, op0);
36029 op0 = copy_addr_to_reg (op0);
36031 op0 = gen_rtx_MEM (mode0, op0);
36033 pat = GEN_FCN (icode) (op0);
36034 if (pat)
36035 emit_insn (pat);
36036 return 0;
36038 case IX86_BUILTIN_XSETBV:
36039 arg0 = CALL_EXPR_ARG (exp, 0);
36040 arg1 = CALL_EXPR_ARG (exp, 1);
36041 op0 = expand_normal (arg0);
36042 op1 = expand_normal (arg1);
36044 if (!REG_P (op0))
36045 op0 = copy_to_mode_reg (SImode, op0);
36047 if (TARGET_64BIT)
36049 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36050 NULL, 1, OPTAB_DIRECT);
36052 op2 = gen_lowpart (SImode, op2);
36053 op1 = gen_lowpart (SImode, op1);
36054 if (!REG_P (op1))
36055 op1 = copy_to_mode_reg (SImode, op1);
36056 if (!REG_P (op2))
36057 op2 = copy_to_mode_reg (SImode, op2);
36058 icode = CODE_FOR_xsetbv_rex64;
36059 pat = GEN_FCN (icode) (op0, op1, op2);
36061 else
36063 if (!REG_P (op1))
36064 op1 = copy_to_mode_reg (DImode, op1);
36065 icode = CODE_FOR_xsetbv;
36066 pat = GEN_FCN (icode) (op0, op1);
36068 if (pat)
36069 emit_insn (pat);
36070 return 0;
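/* For xsetbv on 64-bit targets the mask is split into the EDX:EAX pair the
   instruction expects; in C terms (hypothetical scalar analogue):

     unsigned int lo = (unsigned int) mask;          // EAX
     unsigned int hi = (unsigned int) (mask >> 32);  // EDX
*/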
36072 case IX86_BUILTIN_XSAVE:
36073 case IX86_BUILTIN_XRSTOR:
36074 case IX86_BUILTIN_XSAVE64:
36075 case IX86_BUILTIN_XRSTOR64:
36076 case IX86_BUILTIN_XSAVEOPT:
36077 case IX86_BUILTIN_XSAVEOPT64:
36078 case IX86_BUILTIN_XSAVES:
36079 case IX86_BUILTIN_XRSTORS:
36080 case IX86_BUILTIN_XSAVES64:
36081 case IX86_BUILTIN_XRSTORS64:
36082 case IX86_BUILTIN_XSAVEC:
36083 case IX86_BUILTIN_XSAVEC64:
36084 arg0 = CALL_EXPR_ARG (exp, 0);
36085 arg1 = CALL_EXPR_ARG (exp, 1);
36086 op0 = expand_normal (arg0);
36087 op1 = expand_normal (arg1);
36089 if (!address_operand (op0, VOIDmode))
36091 op0 = convert_memory_address (Pmode, op0);
36092 op0 = copy_addr_to_reg (op0);
36094 op0 = gen_rtx_MEM (BLKmode, op0);
36096 op1 = force_reg (DImode, op1);
36098 if (TARGET_64BIT)
36100 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36101 NULL, 1, OPTAB_DIRECT);
36102 switch (fcode)
36104 case IX86_BUILTIN_XSAVE:
36105 icode = CODE_FOR_xsave_rex64;
36106 break;
36107 case IX86_BUILTIN_XRSTOR:
36108 icode = CODE_FOR_xrstor_rex64;
36109 break;
36110 case IX86_BUILTIN_XSAVE64:
36111 icode = CODE_FOR_xsave64;
36112 break;
36113 case IX86_BUILTIN_XRSTOR64:
36114 icode = CODE_FOR_xrstor64;
36115 break;
36116 case IX86_BUILTIN_XSAVEOPT:
36117 icode = CODE_FOR_xsaveopt_rex64;
36118 break;
36119 case IX86_BUILTIN_XSAVEOPT64:
36120 icode = CODE_FOR_xsaveopt64;
36121 break;
36122 case IX86_BUILTIN_XSAVES:
36123 icode = CODE_FOR_xsaves_rex64;
36124 break;
36125 case IX86_BUILTIN_XRSTORS:
36126 icode = CODE_FOR_xrstors_rex64;
36127 break;
36128 case IX86_BUILTIN_XSAVES64:
36129 icode = CODE_FOR_xsaves64;
36130 break;
36131 case IX86_BUILTIN_XRSTORS64:
36132 icode = CODE_FOR_xrstors64;
36133 break;
36134 case IX86_BUILTIN_XSAVEC:
36135 icode = CODE_FOR_xsavec_rex64;
36136 break;
36137 case IX86_BUILTIN_XSAVEC64:
36138 icode = CODE_FOR_xsavec64;
36139 break;
36140 default:
36141 gcc_unreachable ();
36144 op2 = gen_lowpart (SImode, op2);
36145 op1 = gen_lowpart (SImode, op1);
36146 pat = GEN_FCN (icode) (op0, op1, op2);
36148 else
36150 switch (fcode)
36152 case IX86_BUILTIN_XSAVE:
36153 icode = CODE_FOR_xsave;
36154 break;
36155 case IX86_BUILTIN_XRSTOR:
36156 icode = CODE_FOR_xrstor;
36157 break;
36158 case IX86_BUILTIN_XSAVEOPT:
36159 icode = CODE_FOR_xsaveopt;
36160 break;
36161 case IX86_BUILTIN_XSAVES:
36162 icode = CODE_FOR_xsaves;
36163 break;
36164 case IX86_BUILTIN_XRSTORS:
36165 icode = CODE_FOR_xrstors;
36166 break;
36167 case IX86_BUILTIN_XSAVEC:
36168 icode = CODE_FOR_xsavec;
36169 break;
36170 default:
36171 gcc_unreachable ();
36173 pat = GEN_FCN (icode) (op0, op1);
36176 if (pat)
36177 emit_insn (pat);
36178 return 0;
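/* A minimal sketch of what the 64-bit path above produces, assuming the
   usual EDX:EAX mask convention of the XSAVE family (the user-level
   intrinsic name is shown only for illustration, it is not defined here):

	unsigned long long mask = ...;
	_xsave (buf, mask);

   expands so that op1 = (SImode) mask and op2 = (SImode) (mask >> 32),
   matching the low/high mask operands of the *_rex64 patterns.  */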
36180 case IX86_BUILTIN_LLWPCB:
36181 arg0 = CALL_EXPR_ARG (exp, 0);
36182 op0 = expand_normal (arg0);
36183 icode = CODE_FOR_lwp_llwpcb;
36184 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36185 op0 = ix86_zero_extend_to_Pmode (op0);
36186 emit_insn (gen_lwp_llwpcb (op0));
36187 return 0;
36189 case IX86_BUILTIN_SLWPCB:
36190 icode = CODE_FOR_lwp_slwpcb;
36191 if (!target
36192 || !insn_data[icode].operand[0].predicate (target, Pmode))
36193 target = gen_reg_rtx (Pmode);
36194 emit_insn (gen_lwp_slwpcb (target));
36195 return target;
36197 case IX86_BUILTIN_BEXTRI32:
36198 case IX86_BUILTIN_BEXTRI64:
36199 arg0 = CALL_EXPR_ARG (exp, 0);
36200 arg1 = CALL_EXPR_ARG (exp, 1);
36201 op0 = expand_normal (arg0);
36202 op1 = expand_normal (arg1);
36203 icode = (fcode == IX86_BUILTIN_BEXTRI32
36204 ? CODE_FOR_tbm_bextri_si
36205 : CODE_FOR_tbm_bextri_di);
36206 if (!CONST_INT_P (op1))
36208 error ("last argument must be an immediate");
36209 return const0_rtx;
36211 else
36213 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
36214 unsigned char lsb_index = INTVAL (op1) & 0xFF;
36215 op1 = GEN_INT (length);
36216 op2 = GEN_INT (lsb_index);
36217 pat = GEN_FCN (icode) (target, op0, op1, op2);
36218 if (pat)
36219 emit_insn (pat);
36220 return target;
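/* The BEXTRI control immediate packs the field start in bits [7:0] and the
   field length in bits [15:8], exactly as decoded above.  As an
   illustrative call (builtin name shown only for illustration),

	__builtin_ia32_bextri_u32 (x, 0x0405)

   extracts 4 bits starting at bit 5 of x.  */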
36223 case IX86_BUILTIN_RDRAND16_STEP:
36224 icode = CODE_FOR_rdrandhi_1;
36225 mode0 = HImode;
36226 goto rdrand_step;
36228 case IX86_BUILTIN_RDRAND32_STEP:
36229 icode = CODE_FOR_rdrandsi_1;
36230 mode0 = SImode;
36231 goto rdrand_step;
36233 case IX86_BUILTIN_RDRAND64_STEP:
36234 icode = CODE_FOR_rdranddi_1;
36235 mode0 = DImode;
36237 rdrand_step:
36238 arg0 = CALL_EXPR_ARG (exp, 0);
36239 op1 = expand_normal (arg0);
36240 if (!address_operand (op1, VOIDmode))
36242 op1 = convert_memory_address (Pmode, op1);
36243 op1 = copy_addr_to_reg (op1);
36246 op0 = gen_reg_rtx (mode0);
36247 emit_insn (GEN_FCN (icode) (op0));
36249 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36251 op1 = gen_reg_rtx (SImode);
36252 emit_move_insn (op1, CONST1_RTX (SImode));
36254 /* Emit SImode conditional move. */
36255 if (mode0 == HImode)
36257 if (TARGET_ZERO_EXTEND_WITH_AND
36258 && optimize_function_for_speed_p (cfun))
36260 op2 = force_reg (SImode, const0_rtx);
36262 emit_insn (gen_movstricthi
36263 (gen_lowpart (HImode, op2), op0));
36265 else
36267 op2 = gen_reg_rtx (SImode);
36269 emit_insn (gen_zero_extendhisi2 (op2, op0));
36272 else if (mode0 == SImode)
36273 op2 = op0;
36274 else
36275 op2 = gen_rtx_SUBREG (SImode, op0, 0);
36277 if (target == 0
36278 || !register_operand (target, SImode))
36279 target = gen_reg_rtx (SImode);
36281 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
36282 const0_rtx);
36283 emit_insn (gen_rtx_SET (target,
36284 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
36285 return target;
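/* Sketch of the contract implemented above, using the conventional
   user-level name of the step intrinsic only for illustration:

	unsigned int r;
	int ok = _rdrand32_step (&r);

   The generated value is stored through the pointer argument, and the int
   return value is derived from the carry flag with an SImode conditional
   move, so OK is nonzero exactly when RDRAND reported success.  */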
36287 case IX86_BUILTIN_RDSEED16_STEP:
36288 icode = CODE_FOR_rdseedhi_1;
36289 mode0 = HImode;
36290 goto rdseed_step;
36292 case IX86_BUILTIN_RDSEED32_STEP:
36293 icode = CODE_FOR_rdseedsi_1;
36294 mode0 = SImode;
36295 goto rdseed_step;
36297 case IX86_BUILTIN_RDSEED64_STEP:
36298 icode = CODE_FOR_rdseeddi_1;
36299 mode0 = DImode;
36301 rdseed_step:
36302 arg0 = CALL_EXPR_ARG (exp, 0);
36303 op1 = expand_normal (arg0);
36304 if (!address_operand (op1, VOIDmode))
36306 op1 = convert_memory_address (Pmode, op1);
36307 op1 = copy_addr_to_reg (op1);
36310 op0 = gen_reg_rtx (mode0);
36311 emit_insn (GEN_FCN (icode) (op0));
36313 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36315 op2 = gen_reg_rtx (QImode);
36317 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
36318 const0_rtx);
36319 emit_insn (gen_rtx_SET (op2, pat));
36321 if (target == 0
36322 || !register_operand (target, SImode))
36323 target = gen_reg_rtx (SImode);
36325 emit_insn (gen_zero_extendqisi2 (target, op2));
36326 return target;
36328 case IX86_BUILTIN_SBB32:
36329 icode = CODE_FOR_subborrowsi;
36330 icode2 = CODE_FOR_subborrowsi_0;
36331 mode0 = SImode;
36332 mode1 = DImode;
36333 mode2 = CCmode;
36334 goto handlecarry;
36336 case IX86_BUILTIN_SBB64:
36337 icode = CODE_FOR_subborrowdi;
36338 icode2 = CODE_FOR_subborrowdi_0;
36339 mode0 = DImode;
36340 mode1 = TImode;
36341 mode2 = CCmode;
36342 goto handlecarry;
36344 case IX86_BUILTIN_ADDCARRYX32:
36345 icode = CODE_FOR_addcarrysi;
36346 icode2 = CODE_FOR_addcarrysi_0;
36347 mode0 = SImode;
36348 mode1 = DImode;
36349 mode2 = CCCmode;
36350 goto handlecarry;
36352 case IX86_BUILTIN_ADDCARRYX64:
36353 icode = CODE_FOR_addcarrydi;
36354 icode2 = CODE_FOR_addcarrydi_0;
36355 mode0 = DImode;
36356 mode1 = TImode;
36357 mode2 = CCCmode;
36359 handlecarry:
36360 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
36361 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
36362 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
36363 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
36365 op1 = expand_normal (arg0);
36366 if (!integer_zerop (arg0))
36367 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
36369 op2 = expand_normal (arg1);
36370 if (!register_operand (op2, mode0))
36371 op2 = copy_to_mode_reg (mode0, op2);
36373 op3 = expand_normal (arg2);
36374 if (!register_operand (op3, mode0))
36375 op3 = copy_to_mode_reg (mode0, op3);
36377 op4 = expand_normal (arg3);
36378 if (!address_operand (op4, VOIDmode))
36380 op4 = convert_memory_address (Pmode, op4);
36381 op4 = copy_addr_to_reg (op4);
36384 op0 = gen_reg_rtx (mode0);
36385 if (integer_zerop (arg0))
36387 /* If arg0 is 0, optimize right away into an add or sub
36388 instruction that sets CCCmode flags. */
36389 op1 = gen_rtx_REG (mode2, FLAGS_REG);
36390 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
36392 else
36394 /* Generate CF from input operand. */
36395 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
36397 /* Generate instruction that consumes CF. */
36398 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
36399 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
36400 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
36401 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
36404 /* Return current CF value. */
36405 if (target == 0)
36406 target = gen_reg_rtx (QImode);
36408 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
36409 emit_insn (gen_rtx_SET (target, pat));
36411 /* Store the result. */
36412 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
36414 return target;
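/* Summary of the carry handling above, with illustrative user-level names
   (as conventionally exposed by _addcarry_u32/_subborrow_u32; they are not
   defined in this file):

	unsigned int sum;
	unsigned char c_out = _addcarry_u32 (c_in, a, b, &sum);

   When the carry-in argument is a literal zero, the plain flag-setting
   add/sub pattern (icode2) is emitted.  Otherwise CF is first re-created
   from the QImode carry-in via addqi3_cconly_overflow and the
   carry-consuming pattern (icode) is used.  The resulting carry becomes
   the QImode return value and the wide result is stored through the
   pointer argument.  */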
36416 case IX86_BUILTIN_READ_FLAGS:
36417 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
36419 if (optimize
36420 || target == NULL_RTX
36421 || !nonimmediate_operand (target, word_mode)
36422 || GET_MODE (target) != word_mode)
36423 target = gen_reg_rtx (word_mode);
36425 emit_insn (gen_pop (target));
36426 return target;
36428 case IX86_BUILTIN_WRITE_FLAGS:
36430 arg0 = CALL_EXPR_ARG (exp, 0);
36431 op0 = expand_normal (arg0);
36432 if (!general_no_elim_operand (op0, word_mode))
36433 op0 = copy_to_mode_reg (word_mode, op0);
36435 emit_insn (gen_push (op0));
36436 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
36437 return 0;
36439 case IX86_BUILTIN_KTESTC8:
36440 icode = CODE_FOR_ktestqi;
36441 mode3 = CCCmode;
36442 goto kortest;
36444 case IX86_BUILTIN_KTESTZ8:
36445 icode = CODE_FOR_ktestqi;
36446 mode3 = CCZmode;
36447 goto kortest;
36449 case IX86_BUILTIN_KTESTC16:
36450 icode = CODE_FOR_ktesthi;
36451 mode3 = CCCmode;
36452 goto kortest;
36454 case IX86_BUILTIN_KTESTZ16:
36455 icode = CODE_FOR_ktesthi;
36456 mode3 = CCZmode;
36457 goto kortest;
36459 case IX86_BUILTIN_KTESTC32:
36460 icode = CODE_FOR_ktestsi;
36461 mode3 = CCCmode;
36462 goto kortest;
36464 case IX86_BUILTIN_KTESTZ32:
36465 icode = CODE_FOR_ktestsi;
36466 mode3 = CCZmode;
36467 goto kortest;
36469 case IX86_BUILTIN_KTESTC64:
36470 icode = CODE_FOR_ktestdi;
36471 mode3 = CCCmode;
36472 goto kortest;
36474 case IX86_BUILTIN_KTESTZ64:
36475 icode = CODE_FOR_ktestdi;
36476 mode3 = CCZmode;
36477 goto kortest;
36479 case IX86_BUILTIN_KORTESTC8:
36480 icode = CODE_FOR_kortestqi;
36481 mode3 = CCCmode;
36482 goto kortest;
36484 case IX86_BUILTIN_KORTESTZ8:
36485 icode = CODE_FOR_kortestqi;
36486 mode3 = CCZmode;
36487 goto kortest;
36489 case IX86_BUILTIN_KORTESTC16:
36490 icode = CODE_FOR_kortesthi;
36491 mode3 = CCCmode;
36492 goto kortest;
36494 case IX86_BUILTIN_KORTESTZ16:
36495 icode = CODE_FOR_kortesthi;
36496 mode3 = CCZmode;
36497 goto kortest;
36499 case IX86_BUILTIN_KORTESTC32:
36500 icode = CODE_FOR_kortestsi;
36501 mode3 = CCCmode;
36502 goto kortest;
36504 case IX86_BUILTIN_KORTESTZ32:
36505 icode = CODE_FOR_kortestsi;
36506 mode3 = CCZmode;
36507 goto kortest;
36509 case IX86_BUILTIN_KORTESTC64:
36510 icode = CODE_FOR_kortestdi;
36511 mode3 = CCCmode;
36512 goto kortest;
36514 case IX86_BUILTIN_KORTESTZ64:
36515 icode = CODE_FOR_kortestdi;
36516 mode3 = CCZmode;
36518 kortest:
36519 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
36520 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
36521 op0 = expand_normal (arg0);
36522 op1 = expand_normal (arg1);
36524 mode0 = insn_data[icode].operand[0].mode;
36525 mode1 = insn_data[icode].operand[1].mode;
36527 if (GET_MODE (op0) != VOIDmode)
36528 op0 = force_reg (GET_MODE (op0), op0);
36530 op0 = gen_lowpart (mode0, op0);
36532 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36533 op0 = copy_to_mode_reg (mode0, op0);
36535 if (GET_MODE (op1) != VOIDmode)
36536 op1 = force_reg (GET_MODE (op1), op1);
36538 op1 = gen_lowpart (mode1, op1);
36540 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36541 op1 = copy_to_mode_reg (mode1, op1);
36543 target = gen_reg_rtx (QImode);
36545 /* Emit kortest. */
36546 emit_insn (GEN_FCN (icode) (op0, op1));
36547 /* And use setcc to return result from flags. */
36548 ix86_expand_setcc (target, EQ,
36549 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
36550 return target;
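/* In the cases above, CCZmode selects a setcc on the zero flag and CCCmode
   one on the carry flag set by the KTEST/KORTEST instruction, matching the
   *z and *c intrinsic variants; per the instruction definitions, KORTEST
   sets ZF when the OR of the two masks is zero and CF when it is all ones.
   ix86_expand_setcc then turns the chosen flag into a 0/1 QImode value.  */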
36552 case IX86_BUILTIN_GATHERSIV2DF:
36553 icode = CODE_FOR_avx2_gathersiv2df;
36554 goto gather_gen;
36555 case IX86_BUILTIN_GATHERSIV4DF:
36556 icode = CODE_FOR_avx2_gathersiv4df;
36557 goto gather_gen;
36558 case IX86_BUILTIN_GATHERDIV2DF:
36559 icode = CODE_FOR_avx2_gatherdiv2df;
36560 goto gather_gen;
36561 case IX86_BUILTIN_GATHERDIV4DF:
36562 icode = CODE_FOR_avx2_gatherdiv4df;
36563 goto gather_gen;
36564 case IX86_BUILTIN_GATHERSIV4SF:
36565 icode = CODE_FOR_avx2_gathersiv4sf;
36566 goto gather_gen;
36567 case IX86_BUILTIN_GATHERSIV8SF:
36568 icode = CODE_FOR_avx2_gathersiv8sf;
36569 goto gather_gen;
36570 case IX86_BUILTIN_GATHERDIV4SF:
36571 icode = CODE_FOR_avx2_gatherdiv4sf;
36572 goto gather_gen;
36573 case IX86_BUILTIN_GATHERDIV8SF:
36574 icode = CODE_FOR_avx2_gatherdiv8sf;
36575 goto gather_gen;
36576 case IX86_BUILTIN_GATHERSIV2DI:
36577 icode = CODE_FOR_avx2_gathersiv2di;
36578 goto gather_gen;
36579 case IX86_BUILTIN_GATHERSIV4DI:
36580 icode = CODE_FOR_avx2_gathersiv4di;
36581 goto gather_gen;
36582 case IX86_BUILTIN_GATHERDIV2DI:
36583 icode = CODE_FOR_avx2_gatherdiv2di;
36584 goto gather_gen;
36585 case IX86_BUILTIN_GATHERDIV4DI:
36586 icode = CODE_FOR_avx2_gatherdiv4di;
36587 goto gather_gen;
36588 case IX86_BUILTIN_GATHERSIV4SI:
36589 icode = CODE_FOR_avx2_gathersiv4si;
36590 goto gather_gen;
36591 case IX86_BUILTIN_GATHERSIV8SI:
36592 icode = CODE_FOR_avx2_gathersiv8si;
36593 goto gather_gen;
36594 case IX86_BUILTIN_GATHERDIV4SI:
36595 icode = CODE_FOR_avx2_gatherdiv4si;
36596 goto gather_gen;
36597 case IX86_BUILTIN_GATHERDIV8SI:
36598 icode = CODE_FOR_avx2_gatherdiv8si;
36599 goto gather_gen;
36600 case IX86_BUILTIN_GATHERALTSIV4DF:
36601 icode = CODE_FOR_avx2_gathersiv4df;
36602 goto gather_gen;
36603 case IX86_BUILTIN_GATHERALTDIV8SF:
36604 icode = CODE_FOR_avx2_gatherdiv8sf;
36605 goto gather_gen;
36606 case IX86_BUILTIN_GATHERALTSIV4DI:
36607 icode = CODE_FOR_avx2_gathersiv4di;
36608 goto gather_gen;
36609 case IX86_BUILTIN_GATHERALTDIV8SI:
36610 icode = CODE_FOR_avx2_gatherdiv8si;
36611 goto gather_gen;
36612 case IX86_BUILTIN_GATHER3SIV16SF:
36613 icode = CODE_FOR_avx512f_gathersiv16sf;
36614 goto gather_gen;
36615 case IX86_BUILTIN_GATHER3SIV8DF:
36616 icode = CODE_FOR_avx512f_gathersiv8df;
36617 goto gather_gen;
36618 case IX86_BUILTIN_GATHER3DIV16SF:
36619 icode = CODE_FOR_avx512f_gatherdiv16sf;
36620 goto gather_gen;
36621 case IX86_BUILTIN_GATHER3DIV8DF:
36622 icode = CODE_FOR_avx512f_gatherdiv8df;
36623 goto gather_gen;
36624 case IX86_BUILTIN_GATHER3SIV16SI:
36625 icode = CODE_FOR_avx512f_gathersiv16si;
36626 goto gather_gen;
36627 case IX86_BUILTIN_GATHER3SIV8DI:
36628 icode = CODE_FOR_avx512f_gathersiv8di;
36629 goto gather_gen;
36630 case IX86_BUILTIN_GATHER3DIV16SI:
36631 icode = CODE_FOR_avx512f_gatherdiv16si;
36632 goto gather_gen;
36633 case IX86_BUILTIN_GATHER3DIV8DI:
36634 icode = CODE_FOR_avx512f_gatherdiv8di;
36635 goto gather_gen;
36636 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36637 icode = CODE_FOR_avx512f_gathersiv8df;
36638 goto gather_gen;
36639 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36640 icode = CODE_FOR_avx512f_gatherdiv16sf;
36641 goto gather_gen;
36642 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36643 icode = CODE_FOR_avx512f_gathersiv8di;
36644 goto gather_gen;
36645 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36646 icode = CODE_FOR_avx512f_gatherdiv16si;
36647 goto gather_gen;
36648 case IX86_BUILTIN_GATHER3SIV2DF:
36649 icode = CODE_FOR_avx512vl_gathersiv2df;
36650 goto gather_gen;
36651 case IX86_BUILTIN_GATHER3SIV4DF:
36652 icode = CODE_FOR_avx512vl_gathersiv4df;
36653 goto gather_gen;
36654 case IX86_BUILTIN_GATHER3DIV2DF:
36655 icode = CODE_FOR_avx512vl_gatherdiv2df;
36656 goto gather_gen;
36657 case IX86_BUILTIN_GATHER3DIV4DF:
36658 icode = CODE_FOR_avx512vl_gatherdiv4df;
36659 goto gather_gen;
36660 case IX86_BUILTIN_GATHER3SIV4SF:
36661 icode = CODE_FOR_avx512vl_gathersiv4sf;
36662 goto gather_gen;
36663 case IX86_BUILTIN_GATHER3SIV8SF:
36664 icode = CODE_FOR_avx512vl_gathersiv8sf;
36665 goto gather_gen;
36666 case IX86_BUILTIN_GATHER3DIV4SF:
36667 icode = CODE_FOR_avx512vl_gatherdiv4sf;
36668 goto gather_gen;
36669 case IX86_BUILTIN_GATHER3DIV8SF:
36670 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36671 goto gather_gen;
36672 case IX86_BUILTIN_GATHER3SIV2DI:
36673 icode = CODE_FOR_avx512vl_gathersiv2di;
36674 goto gather_gen;
36675 case IX86_BUILTIN_GATHER3SIV4DI:
36676 icode = CODE_FOR_avx512vl_gathersiv4di;
36677 goto gather_gen;
36678 case IX86_BUILTIN_GATHER3DIV2DI:
36679 icode = CODE_FOR_avx512vl_gatherdiv2di;
36680 goto gather_gen;
36681 case IX86_BUILTIN_GATHER3DIV4DI:
36682 icode = CODE_FOR_avx512vl_gatherdiv4di;
36683 goto gather_gen;
36684 case IX86_BUILTIN_GATHER3SIV4SI:
36685 icode = CODE_FOR_avx512vl_gathersiv4si;
36686 goto gather_gen;
36687 case IX86_BUILTIN_GATHER3SIV8SI:
36688 icode = CODE_FOR_avx512vl_gathersiv8si;
36689 goto gather_gen;
36690 case IX86_BUILTIN_GATHER3DIV4SI:
36691 icode = CODE_FOR_avx512vl_gatherdiv4si;
36692 goto gather_gen;
36693 case IX86_BUILTIN_GATHER3DIV8SI:
36694 icode = CODE_FOR_avx512vl_gatherdiv8si;
36695 goto gather_gen;
36696 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36697 icode = CODE_FOR_avx512vl_gathersiv4df;
36698 goto gather_gen;
36699 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36700 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36701 goto gather_gen;
36702 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36703 icode = CODE_FOR_avx512vl_gathersiv4di;
36704 goto gather_gen;
36705 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36706 icode = CODE_FOR_avx512vl_gatherdiv8si;
36707 goto gather_gen;
36708 case IX86_BUILTIN_SCATTERSIV16SF:
36709 icode = CODE_FOR_avx512f_scattersiv16sf;
36710 goto scatter_gen;
36711 case IX86_BUILTIN_SCATTERSIV8DF:
36712 icode = CODE_FOR_avx512f_scattersiv8df;
36713 goto scatter_gen;
36714 case IX86_BUILTIN_SCATTERDIV16SF:
36715 icode = CODE_FOR_avx512f_scatterdiv16sf;
36716 goto scatter_gen;
36717 case IX86_BUILTIN_SCATTERDIV8DF:
36718 icode = CODE_FOR_avx512f_scatterdiv8df;
36719 goto scatter_gen;
36720 case IX86_BUILTIN_SCATTERSIV16SI:
36721 icode = CODE_FOR_avx512f_scattersiv16si;
36722 goto scatter_gen;
36723 case IX86_BUILTIN_SCATTERSIV8DI:
36724 icode = CODE_FOR_avx512f_scattersiv8di;
36725 goto scatter_gen;
36726 case IX86_BUILTIN_SCATTERDIV16SI:
36727 icode = CODE_FOR_avx512f_scatterdiv16si;
36728 goto scatter_gen;
36729 case IX86_BUILTIN_SCATTERDIV8DI:
36730 icode = CODE_FOR_avx512f_scatterdiv8di;
36731 goto scatter_gen;
36732 case IX86_BUILTIN_SCATTERSIV8SF:
36733 icode = CODE_FOR_avx512vl_scattersiv8sf;
36734 goto scatter_gen;
36735 case IX86_BUILTIN_SCATTERSIV4SF:
36736 icode = CODE_FOR_avx512vl_scattersiv4sf;
36737 goto scatter_gen;
36738 case IX86_BUILTIN_SCATTERSIV4DF:
36739 icode = CODE_FOR_avx512vl_scattersiv4df;
36740 goto scatter_gen;
36741 case IX86_BUILTIN_SCATTERSIV2DF:
36742 icode = CODE_FOR_avx512vl_scattersiv2df;
36743 goto scatter_gen;
36744 case IX86_BUILTIN_SCATTERDIV8SF:
36745 icode = CODE_FOR_avx512vl_scatterdiv8sf;
36746 goto scatter_gen;
36747 case IX86_BUILTIN_SCATTERDIV4SF:
36748 icode = CODE_FOR_avx512vl_scatterdiv4sf;
36749 goto scatter_gen;
36750 case IX86_BUILTIN_SCATTERDIV4DF:
36751 icode = CODE_FOR_avx512vl_scatterdiv4df;
36752 goto scatter_gen;
36753 case IX86_BUILTIN_SCATTERDIV2DF:
36754 icode = CODE_FOR_avx512vl_scatterdiv2df;
36755 goto scatter_gen;
36756 case IX86_BUILTIN_SCATTERSIV8SI:
36757 icode = CODE_FOR_avx512vl_scattersiv8si;
36758 goto scatter_gen;
36759 case IX86_BUILTIN_SCATTERSIV4SI:
36760 icode = CODE_FOR_avx512vl_scattersiv4si;
36761 goto scatter_gen;
36762 case IX86_BUILTIN_SCATTERSIV4DI:
36763 icode = CODE_FOR_avx512vl_scattersiv4di;
36764 goto scatter_gen;
36765 case IX86_BUILTIN_SCATTERSIV2DI:
36766 icode = CODE_FOR_avx512vl_scattersiv2di;
36767 goto scatter_gen;
36768 case IX86_BUILTIN_SCATTERDIV8SI:
36769 icode = CODE_FOR_avx512vl_scatterdiv8si;
36770 goto scatter_gen;
36771 case IX86_BUILTIN_SCATTERDIV4SI:
36772 icode = CODE_FOR_avx512vl_scatterdiv4si;
36773 goto scatter_gen;
36774 case IX86_BUILTIN_SCATTERDIV4DI:
36775 icode = CODE_FOR_avx512vl_scatterdiv4di;
36776 goto scatter_gen;
36777 case IX86_BUILTIN_SCATTERDIV2DI:
36778 icode = CODE_FOR_avx512vl_scatterdiv2di;
36779 goto scatter_gen;
36780 case IX86_BUILTIN_GATHERPFDPD:
36781 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
36782 goto vec_prefetch_gen;
36783 case IX86_BUILTIN_SCATTERALTSIV8DF:
36784 icode = CODE_FOR_avx512f_scattersiv8df;
36785 goto scatter_gen;
36786 case IX86_BUILTIN_SCATTERALTDIV16SF:
36787 icode = CODE_FOR_avx512f_scatterdiv16sf;
36788 goto scatter_gen;
36789 case IX86_BUILTIN_SCATTERALTSIV8DI:
36790 icode = CODE_FOR_avx512f_scattersiv8di;
36791 goto scatter_gen;
36792 case IX86_BUILTIN_SCATTERALTDIV16SI:
36793 icode = CODE_FOR_avx512f_scatterdiv16si;
36794 goto scatter_gen;
36795 case IX86_BUILTIN_GATHERPFDPS:
36796 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
36797 goto vec_prefetch_gen;
36798 case IX86_BUILTIN_GATHERPFQPD:
36799 icode = CODE_FOR_avx512pf_gatherpfv8didf;
36800 goto vec_prefetch_gen;
36801 case IX86_BUILTIN_GATHERPFQPS:
36802 icode = CODE_FOR_avx512pf_gatherpfv8disf;
36803 goto vec_prefetch_gen;
36804 case IX86_BUILTIN_SCATTERPFDPD:
36805 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
36806 goto vec_prefetch_gen;
36807 case IX86_BUILTIN_SCATTERPFDPS:
36808 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
36809 goto vec_prefetch_gen;
36810 case IX86_BUILTIN_SCATTERPFQPD:
36811 icode = CODE_FOR_avx512pf_scatterpfv8didf;
36812 goto vec_prefetch_gen;
36813 case IX86_BUILTIN_SCATTERPFQPS:
36814 icode = CODE_FOR_avx512pf_scatterpfv8disf;
36815 goto vec_prefetch_gen;
36817 gather_gen:
36818 rtx half;
36819 rtx (*gen) (rtx, rtx);
36821 arg0 = CALL_EXPR_ARG (exp, 0);
36822 arg1 = CALL_EXPR_ARG (exp, 1);
36823 arg2 = CALL_EXPR_ARG (exp, 2);
36824 arg3 = CALL_EXPR_ARG (exp, 3);
36825 arg4 = CALL_EXPR_ARG (exp, 4);
36826 op0 = expand_normal (arg0);
36827 op1 = expand_normal (arg1);
36828 op2 = expand_normal (arg2);
36829 op3 = expand_normal (arg3);
36830 op4 = expand_normal (arg4);
36831 /* Note the arg order is different from the operand order. */
36832 mode0 = insn_data[icode].operand[1].mode;
36833 mode2 = insn_data[icode].operand[3].mode;
36834 mode3 = insn_data[icode].operand[4].mode;
36835 mode4 = insn_data[icode].operand[5].mode;
36837 if (target == NULL_RTX
36838 || GET_MODE (target) != insn_data[icode].operand[0].mode
36839 || !insn_data[icode].operand[0].predicate (target,
36840 GET_MODE (target)))
36841 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
36842 else
36843 subtarget = target;
36845 switch (fcode)
36847 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36848 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36849 half = gen_reg_rtx (V8SImode);
36850 if (!nonimmediate_operand (op2, V16SImode))
36851 op2 = copy_to_mode_reg (V16SImode, op2);
36852 emit_insn (gen_vec_extract_lo_v16si (half, op2));
36853 op2 = half;
36854 break;
36855 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36856 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36857 case IX86_BUILTIN_GATHERALTSIV4DF:
36858 case IX86_BUILTIN_GATHERALTSIV4DI:
36859 half = gen_reg_rtx (V4SImode);
36860 if (!nonimmediate_operand (op2, V8SImode))
36861 op2 = copy_to_mode_reg (V8SImode, op2);
36862 emit_insn (gen_vec_extract_lo_v8si (half, op2));
36863 op2 = half;
36864 break;
36865 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36866 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36867 half = gen_reg_rtx (mode0);
36868 if (mode0 == V8SFmode)
36869 gen = gen_vec_extract_lo_v16sf;
36870 else
36871 gen = gen_vec_extract_lo_v16si;
36872 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36873 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36874 emit_insn (gen (half, op0));
36875 op0 = half;
36876 if (GET_MODE (op3) != VOIDmode)
36878 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36879 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36880 emit_insn (gen (half, op3));
36881 op3 = half;
36883 break;
36884 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36885 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36886 case IX86_BUILTIN_GATHERALTDIV8SF:
36887 case IX86_BUILTIN_GATHERALTDIV8SI:
36888 half = gen_reg_rtx (mode0);
36889 if (mode0 == V4SFmode)
36890 gen = gen_vec_extract_lo_v8sf;
36891 else
36892 gen = gen_vec_extract_lo_v8si;
36893 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36894 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36895 emit_insn (gen (half, op0));
36896 op0 = half;
36897 if (GET_MODE (op3) != VOIDmode)
36899 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36900 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36901 emit_insn (gen (half, op3));
36902 op3 = half;
36904 break;
36905 default:
36906 break;
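/* The *ALT* variants handled above pair an index vector and a data vector
   of different element counts; only the low half of the wider operand is
   meaningful, so it is extracted with a vec_extract_lo_* pattern before
   the gather insn is generated (e.g. a V16SI index is narrowed to V8SI for
   an 8-element DF/DI gather).  */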
36909 /* Force the memory operand to use only a base register here; we
36910 don't want to do this to the memory operands of other builtin
36911 functions. */
36912 op1 = ix86_zero_extend_to_Pmode (op1);
36914 if (!insn_data[icode].operand[1].predicate (op0, mode0))
36915 op0 = copy_to_mode_reg (mode0, op0);
36916 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
36917 op1 = copy_to_mode_reg (Pmode, op1);
36918 if (!insn_data[icode].operand[3].predicate (op2, mode2))
36919 op2 = copy_to_mode_reg (mode2, op2);
36921 op3 = fixup_modeless_constant (op3, mode3);
36923 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
36925 if (!insn_data[icode].operand[4].predicate (op3, mode3))
36926 op3 = copy_to_mode_reg (mode3, op3);
36928 else
36930 op3 = copy_to_reg (op3);
36931 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
36933 if (!insn_data[icode].operand[5].predicate (op4, mode4))
36935 error ("the last argument must be scale 1, 2, 4, 8");
36936 return const0_rtx;
36939 /* Optimize. If the mask is known to have the sign bit set in
36940 every element, replace op0 with pc_rtx to signal that the
36941 instruction overwrites the whole destination and doesn't use
36942 its previous contents. */
36943 if (optimize)
36945 if (TREE_CODE (arg3) == INTEGER_CST)
36947 if (integer_all_onesp (arg3))
36948 op0 = pc_rtx;
36950 else if (TREE_CODE (arg3) == VECTOR_CST)
36952 unsigned int negative = 0;
36953 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
36955 tree cst = VECTOR_CST_ELT (arg3, i);
36956 if (TREE_CODE (cst) == INTEGER_CST
36957 && tree_int_cst_sign_bit (cst))
36958 negative++;
36959 else if (TREE_CODE (cst) == REAL_CST
36960 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
36961 negative++;
36963 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
36964 op0 = pc_rtx;
36966 else if (TREE_CODE (arg3) == SSA_NAME
36967 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
36969 /* Recognize also when mask is like:
36970 __v2df src = _mm_setzero_pd ();
36971 __v2df mask = _mm_cmpeq_pd (src, src);
36973 __v8sf src = _mm256_setzero_ps ();
36974 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
36975 as that is a cheaper way to load all ones into
36976 a register than having to load a constant from
36977 memory. */
36978 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
36979 if (is_gimple_call (def_stmt))
36981 tree fndecl = gimple_call_fndecl (def_stmt);
36982 if (fndecl
36983 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
36984 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
36986 case IX86_BUILTIN_CMPPD:
36987 case IX86_BUILTIN_CMPPS:
36988 case IX86_BUILTIN_CMPPD256:
36989 case IX86_BUILTIN_CMPPS256:
36990 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
36991 break;
36992 /* FALLTHRU */
36993 case IX86_BUILTIN_CMPEQPD:
36994 case IX86_BUILTIN_CMPEQPS:
36995 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
36996 && initializer_zerop (gimple_call_arg (def_stmt,
36997 1)))
36998 op0 = pc_rtx;
36999 break;
37000 default:
37001 break;
37007 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37008 if (! pat)
37009 return const0_rtx;
37010 emit_insn (pat);
37012 switch (fcode)
37014 case IX86_BUILTIN_GATHER3DIV16SF:
37015 if (target == NULL_RTX)
37016 target = gen_reg_rtx (V8SFmode);
37017 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37018 break;
37019 case IX86_BUILTIN_GATHER3DIV16SI:
37020 if (target == NULL_RTX)
37021 target = gen_reg_rtx (V8SImode);
37022 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37023 break;
37024 case IX86_BUILTIN_GATHER3DIV8SF:
37025 case IX86_BUILTIN_GATHERDIV8SF:
37026 if (target == NULL_RTX)
37027 target = gen_reg_rtx (V4SFmode);
37028 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37029 break;
37030 case IX86_BUILTIN_GATHER3DIV8SI:
37031 case IX86_BUILTIN_GATHERDIV8SI:
37032 if (target == NULL_RTX)
37033 target = gen_reg_rtx (V4SImode);
37034 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37035 break;
37036 default:
37037 target = subtarget;
37038 break;
37040 return target;
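/* For the DIV*SF/DIV*SI cases just handled, a gather with 64-bit indices
   fills only half of the pattern's destination vector with useful
   elements, so the full-width SUBTARGET is reduced to the user-visible
   TARGET with a vec_extract_lo_* (e.g. the v16sf pattern result is
   narrowed to V8SF).  */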
37042 scatter_gen:
37043 arg0 = CALL_EXPR_ARG (exp, 0);
37044 arg1 = CALL_EXPR_ARG (exp, 1);
37045 arg2 = CALL_EXPR_ARG (exp, 2);
37046 arg3 = CALL_EXPR_ARG (exp, 3);
37047 arg4 = CALL_EXPR_ARG (exp, 4);
37048 op0 = expand_normal (arg0);
37049 op1 = expand_normal (arg1);
37050 op2 = expand_normal (arg2);
37051 op3 = expand_normal (arg3);
37052 op4 = expand_normal (arg4);
37053 mode1 = insn_data[icode].operand[1].mode;
37054 mode2 = insn_data[icode].operand[2].mode;
37055 mode3 = insn_data[icode].operand[3].mode;
37056 mode4 = insn_data[icode].operand[4].mode;
37058 /* Scatter instruction stores operand op3 to memory with
37059 indices from op2 and scale from op4 under writemask op1.
37060 If index operand op2 has more elements than source operand
37061 op3, only its low half needs to be used, and vice versa. */
37062 switch (fcode)
37064 case IX86_BUILTIN_SCATTERALTSIV8DF:
37065 case IX86_BUILTIN_SCATTERALTSIV8DI:
37066 half = gen_reg_rtx (V8SImode);
37067 if (!nonimmediate_operand (op2, V16SImode))
37068 op2 = copy_to_mode_reg (V16SImode, op2);
37069 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37070 op2 = half;
37071 break;
37072 case IX86_BUILTIN_SCATTERALTDIV16SF:
37073 case IX86_BUILTIN_SCATTERALTDIV16SI:
37074 half = gen_reg_rtx (mode3);
37075 if (mode3 == V8SFmode)
37076 gen = gen_vec_extract_lo_v16sf;
37077 else
37078 gen = gen_vec_extract_lo_v16si;
37079 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37080 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37081 emit_insn (gen (half, op3));
37082 op3 = half;
37083 break;
37084 default:
37085 break;
37088 /* Force the memory operand to use only a base register here; we
37089 don't want to do this to the memory operands of other builtin
37090 functions. */
37091 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37093 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37094 op0 = copy_to_mode_reg (Pmode, op0);
37096 op1 = fixup_modeless_constant (op1, mode1);
37098 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37100 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37101 op1 = copy_to_mode_reg (mode1, op1);
37103 else
37105 op1 = copy_to_reg (op1);
37106 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37109 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37110 op2 = copy_to_mode_reg (mode2, op2);
37112 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37113 op3 = copy_to_mode_reg (mode3, op3);
37115 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37117 error ("the last argument must be scale 1, 2, 4, 8");
37118 return const0_rtx;
37121 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37122 if (! pat)
37123 return const0_rtx;
37125 emit_insn (pat);
37126 return 0;
37128 vec_prefetch_gen:
37129 arg0 = CALL_EXPR_ARG (exp, 0);
37130 arg1 = CALL_EXPR_ARG (exp, 1);
37131 arg2 = CALL_EXPR_ARG (exp, 2);
37132 arg3 = CALL_EXPR_ARG (exp, 3);
37133 arg4 = CALL_EXPR_ARG (exp, 4);
37134 op0 = expand_normal (arg0);
37135 op1 = expand_normal (arg1);
37136 op2 = expand_normal (arg2);
37137 op3 = expand_normal (arg3);
37138 op4 = expand_normal (arg4);
37139 mode0 = insn_data[icode].operand[0].mode;
37140 mode1 = insn_data[icode].operand[1].mode;
37141 mode3 = insn_data[icode].operand[3].mode;
37142 mode4 = insn_data[icode].operand[4].mode;
37144 op0 = fixup_modeless_constant (op0, mode0);
37146 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37148 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37149 op0 = copy_to_mode_reg (mode0, op0);
37151 else
37153 op0 = copy_to_reg (op0);
37154 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37157 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37158 op1 = copy_to_mode_reg (mode1, op1);
37160 /* Force the memory operand to use only a base register here; we
37161 don't want to do this to the memory operands of other builtin
37162 functions. */
37163 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37165 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37166 op2 = copy_to_mode_reg (Pmode, op2);
37168 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37170 error ("the forth argument must be scale 1, 2, 4, 8");
37171 return const0_rtx;
37174 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37176 error ("incorrect hint operand");
37177 return const0_rtx;
37180 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37181 if (! pat)
37182 return const0_rtx;
37184 emit_insn (pat);
37186 return 0;
37188 case IX86_BUILTIN_XABORT:
37189 icode = CODE_FOR_xabort;
37190 arg0 = CALL_EXPR_ARG (exp, 0);
37191 op0 = expand_normal (arg0);
37192 mode0 = insn_data[icode].operand[0].mode;
37193 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37195 error ("the xabort's argument must be an 8-bit immediate");
37196 return const0_rtx;
37198 emit_insn (gen_xabort (op0));
37199 return 0;
37201 case IX86_BUILTIN_RSTORSSP:
37202 case IX86_BUILTIN_CLRSSBSY:
37203 arg0 = CALL_EXPR_ARG (exp, 0);
37204 op0 = expand_normal (arg0);
37205 icode = (fcode == IX86_BUILTIN_RSTORSSP
37206 ? CODE_FOR_rstorssp
37207 : CODE_FOR_clrssbsy);
37208 if (!address_operand (op0, VOIDmode))
37210 op1 = convert_memory_address (Pmode, op0);
37211 op0 = copy_addr_to_reg (op1);
37213 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
37214 return 0;
37216 case IX86_BUILTIN_WRSSD:
37217 case IX86_BUILTIN_WRSSQ:
37218 case IX86_BUILTIN_WRUSSD:
37219 case IX86_BUILTIN_WRUSSQ:
37220 arg0 = CALL_EXPR_ARG (exp, 0);
37221 op0 = expand_normal (arg0);
37222 arg1 = CALL_EXPR_ARG (exp, 1);
37223 op1 = expand_normal (arg1);
37224 switch (fcode)
37226 case IX86_BUILTIN_WRSSD:
37227 icode = CODE_FOR_wrsssi;
37228 mode = SImode;
37229 break;
37230 case IX86_BUILTIN_WRSSQ:
37231 icode = CODE_FOR_wrssdi;
37232 mode = DImode;
37233 break;
37234 case IX86_BUILTIN_WRUSSD:
37235 icode = CODE_FOR_wrusssi;
37236 mode = SImode;
37237 break;
37238 case IX86_BUILTIN_WRUSSQ:
37239 icode = CODE_FOR_wrussdi;
37240 mode = DImode;
37241 break;
37243 op0 = force_reg (mode, op0);
37244 if (!address_operand (op1, VOIDmode))
37246 op2 = convert_memory_address (Pmode, op1);
37247 op1 = copy_addr_to_reg (op2);
37249 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
37250 return 0;
37252 default:
37253 break;
37256 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
37257 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
37259 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
37260 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
37261 target);
37264 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
37265 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
37267 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
37268 switch (fcode)
37270 case IX86_BUILTIN_FABSQ:
37271 case IX86_BUILTIN_COPYSIGNQ:
37272 if (!TARGET_SSE)
37273 /* Emit a normal call if SSE isn't available. */
37274 return expand_call (exp, target, ignore);
37275 /* FALLTHRU */
37276 default:
37277 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
37281 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
37282 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
37284 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
37285 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
37286 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
37287 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
37288 int masked = 1;
37289 machine_mode mode, wide_mode, nar_mode;
37291 nar_mode = V4SFmode;
37292 mode = V16SFmode;
37293 wide_mode = V64SFmode;
37294 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
37295 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
37297 switch (fcode)
37299 case IX86_BUILTIN_4FMAPS:
37300 fcn = gen_avx5124fmaddps_4fmaddps;
37301 masked = 0;
37302 goto v4fma_expand;
37304 case IX86_BUILTIN_4DPWSSD:
37305 nar_mode = V4SImode;
37306 mode = V16SImode;
37307 wide_mode = V64SImode;
37308 fcn = gen_avx5124vnniw_vp4dpwssd;
37309 masked = 0;
37310 goto v4fma_expand;
37312 case IX86_BUILTIN_4DPWSSDS:
37313 nar_mode = V4SImode;
37314 mode = V16SImode;
37315 wide_mode = V64SImode;
37316 fcn = gen_avx5124vnniw_vp4dpwssds;
37317 masked = 0;
37318 goto v4fma_expand;
37320 case IX86_BUILTIN_4FNMAPS:
37321 fcn = gen_avx5124fmaddps_4fnmaddps;
37322 masked = 0;
37323 goto v4fma_expand;
37325 case IX86_BUILTIN_4FNMAPS_MASK:
37326 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
37327 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
37328 goto v4fma_expand;
37330 case IX86_BUILTIN_4DPWSSD_MASK:
37331 nar_mode = V4SImode;
37332 mode = V16SImode;
37333 wide_mode = V64SImode;
37334 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
37335 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
37336 goto v4fma_expand;
37338 case IX86_BUILTIN_4DPWSSDS_MASK:
37339 nar_mode = V4SImode;
37340 mode = V16SImode;
37341 wide_mode = V64SImode;
37342 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
37343 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
37344 goto v4fma_expand;
37346 case IX86_BUILTIN_4FMAPS_MASK:
37348 tree args[4];
37349 rtx ops[4];
37350 rtx wide_reg;
37351 rtx accum;
37352 rtx addr;
37353 rtx mem;
37355 v4fma_expand:
37356 wide_reg = gen_reg_rtx (wide_mode);
37357 for (i = 0; i < 4; i++)
37359 args[i] = CALL_EXPR_ARG (exp, i);
37360 ops[i] = expand_normal (args[i]);
37362 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
37363 ops[i]);
37366 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37367 accum = force_reg (mode, accum);
37369 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37370 addr = force_reg (Pmode, addr);
37372 mem = gen_rtx_MEM (nar_mode, addr);
37374 target = gen_reg_rtx (mode);
37376 emit_move_insn (target, accum);
37378 if (! masked)
37379 emit_insn (fcn (target, accum, wide_reg, mem));
37380 else
37382 rtx merge, mask;
37383 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37385 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37387 if (CONST_INT_P (mask))
37388 mask = fixup_modeless_constant (mask, HImode);
37390 mask = force_reg (HImode, mask);
37392 if (GET_MODE (mask) != HImode)
37393 mask = gen_rtx_SUBREG (HImode, mask, 0);
37395 /* If merge is 0 then we're about to emit z-masked variant. */
37396 if (const0_operand (merge, mode))
37397 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37398 /* If merge is the same as accum then emit merge-masked variant. */
37399 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37401 merge = force_reg (mode, merge);
37402 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37404 /* Merging with an unknown value can happen if we z-mask with -O0. */
37405 else
37407 target = gen_reg_rtx (mode);
37408 emit_move_insn (target, merge);
37409 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37412 return target;
37415 case IX86_BUILTIN_4FNMASS:
37416 fcn = gen_avx5124fmaddps_4fnmaddss;
37417 masked = 0;
37418 goto s4fma_expand;
37420 case IX86_BUILTIN_4FMASS:
37421 fcn = gen_avx5124fmaddps_4fmaddss;
37422 masked = 0;
37423 goto s4fma_expand;
37425 case IX86_BUILTIN_4FNMASS_MASK:
37426 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
37427 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
37428 goto s4fma_expand;
37430 case IX86_BUILTIN_4FMASS_MASK:
37432 tree args[4];
37433 rtx ops[4];
37434 rtx wide_reg;
37435 rtx accum;
37436 rtx addr;
37437 rtx mem;
37439 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
37440 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
37442 s4fma_expand:
37443 mode = V4SFmode;
37444 wide_reg = gen_reg_rtx (V64SFmode);
37445 for (i = 0; i < 4; i++)
37447 rtx tmp;
37448 args[i] = CALL_EXPR_ARG (exp, i);
37449 ops[i] = expand_normal (args[i]);
37451 tmp = gen_reg_rtx (SFmode);
37452 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
37454 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
37455 gen_rtx_SUBREG (V16SFmode, tmp, 0));
37458 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37459 accum = force_reg (V4SFmode, accum);
37461 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37462 addr = force_reg (Pmode, addr);
37464 mem = gen_rtx_MEM (V4SFmode, addr);
37466 target = gen_reg_rtx (V4SFmode);
37468 emit_move_insn (target, accum);
37470 if (! masked)
37471 emit_insn (fcn (target, accum, wide_reg, mem));
37472 else
37474 rtx merge, mask;
37475 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37477 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37479 if (CONST_INT_P (mask))
37480 mask = fixup_modeless_constant (mask, QImode);
37482 mask = force_reg (QImode, mask);
37484 if (GET_MODE (mask) != QImode)
37485 mask = gen_rtx_SUBREG (QImode, mask, 0);
37487 /* If merge is 0 then we're about to emit z-masked variant. */
37488 if (const0_operand (merge, mode))
37489 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37490 /* If merge is the same as accum then emit merge-masked
37491 variant. */
37492 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37494 merge = force_reg (mode, merge);
37495 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37497 /* Merging with an unknown value can happen if we z-mask
37498 with -O0. */
37499 else
37501 target = gen_reg_rtx (mode);
37502 emit_move_insn (target, merge);
37503 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37506 return target;
37508 case IX86_BUILTIN_RDPID:
37509 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
37510 target);
37511 default:
37512 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
37516 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37517 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37519 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37520 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37523 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37524 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37526 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37527 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37530 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37531 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37533 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37534 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37537 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37538 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37540 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37541 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37544 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37545 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37547 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37548 const struct builtin_description *d = bdesc_multi_arg + i;
37549 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37550 (enum ix86_builtin_func_type)
37551 d->flag, d->comparison);
37554 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
37555 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
37557 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
37558 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
37559 target);
37562 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
37563 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
37565 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
37566 return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
37567 target);
37570 gcc_unreachable ();
37573 /* This returns the target-specific builtin with code CODE if
37574 current_function_decl has visibility on this builtin, which is checked
37575 using isa flags. Returns NULL_TREE otherwise. */
37577 static tree ix86_get_builtin (enum ix86_builtins code)
37579 struct cl_target_option *opts;
37580 tree target_tree = NULL_TREE;
37582 /* Determine the isa flags of current_function_decl. */
37584 if (current_function_decl)
37585 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37587 if (target_tree == NULL)
37588 target_tree = target_option_default_node;
37590 opts = TREE_TARGET_OPTION (target_tree);
37592 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37593 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
37594 return ix86_builtin_decl (code, true);
37595 else
37596 return NULL_TREE;
37599 /* Return the function decl for the target-specific builtin
37600 corresponding to the MPX builtin passed in FCODE. */
37601 static tree
37602 ix86_builtin_mpx_function (unsigned fcode)
37604 switch (fcode)
37606 case BUILT_IN_CHKP_BNDMK:
37607 return ix86_builtins[IX86_BUILTIN_BNDMK];
37609 case BUILT_IN_CHKP_BNDSTX:
37610 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37612 case BUILT_IN_CHKP_BNDLDX:
37613 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37615 case BUILT_IN_CHKP_BNDCL:
37616 return ix86_builtins[IX86_BUILTIN_BNDCL];
37618 case BUILT_IN_CHKP_BNDCU:
37619 return ix86_builtins[IX86_BUILTIN_BNDCU];
37621 case BUILT_IN_CHKP_BNDRET:
37622 return ix86_builtins[IX86_BUILTIN_BNDRET];
37624 case BUILT_IN_CHKP_INTERSECT:
37625 return ix86_builtins[IX86_BUILTIN_BNDINT];
37627 case BUILT_IN_CHKP_NARROW:
37628 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37630 case BUILT_IN_CHKP_SIZEOF:
37631 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37633 case BUILT_IN_CHKP_EXTRACT_LOWER:
37634 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37636 case BUILT_IN_CHKP_EXTRACT_UPPER:
37637 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37639 default:
37640 return NULL_TREE;
37643 gcc_unreachable ();
37646 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37648 Return an address to be used to load/store bounds for pointer
37649 passed in SLOT.
37651 SLOT_NO is an integer constant holding number of a target
37652 dependent special slot to be used in case SLOT is not a memory.
37654 SPECIAL_BASE is a pointer to be used as a base of fake address
37655 to access special slots in Bounds Table. SPECIAL_BASE[-1],
37656 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37658 static rtx
37659 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
37661 rtx addr = NULL;
37663 /* A NULL slot means we pass bounds for a pointer not passed to the
37664 function at all. A register slot means we pass the pointer in a
37665 register. In both these cases bounds are passed via the Bounds
37666 Table. Since we do not have the actual pointer stored in memory,
37667 we have to use fake addresses to access the Bounds Table. We
37668 start with (special_base - sizeof (void *)) and decrease this
37669 address by the pointer size to get addresses for the other slots. */
37670 if (!slot || REG_P (slot))
37672 gcc_assert (CONST_INT_P (slot_no));
37673 addr = plus_constant (Pmode, special_base,
37674 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
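/* For example, with 64-bit pointers (GET_MODE_SIZE (Pmode) == 8) slot 0
   maps to SPECIAL_BASE - 8, slot 1 to SPECIAL_BASE - 16, and so on.  */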
37676 /* If the pointer is passed in memory then its address is used to
37677 access the Bounds Table. */
37678 else if (MEM_P (slot))
37680 addr = XEXP (slot, 0);
37681 if (!register_operand (addr, Pmode))
37682 addr = copy_addr_to_reg (addr);
37684 else
37685 gcc_unreachable ();
37687 return addr;
37690 /* Expand pass uses this hook to load bounds for function parameter
37691 PTR passed in SLOT in case its bounds are not passed in a register.
37693 If SLOT is a memory, then bounds are loaded as for regular pointer
37694 loaded from memory. PTR may be NULL in case SLOT is a memory.
37695 In such case value of PTR (if required) may be loaded from SLOT.
37697 If SLOT is NULL or a register then SLOT_NO is an integer constant
37698 holding number of the target dependent special slot which should be
37699 used to obtain bounds.
37701 Return loaded bounds. */
37703 static rtx
37704 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
37706 rtx reg = gen_reg_rtx (BNDmode);
37707 rtx addr;
37709 /* Get address to be used to access Bounds Table. Special slots start
37710 at the location of return address of the current function. */
37711 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
37713 /* Load pointer value from a memory if we don't have it. */
37714 if (!ptr)
37716 gcc_assert (MEM_P (slot));
37717 ptr = copy_addr_to_reg (slot);
37720 if (!register_operand (ptr, Pmode))
37721 ptr = ix86_zero_extend_to_Pmode (ptr);
37723 emit_insn (BNDmode == BND64mode
37724 ? gen_bnd64_ldx (reg, addr, ptr)
37725 : gen_bnd32_ldx (reg, addr, ptr));
37727 return reg;
37730 /* Expand pass uses this hook to store BOUNDS for call argument PTR
37731 passed in SLOT in case BOUNDS are not passed in a register.
37733 If SLOT is a memory, then BOUNDS are stored as for regular pointer
37734 stored in memory. PTR may be NULL in case SLOT is a memory.
37735 In such case value of PTR (if required) may be loaded from SLOT.
37737 If SLOT is NULL or a register then SLOT_NO is an integer constant
37738 holding number of the target dependent special slot which should be
37739 used to store BOUNDS. */
37741 static void
37742 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
37744 rtx addr;
37746 /* Get address to be used to access Bounds Table. Special slots start
37747 at the location of return address of a called function. */
37748 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
37750 /* Load pointer value from a memory if we don't have it. */
37751 if (!ptr)
37753 gcc_assert (MEM_P (slot));
37754 ptr = copy_addr_to_reg (slot);
37757 if (!register_operand (ptr, Pmode))
37758 ptr = ix86_zero_extend_to_Pmode (ptr);
37760 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
37761 if (!register_operand (bounds, BNDmode))
37762 bounds = copy_to_mode_reg (BNDmode, bounds);
37764 emit_insn (BNDmode == BND64mode
37765 ? gen_bnd64_stx (addr, ptr, bounds)
37766 : gen_bnd32_stx (addr, ptr, bounds));
37769 /* Load and return bounds returned by function in SLOT. */
37771 static rtx
37772 ix86_load_returned_bounds (rtx slot)
37774 rtx res;
37776 gcc_assert (REG_P (slot));
37777 res = gen_reg_rtx (BNDmode);
37778 emit_move_insn (res, slot);
37780 return res;
37783 /* Store BOUNDS returned by function into SLOT. */
37785 static void
37786 ix86_store_returned_bounds (rtx slot, rtx bounds)
37788 gcc_assert (REG_P (slot));
37789 emit_move_insn (slot, bounds);
37792 /* Returns a function decl for a vectorized version of the combined function
37793 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
37794 if it is not available. */
37796 static tree
37797 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
37798 tree type_in)
37800 machine_mode in_mode, out_mode;
37801 int in_n, out_n;
37803 if (TREE_CODE (type_out) != VECTOR_TYPE
37804 || TREE_CODE (type_in) != VECTOR_TYPE)
37805 return NULL_TREE;
37807 out_mode = TYPE_MODE (TREE_TYPE (type_out));
37808 out_n = TYPE_VECTOR_SUBPARTS (type_out);
37809 in_mode = TYPE_MODE (TREE_TYPE (type_in));
37810 in_n = TYPE_VECTOR_SUBPARTS (type_in);
37812 switch (fn)
37814 CASE_CFN_EXP2:
37815 if (out_mode == SFmode && in_mode == SFmode)
37817 if (out_n == 16 && in_n == 16)
37818 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
37820 break;
37822 CASE_CFN_IFLOOR:
37823 CASE_CFN_LFLOOR:
37824 CASE_CFN_LLFLOOR:
37825 /* The round insn does not trap on denormals. */
37826 if (flag_trapping_math || !TARGET_SSE4_1)
37827 break;
37829 if (out_mode == SImode && in_mode == DFmode)
37831 if (out_n == 4 && in_n == 2)
37832 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
37833 else if (out_n == 8 && in_n == 4)
37834 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
37835 else if (out_n == 16 && in_n == 8)
37836 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
37838 if (out_mode == SImode && in_mode == SFmode)
37840 if (out_n == 4 && in_n == 4)
37841 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
37842 else if (out_n == 8 && in_n == 8)
37843 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
37844 else if (out_n == 16 && in_n == 16)
37845 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
37847 break;
37849 CASE_CFN_ICEIL:
37850 CASE_CFN_LCEIL:
37851 CASE_CFN_LLCEIL:
37852 /* The round insn does not trap on denormals. */
37853 if (flag_trapping_math || !TARGET_SSE4_1)
37854 break;
37856 if (out_mode == SImode && in_mode == DFmode)
37858 if (out_n == 4 && in_n == 2)
37859 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
37860 else if (out_n == 8 && in_n == 4)
37861 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
37862 else if (out_n == 16 && in_n == 8)
37863 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
37865 if (out_mode == SImode && in_mode == SFmode)
37867 if (out_n == 4 && in_n == 4)
37868 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
37869 else if (out_n == 8 && in_n == 8)
37870 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
37871 else if (out_n == 16 && in_n == 16)
37872 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
37874 break;
37876 CASE_CFN_IRINT:
37877 CASE_CFN_LRINT:
37878 CASE_CFN_LLRINT:
37879 if (out_mode == SImode && in_mode == DFmode)
37881 if (out_n == 4 && in_n == 2)
37882 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
37883 else if (out_n == 8 && in_n == 4)
37884 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
37885 else if (out_n == 16 && in_n == 8)
37886 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
37888 if (out_mode == SImode && in_mode == SFmode)
37890 if (out_n == 4 && in_n == 4)
37891 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
37892 else if (out_n == 8 && in_n == 8)
37893 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
37894 else if (out_n == 16 && in_n == 16)
37895 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
37897 break;
37899 CASE_CFN_IROUND:
37900 CASE_CFN_LROUND:
37901 CASE_CFN_LLROUND:
37902 /* The round insn does not trap on denormals. */
37903 if (flag_trapping_math || !TARGET_SSE4_1)
37904 break;
37906 if (out_mode == SImode && in_mode == DFmode)
37908 if (out_n == 4 && in_n == 2)
37909 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
37910 else if (out_n == 8 && in_n == 4)
37911 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
37912 else if (out_n == 16 && in_n == 8)
37913 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
37915 if (out_mode == SImode && in_mode == SFmode)
37917 if (out_n == 4 && in_n == 4)
37918 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
37919 else if (out_n == 8 && in_n == 8)
37920 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
37921 else if (out_n == 16 && in_n == 16)
37922 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
37924 break;
37926 CASE_CFN_FLOOR:
37927 /* The round insn does not trap on denormals. */
37928 if (flag_trapping_math || !TARGET_SSE4_1)
37929 break;
37931 if (out_mode == DFmode && in_mode == DFmode)
37933 if (out_n == 2 && in_n == 2)
37934 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
37935 else if (out_n == 4 && in_n == 4)
37936 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
37937 else if (out_n == 8 && in_n == 8)
37938 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
37940 if (out_mode == SFmode && in_mode == SFmode)
37942 if (out_n == 4 && in_n == 4)
37943 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
37944 else if (out_n == 8 && in_n == 8)
37945 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
37946 else if (out_n == 16 && in_n == 16)
37947 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
37949 break;
37951 CASE_CFN_CEIL:
37952 /* The round insn does not trap on denormals. */
37953 if (flag_trapping_math || !TARGET_SSE4_1)
37954 break;
37956 if (out_mode == DFmode && in_mode == DFmode)
37958 if (out_n == 2 && in_n == 2)
37959 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
37960 else if (out_n == 4 && in_n == 4)
37961 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
37962 else if (out_n == 8 && in_n == 8)
37963 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
37965 if (out_mode == SFmode && in_mode == SFmode)
37967 if (out_n == 4 && in_n == 4)
37968 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
37969 else if (out_n == 8 && in_n == 8)
37970 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
37971 else if (out_n == 16 && in_n == 16)
37972 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
37974 break;
37976 CASE_CFN_TRUNC:
37977 /* The round insn does not trap on denormals. */
37978 if (flag_trapping_math || !TARGET_SSE4_1)
37979 break;
37981 if (out_mode == DFmode && in_mode == DFmode)
37983 if (out_n == 2 && in_n == 2)
37984 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
37985 else if (out_n == 4 && in_n == 4)
37986 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
37987 else if (out_n == 8 && in_n == 8)
37988 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
37990 if (out_mode == SFmode && in_mode == SFmode)
37992 if (out_n == 4 && in_n == 4)
37993 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
37994 else if (out_n == 8 && in_n == 8)
37995 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
37996 else if (out_n == 16 && in_n == 16)
37997 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
37999 break;
38001 CASE_CFN_RINT:
38002 /* The round insn does not trap on denormals. */
38003 if (flag_trapping_math || !TARGET_SSE4_1)
38004 break;
38006 if (out_mode == DFmode && in_mode == DFmode)
38008 if (out_n == 2 && in_n == 2)
38009 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38010 else if (out_n == 4 && in_n == 4)
38011 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38013 if (out_mode == SFmode && in_mode == SFmode)
38015 if (out_n == 4 && in_n == 4)
38016 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38017 else if (out_n == 8 && in_n == 8)
38018 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38020 break;
38022 CASE_CFN_FMA:
38023 if (out_mode == DFmode && in_mode == DFmode)
38025 if (out_n == 2 && in_n == 2)
38026 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38027 if (out_n == 4 && in_n == 4)
38028 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38030 if (out_mode == SFmode && in_mode == SFmode)
38032 if (out_n == 4 && in_n == 4)
38033 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38034 if (out_n == 8 && in_n == 8)
38035 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38037 break;
38039 default:
38040 break;
38043 /* Dispatch to a handler for a vectorization library. */
38044 if (ix86_veclib_handler)
38045 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38047 return NULL_TREE;
38050 /* Handler for an SVML-style interface to
38051 a library with vectorized intrinsics. */
38053 static tree
38054 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38056 char name[20];
38057 tree fntype, new_fndecl, args;
38058 unsigned arity;
38059 const char *bname;
38060 machine_mode el_mode, in_mode;
38061 int n, in_n;
38063 /* The SVML is suitable for unsafe math only. */
38064 if (!flag_unsafe_math_optimizations)
38065 return NULL_TREE;
38067 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38068 n = TYPE_VECTOR_SUBPARTS (type_out);
38069 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38070 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38071 if (el_mode != in_mode
38072 || n != in_n)
38073 return NULL_TREE;
38075 switch (fn)
38077 CASE_CFN_EXP:
38078 CASE_CFN_LOG:
38079 CASE_CFN_LOG10:
38080 CASE_CFN_POW:
38081 CASE_CFN_TANH:
38082 CASE_CFN_TAN:
38083 CASE_CFN_ATAN:
38084 CASE_CFN_ATAN2:
38085 CASE_CFN_ATANH:
38086 CASE_CFN_CBRT:
38087 CASE_CFN_SINH:
38088 CASE_CFN_SIN:
38089 CASE_CFN_ASINH:
38090 CASE_CFN_ASIN:
38091 CASE_CFN_COSH:
38092 CASE_CFN_COS:
38093 CASE_CFN_ACOSH:
38094 CASE_CFN_ACOS:
38095 if ((el_mode != DFmode || n != 2)
38096 && (el_mode != SFmode || n != 4))
38097 return NULL_TREE;
38098 break;
38100 default:
38101 return NULL_TREE;
38104 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38105 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38107 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38108 strcpy (name, "vmlsLn4");
38109 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38110 strcpy (name, "vmldLn2");
38111 else if (n == 4)
38113 sprintf (name, "vmls%s", bname+10);
38114 name[strlen (name)-1] = '4';
38116 else
38117 sprintf (name, "vmld%s2", bname+10);
38119 /* Convert to uppercase. */
38120 name[4] &= ~0x20;
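/* For example (illustrative, assuming the standard "__builtin_" prefix in
   BNAME): a 4-wide SFmode sin starts from "__builtin_sinf", so bname+10 is
   "sinf"; the sprintf above gives "vmlssinf", the trailing character is
   replaced by '4' to form "vmlssin4", and uppercasing name[4] yields
   "vmlsSin4". */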
38122 arity = 0;
38123 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38124 arity++;
38126 if (arity == 1)
38127 fntype = build_function_type_list (type_out, type_in, NULL);
38128 else
38129 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38131 /* Build a function declaration for the vectorized function. */
38132 new_fndecl = build_decl (BUILTINS_LOCATION,
38133 FUNCTION_DECL, get_identifier (name), fntype);
38134 TREE_PUBLIC (new_fndecl) = 1;
38135 DECL_EXTERNAL (new_fndecl) = 1;
38136 DECL_IS_NOVOPS (new_fndecl) = 1;
38137 TREE_READONLY (new_fndecl) = 1;
38139 return new_fndecl;
38142 /* Handler for an ACML-style interface to
38143 a library with vectorized intrinsics. */
38145 static tree
38146 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38148 char name[20] = "__vr.._";
38149 tree fntype, new_fndecl, args;
38150 unsigned arity;
38151 const char *bname;
38152 machine_mode el_mode, in_mode;
38153 int n, in_n;
38155 /* The ACML library is 64-bit only and suitable for unsafe math only, as
38156 it does not correctly support parts of IEEE arithmetic with the required
38157 precision, such as denormals. */
38158 if (!TARGET_64BIT
38159 || !flag_unsafe_math_optimizations)
38160 return NULL_TREE;
38162 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38163 n = TYPE_VECTOR_SUBPARTS (type_out);
38164 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38165 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38166 if (el_mode != in_mode
38167 || n != in_n)
38168 return NULL_TREE;
38170 switch (fn)
38172 CASE_CFN_SIN:
38173 CASE_CFN_COS:
38174 CASE_CFN_EXP:
38175 CASE_CFN_LOG:
38176 CASE_CFN_LOG2:
38177 CASE_CFN_LOG10:
38178 if (el_mode == DFmode && n == 2)
38180 name[4] = 'd';
38181 name[5] = '2';
38183 else if (el_mode == SFmode && n == 4)
38185 name[4] = 's';
38186 name[5] = '4';
38188 else
38189 return NULL_TREE;
38190 break;
38192 default:
38193 return NULL_TREE;
38196 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38197 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38198 sprintf (name + 7, "%s", bname+10);
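/* For example (illustrative): a 2-wide DFmode sin fills the "__vr.._"
   template to "__vrd2_" and appends bname+10 ("sin"), giving "__vrd2_sin";
   the 4-wide SFmode variant similarly becomes "__vrs4_sinf". */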
38200 arity = 0;
38201 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38202 arity++;
38204 if (arity == 1)
38205 fntype = build_function_type_list (type_out, type_in, NULL);
38206 else
38207 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38209 /* Build a function declaration for the vectorized function. */
38210 new_fndecl = build_decl (BUILTINS_LOCATION,
38211 FUNCTION_DECL, get_identifier (name), fntype);
38212 TREE_PUBLIC (new_fndecl) = 1;
38213 DECL_EXTERNAL (new_fndecl) = 1;
38214 DECL_IS_NOVOPS (new_fndecl) = 1;
38215 TREE_READONLY (new_fndecl) = 1;
38217 return new_fndecl;
38220 /* Returns a decl of a function that implements gather load with
38221 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
38222 Return NULL_TREE if it is not available. */
38224 static tree
38225 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38226 const_tree index_type, int scale)
38228 bool si;
38229 enum ix86_builtins code;
38231 if (! TARGET_AVX2)
38232 return NULL_TREE;
38234 if ((TREE_CODE (index_type) != INTEGER_TYPE
38235 && !POINTER_TYPE_P (index_type))
38236 || (TYPE_MODE (index_type) != SImode
38237 && TYPE_MODE (index_type) != DImode))
38238 return NULL_TREE;
38240 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38241 return NULL_TREE;
38243 /* The v*gather* insns sign-extend the index to pointer mode. */
38244 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38245 && TYPE_UNSIGNED (index_type))
38246 return NULL_TREE;
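/* Scale can be 1, 2, 4 or 8. */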
38248 if (scale <= 0
38249 || scale > 8
38250 || (scale & (scale - 1)) != 0)
38251 return NULL_TREE;
38253 si = TYPE_MODE (index_type) == SImode;
38254 switch (TYPE_MODE (mem_vectype))
38256 case E_V2DFmode:
38257 if (TARGET_AVX512VL)
38258 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38259 else
38260 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38261 break;
38262 case E_V4DFmode:
38263 if (TARGET_AVX512VL)
38264 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38265 else
38266 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38267 break;
38268 case E_V2DImode:
38269 if (TARGET_AVX512VL)
38270 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38271 else
38272 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38273 break;
38274 case E_V4DImode:
38275 if (TARGET_AVX512VL)
38276 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38277 else
38278 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38279 break;
38280 case E_V4SFmode:
38281 if (TARGET_AVX512VL)
38282 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38283 else
38284 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38285 break;
38286 case E_V8SFmode:
38287 if (TARGET_AVX512VL)
38288 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38289 else
38290 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38291 break;
38292 case E_V4SImode:
38293 if (TARGET_AVX512VL)
38294 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38295 else
38296 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38297 break;
38298 case E_V8SImode:
38299 if (TARGET_AVX512VL)
38300 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38301 else
38302 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38303 break;
38304 case E_V8DFmode:
38305 if (TARGET_AVX512F)
38306 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38307 else
38308 return NULL_TREE;
38309 break;
38310 case E_V8DImode:
38311 if (TARGET_AVX512F)
38312 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38313 else
38314 return NULL_TREE;
38315 break;
38316 case E_V16SFmode:
38317 if (TARGET_AVX512F)
38318 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38319 else
38320 return NULL_TREE;
38321 break;
38322 case E_V16SImode:
38323 if (TARGET_AVX512F)
38324 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38325 else
38326 return NULL_TREE;
38327 break;
38328 default:
38329 return NULL_TREE;
38332 return ix86_get_builtin (code);
38335 /* Returns a decl of a function that implements scatter store with
38336 register type VECTYPE and index type INDEX_TYPE and SCALE.
38337 Return NULL_TREE if it is not available. */
38339 static tree
38340 ix86_vectorize_builtin_scatter (const_tree vectype,
38341 const_tree index_type, int scale)
38343 bool si;
38344 enum ix86_builtins code;
38346 if (!TARGET_AVX512F)
38347 return NULL_TREE;
38349 if ((TREE_CODE (index_type) != INTEGER_TYPE
38350 && !POINTER_TYPE_P (index_type))
38351 || (TYPE_MODE (index_type) != SImode
38352 && TYPE_MODE (index_type) != DImode))
38353 return NULL_TREE;
38355 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38356 return NULL_TREE;
38358 /* The v*scatter* insns sign-extend the index to pointer mode. */
38359 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38360 && TYPE_UNSIGNED (index_type))
38361 return NULL_TREE;
38363 /* Scale can be 1, 2, 4 or 8. */
38364 if (scale <= 0
38365 || scale > 8
38366 || (scale & (scale - 1)) != 0)
38367 return NULL_TREE;
38369 si = TYPE_MODE (index_type) == SImode;
38370 switch (TYPE_MODE (vectype))
38372 case E_V8DFmode:
38373 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38374 break;
38375 case E_V8DImode:
38376 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38377 break;
38378 case E_V16SFmode:
38379 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38380 break;
38381 case E_V16SImode:
38382 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38383 break;
38384 default:
38385 return NULL_TREE;
38388 return ix86_builtins[code];
38391 /* Return true if it is safe to use the rsqrt optabs to optimize
38392 1.0/sqrt. */
38394 static bool
38395 use_rsqrt_p ()
38397 return (TARGET_SSE_MATH
38398 && flag_finite_math_only
38399 && !flag_trapping_math
38400 && flag_unsafe_math_optimizations);
38403 /* Returns a decl of a target-specific builtin that implements the
38404 reciprocal of the function, or NULL_TREE if not available. */
38406 static tree
38407 ix86_builtin_reciprocal (tree fndecl)
38409 switch (DECL_FUNCTION_CODE (fndecl))
38411 /* Vectorized version of sqrt to rsqrt conversion. */
38412 case IX86_BUILTIN_SQRTPS_NR:
38413 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38415 case IX86_BUILTIN_SQRTPS_NR256:
38416 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38418 default:
38419 return NULL_TREE;
38423 /* Helper for avx_vpermilps256_operand et al. This is also used by
38424 the expansion functions to turn the parallel back into a mask.
38425 The return value is 0 for no match and the imm8+1 for a match. */
38427 int
38428 avx_vpermilp_parallel (rtx par, machine_mode mode)
38430 unsigned i, nelt = GET_MODE_NUNITS (mode);
38431 unsigned mask = 0;
38432 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38434 if (XVECLEN (par, 0) != (int) nelt)
38435 return 0;
38437 /* Validate that all of the elements are constants, and not totally
38438 out of range. Copy the data into an integral array to make the
38439 subsequent checks easier. */
38440 for (i = 0; i < nelt; ++i)
38442 rtx er = XVECEXP (par, 0, i);
38443 unsigned HOST_WIDE_INT ei;
38445 if (!CONST_INT_P (er))
38446 return 0;
38447 ei = INTVAL (er);
38448 if (ei >= nelt)
38449 return 0;
38450 ipar[i] = ei;
38453 switch (mode)
38455 case E_V8DFmode:
38456 /* In the 512-bit DFmode case, we can only move elements within
38457 a 128-bit lane. First fill the second part of the mask,
38458 then fallthru. */
38459 for (i = 4; i < 6; ++i)
38461 if (ipar[i] < 4 || ipar[i] >= 6)
38462 return 0;
38463 mask |= (ipar[i] - 4) << i;
38465 for (i = 6; i < 8; ++i)
38467 if (ipar[i] < 6)
38468 return 0;
38469 mask |= (ipar[i] - 6) << i;
38471 /* FALLTHRU */
38473 case E_V4DFmode:
38474 /* In the 256-bit DFmode case, we can only move elements within
38475 a 128-bit lane. */
38476 for (i = 0; i < 2; ++i)
38478 if (ipar[i] >= 2)
38479 return 0;
38480 mask |= ipar[i] << i;
38482 for (i = 2; i < 4; ++i)
38484 if (ipar[i] < 2)
38485 return 0;
38486 mask |= (ipar[i] - 2) << i;
38488 break;
38490 case E_V16SFmode:
38491 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
38492 must mirror the permutation in the lower 256 bits. */
38493 for (i = 0; i < 8; ++i)
38494 if (ipar[i] + 8 != ipar[i + 8])
38495 return 0;
38496 /* FALLTHRU */
38498 case E_V8SFmode:
38499 /* In the 256-bit SFmode case, we have full freedom of
38500 movement within the low 128-bit lane, but the high 128-bit
38501 lane must mirror the exact same pattern. */
38502 for (i = 0; i < 4; ++i)
38503 if (ipar[i] + 4 != ipar[i + 4])
38504 return 0;
38505 nelt = 4;
38506 /* FALLTHRU */
38508 case E_V2DFmode:
38509 case E_V4SFmode:
38510 /* In the 128-bit case, we have full freedom in the placement of
38511 the elements from the source operand. */
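/* For instance (V4SFmode, nelt == 4): the selector {2, 1, 0, 3} packs two
   bits per element, giving mask == 0xc6 and a return value of 0xc7. */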
38512 for (i = 0; i < nelt; ++i)
38513 mask |= ipar[i] << (i * (nelt / 2));
38514 break;
38516 default:
38517 gcc_unreachable ();
38520 /* Make sure success has a non-zero value by adding one. */
38521 return mask + 1;
38524 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38525 the expansion functions to turn the parallel back into a mask.
38526 The return value is 0 for no match and the imm8+1 for a match. */
38528 int
38529 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38531 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38532 unsigned mask = 0;
38533 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38535 if (XVECLEN (par, 0) != (int) nelt)
38536 return 0;
38538 /* Validate that all of the elements are constants, and not totally
38539 out of range. Copy the data into an integral array to make the
38540 subsequent checks easier. */
38541 for (i = 0; i < nelt; ++i)
38543 rtx er = XVECEXP (par, 0, i);
38544 unsigned HOST_WIDE_INT ei;
38546 if (!CONST_INT_P (er))
38547 return 0;
38548 ei = INTVAL (er);
38549 if (ei >= 2 * nelt)
38550 return 0;
38551 ipar[i] = ei;
38554 /* Validate that the halves of the permute are halves. */
38555 for (i = 0; i < nelt2 - 1; ++i)
38556 if (ipar[i] + 1 != ipar[i + 1])
38557 return 0;
38558 for (i = nelt2; i < nelt - 1; ++i)
38559 if (ipar[i] + 1 != ipar[i + 1])
38560 return 0;
38562 /* Reconstruct the mask. */
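/* For example, a V8SFmode selector {4, 5, 6, 7, 0, 1, 2, 3} picks lane 1
   for the low half and lane 0 for the high half, so mask == 0x01 and the
   function returns 0x02. */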
38563 for (i = 0; i < 2; ++i)
38565 unsigned e = ipar[i * nelt2];
38566 if (e % nelt2)
38567 return 0;
38568 e /= nelt2;
38569 mask |= e << (i * 4);
38572 /* Make sure success has a non-zero value by adding one. */
38573 return mask + 1;
38576 /* Return a register priority for hard reg REGNO. */
38577 static int
38578 ix86_register_priority (int hard_regno)
38580 /* ebp and r13 as the base always want a displacement, and r12 as the
38581 base always wants an index. So discourage their usage in an
38582 address. */
38583 if (hard_regno == R12_REG || hard_regno == R13_REG)
38584 return 0;
38585 if (hard_regno == BP_REG)
38586 return 1;
38587 /* New x86-64 int registers result in bigger code size. Discourage
38588 them. */
38589 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38590 return 2;
38591 /* New x86-64 SSE registers result in bigger code size. Discourage
38592 them. */
38593 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38594 return 2;
38595 /* Usage of AX register results in smaller code. Prefer it. */
38596 if (hard_regno == AX_REG)
38597 return 4;
38598 return 3;
38601 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38603 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38604 QImode must go into class Q_REGS.
38605 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38606 movdf to do mem-to-mem moves through integer regs. */
38608 static reg_class_t
38609 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38611 machine_mode mode = GET_MODE (x);
38613 /* We're only allowed to return a subclass of CLASS. Many of the
38614 following checks fail for NO_REGS, so eliminate that early. */
38615 if (regclass == NO_REGS)
38616 return NO_REGS;
38618 /* All classes can load zeros. */
38619 if (x == CONST0_RTX (mode))
38620 return regclass;
38622 /* Force constants into memory if we are loading a (nonzero) constant into
38623 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38624 instructions to load from a constant. */
38625 if (CONSTANT_P (x)
38626 && (MAYBE_MMX_CLASS_P (regclass)
38627 || MAYBE_SSE_CLASS_P (regclass)
38628 || MAYBE_MASK_CLASS_P (regclass)))
38629 return NO_REGS;
38631 /* Floating-point constants need more complex checks. */
38632 if (CONST_DOUBLE_P (x))
38634 /* General regs can load everything. */
38635 if (INTEGER_CLASS_P (regclass))
38636 return regclass;
38638 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38639 zero above. We only want to wind up preferring 80387 registers if
38640 we plan on doing computation with them. */
38641 if (IS_STACK_MODE (mode)
38642 && standard_80387_constant_p (x) > 0)
38644 /* Limit class to FP regs. */
38645 if (FLOAT_CLASS_P (regclass))
38646 return FLOAT_REGS;
38647 else if (regclass == FP_TOP_SSE_REGS)
38648 return FP_TOP_REG;
38649 else if (regclass == FP_SECOND_SSE_REGS)
38650 return FP_SECOND_REG;
38653 return NO_REGS;
38656 /* Prefer SSE regs only, if we can use them for math. */
38657 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38658 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
38660 /* Generally when we see PLUS here, it's the function invariant
38661 (plus soft-fp const_int), which can only be computed into general
38662 regs. */
38663 if (GET_CODE (x) == PLUS)
38664 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
38666 /* QImode constants are easy to load, but non-constant QImode data
38667 must go into Q_REGS. */
38668 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
38670 if (Q_CLASS_P (regclass))
38671 return regclass;
38672 else if (reg_class_subset_p (Q_REGS, regclass))
38673 return Q_REGS;
38674 else
38675 return NO_REGS;
38678 return regclass;
38681 /* Discourage putting floating-point values in SSE registers unless
38682 SSE math is being used, and likewise for the 387 registers. */
38683 static reg_class_t
38684 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
38686 machine_mode mode = GET_MODE (x);
38688 /* Restrict the output reload class to the register bank that we are doing
38689 math on. If we would rather not return a subset of CLASS, reject this
38690 alternative: if reload cannot do this, it will still use its choice. */
38691 mode = GET_MODE (x);
38692 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38693 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
38695 if (IS_STACK_MODE (mode))
38697 if (regclass == FP_TOP_SSE_REGS)
38698 return FP_TOP_REG;
38699 else if (regclass == FP_SECOND_SSE_REGS)
38700 return FP_SECOND_REG;
38701 else
38702 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
38705 return regclass;
38708 static reg_class_t
38709 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
38710 machine_mode mode, secondary_reload_info *sri)
38712 /* Double-word spills from general registers to non-offsettable memory
38713 references (zero-extended addresses) require special handling. */
38714 if (TARGET_64BIT
38715 && MEM_P (x)
38716 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
38717 && INTEGER_CLASS_P (rclass)
38718 && !offsettable_memref_p (x))
38720 sri->icode = (in_p
38721 ? CODE_FOR_reload_noff_load
38722 : CODE_FOR_reload_noff_store);
38723 /* Add the cost of moving address to a temporary. */
38724 sri->extra_cost = 1;
38726 return NO_REGS;
38729 /* QImode spills from non-QI registers require an
38730 intermediate register on 32-bit targets. */
38731 if (mode == QImode
38732 && ((!TARGET_64BIT && !in_p
38733 && INTEGER_CLASS_P (rclass)
38734 && MAYBE_NON_Q_CLASS_P (rclass))
38735 || (!TARGET_AVX512DQ
38736 && MAYBE_MASK_CLASS_P (rclass))))
38738 int regno = true_regnum (x);
38740 /* Return Q_REGS if the operand is in memory. */
38741 if (regno == -1)
38742 return Q_REGS;
38744 return NO_REGS;
38747 /* This condition handles the corner case where an expression involving
38748 pointers gets vectorized. We're trying to use the address of a
38749 stack slot as a vector initializer.
38751 (set (reg:V2DI 74 [ vect_cst_.2 ])
38752 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
38754 Eventually frame gets turned into sp+offset like this:
38756 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38757 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38758 (const_int 392 [0x188]))))
38760 That later gets turned into:
38762 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38763 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38764 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
38766 We'll have the following reload recorded:
38768 Reload 0: reload_in (DI) =
38769 (plus:DI (reg/f:DI 7 sp)
38770 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
38771 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38772 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
38773 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
38774 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38775 reload_reg_rtx: (reg:V2DI 22 xmm1)
38777 This isn't going to work since SSE instructions can't handle scalar
38778 additions. Returning GENERAL_REGS forces the addition into an integer
38779 register, and reload can handle subsequent reloads without problems. */
38781 if (in_p && GET_CODE (x) == PLUS
38782 && SSE_CLASS_P (rclass)
38783 && SCALAR_INT_MODE_P (mode))
38784 return GENERAL_REGS;
38786 return NO_REGS;
38789 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
38791 static bool
38792 ix86_class_likely_spilled_p (reg_class_t rclass)
38794 switch (rclass)
38796 case AREG:
38797 case DREG:
38798 case CREG:
38799 case BREG:
38800 case AD_REGS:
38801 case SIREG:
38802 case DIREG:
38803 case SSE_FIRST_REG:
38804 case FP_TOP_REG:
38805 case FP_SECOND_REG:
38806 case BND_REGS:
38807 return true;
38809 default:
38810 break;
38813 return false;
38816 /* If we are copying between registers from different register sets
38817 (e.g. FP and integer), we may need a memory location.
38819 The function can't work reliably when one of the CLASSES is a class
38820 containing registers from multiple sets. We avoid this by never combining
38821 different sets in a single alternative in the machine description.
38822 Ensure that this constraint holds to avoid unexpected surprises.
38824 When STRICT is false, we are being called from REGISTER_MOVE_COST,
38825 so do not enforce these sanity checks.
38827 To optimize register_move_cost performance, define inline variant. */
38829 static inline bool
38830 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38831 reg_class_t class2, int strict)
38833 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
38834 return false;
38836 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
38837 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
38838 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
38839 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
38840 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
38841 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
38842 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
38843 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
38845 gcc_assert (!strict || lra_in_progress);
38846 return true;
38849 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
38850 return true;
38852 /* Between mask and general, we have moves no larger than word size. */
38853 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
38854 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
38855 return true;
38857 /* ??? This is a lie. We do have moves between mmx/general and between
38858 mmx/sse2. But by saying we need secondary memory we discourage the
38859 register allocator from using the mmx registers unless needed. */
38860 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
38861 return true;
38863 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
38865 /* SSE1 doesn't have any direct moves from other classes. */
38866 if (!TARGET_SSE2)
38867 return true;
38869 /* If the target says that inter-unit moves are more expensive
38870 than moving through memory, then don't generate them. */
38871 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
38872 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
38873 return true;
38875 /* Between SSE and general, we have moves no larger than word size. */
38876 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
38877 return true;
38880 return false;
38883 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
38885 static bool
38886 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38887 reg_class_t class2)
38889 return inline_secondary_memory_needed (mode, class1, class2, true);
38892 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
38894 get_secondary_mem widens integral modes to BITS_PER_WORD.
38895 There is no need to emit a full 64-bit move on 64-bit targets
38896 for integral modes that can be moved using a 32-bit move. */
38898 static machine_mode
38899 ix86_secondary_memory_needed_mode (machine_mode mode)
38901 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
38902 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
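/* E.g. a QImode or HImode value going through secondary memory is widened
   and moved as SImode instead. */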
38903 return mode;
38906 /* Implement the TARGET_CLASS_MAX_NREGS hook.
38908 On the 80386, this is the size of MODE in words,
38909 except in the FP regs, where a single reg is always enough. */
38911 static unsigned char
38912 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
38914 if (MAYBE_INTEGER_CLASS_P (rclass))
38916 if (mode == XFmode)
38917 return (TARGET_64BIT ? 2 : 3);
38918 else if (mode == XCmode)
38919 return (TARGET_64BIT ? 4 : 6);
38920 else
38921 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
38923 else
38925 if (COMPLEX_MODE_P (mode))
38926 return 2;
38927 else
38928 return 1;
38932 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
38934 static bool
38935 ix86_can_change_mode_class (machine_mode from, machine_mode to,
38936 reg_class_t regclass)
38938 if (from == to)
38939 return true;
38941 /* x87 registers can't do subreg at all, as all values are reformatted
38942 to extended precision. */
38943 if (MAYBE_FLOAT_CLASS_P (regclass))
38944 return false;
38946 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
38948 /* Vector registers do not support QI or HImode loads. If we don't
38949 disallow a change to these modes, reload will assume it's ok to
38950 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
38951 the vec_dupv4hi pattern. */
38952 if (GET_MODE_SIZE (from) < 4)
38953 return false;
38956 return true;
38959 /* Return index of MODE in the sse load/store tables. */
38961 static inline int
38962 sse_store_index (machine_mode mode)
38964 switch (GET_MODE_SIZE (mode))
38966 case 4:
38967 return 0;
38968 case 8:
38969 return 1;
38970 case 16:
38971 return 2;
38972 case 32:
38973 return 3;
38974 case 64:
38975 return 4;
38976 default:
38977 return -1;
38981 /* Return the cost of moving data of mode M between a
38982 register and memory. A value of 2 is the default; this cost is
38983 relative to those in `REGISTER_MOVE_COST'.
38985 This function is used extensively by register_move_cost, which is used to
38986 build tables at startup, so make it inline in this case.
38987 When IN is 2, return the maximum of the in and out move costs.
38989 If moving between registers and memory is more expensive than
38990 between two registers, you should define this macro to express the
38991 relative cost.
38993 Also model the increased cost of moving QImode registers in non
38994 Q_REGS classes. */
38996 static inline int
38997 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
38998 int in)
39000 int cost;
39001 if (FLOAT_CLASS_P (regclass))
39003 int index;
39004 switch (mode)
39006 case E_SFmode:
39007 index = 0;
39008 break;
39009 case E_DFmode:
39010 index = 1;
39011 break;
39012 case E_XFmode:
39013 index = 2;
39014 break;
39015 default:
39016 return 100;
39018 if (in == 2)
39019 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39020 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39022 if (SSE_CLASS_P (regclass))
39024 int index = sse_store_index (mode);
39025 if (index == -1)
39026 return 100;
39027 if (in == 2)
39028 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39029 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39031 if (MMX_CLASS_P (regclass))
39033 int index;
39034 switch (GET_MODE_SIZE (mode))
39036 case 4:
39037 index = 0;
39038 break;
39039 case 8:
39040 index = 1;
39041 break;
39042 default:
39043 return 100;
39045 if (in == 2)
39046 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39047 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39049 switch (GET_MODE_SIZE (mode))
39051 case 1:
39052 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39054 if (!in)
39055 return ix86_cost->int_store[0];
39056 if (TARGET_PARTIAL_REG_DEPENDENCY
39057 && optimize_function_for_speed_p (cfun))
39058 cost = ix86_cost->movzbl_load;
39059 else
39060 cost = ix86_cost->int_load[0];
39061 if (in == 2)
39062 return MAX (cost, ix86_cost->int_store[0]);
39063 return cost;
39065 else
39067 if (in == 2)
39068 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39069 if (in)
39070 return ix86_cost->movzbl_load;
39071 else
39072 return ix86_cost->int_store[0] + 4;
39074 break;
39075 case 2:
39076 if (in == 2)
39077 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39078 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39079 default:
39080 /* Compute the number of 32-bit moves needed. TFmode is moved as XFmode. */
39081 if (mode == TFmode)
39082 mode = XFmode;
39083 if (in == 2)
39084 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39085 else if (in)
39086 cost = ix86_cost->int_load[2];
39087 else
39088 cost = ix86_cost->int_store[2];
39089 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
39093 static int
39094 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39095 bool in)
39097 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39101 /* Return the cost of moving data from a register in class CLASS1 to
39102 one in class CLASS2.
39104 It is not required that the cost always equal 2 when FROM is the same as TO;
39105 on some machines it is expensive to move between registers if they are not
39106 general registers. */
39108 static int
39109 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39110 reg_class_t class2_i)
39112 enum reg_class class1 = (enum reg_class) class1_i;
39113 enum reg_class class2 = (enum reg_class) class2_i;
39115 /* In case we require secondary memory, compute cost of the store followed
39116 by load. In order to avoid bad register allocation choices, we need
39117 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39119 if (inline_secondary_memory_needed (mode, class1, class2, false))
39121 int cost = 1;
39123 cost += inline_memory_move_cost (mode, class1, 2);
39124 cost += inline_memory_move_cost (mode, class2, 2);
39126 /* In the case of copying from a general purpose register, we may emit
39127 multiple stores followed by a single load, causing a memory-size-mismatch
39128 stall. Count this as an arbitrarily high cost of 20. */
39129 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
39130 && TARGET_MEMORY_MISMATCH_STALL
39131 && targetm.class_max_nregs (class1, mode)
39132 > targetm.class_max_nregs (class2, mode))
39133 cost += 20;
39135 /* In the case of FP/MMX moves, the registers actually overlap, and we
39136 have to switch modes in order to treat them differently. */
39137 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39138 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39139 cost += 20;
39141 return cost;
39144 /* Moves between SSE/MMX and integer unit are expensive. */
39145 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39146 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39148 /* ??? By keeping the returned value relatively high, we limit the number
39149 of moves between integer and MMX/SSE registers for all targets.
39150 Additionally, a high value prevents a problem with ix86_modes_tieable_p (),
39151 where integer modes in MMX/SSE registers are not tieable
39152 because of missing QImode and HImode moves to, from or between
39153 MMX/SSE registers. */
39154 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
39155 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
39157 if (MAYBE_FLOAT_CLASS_P (class1))
39158 return ix86_cost->fp_move;
39159 if (MAYBE_SSE_CLASS_P (class1))
39161 if (GET_MODE_BITSIZE (mode) <= 128)
39162 return ix86_cost->xmm_move;
39163 if (GET_MODE_BITSIZE (mode) <= 256)
39164 return ix86_cost->ymm_move;
39165 return ix86_cost->zmm_move;
39167 if (MAYBE_MMX_CLASS_P (class1))
39168 return ix86_cost->mmx_move;
39169 return 2;
39172 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
39173 words of a value of mode MODE but can be less for certain modes in
39174 special long registers.
39176 Actually there are no two-word move instructions for consecutive
39177 registers. And only registers 0-3 may have mov byte instructions
39178 applied to them. */
39180 static unsigned int
39181 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
39183 if (GENERAL_REGNO_P (regno))
39185 if (mode == XFmode)
39186 return TARGET_64BIT ? 2 : 3;
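/* I.e. XFmode needs 3 words of 4 bytes on 32-bit targets and 2 words of
   8 bytes in 64-bit mode; XCmode below is twice that. */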
39187 if (mode == XCmode)
39188 return TARGET_64BIT ? 4 : 6;
39189 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39191 if (COMPLEX_MODE_P (mode))
39192 return 2;
39193 if (mode == V64SFmode || mode == V64SImode)
39194 return 4;
39195 return 1;
39198 /* Implement TARGET_HARD_REGNO_MODE_OK. */
39200 static bool
39201 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
39203 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
39204 if (CC_REGNO_P (regno))
39205 return GET_MODE_CLASS (mode) == MODE_CC;
39206 if (GET_MODE_CLASS (mode) == MODE_CC
39207 || GET_MODE_CLASS (mode) == MODE_RANDOM
39208 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39209 return false;
39210 if (STACK_REGNO_P (regno))
39211 return VALID_FP_MODE_P (mode);
39212 if (MASK_REGNO_P (regno))
39213 return (VALID_MASK_REG_MODE (mode)
39214 || (TARGET_AVX512BW
39215 && VALID_MASK_AVX512BW_MODE (mode)));
39216 if (BND_REGNO_P (regno))
39217 return VALID_BND_REG_MODE (mode);
39218 if (SSE_REGNO_P (regno))
39220 /* We implement the move patterns for all vector modes into and
39221 out of SSE registers, even when no operation instructions
39222 are available. */
39224 /* For AVX-512 we allow, regardless of regno:
39225 - XI mode
39226 - any of 512-bit wide vector mode
39227 - any scalar mode. */
39228 if (TARGET_AVX512F
39229 && (mode == XImode
39230 || VALID_AVX512F_REG_MODE (mode)
39231 || VALID_AVX512F_SCALAR_MODE (mode)))
39232 return true;
39234 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
39235 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39236 && MOD4_SSE_REGNO_P (regno)
39237 && mode == V64SFmode)
39238 return true;
39240 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
39241 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39242 && MOD4_SSE_REGNO_P (regno)
39243 && mode == V64SImode)
39244 return true;
39246 /* TODO check for QI/HI scalars. */
39247 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
39248 if (TARGET_AVX512VL
39249 && (mode == OImode
39250 || mode == TImode
39251 || VALID_AVX256_REG_MODE (mode)
39252 || VALID_AVX512VL_128_REG_MODE (mode)))
39253 return true;
39255 /* xmm16-xmm31 are only available for AVX-512. */
39256 if (EXT_REX_SSE_REGNO_P (regno))
39257 return false;
39259 /* OImode and AVX modes are available only when AVX is enabled. */
39260 return ((TARGET_AVX
39261 && VALID_AVX256_REG_OR_OI_MODE (mode))
39262 || VALID_SSE_REG_MODE (mode)
39263 || VALID_SSE2_REG_MODE (mode)
39264 || VALID_MMX_REG_MODE (mode)
39265 || VALID_MMX_REG_MODE_3DNOW (mode));
39267 if (MMX_REGNO_P (regno))
39269 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39270 so if the register is available at all, then we can move data of
39271 the given mode into or out of it. */
39272 return (VALID_MMX_REG_MODE (mode)
39273 || VALID_MMX_REG_MODE_3DNOW (mode));
39276 if (mode == QImode)
39278 /* Take care with QImode values - they can live in non-QI regs,
39279 but then they do cause partial register stalls. */
39280 if (ANY_QI_REGNO_P (regno))
39281 return true;
39282 if (!TARGET_PARTIAL_REG_STALL)
39283 return true;
39284 /* LRA checks if the hard register is OK for the given mode.
39285 QImode values can live in non-QI regs, so we allow all
39286 registers here. */
39287 if (lra_in_progress)
39288 return true;
39289 return !can_create_pseudo_p ();
39291 /* We handle both integers and floats in the general purpose registers. */
39292 else if (VALID_INT_MODE_P (mode))
39293 return true;
39294 else if (VALID_FP_MODE_P (mode))
39295 return true;
39296 else if (VALID_DFP_MODE_P (mode))
39297 return true;
39298 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39299 on to use that value in smaller contexts, this can easily force a
39300 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39301 supporting DImode, allow it. */
39302 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39303 return true;
39305 return false;
39308 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
39309 saves SSE registers across calls is Win64 (thus no need to check the
39310 current ABI here), and with AVX enabled Win64 only guarantees that
39311 the low 16 bytes are saved. */
39313 static bool
39314 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
39316 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
39319 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39320 tieable integer mode. */
39322 static bool
39323 ix86_tieable_integer_mode_p (machine_mode mode)
39325 switch (mode)
39327 case E_HImode:
39328 case E_SImode:
39329 return true;
39331 case E_QImode:
39332 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39334 case E_DImode:
39335 return TARGET_64BIT;
39337 default:
39338 return false;
39342 /* Implement TARGET_MODES_TIEABLE_P.
39344 Return true if MODE1 is accessible in a register that can hold MODE2
39345 without copying. That is, all register classes that can hold MODE2
39346 can also hold MODE1. */
39348 static bool
39349 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39351 if (mode1 == mode2)
39352 return true;
39354 if (ix86_tieable_integer_mode_p (mode1)
39355 && ix86_tieable_integer_mode_p (mode2))
39356 return true;
39358 /* MODE2 being XFmode implies fp stack or general regs, which means we
39359 can tie any smaller floating point modes to it. Note that we do not
39360 tie this with TFmode. */
39361 if (mode2 == XFmode)
39362 return mode1 == SFmode || mode1 == DFmode;
39364 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39365 that we can tie it with SFmode. */
39366 if (mode2 == DFmode)
39367 return mode1 == SFmode;
39369 /* If MODE2 is only appropriate for an SSE register, then tie with
39370 any other mode acceptable to SSE registers. */
39371 if (GET_MODE_SIZE (mode2) == 32
39372 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39373 return (GET_MODE_SIZE (mode1) == 32
39374 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39375 if (GET_MODE_SIZE (mode2) == 16
39376 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39377 return (GET_MODE_SIZE (mode1) == 16
39378 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39380 /* If MODE2 is appropriate for an MMX register, then tie
39381 with any other mode acceptable to MMX registers. */
39382 if (GET_MODE_SIZE (mode2) == 8
39383 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39384 return (GET_MODE_SIZE (mode1) == 8
39385 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39387 return false;
39390 /* Return the cost of moving between two registers of mode MODE. */
39392 static int
39393 ix86_set_reg_reg_cost (machine_mode mode)
39395 unsigned int units = UNITS_PER_WORD;
39397 switch (GET_MODE_CLASS (mode))
39399 default:
39400 break;
39402 case MODE_CC:
39403 units = GET_MODE_SIZE (CCmode);
39404 break;
39406 case MODE_FLOAT:
39407 if ((TARGET_SSE && mode == TFmode)
39408 || (TARGET_80387 && mode == XFmode)
39409 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39410 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39411 units = GET_MODE_SIZE (mode);
39412 break;
39414 case MODE_COMPLEX_FLOAT:
39415 if ((TARGET_SSE && mode == TCmode)
39416 || (TARGET_80387 && mode == XCmode)
39417 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39418 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39419 units = GET_MODE_SIZE (mode);
39420 break;
39422 case MODE_VECTOR_INT:
39423 case MODE_VECTOR_FLOAT:
39424 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39425 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39426 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39427 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39428 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39429 units = GET_MODE_SIZE (mode);
39432 /* Return the cost of moving between two registers of mode MODE,
39433 assuming that the move will be in pieces of at most UNITS bytes. */
39434 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
39437 /* Return the cost of a vector operation in MODE given that the scalar
39438 version has cost COST. If PARALLEL is true, assume that the CPU has
39439 more than one unit performing the operation. */
39441 static int
39442 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
39444 if (!VECTOR_MODE_P (mode))
39445 return cost;
39447 if (!parallel)
39448 return cost * GET_MODE_NUNITS (mode);
39449 if (GET_MODE_BITSIZE (mode) == 128
39450 && TARGET_SSE_SPLIT_REGS)
39451 return cost * 2;
39452 if (GET_MODE_BITSIZE (mode) > 128
39453 && TARGET_AVX128_OPTIMAL)
39454 return cost * GET_MODE_BITSIZE (mode) / 128;
39455 return cost;
39458 /* Return cost of multiplication in MODE. */
39460 static int
39461 ix86_multiplication_cost (const struct processor_costs *cost,
39462 enum machine_mode mode)
39464 machine_mode inner_mode = mode;
39465 if (VECTOR_MODE_P (mode))
39466 inner_mode = GET_MODE_INNER (mode);
39468 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39469 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
39470 else if (X87_FLOAT_MODE_P (mode))
39471 return cost->fmul;
39472 else if (FLOAT_MODE_P (mode))
39473 return ix86_vec_cost (mode,
39474 inner_mode == DFmode
39475 ? cost->mulsd : cost->mulss, true);
39476 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39478 /* V*QImode is emulated with 7-13 insns. */
39479 if (mode == V16QImode || mode == V32QImode)
39481 int extra = 11;
39482 if (TARGET_XOP && mode == V16QImode)
39483 extra = 5;
39484 else if (TARGET_SSSE3)
39485 extra = 6;
39486 return ix86_vec_cost (mode,
39487 cost->mulss * 2 + cost->sse_op * extra,
39488 true);
39490 /* V*DImode is emulated with 5-8 insns. */
39491 else if (mode == V2DImode || mode == V4DImode)
39493 if (TARGET_XOP && mode == V2DImode)
39494 return ix86_vec_cost (mode,
39495 cost->mulss * 2 + cost->sse_op * 3,
39496 true);
39497 else
39498 return ix86_vec_cost (mode,
39499 cost->mulss * 3 + cost->sse_op * 5,
39500 true);
39502 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39503 insns, including two PMULUDQ. */
39504 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39505 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
39506 true);
39507 else
39508 return ix86_vec_cost (mode, cost->mulss, true);
39510 else
39511 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
39514 /* Return cost of division in MODE. */
39516 static int
39517 ix86_division_cost (const struct processor_costs *cost,
39518 enum machine_mode mode)
39520 machine_mode inner_mode = mode;
39521 if (VECTOR_MODE_P (mode))
39522 inner_mode = GET_MODE_INNER (mode);
39524 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39525 return inner_mode == DFmode ? cost->divsd : cost->divss;
39526 else if (X87_FLOAT_MODE_P (mode))
39527 return cost->fdiv;
39528 else if (FLOAT_MODE_P (mode))
39529 return ix86_vec_cost (mode,
39530 inner_mode == DFmode ? cost->divsd : cost->divss,
39531 true);
39532 else
39533 return cost->divide[MODE_INDEX (mode)];
39536 /* Return cost of shift in MODE.
39537 If CONSTANT_OP1 is true, the value of op1 is known and given in OP1_VAL.
39538 AND_IN_OP1 specifies whether op1 is the result of an AND, and
39539 SHIFT_AND_TRUNCATE whether op1 is a subreg of such an AND result.
39541 SKIP_OP0/1 is set to true if the cost of OP0/1 should be ignored. */
39543 static int
39544 ix86_shift_rotate_cost (const struct processor_costs *cost,
39545 enum machine_mode mode, bool constant_op1,
39546 HOST_WIDE_INT op1_val,
39547 bool speed,
39548 bool and_in_op1,
39549 bool shift_and_truncate,
39550 bool *skip_op0, bool *skip_op1)
39552 if (skip_op0)
39553 *skip_op0 = *skip_op1 = false;
39554 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39556 /* V*QImode is emulated with 1-11 insns. */
39557 if (mode == V16QImode || mode == V32QImode)
39559 int count = 11;
39560 if (TARGET_XOP && mode == V16QImode)
39562 /* For XOP we use vpshab, which requires a broadcast of the
39563 value to the variable shift insn. For constants this
39564 means a V16Q const in mem; even when we can perform the
39565 shift with one insn, set the cost to prefer paddb. */
39566 if (constant_op1)
39568 if (skip_op1)
39569 *skip_op1 = true;
39570 return ix86_vec_cost (mode,
39571 cost->sse_op
39572 + (speed
39573 ? 2
39574 : COSTS_N_BYTES
39575 (GET_MODE_UNIT_SIZE (mode))), true);
39577 count = 3;
39579 else if (TARGET_SSSE3)
39580 count = 7;
39581 return ix86_vec_cost (mode, cost->sse_op * count, true);
39583 else
39584 return ix86_vec_cost (mode, cost->sse_op, true);
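/* Shifts wider than a word (e.g. DImode shifts on 32-bit targets) are
   synthesized from word-sized shifts plus fix-up insns, so cost them as
   multiples of the word-sized shift below. */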
39586 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39588 if (constant_op1)
39590 if (op1_val > 32)
39591 return cost->shift_const + COSTS_N_INSNS (2);
39592 else
39593 return cost->shift_const * 2;
39595 else
39597 if (and_in_op1)
39598 return cost->shift_var * 2;
39599 else
39600 return cost->shift_var * 6 + COSTS_N_INSNS (2);
39603 else
39605 if (constant_op1)
39606 return cost->shift_const;
39607 else if (shift_and_truncate)
39609 if (skip_op0)
39610 *skip_op0 = *skip_op1 = true;
39611 /* Return the cost after shift-and truncation. */
39612 return cost->shift_var;
39614 else
39615 return cost->shift_var;
39617 return cost->shift_const;
39620 /* Compute a (partial) cost for rtx X. Return true if the complete
39621 cost has been computed, and false if subexpressions should be
39622 scanned. In either case, *TOTAL contains the cost result. */
39624 static bool
39625 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39626 int *total, bool speed)
39628 rtx mask;
39629 enum rtx_code code = GET_CODE (x);
39630 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39631 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39632 int src_cost;
39634 switch (code)
39636 case SET:
39637 if (register_operand (SET_DEST (x), VOIDmode)
39638 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39640 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39641 return true;
39644 if (register_operand (SET_SRC (x), VOIDmode))
39645 /* Avoid potentially incorrect high cost from rtx_costs
39646 for non-tieable SUBREGs. */
39647 src_cost = 0;
39648 else
39650 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
39652 if (CONSTANT_P (SET_SRC (x)))
39653 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
39654 a small value, possibly zero for cheap constants. */
39655 src_cost += COSTS_N_INSNS (1);
39658 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
39659 return true;
39661 case CONST_INT:
39662 case CONST:
39663 case LABEL_REF:
39664 case SYMBOL_REF:
39665 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39666 *total = 3;
39667 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39668 *total = 2;
39669 else if (flag_pic && SYMBOLIC_CONST (x)
39670 && !(TARGET_64BIT
39671 && (GET_CODE (x) == LABEL_REF
39672 || (GET_CODE (x) == SYMBOL_REF
39673 && SYMBOL_REF_LOCAL_P (x))))
39674 /* Use 0 cost for CONST to improve its propagation. */
39675 && (TARGET_64BIT || GET_CODE (x) != CONST))
39676 *total = 1;
39677 else
39678 *total = 0;
39679 return true;
39681 case CONST_DOUBLE:
39682 if (IS_STACK_MODE (mode))
39683 switch (standard_80387_constant_p (x))
39685 case -1:
39686 case 0:
39687 break;
39688 case 1: /* 0.0 */
39689 *total = 1;
39690 return true;
39691 default: /* Other constants */
39692 *total = 2;
39693 return true;
39695 /* FALLTHRU */
39697 case CONST_VECTOR:
39698 switch (standard_sse_constant_p (x, mode))
39700 case 0:
39701 break;
39702 case 1: /* 0: xor eliminates false dependency */
39703 *total = 0;
39704 return true;
39705 default: /* -1: cmp contains false dependency */
39706 *total = 1;
39707 return true;
39709 /* FALLTHRU */
39711 case CONST_WIDE_INT:
39712 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39713 it'll probably end up. Add a penalty for size. */
39714 *total = (COSTS_N_INSNS (1)
39715 + (!TARGET_64BIT && flag_pic)
39716 + (GET_MODE_SIZE (mode) <= 4
39717 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39718 return true;
39720 case ZERO_EXTEND:
39721 /* The zero extension is often completely free on x86_64, so make
39722 it as cheap as possible. */
39723 if (TARGET_64BIT && mode == DImode
39724 && GET_MODE (XEXP (x, 0)) == SImode)
39725 *total = 1;
39726 else if (TARGET_ZERO_EXTEND_WITH_AND)
39727 *total = cost->add;
39728 else
39729 *total = cost->movzx;
39730 return false;
39732 case SIGN_EXTEND:
39733 *total = cost->movsx;
39734 return false;
39736 case ASHIFT:
39737 if (SCALAR_INT_MODE_P (mode)
39738 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39739 && CONST_INT_P (XEXP (x, 1)))
39741 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39742 if (value == 1)
39744 *total = cost->add;
39745 return false;
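/* A left shift by 2 or 3 can be done with a single lea using a scaled
   index (base*4 or base*8), so prefer lea when it is no more expensive
   than a shift. */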
39747 if ((value == 2 || value == 3)
39748 && cost->lea <= cost->shift_const)
39750 *total = cost->lea;
39751 return false;
39754 /* FALLTHRU */
39756 case ROTATE:
39757 case ASHIFTRT:
39758 case LSHIFTRT:
39759 case ROTATERT:
39760 bool skip_op0, skip_op1;
39761 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
39762 CONST_INT_P (XEXP (x, 1))
39763 ? INTVAL (XEXP (x, 1)) : -1,
39764 speed,
39765 GET_CODE (XEXP (x, 1)) == AND,
39766 SUBREG_P (XEXP (x, 1))
39767 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
39768 &skip_op0, &skip_op1);
39769 if (skip_op0 || skip_op1)
39771 if (!skip_op0)
39772 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
39773 if (!skip_op1)
39774 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
39775 return true;
39777 return false;
39779 case FMA:
39781 rtx sub;
39783 gcc_assert (FLOAT_MODE_P (mode));
39784 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39786 *total = ix86_vec_cost (mode,
39787 mode == SFmode ? cost->fmass : cost->fmasd,
39788 true);
39789 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39791 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39792 sub = XEXP (x, 0);
39793 if (GET_CODE (sub) == NEG)
39794 sub = XEXP (sub, 0);
39795 *total += rtx_cost (sub, mode, FMA, 0, speed);
39797 sub = XEXP (x, 2);
39798 if (GET_CODE (sub) == NEG)
39799 sub = XEXP (sub, 0);
39800 *total += rtx_cost (sub, mode, FMA, 2, speed);
39801 return true;
39804 case MULT:
39805 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
39807 rtx op0 = XEXP (x, 0);
39808 rtx op1 = XEXP (x, 1);
39809 int nbits;
39810 if (CONST_INT_P (XEXP (x, 1)))
39812 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39813 for (nbits = 0; value != 0; value &= value - 1)
39814 nbits++;
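/* Clearing the lowest set bit each iteration leaves nbits equal to the
   population count of the multiplier, a rough proxy for the length of a
   shift-and-add sequence. */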
39816 else
39817 /* This is arbitrary. */
39818 nbits = 7;
39820 /* Compute costs correctly for widening multiplication. */
39821 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39822 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39823 == GET_MODE_SIZE (mode))
39825 int is_mulwiden = 0;
39826 machine_mode inner_mode = GET_MODE (op0);
39828 if (GET_CODE (op0) == GET_CODE (op1))
39829 is_mulwiden = 1, op1 = XEXP (op1, 0);
39830 else if (CONST_INT_P (op1))
39832 if (GET_CODE (op0) == SIGN_EXTEND)
39833 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39834 == INTVAL (op1);
39835 else
39836 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39839 if (is_mulwiden)
39840 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39843 *total = (cost->mult_init[MODE_INDEX (mode)]
39844 + nbits * cost->mult_bit
39845 + rtx_cost (op0, mode, outer_code, opno, speed)
39846 + rtx_cost (op1, mode, outer_code, opno, speed));
39848 return true;
39850 *total = ix86_multiplication_cost (cost, mode);
39851 return false;
39853 case DIV:
39854 case UDIV:
39855 case MOD:
39856 case UMOD:
39857 *total = ix86_division_cost (cost, mode);
39858 return false;
39860 case PLUS:
39861 if (GET_MODE_CLASS (mode) == MODE_INT
39862 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
39864 if (GET_CODE (XEXP (x, 0)) == PLUS
39865 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
39866 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
39867 && CONSTANT_P (XEXP (x, 1)))
39869 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
39870 if (val == 2 || val == 4 || val == 8)
39872 *total = cost->lea;
39873 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39874 outer_code, opno, speed);
39875 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
39876 outer_code, opno, speed);
39877 *total += rtx_cost (XEXP (x, 1), mode,
39878 outer_code, opno, speed);
39879 return true;
39882 else if (GET_CODE (XEXP (x, 0)) == MULT
39883 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
39885 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
39886 if (val == 2 || val == 4 || val == 8)
39888 *total = cost->lea;
39889 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39890 outer_code, opno, speed);
39891 *total += rtx_cost (XEXP (x, 1), mode,
39892 outer_code, opno, speed);
39893 return true;
39896 else if (GET_CODE (XEXP (x, 0)) == PLUS)
39898 /* Add with carry, ignore the cost of adding a carry flag. */
39899 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
39900 *total = cost->add;
39901 else
39903 *total = cost->lea;
39904 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39905 outer_code, opno, speed);
39908 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39909 outer_code, opno, speed);
39910 *total += rtx_cost (XEXP (x, 1), mode,
39911 outer_code, opno, speed);
39912 return true;
39915 /* FALLTHRU */
39917 case MINUS:
39918 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
39919 if (GET_MODE_CLASS (mode) == MODE_INT
39920 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
39921 && GET_CODE (XEXP (x, 0)) == MINUS
39922 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
39924 *total = cost->add;
39925 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39926 outer_code, opno, speed);
39927 *total += rtx_cost (XEXP (x, 1), mode,
39928 outer_code, opno, speed);
39929 return true;
39932 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39934 *total = cost->addss;
39935 return false;
39937 else if (X87_FLOAT_MODE_P (mode))
39939 *total = cost->fadd;
39940 return false;
39942 else if (FLOAT_MODE_P (mode))
39944 *total = ix86_vec_cost (mode, cost->addss, true);
39945 return false;
39947 /* FALLTHRU */
39949 case AND:
39950 case IOR:
39951 case XOR:
39952 if (GET_MODE_CLASS (mode) == MODE_INT
39953 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39955 *total = (cost->add * 2
39956 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
39957 << (GET_MODE (XEXP (x, 0)) != DImode))
39958 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
39959 << (GET_MODE (XEXP (x, 1)) != DImode)));
39960 return true;
39962 /* FALLTHRU */
39964 case NEG:
39965 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39967 *total = cost->sse_op;
39968 return false;
39970 else if (X87_FLOAT_MODE_P (mode))
39972 *total = cost->fchs;
39973 return false;
39975 else if (FLOAT_MODE_P (mode))
39977 *total = ix86_vec_cost (mode, cost->sse_op, true);
39978 return false;
39980 /* FALLTHRU */
39982 case NOT:
39983 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39984 *total = ix86_vec_cost (mode, cost->sse_op, true);
39985 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39986 *total = cost->add * 2;
39987 else
39988 *total = cost->add;
39989 return false;
39991 case COMPARE:
39992 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
39993 && XEXP (XEXP (x, 0), 1) == const1_rtx
39994 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
39995 && XEXP (x, 1) == const0_rtx)
39997 /* This kind of construct is implemented using test[bwl].
39998 Treat it as if we had an AND. */
39999 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40000 *total = (cost->add
40001 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40002 opno, speed)
40003 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40004 return true;
40007 /* The embedded comparison operand is completely free. */
40008 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40009 && XEXP (x, 1) == const0_rtx)
40010 *total = 0;
40012 return false;
40014 case FLOAT_EXTEND:
40015 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40016 *total = 0;
40017 else
40018 *total = ix86_vec_cost (mode, cost->addss, true);
40019 return false;
40021 case FLOAT_TRUNCATE:
40022 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40023 *total = cost->fadd;
40024 else
40025 *total = ix86_vec_cost (mode, cost->addss, true);
40026 return false;
40028 case ABS:
40029 /* SSE requires a memory load for the constant operand. It may make
40030 sense to account for this. Of course the constant operand may or
40031 may not be reused. */
40032 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40033 *total = cost->sse_op;
40034 else if (X87_FLOAT_MODE_P (mode))
40035 *total = cost->fabs;
40036 else if (FLOAT_MODE_P (mode))
40037 *total = ix86_vec_cost (mode, cost->sse_op, true);
40038 return false;
40040 case SQRT:
40041 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40042 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
40043 else if (X87_FLOAT_MODE_P (mode))
40044 *total = cost->fsqrt;
40045 else if (FLOAT_MODE_P (mode))
40046 *total = ix86_vec_cost (mode,
40047 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
40048 true);
40049 return false;
40051 case UNSPEC:
40052 if (XINT (x, 1) == UNSPEC_TP)
40053 *total = 0;
40054 return false;
40056 case VEC_SELECT:
40057 case VEC_CONCAT:
40058 case VEC_DUPLICATE:
40059 /* ??? Assume all of these vector manipulation patterns are
40060 recognizable, in which case they all pretty much have the
40061 same cost. */
40062 *total = cost->sse_op;
40063 return true;
40064 case VEC_MERGE:
40065 mask = XEXP (x, 2);
40066 /* This is a masked instruction; assume the same cost as
40067 the nonmasked variant. */
40068 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40069 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40070 else
40071 *total = cost->sse_op;
40072 return true;
40074 default:
40075 return false;
40079 #if TARGET_MACHO
40081 static int current_machopic_label_num;
40083 /* Given a symbol name and its associated stub, write out the
40084 definition of the stub. */
40086 void
40087 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40089 unsigned int length;
40090 char *binder_name, *symbol_name, lazy_ptr_name[32];
40091 int label = ++current_machopic_label_num;
40093 /* For 64-bit we shouldn't get here. */
40094 gcc_assert (!TARGET_64BIT);
40096 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40097 symb = targetm.strip_name_encoding (symb);
40099 length = strlen (stub);
40100 binder_name = XALLOCAVEC (char, length + 32);
40101 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40103 length = strlen (symb);
40104 symbol_name = XALLOCAVEC (char, length + 32);
40105 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40107 sprintf (lazy_ptr_name, "L%d$lz", label);
40109 if (MACHOPIC_ATT_STUB)
40110 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40111 else if (MACHOPIC_PURE)
40112 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40113 else
40114 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40116 fprintf (file, "%s:\n", stub);
40117 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40119 if (MACHOPIC_ATT_STUB)
40121 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40123 else if (MACHOPIC_PURE)
40125 /* PIC stub. */
40126 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40127 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40128 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40129 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40130 label, lazy_ptr_name, label);
40131 fprintf (file, "\tjmp\t*%%ecx\n");
40133 else
40134 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40136 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40137 it needs no stub-binding-helper. */
40138 if (MACHOPIC_ATT_STUB)
40139 return;
40141 fprintf (file, "%s:\n", binder_name);
40143 if (MACHOPIC_PURE)
40145 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40146 fprintf (file, "\tpushl\t%%ecx\n");
40148 else
40149 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40151 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40153 /* N.B. Keep the correspondence of these
40154 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40155 old-pic/new-pic/non-pic stubs; altering this will break
40156 compatibility with existing dylibs. */
40157 if (MACHOPIC_PURE)
40159 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40160 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40162 else
40163 /* 16-byte -mdynamic-no-pic stub. */
40164 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
40166 fprintf (file, "%s:\n", lazy_ptr_name);
40167 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40168 fprintf (file, ASM_LONG "%s\n", binder_name);
40170 #endif /* TARGET_MACHO */
40172 /* Order the registers for register allocator. */
40174 void
40175 x86_order_regs_for_local_alloc (void)
40177 int pos = 0;
40178 int i;
40180 /* First allocate the local general purpose registers. */
40181 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40182 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40183 reg_alloc_order [pos++] = i;
40185 /* Global general purpose registers. */
40186 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40187 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40188 reg_alloc_order [pos++] = i;
40190 /* x87 registers come first in case we are doing FP math
40191 using them. */
40192 if (!TARGET_SSE_MATH)
40193 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40194 reg_alloc_order [pos++] = i;
40196 /* SSE registers. */
40197 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40198 reg_alloc_order [pos++] = i;
40199 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40200 reg_alloc_order [pos++] = i;
40202 /* Extended REX SSE registers. */
40203 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40204 reg_alloc_order [pos++] = i;
40206 /* Mask registers. */
40207 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40208 reg_alloc_order [pos++] = i;
40210 /* MPX bound registers. */
40211 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40212 reg_alloc_order [pos++] = i;
40214 /* x87 registers. */
40215 if (TARGET_SSE_MATH)
40216 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40217 reg_alloc_order [pos++] = i;
40219 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40220 reg_alloc_order [pos++] = i;
40222 /* Initialize the rest of the array, as we do not allocate some registers
40223 at all. */
40224 while (pos < FIRST_PSEUDO_REGISTER)
40225 reg_alloc_order [pos++] = 0;
40228 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40229 in struct attribute_spec.handler. */
40230 static tree
40231 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
40232 bool *no_add_attrs)
40234 if (TREE_CODE (*node) != FUNCTION_TYPE
40235 && TREE_CODE (*node) != METHOD_TYPE
40236 && TREE_CODE (*node) != FIELD_DECL
40237 && TREE_CODE (*node) != TYPE_DECL)
40239 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40240 name);
40241 *no_add_attrs = true;
40242 return NULL_TREE;
40244 if (TARGET_64BIT)
40246 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40247 name);
40248 *no_add_attrs = true;
40249 return NULL_TREE;
40251 if (is_attribute_p ("callee_pop_aggregate_return", name))
40253 tree cst;
40255 cst = TREE_VALUE (args);
40256 if (TREE_CODE (cst) != INTEGER_CST)
40258 warning (OPT_Wattributes,
40259 "%qE attribute requires an integer constant argument",
40260 name);
40261 *no_add_attrs = true;
40263 else if (compare_tree_int (cst, 0) != 0
40264 && compare_tree_int (cst, 1) != 0)
40266 warning (OPT_Wattributes,
40267 "argument to %qE attribute is neither zero, nor one",
40268 name);
40269 *no_add_attrs = true;
40272 return NULL_TREE;
40275 return NULL_TREE;
40278 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
40279 struct attribute_spec.handler. */
40280 static tree
40281 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40282 bool *no_add_attrs)
40284 if (TREE_CODE (*node) != FUNCTION_TYPE
40285 && TREE_CODE (*node) != METHOD_TYPE
40286 && TREE_CODE (*node) != FIELD_DECL
40287 && TREE_CODE (*node) != TYPE_DECL)
40289 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40290 name);
40291 *no_add_attrs = true;
40292 return NULL_TREE;
40295 /* The "ms_abi" and "sysv_abi" attributes are mutually exclusive. */
40296 if (is_attribute_p ("ms_abi", name))
40298 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40300 error ("ms_abi and sysv_abi attributes are not compatible");
40303 return NULL_TREE;
40305 else if (is_attribute_p ("sysv_abi", name))
40307 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40309 error ("ms_abi and sysv_abi attributes are not compatible");
40312 return NULL_TREE;
40315 return NULL_TREE;
40318 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40319 struct attribute_spec.handler. */
40320 static tree
40321 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40322 bool *no_add_attrs)
40324 tree *type = NULL;
40325 if (DECL_P (*node))
40327 if (TREE_CODE (*node) == TYPE_DECL)
40328 type = &TREE_TYPE (*node);
40330 else
40331 type = node;
40333 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40335 warning (OPT_Wattributes, "%qE attribute ignored",
40336 name);
40337 *no_add_attrs = true;
40340 else if ((is_attribute_p ("ms_struct", name)
40341 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40342 || ((is_attribute_p ("gcc_struct", name)
40343 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40345 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40346 name);
40347 *no_add_attrs = true;
40350 return NULL_TREE;
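/* Handle an attribute that may only be applied to a FUNCTION_DECL;
arguments as in struct attribute_spec.handler. */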
40353 static tree
40354 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40355 bool *no_add_attrs)
40357 if (TREE_CODE (*node) != FUNCTION_DECL)
40359 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40360 name);
40361 *no_add_attrs = true;
40363 return NULL_TREE;
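/* Handle a "no_caller_saved_registers" attribute; arguments as in
struct attribute_spec.handler. The attribute needs no checking here,
so it is accepted as-is. */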
40366 static tree
40367 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40368 int, bool *)
40370 return NULL_TREE;
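/* Handle an "interrupt" attribute; arguments as in
struct attribute_spec.handler. Check that the declared prototype
matches what an interrupt service routine is allowed to take. */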
40373 static tree
40374 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40376 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
40377 but the function type contains args and return type data. */
40378 tree func_type = *node;
40379 tree return_type = TREE_TYPE (func_type);
40381 int nargs = 0;
40382 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40383 while (current_arg_type
40384 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40386 if (nargs == 0)
40388 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40389 error ("interrupt service routine should have a pointer "
40390 "as the first argument");
40392 else if (nargs == 1)
40394 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40395 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40396 error ("interrupt service routine should have unsigned %s"
40397 "int as the second argument",
40398 TARGET_64BIT
40399 ? (TARGET_X32 ? "long long " : "long ")
40400 : "");
40402 nargs++;
40403 current_arg_type = TREE_CHAIN (current_arg_type);
40405 if (!nargs || nargs > 2)
40406 error ("interrupt service routine can only have a pointer argument "
40407 "and an optional integer argument");
40408 if (! VOID_TYPE_P (return_type))
40409 error ("interrupt service routine can't have non-void return value");
40411 return NULL_TREE;
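/* Return true if bit-fields in RECORD_TYPE should be laid out with the
Microsoft rules: either MS bit-field layout is the target default and
"gcc_struct" is not specified, or the type carries the "ms_struct"
attribute. */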
40414 static bool
40415 ix86_ms_bitfield_layout_p (const_tree record_type)
40417 return ((TARGET_MS_BITFIELD_LAYOUT
40418 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40419 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40422 /* Returns an expression indicating where the this parameter is
40423 located on entry to the FUNCTION. */
40425 static rtx
40426 x86_this_parameter (tree function)
40428 tree type = TREE_TYPE (function);
40429 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40430 int nregs;
40432 if (TARGET_64BIT)
40434 const int *parm_regs;
40436 if (ix86_function_type_abi (type) == MS_ABI)
40437 parm_regs = x86_64_ms_abi_int_parameter_registers;
40438 else
40439 parm_regs = x86_64_int_parameter_registers;
40440 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40443 nregs = ix86_function_regparm (type, function);
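/* For 32-bit targets, `this' is passed in a register only when the
function takes register arguments and is not variadic; which register
is used depends on the calling convention and on whether the return
value is passed by invisible reference. */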
40445 if (nregs > 0 && !stdarg_p (type))
40447 int regno;
40448 unsigned int ccvt = ix86_get_callcvt (type);
40450 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40451 regno = aggr ? DX_REG : CX_REG;
40452 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40454 regno = CX_REG;
40455 if (aggr)
40456 return gen_rtx_MEM (SImode,
40457 plus_constant (Pmode, stack_pointer_rtx, 4));
40459 else
40461 regno = AX_REG;
40462 if (aggr)
40464 regno = DX_REG;
40465 if (nregs == 1)
40466 return gen_rtx_MEM (SImode,
40467 plus_constant (Pmode,
40468 stack_pointer_rtx, 4));
40471 return gen_rtx_REG (SImode, regno);
40474 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40475 aggr ? 8 : 4));
40478 /* Determine whether x86_output_mi_thunk can succeed. */
40480 static bool
40481 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40482 const_tree function)
40484 /* 64-bit can handle anything. */
40485 if (TARGET_64BIT)
40486 return true;
40488 /* For 32-bit, everything's fine if we have one free register. */
40489 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40490 return true;
40492 /* Need a free register for vcall_offset. */
40493 if (vcall_offset)
40494 return false;
40496 /* Need a free register for GOT references. */
40497 if (flag_pic && !targetm.binds_local_p (function))
40498 return false;
40500 /* Otherwise ok. */
40501 return true;
40504 /* Output the assembler code for a thunk function. THUNK_DECL is the
40505 declaration for the thunk function itself, FUNCTION is the decl for
40506 the target function. DELTA is an immediate constant offset to be
40507 added to THIS. If VCALL_OFFSET is nonzero, the word at
40508 *(*this + vcall_offset) should be added to THIS. */
40510 static void
40511 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40512 HOST_WIDE_INT vcall_offset, tree function)
40514 rtx this_param = x86_this_parameter (function);
40515 rtx this_reg, tmp, fnaddr;
40516 unsigned int tmp_regno;
40517 rtx_insn *insn;
40519 if (TARGET_64BIT)
40520 tmp_regno = R10_REG;
40521 else
40523 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40524 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40525 tmp_regno = AX_REG;
40526 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40527 tmp_regno = DX_REG;
40528 else
40529 tmp_regno = CX_REG;
40532 emit_note (NOTE_INSN_PROLOGUE_END);
40534 /* If CET branch protection is enabled, insert an ENDBR instruction. */
40535 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
40536 emit_insn (gen_nop_endbr ());
40538 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40539 pull it in now and let DELTA benefit. */
40540 if (REG_P (this_param))
40541 this_reg = this_param;
40542 else if (vcall_offset)
40544 /* Put the this parameter into %eax. */
40545 this_reg = gen_rtx_REG (Pmode, AX_REG);
40546 emit_move_insn (this_reg, this_param);
40548 else
40549 this_reg = NULL_RTX;
40551 /* Adjust the this parameter by a fixed constant. */
40552 if (delta)
40554 rtx delta_rtx = GEN_INT (delta);
40555 rtx delta_dst = this_reg ? this_reg : this_param;
40557 if (TARGET_64BIT)
40559 if (!x86_64_general_operand (delta_rtx, Pmode))
40561 tmp = gen_rtx_REG (Pmode, tmp_regno);
40562 emit_move_insn (tmp, delta_rtx);
40563 delta_rtx = tmp;
40567 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40570 /* Adjust the this parameter by a value stored in the vtable. */
40571 if (vcall_offset)
40573 rtx vcall_addr, vcall_mem, this_mem;
40575 tmp = gen_rtx_REG (Pmode, tmp_regno);
40577 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40578 if (Pmode != ptr_mode)
40579 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40580 emit_move_insn (tmp, this_mem);
40582 /* Adjust the this parameter. */
40583 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40584 if (TARGET_64BIT
40585 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40587 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40588 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40589 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40592 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40593 if (Pmode != ptr_mode)
40594 emit_insn (gen_addsi_1_zext (this_reg,
40595 gen_rtx_REG (ptr_mode,
40596 REGNO (this_reg)),
40597 vcall_mem));
40598 else
40599 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40602 /* If necessary, drop THIS back to its stack slot. */
40603 if (this_reg && this_reg != this_param)
40604 emit_move_insn (this_param, this_reg);
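/* Compute the address of FUNCTION. Under PIC, a function that does not
bind locally needs a GOT (or, for 32-bit Mach-O, stub) indirection. */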
40606 fnaddr = XEXP (DECL_RTL (function), 0);
40607 if (TARGET_64BIT)
40609 if (!flag_pic || targetm.binds_local_p (function)
40610 || TARGET_PECOFF)
40612 else
40614 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40615 tmp = gen_rtx_CONST (Pmode, tmp);
40616 fnaddr = gen_const_mem (Pmode, tmp);
40619 else
40621 if (!flag_pic || targetm.binds_local_p (function))
40623 #if TARGET_MACHO
40624 else if (TARGET_MACHO)
40626 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40627 fnaddr = XEXP (fnaddr, 0);
40629 #endif /* TARGET_MACHO */
40630 else
40632 tmp = gen_rtx_REG (Pmode, CX_REG);
40633 output_set_got (tmp, NULL_RTX);
40635 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40636 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40637 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40638 fnaddr = gen_const_mem (Pmode, fnaddr);
40642 /* Our sibling call patterns do not allow memories, because we have no
40643 predicate that can distinguish between frame and non-frame memory.
40644 For our purposes here, we can get away with (ab)using a jump pattern,
40645 because we're going to do no optimization. */
40646 if (MEM_P (fnaddr))
40648 if (sibcall_insn_operand (fnaddr, word_mode))
40650 fnaddr = XEXP (DECL_RTL (function), 0);
40651 tmp = gen_rtx_MEM (QImode, fnaddr);
40652 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40653 tmp = emit_call_insn (tmp);
40654 SIBLING_CALL_P (tmp) = 1;
40656 else
40657 emit_jump_insn (gen_indirect_jump (fnaddr));
40659 else
40661 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40663 // CM_LARGE_PIC always uses a pseudo PIC register, which is
40664 // uninitialized. Since FUNCTION is local and calling it
40665 // doesn't go through the PLT, we use scratch register %r11 as
40666 // the PIC register and initialize it here.
40667 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40668 ix86_init_large_pic_reg (tmp_regno);
40669 fnaddr = legitimize_pic_address (fnaddr,
40670 gen_rtx_REG (Pmode, tmp_regno));
40673 if (!sibcall_insn_operand (fnaddr, word_mode))
40675 tmp = gen_rtx_REG (word_mode, tmp_regno);
40676 if (GET_MODE (fnaddr) != word_mode)
40677 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40678 emit_move_insn (tmp, fnaddr);
40679 fnaddr = tmp;
40682 tmp = gen_rtx_MEM (QImode, fnaddr);
40683 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40684 tmp = emit_call_insn (tmp);
40685 SIBLING_CALL_P (tmp) = 1;
40687 emit_barrier ();
40689 /* Emit just enough of rest_of_compilation to get the insns emitted.
40690 Note that use_thunk calls assemble_start_function et al. */
40691 insn = get_insns ();
40692 shorten_branches (insn);
40693 final_start_function (insn, file, 1);
40694 final (insn, file, 1);
40695 final_end_function ();
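/* Output the assembly directives that must appear at the start of the
assembler output file. */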
40698 static void
40699 x86_file_start (void)
40701 default_file_start ();
40702 if (TARGET_16BIT)
40703 fputs ("\t.code16gcc\n", asm_out_file);
40704 #if TARGET_MACHO
40705 darwin_file_start ();
40706 #endif
40707 if (X86_FILE_START_VERSION_DIRECTIVE)
40708 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40709 if (X86_FILE_START_FLTUSED)
40710 fputs ("\t.global\t__fltused\n", asm_out_file);
40711 if (ix86_asm_dialect == ASM_INTEL)
40712 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
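/* Return the alignment to be used for a field of type TYPE, where
COMPUTED is the alignment determined so far. Without -malign-double,
32-bit targets cap the alignment of double and integer fields
at 32 bits. */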
40715 int
40716 x86_field_alignment (tree type, int computed)
40718 machine_mode mode;
40720 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40721 return computed;
40722 if (TARGET_IAMCU)
40723 return iamcu_alignment (type, computed);
40724 mode = TYPE_MODE (strip_array_types (type));
40725 if (mode == DFmode || mode == DCmode
40726 || GET_MODE_CLASS (mode) == MODE_INT
40727 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40728 return MIN (32, computed);
40729 return computed;
40732 /* Print call to TARGET to FILE. */
40734 static void
40735 x86_print_call_or_nop (FILE *file, const char *target)
40737 if (flag_nop_mcount)
40738 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
40739 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
40740 else
40741 fprintf (file, "1:\tcall\t%s\n", target);
40744 /* Output assembler code to FILE to increment profiler label # LABELNO
40745 for profiling a function entry. */
40746 void
40747 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40749 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40750 : MCOUNT_NAME);
40751 if (TARGET_64BIT)
40753 #ifndef NO_PROFILE_COUNTERS
40754 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40755 #endif
40757 if (!TARGET_PECOFF && flag_pic)
40758 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40759 else
40760 x86_print_call_or_nop (file, mcount_name);
40762 else if (flag_pic)
40764 #ifndef NO_PROFILE_COUNTERS
40765 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40766 LPREFIX, labelno);
40767 #endif
40768 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40770 else
40772 #ifndef NO_PROFILE_COUNTERS
40773 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40774 LPREFIX, labelno);
40775 #endif
40776 x86_print_call_or_nop (file, mcount_name);
40779 if (flag_record_mcount)
40781 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40782 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40783 fprintf (file, "\t.previous\n");
40787 /* We don't have exact information about the insn sizes, but we may assume
40788 quite safely that we are informed about all 1 byte insns and memory
40789 address sizes. This is enough to eliminate unnecessary padding in
40790 99% of cases. */
40792 static int
40793 ix86_min_insn_size (rtx_insn *insn)
40795 int l = 0, len;
40797 if (!INSN_P (insn) || !active_insn_p (insn))
40798 return 0;
40800 /* Discard alignments we've emitted and jump instructions. */
40801 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40802 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40803 return 0;
40805 /* Important case - calls are always 5 bytes.
40806 It is common to have many calls in a row. */
40807 if (CALL_P (insn)
40808 && symbolic_reference_mentioned_p (PATTERN (insn))
40809 && !SIBLING_CALL_P (insn))
40810 return 5;
40811 len = get_attr_length (insn);
40812 if (len <= 1)
40813 return 1;
40815 /* For normal instructions we rely on get_attr_length being exact,
40816 with a few exceptions. */
40817 if (!JUMP_P (insn))
40819 enum attr_type type = get_attr_type (insn);
40821 switch (type)
40823 case TYPE_MULTI:
40824 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40825 || asm_noperands (PATTERN (insn)) >= 0)
40826 return 0;
40827 break;
40828 case TYPE_OTHER:
40829 case TYPE_FCMP:
40830 break;
40831 default:
40832 /* Otherwise trust get_attr_length. */
40833 return len;
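/* For the remaining non-jump cases, estimate one opcode byte plus the
address length, assuming at least 4 address bytes when a symbolic
reference is mentioned; everything else is assumed to take 2 bytes. */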
40836 l = get_attr_length_address (insn);
40837 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40838 l = 4;
40840 if (l)
40841 return 1+l;
40842 else
40843 return 2;
40846 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40848 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
40849 window. */
40851 static void
40852 ix86_avoid_jump_mispredicts (void)
40854 rtx_insn *insn, *start = get_insns ();
40855 int nbytes = 0, njumps = 0;
40856 bool isjump = false;
40858 /* Look for all minimal intervals of instructions containing 4 jumps.
40859 The intervals are bounded by START and INSN. NBYTES is the total
40860 size of instructions in the interval including INSN and not including
40861 START. When NBYTES is smaller than 16 bytes, it is possible
40862 that the end of START and INSN end up in the same 16 byte page.
40864 The smallest offset in the page at which INSN can start is the case
40865 where START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
40866 We add a p2align to the 16 byte window with maxskip 15 - NBYTES + sizeof (INSN).
40868 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
40869 have to, since control transfer to its label(s) can be performed through other
40870 means; we also estimate the minimum length of all asm statements as 0. */
40871 for (insn = start; insn; insn = NEXT_INSN (insn))
40873 int min_size;
40875 if (LABEL_P (insn))
40877 int align = label_to_alignment (insn);
40878 int max_skip = label_to_max_skip (insn);
40880 if (max_skip > 15)
40881 max_skip = 15;
40882 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
40883 already in the current 16 byte page, because otherwise
40884 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
40885 bytes to reach 16 byte boundary. */
40886 if (align <= 0
40887 || (align <= 3 && max_skip != (1 << align) - 1))
40888 max_skip = 0;
40889 if (dump_file)
40890 fprintf (dump_file, "Label %i with max_skip %i\n",
40891 INSN_UID (insn), max_skip);
40892 if (max_skip)
40894 while (nbytes + max_skip >= 16)
40896 start = NEXT_INSN (start);
40897 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40898 || CALL_P (start))
40899 njumps--, isjump = true;
40900 else
40901 isjump = false;
40902 nbytes -= ix86_min_insn_size (start);
40905 continue;
40908 min_size = ix86_min_insn_size (insn);
40909 nbytes += min_size;
40910 if (dump_file)
40911 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
40912 INSN_UID (insn), min_size);
40913 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
40914 || CALL_P (insn))
40915 njumps++;
40916 else
40917 continue;
40919 while (njumps > 3)
40921 start = NEXT_INSN (start);
40922 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40923 || CALL_P (start))
40924 njumps--, isjump = true;
40925 else
40926 isjump = false;
40927 nbytes -= ix86_min_insn_size (start);
40929 gcc_assert (njumps >= 0);
40930 if (dump_file)
40931 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
40932 INSN_UID (start), INSN_UID (insn), nbytes);
40934 if (njumps == 3 && isjump && nbytes < 16)
40936 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
40938 if (dump_file)
40939 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
40940 INSN_UID (insn), padsize);
40941 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
40945 #endif
40947 /* AMD Athlon works faster
40948 when RET is not the destination of a conditional jump or directly preceded
40949 by another jump instruction. We avoid the penalty by inserting a NOP just
40950 before the RET instruction in such cases. */
40951 static void
40952 ix86_pad_returns (void)
40954 edge e;
40955 edge_iterator ei;
40957 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
40959 basic_block bb = e->src;
40960 rtx_insn *ret = BB_END (bb);
40961 rtx_insn *prev;
40962 bool replace = false;
40964 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
40965 || optimize_bb_for_size_p (bb))
40966 continue;
40967 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
40968 if (active_insn_p (prev) || LABEL_P (prev))
40969 break;
40970 if (prev && LABEL_P (prev))
40972 edge e;
40973 edge_iterator ei;
40975 FOR_EACH_EDGE (e, ei, bb->preds)
40976 if (EDGE_FREQUENCY (e) && e->src->index >= 0
40977 && !(e->flags & EDGE_FALLTHRU))
40979 replace = true;
40980 break;
40983 if (!replace)
40985 prev = prev_active_insn (ret);
40986 if (prev
40987 && ((JUMP_P (prev) && any_condjump_p (prev))
40988 || CALL_P (prev)))
40989 replace = true;
40990 /* Empty functions get a branch mispredict even when
40991 the jump destination is not visible to us. */
40992 if (!prev && !optimize_function_for_size_p (cfun))
40993 replace = true;
40995 if (replace)
40997 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
40998 delete_insn (ret);
41003 /* Count the minimum number of instructions in BB. Return 4 if the
41004 number of instructions >= 4. */
41006 static int
41007 ix86_count_insn_bb (basic_block bb)
41009 rtx_insn *insn;
41010 int insn_count = 0;
41012 /* Count number of instructions in this block. Return 4 if the number
41013 of instructions >= 4. */
41014 FOR_BB_INSNS (bb, insn)
41016 /* This only happens in exit blocks. */
41017 if (JUMP_P (insn)
41018 && ANY_RETURN_P (PATTERN (insn)))
41019 break;
41021 if (NONDEBUG_INSN_P (insn)
41022 && GET_CODE (PATTERN (insn)) != USE
41023 && GET_CODE (PATTERN (insn)) != CLOBBER)
41025 insn_count++;
41026 if (insn_count >= 4)
41027 return insn_count;
41031 return insn_count;
41035 /* Count the minimum number of instructions in code path in BB.
41036 Return 4 if the number of instructions >= 4. */
41038 static int
41039 ix86_count_insn (basic_block bb)
41041 edge e;
41042 edge_iterator ei;
41043 int min_prev_count;
41045 /* Only bother counting instructions along paths with no
41046 more than 2 basic blocks between entry and exit. Given
41047 that BB has an edge to exit, determine if a predecessor
41048 of BB has an edge from entry. If so, compute the number
41049 of instructions in the predecessor block. If there
41050 happen to be multiple such blocks, compute the minimum. */
41051 min_prev_count = 4;
41052 FOR_EACH_EDGE (e, ei, bb->preds)
41054 edge prev_e;
41055 edge_iterator prev_ei;
41057 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41059 min_prev_count = 0;
41060 break;
41062 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41064 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41066 int count = ix86_count_insn_bb (e->src);
41067 if (count < min_prev_count)
41068 min_prev_count = count;
41069 break;
41074 if (min_prev_count < 4)
41075 min_prev_count += ix86_count_insn_bb (bb);
41077 return min_prev_count;
41080 /* Pad short function to 4 instructions. */
41082 static void
41083 ix86_pad_short_function (void)
41085 edge e;
41086 edge_iterator ei;
41088 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41090 rtx_insn *ret = BB_END (e->src);
41091 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41093 int insn_count = ix86_count_insn (e->src);
41095 /* Pad short function. */
41096 if (insn_count < 4)
41098 rtx_insn *insn = ret;
41100 /* Find epilogue. */
41101 while (insn
41102 && (!NOTE_P (insn)
41103 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41104 insn = PREV_INSN (insn);
41106 if (!insn)
41107 insn = ret;
41109 /* Two NOPs count as one instruction. */
41110 insn_count = 2 * (4 - insn_count);
41111 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41117 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41118 the epilogue, the Windows system unwinder will apply epilogue logic and
41119 produce incorrect offsets. This can be avoided by adding a nop between
41120 the last insn that can throw and the first insn of the epilogue. */
41122 static void
41123 ix86_seh_fixup_eh_fallthru (void)
41125 edge e;
41126 edge_iterator ei;
41128 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41130 rtx_insn *insn, *next;
41132 /* Find the beginning of the epilogue. */
41133 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41134 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41135 break;
41136 if (insn == NULL)
41137 continue;
41139 /* We only care about preceding insns that can throw. */
41140 insn = prev_active_insn (insn);
41141 if (insn == NULL || !can_throw_internal (insn))
41142 continue;
41144 /* Do not separate calls from their debug information. */
41145 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41146 if (NOTE_P (next)
41147 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41148 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41149 insn = next;
41150 else
41151 break;
41153 emit_insn_after (gen_nops (const1_rtx), insn);
41157 /* Given a register number BASE, the lowest of a group of registers, update
41158 regsets IN and OUT with the registers that should be avoided in input
41159 and output operands respectively when trying to avoid generating a modr/m
41160 byte for -fmitigate-rop. */
41162 static void
41163 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41165 SET_HARD_REG_BIT (out, base);
41166 SET_HARD_REG_BIT (out, base + 1);
41167 SET_HARD_REG_BIT (in, base + 2);
41168 SET_HARD_REG_BIT (in, base + 3);
41171 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
41172 that certain encodings of modr/m bytes do not occur. */
41173 static void
41174 ix86_mitigate_rop (void)
41176 HARD_REG_SET input_risky;
41177 HARD_REG_SET output_risky;
41178 HARD_REG_SET inout_risky;
41180 CLEAR_HARD_REG_SET (output_risky);
41181 CLEAR_HARD_REG_SET (input_risky);
41182 SET_HARD_REG_BIT (output_risky, AX_REG);
41183 SET_HARD_REG_BIT (output_risky, CX_REG);
41184 SET_HARD_REG_BIT (input_risky, BX_REG);
41185 SET_HARD_REG_BIT (input_risky, DX_REG);
41186 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41187 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41188 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41189 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41190 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41191 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41192 COPY_HARD_REG_SET (inout_risky, input_risky);
41193 IOR_HARD_REG_SET (inout_risky, output_risky);
41195 df_note_add_problem ();
41196 /* Fix up what stack-regs did. */
41197 df_insn_rescan_all ();
41198 df_analyze ();
41200 regrename_init (true);
41201 regrename_analyze (NULL);
41203 auto_vec<du_head_p> cands;
41205 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41207 if (!NONDEBUG_INSN_P (insn))
41208 continue;
41210 if (GET_CODE (PATTERN (insn)) == USE
41211 || GET_CODE (PATTERN (insn)) == CLOBBER)
41212 continue;
41214 extract_insn (insn);
41216 int opno0, opno1;
41217 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41218 recog_data.n_operands, &opno0,
41219 &opno1);
41221 if (!ix86_rop_should_change_byte_p (modrm))
41222 continue;
41224 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41226 /* This happens when regrename has to fail a block. */
41227 if (!info->op_info)
41228 continue;
41230 if (info->op_info[opno0].n_chains != 0)
41232 gcc_assert (info->op_info[opno0].n_chains == 1);
41233 du_head_p op0c;
41234 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41235 if (op0c->target_data_1 + op0c->target_data_2 == 0
41236 && !op0c->cannot_rename)
41237 cands.safe_push (op0c);
41239 op0c->target_data_1++;
41241 if (info->op_info[opno1].n_chains != 0)
41243 gcc_assert (info->op_info[opno1].n_chains == 1);
41244 du_head_p op1c;
41245 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41246 if (op1c->target_data_1 + op1c->target_data_2 == 0
41247 && !op1c->cannot_rename)
41248 cands.safe_push (op1c);
41250 op1c->target_data_2++;
41254 int i;
41255 du_head_p head;
41256 FOR_EACH_VEC_ELT (cands, i, head)
41258 int old_reg, best_reg;
41259 HARD_REG_SET unavailable;
41261 CLEAR_HARD_REG_SET (unavailable);
41262 if (head->target_data_1)
41263 IOR_HARD_REG_SET (unavailable, output_risky);
41264 if (head->target_data_2)
41265 IOR_HARD_REG_SET (unavailable, input_risky);
41267 int n_uses;
41268 reg_class superclass = regrename_find_superclass (head, &n_uses,
41269 &unavailable);
41270 old_reg = head->regno;
41271 best_reg = find_rename_reg (head, superclass, &unavailable,
41272 old_reg, false);
41273 bool ok = regrename_do_replace (head, best_reg);
41274 gcc_assert (ok);
41275 if (dump_file)
41276 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41277 reg_names[best_reg], reg_class_names[superclass]);
41281 regrename_finish ();
41283 df_analyze ();
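/* Renaming whole chains may not have removed every risky modr/m byte.
As a second pass, rescan the insns and, where a suitable hard register
is free, copy the offending operand into it just before the insn. */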
41285 basic_block bb;
41286 regset_head live;
41288 INIT_REG_SET (&live);
41290 FOR_EACH_BB_FN (bb, cfun)
41292 rtx_insn *insn;
41294 COPY_REG_SET (&live, DF_LR_OUT (bb));
41295 df_simulate_initialize_backwards (bb, &live);
41297 FOR_BB_INSNS_REVERSE (bb, insn)
41299 if (!NONDEBUG_INSN_P (insn))
41300 continue;
41302 df_simulate_one_insn_backwards (bb, insn, &live);
41304 if (GET_CODE (PATTERN (insn)) == USE
41305 || GET_CODE (PATTERN (insn)) == CLOBBER)
41306 continue;
41308 extract_insn (insn);
41309 constrain_operands_cached (insn, reload_completed);
41310 int opno0, opno1;
41311 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41312 recog_data.n_operands, &opno0,
41313 &opno1);
41314 if (modrm < 0
41315 || !ix86_rop_should_change_byte_p (modrm)
41316 || opno0 == opno1)
41317 continue;
41319 rtx oldreg = recog_data.operand[opno1];
41320 preprocess_constraints (insn);
41321 const operand_alternative *alt = which_op_alt ();
41323 int i;
41324 for (i = 0; i < recog_data.n_operands; i++)
41325 if (i != opno1
41326 && alt[i].earlyclobber
41327 && reg_overlap_mentioned_p (recog_data.operand[i],
41328 oldreg))
41329 break;
41331 if (i < recog_data.n_operands)
41332 continue;
41334 if (dump_file)
41335 fprintf (dump_file,
41336 "attempting to fix modrm byte in insn %d:"
41337 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41338 reg_class_names[alt[opno1].cl]);
41340 HARD_REG_SET unavailable;
41341 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41342 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41343 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41344 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41345 IOR_HARD_REG_SET (unavailable, output_risky);
41346 IOR_COMPL_HARD_REG_SET (unavailable,
41347 reg_class_contents[alt[opno1].cl]);
41349 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41350 if (!TEST_HARD_REG_BIT (unavailable, i))
41351 break;
41352 if (i == FIRST_PSEUDO_REGISTER)
41354 if (dump_file)
41355 fprintf (dump_file, ", none available\n");
41356 continue;
41358 if (dump_file)
41359 fprintf (dump_file, " -> %d\n", i);
41360 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41361 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41362 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41367 /* Implement machine specific optimizations. We implement padding of returns
41368 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
41369 static void
41370 ix86_reorg (void)
41372 /* We are freeing block_for_insn in the toplev to keep compatibility
41373 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41374 compute_bb_for_insn ();
41376 if (flag_mitigate_rop)
41377 ix86_mitigate_rop ();
41379 if (TARGET_SEH && current_function_has_exception_handlers ())
41380 ix86_seh_fixup_eh_fallthru ();
41382 if (optimize && optimize_function_for_speed_p (cfun))
41384 if (TARGET_PAD_SHORT_FUNCTION)
41385 ix86_pad_short_function ();
41386 else if (TARGET_PAD_RETURNS)
41387 ix86_pad_returns ();
41388 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41389 if (TARGET_FOUR_JUMP_LIMIT)
41390 ix86_avoid_jump_mispredicts ();
41391 #endif
41395 /* Return nonzero when a QImode register that must be represented via a REX
41396 prefix is used. */
41397 bool
41398 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41400 int i;
41401 extract_insn_cached (insn);
41402 for (i = 0; i < recog_data.n_operands; i++)
41403 if (GENERAL_REG_P (recog_data.operand[i])
41404 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41405 return true;
41406 return false;
41409 /* Return true when INSN mentions a register that must be encoded using a
41410 REX prefix. */
41411 bool
41412 x86_extended_reg_mentioned_p (rtx insn)
41414 subrtx_iterator::array_type array;
41415 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41417 const_rtx x = *iter;
41418 if (REG_P (x)
41419 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41420 return true;
41422 return false;
41425 /* If profitable, negate (without causing overflow) integer constant
41426 of mode MODE at location LOC. Return true in this case. */
41427 bool
41428 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41430 HOST_WIDE_INT val;
41432 if (!CONST_INT_P (*loc))
41433 return false;
41435 switch (mode)
41437 case E_DImode:
41438 /* DImode x86_64 constants must fit in 32 bits. */
41439 gcc_assert (x86_64_immediate_operand (*loc, mode));
41441 mode = SImode;
41442 break;
41444 case E_SImode:
41445 case E_HImode:
41446 case E_QImode:
41447 break;
41449 default:
41450 gcc_unreachable ();
41453 /* Avoid overflows. */
41454 if (mode_signbit_p (mode, *loc))
41455 return false;
41457 val = INTVAL (*loc);
41459 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
41460 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
41461 if ((val < 0 && val != -128)
41462 || val == 128)
41464 *loc = GEN_INT (-val);
41465 return true;
41468 return false;
41471 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41472 optabs would emit if we didn't have TFmode patterns. */
41474 void
41475 x86_emit_floatuns (rtx operands[2])
41477 rtx_code_label *neglab, *donelab;
41478 rtx i0, i1, f0, in, out;
41479 machine_mode mode, inmode;
41481 inmode = GET_MODE (operands[1]);
41482 gcc_assert (inmode == SImode || inmode == DImode);
41484 out = operands[0];
41485 in = force_reg (inmode, operands[1]);
41486 mode = GET_MODE (out);
41487 neglab = gen_label_rtx ();
41488 donelab = gen_label_rtx ();
41489 f0 = gen_reg_rtx (mode);
41491 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41493 expand_float (out, in, 0);
41495 emit_jump_insn (gen_jump (donelab));
41496 emit_barrier ();
41498 emit_label (neglab);
41500 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41501 1, OPTAB_DIRECT);
41502 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41503 1, OPTAB_DIRECT);
41504 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41506 expand_float (f0, i0, 0);
41508 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41510 emit_label (donelab);
41513 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41514 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41515 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41516 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41518 /* Get a vector mode of the same size as the original but with elements
41519 twice as wide. This is only guaranteed to apply to integral vectors. */
41521 static inline machine_mode
41522 get_mode_wider_vector (machine_mode o)
41524 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41525 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
41526 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41527 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41528 return n;
41531 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41532 fill target with val via vec_duplicate. */
41534 static bool
41535 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41537 bool ok;
41538 rtx_insn *insn;
41539 rtx dup;
41541 /* First attempt to recognize VAL as-is. */
41542 dup = gen_vec_duplicate (mode, val);
41543 insn = emit_insn (gen_rtx_SET (target, dup));
41544 if (recog_memoized (insn) < 0)
41546 rtx_insn *seq;
41547 machine_mode innermode = GET_MODE_INNER (mode);
41548 rtx reg;
41550 /* If that fails, force VAL into a register. */
41552 start_sequence ();
41553 reg = force_reg (innermode, val);
41554 if (GET_MODE (reg) != innermode)
41555 reg = gen_lowpart (innermode, reg);
41556 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
41557 seq = get_insns ();
41558 end_sequence ();
41559 if (seq)
41560 emit_insn_before (seq, insn);
41562 ok = recog_memoized (insn) >= 0;
41563 gcc_assert (ok);
41565 return true;
41568 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41569 with all elements equal to VAR. Return true if successful. */
41571 static bool
41572 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41573 rtx target, rtx val)
41575 bool ok;
41577 switch (mode)
41579 case E_V2SImode:
41580 case E_V2SFmode:
41581 if (!mmx_ok)
41582 return false;
41583 /* FALLTHRU */
41585 case E_V4DFmode:
41586 case E_V4DImode:
41587 case E_V8SFmode:
41588 case E_V8SImode:
41589 case E_V2DFmode:
41590 case E_V2DImode:
41591 case E_V4SFmode:
41592 case E_V4SImode:
41593 case E_V16SImode:
41594 case E_V8DImode:
41595 case E_V16SFmode:
41596 case E_V8DFmode:
41597 return ix86_vector_duplicate_value (mode, target, val);
41599 case E_V4HImode:
41600 if (!mmx_ok)
41601 return false;
41602 if (TARGET_SSE || TARGET_3DNOW_A)
41604 rtx x;
41606 val = gen_lowpart (SImode, val);
41607 x = gen_rtx_TRUNCATE (HImode, val);
41608 x = gen_rtx_VEC_DUPLICATE (mode, x);
41609 emit_insn (gen_rtx_SET (target, x));
41610 return true;
41612 goto widen;
41614 case E_V8QImode:
41615 if (!mmx_ok)
41616 return false;
41617 goto widen;
41619 case E_V8HImode:
41620 if (TARGET_AVX2)
41621 return ix86_vector_duplicate_value (mode, target, val);
41623 if (TARGET_SSE2)
41625 struct expand_vec_perm_d dperm;
41626 rtx tmp1, tmp2;
41628 permute:
41629 memset (&dperm, 0, sizeof (dperm));
41630 dperm.target = target;
41631 dperm.vmode = mode;
41632 dperm.nelt = GET_MODE_NUNITS (mode);
41633 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41634 dperm.one_operand_p = true;
41636 /* Extend to SImode using a paradoxical SUBREG. */
41637 tmp1 = gen_reg_rtx (SImode);
41638 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41640 /* Insert the SImode value as low element of a V4SImode vector. */
41641 tmp2 = gen_reg_rtx (V4SImode);
41642 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41643 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41645 ok = (expand_vec_perm_1 (&dperm)
41646 || expand_vec_perm_broadcast_1 (&dperm));
41647 gcc_assert (ok);
41648 return ok;
41650 goto widen;
41652 case E_V16QImode:
41653 if (TARGET_AVX2)
41654 return ix86_vector_duplicate_value (mode, target, val);
41656 if (TARGET_SSE2)
41657 goto permute;
41658 goto widen;
41660 widen:
41661 /* Replicate the value once into the next wider mode and recurse. */
41663 machine_mode smode, wsmode, wvmode;
41664 rtx x;
41666 smode = GET_MODE_INNER (mode);
41667 wvmode = get_mode_wider_vector (mode);
41668 wsmode = GET_MODE_INNER (wvmode);
41670 val = convert_modes (wsmode, smode, val, true);
41671 x = expand_simple_binop (wsmode, ASHIFT, val,
41672 GEN_INT (GET_MODE_BITSIZE (smode)),
41673 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41674 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41676 x = gen_reg_rtx (wvmode);
41677 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41678 gcc_assert (ok);
41679 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41680 return ok;
41683 case E_V16HImode:
41684 case E_V32QImode:
41685 if (TARGET_AVX2)
41686 return ix86_vector_duplicate_value (mode, target, val);
41687 else
41689 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41690 rtx x = gen_reg_rtx (hvmode);
41692 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41693 gcc_assert (ok);
41695 x = gen_rtx_VEC_CONCAT (mode, x, x);
41696 emit_insn (gen_rtx_SET (target, x));
41698 return true;
41700 case E_V64QImode:
41701 case E_V32HImode:
41702 if (TARGET_AVX512BW)
41703 return ix86_vector_duplicate_value (mode, target, val);
41704 else
41706 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41707 rtx x = gen_reg_rtx (hvmode);
41709 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41710 gcc_assert (ok);
41712 x = gen_rtx_VEC_CONCAT (mode, x, x);
41713 emit_insn (gen_rtx_SET (target, x));
41715 return true;
41717 default:
41718 return false;
41722 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41723 whose ONE_VAR element is VAR, and other elements are zero. Return true
41724 if successful. */
41726 static bool
41727 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41728 rtx target, rtx var, int one_var)
41730 machine_mode vsimode;
41731 rtx new_target;
41732 rtx x, tmp;
41733 bool use_vector_set = false;
41735 switch (mode)
41737 case E_V2DImode:
41738 /* For SSE4.1, we normally use vector set. But if the second
41739 element is zero and inter-unit moves are OK, we use movq
41740 instead. */
41741 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41742 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41743 && one_var == 0));
41744 break;
41745 case E_V16QImode:
41746 case E_V4SImode:
41747 case E_V4SFmode:
41748 use_vector_set = TARGET_SSE4_1;
41749 break;
41750 case E_V8HImode:
41751 use_vector_set = TARGET_SSE2;
41752 break;
41753 case E_V4HImode:
41754 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41755 break;
41756 case E_V32QImode:
41757 case E_V16HImode:
41758 case E_V8SImode:
41759 case E_V8SFmode:
41760 case E_V4DFmode:
41761 use_vector_set = TARGET_AVX;
41762 break;
41763 case E_V4DImode:
41764 /* Use ix86_expand_vector_set in 64bit mode only. */
41765 use_vector_set = TARGET_AVX && TARGET_64BIT;
41766 break;
41767 default:
41768 break;
41771 if (use_vector_set)
41773 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41774 var = force_reg (GET_MODE_INNER (mode), var);
41775 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41776 return true;
41779 switch (mode)
41781 case E_V2SFmode:
41782 case E_V2SImode:
41783 if (!mmx_ok)
41784 return false;
41785 /* FALLTHRU */
41787 case E_V2DFmode:
41788 case E_V2DImode:
41789 if (one_var != 0)
41790 return false;
41791 var = force_reg (GET_MODE_INNER (mode), var);
41792 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41793 emit_insn (gen_rtx_SET (target, x));
41794 return true;
41796 case E_V4SFmode:
41797 case E_V4SImode:
41798 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41799 new_target = gen_reg_rtx (mode);
41800 else
41801 new_target = target;
41802 var = force_reg (GET_MODE_INNER (mode), var);
41803 x = gen_rtx_VEC_DUPLICATE (mode, var);
41804 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41805 emit_insn (gen_rtx_SET (new_target, x));
41806 if (one_var != 0)
41808 /* We need to shuffle the value to the correct position, so
41809 create a new pseudo to store the intermediate result. */
41811 /* With SSE2, we can use the integer shuffle insns. */
41812 if (mode != V4SFmode && TARGET_SSE2)
41814 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41815 const1_rtx,
41816 GEN_INT (one_var == 1 ? 0 : 1),
41817 GEN_INT (one_var == 2 ? 0 : 1),
41818 GEN_INT (one_var == 3 ? 0 : 1)));
41819 if (target != new_target)
41820 emit_move_insn (target, new_target);
41821 return true;
41824 /* Otherwise convert the intermediate result to V4SFmode and
41825 use the SSE1 shuffle instructions. */
41826 if (mode != V4SFmode)
41828 tmp = gen_reg_rtx (V4SFmode);
41829 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41831 else
41832 tmp = new_target;
41834 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41835 const1_rtx,
41836 GEN_INT (one_var == 1 ? 0 : 1),
41837 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41838 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41840 if (mode != V4SFmode)
41841 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41842 else if (tmp != target)
41843 emit_move_insn (target, tmp);
41845 else if (target != new_target)
41846 emit_move_insn (target, new_target);
41847 return true;
41849 case E_V8HImode:
41850 case E_V16QImode:
41851 vsimode = V4SImode;
41852 goto widen;
41853 case E_V4HImode:
41854 case E_V8QImode:
41855 if (!mmx_ok)
41856 return false;
41857 vsimode = V2SImode;
41858 goto widen;
41859 widen:
41860 if (one_var != 0)
41861 return false;
41863 /* Zero extend the variable element to SImode and recurse. */
41864 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41866 x = gen_reg_rtx (vsimode);
41867 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41868 var, one_var))
41869 gcc_unreachable ();
41871 emit_move_insn (target, gen_lowpart (mode, x));
41872 return true;
41874 default:
41875 return false;
41879 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41880 consisting of the values in VALS. It is known that all elements
41881 except ONE_VAR are constants. Return true if successful. */
41883 static bool
41884 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
41885 rtx target, rtx vals, int one_var)
41887 rtx var = XVECEXP (vals, 0, one_var);
41888 machine_mode wmode;
41889 rtx const_vec, x;
41891 const_vec = copy_rtx (vals);
41892 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
41893 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
41895 switch (mode)
41897 case E_V2DFmode:
41898 case E_V2DImode:
41899 case E_V2SFmode:
41900 case E_V2SImode:
41901 /* For the two element vectors, it's just as easy to use
41902 the general case. */
41903 return false;
41905 case E_V4DImode:
41906 /* Use ix86_expand_vector_set in 64bit mode only. */
41907 if (!TARGET_64BIT)
41908 return false;
41909 /* FALLTHRU */
41910 case E_V4DFmode:
41911 case E_V8SFmode:
41912 case E_V8SImode:
41913 case E_V16HImode:
41914 case E_V32QImode:
41915 case E_V4SFmode:
41916 case E_V4SImode:
41917 case E_V8HImode:
41918 case E_V4HImode:
41919 break;
41921 case E_V16QImode:
41922 if (TARGET_SSE4_1)
41923 break;
41924 wmode = V8HImode;
41925 goto widen;
41926 case E_V8QImode:
41927 wmode = V4HImode;
41928 goto widen;
41929 widen:
41930 /* There's no way to set one QImode entry easily. Combine
41931 the variable value with its adjacent constant value, and
41932 promote to an HImode set. */
41933 x = XVECEXP (vals, 0, one_var ^ 1);
41934 if (one_var & 1)
41936 var = convert_modes (HImode, QImode, var, true);
41937 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
41938 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41939 x = GEN_INT (INTVAL (x) & 0xff);
41941 else
41943 var = convert_modes (HImode, QImode, var, true);
41944 x = gen_int_mode (INTVAL (x) << 8, HImode);
41946 if (x != const0_rtx)
41947 var = expand_simple_binop (HImode, IOR, var, x, var,
41948 1, OPTAB_LIB_WIDEN);
41950 x = gen_reg_rtx (wmode);
41951 emit_move_insn (x, gen_lowpart (wmode, const_vec));
41952 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
41954 emit_move_insn (target, gen_lowpart (mode, x));
41955 return true;
41957 default:
41958 return false;
41961 emit_move_insn (target, const_vec);
41962 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41963 return true;
41966 /* A subroutine of ix86_expand_vector_init_general. Use vector
41967 concatenate to handle the most general case: all values variable,
41968 and none identical. */
41970 static void
41971 ix86_expand_vector_init_concat (machine_mode mode,
41972 rtx target, rtx *ops, int n)
41974 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
41975 rtx first[16], second[8], third[4];
41976 rtvec v;
41977 int i, j;
41979 switch (n)
41981 case 2:
41982 switch (mode)
41984 case E_V16SImode:
41985 cmode = V8SImode;
41986 break;
41987 case E_V16SFmode:
41988 cmode = V8SFmode;
41989 break;
41990 case E_V8DImode:
41991 cmode = V4DImode;
41992 break;
41993 case E_V8DFmode:
41994 cmode = V4DFmode;
41995 break;
41996 case E_V8SImode:
41997 cmode = V4SImode;
41998 break;
41999 case E_V8SFmode:
42000 cmode = V4SFmode;
42001 break;
42002 case E_V4DImode:
42003 cmode = V2DImode;
42004 break;
42005 case E_V4DFmode:
42006 cmode = V2DFmode;
42007 break;
42008 case E_V4SImode:
42009 cmode = V2SImode;
42010 break;
42011 case E_V4SFmode:
42012 cmode = V2SFmode;
42013 break;
42014 case E_V2DImode:
42015 cmode = DImode;
42016 break;
42017 case E_V2SImode:
42018 cmode = SImode;
42019 break;
42020 case E_V2DFmode:
42021 cmode = DFmode;
42022 break;
42023 case E_V2SFmode:
42024 cmode = SFmode;
42025 break;
42026 default:
42027 gcc_unreachable ();
42030 if (!register_operand (ops[1], cmode))
42031 ops[1] = force_reg (cmode, ops[1]);
42032 if (!register_operand (ops[0], cmode))
42033 ops[0] = force_reg (cmode, ops[0]);
42034 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42035 ops[1])));
42036 break;
42038 case 4:
42039 switch (mode)
42041 case E_V4DImode:
42042 cmode = V2DImode;
42043 break;
42044 case E_V4DFmode:
42045 cmode = V2DFmode;
42046 break;
42047 case E_V4SImode:
42048 cmode = V2SImode;
42049 break;
42050 case E_V4SFmode:
42051 cmode = V2SFmode;
42052 break;
42053 default:
42054 gcc_unreachable ();
42056 goto half;
42058 case 8:
42059 switch (mode)
42061 case E_V8DImode:
42062 cmode = V2DImode;
42063 hmode = V4DImode;
42064 break;
42065 case E_V8DFmode:
42066 cmode = V2DFmode;
42067 hmode = V4DFmode;
42068 break;
42069 case E_V8SImode:
42070 cmode = V2SImode;
42071 hmode = V4SImode;
42072 break;
42073 case E_V8SFmode:
42074 cmode = V2SFmode;
42075 hmode = V4SFmode;
42076 break;
42077 default:
42078 gcc_unreachable ();
42080 goto half;
42082 case 16:
42083 switch (mode)
42085 case E_V16SImode:
42086 cmode = V2SImode;
42087 hmode = V4SImode;
42088 gmode = V8SImode;
42089 break;
42090 case E_V16SFmode:
42091 cmode = V2SFmode;
42092 hmode = V4SFmode;
42093 gmode = V8SFmode;
42094 break;
42095 default:
42096 gcc_unreachable ();
42098 goto half;
42100 half:
42101 /* FIXME: We process inputs backward to help RA. PR 36222. */
42102 i = n - 1;
42103 j = (n >> 1) - 1;
42104 for (; i > 0; i -= 2, j--)
42106 first[j] = gen_reg_rtx (cmode);
42107 v = gen_rtvec (2, ops[i - 1], ops[i]);
42108 ix86_expand_vector_init (false, first[j],
42109 gen_rtx_PARALLEL (cmode, v));
42112 n >>= 1;
42113 if (n > 4)
42115 gcc_assert (hmode != VOIDmode);
42116 gcc_assert (gmode != VOIDmode);
42117 for (i = j = 0; i < n; i += 2, j++)
42119 second[j] = gen_reg_rtx (hmode);
42120 ix86_expand_vector_init_concat (hmode, second [j],
42121 &first [i], 2);
42123 n >>= 1;
42124 for (i = j = 0; i < n; i += 2, j++)
42126 third[j] = gen_reg_rtx (gmode);
42127 ix86_expand_vector_init_concat (gmode, third[j],
42128 &second[i], 2);
42130 n >>= 1;
42131 ix86_expand_vector_init_concat (mode, target, third, n);
42133 else if (n > 2)
42135 gcc_assert (hmode != VOIDmode);
42136 for (i = j = 0; i < n; i += 2, j++)
42138 second[j] = gen_reg_rtx (hmode);
42139 ix86_expand_vector_init_concat (hmode, second [j],
42140 &first [i], 2);
42142 n >>= 1;
42143 ix86_expand_vector_init_concat (mode, target, second, n);
42145 else
42146 ix86_expand_vector_init_concat (mode, target, first, n);
42147 break;
42149 default:
42150 gcc_unreachable ();
42154 /* A subroutine of ix86_expand_vector_init_general. Use vector
42155 interleave to handle the most general case: all values variable,
42156 and none identical. */
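/* Rough sketch of the flow below for V16QImode: each pair of QImode
   operands is packed into the low 16 bits of its own 128-bit register,
   interleaving those results at V8HImode width merges them into four
   V4SImode values, interleaving at V4SImode width gives two V2DImode
   values, and one final V2DImode interleave produces the full vector.  */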
42158 static void
42159 ix86_expand_vector_init_interleave (machine_mode mode,
42160 rtx target, rtx *ops, int n)
42162 machine_mode first_imode, second_imode, third_imode, inner_mode;
42163 int i, j;
42164 rtx op0, op1;
42165 rtx (*gen_load_even) (rtx, rtx, rtx);
42166 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42167 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42169 switch (mode)
42171 case E_V8HImode:
42172 gen_load_even = gen_vec_setv8hi;
42173 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42174 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42175 inner_mode = HImode;
42176 first_imode = V4SImode;
42177 second_imode = V2DImode;
42178 third_imode = VOIDmode;
42179 break;
42180 case E_V16QImode:
42181 gen_load_even = gen_vec_setv16qi;
42182 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42183 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42184 inner_mode = QImode;
42185 first_imode = V8HImode;
42186 second_imode = V4SImode;
42187 third_imode = V2DImode;
42188 break;
42189 default:
42190 gcc_unreachable ();
42193 for (i = 0; i < n; i++)
42195 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42196 op0 = gen_reg_rtx (SImode);
42197 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42199 /* Insert the SImode value as low element of V4SImode vector. */
42200 op1 = gen_reg_rtx (V4SImode);
42201 op0 = gen_rtx_VEC_MERGE (V4SImode,
42202 gen_rtx_VEC_DUPLICATE (V4SImode,
42203 op0),
42204 CONST0_RTX (V4SImode),
42205 const1_rtx);
42206 emit_insn (gen_rtx_SET (op1, op0));
42208 /* Cast the V4SImode vector back to a vector in the original mode. */
42209 op0 = gen_reg_rtx (mode);
42210 emit_move_insn (op0, gen_lowpart (mode, op1));
42212 /* Load even elements into the second position. */
42213 emit_insn (gen_load_even (op0,
42214 force_reg (inner_mode,
42215 ops [i + i + 1]),
42216 const1_rtx));
42218 /* Cast vector to FIRST_IMODE vector. */
42219 ops[i] = gen_reg_rtx (first_imode);
42220 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42223 /* Interleave low FIRST_IMODE vectors. */
42224 for (i = j = 0; i < n; i += 2, j++)
42226 op0 = gen_reg_rtx (first_imode);
42227 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42229 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42230 ops[j] = gen_reg_rtx (second_imode);
42231 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42234 /* Interleave low SECOND_IMODE vectors. */
42235 switch (second_imode)
42237 case E_V4SImode:
42238 for (i = j = 0; i < n / 2; i += 2, j++)
42240 op0 = gen_reg_rtx (second_imode);
42241 emit_insn (gen_interleave_second_low (op0, ops[i],
42242 ops[i + 1]));
42244 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42245 vector. */
42246 ops[j] = gen_reg_rtx (third_imode);
42247 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42249 second_imode = V2DImode;
42250 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42251 /* FALLTHRU */
42253 case E_V2DImode:
42254 op0 = gen_reg_rtx (second_imode);
42255 emit_insn (gen_interleave_second_low (op0, ops[0],
42256 ops[1]));
42258 /* Cast the SECOND_IMODE vector back to a vector on original
42259 mode. */
42260 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42261 break;
42263 default:
42264 gcc_unreachable ();
42268 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42269 all values variable, and none identical. */
42271 static void
42272 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42273 rtx target, rtx vals)
42275 rtx ops[64], op0, op1, op2, op3, op4, op5;
42276 machine_mode half_mode = VOIDmode;
42277 machine_mode quarter_mode = VOIDmode;
42278 int n, i;
42280 switch (mode)
42282 case E_V2SFmode:
42283 case E_V2SImode:
42284 if (!mmx_ok && !TARGET_SSE)
42285 break;
42286 /* FALLTHRU */
42288 case E_V16SImode:
42289 case E_V16SFmode:
42290 case E_V8DFmode:
42291 case E_V8DImode:
42292 case E_V8SFmode:
42293 case E_V8SImode:
42294 case E_V4DFmode:
42295 case E_V4DImode:
42296 case E_V4SFmode:
42297 case E_V4SImode:
42298 case E_V2DFmode:
42299 case E_V2DImode:
42300 n = GET_MODE_NUNITS (mode);
42301 for (i = 0; i < n; i++)
42302 ops[i] = XVECEXP (vals, 0, i);
42303 ix86_expand_vector_init_concat (mode, target, ops, n);
42304 return;
42306 case E_V2TImode:
42307 for (i = 0; i < 2; i++)
42308 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42309 op0 = gen_reg_rtx (V4DImode);
42310 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
42311 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42312 return;
42314 case E_V4TImode:
42315 for (i = 0; i < 4; i++)
42316 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42317 ops[4] = gen_reg_rtx (V4DImode);
42318 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
42319 ops[5] = gen_reg_rtx (V4DImode);
42320 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
42321 op0 = gen_reg_rtx (V8DImode);
42322 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
42323 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42324 return;
42326 case E_V32QImode:
42327 half_mode = V16QImode;
42328 goto half;
42330 case E_V16HImode:
42331 half_mode = V8HImode;
42332 goto half;
42334 half:
42335 n = GET_MODE_NUNITS (mode);
42336 for (i = 0; i < n; i++)
42337 ops[i] = XVECEXP (vals, 0, i);
42338 op0 = gen_reg_rtx (half_mode);
42339 op1 = gen_reg_rtx (half_mode);
42340 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42341 n >> 2);
42342 ix86_expand_vector_init_interleave (half_mode, op1,
42343 &ops [n >> 1], n >> 2);
42344 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42345 return;
42347 case E_V64QImode:
42348 quarter_mode = V16QImode;
42349 half_mode = V32QImode;
42350 goto quarter;
42352 case E_V32HImode:
42353 quarter_mode = V8HImode;
42354 half_mode = V16HImode;
42355 goto quarter;
42357 quarter:
42358 n = GET_MODE_NUNITS (mode);
42359 for (i = 0; i < n; i++)
42360 ops[i] = XVECEXP (vals, 0, i);
42361 op0 = gen_reg_rtx (quarter_mode);
42362 op1 = gen_reg_rtx (quarter_mode);
42363 op2 = gen_reg_rtx (quarter_mode);
42364 op3 = gen_reg_rtx (quarter_mode);
42365 op4 = gen_reg_rtx (half_mode);
42366 op5 = gen_reg_rtx (half_mode);
42367 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42368 n >> 3);
42369 ix86_expand_vector_init_interleave (quarter_mode, op1,
42370 &ops [n >> 2], n >> 3);
42371 ix86_expand_vector_init_interleave (quarter_mode, op2,
42372 &ops [n >> 1], n >> 3);
42373 ix86_expand_vector_init_interleave (quarter_mode, op3,
42374 &ops [(n >> 1) | (n >> 2)], n >> 3);
42375 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42376 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42377 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42378 return;
42380 case E_V16QImode:
42381 if (!TARGET_SSE4_1)
42382 break;
42383 /* FALLTHRU */
42385 case E_V8HImode:
42386 if (!TARGET_SSE2)
42387 break;
42389 /* Don't use ix86_expand_vector_init_interleave if we can't
42390 move from GPR to SSE register directly. */
42391 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42392 break;
42394 n = GET_MODE_NUNITS (mode);
42395 for (i = 0; i < n; i++)
42396 ops[i] = XVECEXP (vals, 0, i);
42397 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42398 return;
42400 case E_V4HImode:
42401 case E_V8QImode:
42402 break;
42404 default:
42405 gcc_unreachable ();
42409 int i, j, n_elts, n_words, n_elt_per_word;
42410 machine_mode inner_mode;
42411 rtx words[4], shift;
42413 inner_mode = GET_MODE_INNER (mode);
42414 n_elts = GET_MODE_NUNITS (mode);
42415 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42416 n_elt_per_word = n_elts / n_words;
42417 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42419 for (i = 0; i < n_words; ++i)
42421 rtx word = NULL_RTX;
42423 for (j = 0; j < n_elt_per_word; ++j)
42425 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42426 elt = convert_modes (word_mode, inner_mode, elt, true);
42428 if (j == 0)
42429 word = elt;
42430 else
42432 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42433 word, 1, OPTAB_LIB_WIDEN);
42434 word = expand_simple_binop (word_mode, IOR, word, elt,
42435 word, 1, OPTAB_LIB_WIDEN);
42439 words[i] = word;
42442 if (n_words == 1)
42443 emit_move_insn (target, gen_lowpart (mode, words[0]));
42444 else if (n_words == 2)
42446 rtx tmp = gen_reg_rtx (mode);
42447 emit_clobber (tmp);
42448 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42449 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42450 emit_move_insn (target, tmp);
42452 else if (n_words == 4)
42454 rtx tmp = gen_reg_rtx (V4SImode);
42455 gcc_assert (word_mode == SImode);
42456 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42457 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42458 emit_move_insn (target, gen_lowpart (mode, tmp));
42460 else
42461 gcc_unreachable ();
42465 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42466 instructions unless MMX_OK is true. */
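/* For example (V4SImode, purely illustrative): { 1, 2, 3, 4 } is loaded
   straight from the constant pool, { x, x, x, x } goes through the
   broadcast path, { x, 0, 0, 0 } through the one-nonzero path, and a
   vector with several variable elements falls through to the general
   expansion.  */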
42468 void
42469 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42471 machine_mode mode = GET_MODE (target);
42472 machine_mode inner_mode = GET_MODE_INNER (mode);
42473 int n_elts = GET_MODE_NUNITS (mode);
42474 int n_var = 0, one_var = -1;
42475 bool all_same = true, all_const_zero = true;
42476 int i;
42477 rtx x;
42479 /* Handle first initialization from vector elts. */
42480 if (n_elts != XVECLEN (vals, 0))
42482 rtx subtarget = target;
42483 x = XVECEXP (vals, 0, 0);
42484 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
42485 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
42487 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
42488 if (inner_mode == QImode || inner_mode == HImode)
42490 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
42491 mode = mode_for_vector (SImode, n_bits / 4).require ();
42492 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
42493 ops[0] = gen_lowpart (inner_mode, ops[0]);
42494 ops[1] = gen_lowpart (inner_mode, ops[1]);
42495 subtarget = gen_reg_rtx (mode);
42497 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
42498 if (subtarget != target)
42499 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
42500 return;
42502 gcc_unreachable ();
42505 for (i = 0; i < n_elts; ++i)
42507 x = XVECEXP (vals, 0, i);
42508 if (!(CONST_SCALAR_INT_P (x)
42509 || CONST_DOUBLE_P (x)
42510 || CONST_FIXED_P (x)))
42511 n_var++, one_var = i;
42512 else if (x != CONST0_RTX (inner_mode))
42513 all_const_zero = false;
42514 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42515 all_same = false;
42518 /* Constants are best loaded from the constant pool. */
42519 if (n_var == 0)
42521 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42522 return;
42525 /* If all values are identical, broadcast the value. */
42526 if (all_same
42527 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42528 XVECEXP (vals, 0, 0)))
42529 return;
42531 /* Values where only one field is non-constant are best loaded from
42532 the pool and overwritten via move later. */
42533 if (n_var == 1)
42535 if (all_const_zero
42536 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42537 XVECEXP (vals, 0, one_var),
42538 one_var))
42539 return;
42541 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42542 return;
42545 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
42548 void
42549 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42551 machine_mode mode = GET_MODE (target);
42552 machine_mode inner_mode = GET_MODE_INNER (mode);
42553 machine_mode half_mode;
42554 bool use_vec_merge = false;
42555 rtx tmp;
42556 static rtx (*gen_extract[6][2]) (rtx, rtx)
42558 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42559 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42560 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42561 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42562 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42563 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42565 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42567 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42568 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42569 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42570 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42571 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42572 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42574 int i, j, n;
42575 machine_mode mmode = VOIDmode;
42576 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42578 switch (mode)
42580 case E_V2SFmode:
42581 case E_V2SImode:
42582 if (mmx_ok)
42584 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42585 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42586 if (elt == 0)
42587 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42588 else
42589 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42590 emit_insn (gen_rtx_SET (target, tmp));
42591 return;
42593 break;
42595 case E_V2DImode:
42596 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42597 if (use_vec_merge)
42598 break;
42600 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42601 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42602 if (elt == 0)
42603 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42604 else
42605 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42606 emit_insn (gen_rtx_SET (target, tmp));
42607 return;
42609 case E_V2DFmode:
42611 rtx op0, op1;
42613 /* For the two-element vectors, we implement a VEC_CONCAT with
42614 the extraction of the other element. */
42616 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42617 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42619 if (elt == 0)
42620 op0 = val, op1 = tmp;
42621 else
42622 op0 = tmp, op1 = val;
42624 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42625 emit_insn (gen_rtx_SET (target, tmp));
42627 return;
42629 case E_V4SFmode:
42630 use_vec_merge = TARGET_SSE4_1;
42631 if (use_vec_merge)
42632 break;
42634 switch (elt)
42636 case 0:
42637 use_vec_merge = true;
42638 break;
42640 case 1:
42641 /* tmp = target = A B C D */
42642 tmp = copy_to_reg (target);
42643 /* target = A A B B */
42644 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42645 /* target = X A B B */
42646 ix86_expand_vector_set (false, target, val, 0);
42647 /* target = A X C D */
42648 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42649 const1_rtx, const0_rtx,
42650 GEN_INT (2+4), GEN_INT (3+4)));
42651 return;
42653 case 2:
42654 /* tmp = target = A B C D */
42655 tmp = copy_to_reg (target);
42656 /* tmp = X B C D */
42657 ix86_expand_vector_set (false, tmp, val, 0);
42658 /* target = A B X D */
42659 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42660 const0_rtx, const1_rtx,
42661 GEN_INT (0+4), GEN_INT (3+4)));
42662 return;
42664 case 3:
42665 /* tmp = target = A B C D */
42666 tmp = copy_to_reg (target);
42667 /* tmp = X B C D */
42668 ix86_expand_vector_set (false, tmp, val, 0);
42669 /* target = A B X D */
42670 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42671 const0_rtx, const1_rtx,
42672 GEN_INT (2+4), GEN_INT (0+4)));
42673 return;
42675 default:
42676 gcc_unreachable ();
42678 break;
42680 case E_V4SImode:
42681 use_vec_merge = TARGET_SSE4_1;
42682 if (use_vec_merge)
42683 break;
42685 /* Element 0 handled by vec_merge below. */
42686 if (elt == 0)
42688 use_vec_merge = true;
42689 break;
42692 if (TARGET_SSE2)
42694 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42695 store into element 0, then shuffle them back. */
42697 rtx order[4];
42699 order[0] = GEN_INT (elt);
42700 order[1] = const1_rtx;
42701 order[2] = const2_rtx;
42702 order[3] = GEN_INT (3);
42703 order[elt] = const0_rtx;
42705 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42706 order[1], order[2], order[3]));
42708 ix86_expand_vector_set (false, target, val, 0);
42710 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42711 order[1], order[2], order[3]));
42713 else
42715 /* For SSE1, we have to reuse the V4SF code. */
42716 rtx t = gen_reg_rtx (V4SFmode);
42717 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42718 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42719 emit_move_insn (target, gen_lowpart (mode, t));
42721 return;
42723 case E_V8HImode:
42724 use_vec_merge = TARGET_SSE2;
42725 break;
42726 case E_V4HImode:
42727 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42728 break;
42730 case E_V16QImode:
42731 use_vec_merge = TARGET_SSE4_1;
42732 break;
42734 case E_V8QImode:
42735 break;
42737 case E_V32QImode:
42738 half_mode = V16QImode;
42739 j = 0;
42740 n = 16;
42741 goto half;
42743 case E_V16HImode:
42744 half_mode = V8HImode;
42745 j = 1;
42746 n = 8;
42747 goto half;
42749 case E_V8SImode:
42750 half_mode = V4SImode;
42751 j = 2;
42752 n = 4;
42753 goto half;
42755 case E_V4DImode:
42756 half_mode = V2DImode;
42757 j = 3;
42758 n = 2;
42759 goto half;
42761 case E_V8SFmode:
42762 half_mode = V4SFmode;
42763 j = 4;
42764 n = 4;
42765 goto half;
42767 case E_V4DFmode:
42768 half_mode = V2DFmode;
42769 j = 5;
42770 n = 2;
42771 goto half;
42773 half:
42774 /* Compute offset. */
42775 i = elt / n;
42776 elt %= n;
42778 gcc_assert (i <= 1);
42780 /* Extract the half. */
42781 tmp = gen_reg_rtx (half_mode);
42782 emit_insn (gen_extract[j][i] (tmp, target));
42784 /* Put val in tmp at elt. */
42785 ix86_expand_vector_set (false, tmp, val, elt);
42787 /* Put it back. */
42788 emit_insn (gen_insert[j][i] (target, target, tmp));
42789 return;
42791 case E_V8DFmode:
42792 if (TARGET_AVX512F)
42794 mmode = QImode;
42795 gen_blendm = gen_avx512f_blendmv8df;
42797 break;
42799 case E_V8DImode:
42800 if (TARGET_AVX512F)
42802 mmode = QImode;
42803 gen_blendm = gen_avx512f_blendmv8di;
42805 break;
42807 case E_V16SFmode:
42808 if (TARGET_AVX512F)
42810 mmode = HImode;
42811 gen_blendm = gen_avx512f_blendmv16sf;
42813 break;
42815 case E_V16SImode:
42816 if (TARGET_AVX512F)
42818 mmode = HImode;
42819 gen_blendm = gen_avx512f_blendmv16si;
42821 break;
42823 case E_V32HImode:
42824 if (TARGET_AVX512F && TARGET_AVX512BW)
42826 mmode = SImode;
42827 gen_blendm = gen_avx512bw_blendmv32hi;
42829 break;
42831 case E_V64QImode:
42832 if (TARGET_AVX512F && TARGET_AVX512BW)
42834 mmode = DImode;
42835 gen_blendm = gen_avx512bw_blendmv64qi;
42837 break;
42839 default:
42840 break;
42843 if (mmode != VOIDmode)
42845 tmp = gen_reg_rtx (mode);
42846 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42847 /* The avx512*_blendm<mode> expanders have different operand order
42848 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42849 elements where the mask is set and the second input operand otherwise;
42850 in {sse,avx}*_*blend* the first input operand is used for elements
42851 where the mask is clear and the second input operand otherwise. */
42852 emit_insn (gen_blendm (target, target, tmp,
42853 force_reg (mmode,
42854 gen_int_mode (1 << elt, mmode))));
42856 else if (use_vec_merge)
42858 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42859 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42860 emit_insn (gen_rtx_SET (target, tmp));
42862 else
42864 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42866 emit_move_insn (mem, target);
42868 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42869 emit_move_insn (tmp, val);
42871 emit_move_insn (target, mem);
42875 void
42876 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42878 machine_mode mode = GET_MODE (vec);
42879 machine_mode inner_mode = GET_MODE_INNER (mode);
42880 bool use_vec_extr = false;
42881 rtx tmp;
42883 switch (mode)
42885 case E_V2SImode:
42886 case E_V2SFmode:
42887 if (!mmx_ok)
42888 break;
42889 /* FALLTHRU */
42891 case E_V2DFmode:
42892 case E_V2DImode:
42893 case E_V2TImode:
42894 case E_V4TImode:
42895 use_vec_extr = true;
42896 break;
42898 case E_V4SFmode:
42899 use_vec_extr = TARGET_SSE4_1;
42900 if (use_vec_extr)
42901 break;
42903 switch (elt)
42905 case 0:
42906 tmp = vec;
42907 break;
42909 case 1:
42910 case 3:
42911 tmp = gen_reg_rtx (mode);
42912 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42913 GEN_INT (elt), GEN_INT (elt),
42914 GEN_INT (elt+4), GEN_INT (elt+4)));
42915 break;
42917 case 2:
42918 tmp = gen_reg_rtx (mode);
42919 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42920 break;
42922 default:
42923 gcc_unreachable ();
42925 vec = tmp;
42926 use_vec_extr = true;
42927 elt = 0;
42928 break;
42930 case E_V4SImode:
42931 use_vec_extr = TARGET_SSE4_1;
42932 if (use_vec_extr)
42933 break;
42935 if (TARGET_SSE2)
42937 switch (elt)
42939 case 0:
42940 tmp = vec;
42941 break;
42943 case 1:
42944 case 3:
42945 tmp = gen_reg_rtx (mode);
42946 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
42947 GEN_INT (elt), GEN_INT (elt),
42948 GEN_INT (elt), GEN_INT (elt)));
42949 break;
42951 case 2:
42952 tmp = gen_reg_rtx (mode);
42953 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
42954 break;
42956 default:
42957 gcc_unreachable ();
42959 vec = tmp;
42960 use_vec_extr = true;
42961 elt = 0;
42963 else
42965 /* For SSE1, we have to reuse the V4SF code. */
42966 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
42967 gen_lowpart (V4SFmode, vec), elt);
42968 return;
42970 break;
42972 case E_V8HImode:
42973 use_vec_extr = TARGET_SSE2;
42974 break;
42975 case E_V4HImode:
42976 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42977 break;
42979 case E_V16QImode:
42980 use_vec_extr = TARGET_SSE4_1;
42981 break;
42983 case E_V8SFmode:
42984 if (TARGET_AVX)
42986 tmp = gen_reg_rtx (V4SFmode);
42987 if (elt < 4)
42988 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
42989 else
42990 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
42991 ix86_expand_vector_extract (false, target, tmp, elt & 3);
42992 return;
42994 break;
42996 case E_V4DFmode:
42997 if (TARGET_AVX)
42999 tmp = gen_reg_rtx (V2DFmode);
43000 if (elt < 2)
43001 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43002 else
43003 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43004 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43005 return;
43007 break;
43009 case E_V32QImode:
43010 if (TARGET_AVX)
43012 tmp = gen_reg_rtx (V16QImode);
43013 if (elt < 16)
43014 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43015 else
43016 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43017 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43018 return;
43020 break;
43022 case E_V16HImode:
43023 if (TARGET_AVX)
43025 tmp = gen_reg_rtx (V8HImode);
43026 if (elt < 8)
43027 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43028 else
43029 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43030 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43031 return;
43033 break;
43035 case E_V8SImode:
43036 if (TARGET_AVX)
43038 tmp = gen_reg_rtx (V4SImode);
43039 if (elt < 4)
43040 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43041 else
43042 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43043 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43044 return;
43046 break;
43048 case E_V4DImode:
43049 if (TARGET_AVX)
43051 tmp = gen_reg_rtx (V2DImode);
43052 if (elt < 2)
43053 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43054 else
43055 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43056 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43057 return;
43059 break;
43061 case E_V32HImode:
43062 if (TARGET_AVX512BW)
43064 tmp = gen_reg_rtx (V16HImode);
43065 if (elt < 16)
43066 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43067 else
43068 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43069 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43070 return;
43072 break;
43074 case E_V64QImode:
43075 if (TARGET_AVX512BW)
43077 tmp = gen_reg_rtx (V32QImode);
43078 if (elt < 32)
43079 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43080 else
43081 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43082 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43083 return;
43085 break;
43087 case E_V16SFmode:
43088 tmp = gen_reg_rtx (V8SFmode);
43089 if (elt < 8)
43090 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43091 else
43092 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43093 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43094 return;
43096 case E_V8DFmode:
43097 tmp = gen_reg_rtx (V4DFmode);
43098 if (elt < 4)
43099 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43100 else
43101 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43102 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43103 return;
43105 case E_V16SImode:
43106 tmp = gen_reg_rtx (V8SImode);
43107 if (elt < 8)
43108 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43109 else
43110 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43111 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43112 return;
43114 case E_V8DImode:
43115 tmp = gen_reg_rtx (V4DImode);
43116 if (elt < 4)
43117 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43118 else
43119 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43120 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43121 return;
43123 case E_V8QImode:
43124 /* ??? Could extract the appropriate HImode element and shift. */
43125 default:
43126 break;
43129 if (use_vec_extr)
43131 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43132 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43134 /* Let the rtl optimizers know about the zero extension performed. */
43135 if (inner_mode == QImode || inner_mode == HImode)
43137 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43138 target = gen_lowpart (SImode, target);
43141 emit_insn (gen_rtx_SET (target, tmp));
43143 else
43145 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43147 emit_move_insn (mem, vec);
43149 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43150 emit_move_insn (target, tmp);
43154 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43155 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43156 The upper bits of DEST are undefined, though they shouldn't cause
43157 exceptions (some bits from src or all zeros are ok). */
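/* Concrete example: for a V4SFmode SRC, I == 128 moves elements 2 and 3
   of SRC into elements 0 and 1 of DEST, while I == 64 moves element 1
   into element 0; the other elements of DEST are don't-cares.  */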
43159 static void
43160 emit_reduc_half (rtx dest, rtx src, int i)
43162 rtx tem, d = dest;
43163 switch (GET_MODE (src))
43165 case E_V4SFmode:
43166 if (i == 128)
43167 tem = gen_sse_movhlps (dest, src, src);
43168 else
43169 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43170 GEN_INT (1 + 4), GEN_INT (1 + 4));
43171 break;
43172 case E_V2DFmode:
43173 tem = gen_vec_interleave_highv2df (dest, src, src);
43174 break;
43175 case E_V16QImode:
43176 case E_V8HImode:
43177 case E_V4SImode:
43178 case E_V2DImode:
43179 d = gen_reg_rtx (V1TImode);
43180 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43181 GEN_INT (i / 2));
43182 break;
43183 case E_V8SFmode:
43184 if (i == 256)
43185 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43186 else
43187 tem = gen_avx_shufps256 (dest, src, src,
43188 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43189 break;
43190 case E_V4DFmode:
43191 if (i == 256)
43192 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43193 else
43194 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43195 break;
43196 case E_V32QImode:
43197 case E_V16HImode:
43198 case E_V8SImode:
43199 case E_V4DImode:
43200 if (i == 256)
43202 if (GET_MODE (dest) != V4DImode)
43203 d = gen_reg_rtx (V4DImode);
43204 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43205 gen_lowpart (V4DImode, src),
43206 const1_rtx);
43208 else
43210 d = gen_reg_rtx (V2TImode);
43211 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43212 GEN_INT (i / 2));
43214 break;
43215 case E_V64QImode:
43216 case E_V32HImode:
43217 case E_V16SImode:
43218 case E_V16SFmode:
43219 case E_V8DImode:
43220 case E_V8DFmode:
43221 if (i > 128)
43222 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43223 gen_lowpart (V16SImode, src),
43224 gen_lowpart (V16SImode, src),
43225 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43226 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43227 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43228 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43229 GEN_INT (0xC), GEN_INT (0xD),
43230 GEN_INT (0xE), GEN_INT (0xF),
43231 GEN_INT (0x10), GEN_INT (0x11),
43232 GEN_INT (0x12), GEN_INT (0x13),
43233 GEN_INT (0x14), GEN_INT (0x15),
43234 GEN_INT (0x16), GEN_INT (0x17));
43235 else
43236 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43237 gen_lowpart (V16SImode, src),
43238 GEN_INT (i == 128 ? 0x2 : 0x1),
43239 GEN_INT (0x3),
43240 GEN_INT (0x3),
43241 GEN_INT (0x3),
43242 GEN_INT (i == 128 ? 0x6 : 0x5),
43243 GEN_INT (0x7),
43244 GEN_INT (0x7),
43245 GEN_INT (0x7),
43246 GEN_INT (i == 128 ? 0xA : 0x9),
43247 GEN_INT (0xB),
43248 GEN_INT (0xB),
43249 GEN_INT (0xB),
43250 GEN_INT (i == 128 ? 0xE : 0xD),
43251 GEN_INT (0xF),
43252 GEN_INT (0xF),
43253 GEN_INT (0xF));
43254 break;
43255 default:
43256 gcc_unreachable ();
43258 emit_insn (tem);
43259 if (d != dest)
43260 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43263 /* Expand a vector reduction. FN is the binary pattern to reduce;
43264 DEST is the destination; IN is the input vector. */
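/* For instance, a V8HImode maximum reduction runs the loop below three
   times (half sizes of 128, 64 and 32 bits), each time combining the
   current vector with a copy whose upper half has been shifted down;
   the reduction result accumulates in element 0 of DEST.  */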
43266 void
43267 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43269 rtx half, dst, vec = in;
43270 machine_mode mode = GET_MODE (in);
43271 int i;
43273 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
43274 if (TARGET_SSE4_1
43275 && mode == V8HImode
43276 && fn == gen_uminv8hi3)
43278 emit_insn (gen_sse4_1_phminposuw (dest, in));
43279 return;
43282 for (i = GET_MODE_BITSIZE (mode);
43283 i > GET_MODE_UNIT_BITSIZE (mode);
43284 i >>= 1)
43286 half = gen_reg_rtx (mode);
43287 emit_reduc_half (half, vec, i);
43288 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43289 dst = dest;
43290 else
43291 dst = gen_reg_rtx (mode);
43292 emit_insn (fn (dst, half, vec));
43293 vec = dst;
43297 /* Target hook for scalar_mode_supported_p. */
43298 static bool
43299 ix86_scalar_mode_supported_p (scalar_mode mode)
43301 if (DECIMAL_FLOAT_MODE_P (mode))
43302 return default_decimal_float_supported_p ();
43303 else if (mode == TFmode)
43304 return true;
43305 else
43306 return default_scalar_mode_supported_p (mode);
43309 /* Implements target hook vector_mode_supported_p. */
43310 static bool
43311 ix86_vector_mode_supported_p (machine_mode mode)
43313 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43314 return true;
43315 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43316 return true;
43317 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43318 return true;
43319 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43320 return true;
43321 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43322 return true;
43323 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43324 return true;
43325 return false;
43328 /* Target hook for c_mode_for_suffix. */
43329 static machine_mode
43330 ix86_c_mode_for_suffix (char suffix)
43332 if (suffix == 'q')
43333 return TFmode;
43334 if (suffix == 'w')
43335 return XFmode;
43337 return VOIDmode;
43340 /* Worker function for TARGET_MD_ASM_ADJUST.
43342 We implement asm flag outputs, and maintain source compatibility
43343 with the old cc0-based compiler. */
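/* Illustrative user-level use of the flag outputs handled here (the
   variable names are made up):

     int carry;
     asm ("addl %2, %1" : "=@ccc" (carry), "+r" (sum) : "r" (addend));

   "=@ccc" requests the carry flag; the code below rewrites that output
   into a read of the flags register (CCCmode, compared EQ against zero)
   and zero-extends the resulting QImode value into CARRY.  */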
43345 static rtx_insn *
43346 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43347 vec<const char *> &constraints,
43348 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43350 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43351 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43353 bool saw_asm_flag = false;
43355 start_sequence ();
43356 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43358 const char *con = constraints[i];
43359 if (strncmp (con, "=@cc", 4) != 0)
43360 continue;
43361 con += 4;
43362 if (strchr (con, ',') != NULL)
43364 error ("alternatives not allowed in asm flag output");
43365 continue;
43368 bool invert = false;
43369 if (con[0] == 'n')
43370 invert = true, con++;
43372 machine_mode mode = CCmode;
43373 rtx_code code = UNKNOWN;
43375 switch (con[0])
43377 case 'a':
43378 if (con[1] == 0)
43379 mode = CCAmode, code = EQ;
43380 else if (con[1] == 'e' && con[2] == 0)
43381 mode = CCCmode, code = NE;
43382 break;
43383 case 'b':
43384 if (con[1] == 0)
43385 mode = CCCmode, code = EQ;
43386 else if (con[1] == 'e' && con[2] == 0)
43387 mode = CCAmode, code = NE;
43388 break;
43389 case 'c':
43390 if (con[1] == 0)
43391 mode = CCCmode, code = EQ;
43392 break;
43393 case 'e':
43394 if (con[1] == 0)
43395 mode = CCZmode, code = EQ;
43396 break;
43397 case 'g':
43398 if (con[1] == 0)
43399 mode = CCGCmode, code = GT;
43400 else if (con[1] == 'e' && con[2] == 0)
43401 mode = CCGCmode, code = GE;
43402 break;
43403 case 'l':
43404 if (con[1] == 0)
43405 mode = CCGCmode, code = LT;
43406 else if (con[1] == 'e' && con[2] == 0)
43407 mode = CCGCmode, code = LE;
43408 break;
43409 case 'o':
43410 if (con[1] == 0)
43411 mode = CCOmode, code = EQ;
43412 break;
43413 case 'p':
43414 if (con[1] == 0)
43415 mode = CCPmode, code = EQ;
43416 break;
43417 case 's':
43418 if (con[1] == 0)
43419 mode = CCSmode, code = EQ;
43420 break;
43421 case 'z':
43422 if (con[1] == 0)
43423 mode = CCZmode, code = EQ;
43424 break;
43426 if (code == UNKNOWN)
43428 error ("unknown asm flag output %qs", constraints[i]);
43429 continue;
43431 if (invert)
43432 code = reverse_condition (code);
43434 rtx dest = outputs[i];
43435 if (!saw_asm_flag)
43437 /* This is the first asm flag output. Here we put the flags
43438 register in as the real output and adjust the condition to
43439 allow it. */
43440 constraints[i] = "=Bf";
43441 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43442 saw_asm_flag = true;
43444 else
43446 /* We don't need the flags register as output twice. */
43447 constraints[i] = "=X";
43448 outputs[i] = gen_rtx_SCRATCH (SImode);
43451 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43452 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43454 machine_mode dest_mode = GET_MODE (dest);
43455 if (!SCALAR_INT_MODE_P (dest_mode))
43457 error ("invalid type for asm flag output");
43458 continue;
43461 if (dest_mode == DImode && !TARGET_64BIT)
43462 dest_mode = SImode;
43464 if (dest_mode != QImode)
43466 rtx destqi = gen_reg_rtx (QImode);
43467 emit_insn (gen_rtx_SET (destqi, x));
43469 if (TARGET_ZERO_EXTEND_WITH_AND
43470 && optimize_function_for_speed_p (cfun))
43472 x = force_reg (dest_mode, const0_rtx);
43474 emit_insn (gen_movstrictqi
43475 (gen_lowpart (QImode, x), destqi));
43477 else
43478 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43481 if (dest_mode != GET_MODE (dest))
43483 rtx tmp = gen_reg_rtx (SImode);
43485 emit_insn (gen_rtx_SET (tmp, x));
43486 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43488 else
43489 emit_insn (gen_rtx_SET (dest, x));
43491 rtx_insn *seq = get_insns ();
43492 end_sequence ();
43494 if (saw_asm_flag)
43495 return seq;
43496 else
43498 /* If we had no asm flag outputs, clobber the flags. */
43499 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43500 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43501 return NULL;
43505 /* Implements target vector targetm.asm.encode_section_info. */
43507 static void ATTRIBUTE_UNUSED
43508 ix86_encode_section_info (tree decl, rtx rtl, int first)
43510 default_encode_section_info (decl, rtl, first);
43512 if (ix86_in_large_data_p (decl))
43513 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43516 /* Worker function for REVERSE_CONDITION. */
43518 enum rtx_code
43519 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43521 return (mode == CCFPmode
43522 ? reverse_condition_maybe_unordered (code)
43523 : reverse_condition (code));
43526 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43527 to OPERANDS[0]. */
43529 const char *
43530 output_387_reg_move (rtx_insn *insn, rtx *operands)
43532 if (REG_P (operands[0]))
43534 if (REG_P (operands[1])
43535 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43537 if (REGNO (operands[0]) == FIRST_STACK_REG)
43538 return output_387_ffreep (operands, 0);
43539 return "fstp\t%y0";
43541 if (STACK_TOP_P (operands[0]))
43542 return "fld%Z1\t%y1";
43543 return "fst\t%y0";
43545 else if (MEM_P (operands[0]))
43547 gcc_assert (REG_P (operands[1]));
43548 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43549 return "fstp%Z0\t%y0";
43550 else
43552 /* There is no non-popping store to memory for XFmode.
43553 So if we need one, follow the store with a load. */
43554 if (GET_MODE (operands[0]) == XFmode)
43555 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43556 else
43557 return "fst%Z0\t%y0";
43560 else
43561 gcc_unreachable();
43564 /* Output code to perform a conditional jump to LABEL, if C2 flag in
43565 FP status register is set. */
43567 void
43568 ix86_emit_fp_unordered_jump (rtx label)
43570 rtx reg = gen_reg_rtx (HImode);
43571 rtx temp;
43573 emit_insn (gen_x86_fnstsw_1 (reg));
43575 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43577 emit_insn (gen_x86_sahf_1 (reg));
43579 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43580 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43582 else
43584 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
43586 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43587 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43590 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43591 gen_rtx_LABEL_REF (VOIDmode, label),
43592 pc_rtx);
43593 temp = gen_rtx_SET (pc_rtx, temp);
43595 emit_jump_insn (temp);
43596 predict_jump (REG_BR_PROB_BASE * 10 / 100);
43599 /* Output code to perform a log1p XFmode calculation. */
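/* The threshold 0.29289... used below is 1 - sqrt(2)/2: fyl2xp1 is only
   specified for |x| below that bound, so small arguments are computed as
   fldln2 * fyl2xp1 (x) and larger ones fall back to fldln2 * fyl2x (1 + x).  */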
43601 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43603 rtx_code_label *label1 = gen_label_rtx ();
43604 rtx_code_label *label2 = gen_label_rtx ();
43606 rtx tmp = gen_reg_rtx (XFmode);
43607 rtx tmp2 = gen_reg_rtx (XFmode);
43608 rtx test;
43610 emit_insn (gen_absxf2 (tmp, op1));
43611 test = gen_rtx_GE (VOIDmode, tmp,
43612 const_double_from_real_value (
43613 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43614 XFmode));
43615 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43617 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43618 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43619 emit_jump (label2);
43621 emit_label (label1);
43622 emit_move_insn (tmp, CONST1_RTX (XFmode));
43623 emit_insn (gen_addxf3 (tmp, op1, tmp));
43624 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43625 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43627 emit_label (label2);
43630 /* Emit code for round calculation. */
43631 void ix86_emit_i387_round (rtx op0, rtx op1)
43633 machine_mode inmode = GET_MODE (op1);
43634 machine_mode outmode = GET_MODE (op0);
43635 rtx e1, e2, res, tmp, tmp1, half;
43636 rtx scratch = gen_reg_rtx (HImode);
43637 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43638 rtx_code_label *jump_label = gen_label_rtx ();
43639 rtx insn;
43640 rtx (*gen_abs) (rtx, rtx);
43641 rtx (*gen_neg) (rtx, rtx);
43643 switch (inmode)
43645 case E_SFmode:
43646 gen_abs = gen_abssf2;
43647 break;
43648 case E_DFmode:
43649 gen_abs = gen_absdf2;
43650 break;
43651 case E_XFmode:
43652 gen_abs = gen_absxf2;
43653 break;
43654 default:
43655 gcc_unreachable ();
43658 switch (outmode)
43660 case E_SFmode:
43661 gen_neg = gen_negsf2;
43662 break;
43663 case E_DFmode:
43664 gen_neg = gen_negdf2;
43665 break;
43666 case E_XFmode:
43667 gen_neg = gen_negxf2;
43668 break;
43669 case E_HImode:
43670 gen_neg = gen_neghi2;
43671 break;
43672 case E_SImode:
43673 gen_neg = gen_negsi2;
43674 break;
43675 case E_DImode:
43676 gen_neg = gen_negdi2;
43677 break;
43678 default:
43679 gcc_unreachable ();
43682 e1 = gen_reg_rtx (inmode);
43683 e2 = gen_reg_rtx (inmode);
43684 res = gen_reg_rtx (outmode);
43686 half = const_double_from_real_value (dconsthalf, inmode);
43688 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
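/* Worked example: a = -2.5 gives fabs = 2.5, plus 0.5 is 3.0, floor keeps
   3.0, and restoring the sign yields -3.0, i.e. halfway cases are rounded
   away from zero as round () requires.  */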
43690 /* scratch = fxam(op1) */
43691 emit_insn (gen_rtx_SET (scratch,
43692 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43693 UNSPEC_FXAM)));
43694 /* e1 = fabs(op1) */
43695 emit_insn (gen_abs (e1, op1));
43697 /* e2 = e1 + 0.5 */
43698 half = force_reg (inmode, half);
43699 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43701 /* res = floor(e2) */
43702 if (inmode != XFmode)
43704 tmp1 = gen_reg_rtx (XFmode);
43706 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43708 else
43709 tmp1 = e2;
43711 switch (outmode)
43713 case E_SFmode:
43714 case E_DFmode:
43716 rtx tmp0 = gen_reg_rtx (XFmode);
43718 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43720 emit_insn (gen_rtx_SET (res,
43721 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43722 UNSPEC_TRUNC_NOOP)));
43724 break;
43725 case E_XFmode:
43726 emit_insn (gen_frndintxf2_floor (res, tmp1));
43727 break;
43728 case E_HImode:
43729 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43730 break;
43731 case E_SImode:
43732 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43733 break;
43734 case E_DImode:
43735 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43736 break;
43737 default:
43738 gcc_unreachable ();
43741 /* flags = signbit(a) */
43742 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
43744 /* if (flags) then res = -res */
43745 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43746 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43747 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43748 pc_rtx);
43749 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43750 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43751 JUMP_LABEL (insn) = jump_label;
43753 emit_insn (gen_neg (res, res));
43755 emit_label (jump_label);
43756 LABEL_NUSES (jump_label) = 1;
43758 emit_move_insn (op0, res);
43761 /* Output code to perform a Newton-Raphson approximation of a single precision
43762 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43764 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43766 rtx x0, x1, e0, e1;
43768 x0 = gen_reg_rtx (mode);
43769 e0 = gen_reg_rtx (mode);
43770 e1 = gen_reg_rtx (mode);
43771 x1 = gen_reg_rtx (mode);
43773 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
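/* This is one Newton-Raphson step for f (x) = 1/x - b:
   x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0),
   which is exactly the e1 - e0 computed below.  */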
43775 b = force_reg (mode, b);
43777 /* x0 = rcp(b) estimate */
43778 if (mode == V16SFmode || mode == V8DFmode)
43780 if (TARGET_AVX512ER)
43782 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43783 UNSPEC_RCP28)));
43784 /* res = a * x0 */
43785 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43786 return;
43788 else
43789 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43790 UNSPEC_RCP14)));
43792 else
43793 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43794 UNSPEC_RCP)));
43796 /* e0 = x0 * b */
43797 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43799 /* e0 = x0 * e0 */
43800 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43802 /* e1 = x0 + x0 */
43803 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43805 /* x1 = e1 - e0 */
43806 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43808 /* res = a * x1 */
43809 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
43812 /* Output code to perform a Newton-Raphson approximation of a
43813 single precision floating point [reciprocal] square root. */
43815 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43817 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43818 REAL_VALUE_TYPE r;
43819 int unspec;
43821 x0 = gen_reg_rtx (mode);
43822 e0 = gen_reg_rtx (mode);
43823 e1 = gen_reg_rtx (mode);
43824 e2 = gen_reg_rtx (mode);
43825 e3 = gen_reg_rtx (mode);
43827 if (TARGET_AVX512ER && mode == V16SFmode)
43829 if (recip)
43830 /* res = rsqrt28(a) estimate */
43831 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43832 UNSPEC_RSQRT28)));
43833 else
43835 /* x0 = rsqrt28(a) estimate */
43836 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43837 UNSPEC_RSQRT28)));
43838 /* res = rcp28(x0) estimate */
43839 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43840 UNSPEC_RCP28)));
43842 return;
43845 real_from_integer (&r, VOIDmode, -3, SIGNED);
43846 mthree = const_double_from_real_value (r, SFmode);
43848 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43849 mhalf = const_double_from_real_value (r, SFmode);
43850 unspec = UNSPEC_RSQRT;
43852 if (VECTOR_MODE_P (mode))
43854 mthree = ix86_build_const_vector (mode, true, mthree);
43855 mhalf = ix86_build_const_vector (mode, true, mhalf);
43856 /* There is no 512-bit rsqrt; there is, however, rsqrt14. */
43857 if (GET_MODE_SIZE (mode) == 64)
43858 unspec = UNSPEC_RSQRT14;
43861 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43862 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
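/* Both forms are one Newton-Raphson step for f (x) = 1/x^2 - a:
   x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3),
   refining x0 = rsqrtss (a); multiplying the refined reciprocal square
   root by a yields the square root itself.  */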
43864 a = force_reg (mode, a);
43866 /* x0 = rsqrt(a) estimate */
43867 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43868 unspec)));
43870 /* If a == 0.0, filter out the rsqrt infinity to prevent NaN for sqrt (0.0). */
43871 if (!recip)
43873 rtx zero = force_reg (mode, CONST0_RTX(mode));
43874 rtx mask;
43876 /* Handle masked compare. */
43877 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43879 mask = gen_reg_rtx (HImode);
43880 /* Imm value 0x4 corresponds to not-equal comparison. */
43881 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43882 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43884 else
43886 mask = gen_reg_rtx (mode);
43887 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43888 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43892 /* e0 = x0 * a */
43893 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43894 /* e1 = e0 * x0 */
43895 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43897 /* e2 = e1 - 3. */
43898 mthree = force_reg (mode, mthree);
43899 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43901 mhalf = force_reg (mode, mhalf);
43902 if (recip)
43903 /* e3 = -.5 * x0 */
43904 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43905 else
43906 /* e3 = -.5 * e0 */
43907 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43908 /* ret = e2 * e3 */
43909 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
43912 #ifdef TARGET_SOLARIS
43913 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43915 static void
43916 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43917 tree decl)
43919 /* With Binutils 2.15, the "@unwind" marker must be specified on
43920 every occurrence of the ".eh_frame" section, not just the first
43921 one. */
43922 if (TARGET_64BIT
43923 && strcmp (name, ".eh_frame") == 0)
43925 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
43926 flags & SECTION_WRITE ? "aw" : "a");
43927 return;
43930 #ifndef USE_GAS
43931 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
43933 solaris_elf_asm_comdat_section (name, flags, decl);
43934 return;
43936 #endif
43938 default_elf_asm_named_section (name, flags, decl);
43940 #endif /* TARGET_SOLARIS */
43942 /* Return the mangling of TYPE if it is an extended fundamental type. */
43944 static const char *
43945 ix86_mangle_type (const_tree type)
43947 type = TYPE_MAIN_VARIANT (type);
43949 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
43950 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
43951 return NULL;
43953 switch (TYPE_MODE (type))
43955 case E_TFmode:
43956 /* __float128 is "g". */
43957 return "g";
43958 case E_XFmode:
43959 /* "long double" or __float80 is "e". */
43960 return "e";
43961 default:
43962 return NULL;
43966 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
43968 static tree
43969 ix86_stack_protect_guard (void)
43971 if (TARGET_SSP_TLS_GUARD)
43973 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
43974 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
43975 tree type = build_qualified_type (type_node, qual);
43976 tree t;
43978 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
43980 t = ix86_tls_stack_chk_guard_decl;
43982 if (t == NULL)
43984 rtx x;
43986 t = build_decl
43987 (UNKNOWN_LOCATION, VAR_DECL,
43988 get_identifier (ix86_stack_protector_guard_symbol_str),
43989 type);
43990 TREE_STATIC (t) = 1;
43991 TREE_PUBLIC (t) = 1;
43992 DECL_EXTERNAL (t) = 1;
43993 TREE_USED (t) = 1;
43994 TREE_THIS_VOLATILE (t) = 1;
43995 DECL_ARTIFICIAL (t) = 1;
43996 DECL_IGNORED_P (t) = 1;
43998 /* Do not share RTL as the declaration is visible outside of
43999 the current function. */
44000 x = DECL_RTL (t);
44001 RTX_FLAG (x, used) = 1;
44003 ix86_tls_stack_chk_guard_decl = t;
44006 else
44008 tree asptrtype = build_pointer_type (type);
44010 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
44011 t = build2 (MEM_REF, asptrtype, t,
44012 build_int_cst (asptrtype, 0));
44015 return t;
44018 return default_stack_protect_guard ();
44021 /* For 32-bit code we can save PIC register setup by using
44022 __stack_chk_fail_local hidden function instead of calling
44023 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
44024 register, so it is better to call __stack_chk_fail directly. */
44026 static tree ATTRIBUTE_UNUSED
44027 ix86_stack_protect_fail (void)
44029 return TARGET_64BIT
44030 ? default_external_stack_protect_fail ()
44031 : default_hidden_stack_protect_fail ();
44034 /* Select a format to encode pointers in exception handling data. CODE
44035 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44036 true if the symbol may be affected by dynamic relocations.
44038 ??? All x86 object file formats are capable of representing this.
44039 After all, the relocation needed is the same as for the call insn.
44040 Whether or not a particular assembler allows us to enter such, I
44041 guess we'll have to see. */
44043 asm_preferred_eh_data_format (int code, int global)
44045 if (flag_pic)
44047 int type = DW_EH_PE_sdata8;
44048 if (!TARGET_64BIT
44049 || ix86_cmodel == CM_SMALL_PIC
44050 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44051 type = DW_EH_PE_sdata4;
44052 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44054 if (ix86_cmodel == CM_SMALL
44055 || (ix86_cmodel == CM_MEDIUM && code))
44056 return DW_EH_PE_udata4;
44057 return DW_EH_PE_absptr;
44060 /* Expand copysign from SIGN to the positive value ABS_VALUE
44061 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44062 the sign-bit. */
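/* In effect this computes RESULT = ABS_VALUE | (SIGN & SIGN_BIT), where
   the sign-bit-only mask is either built here or derived by inverting
   the caller-supplied MASK.  */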
44063 static void
44064 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44066 machine_mode mode = GET_MODE (sign);
44067 rtx sgn = gen_reg_rtx (mode);
44068 if (mask == NULL_RTX)
44070 machine_mode vmode;
44072 if (mode == SFmode)
44073 vmode = V4SFmode;
44074 else if (mode == DFmode)
44075 vmode = V2DFmode;
44076 else
44077 vmode = mode;
44079 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44080 if (!VECTOR_MODE_P (mode))
44082 /* We need to generate a scalar mode mask in this case. */
44083 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44084 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44085 mask = gen_reg_rtx (mode);
44086 emit_insn (gen_rtx_SET (mask, tmp));
44089 else
44090 mask = gen_rtx_NOT (mode, mask);
44091 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44092 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44095 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44096 mask for masking out the sign-bit is stored in *SMASK, if that is
44097 non-null. */
44098 static rtx
44099 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44101 machine_mode vmode, mode = GET_MODE (op0);
44102 rtx xa, mask;
44104 xa = gen_reg_rtx (mode);
44105 if (mode == SFmode)
44106 vmode = V4SFmode;
44107 else if (mode == DFmode)
44108 vmode = V2DFmode;
44109 else
44110 vmode = mode;
44111 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44112 if (!VECTOR_MODE_P (mode))
44114 /* We need to generate a scalar mode mask in this case. */
44115 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44116 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44117 mask = gen_reg_rtx (mode);
44118 emit_insn (gen_rtx_SET (mask, tmp));
44120 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44122 if (smask)
44123 *smask = mask;
44125 return xa;
44128 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44129 swapping the operands if SWAP_OPERANDS is true. The expanded
44130 code is a forward jump to a newly created label in case the
44131 comparison is true. The generated label rtx is returned. */
44132 static rtx_code_label *
44133 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44134 bool swap_operands)
44136 bool unordered_compare = ix86_unordered_fp_compare (code);
44137 rtx_code_label *label;
44138 rtx tmp, reg;
44140 if (swap_operands)
44141 std::swap (op0, op1);
44143 label = gen_label_rtx ();
44144 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
44145 if (unordered_compare)
44146 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
44147 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
44148 emit_insn (gen_rtx_SET (reg, tmp));
44149 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
44150 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44151 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44152 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44153 JUMP_LABEL (tmp) = label;
44155 return label;
44158 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44159 using comparison code CODE. Operands are swapped for the comparison if
44160 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44161 static rtx
44162 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44163 bool swap_operands)
44165 rtx (*insn)(rtx, rtx, rtx, rtx);
44166 machine_mode mode = GET_MODE (op0);
44167 rtx mask = gen_reg_rtx (mode);
44169 if (swap_operands)
44170 std::swap (op0, op1);
44172 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44174 emit_insn (insn (mask, op0, op1,
44175 gen_rtx_fmt_ee (code, mode, op0, op1)));
44176 return mask;
44179 /* Generate and return an rtx of mode MODE holding 2**n, where n is the number
44180 of explicit mantissa bits of MODE: 52 for DFmode, 23 for SFmode (the only modes allowed). */
44181 static rtx
44182 ix86_gen_TWO52 (machine_mode mode)
44184 REAL_VALUE_TYPE TWO52r;
44185 rtx TWO52;
44187 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44188 TWO52 = const_double_from_real_value (TWO52r, mode);
44189 TWO52 = force_reg (mode, TWO52);
44191 return TWO52;
44194 /* Expand SSE sequence for computing lround from OP1 storing
44195 into OP0. */
44196 void
44197 ix86_expand_lround (rtx op0, rtx op1)
44199 /* C code for the stuff we're doing below:
44200 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44201 return (long)tmp;
44203 machine_mode mode = GET_MODE (op1);
44204 const struct real_format *fmt;
44205 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44206 rtx adj;
44208 /* load nextafter (0.5, 0.0) */
44209 fmt = REAL_MODE_FORMAT (mode);
44210 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44211 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
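/* Adding a plain 0.5 would be wrong for values just below 0.5: the
   largest double below 0.5 plus 0.5 rounds up to 1.0 under
   round-to-nearest, so lround would return 1 instead of 0.  Adding
   nextafter (0.5, 0.0) instead keeps such values below 1.0.  */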
44213 /* adj = copysign (0.5, op1) */
44214 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44215 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44217 /* adj = op1 + adj */
44218 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44220 /* op0 = (imode)adj */
44221 expand_fix (op0, adj, 0);
44224 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
44225 storing into OPERAND0. */
44226 void
44227 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44229 /* C code for the stuff we're doing below (for do_floor):
44230 xi = (long)op1;
44231 xi -= (double)xi > op1 ? 1 : 0;
44232 return xi;
44234 machine_mode fmode = GET_MODE (op1);
44235 machine_mode imode = GET_MODE (op0);
44236 rtx ireg, freg, tmp;
44237 rtx_code_label *label;
44239 /* reg = (long)op1 */
44240 ireg = gen_reg_rtx (imode);
44241 expand_fix (ireg, op1, 0);
44243 /* freg = (double)reg */
44244 freg = gen_reg_rtx (fmode);
44245 expand_float (freg, ireg, 0);
44247 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44248 label = ix86_expand_sse_compare_and_jump (UNLE,
44249 freg, op1, !do_floor);
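/* For the floor case the branch is taken when freg <= op1 (or the
   compare is unordered), i.e. when no adjustment is needed; the
   fall-through path subtracts 1.  For ceil the operands are swapped
   above, so the fall-through adds 1 when freg < op1.  */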
44250 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44251 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44252 emit_move_insn (ireg, tmp);
44254 emit_label (label);
44255 LABEL_NUSES (label) = 1;
44257 emit_move_insn (op0, ireg);
44260 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
44261 void
44262 ix86_expand_rint (rtx operand0, rtx operand1)
44264 /* C code for the stuff we're doing below:
44265 xa = fabs (operand1);
44266 if (!isless (xa, 2**52))
44267 return operand1;
44268 two52 = 2**52;
44269 if (flag_rounding_math)
44271 two52 = copysign (two52, operand1);
44272 xa = operand1;
44274 xa = xa + two52 - two52;
44275 return copysign (xa, operand1);
44277 machine_mode mode = GET_MODE (operand0);
44278 rtx res, xa, TWO52, two52, mask;
44279 rtx_code_label *label;
44281 res = gen_reg_rtx (mode);
44282 emit_move_insn (res, operand1);
44284 /* xa = abs (operand1) */
44285 xa = ix86_expand_sse_fabs (res, &mask);
44287 /* if (!isless (xa, TWO52)) goto label; */
44288 TWO52 = ix86_gen_TWO52 (mode);
44289 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44291 two52 = TWO52;
44292 if (flag_rounding_math)
44294 two52 = gen_reg_rtx (mode);
44295 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
44296 xa = res;
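/* With -frounding-math the dynamic rounding mode has to be honoured,
   so the 2**52 addend gets the sign of the input and the addition is
   done on the original signed value; doing it on the absolute value
   would round in the wrong direction for negative inputs under the
   directed rounding modes.  */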
44299 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
44300 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
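/* For inputs of magnitude below 2**52 (2**23 for SFmode), the addition
   takes the value to a magnitude of at least 2**52, where the format has
   no fraction bits, so the add itself rounds away the fractional part
   and the subtraction restores the now-integral value.  Larger inputs
   are already integral and were handled by the branch above.  */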
44302 ix86_sse_copysign_to_positive (res, xa, res, mask);
44304 emit_label (label);
44305 LABEL_NUSES (label) = 1;
44307 emit_move_insn (operand0, res);
44310 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44311 into OPERAND0. */
44312 void
44313 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44315 /* C code for the stuff we expand below.
44316 double xa = fabs (x), x2;
44317 if (!isless (xa, TWO52))
44318 return x;
44319 xa = xa + TWO52 - TWO52;
44320 x2 = copysign (xa, x);
44321 Compensate. Floor:
44322 if (x2 > x)
44323 x2 -= 1;
44324 Compensate. Ceil:
44325 if (x2 < x)
44326 x2 -= -1;
44327 return x2;
44329 machine_mode mode = GET_MODE (operand0);
44330 rtx xa, TWO52, tmp, one, res, mask;
44331 rtx_code_label *label;
44333 TWO52 = ix86_gen_TWO52 (mode);
44335 /* Temporary for holding the result, initialized to the input
44336 operand to ease control flow. */
44337 res = gen_reg_rtx (mode);
44338 emit_move_insn (res, operand1);
44340 /* xa = abs (operand1) */
44341 xa = ix86_expand_sse_fabs (res, &mask);
44343 /* if (!isless (xa, TWO52)) goto label; */
44344 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44346 /* xa = xa + TWO52 - TWO52; */
44347 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44348 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44350 /* xa = copysign (xa, operand1) */
44351 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44353 /* generate 1.0 or -1.0 */
44354 one = force_reg (mode,
44355 const_double_from_real_value (do_floor
44356 ? dconst1 : dconstm1, mode));
44358 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44359 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44360 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
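/* The compare mask is all-ones where xa > operand1 (or unordered) and
   all-zeros elsewhere, so ANDing it with 1.0 (or -1.0 for ceil) yields
   exactly the value that the subtraction below must remove.  */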
44361 /* We always need to subtract here to preserve signed zero. */
44362 tmp = expand_simple_binop (mode, MINUS,
44363 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44364 emit_move_insn (res, tmp);
44366 emit_label (label);
44367 LABEL_NUSES (label) = 1;
44369 emit_move_insn (operand0, res);
44372 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44373 into OPERAND0. */
44374 void
44375 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44377 /* C code for the stuff we expand below.
44378 double xa = fabs (x), x2;
44379 if (!isless (xa, TWO52))
44380 return x;
44381 x2 = (double)(long)x;
44382 Compensate. Floor:
44383 if (x2 > x)
44384 x2 -= 1;
44385 Compensate. Ceil:
44386 if (x2 < x)
44387 x2 += 1;
44388 if (HONOR_SIGNED_ZEROS (mode))
44389 return copysign (x2, x);
44390 return x2;
44392 machine_mode mode = GET_MODE (operand0);
44393 rtx xa, xi, TWO52, tmp, one, res, mask;
44394 rtx_code_label *label;
44396 TWO52 = ix86_gen_TWO52 (mode);
44398 /* Temporary for holding the result, initialized to the input
44399 operand to ease control flow. */
44400 res = gen_reg_rtx (mode);
44401 emit_move_insn (res, operand1);
44403 /* xa = abs (operand1) */
44404 xa = ix86_expand_sse_fabs (res, &mask);
44406 /* if (!isless (xa, TWO52)) goto label; */
44407 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44409 /* xa = (double)(long)x */
44410 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44411 expand_fix (xi, res, 0);
44412 expand_float (xa, xi, 0);
44414 /* generate 1.0 */
44415 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44417 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44418 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44419 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44420 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44421 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44422 emit_move_insn (res, tmp);
44424 if (HONOR_SIGNED_ZEROS (mode))
44425 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44427 emit_label (label);
44428 LABEL_NUSES (label) = 1;
44430 emit_move_insn (operand0, res);
44433 /* Expand SSE sequence for computing round from OPERAND1 storing
44434 into OPERAND0. This sequence works without relying on DImode truncation
44435 via cvttsd2siq, which is only available on 64-bit targets. */
44436 void
44437 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44439 /* C code for the stuff we expand below.
44440 double xa = fabs (x), xa2, x2;
44441 if (!isless (xa, TWO52))
44442 return x;
44443 Using the absolute value and copying back sign makes
44444 -0.0 -> -0.0 correct.
44445 xa2 = xa + TWO52 - TWO52;
44446 Compensate.
44447 dxa = xa2 - xa;
44448 if (dxa <= -0.5)
44449 xa2 += 1;
44450 else if (dxa > 0.5)
44451 xa2 -= 1;
44452 x2 = copysign (xa2, x);
44453 return x2;
44455 machine_mode mode = GET_MODE (operand0);
44456 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44457 rtx_code_label *label;
44459 TWO52 = ix86_gen_TWO52 (mode);
44461 /* Temporary for holding the result, initialized to the input
44462 operand to ease control flow. */
44463 res = gen_reg_rtx (mode);
44464 emit_move_insn (res, operand1);
44466 /* xa = abs (operand1) */
44467 xa = ix86_expand_sse_fabs (res, &mask);
44469 /* if (!isless (xa, TWO52)) goto label; */
44470 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44472 /* xa2 = xa + TWO52 - TWO52; */
44473 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44474 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44476 /* dxa = xa2 - xa; */
44477 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44479 /* generate 0.5, 1.0 and -0.5 */
44480 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44481 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44482 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44483 0, OPTAB_DIRECT);
44485 /* Compensate. */
44486 tmp = gen_reg_rtx (mode);
44487 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44488 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44489 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44490 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44491 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44492 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44493 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44494 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44496 /* res = copysign (xa2, operand1) */
44497 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44499 emit_label (label);
44500 LABEL_NUSES (label) = 1;
44502 emit_move_insn (operand0, res);
44505 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44506 into OPERAND0. */
44507 void
44508 ix86_expand_trunc (rtx operand0, rtx operand1)
44510 /* C code for SSE variant we expand below.
44511 double xa = fabs (x), x2;
44512 if (!isless (xa, TWO52))
44513 return x;
44514 x2 = (double)(long)x;
44515 if (HONOR_SIGNED_ZEROS (mode))
44516 return copysign (x2, x);
44517 return x2;
44519 machine_mode mode = GET_MODE (operand0);
44520 rtx xa, xi, TWO52, res, mask;
44521 rtx_code_label *label;
44523 TWO52 = ix86_gen_TWO52 (mode);
44525 /* Temporary for holding the result, initialized to the input
44526 operand to ease control flow. */
44527 res = gen_reg_rtx (mode);
44528 emit_move_insn (res, operand1);
44530 /* xa = abs (operand1) */
44531 xa = ix86_expand_sse_fabs (res, &mask);
44533 /* if (!isless (xa, TWO52)) goto label; */
44534 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44536 /* x = (double)(long)x */
44537 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44538 expand_fix (xi, res, 0);
44539 expand_float (res, xi, 0);
44541 if (HONOR_SIGNED_ZEROS (mode))
44542 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44544 emit_label (label);
44545 LABEL_NUSES (label) = 1;
44547 emit_move_insn (operand0, res);
44550 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44551 into OPERAND0. */
44552 void
44553 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44555 machine_mode mode = GET_MODE (operand0);
44556 rtx xa, mask, TWO52, one, res, smask, tmp;
44557 rtx_code_label *label;
44559 /* C code for SSE variant we expand below.
44560 double xa = fabs (x), x2;
44561 if (!isless (xa, TWO52))
44562 return x;
44563 xa2 = xa + TWO52 - TWO52;
44564 Compensate:
44565 if (xa2 > xa)
44566 xa2 -= 1.0;
44567 x2 = copysign (xa2, x);
44568 return x2;
44571 TWO52 = ix86_gen_TWO52 (mode);
44573 /* Temporary for holding the result, initialized to the input
44574 operand to ease control flow. */
44575 res = gen_reg_rtx (mode);
44576 emit_move_insn (res, operand1);
44578 /* xa = abs (operand1) */
44579 xa = ix86_expand_sse_fabs (res, &smask);
44581 /* if (!isless (xa, TWO52)) goto label; */
44582 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44584 /* res = xa + TWO52 - TWO52; */
44585 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44586 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44587 emit_move_insn (res, tmp);
44589 /* generate 1.0 */
44590 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44592 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44593 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44594 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44595 tmp = expand_simple_binop (mode, MINUS,
44596 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44597 emit_move_insn (res, tmp);
44599 /* res = copysign (res, operand1) */
44600 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44602 emit_label (label);
44603 LABEL_NUSES (label) = 1;
44605 emit_move_insn (operand0, res);
44608 /* Expand SSE sequence for computing round from OPERAND1 storing
44609 into OPERAND0. */
44610 void
44611 ix86_expand_round (rtx operand0, rtx operand1)
44613 /* C code for the stuff we're doing below:
44614 double xa = fabs (x);
44615 if (!isless (xa, TWO52))
44616 return x;
44617 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44618 return copysign (xa, x);
44620 machine_mode mode = GET_MODE (operand0);
44621 rtx res, TWO52, xa, xi, half, mask;
44622 rtx_code_label *label;
44623 const struct real_format *fmt;
44624 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44626 /* Temporary for holding the result, initialized to the input
44627 operand to ease control flow. */
44628 res = gen_reg_rtx (mode);
44629 emit_move_insn (res, operand1);
44631 TWO52 = ix86_gen_TWO52 (mode);
44632 xa = ix86_expand_sse_fabs (res, &mask);
44633 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44635 /* load nextafter (0.5, 0.0) */
44636 fmt = REAL_MODE_FORMAT (mode);
44637 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44638 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44640 /* xa = xa + 0.5 */
44641 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44642 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44644 /* xa = (double)(int64_t)xa */
44645 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44646 expand_fix (xi, xa, 0);
44647 expand_float (xa, xi, 0);
44649 /* res = copysign (xa, operand1) */
44650 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44652 emit_label (label);
44653 LABEL_NUSES (label) = 1;
44655 emit_move_insn (operand0, res);
44658 /* Expand SSE sequence for computing round
44659 from OP1 storing into OP0 using the SSE4.1 round insn. */
44660 void
44661 ix86_expand_round_sse4 (rtx op0, rtx op1)
44663 machine_mode mode = GET_MODE (op0);
44664 rtx e1, e2, res, half;
44665 const struct real_format *fmt;
44666 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44667 rtx (*gen_copysign) (rtx, rtx, rtx);
44668 rtx (*gen_round) (rtx, rtx, rtx);
44670 switch (mode)
44672 case E_SFmode:
44673 gen_copysign = gen_copysignsf3;
44674 gen_round = gen_sse4_1_roundsf2;
44675 break;
44676 case E_DFmode:
44677 gen_copysign = gen_copysigndf3;
44678 gen_round = gen_sse4_1_rounddf2;
44679 break;
44680 default:
44681 gcc_unreachable ();
44684 /* round (a) = trunc (a + copysign (0.5, a)) */
44686 /* load nextafter (0.5, 0.0) */
44687 fmt = REAL_MODE_FORMAT (mode);
44688 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44689 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44690 half = const_double_from_real_value (pred_half, mode);
44692 /* e1 = copysign (0.5, op1) */
44693 e1 = gen_reg_rtx (mode);
44694 emit_insn (gen_copysign (e1, half, op1));
44696 /* e2 = op1 + e1 */
44697 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44699 /* res = trunc (e2) */
44700 res = gen_reg_rtx (mode);
44701 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
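/* The ROUND_TRUNC immediate selects truncation (round toward zero) for
   the SSE4.1 roundsd/roundss instruction, so res becomes trunc (e2).  */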
44703 emit_move_insn (op0, res);
44707 /* Table of valid machine attributes. */
44708 static const struct attribute_spec ix86_attribute_table[] =
44710 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
44711 affects_type_identity, handler, exclude } */
44712 /* Stdcall attribute says callee is responsible for popping arguments
44713 if they are not variable. */
44714 { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44715 NULL },
44716 /* Fastcall attribute says callee is responsible for popping arguments
44717 if they are not variable. */
44718 { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44719 NULL },
44720 /* Thiscall attribute says callee is responsible for popping arguments
44721 if they are not variable. */
44722 { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44723 NULL },
44724 /* Cdecl attribute says the callee is a normal C declaration */
44725 { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44726 NULL },
44727 /* Regparm attribute specifies how many integer arguments are to be
44728 passed in registers. */
44729 { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
44730 NULL },
44731 /* Sseregparm attribute says we are using x86_64 calling conventions
44732 for FP arguments. */
44733 { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44734 NULL },
44735 /* The transactional memory builtins are implicitly regparm or fastcall
44736 depending on the ABI. Override the generic do-nothing attribute that
44737 these builtins were declared with. */
44738 { "*tm regparm", 0, 0, false, true, true, true,
44739 ix86_handle_tm_regparm_attribute, NULL },
44740 /* force_align_arg_pointer says this function realigns the stack at entry. */
44741 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44742 false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
44743 NULL },
44744 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44745 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
44746 NULL },
44747 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
44748 NULL },
44749 { "shared", 0, 0, true, false, false, false,
44750 ix86_handle_shared_attribute, NULL },
44751 #endif
44752 { "ms_struct", 0, 0, false, false, false, false,
44753 ix86_handle_struct_attribute, NULL },
44754 { "gcc_struct", 0, 0, false, false, false, false,
44755 ix86_handle_struct_attribute, NULL },
44756 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44757 SUBTARGET_ATTRIBUTE_TABLE,
44758 #endif
44759 /* ms_abi and sysv_abi calling convention function attributes. */
44760 { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
44761 { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
44762 NULL },
44763 { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
44764 { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
44765 { "ms_hook_prologue", 0, 0, true, false, false, false,
44766 ix86_handle_fndecl_attribute, NULL },
44767 { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
44768 ix86_handle_callee_pop_aggregate_return, NULL },
44769 { "interrupt", 0, 0, false, true, true, false,
44770 ix86_handle_interrupt_attribute, NULL },
44771 { "no_caller_saved_registers", 0, 0, false, true, true, false,
44772 ix86_handle_no_caller_saved_registers_attribute, NULL },
44773 { "naked", 0, 0, true, false, false, false,
44774 ix86_handle_fndecl_attribute, NULL },
44776 /* End element. */
44777 { NULL, 0, 0, false, false, false, false, NULL, NULL }
44780 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44781 static int
44782 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44783 tree vectype, int)
44785 bool fp = false;
44786 machine_mode mode = TImode;
44787 int index;
44788 if (vectype != NULL)
44790 fp = FLOAT_TYPE_P (vectype);
44791 mode = TYPE_MODE (vectype);
44794 switch (type_of_cost)
44796 case scalar_stmt:
44797 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
44799 case scalar_load:
44800 /* Load/store costs are relative to a register move, which is 2. Recompute
44801 them with COSTS_N_INSNS so everything has the same base. */
44802 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
44803 : ix86_cost->int_load [2]) / 2;
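/* For instance, a hypothetical table entry of 4 (relative to the
   register-move baseline of 2) yields COSTS_N_INSNS (4) / 2, i.e.
   COSTS_N_INSNS (2): the load counts as two instructions.  */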
44805 case scalar_store:
44806 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
44807 : ix86_cost->int_store [2]) / 2;
44809 case vector_stmt:
44810 return ix86_vec_cost (mode,
44811 fp ? ix86_cost->addss : ix86_cost->sse_op,
44812 true);
44814 case vector_load:
44815 index = sse_store_index (mode);
44816 /* See PR82713 - we may end up being called on a non-vector type. */
44817 if (index < 0)
44818 index = 2;
44819 return ix86_vec_cost (mode,
44820 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
44821 true);
44823 case vector_store:
44824 index = sse_store_index (mode);
44825 /* See PR82713 - we may end up being called on a non-vector type. */
44826 if (index < 0)
44827 index = 2;
44828 return ix86_vec_cost (mode,
44829 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
44830 true);
44832 case vec_to_scalar:
44833 case scalar_to_vec:
44834 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
44836 /* We should have separate costs for unaligned loads and gather/scatter.
44837 Do that incrementally. */
44838 case unaligned_load:
44839 index = sse_store_index (mode);
44840 /* See PR82713 - we may end up being called on a non-vector type. */
44841 if (index < 0)
44842 index = 2;
44843 return ix86_vec_cost (mode,
44844 COSTS_N_INSNS
44845 (ix86_cost->sse_unaligned_load[index]) / 2,
44846 true);
44848 case unaligned_store:
44849 index = sse_store_index (mode);
44850 /* See PR82713 - we may end up being called on a non-vector type. */
44851 if (index < 0)
44852 index = 2;
44853 return ix86_vec_cost (mode,
44854 COSTS_N_INSNS
44855 (ix86_cost->sse_unaligned_store[index]) / 2,
44856 true);
44858 case vector_gather_load:
44859 return ix86_vec_cost (mode,
44860 COSTS_N_INSNS
44861 (ix86_cost->gather_static
44862 + ix86_cost->gather_per_elt
44863 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44864 true);
44866 case vector_scatter_store:
44867 return ix86_vec_cost (mode,
44868 COSTS_N_INSNS
44869 (ix86_cost->scatter_static
44870 + ix86_cost->scatter_per_elt
44871 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44872 true);
44874 case cond_branch_taken:
44875 return ix86_cost->cond_taken_branch_cost;
44877 case cond_branch_not_taken:
44878 return ix86_cost->cond_not_taken_branch_cost;
44880 case vec_perm:
44881 case vec_promote_demote:
44882 return ix86_vec_cost (mode,
44883 ix86_cost->sse_op, true);
44885 case vec_construct:
44886 return ix86_vec_cost (mode, ix86_cost->sse_op, false);
44888 default:
44889 gcc_unreachable ();
44893 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44894 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44895 insn every time. */
44897 static GTY(()) rtx_insn *vselect_insn;
44899 /* Initialize vselect_insn. */
44901 static void
44902 init_vselect_insn (void)
44904 unsigned i;
44905 rtx x;
44907 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44908 for (i = 0; i < MAX_VECT_LEN; ++i)
44909 XVECEXP (x, 0, i) = const0_rtx;
44910 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44911 const0_rtx), x);
44912 x = gen_rtx_SET (const0_rtx, x);
44913 start_sequence ();
44914 vselect_insn = emit_insn (x);
44915 end_sequence ();
44918 /* Construct (set target (vec_select op0 (parallel perm))) and
44919 return true if that's a valid instruction in the active ISA. */
44921 static bool
44922 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44923 unsigned nelt, bool testing_p)
44925 unsigned int i;
44926 rtx x, save_vconcat;
44927 int icode;
44929 if (vselect_insn == NULL_RTX)
44930 init_vselect_insn ();
44932 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
44933 PUT_NUM_ELEM (XVEC (x, 0), nelt);
44934 for (i = 0; i < nelt; ++i)
44935 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
44936 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44937 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
44938 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
44939 SET_DEST (PATTERN (vselect_insn)) = target;
44940 icode = recog_memoized (vselect_insn);
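/* recog_memoized asks the insn recognizer whether the scratch insn,
   with the operands and mode patched in above, matches a pattern that
   is enabled in the active ISA (e.g. a pshufd or shufps shuffle); a
   non-negative code means a single instruction can perform this exact
   selection, and in testing mode nothing is emitted.  */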
44942 if (icode >= 0 && !testing_p)
44943 emit_insn (copy_rtx (PATTERN (vselect_insn)));
44945 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
44946 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
44947 INSN_CODE (vselect_insn) = -1;
44949 return icode >= 0;
44952 /* Similar, but generate a vec_concat from op0 and op1 as well. */
44954 static bool
44955 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
44956 const unsigned char *perm, unsigned nelt,
44957 bool testing_p)
44959 machine_mode v2mode;
44960 rtx x;
44961 bool ok;
44963 if (vselect_insn == NULL_RTX)
44964 init_vselect_insn ();
44966 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
44967 return false;
44968 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44969 PUT_MODE (x, v2mode);
44970 XEXP (x, 0) = op0;
44971 XEXP (x, 1) = op1;
44972 ok = expand_vselect (target, x, perm, nelt, testing_p);
44973 XEXP (x, 0) = const0_rtx;
44974 XEXP (x, 1) = const0_rtx;
44975 return ok;
44978 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44979 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
44981 static bool
44982 expand_vec_perm_blend (struct expand_vec_perm_d *d)
44984 machine_mode mmode, vmode = d->vmode;
44985 unsigned i, mask, nelt = d->nelt;
44986 rtx target, op0, op1, maskop, x;
44987 rtx rperm[32], vperm;
44989 if (d->one_operand_p)
44990 return false;
44991 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
44992 && (TARGET_AVX512BW
44993 || GET_MODE_UNIT_SIZE (vmode) >= 4))
44995 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
44997 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
44999 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45001 else
45002 return false;
45004 /* This is a blend, not a permute. Elements must stay in their
45005 respective lanes. */
45006 for (i = 0; i < nelt; ++i)
45008 unsigned e = d->perm[i];
45009 if (!(e == i || e == i + nelt))
45010 return false;
45013 if (d->testing_p)
45014 return true;
45016 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45017 decision should be extracted elsewhere, so that we only try that
45018 sequence once all budget==3 options have been tried. */
45019 target = d->target;
45020 op0 = d->op0;
45021 op1 = d->op1;
45022 mask = 0;
45024 switch (vmode)
45026 case E_V8DFmode:
45027 case E_V16SFmode:
45028 case E_V4DFmode:
45029 case E_V8SFmode:
45030 case E_V2DFmode:
45031 case E_V4SFmode:
45032 case E_V8HImode:
45033 case E_V8SImode:
45034 case E_V32HImode:
45035 case E_V64QImode:
45036 case E_V16SImode:
45037 case E_V8DImode:
45038 for (i = 0; i < nelt; ++i)
45039 mask |= (d->perm[i] >= nelt) << i;
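/* Each mask bit selects the source for one element: bit I is set when
   element I comes from op1.  E.g. for V4SFmode the permutation
   {0, 5, 2, 7} takes elements 0 and 2 from op0 and elements 1 and 3
   from op1, giving mask == 0b1010.  */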
45040 break;
45042 case E_V2DImode:
45043 for (i = 0; i < 2; ++i)
45044 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45045 vmode = V8HImode;
45046 goto do_subreg;
45048 case E_V4SImode:
45049 for (i = 0; i < 4; ++i)
45050 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45051 vmode = V8HImode;
45052 goto do_subreg;
45054 case E_V16QImode:
45055 /* See if bytes move in pairs so we can use pblendw with
45056 an immediate argument, rather than pblendvb with a vector
45057 argument. */
45058 for (i = 0; i < 16; i += 2)
45059 if (d->perm[i] + 1 != d->perm[i + 1])
45061 use_pblendvb:
45062 for (i = 0; i < nelt; ++i)
45063 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45065 finish_pblendvb:
45066 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45067 vperm = force_reg (vmode, vperm);
45069 if (GET_MODE_SIZE (vmode) == 16)
45070 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45071 else
45072 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45073 if (target != d->target)
45074 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45075 return true;
45078 for (i = 0; i < 8; ++i)
45079 mask |= (d->perm[i * 2] >= 16) << i;
45080 vmode = V8HImode;
45081 /* FALLTHRU */
45083 do_subreg:
45084 target = gen_reg_rtx (vmode);
45085 op0 = gen_lowpart (vmode, op0);
45086 op1 = gen_lowpart (vmode, op1);
45087 break;
45089 case E_V32QImode:
45090 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45091 for (i = 0; i < 32; i += 2)
45092 if (d->perm[i] + 1 != d->perm[i + 1])
45093 goto use_pblendvb;
45094 /* See if bytes move in quadruplets. If yes, vpblendd
45095 with immediate can be used. */
45096 for (i = 0; i < 32; i += 4)
45097 if (d->perm[i] + 2 != d->perm[i + 2])
45098 break;
45099 if (i < 32)
45101 /* See if bytes move the same in both lanes. If yes,
45102 vpblendw with immediate can be used. */
45103 for (i = 0; i < 16; i += 2)
45104 if (d->perm[i] + 16 != d->perm[i + 16])
45105 goto use_pblendvb;
45107 /* Use vpblendw. */
45108 for (i = 0; i < 16; ++i)
45109 mask |= (d->perm[i * 2] >= 32) << i;
45110 vmode = V16HImode;
45111 goto do_subreg;
45114 /* Use vpblendd. */
45115 for (i = 0; i < 8; ++i)
45116 mask |= (d->perm[i * 4] >= 32) << i;
45117 vmode = V8SImode;
45118 goto do_subreg;
45120 case E_V16HImode:
45121 /* See if words move in pairs. If yes, vpblendd can be used. */
45122 for (i = 0; i < 16; i += 2)
45123 if (d->perm[i] + 1 != d->perm[i + 1])
45124 break;
45125 if (i < 16)
45127 /* See if words move the same in both lanes. If not,
45128 vpblendvb must be used. */
45129 for (i = 0; i < 8; i++)
45130 if (d->perm[i] + 8 != d->perm[i + 8])
45132 /* Use vpblendvb. */
45133 for (i = 0; i < 32; ++i)
45134 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45136 vmode = V32QImode;
45137 nelt = 32;
45138 target = gen_reg_rtx (vmode);
45139 op0 = gen_lowpart (vmode, op0);
45140 op1 = gen_lowpart (vmode, op1);
45141 goto finish_pblendvb;
45144 /* Use vpblendw. */
45145 for (i = 0; i < 16; ++i)
45146 mask |= (d->perm[i] >= 16) << i;
45147 break;
45150 /* Use vpblendd. */
45151 for (i = 0; i < 8; ++i)
45152 mask |= (d->perm[i * 2] >= 16) << i;
45153 vmode = V8SImode;
45154 goto do_subreg;
45156 case E_V4DImode:
45157 /* Use vpblendd. */
45158 for (i = 0; i < 4; ++i)
45159 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45160 vmode = V8SImode;
45161 goto do_subreg;
45163 default:
45164 gcc_unreachable ();
45167 switch (vmode)
45169 case E_V8DFmode:
45170 case E_V8DImode:
45171 mmode = QImode;
45172 break;
45173 case E_V16SFmode:
45174 case E_V16SImode:
45175 mmode = HImode;
45176 break;
45177 case E_V32HImode:
45178 mmode = SImode;
45179 break;
45180 case E_V64QImode:
45181 mmode = DImode;
45182 break;
45183 default:
45184 mmode = VOIDmode;
45187 if (mmode != VOIDmode)
45188 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45189 else
45190 maskop = GEN_INT (mask);
45192 /* This matches five different patterns for the different modes. */
45193 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45194 x = gen_rtx_SET (target, x);
45195 emit_insn (x);
45196 if (target != d->target)
45197 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45199 return true;
45202 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45203 in terms of the variable form of vpermilps.
45205 Note that we will have already failed the immediate input vpermilps,
45206 which requires that the high and low part shuffle be identical; the
45207 variable form doesn't require that. */
45209 static bool
45210 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45212 rtx rperm[8], vperm;
45213 unsigned i;
45215 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45216 return false;
45218 /* We can only permute within each 128-bit lane. */
45219 for (i = 0; i < 8; ++i)
45221 unsigned e = d->perm[i];
45222 if (i < 4 ? e >= 4 : e < 4)
45223 return false;
45226 if (d->testing_p)
45227 return true;
45229 for (i = 0; i < 8; ++i)
45231 unsigned e = d->perm[i];
45233 /* Within each 128-bit lane, the elements of op0 are numbered
45234 from 0 and the elements of op1 are numbered from 4. */
45235 if (e >= 8 + 4)
45236 e -= 8;
45237 else if (e >= 4)
45238 e -= 4;
45240 rperm[i] = GEN_INT (e);
45243 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45244 vperm = force_reg (V8SImode, vperm);
45245 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45247 return true;
45250 /* Return true if the permutation D can be performed as a VMODE permutation
45251 instead. */
45253 static bool
45254 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45256 unsigned int i, j, chunk;
45258 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45259 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45260 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45261 return false;
45263 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45264 return true;
45266 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45267 for (i = 0; i < d->nelt; i += chunk)
45268 if (d->perm[i] & (chunk - 1))
45269 return false;
45270 else
45271 for (j = 1; j < chunk; ++j)
45272 if (d->perm[i] + j != d->perm[i + j])
45273 return false;
45275 return true;
45278 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45279 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45281 static bool
45282 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45284 unsigned i, nelt, eltsz, mask;
45285 unsigned char perm[64];
45286 machine_mode vmode = V16QImode;
45287 rtx rperm[64], vperm, target, op0, op1;
45289 nelt = d->nelt;
45291 if (!d->one_operand_p)
45293 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45295 if (TARGET_AVX2
45296 && valid_perm_using_mode_p (V2TImode, d))
45298 if (d->testing_p)
45299 return true;
45301 /* Use vperm2i128 insn. The pattern uses
45302 V4DImode instead of V2TImode. */
45303 target = d->target;
45304 if (d->vmode != V4DImode)
45305 target = gen_reg_rtx (V4DImode);
45306 op0 = gen_lowpart (V4DImode, d->op0);
45307 op1 = gen_lowpart (V4DImode, d->op1);
45308 rperm[0]
45309 = GEN_INT ((d->perm[0] / (nelt / 2))
45310 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45311 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45312 if (target != d->target)
45313 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45314 return true;
45316 return false;
45319 else
45321 if (GET_MODE_SIZE (d->vmode) == 16)
45323 if (!TARGET_SSSE3)
45324 return false;
45326 else if (GET_MODE_SIZE (d->vmode) == 32)
45328 if (!TARGET_AVX2)
45329 return false;
45331 /* V4DImode should already have been handled through
45332 expand_vselect by the vpermq instruction. */
45333 gcc_assert (d->vmode != V4DImode);
45335 vmode = V32QImode;
45336 if (d->vmode == V8SImode
45337 || d->vmode == V16HImode
45338 || d->vmode == V32QImode)
45340 /* First see if vpermq can be used for
45341 V8SImode/V16HImode/V32QImode. */
45342 if (valid_perm_using_mode_p (V4DImode, d))
45344 for (i = 0; i < 4; i++)
45345 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45346 if (d->testing_p)
45347 return true;
45348 target = gen_reg_rtx (V4DImode);
45349 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45350 perm, 4, false))
45352 emit_move_insn (d->target,
45353 gen_lowpart (d->vmode, target));
45354 return true;
45356 return false;
45359 /* Next see if vpermd can be used. */
45360 if (valid_perm_using_mode_p (V8SImode, d))
45361 vmode = V8SImode;
45363 /* Or if vpermps can be used. */
45364 else if (d->vmode == V8SFmode)
45365 vmode = V8SImode;
45367 if (vmode == V32QImode)
45369 /* vpshufb only works within 128-bit lanes; it cannot
45370 shuffle bytes between the lanes. */
45371 for (i = 0; i < nelt; ++i)
45372 if ((d->perm[i] ^ i) & (nelt / 2))
45373 return false;
45376 else if (GET_MODE_SIZE (d->vmode) == 64)
45378 if (!TARGET_AVX512BW)
45379 return false;
45381 /* If vpermq didn't work, vpshufb won't work either. */
45382 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45383 return false;
45385 vmode = V64QImode;
45386 if (d->vmode == V16SImode
45387 || d->vmode == V32HImode
45388 || d->vmode == V64QImode)
45390 /* First see if vpermq can be used for
45391 V16SImode/V32HImode/V64QImode. */
45392 if (valid_perm_using_mode_p (V8DImode, d))
45394 for (i = 0; i < 8; i++)
45395 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45396 if (d->testing_p)
45397 return true;
45398 target = gen_reg_rtx (V8DImode);
45399 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45400 perm, 8, false))
45402 emit_move_insn (d->target,
45403 gen_lowpart (d->vmode, target));
45404 return true;
45406 return false;
45409 /* Next see if vpermd can be used. */
45410 if (valid_perm_using_mode_p (V16SImode, d))
45411 vmode = V16SImode;
45413 /* Or if vpermps can be used. */
45414 else if (d->vmode == V16SFmode)
45415 vmode = V16SImode;
45416 if (vmode == V64QImode)
45418 /* vpshufb only works within 128-bit lanes; it cannot
45419 shuffle bytes between the lanes. */
45420 for (i = 0; i < nelt; ++i)
45421 if ((d->perm[i] ^ i) & (nelt / 4))
45422 return false;
45425 else
45426 return false;
45429 if (d->testing_p)
45430 return true;
45432 if (vmode == V8SImode)
45433 for (i = 0; i < 8; ++i)
45434 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45435 else if (vmode == V16SImode)
45436 for (i = 0; i < 16; ++i)
45437 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45438 else
45440 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45441 if (!d->one_operand_p)
45442 mask = 2 * nelt - 1;
45443 else if (vmode == V16QImode)
45444 mask = nelt - 1;
45445 else if (vmode == V64QImode)
45446 mask = nelt / 4 - 1;
45447 else
45448 mask = nelt / 2 - 1;
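/* Build the byte-level control vector for pshufb/vpperm: each element
   index e (masked to the operand width above) selects eltsz consecutive
   bytes, so it expands into the byte indices
   e * eltsz .. e * eltsz + eltsz - 1.  */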
45450 for (i = 0; i < nelt; ++i)
45452 unsigned j, e = d->perm[i] & mask;
45453 for (j = 0; j < eltsz; ++j)
45454 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45458 vperm = gen_rtx_CONST_VECTOR (vmode,
45459 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45460 vperm = force_reg (vmode, vperm);
45462 target = d->target;
45463 if (d->vmode != vmode)
45464 target = gen_reg_rtx (vmode);
45465 op0 = gen_lowpart (vmode, d->op0);
45466 if (d->one_operand_p)
45468 if (vmode == V16QImode)
45469 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45470 else if (vmode == V32QImode)
45471 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45472 else if (vmode == V64QImode)
45473 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45474 else if (vmode == V8SFmode)
45475 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45476 else if (vmode == V8SImode)
45477 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45478 else if (vmode == V16SFmode)
45479 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45480 else if (vmode == V16SImode)
45481 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45482 else
45483 gcc_unreachable ();
45485 else
45487 op1 = gen_lowpart (vmode, d->op1);
45488 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45490 if (target != d->target)
45491 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45493 return true;
45496 /* For V*[QHS]Imode permutations, check whether the same permutation
45497 can be performed in a 2x, 4x or 8x wider inner mode. */
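/* For example, the V16QImode permutation
   { 2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13 } moves bytes in
   aligned pairs, so it is rewritten as the V8HImode permutation
   { 1, 0, 3, 2, 5, 4, 7, 6 }, which has a better chance of matching
   one of the existing word shuffle patterns.  */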
45499 static bool
45500 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45501 struct expand_vec_perm_d *nd)
45503 int i;
45504 machine_mode mode = VOIDmode;
45506 switch (d->vmode)
45508 case E_V16QImode: mode = V8HImode; break;
45509 case E_V32QImode: mode = V16HImode; break;
45510 case E_V64QImode: mode = V32HImode; break;
45511 case E_V8HImode: mode = V4SImode; break;
45512 case E_V16HImode: mode = V8SImode; break;
45513 case E_V32HImode: mode = V16SImode; break;
45514 case E_V4SImode: mode = V2DImode; break;
45515 case E_V8SImode: mode = V4DImode; break;
45516 case E_V16SImode: mode = V8DImode; break;
45517 default: return false;
45519 for (i = 0; i < d->nelt; i += 2)
45520 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45521 return false;
45522 nd->vmode = mode;
45523 nd->nelt = d->nelt / 2;
45524 for (i = 0; i < nd->nelt; i++)
45525 nd->perm[i] = d->perm[2 * i] / 2;
45526 if (GET_MODE_INNER (mode) != DImode)
45527 canonicalize_vector_int_perm (nd, nd);
45528 if (nd != d)
45530 nd->one_operand_p = d->one_operand_p;
45531 nd->testing_p = d->testing_p;
45532 if (d->op0 == d->op1)
45533 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45534 else
45536 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45537 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45539 if (d->testing_p)
45540 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45541 else
45542 nd->target = gen_reg_rtx (nd->vmode);
45544 return true;
45547 /* Try to expand one-operand permutation with constant mask. */
45549 static bool
45550 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45552 machine_mode mode = GET_MODE (d->op0);
45553 machine_mode maskmode = mode;
45554 rtx (*gen) (rtx, rtx, rtx) = NULL;
45555 rtx target, op0, mask;
45556 rtx vec[64];
45558 if (!rtx_equal_p (d->op0, d->op1))
45559 return false;
45561 if (!TARGET_AVX512F)
45562 return false;
45564 switch (mode)
45566 case E_V16SImode:
45567 gen = gen_avx512f_permvarv16si;
45568 break;
45569 case E_V16SFmode:
45570 gen = gen_avx512f_permvarv16sf;
45571 maskmode = V16SImode;
45572 break;
45573 case E_V8DImode:
45574 gen = gen_avx512f_permvarv8di;
45575 break;
45576 case E_V8DFmode:
45577 gen = gen_avx512f_permvarv8df;
45578 maskmode = V8DImode;
45579 break;
45580 default:
45581 return false;
45584 target = d->target;
45585 op0 = d->op0;
45586 for (int i = 0; i < d->nelt; ++i)
45587 vec[i] = GEN_INT (d->perm[i]);
45588 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45589 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45590 return true;
45593 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45594 in a single instruction. */
45596 static bool
45597 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45599 unsigned i, nelt = d->nelt;
45600 struct expand_vec_perm_d nd;
45602 /* Check plain VEC_SELECT first, because AVX has instructions that could
45603 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45604 input where SEL+CONCAT may not. */
45605 if (d->one_operand_p)
45607 int mask = nelt - 1;
45608 bool identity_perm = true;
45609 bool broadcast_perm = true;
45611 for (i = 0; i < nelt; i++)
45613 nd.perm[i] = d->perm[i] & mask;
45614 if (nd.perm[i] != i)
45615 identity_perm = false;
45616 if (nd.perm[i])
45617 broadcast_perm = false;
45620 if (identity_perm)
45622 if (!d->testing_p)
45623 emit_move_insn (d->target, d->op0);
45624 return true;
45626 else if (broadcast_perm && TARGET_AVX2)
45628 /* Use vpbroadcast{b,w,d}. */
45629 rtx (*gen) (rtx, rtx) = NULL;
45630 switch (d->vmode)
45632 case E_V64QImode:
45633 if (TARGET_AVX512BW)
45634 gen = gen_avx512bw_vec_dupv64qi_1;
45635 break;
45636 case E_V32QImode:
45637 gen = gen_avx2_pbroadcastv32qi_1;
45638 break;
45639 case E_V32HImode:
45640 if (TARGET_AVX512BW)
45641 gen = gen_avx512bw_vec_dupv32hi_1;
45642 break;
45643 case E_V16HImode:
45644 gen = gen_avx2_pbroadcastv16hi_1;
45645 break;
45646 case E_V16SImode:
45647 if (TARGET_AVX512F)
45648 gen = gen_avx512f_vec_dupv16si_1;
45649 break;
45650 case E_V8SImode:
45651 gen = gen_avx2_pbroadcastv8si_1;
45652 break;
45653 case E_V16QImode:
45654 gen = gen_avx2_pbroadcastv16qi;
45655 break;
45656 case E_V8HImode:
45657 gen = gen_avx2_pbroadcastv8hi;
45658 break;
45659 case E_V16SFmode:
45660 if (TARGET_AVX512F)
45661 gen = gen_avx512f_vec_dupv16sf_1;
45662 break;
45663 case E_V8SFmode:
45664 gen = gen_avx2_vec_dupv8sf_1;
45665 break;
45666 case E_V8DFmode:
45667 if (TARGET_AVX512F)
45668 gen = gen_avx512f_vec_dupv8df_1;
45669 break;
45670 case E_V8DImode:
45671 if (TARGET_AVX512F)
45672 gen = gen_avx512f_vec_dupv8di_1;
45673 break;
45674 /* For other modes prefer other shuffles this function creates. */
45675 default: break;
45677 if (gen != NULL)
45679 if (!d->testing_p)
45680 emit_insn (gen (d->target, d->op0));
45681 return true;
45685 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45686 return true;
45688 /* There are plenty of patterns in sse.md that are written for
45689 SEL+CONCAT and are not replicated for a single op. Perhaps
45690 that should be changed, to avoid the nastiness here. */
45692 /* Recognize interleave style patterns, which means incrementing
45693 every other permutation operand. */
45694 for (i = 0; i < nelt; i += 2)
45696 nd.perm[i] = d->perm[i] & mask;
45697 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45699 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45700 d->testing_p))
45701 return true;
45703 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45704 if (nelt >= 4)
45706 for (i = 0; i < nelt; i += 4)
45708 nd.perm[i + 0] = d->perm[i + 0] & mask;
45709 nd.perm[i + 1] = d->perm[i + 1] & mask;
45710 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45711 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45714 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45715 d->testing_p))
45716 return true;
45720 /* Finally, try the fully general two operand permute. */
45721 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45722 d->testing_p))
45723 return true;
45725 /* Recognize interleave style patterns with reversed operands. */
45726 if (!d->one_operand_p)
45728 for (i = 0; i < nelt; ++i)
45730 unsigned e = d->perm[i];
45731 if (e >= nelt)
45732 e -= nelt;
45733 else
45734 e += nelt;
45735 nd.perm[i] = e;
45738 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45739 d->testing_p))
45740 return true;
45743 /* Try the SSE4.1 blend variable merge instructions. */
45744 if (expand_vec_perm_blend (d))
45745 return true;
45747 /* Try one of the AVX vpermil variable permutations. */
45748 if (expand_vec_perm_vpermil (d))
45749 return true;
45751 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45752 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45753 if (expand_vec_perm_pshufb (d))
45754 return true;
45756 /* Try the AVX2 vpalignr instruction. */
45757 if (expand_vec_perm_palignr (d, true))
45758 return true;
45760 /* Try the AVX512F vperm{s,d} instructions. */
45761 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45762 return true;
45764 /* Try the AVX512F vpermt2/vpermi2 instructions. */
45765 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45766 return true;
45768 /* See if we can get the same permutation in different vector integer
45769 mode. */
45770 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45772 if (!d->testing_p)
45773 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45774 return true;
45776 return false;
45779 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45780 in terms of a pair of pshuflw + pshufhw instructions. */
45782 static bool
45783 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45785 unsigned char perm2[MAX_VECT_LEN];
45786 unsigned i;
45787 bool ok;
45789 if (d->vmode != V8HImode || !d->one_operand_p)
45790 return false;
45792 /* The two permutations only operate in 64-bit lanes. */
45793 for (i = 0; i < 4; ++i)
45794 if (d->perm[i] >= 4)
45795 return false;
45796 for (i = 4; i < 8; ++i)
45797 if (d->perm[i] < 4)
45798 return false;
45800 if (d->testing_p)
45801 return true;
45803 /* Emit the pshuflw. */
45804 memcpy (perm2, d->perm, 4);
45805 for (i = 4; i < 8; ++i)
45806 perm2[i] = i;
45807 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45808 gcc_assert (ok);
45810 /* Emit the pshufhw. */
45811 memcpy (perm2 + 4, d->perm + 4, 4);
45812 for (i = 0; i < 4; ++i)
45813 perm2[i] = i;
45814 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45815 gcc_assert (ok);
45817 return true;
45820 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45821 the permutation using the SSSE3 palignr instruction. This succeeds
45822 when all of the elements in PERM fit within one vector and we merely
45823 need to shift them down so that a single vector permutation has a
45824 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
45825 the palignr instruction itself can perform the requested permutation. */
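/* For example, a V8HImode permutation { 3 4 5 6 7 8 9 10 } has
   min == 3 and max == 10, so the palignr below shifts the op1:op0
   concatenation down by three elements; the residual permutation is
   then the identity and the in_order path just copies the result.  */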
45827 static bool
45828 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45830 unsigned i, nelt = d->nelt;
45831 unsigned min, max, minswap, maxswap;
45832 bool in_order, ok, swap = false;
45833 rtx shift, target;
45834 struct expand_vec_perm_d dcopy;
45836 /* Even with AVX, palignr only operates on 128-bit vectors;
45837 with AVX2, palignr operates on both 128-bit lanes. */
45838 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45839 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45840 return false;
45842 min = 2 * nelt;
45843 max = 0;
45844 minswap = 2 * nelt;
45845 maxswap = 0;
45846 for (i = 0; i < nelt; ++i)
45848 unsigned e = d->perm[i];
45849 unsigned eswap = d->perm[i] ^ nelt;
45850 if (GET_MODE_SIZE (d->vmode) == 32)
45852 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45853 eswap = e ^ (nelt / 2);
45855 if (e < min)
45856 min = e;
45857 if (e > max)
45858 max = e;
45859 if (eswap < minswap)
45860 minswap = eswap;
45861 if (eswap > maxswap)
45862 maxswap = eswap;
45864 if (min == 0
45865 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45867 if (d->one_operand_p
45868 || minswap == 0
45869 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45870 ? nelt / 2 : nelt))
45871 return false;
45872 swap = true;
45873 min = minswap;
45874 max = maxswap;
45877 /* Given that we have SSSE3, we know we'll be able to implement the
45878 single operand permutation after the palignr with pshufb for
45879 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45880 first. */
45881 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45882 return true;
45884 dcopy = *d;
45885 if (swap)
45887 dcopy.op0 = d->op1;
45888 dcopy.op1 = d->op0;
45889 for (i = 0; i < nelt; ++i)
45890 dcopy.perm[i] ^= nelt;
45893 in_order = true;
45894 for (i = 0; i < nelt; ++i)
45896 unsigned e = dcopy.perm[i];
45897 if (GET_MODE_SIZE (d->vmode) == 32
45898 && e >= nelt
45899 && (e & (nelt / 2 - 1)) < min)
45900 e = e - min - (nelt / 2);
45901 else
45902 e = e - min;
45903 if (e != i)
45904 in_order = false;
45905 dcopy.perm[i] = e;
45907 dcopy.one_operand_p = true;
45909 if (single_insn_only_p && !in_order)
45910 return false;
45912 /* For AVX2, test whether we can permute the result in one instruction. */
45913 if (d->testing_p)
45915 if (in_order)
45916 return true;
45917 dcopy.op1 = dcopy.op0;
45918 return expand_vec_perm_1 (&dcopy);
45921 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45922 if (GET_MODE_SIZE (d->vmode) == 16)
45924 target = gen_reg_rtx (TImode);
45925 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45926 gen_lowpart (TImode, dcopy.op0), shift));
45928 else
45930 target = gen_reg_rtx (V2TImode);
45931 emit_insn (gen_avx2_palignrv2ti (target,
45932 gen_lowpart (V2TImode, dcopy.op1),
45933 gen_lowpart (V2TImode, dcopy.op0),
45934 shift));
45937 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
45939 /* Test for the degenerate case where the alignment by itself
45940 produces the desired permutation. */
45941 if (in_order)
45943 emit_move_insn (d->target, dcopy.op0);
45944 return true;
45947 ok = expand_vec_perm_1 (&dcopy);
45948 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
45950 return ok;
45953 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45954 the permutation using the SSE4_1 pblendv instruction. Potentially
45955 reduces the permutation from 2 pshufb plus an or to 1 pshufb plus a pblendv. */
45957 static bool
45958 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
45960 unsigned i, which, nelt = d->nelt;
45961 struct expand_vec_perm_d dcopy, dcopy1;
45962 machine_mode vmode = d->vmode;
45963 bool ok;
45965 /* Use the same checks as in expand_vec_perm_blend. */
45966 if (d->one_operand_p)
45967 return false;
45968 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45970 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45972 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45974 else
45975 return false;
45977 /* Figure out which permutation elements are not in their
45978 respective lanes. */
45979 for (i = 0, which = 0; i < nelt; ++i)
45981 unsigned e = d->perm[i];
45982 if (e != i)
45983 which |= (e < nelt ? 1 : 2);
45985 /* We can pblend the part where elements are not in their
45986 respective lanes only when these elements all come from one
45987 half of the permutation.
45988 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
45989 lanes, but both are >= 8.
45990 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
45991 respective lanes, and 8 >= 8 but 2 is not. */
45992 if (which != 1 && which != 2)
45993 return false;
45994 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
45995 return true;
45997 /* First we apply a one-operand permutation to the part whose
45998 elements are not in their respective lanes. */
45999 dcopy = *d;
46000 if (which == 2)
46001 dcopy.op0 = dcopy.op1 = d->op1;
46002 else
46003 dcopy.op0 = dcopy.op1 = d->op0;
46004 if (!d->testing_p)
46005 dcopy.target = gen_reg_rtx (vmode);
46006 dcopy.one_operand_p = true;
46008 for (i = 0; i < nelt; ++i)
46009 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46011 ok = expand_vec_perm_1 (&dcopy);
46012 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46013 return false;
46014 else
46015 gcc_assert (ok);
46016 if (d->testing_p)
46017 return true;
46019 /* Next we put permuted elements into their positions. */
46020 dcopy1 = *d;
46021 if (which == 2)
46022 dcopy1.op1 = dcopy.target;
46023 else
46024 dcopy1.op0 = dcopy.target;
46026 for (i = 0; i < nelt; ++i)
46027 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46029 ok = expand_vec_perm_blend (&dcopy1);
46030 gcc_assert (ok);
46032 return true;
46035 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46037 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46038 a two vector permutation into a single vector permutation by using
46039 an interleave operation to merge the vectors. */
46041 static bool
46042 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46044 struct expand_vec_perm_d dremap, dfinal;
46045 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46046 unsigned HOST_WIDE_INT contents;
46047 unsigned char remap[2 * MAX_VECT_LEN];
46048 rtx_insn *seq;
46049 bool ok, same_halves = false;
46051 if (GET_MODE_SIZE (d->vmode) == 16)
46053 if (d->one_operand_p)
46054 return false;
46056 else if (GET_MODE_SIZE (d->vmode) == 32)
46058 if (!TARGET_AVX)
46059 return false;
46060 /* For 32-byte modes allow even d->one_operand_p.
46061 The lack of cross-lane shuffling in some instructions
46062 might prevent a single insn shuffle. */
46063 dfinal = *d;
46064 dfinal.testing_p = true;
46065 /* If expand_vec_perm_interleave3 can expand this into
46066 a 3 insn sequence, give up and let it be expanded as
46067 a 3 insn sequence. While that is one insn longer,
46068 it doesn't need a memory operand, and in the common
46069 case where both the interleave low and interleave high
46070 permutations with the same operands are adjacent, the
46071 pair needs only 4 insns after CSE. */
46072 if (expand_vec_perm_interleave3 (&dfinal))
46073 return false;
46075 else
46076 return false;
46078 /* Examine where the elements come from. */
46079 contents = 0;
46080 for (i = 0; i < nelt; ++i)
46081 contents |= HOST_WIDE_INT_1U << d->perm[i];
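/* contents is a 2*nelt-bit map of the input elements the permutation
   uses: bits 0..nelt-1 for op0, bits nelt..2*nelt-1 for op1.  E.g. a
   V8HImode permutation that only uses elements 0-3 and 8-11 sets only
   the bits of the two low halves (h1 | h3 below) and can be handled
   with punpcklwd.  */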
46083 memset (remap, 0xff, sizeof (remap));
46084 dremap = *d;
46086 if (GET_MODE_SIZE (d->vmode) == 16)
46088 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46090 /* Split the two input vectors into 4 halves. */
46091 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46092 h2 = h1 << nelt2;
46093 h3 = h2 << nelt2;
46094 h4 = h3 << nelt2;
46096 /* If the elements all come from the low halves, use interleave low, and
46097 similarly interleave high for the high halves. If the elements come from
46098 mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
46099 if ((contents & (h1 | h3)) == contents)
46101 /* punpckl* */
46102 for (i = 0; i < nelt2; ++i)
46104 remap[i] = i * 2;
46105 remap[i + nelt] = i * 2 + 1;
46106 dremap.perm[i * 2] = i;
46107 dremap.perm[i * 2 + 1] = i + nelt;
46109 if (!TARGET_SSE2 && d->vmode == V4SImode)
46110 dremap.vmode = V4SFmode;
46112 else if ((contents & (h2 | h4)) == contents)
46114 /* punpckh* */
46115 for (i = 0; i < nelt2; ++i)
46117 remap[i + nelt2] = i * 2;
46118 remap[i + nelt + nelt2] = i * 2 + 1;
46119 dremap.perm[i * 2] = i + nelt2;
46120 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46122 if (!TARGET_SSE2 && d->vmode == V4SImode)
46123 dremap.vmode = V4SFmode;
46125 else if ((contents & (h1 | h4)) == contents)
46127 /* shufps */
46128 for (i = 0; i < nelt2; ++i)
46130 remap[i] = i;
46131 remap[i + nelt + nelt2] = i + nelt2;
46132 dremap.perm[i] = i;
46133 dremap.perm[i + nelt2] = i + nelt + nelt2;
46135 if (nelt != 4)
46137 /* shufpd */
46138 dremap.vmode = V2DImode;
46139 dremap.nelt = 2;
46140 dremap.perm[0] = 0;
46141 dremap.perm[1] = 3;
46144 else if ((contents & (h2 | h3)) == contents)
46146 /* shufps */
46147 for (i = 0; i < nelt2; ++i)
46149 remap[i + nelt2] = i;
46150 remap[i + nelt] = i + nelt2;
46151 dremap.perm[i] = i + nelt2;
46152 dremap.perm[i + nelt2] = i + nelt;
46154 if (nelt != 4)
46156 /* shufpd */
46157 dremap.vmode = V2DImode;
46158 dremap.nelt = 2;
46159 dremap.perm[0] = 1;
46160 dremap.perm[1] = 2;
46163 else
46164 return false;
46166 else
46168 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46169 unsigned HOST_WIDE_INT q[8];
46170 unsigned int nonzero_halves[4];
46172 /* Split the two input vectors into 8 quarters. */
46173 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46174 for (i = 1; i < 8; ++i)
46175 q[i] = q[0] << (nelt4 * i);
46176 for (i = 0; i < 4; ++i)
46177 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46179 nonzero_halves[nzcnt] = i;
46180 ++nzcnt;
46183 if (nzcnt == 1)
46185 gcc_assert (d->one_operand_p);
46186 nonzero_halves[1] = nonzero_halves[0];
46187 same_halves = true;
46189 else if (d->one_operand_p)
46191 gcc_assert (nonzero_halves[0] == 0);
46192 gcc_assert (nonzero_halves[1] == 1);
46195 if (nzcnt <= 2)
46197 if (d->perm[0] / nelt2 == nonzero_halves[1])
46199 /* Attempt to increase the likelihood that dfinal
46200 shuffle will be intra-lane. */
46201 std::swap (nonzero_halves[0], nonzero_halves[1]);
46204 /* vperm2f128 or vperm2i128. */
46205 for (i = 0; i < nelt2; ++i)
46207 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46208 remap[i + nonzero_halves[0] * nelt2] = i;
46209 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46210 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46213 if (d->vmode != V8SFmode
46214 && d->vmode != V4DFmode
46215 && d->vmode != V8SImode)
46217 dremap.vmode = V8SImode;
46218 dremap.nelt = 8;
46219 for (i = 0; i < 4; ++i)
46221 dremap.perm[i] = i + nonzero_halves[0] * 4;
46222 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46226 else if (d->one_operand_p)
46227 return false;
46228 else if (TARGET_AVX2
46229 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46231 /* vpunpckl* */
46232 for (i = 0; i < nelt4; ++i)
46234 remap[i] = i * 2;
46235 remap[i + nelt] = i * 2 + 1;
46236 remap[i + nelt2] = i * 2 + nelt2;
46237 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46238 dremap.perm[i * 2] = i;
46239 dremap.perm[i * 2 + 1] = i + nelt;
46240 dremap.perm[i * 2 + nelt2] = i + nelt2;
46241 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46244 else if (TARGET_AVX2
46245 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46247 /* vpunpckh* */
46248 for (i = 0; i < nelt4; ++i)
46250 remap[i + nelt4] = i * 2;
46251 remap[i + nelt + nelt4] = i * 2 + 1;
46252 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46253 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46254 dremap.perm[i * 2] = i + nelt4;
46255 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46256 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46257 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46260 else
46261 return false;
46264 /* Use the remapping array set up above to move the elements from their
46265 swizzled locations into their final destinations. */
46266 dfinal = *d;
46267 for (i = 0; i < nelt; ++i)
46269 unsigned e = remap[d->perm[i]];
46270 gcc_assert (e < nelt);
46271 /* If same_halves is true, both halves of the remapped vector are the
46272 same. Avoid cross-lane accesses if possible. */
46273 if (same_halves && i >= nelt2)
46275 gcc_assert (e < nelt2);
46276 dfinal.perm[i] = e + nelt2;
46278 else
46279 dfinal.perm[i] = e;
46281 if (!d->testing_p)
46283 dremap.target = gen_reg_rtx (dremap.vmode);
46284 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46286 dfinal.op1 = dfinal.op0;
46287 dfinal.one_operand_p = true;
46289 /* Test if the final remap can be done with a single insn. For V4SFmode or
46290 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46291 start_sequence ();
46292 ok = expand_vec_perm_1 (&dfinal);
46293 seq = get_insns ();
46294 end_sequence ();
46296 if (!ok)
46297 return false;
46299 if (d->testing_p)
46300 return true;
46302 if (dremap.vmode != dfinal.vmode)
46304 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46305 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46308 ok = expand_vec_perm_1 (&dremap);
46309 gcc_assert (ok);
46311 emit_insn (seq);
46312 return true;
46315 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46316 a single vector cross-lane permutation into vpermq followed
46317 by any of the single insn permutations. */
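/* As an illustrative sketch (hypothetical input): a one-operand V16HImode
permutation whose low half only reads from 64-bit quarters 0 and 3 of the
input and whose high half only reads from quarters 1 and 2 passes the
checks below; the vpermq then gathers those quarters as { 0 3 1 2 } and a
single in-lane shuffle finishes the job. */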
46319 static bool
46320 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46322 struct expand_vec_perm_d dremap, dfinal;
46323 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46324 unsigned contents[2];
46325 bool ok;
46327 if (!(TARGET_AVX2
46328 && (d->vmode == V32QImode || d->vmode == V16HImode)
46329 && d->one_operand_p))
46330 return false;
46332 contents[0] = 0;
46333 contents[1] = 0;
46334 for (i = 0; i < nelt2; ++i)
46336 contents[0] |= 1u << (d->perm[i] / nelt4);
46337 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
46340 for (i = 0; i < 2; ++i)
46342 unsigned int cnt = 0;
46343 for (j = 0; j < 4; ++j)
46344 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46345 return false;
46348 if (d->testing_p)
46349 return true;
46351 dremap = *d;
46352 dremap.vmode = V4DImode;
46353 dremap.nelt = 4;
46354 dremap.target = gen_reg_rtx (V4DImode);
46355 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46356 dremap.op1 = dremap.op0;
46357 dremap.one_operand_p = true;
46358 for (i = 0; i < 2; ++i)
46360 unsigned int cnt = 0;
46361 for (j = 0; j < 4; ++j)
46362 if ((contents[i] & (1u << j)) != 0)
46363 dremap.perm[2 * i + cnt++] = j;
46364 for (; cnt < 2; ++cnt)
46365 dremap.perm[2 * i + cnt] = 0;
46368 dfinal = *d;
46369 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46370 dfinal.op1 = dfinal.op0;
46371 dfinal.one_operand_p = true;
46372 for (i = 0, j = 0; i < nelt; ++i)
46374 if (i == nelt2)
46375 j = 2;
46376 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46377 if ((d->perm[i] / nelt4) == dremap.perm[j])
46379 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46380 dfinal.perm[i] |= nelt4;
46381 else
46382 gcc_unreachable ();
46385 ok = expand_vec_perm_1 (&dremap);
46386 gcc_assert (ok);
46388 ok = expand_vec_perm_1 (&dfinal);
46389 gcc_assert (ok);
46391 return true;
46394 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
46395 a vector permutation using two instructions, vperm2f128 resp.
46396 vperm2i128 followed by any single in-lane permutation. */
46398 static bool
46399 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46401 struct expand_vec_perm_d dfirst, dsecond;
46402 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46403 bool ok;
46405 if (!TARGET_AVX
46406 || GET_MODE_SIZE (d->vmode) != 32
46407 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46408 return false;
46410 dsecond = *d;
46411 dsecond.one_operand_p = false;
46412 dsecond.testing_p = true;
46414 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46415 immediate. For perm < 16 the second permutation uses
46416 d->op0 as first operand, for perm >= 16 it uses d->op1
46417 as first operand. The second operand is the result of
46418 vperm2[fi]128. */
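/* As an illustrative example (hypothetical value): for V4DFmode with
perm == 9, bits [1:0] == 1 and bits [3:2] == 2 select the source lanes of
the vperm2[fi]128 result, where lanes 0 and 1 are the lanes of d->op0 and
lanes 2 and 3 the lanes of d->op1; dfirst.perm below then becomes
{ 2 3 4 5 }, i.e. the high lane of d->op0 followed by the low lane of
d->op1. */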
46419 for (perm = 0; perm < 32; perm++)
46421 /* Ignore permutations which do not move anything cross-lane. */
46422 if (perm < 16)
46424 /* The second shuffle for e.g. V4DFmode has
46425 0123 and ABCD operands.
46426 Ignore AB23, as 23 is already in the second lane
46427 of the first operand. */
46428 if ((perm & 0xc) == (1 << 2)) continue;
46429 /* And 01CD, as 01 is in the first lane of the first
46430 operand. */
46431 if ((perm & 3) == 0) continue;
46432 /* And 4567, as then the vperm2[fi]128 doesn't change
46433 anything on the original 4567 second operand. */
46434 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46436 else
46438 /* The second shuffle for e.g. V4DFmode has
46439 4567 and ABCD operands.
46440 Ignore AB67, as 67 is already in the second lane
46441 of the first operand. */
46442 if ((perm & 0xc) == (3 << 2)) continue;
46443 /* And 45CD, as 45 is in the first lane of the first
46444 operand. */
46445 if ((perm & 3) == 2) continue;
46446 /* And 0123, as then the vperm2[fi]128 doesn't change
46447 anything on the original 0123 first operand. */
46448 if ((perm & 0xf) == (1 << 2)) continue;
46451 for (i = 0; i < nelt; i++)
46453 j = d->perm[i] / nelt2;
46454 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46455 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46456 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46457 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46458 else
46459 break;
46462 if (i == nelt)
46464 start_sequence ();
46465 ok = expand_vec_perm_1 (&dsecond);
46466 end_sequence ();
46468 else
46469 ok = false;
46471 if (ok)
46473 if (d->testing_p)
46474 return true;
46476 /* Found a usable second shuffle. dfirst will be
46477 vperm2f128 on d->op0 and d->op1. */
46478 dsecond.testing_p = false;
46479 dfirst = *d;
46480 dfirst.target = gen_reg_rtx (d->vmode);
46481 for (i = 0; i < nelt; i++)
46482 dfirst.perm[i] = (i & (nelt2 - 1))
46483 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46485 canonicalize_perm (&dfirst);
46486 ok = expand_vec_perm_1 (&dfirst);
46487 gcc_assert (ok);
46489 /* And dsecond is some single insn shuffle, taking
46490 d->op0 and result of vperm2f128 (if perm < 16) or
46491 d->op1 and result of vperm2f128 (otherwise). */
46492 if (perm >= 16)
46493 dsecond.op0 = dsecond.op1;
46494 dsecond.op1 = dfirst.target;
46496 ok = expand_vec_perm_1 (&dsecond);
46497 gcc_assert (ok);
46499 return true;
46502 /* For one operand, the only useful vperm2f128 permutation is 0x01
46503 aka lanes swap. */
46504 if (d->one_operand_p)
46505 return false;
46508 return false;
46511 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46512 a two vector permutation using 2 intra-lane interleave insns
46513 and cross-lane shuffle for 32-byte vectors. */
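/* As an illustrative example (hypothetical input): for V8SImode the
permutation { 0 8 1 9 2 10 3 11 } (d->perm[0] == 0) matches the test below
and is emitted as vec_interleave_lowv8si, while { 4 12 5 13 6 14 7 15 }
(d->perm[0] == nelt / 2) is emitted as vec_interleave_highv8si. */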
46515 static bool
46516 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46518 unsigned i, nelt;
46519 rtx (*gen) (rtx, rtx, rtx);
46521 if (d->one_operand_p)
46522 return false;
46523 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46525 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46527 else
46528 return false;
46530 nelt = d->nelt;
46531 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46532 return false;
46533 for (i = 0; i < nelt; i += 2)
46534 if (d->perm[i] != d->perm[0] + i / 2
46535 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46536 return false;
46538 if (d->testing_p)
46539 return true;
46541 switch (d->vmode)
46543 case E_V32QImode:
46544 if (d->perm[0])
46545 gen = gen_vec_interleave_highv32qi;
46546 else
46547 gen = gen_vec_interleave_lowv32qi;
46548 break;
46549 case E_V16HImode:
46550 if (d->perm[0])
46551 gen = gen_vec_interleave_highv16hi;
46552 else
46553 gen = gen_vec_interleave_lowv16hi;
46554 break;
46555 case E_V8SImode:
46556 if (d->perm[0])
46557 gen = gen_vec_interleave_highv8si;
46558 else
46559 gen = gen_vec_interleave_lowv8si;
46560 break;
46561 case E_V4DImode:
46562 if (d->perm[0])
46563 gen = gen_vec_interleave_highv4di;
46564 else
46565 gen = gen_vec_interleave_lowv4di;
46566 break;
46567 case E_V8SFmode:
46568 if (d->perm[0])
46569 gen = gen_vec_interleave_highv8sf;
46570 else
46571 gen = gen_vec_interleave_lowv8sf;
46572 break;
46573 case E_V4DFmode:
46574 if (d->perm[0])
46575 gen = gen_vec_interleave_highv4df;
46576 else
46577 gen = gen_vec_interleave_lowv4df;
46578 break;
46579 default:
46580 gcc_unreachable ();
46583 emit_insn (gen (d->target, d->op0, d->op1));
46584 return true;
46587 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
46588 a single vector permutation using a single intra-lane vector
46589 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46590 the non-swapped and swapped vectors together. */
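/* As an illustrative example (hypothetical input): for the one-operand
V4DFmode permutation { 2 1 0 3 }, dfirst below degenerates to the
identity, dsecond swaps the two lanes (giving { 2 3 0 1 }) and the final
vblendpd with mask 0x5 takes elements 0 and 2 from the swapped copy,
producing { 2 1 0 3 }. */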
46592 static bool
46593 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46595 struct expand_vec_perm_d dfirst, dsecond;
46596 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46597 rtx_insn *seq;
46598 bool ok;
46599 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46601 if (!TARGET_AVX
46602 || TARGET_AVX2
46603 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46604 || !d->one_operand_p)
46605 return false;
46607 dfirst = *d;
46608 for (i = 0; i < nelt; i++)
46609 dfirst.perm[i] = 0xff;
46610 for (i = 0, msk = 0; i < nelt; i++)
46612 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46613 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46614 return false;
46615 dfirst.perm[j] = d->perm[i];
46616 if (j != i)
46617 msk |= (1 << i);
46619 for (i = 0; i < nelt; i++)
46620 if (dfirst.perm[i] == 0xff)
46621 dfirst.perm[i] = i;
46623 if (!d->testing_p)
46624 dfirst.target = gen_reg_rtx (dfirst.vmode);
46626 start_sequence ();
46627 ok = expand_vec_perm_1 (&dfirst);
46628 seq = get_insns ();
46629 end_sequence ();
46631 if (!ok)
46632 return false;
46634 if (d->testing_p)
46635 return true;
46637 emit_insn (seq);
46639 dsecond = *d;
46640 dsecond.op0 = dfirst.target;
46641 dsecond.op1 = dfirst.target;
46642 dsecond.one_operand_p = true;
46643 dsecond.target = gen_reg_rtx (dsecond.vmode);
46644 for (i = 0; i < nelt; i++)
46645 dsecond.perm[i] = i ^ nelt2;
46647 ok = expand_vec_perm_1 (&dsecond);
46648 gcc_assert (ok);
46650 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46651 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46652 return true;
46655 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
46656 permutation using two vperm2f128, followed by a vshufpd insn blending
46657 the two vectors together. */
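/* As an illustrative example (hypothetical input): for d->perm ==
{ 1 6 3 4 }, dfirst becomes { 0 1 2 3 } (d->op0 unchanged), dsecond
becomes { 6 7 4 5 } (the lanes of d->op1 swapped) and dthird selects
{ 1 4 3 6 } from those two intermediates, which is a valid vshufpd
picking one element from each input in every lane. */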
46659 static bool
46660 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46662 struct expand_vec_perm_d dfirst, dsecond, dthird;
46663 bool ok;
46665 if (!TARGET_AVX || (d->vmode != V4DFmode))
46666 return false;
46668 if (d->testing_p)
46669 return true;
46671 dfirst = *d;
46672 dsecond = *d;
46673 dthird = *d;
46675 dfirst.perm[0] = (d->perm[0] & ~1);
46676 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46677 dfirst.perm[2] = (d->perm[2] & ~1);
46678 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46679 dsecond.perm[0] = (d->perm[1] & ~1);
46680 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46681 dsecond.perm[2] = (d->perm[3] & ~1);
46682 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46683 dthird.perm[0] = (d->perm[0] % 2);
46684 dthird.perm[1] = (d->perm[1] % 2) + 4;
46685 dthird.perm[2] = (d->perm[2] % 2) + 2;
46686 dthird.perm[3] = (d->perm[3] % 2) + 6;
46688 dfirst.target = gen_reg_rtx (dfirst.vmode);
46689 dsecond.target = gen_reg_rtx (dsecond.vmode);
46690 dthird.op0 = dfirst.target;
46691 dthird.op1 = dsecond.target;
46692 dthird.one_operand_p = false;
46694 canonicalize_perm (&dfirst);
46695 canonicalize_perm (&dsecond);
46697 ok = expand_vec_perm_1 (&dfirst)
46698 && expand_vec_perm_1 (&dsecond)
46699 && expand_vec_perm_1 (&dthird);
46701 gcc_assert (ok);
46703 return true;
46706 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46707 permutation with two pshufb insns and an ior. We should have already
46708 failed all two instruction sequences. */
46710 static bool
46711 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46713 rtx rperm[2][16], vperm, l, h, op, m128;
46714 unsigned int i, nelt, eltsz;
46716 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46717 return false;
46718 gcc_assert (!d->one_operand_p);
46720 if (d->testing_p)
46721 return true;
46723 nelt = d->nelt;
46724 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46726 /* Generate two permutation masks. If the required element is within
46727 the given vector it is shuffled into the proper lane. If the required
46728 element is in the other vector, force a zero into the lane by setting
46729 bit 7 in the permutation mask. */
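/* For illustration (hypothetical input): for V16QImode with d->perm
starting { 0 17 2 19 ... }, the mask applied to d->op0 starts
{ 0 -128 2 -128 ... } and the mask applied to d->op1 starts
{ -128 1 -128 3 ... }; the final ior of the two pshufb results yields
the requested permutation. */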
46730 m128 = GEN_INT (-128);
46731 for (i = 0; i < nelt; ++i)
46733 unsigned j, e = d->perm[i];
46734 unsigned which = (e >= nelt);
46735 if (e >= nelt)
46736 e -= nelt;
46738 for (j = 0; j < eltsz; ++j)
46740 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46741 rperm[1-which][i*eltsz + j] = m128;
46745 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46746 vperm = force_reg (V16QImode, vperm);
46748 l = gen_reg_rtx (V16QImode);
46749 op = gen_lowpart (V16QImode, d->op0);
46750 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46752 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46753 vperm = force_reg (V16QImode, vperm);
46755 h = gen_reg_rtx (V16QImode);
46756 op = gen_lowpart (V16QImode, d->op1);
46757 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46759 op = d->target;
46760 if (d->vmode != V16QImode)
46761 op = gen_reg_rtx (V16QImode);
46762 emit_insn (gen_iorv16qi3 (op, l, h));
46763 if (op != d->target)
46764 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46766 return true;
46769 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
46770 with two vpshufb insns, vpermq and vpor. We should have already failed
46771 all two or three instruction sequences. */
46773 static bool
46774 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46776 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46777 unsigned int i, nelt, eltsz;
46779 if (!TARGET_AVX2
46780 || !d->one_operand_p
46781 || (d->vmode != V32QImode && d->vmode != V16HImode))
46782 return false;
46784 if (d->testing_p)
46785 return true;
46787 nelt = d->nelt;
46788 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46790 /* Generate two permutation masks. If the required element is within
46791 the same lane, it is shuffled in directly. If the required element is
46792 from the other lane, force a zero by setting bit 7 in the permutation mask.
46793 The other mask has a non-negative element wherever an element is
46794 requested from the other lane, but placed at the other lane's position,
46795 so that once the two V2TImode halves of the vpshufb result are
46796 swapped it ends up in the right place. */
46797 m128 = GEN_INT (-128);
46798 for (i = 0; i < nelt; ++i)
46800 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46801 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46803 for (j = 0; j < eltsz; ++j)
46805 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46806 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46810 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46811 vperm = force_reg (V32QImode, vperm);
46813 h = gen_reg_rtx (V32QImode);
46814 op = gen_lowpart (V32QImode, d->op0);
46815 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46817 /* Swap the 128-bit lanes of h into hp. */
46818 hp = gen_reg_rtx (V4DImode);
46819 op = gen_lowpart (V4DImode, h);
46820 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46821 const1_rtx));
46823 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46824 vperm = force_reg (V32QImode, vperm);
46826 l = gen_reg_rtx (V32QImode);
46827 op = gen_lowpart (V32QImode, d->op0);
46828 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46830 op = d->target;
46831 if (d->vmode != V32QImode)
46832 op = gen_reg_rtx (V32QImode);
46833 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46834 if (op != d->target)
46835 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46837 return true;
46840 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46841 and extract-odd permutations of two V32QImode or V16HImode operands
46842 with two vpshufb insns, vpor and vpermq. We should have already
46843 failed all two or three instruction sequences. */
46845 static bool
46846 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46848 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46849 unsigned int i, nelt, eltsz;
46851 if (!TARGET_AVX2
46852 || d->one_operand_p
46853 || (d->vmode != V32QImode && d->vmode != V16HImode))
46854 return false;
46856 for (i = 0; i < d->nelt; ++i)
46857 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46858 return false;
46860 if (d->testing_p)
46861 return true;
46863 nelt = d->nelt;
46864 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46866 /* Generate two permutation masks. In the first permutation mask
46867 the first quarter will contain indexes for the first half
46868 of the op0, the second quarter will contain bit 7 set, third quarter
46869 will contain indexes for the second half of the op0 and the
46870 last quarter bit 7 set. In the second permutation mask
46871 the first quarter will contain bit 7 set, the second quarter
46872 indexes for the first half of the op1, the third quarter bit 7 set
46873 and last quarter indexes for the second half of the op1.
46874 I.e. the first mask e.g. for V32QImode extract even will be:
46875 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46876 (all values masked with 0xf except for -128) and second mask
46877 for extract even will be
46878 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46879 m128 = GEN_INT (-128);
46880 for (i = 0; i < nelt; ++i)
46882 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46883 unsigned which = d->perm[i] >= nelt;
46884 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
46886 for (j = 0; j < eltsz; ++j)
46888 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46889 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46893 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46894 vperm = force_reg (V32QImode, vperm);
46896 l = gen_reg_rtx (V32QImode);
46897 op = gen_lowpart (V32QImode, d->op0);
46898 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46900 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46901 vperm = force_reg (V32QImode, vperm);
46903 h = gen_reg_rtx (V32QImode);
46904 op = gen_lowpart (V32QImode, d->op1);
46905 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46907 ior = gen_reg_rtx (V32QImode);
46908 emit_insn (gen_iorv32qi3 (ior, l, h));
46910 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46911 op = gen_reg_rtx (V4DImode);
46912 ior = gen_lowpart (V4DImode, ior);
46913 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46914 const1_rtx, GEN_INT (3)));
46915 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46917 return true;
46920 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46921 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46922 with two "and" and "pack" or two "shift" and "pack" insns. We should
46923 have already failed all two instruction sequences. */
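/* As an illustrative sketch (hypothetical input): to extract the even
bytes of two V16QImode operands, each operand is viewed as V8HImode and
masked with 0x00ff per word ("and"), and the two intermediates are
combined with packuswb; for the odd bytes the "and" is replaced by a
logical right shift of each word by 8 before the pack. */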
46925 static bool
46926 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46928 rtx op, dop0, dop1, t;
46929 unsigned i, odd, c, s, nelt = d->nelt;
46930 bool end_perm = false;
46931 machine_mode half_mode;
46932 rtx (*gen_and) (rtx, rtx, rtx);
46933 rtx (*gen_pack) (rtx, rtx, rtx);
46934 rtx (*gen_shift) (rtx, rtx, rtx);
46936 if (d->one_operand_p)
46937 return false;
46939 switch (d->vmode)
46941 case E_V8HImode:
46942 /* Required for "pack". */
46943 if (!TARGET_SSE4_1)
46944 return false;
46945 c = 0xffff;
46946 s = 16;
46947 half_mode = V4SImode;
46948 gen_and = gen_andv4si3;
46949 gen_pack = gen_sse4_1_packusdw;
46950 gen_shift = gen_lshrv4si3;
46951 break;
46952 case E_V16QImode:
46953 /* No check as all instructions are SSE2. */
46954 c = 0xff;
46955 s = 8;
46956 half_mode = V8HImode;
46957 gen_and = gen_andv8hi3;
46958 gen_pack = gen_sse2_packuswb;
46959 gen_shift = gen_lshrv8hi3;
46960 break;
46961 case E_V16HImode:
46962 if (!TARGET_AVX2)
46963 return false;
46964 c = 0xffff;
46965 s = 16;
46966 half_mode = V8SImode;
46967 gen_and = gen_andv8si3;
46968 gen_pack = gen_avx2_packusdw;
46969 gen_shift = gen_lshrv8si3;
46970 end_perm = true;
46971 break;
46972 case E_V32QImode:
46973 if (!TARGET_AVX2)
46974 return false;
46975 c = 0xff;
46976 s = 8;
46977 half_mode = V16HImode;
46978 gen_and = gen_andv16hi3;
46979 gen_pack = gen_avx2_packuswb;
46980 gen_shift = gen_lshrv16hi3;
46981 end_perm = true;
46982 break;
46983 default:
46984 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
46985 general shuffles. */
46986 return false;
46989 /* Check that permutation is even or odd. */
46990 odd = d->perm[0];
46991 if (odd > 1)
46992 return false;
46994 for (i = 1; i < nelt; ++i)
46995 if (d->perm[i] != 2 * i + odd)
46996 return false;
46998 if (d->testing_p)
46999 return true;
47001 dop0 = gen_reg_rtx (half_mode);
47002 dop1 = gen_reg_rtx (half_mode);
47003 if (odd == 0)
47005 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
47006 t = force_reg (half_mode, t);
47007 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47008 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47010 else
47012 emit_insn (gen_shift (dop0,
47013 gen_lowpart (half_mode, d->op0),
47014 GEN_INT (s)));
47015 emit_insn (gen_shift (dop1,
47016 gen_lowpart (half_mode, d->op1),
47017 GEN_INT (s)));
47019 /* For the AVX2 256-bit case we need to permute the pack result. */
47020 if (TARGET_AVX2 && end_perm)
47022 op = gen_reg_rtx (d->vmode);
47023 t = gen_reg_rtx (V4DImode);
47024 emit_insn (gen_pack (op, dop0, dop1));
47025 emit_insn (gen_avx2_permv4di_1 (t,
47026 gen_lowpart (V4DImode, op),
47027 const0_rtx,
47028 const2_rtx,
47029 const1_rtx,
47030 GEN_INT (3)));
47031 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47033 else
47034 emit_insn (gen_pack (d->target, dop0, dop1));
47036 return true;
47039 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47040 and extract-odd permutations of two V64QI operands
47041 with two "shift" insns, two "trunc" insns and one "concat" insn for "odd"
47042 and two "trunc" insns and one "concat" insn for "even".
47043 We should have already failed all two-instruction sequences. */
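/* As an illustrative sketch of the sequence below: for the even bytes,
truncating each operand from V32HImode to V32QImode already keeps the low
(even) byte of every word; for the odd bytes each operand is first shifted
right by 8 so the odd bytes move down, and the two truncated halves are
then concatenated into the V64QImode result. */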
47045 static bool
47046 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47048 rtx t1, t2, t3, t4;
47049 unsigned i, odd, nelt = d->nelt;
47051 if (!TARGET_AVX512BW
47052 || d->one_operand_p
47053 || d->vmode != V64QImode)
47054 return false;
47056 /* Check that permutation is even or odd. */
47057 odd = d->perm[0];
47058 if (odd > 1)
47059 return false;
47061 for (i = 1; i < nelt; ++i)
47062 if (d->perm[i] != 2 * i + odd)
47063 return false;
47065 if (d->testing_p)
47066 return true;
47069 if (odd)
47071 t1 = gen_reg_rtx (V32HImode);
47072 t2 = gen_reg_rtx (V32HImode);
47073 emit_insn (gen_lshrv32hi3 (t1,
47074 gen_lowpart (V32HImode, d->op0),
47075 GEN_INT (8)));
47076 emit_insn (gen_lshrv32hi3 (t2,
47077 gen_lowpart (V32HImode, d->op1),
47078 GEN_INT (8)));
47080 else
47082 t1 = gen_lowpart (V32HImode, d->op0);
47083 t2 = gen_lowpart (V32HImode, d->op1);
47086 t3 = gen_reg_rtx (V32QImode);
47087 t4 = gen_reg_rtx (V32QImode);
47088 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47089 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47090 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47092 return true;
47095 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47096 and extract-odd permutations. */
47098 static bool
47099 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47101 rtx t1, t2, t3, t4, t5;
47103 switch (d->vmode)
47105 case E_V4DFmode:
47106 if (d->testing_p)
47107 break;
47108 t1 = gen_reg_rtx (V4DFmode);
47109 t2 = gen_reg_rtx (V4DFmode);
47111 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47112 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47113 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47115 /* Now an unpck[lh]pd will produce the result required. */
47116 if (odd)
47117 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47118 else
47119 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47120 emit_insn (t3);
47121 break;
47123 case E_V8SFmode:
47125 int mask = odd ? 0xdd : 0x88;
47127 if (d->testing_p)
47128 break;
47129 t1 = gen_reg_rtx (V8SFmode);
47130 t2 = gen_reg_rtx (V8SFmode);
47131 t3 = gen_reg_rtx (V8SFmode);
47133 /* Shuffle within the 128-bit lanes to produce:
47134 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47135 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47136 GEN_INT (mask)));
47138 /* Shuffle the lanes around to produce:
47139 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47140 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47141 GEN_INT (0x3)));
47143 /* Shuffle within the 128-bit lanes to produce:
47144 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47145 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47147 /* Shuffle within the 128-bit lanes to produce:
47148 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47149 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47151 /* Shuffle the lanes around to produce:
47152 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47153 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47154 GEN_INT (0x20)));
47156 break;
47158 case E_V2DFmode:
47159 case E_V4SFmode:
47160 case E_V2DImode:
47161 case E_V4SImode:
47162 /* These are always directly implementable by expand_vec_perm_1. */
47163 gcc_unreachable ();
47165 case E_V8HImode:
47166 if (TARGET_SSE4_1)
47167 return expand_vec_perm_even_odd_pack (d);
47168 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47169 return expand_vec_perm_pshufb2 (d);
47170 else
47172 if (d->testing_p)
47173 break;
47174 /* We need 2*log2(N)-1 operations to achieve odd/even
47175 with interleave. */
47176 t1 = gen_reg_rtx (V8HImode);
47177 t2 = gen_reg_rtx (V8HImode);
47178 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47179 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47180 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47181 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47182 if (odd)
47183 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47184 else
47185 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47186 emit_insn (t3);
47188 break;
47190 case E_V16QImode:
47191 return expand_vec_perm_even_odd_pack (d);
47193 case E_V16HImode:
47194 case E_V32QImode:
47195 return expand_vec_perm_even_odd_pack (d);
47197 case E_V64QImode:
47198 return expand_vec_perm_even_odd_trunc (d);
47200 case E_V4DImode:
47201 if (!TARGET_AVX2)
47203 struct expand_vec_perm_d d_copy = *d;
47204 d_copy.vmode = V4DFmode;
47205 if (d->testing_p)
47206 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47207 else
47208 d_copy.target = gen_reg_rtx (V4DFmode);
47209 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47210 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47211 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47213 if (!d->testing_p)
47214 emit_move_insn (d->target,
47215 gen_lowpart (V4DImode, d_copy.target));
47216 return true;
47218 return false;
47221 if (d->testing_p)
47222 break;
47224 t1 = gen_reg_rtx (V4DImode);
47225 t2 = gen_reg_rtx (V4DImode);
47227 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47228 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47229 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47231 /* Now a vpunpck[lh]qdq will produce the required result. */
47232 if (odd)
47233 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47234 else
47235 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47236 emit_insn (t3);
47237 break;
47239 case E_V8SImode:
47240 if (!TARGET_AVX2)
47242 struct expand_vec_perm_d d_copy = *d;
47243 d_copy.vmode = V8SFmode;
47244 if (d->testing_p)
47245 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47246 else
47247 d_copy.target = gen_reg_rtx (V8SFmode);
47248 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47249 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47250 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47252 if (!d->testing_p)
47253 emit_move_insn (d->target,
47254 gen_lowpart (V8SImode, d_copy.target));
47255 return true;
47257 return false;
47260 if (d->testing_p)
47261 break;
47263 t1 = gen_reg_rtx (V8SImode);
47264 t2 = gen_reg_rtx (V8SImode);
47265 t3 = gen_reg_rtx (V4DImode);
47266 t4 = gen_reg_rtx (V4DImode);
47267 t5 = gen_reg_rtx (V4DImode);
47269 /* Shuffle the lanes around into
47270 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47271 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47272 gen_lowpart (V4DImode, d->op1),
47273 GEN_INT (0x20)));
47274 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47275 gen_lowpart (V4DImode, d->op1),
47276 GEN_INT (0x31)));
47278 /* Swap the 2nd and 3rd position in each lane into
47279 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47280 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47281 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47282 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47283 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47285 /* Now a vpunpck[lh]qdq will produce
47286 { 0 2 4 6 8 a c e } or { 1 3 5 7 9 b d f } respectively. */
47287 if (odd)
47288 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47289 gen_lowpart (V4DImode, t2));
47290 else
47291 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47292 gen_lowpart (V4DImode, t2));
47293 emit_insn (t3);
47294 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47295 break;
47297 default:
47298 gcc_unreachable ();
47301 return true;
47304 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47305 extract-even and extract-odd permutations. */
47307 static bool
47308 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47310 unsigned i, odd, nelt = d->nelt;
47312 odd = d->perm[0];
47313 if (odd != 0 && odd != 1)
47314 return false;
47316 for (i = 1; i < nelt; ++i)
47317 if (d->perm[i] != 2 * i + odd)
47318 return false;
47320 return expand_vec_perm_even_odd_1 (d, odd);
47323 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
47324 permutations. We assume that expand_vec_perm_1 has already failed. */
47326 static bool
47327 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47329 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47330 machine_mode vmode = d->vmode;
47331 unsigned char perm2[4];
47332 rtx op0 = d->op0, dest;
47333 bool ok;
47335 switch (vmode)
47337 case E_V4DFmode:
47338 case E_V8SFmode:
47339 /* These are special-cased in sse.md so that we can optionally
47340 use the vbroadcast instruction. They expand to two insns
47341 if the input happens to be in a register. */
47342 gcc_unreachable ();
47344 case E_V2DFmode:
47345 case E_V2DImode:
47346 case E_V4SFmode:
47347 case E_V4SImode:
47348 /* These are always implementable using standard shuffle patterns. */
47349 gcc_unreachable ();
47351 case E_V8HImode:
47352 case E_V16QImode:
47353 /* These can be implemented via interleave. We save one insn by
47354 stopping once we have promoted to V4SImode and then use pshufd. */
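/* As an illustrative example (hypothetical input): broadcasting element 5
of a V16QImode vector, the byte-level interleave-low leaves the value
duplicated in V8HImode element 5, the word-level interleave-high then
leaves it duplicated in V4SImode element 1, and the final pshufd with
{ 1 1 1 1 } replicates that dword across the vector. */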
47355 if (d->testing_p)
47356 return true;
47359 rtx dest;
47360 rtx (*gen) (rtx, rtx, rtx)
47361 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47362 : gen_vec_interleave_lowv8hi;
47364 if (elt >= nelt2)
47366 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47367 : gen_vec_interleave_highv8hi;
47368 elt -= nelt2;
47370 nelt2 /= 2;
47372 dest = gen_reg_rtx (vmode);
47373 emit_insn (gen (dest, op0, op0));
47374 vmode = get_mode_wider_vector (vmode);
47375 op0 = gen_lowpart (vmode, dest);
47377 while (vmode != V4SImode);
47379 memset (perm2, elt, 4);
47380 dest = gen_reg_rtx (V4SImode);
47381 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47382 gcc_assert (ok);
47383 if (!d->testing_p)
47384 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47385 return true;
47387 case E_V64QImode:
47388 case E_V32QImode:
47389 case E_V16HImode:
47390 case E_V8SImode:
47391 case E_V4DImode:
47392 /* For AVX2 broadcasts of the first element vpbroadcast* or
47393 vpermq should be used by expand_vec_perm_1. */
47394 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47395 return false;
47397 default:
47398 gcc_unreachable ();
47402 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47403 broadcast permutations. */
47405 static bool
47406 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47408 unsigned i, elt, nelt = d->nelt;
47410 if (!d->one_operand_p)
47411 return false;
47413 elt = d->perm[0];
47414 for (i = 1; i < nelt; ++i)
47415 if (d->perm[i] != elt)
47416 return false;
47418 return expand_vec_perm_broadcast_1 (d);
47421 /* Implement arbitrary permutations of two V64QImode operands
47422 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
47423 static bool
47424 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
47426 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47427 return false;
47429 if (d->testing_p)
47430 return true;
47432 struct expand_vec_perm_d ds[2];
47433 rtx rperm[128], vperm, target0, target1;
47434 unsigned int i, nelt;
47435 machine_mode vmode;
47437 nelt = d->nelt;
47438 vmode = V64QImode;
47440 for (i = 0; i < 2; i++)
47442 ds[i] = *d;
47443 ds[i].vmode = V32HImode;
47444 ds[i].nelt = 32;
47445 ds[i].target = gen_reg_rtx (V32HImode);
47446 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47447 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47450 /* Prepare the permutations such that the first one (ds[0]) takes
47451 care of putting the even bytes into the right positions, or one
47452 position higher, and the second one (ds[1]) takes care of
47453 putting the odd bytes into the right positions, or one position
47454 lower. */
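/* As an illustrative example (hypothetical input): if d->perm[0] == 5 then
ds[0].perm[0] == 2, so the word-level permutation brings source word 2
(bytes 4 and 5) into word position 0, and rperm[0] == 1 makes the
following vpshufb pick the high byte of that word, i.e. byte 5, into byte
position 0. */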
47456 for (i = 0; i < nelt; i++)
47458 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47459 if (i & 1)
47461 rperm[i] = constm1_rtx;
47462 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47464 else
47466 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47467 rperm[i + 64] = constm1_rtx;
47471 bool ok = expand_vec_perm_1 (&ds[0]);
47472 gcc_assert (ok);
47473 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47475 ok = expand_vec_perm_1 (&ds[1]);
47476 gcc_assert (ok);
47477 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47479 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47480 vperm = force_reg (vmode, vperm);
47481 target0 = gen_reg_rtx (V64QImode);
47482 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47484 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47485 vperm = force_reg (vmode, vperm);
47486 target1 = gen_reg_rtx (V64QImode);
47487 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47489 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47490 return true;
47493 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
47494 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47495 all the shorter instruction sequences. */
47497 static bool
47498 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47500 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47501 unsigned int i, nelt, eltsz;
47502 bool used[4];
47504 if (!TARGET_AVX2
47505 || d->one_operand_p
47506 || (d->vmode != V32QImode && d->vmode != V16HImode))
47507 return false;
47509 if (d->testing_p)
47510 return true;
47512 nelt = d->nelt;
47513 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47515 /* Generate 4 permutation masks. If the required element is within
47516 the same lane, it is shuffled in directly. If the required element is
47517 from the other lane, force a zero by setting bit 7 in the permutation mask.
47518 The other mask has a non-negative element wherever an element is
47519 requested from the other lane, but placed at the other lane's position,
47520 so that once the two V2TImode halves of the vpshufb result are
47521 swapped it ends up in the right place. */
47522 m128 = GEN_INT (-128);
47523 for (i = 0; i < 32; ++i)
47525 rperm[0][i] = m128;
47526 rperm[1][i] = m128;
47527 rperm[2][i] = m128;
47528 rperm[3][i] = m128;
47530 used[0] = false;
47531 used[1] = false;
47532 used[2] = false;
47533 used[3] = false;
47534 for (i = 0; i < nelt; ++i)
47536 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47537 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47538 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47540 for (j = 0; j < eltsz; ++j)
47541 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47542 used[which] = true;
47545 for (i = 0; i < 2; ++i)
47547 if (!used[2 * i + 1])
47549 h[i] = NULL_RTX;
47550 continue;
47552 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47553 gen_rtvec_v (32, rperm[2 * i + 1]));
47554 vperm = force_reg (V32QImode, vperm);
47555 h[i] = gen_reg_rtx (V32QImode);
47556 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47557 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47560 /* Swap the 128-bit lanes of h[X]. */
47561 for (i = 0; i < 2; ++i)
47563 if (h[i] == NULL_RTX)
47564 continue;
47565 op = gen_reg_rtx (V4DImode);
47566 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47567 const2_rtx, GEN_INT (3), const0_rtx,
47568 const1_rtx));
47569 h[i] = gen_lowpart (V32QImode, op);
47572 for (i = 0; i < 2; ++i)
47574 if (!used[2 * i])
47576 l[i] = NULL_RTX;
47577 continue;
47579 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47580 vperm = force_reg (V32QImode, vperm);
47581 l[i] = gen_reg_rtx (V32QImode);
47582 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47583 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47586 for (i = 0; i < 2; ++i)
47588 if (h[i] && l[i])
47590 op = gen_reg_rtx (V32QImode);
47591 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47592 l[i] = op;
47594 else if (h[i])
47595 l[i] = h[i];
47598 gcc_assert (l[0] && l[1]);
47599 op = d->target;
47600 if (d->vmode != V32QImode)
47601 op = gen_reg_rtx (V32QImode);
47602 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47603 if (op != d->target)
47604 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47605 return true;
47608 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
47609 taken care of, perform the expansion in D and return true on success. */
47611 static bool
47612 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47614 /* Try a single instruction expansion. */
47615 if (expand_vec_perm_1 (d))
47616 return true;
47618 /* Try sequences of two instructions. */
47620 if (expand_vec_perm_pshuflw_pshufhw (d))
47621 return true;
47623 if (expand_vec_perm_palignr (d, false))
47624 return true;
47626 if (expand_vec_perm_interleave2 (d))
47627 return true;
47629 if (expand_vec_perm_broadcast (d))
47630 return true;
47632 if (expand_vec_perm_vpermq_perm_1 (d))
47633 return true;
47635 if (expand_vec_perm_vperm2f128 (d))
47636 return true;
47638 if (expand_vec_perm_pblendv (d))
47639 return true;
47641 /* Try sequences of three instructions. */
47643 if (expand_vec_perm_even_odd_pack (d))
47644 return true;
47646 if (expand_vec_perm_2vperm2f128_vshuf (d))
47647 return true;
47649 if (expand_vec_perm_pshufb2 (d))
47650 return true;
47652 if (expand_vec_perm_interleave3 (d))
47653 return true;
47655 if (expand_vec_perm_vperm2f128_vblend (d))
47656 return true;
47658 /* Try sequences of four instructions. */
47660 if (expand_vec_perm_even_odd_trunc (d))
47661 return true;
47662 if (expand_vec_perm_vpshufb2_vpermq (d))
47663 return true;
47665 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47666 return true;
47668 if (expand_vec_perm_vpermt2_vpshub2 (d))
47669 return true;
47671 /* ??? Look for narrow permutations whose element orderings would
47672 allow the promotion to a wider mode. */
47674 /* ??? Look for sequences of interleave or a wider permute that place
47675 the data into the correct lanes for a half-vector shuffle like
47676 pshuf[lh]w or vpermilps. */
47678 /* ??? Look for sequences of interleave that produce the desired results.
47679 The combinatorics of punpck[lh] get pretty ugly... */
47681 if (expand_vec_perm_even_odd (d))
47682 return true;
47684 /* Even longer sequences. */
47685 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47686 return true;
47688 /* See if we can get the same permutation in different vector integer
47689 mode. */
47690 struct expand_vec_perm_d nd;
47691 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47693 if (!d->testing_p)
47694 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47695 return true;
47698 return false;
47701 /* If a permutation only uses one operand, make it clear. Returns true
47702 if the permutation references both operands. */
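/* As an illustrative example (hypothetical input): with nelt == 4, a
selector of { 5 6 4 7 } only references the second operand (which == 2),
so it is folded to { 1 2 0 3 } applied to d->op1 alone and the function
returns false. */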
47704 static bool
47705 canonicalize_perm (struct expand_vec_perm_d *d)
47707 int i, which, nelt = d->nelt;
47709 for (i = which = 0; i < nelt; ++i)
47710 which |= (d->perm[i] < nelt ? 1 : 2);
47712 d->one_operand_p = true;
47713 switch (which)
47715 default:
47716 gcc_unreachable();
47718 case 3:
47719 if (!rtx_equal_p (d->op0, d->op1))
47721 d->one_operand_p = false;
47722 break;
47724 /* The elements of PERM do not suggest that only the first operand
47725 is used, but both operands are identical. Allow easier matching
47726 of the permutation by folding the permutation into the single
47727 input vector. */
47728 /* FALLTHRU */
47730 case 2:
47731 for (i = 0; i < nelt; ++i)
47732 d->perm[i] &= nelt - 1;
47733 d->op0 = d->op1;
47734 break;
47736 case 1:
47737 d->op1 = d->op0;
47738 break;
47741 return (which == 3);
47744 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
47746 static bool
47747 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
47748 rtx op1, const vec_perm_indices &sel)
47750 struct expand_vec_perm_d d;
47751 unsigned char perm[MAX_VECT_LEN];
47752 unsigned int i, nelt, which;
47753 bool two_args;
47755 d.target = target;
47756 d.op0 = op0;
47757 d.op1 = op1;
47759 d.vmode = vmode;
47760 gcc_assert (VECTOR_MODE_P (d.vmode));
47761 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47762 d.testing_p = !target;
47764 gcc_assert (sel.length () == nelt);
47765 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47767 /* Given sufficient ISA support we can just return true here
47768 for selected vector modes. */
47769 switch (d.vmode)
47771 case E_V16SFmode:
47772 case E_V16SImode:
47773 case E_V8DImode:
47774 case E_V8DFmode:
47775 if (!TARGET_AVX512F)
47776 return false;
47777 /* All implementable with a single vperm[it]2 insn. */
47778 if (d.testing_p)
47779 return true;
47780 break;
47781 case E_V32HImode:
47782 if (!TARGET_AVX512BW)
47783 return false;
47784 if (d.testing_p)
47785 /* All implementable with a single vperm[it]2 insn. */
47786 return true;
47787 break;
47788 case E_V64QImode:
47789 if (!TARGET_AVX512BW)
47790 return false;
47791 if (d.testing_p)
47792 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
47793 return true;
47794 break;
47795 case E_V8SImode:
47796 case E_V8SFmode:
47797 case E_V4DFmode:
47798 case E_V4DImode:
47799 if (!TARGET_AVX)
47800 return false;
47801 if (d.testing_p && TARGET_AVX512VL)
47802 /* All implementable with a single vperm[it]2 insn. */
47803 return true;
47804 break;
47805 case E_V16HImode:
47806 if (!TARGET_SSE2)
47807 return false;
47808 if (d.testing_p && TARGET_AVX2)
47809 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47810 return true;
47811 break;
47812 case E_V32QImode:
47813 if (!TARGET_SSE2)
47814 return false;
47815 if (d.testing_p && TARGET_AVX2)
47816 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47817 return true;
47818 break;
47819 case E_V8HImode:
47820 case E_V16QImode:
47821 if (!TARGET_SSE2)
47822 return false;
47823 /* Fall through. */
47824 case E_V4SImode:
47825 case E_V4SFmode:
47826 if (!TARGET_SSE)
47827 return false;
47828 /* All implementable with a single vpperm insn. */
47829 if (d.testing_p && TARGET_XOP)
47830 return true;
47831 /* All implementable with 2 pshufb + 1 ior. */
47832 if (d.testing_p && TARGET_SSSE3)
47833 return true;
47834 break;
47835 case E_V2DImode:
47836 case E_V2DFmode:
47837 if (!TARGET_SSE)
47838 return false;
47839 /* All implementable with shufpd or unpck[lh]pd. */
47840 if (d.testing_p)
47841 return true;
47842 break;
47843 default:
47844 return false;
47847 for (i = which = 0; i < nelt; ++i)
47849 unsigned char e = sel[i];
47850 gcc_assert (e < 2 * nelt);
47851 d.perm[i] = e;
47852 perm[i] = e;
47853 which |= (e < nelt ? 1 : 2);
47856 if (d.testing_p)
47858 /* For all elements from the second vector, fold the elements to the first. */
47859 if (which == 2)
47860 for (i = 0; i < nelt; ++i)
47861 d.perm[i] -= nelt;
47863 /* Check whether the mask can be applied to the vector type. */
47864 d.one_operand_p = (which != 3);
47866 /* Implementable with shufps or pshufd. */
47867 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47868 return true;
47870 /* Otherwise we have to go through the motions and see if we can
47871 figure out how to generate the requested permutation. */
47872 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47873 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47874 if (!d.one_operand_p)
47875 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
47877 start_sequence ();
47878 bool ret = ix86_expand_vec_perm_const_1 (&d);
47879 end_sequence ();
47881 return ret;
47884 two_args = canonicalize_perm (&d);
47886 if (ix86_expand_vec_perm_const_1 (&d))
47887 return true;
47889 /* If the selector says both arguments are needed, but the operands are the
47890 same, the above tried to expand with one_operand_p and a flattened selector.
47891 If that didn't work, retry without one_operand_p; we succeeded with that
47892 during testing. */
47893 if (two_args && d.one_operand_p)
47895 d.one_operand_p = false;
47896 memcpy (d.perm, perm, sizeof (perm));
47897 return ix86_expand_vec_perm_const_1 (&d);
47900 return false;
47903 void
47904 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47906 struct expand_vec_perm_d d;
47907 unsigned i, nelt;
47909 d.target = targ;
47910 d.op0 = op0;
47911 d.op1 = op1;
47912 d.vmode = GET_MODE (targ);
47913 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47914 d.one_operand_p = false;
47915 d.testing_p = false;
47917 for (i = 0; i < nelt; ++i)
47918 d.perm[i] = i * 2 + odd;
47920 /* We'll either be able to implement the permutation directly... */
47921 if (expand_vec_perm_1 (&d))
47922 return;
47924 /* ... or we use the special-case patterns. */
47925 expand_vec_perm_even_odd_1 (&d, odd);
47928 static void
47929 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
47931 struct expand_vec_perm_d d;
47932 unsigned i, nelt, base;
47933 bool ok;
47935 d.target = targ;
47936 d.op0 = op0;
47937 d.op1 = op1;
47938 d.vmode = GET_MODE (targ);
47939 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47940 d.one_operand_p = false;
47941 d.testing_p = false;
47943 base = high_p ? nelt / 2 : 0;
47944 for (i = 0; i < nelt / 2; ++i)
47946 d.perm[i * 2] = i + base;
47947 d.perm[i * 2 + 1] = i + base + nelt;
47950 /* Note that for AVX this isn't one instruction. */
47951 ok = ix86_expand_vec_perm_const_1 (&d);
47952 gcc_assert (ok);
47956 /* Expand a vector operation CODE for a V*QImode in terms of the
47957 same operation on V*HImode. */
47959 void
47960 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
47962 machine_mode qimode = GET_MODE (dest);
47963 machine_mode himode;
47964 rtx (*gen_il) (rtx, rtx, rtx);
47965 rtx (*gen_ih) (rtx, rtx, rtx);
47966 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
47967 struct expand_vec_perm_d d;
47968 bool ok, full_interleave;
47969 bool uns_p = false;
47970 int i;
47972 switch (qimode)
47974 case E_V16QImode:
47975 himode = V8HImode;
47976 gen_il = gen_vec_interleave_lowv16qi;
47977 gen_ih = gen_vec_interleave_highv16qi;
47978 break;
47979 case E_V32QImode:
47980 himode = V16HImode;
47981 gen_il = gen_avx2_interleave_lowv32qi;
47982 gen_ih = gen_avx2_interleave_highv32qi;
47983 break;
47984 case E_V64QImode:
47985 himode = V32HImode;
47986 gen_il = gen_avx512bw_interleave_lowv64qi;
47987 gen_ih = gen_avx512bw_interleave_highv64qi;
47988 break;
47989 default:
47990 gcc_unreachable ();
47993 op2_l = op2_h = op2;
47994 switch (code)
47996 case MULT:
47997 /* Unpack data such that we've got a source byte in each low byte of
47998 each word. We don't care what goes into the high byte of each word.
47999 Rather than trying to get zero in there, it is most convenient
48000 to let it be a copy of the low byte. */
48001 op2_l = gen_reg_rtx (qimode);
48002 op2_h = gen_reg_rtx (qimode);
48003 emit_insn (gen_il (op2_l, op2, op2));
48004 emit_insn (gen_ih (op2_h, op2, op2));
48006 op1_l = gen_reg_rtx (qimode);
48007 op1_h = gen_reg_rtx (qimode);
48008 emit_insn (gen_il (op1_l, op1, op1));
48009 emit_insn (gen_ih (op1_h, op1, op1));
48010 full_interleave = qimode == V16QImode;
48011 break;
48013 case ASHIFT:
48014 case LSHIFTRT:
48015 uns_p = true;
48016 /* FALLTHRU */
48017 case ASHIFTRT:
48018 op1_l = gen_reg_rtx (himode);
48019 op1_h = gen_reg_rtx (himode);
48020 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48021 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48022 full_interleave = true;
48023 break;
48024 default:
48025 gcc_unreachable ();
48028 /* Perform the operation. */
48029 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48030 1, OPTAB_DIRECT);
48031 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48032 1, OPTAB_DIRECT);
48033 gcc_assert (res_l && res_h);
48035 /* Merge the data back into the right place. */
48036 d.target = dest;
48037 d.op0 = gen_lowpart (qimode, res_l);
48038 d.op1 = gen_lowpart (qimode, res_h);
48039 d.vmode = qimode;
48040 d.nelt = GET_MODE_NUNITS (qimode);
48041 d.one_operand_p = false;
48042 d.testing_p = false;
48044 if (full_interleave)
48046 /* For SSE2, we used a full interleave, so the desired
48047 results are in the even elements. */
48048 for (i = 0; i < d.nelt; ++i)
48049 d.perm[i] = i * 2;
48051 else
48053 /* For AVX, the interleave used above was not cross-lane, so the
48054 extraction is the even elements but with the second and third
48055 quarters swapped. Happily, that is even one insn shorter than plain
48056 even extraction. For AVX512BW we have 4 lanes. We extract evens from
48057 within a lane, always first from the first and then from the second
48058 source operand; the index bits above the low 4 bits remain the same.
48059 Thus, for d.nelt == 32 we want permutation
48060 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48061 and for d.nelt == 64 we want permutation
48062 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48063 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
48064 for (i = 0; i < d.nelt; ++i)
48065 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48068 ok = ix86_expand_vec_perm_const_1 (&d);
48069 gcc_assert (ok);
48071 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48072 gen_rtx_fmt_ee (code, qimode, op1, op2));
48075 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48076 if op is CONST_VECTOR with all odd elements equal to their
48077 preceding element. */
48079 static bool
48080 const_vector_equal_evenodd_p (rtx op)
48082 machine_mode mode = GET_MODE (op);
48083 int i, nunits = GET_MODE_NUNITS (mode);
48084 if (GET_CODE (op) != CONST_VECTOR
48085 || nunits != CONST_VECTOR_NUNITS (op))
48086 return false;
48087 for (i = 0; i < nunits; i += 2)
48088 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48089 return false;
48090 return true;
48093 void
48094 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48095 bool uns_p, bool odd_p)
48097 machine_mode mode = GET_MODE (op1);
48098 machine_mode wmode = GET_MODE (dest);
48099 rtx x;
48100 rtx orig_op1 = op1, orig_op2 = op2;
48102 if (!nonimmediate_operand (op1, mode))
48103 op1 = force_reg (mode, op1);
48104 if (!nonimmediate_operand (op2, mode))
48105 op2 = force_reg (mode, op2);
48107 /* We only play even/odd games with vectors of SImode. */
48108 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48110 /* If we're looking for the odd results, shift those members down to
48111 the even slots. For some cpus this is faster than a PSHUFD. */
48112 if (odd_p)
48114 /* For XOP use vpmacsdqh, but only for smult, as it is only
48115 signed. */
48116 if (TARGET_XOP && mode == V4SImode && !uns_p)
48118 x = force_reg (wmode, CONST0_RTX (wmode));
48119 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48120 return;
48123 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48124 if (!const_vector_equal_evenodd_p (orig_op1))
48125 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48126 x, NULL, 1, OPTAB_DIRECT);
48127 if (!const_vector_equal_evenodd_p (orig_op2))
48128 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48129 x, NULL, 1, OPTAB_DIRECT);
48130 op1 = gen_lowpart (mode, op1);
48131 op2 = gen_lowpart (mode, op2);
48134 if (mode == V16SImode)
48136 if (uns_p)
48137 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48138 else
48139 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48141 else if (mode == V8SImode)
48143 if (uns_p)
48144 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48145 else
48146 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48148 else if (uns_p)
48149 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48150 else if (TARGET_SSE4_1)
48151 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48152 else
48154 rtx s1, s2, t0, t1, t2;
48156 /* The easiest way to implement this without PMULDQ is to go through
48157 the motions as if we are performing a full 64-bit multiply, except
48158 that we need to do less shuffling of the elements. */
48160 /* Compute the sign-extension, aka highparts, of the two operands. */
48161 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48162 op1, pc_rtx, pc_rtx);
48163 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48164 op2, pc_rtx, pc_rtx);
48166 /* Multiply LO(A) * HI(B), and vice-versa. */
48167 t1 = gen_reg_rtx (wmode);
48168 t2 = gen_reg_rtx (wmode);
48169 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48170 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48172 /* Multiply LO(A) * LO(B). */
48173 t0 = gen_reg_rtx (wmode);
48174 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48176 /* Combine and shift the highparts into place. */
48177 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48178 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48179 1, OPTAB_DIRECT);
48181 /* Combine high and low parts. */
48182 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48183 return;
48185 emit_insn (x);
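/* An informal scalar sketch of what the PMULDQ-less path above computes
   per element, assuming 32-bit int and 64-bit long long:

     long long
     smul32x32 (int a, int b)
     {
       unsigned int ua = a, ub = b;
       unsigned int hi_a = a < 0 ? -1U : 0;
       unsigned int hi_b = b < 0 ? -1U : 0;
       unsigned long long lo = (unsigned long long) ua * ub;
       unsigned long long fix
	 = (unsigned long long) (hi_a * ub + hi_b * ua) << 32;
       return (long long) (lo + fix);
     }

   HI_A and HI_B play the role of the S1/S2 comparison masks, LO of T0,
   and FIX of the combined and shifted T1 + T2.  */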
48188 void
48189 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48190 bool uns_p, bool high_p)
48192 machine_mode wmode = GET_MODE (dest);
48193 machine_mode mode = GET_MODE (op1);
48194 rtx t1, t2, t3, t4, mask;
48196 switch (mode)
48198 case E_V4SImode:
48199 t1 = gen_reg_rtx (mode);
48200 t2 = gen_reg_rtx (mode);
48201 if (TARGET_XOP && !uns_p)
48203 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48204 shuffle the elements once so that all elements are in the right
48205 place for immediate use: { A C B D }. */
48206 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48207 const1_rtx, GEN_INT (3)));
48208 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48209 const1_rtx, GEN_INT (3)));
48211 else
48213 /* Put the elements into place for the multiply. */
48214 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48215 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48216 high_p = false;
48218 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48219 break;
48221 case E_V8SImode:
48222 /* Shuffle the elements between the lanes. After this we
48223 have { A B E F | C D G H } for each operand. */
48224 t1 = gen_reg_rtx (V4DImode);
48225 t2 = gen_reg_rtx (V4DImode);
48226 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48227 const0_rtx, const2_rtx,
48228 const1_rtx, GEN_INT (3)));
48229 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48230 const0_rtx, const2_rtx,
48231 const1_rtx, GEN_INT (3)));
48233 /* Shuffle the elements within the lanes. After this we
48234 have { A A B B | C C D D } or { E E F F | G G H H }. */
48235 t3 = gen_reg_rtx (V8SImode);
48236 t4 = gen_reg_rtx (V8SImode);
48237 mask = GEN_INT (high_p
48238 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48239 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48240 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48241 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48243 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48244 break;
48246 case E_V8HImode:
48247 case E_V16HImode:
48248 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48249 uns_p, OPTAB_DIRECT);
48250 t2 = expand_binop (mode,
48251 uns_p ? umul_highpart_optab : smul_highpart_optab,
48252 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48253 gcc_assert (t1 && t2);
48255 t3 = gen_reg_rtx (mode);
48256 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48257 emit_move_insn (dest, gen_lowpart (wmode, t3));
48258 break;
48260 case E_V16QImode:
48261 case E_V32QImode:
48262 case E_V32HImode:
48263 case E_V16SImode:
48264 case E_V64QImode:
48265 t1 = gen_reg_rtx (wmode);
48266 t2 = gen_reg_rtx (wmode);
48267 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48268 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48270 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48271 break;
48273 default:
48274 gcc_unreachable ();
48278 void
48279 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48281 rtx res_1, res_2, res_3, res_4;
48283 res_1 = gen_reg_rtx (V4SImode);
48284 res_2 = gen_reg_rtx (V4SImode);
48285 res_3 = gen_reg_rtx (V2DImode);
48286 res_4 = gen_reg_rtx (V2DImode);
48287 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48288 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48290 /* Move the results in element 2 down to element 1; we don't care
48291 what goes in elements 2 and 3. Then we can merge the parts
48292 back together with an interleave.
48294 Note that two other sequences were tried:
48295 (1) Use interleaves at the start instead of psrldq, which allows
48296 us to use a single shufps to merge things back at the end.
48297 (2) Use shufps here to combine the two vectors, then pshufd to
48298 put the elements in the correct order.
48299 In both cases the cost of the reformatting stall was too high
48300 and the overall sequence slower. */
48302 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48303 const0_rtx, const2_rtx,
48304 const0_rtx, const0_rtx));
48305 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48306 const0_rtx, const2_rtx,
48307 const0_rtx, const0_rtx));
48308 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48310 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
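/* Informally, for op1 = { a0 a1 a2 a3 } and op2 = { b0 b1 b2 b3 } the
   sequence above computes:
     res_3 = { a0*b0, a2*b2 }			(even products, V2DImode)
     res_4 = { a1*b1, a3*b3 }			(odd products, V2DImode)
     res_1 = pshufd (res_3) = { lo(a0*b0), lo(a2*b2), x, x }
     res_2 = pshufd (res_4) = { lo(a1*b1), lo(a3*b3), x, x }
     op0   = interleave-low (res_1, res_2)
	   = { lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3) }.  */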
48313 void
48314 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48316 machine_mode mode = GET_MODE (op0);
48317 rtx t1, t2, t3, t4, t5, t6;
48319 if (TARGET_AVX512DQ && mode == V8DImode)
48320 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48321 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48322 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48323 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48324 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48325 else if (TARGET_XOP && mode == V2DImode)
48327 /* op1: A,B,C,D, op2: E,F,G,H */
48328 op1 = gen_lowpart (V4SImode, op1);
48329 op2 = gen_lowpart (V4SImode, op2);
48331 t1 = gen_reg_rtx (V4SImode);
48332 t2 = gen_reg_rtx (V4SImode);
48333 t3 = gen_reg_rtx (V2DImode);
48334 t4 = gen_reg_rtx (V2DImode);
48336 /* t1: B,A,D,C */
48337 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48338 GEN_INT (1),
48339 GEN_INT (0),
48340 GEN_INT (3),
48341 GEN_INT (2)));
48343 /* t2: (B*E),(A*F),(D*G),(C*H) */
48344 emit_insn (gen_mulv4si3 (t2, t1, op2));
48346 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48347 emit_insn (gen_xop_phadddq (t3, t2));
48349 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48350 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48352 /* Multiply lower parts and add all */
48353 t5 = gen_reg_rtx (V2DImode);
48354 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48355 gen_lowpart (V4SImode, op1),
48356 gen_lowpart (V4SImode, op2)));
48357 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48360 else
48362 machine_mode nmode;
48363 rtx (*umul) (rtx, rtx, rtx);
48365 if (mode == V2DImode)
48367 umul = gen_vec_widen_umult_even_v4si;
48368 nmode = V4SImode;
48370 else if (mode == V4DImode)
48372 umul = gen_vec_widen_umult_even_v8si;
48373 nmode = V8SImode;
48375 else if (mode == V8DImode)
48377 umul = gen_vec_widen_umult_even_v16si;
48378 nmode = V16SImode;
48380 else
48381 gcc_unreachable ();
48384 /* Multiply low parts. */
48385 t1 = gen_reg_rtx (mode);
48386 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48388 /* Shift input vectors right 32 bits so we can multiply high parts. */
48389 t6 = GEN_INT (32);
48390 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48391 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48393 /* Multiply high parts by low parts. */
48394 t4 = gen_reg_rtx (mode);
48395 t5 = gen_reg_rtx (mode);
48396 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48397 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48399 /* Combine and shift the highparts back. */
48400 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48401 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48403 /* Combine high and low parts. */
48404 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48407 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48408 gen_rtx_MULT (mode, op1, op2));
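/* The generic path above uses the usual 32x32 decomposition per 64-bit
   element, with all arithmetic taken modulo 2^64:
     a * b = lo(a)*lo(b) + ((hi(a)*lo(b) + hi(b)*lo(a)) << 32)
   where lo() and hi() are the low and high 32-bit halves; the
   hi(a)*hi(b) term is dropped because it only affects bits >= 64.  */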
48411 /* Return 1 if control transfer instruction INSN
48412 should be encoded with bnd prefix.
48413 If insn is NULL then return 1 when control
48414 transfer instructions should be prefixed with
48415 bnd by default for current function. */
48417 bool
48418 ix86_bnd_prefixed_insn_p (rtx insn)
48420 /* For call insns check special flag. */
48421 if (insn && CALL_P (insn))
48423 rtx call = get_call_rtx_from (insn);
48424 if (call)
48425 return CALL_EXPR_WITH_BOUNDS_P (call);
48428 /* All other insns are prefixed only if function is instrumented. */
48429 return chkp_function_instrumented_p (current_function_decl);
48432 /* Return 1 if control transfer instruction INSN
48433 should be encoded with notrack prefix. */
48435 static bool
48436 ix86_notrack_prefixed_insn_p (rtx insn)
48438 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
48439 return false;
48441 if (CALL_P (insn))
48443 rtx call = get_call_rtx_from (insn);
48444 gcc_assert (call != NULL_RTX);
48445 rtx addr = XEXP (call, 0);
48447 /* Do not emit 'notrack' if it's not an indirect call. */
48448 if (MEM_P (addr)
48449 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
48450 return false;
48451 else
48452 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
48455 if (JUMP_P (insn) && !flag_cet_switch)
48457 rtx target = JUMP_LABEL (insn);
48458 if (target == NULL_RTX || ANY_RETURN_P (target))
48459 return false;
48461 /* Check the jump is a switch table. */
48462 rtx_insn *label = as_a<rtx_insn *> (target);
48463 rtx_insn *table = next_insn (label);
48464 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
48465 return false;
48466 else
48467 return true;
48469 return false;
48472 /* Calculate integer abs() using only SSE2 instructions. */
48474 void
48475 ix86_expand_sse2_abs (rtx target, rtx input)
48477 machine_mode mode = GET_MODE (target);
48478 rtx tmp0, tmp1, x;
48480 switch (mode)
48482 /* For 32-bit signed integer X, the best way to calculate the absolute
48483 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
48484 case E_V4SImode:
48485 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48486 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48487 NULL, 0, OPTAB_DIRECT);
48488 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48489 NULL, 0, OPTAB_DIRECT);
48490 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48491 target, 0, OPTAB_DIRECT);
48492 break;
48494 /* For 16-bit signed integer X, the best way to calculate the absolute
48495 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48496 case E_V8HImode:
48497 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48499 x = expand_simple_binop (mode, SMAX, tmp0, input,
48500 target, 0, OPTAB_DIRECT);
48501 break;
48503 /* For 8-bit signed integer X, the best way to calculate the absolute
48504 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48505 as SSE2 provides the PMINUB insn. */
48506 case E_V16QImode:
48507 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48509 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48510 target, 0, OPTAB_DIRECT);
48511 break;
48513 default:
48514 gcc_unreachable ();
48517 if (x != target)
48518 emit_move_insn (target, x);
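/* As an informal check of the V4SImode expansion above, for x = -5:
   tmp0 = x >> 31 = -1 (all ones), tmp1 = x ^ tmp0 = 4 and
   tmp1 - tmp0 = 4 - (-1) = 5 = abs (-5).  */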
48521 /* Expand an extract from a vector register through pextr insn.
48522 Return true if successful. */
48524 bool
48525 ix86_expand_pextr (rtx *operands)
48527 rtx dst = operands[0];
48528 rtx src = operands[1];
48530 unsigned int size = INTVAL (operands[2]);
48531 unsigned int pos = INTVAL (operands[3]);
48533 if (SUBREG_P (dst))
48535 /* Reject non-lowpart subregs. */
48536 if (SUBREG_BYTE (dst) > 0)
48537 return false;
48538 dst = SUBREG_REG (dst);
48541 if (SUBREG_P (src))
48543 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48544 src = SUBREG_REG (src);
48547 switch (GET_MODE (src))
48549 case E_V16QImode:
48550 case E_V8HImode:
48551 case E_V4SImode:
48552 case E_V2DImode:
48553 case E_V1TImode:
48554 case E_TImode:
48556 machine_mode srcmode, dstmode;
48557 rtx d, pat;
48559 if (!int_mode_for_size (size, 0).exists (&dstmode))
48560 return false;
48562 switch (dstmode)
48564 case E_QImode:
48565 if (!TARGET_SSE4_1)
48566 return false;
48567 srcmode = V16QImode;
48568 break;
48570 case E_HImode:
48571 if (!TARGET_SSE2)
48572 return false;
48573 srcmode = V8HImode;
48574 break;
48576 case E_SImode:
48577 if (!TARGET_SSE4_1)
48578 return false;
48579 srcmode = V4SImode;
48580 break;
48582 case E_DImode:
48583 gcc_assert (TARGET_64BIT);
48584 if (!TARGET_SSE4_1)
48585 return false;
48586 srcmode = V2DImode;
48587 break;
48589 default:
48590 return false;
48593 /* Reject extractions from misaligned positions. */
48594 if (pos & (size-1))
48595 return false;
48597 if (GET_MODE (dst) == dstmode)
48598 d = dst;
48599 else
48600 d = gen_reg_rtx (dstmode);
48602 /* Construct insn pattern. */
48603 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48604 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48606 /* Let the rtl optimizers know about the zero extension performed. */
48607 if (dstmode == QImode || dstmode == HImode)
48609 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48610 d = gen_lowpart (SImode, d);
48613 emit_insn (gen_rtx_SET (d, pat));
48615 if (d != dst)
48616 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48617 return true;
48620 default:
48621 return false;
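/* For example (informally), extracting SIZE == 16 bits at bit position
   POS == 32 from a V8HImode source gives dstmode == HImode,
   srcmode == V8HImode and a vec_select of element 32 / 16 == 2, i.e. a
   single PEXTRW with immediate 2, zero-extended for the rtl optimizers.  */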
48625 /* Expand an insert into a vector register through pinsr insn.
48626 Return true if successful. */
48628 bool
48629 ix86_expand_pinsr (rtx *operands)
48631 rtx dst = operands[0];
48632 rtx src = operands[3];
48634 unsigned int size = INTVAL (operands[1]);
48635 unsigned int pos = INTVAL (operands[2]);
48637 if (SUBREG_P (dst))
48639 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48640 dst = SUBREG_REG (dst);
48643 switch (GET_MODE (dst))
48645 case E_V16QImode:
48646 case E_V8HImode:
48647 case E_V4SImode:
48648 case E_V2DImode:
48649 case E_V1TImode:
48650 case E_TImode:
48652 machine_mode srcmode, dstmode;
48653 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48654 rtx d;
48656 if (!int_mode_for_size (size, 0).exists (&srcmode))
48657 return false;
48659 switch (srcmode)
48661 case E_QImode:
48662 if (!TARGET_SSE4_1)
48663 return false;
48664 dstmode = V16QImode;
48665 pinsr = gen_sse4_1_pinsrb;
48666 break;
48668 case E_HImode:
48669 if (!TARGET_SSE2)
48670 return false;
48671 dstmode = V8HImode;
48672 pinsr = gen_sse2_pinsrw;
48673 break;
48675 case E_SImode:
48676 if (!TARGET_SSE4_1)
48677 return false;
48678 dstmode = V4SImode;
48679 pinsr = gen_sse4_1_pinsrd;
48680 break;
48682 case E_DImode:
48683 gcc_assert (TARGET_64BIT);
48684 if (!TARGET_SSE4_1)
48685 return false;
48686 dstmode = V2DImode;
48687 pinsr = gen_sse4_1_pinsrq;
48688 break;
48690 default:
48691 return false;
48694 /* Reject insertions to misaligned positions. */
48695 if (pos & (size-1))
48696 return false;
48698 if (SUBREG_P (src))
48700 unsigned int srcpos = SUBREG_BYTE (src);
48702 if (srcpos > 0)
48704 rtx extr_ops[4];
48706 extr_ops[0] = gen_reg_rtx (srcmode);
48707 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48708 extr_ops[2] = GEN_INT (size);
48709 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48711 if (!ix86_expand_pextr (extr_ops))
48712 return false;
48714 src = extr_ops[0];
48716 else
48717 src = gen_lowpart (srcmode, SUBREG_REG (src));
48720 if (GET_MODE (dst) == dstmode)
48721 d = dst;
48722 else
48723 d = gen_reg_rtx (dstmode);
48725 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48726 gen_lowpart (srcmode, src),
48727 GEN_INT (1 << (pos / size))));
48728 if (d != dst)
48729 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48730 return true;
48733 default:
48734 return false;
48738 /* This function returns the calling abi specific va_list type node.
48739 It returns the FNDECL specific va_list type. */
48741 static tree
48742 ix86_fn_abi_va_list (tree fndecl)
48744 if (!TARGET_64BIT)
48745 return va_list_type_node;
48746 gcc_assert (fndecl != NULL_TREE);
48748 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48749 return ms_va_list_type_node;
48750 else
48751 return sysv_va_list_type_node;
48754 /* Returns the canonical va_list type specified by TYPE. If there
48755 is no valid TYPE provided, it returns NULL_TREE. */
48757 static tree
48758 ix86_canonical_va_list_type (tree type)
48760 if (TARGET_64BIT)
48762 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48763 return ms_va_list_type_node;
48765 if ((TREE_CODE (type) == ARRAY_TYPE
48766 && integer_zerop (array_type_nelts (type)))
48767 || POINTER_TYPE_P (type))
48769 tree elem_type = TREE_TYPE (type);
48770 if (TREE_CODE (elem_type) == RECORD_TYPE
48771 && lookup_attribute ("sysv_abi va_list",
48772 TYPE_ATTRIBUTES (elem_type)))
48773 return sysv_va_list_type_node;
48776 return NULL_TREE;
48779 return std_canonical_va_list_type (type);
48782 /* Iterate through the target-specific builtin types for va_list.
48783 IDX denotes the iterator, *PTREE is set to the result type of
48784 the va_list builtin, and *PNAME to its internal type.
48785 Returns zero if there is no element for this index, otherwise
48786 IDX should be increased upon the next call.
48787 Note, do not iterate a base builtin's name like __builtin_va_list.
48788 Used from c_common_nodes_and_builtins. */
48790 static int
48791 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48793 if (TARGET_64BIT)
48795 switch (idx)
48797 default:
48798 break;
48800 case 0:
48801 *ptree = ms_va_list_type_node;
48802 *pname = "__builtin_ms_va_list";
48803 return 1;
48805 case 1:
48806 *ptree = sysv_va_list_type_node;
48807 *pname = "__builtin_sysv_va_list";
48808 return 1;
48812 return 0;
48815 #undef TARGET_SCHED_DISPATCH
48816 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
48817 #undef TARGET_SCHED_DISPATCH_DO
48818 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
48819 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48820 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48821 #undef TARGET_SCHED_REORDER
48822 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
48823 #undef TARGET_SCHED_ADJUST_PRIORITY
48824 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48825 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48826 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48827 ix86_dependencies_evaluation_hook
48830 /* Implementation of reassociation_width target hook used by
48831 reassoc phase to identify parallelism level in reassociated
48832 tree. The statement's tree_code is passed in OPC. The argument type
48833 is passed in MODE. */
48835 static int
48836 ix86_reassociation_width (unsigned int op, machine_mode mode)
48838 int width = 1;
48839 /* Vector part. */
48840 if (VECTOR_MODE_P (mode))
48842 int div = 1;
48843 if (INTEGRAL_MODE_P (mode))
48844 width = ix86_cost->reassoc_vec_int;
48845 else if (FLOAT_MODE_P (mode))
48846 width = ix86_cost->reassoc_vec_fp;
48848 if (width == 1)
48849 return 1;
48851 /* Integer vector instructions execute in FP unit
48852 and can execute 3 additions and one multiplication per cycle. */
48853 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
48854 && op != PLUS && op != MINUS)
48855 return 1;
48857 /* Account for targets that split wide vectors into multiple parts. */
48858 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
48859 div = GET_MODE_BITSIZE (mode) / 128;
48860 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
48861 div = GET_MODE_BITSIZE (mode) / 64;
48862 width = (width + div - 1) / div;
48864 /* Scalar part. */
48865 else if (INTEGRAL_MODE_P (mode))
48866 width = ix86_cost->reassoc_int;
48867 else if (FLOAT_MODE_P (mode))
48868 width = ix86_cost->reassoc_fp;
48870 /* Avoid using too many registers in 32bit mode. */
48871 if (!TARGET_64BIT && width > 2)
48872 width = 2;
48873 return width;
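/* As an informal example, assuming a cost table with reassoc_vec_int == 4,
   a V8SImode PLUS on a TARGET_AVX128_OPTIMAL tune is treated as two
   128-bit halves: div == 256 / 128 == 2, so the returned width is
   (4 + 2 - 1) / 2 == 2.  */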
48876 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
48877 place emms and femms instructions. */
48879 static machine_mode
48880 ix86_preferred_simd_mode (scalar_mode mode)
48882 if (!TARGET_SSE)
48883 return word_mode;
48885 switch (mode)
48887 case E_QImode:
48888 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48889 return V64QImode;
48890 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48891 return V32QImode;
48892 else
48893 return V16QImode;
48895 case E_HImode:
48896 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48897 return V32HImode;
48898 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48899 return V16HImode;
48900 else
48901 return V8HImode;
48903 case E_SImode:
48904 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48905 return V16SImode;
48906 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48907 return V8SImode;
48908 else
48909 return V4SImode;
48911 case E_DImode:
48912 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48913 return V8DImode;
48914 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48915 return V4DImode;
48916 else
48917 return V2DImode;
48919 case E_SFmode:
48920 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48921 return V16SFmode;
48922 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48923 return V8SFmode;
48924 else
48925 return V4SFmode;
48927 case E_DFmode:
48928 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48929 return V8DFmode;
48930 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48931 return V4DFmode;
48932 else if (TARGET_SSE2)
48933 return V2DFmode;
48934 /* FALLTHRU */
48936 default:
48937 return word_mode;
48941 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
48942 vectors. If AVX512F is enabled then try vectorizing with 512bit,
48943 256bit and 128bit vectors. */
48945 static unsigned int
48946 ix86_autovectorize_vector_sizes (void)
48948 unsigned int bytesizes = 0;
48950 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48951 bytesizes |= (64 | 32 | 16);
48952 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48953 bytesizes |= (32 | 16);
48955 return bytesizes;
48958 /* Implementation of targetm.vectorize.get_mask_mode. */
48960 static opt_machine_mode
48961 ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size)
48963 unsigned elem_size = vector_size / nunits;
48965 /* Scalar mask case. */
48966 if ((TARGET_AVX512F && vector_size == 64)
48967 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
48969 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
48970 return smallest_int_mode_for_size (nunits);
48973 scalar_int_mode elem_mode
48974 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
48976 gcc_assert (elem_size * nunits == vector_size);
48978 return mode_for_vector (elem_mode, nunits);
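/* Informally: with AVX512BW, a V64QImode comparison gets a 64-bit scalar
   mask (DImode, one bit per element), whereas without AVX512 a V8SImode
   comparison falls through to a V8SImode vector mask (one SImode element
   per lane).  */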
48983 /* Return class of registers which could be used for pseudo of MODE
48984 and of class RCLASS for spilling instead of memory. Return NO_REGS
48985 if it is not possible or non-profitable. */
48987 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
48989 static reg_class_t
48990 ix86_spill_class (reg_class_t rclass, machine_mode mode)
48992 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
48993 && TARGET_SSE2
48994 && TARGET_INTER_UNIT_MOVES_TO_VEC
48995 && TARGET_INTER_UNIT_MOVES_FROM_VEC
48996 && (mode == SImode || (TARGET_64BIT && mode == DImode))
48997 && INTEGER_CLASS_P (rclass))
48998 return ALL_SSE_REGS;
48999 return NO_REGS;
49002 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
49003 but returns a lower bound. */
49005 static unsigned int
49006 ix86_max_noce_ifcvt_seq_cost (edge e)
49008 bool predictable_p = predictable_edge_p (e);
49010 enum compiler_param param
49011 = (predictable_p
49012 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
49013 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
49015 /* If we have a parameter set, use that, otherwise take a guess using
49016 BRANCH_COST. */
49017 if (global_options_set.x_param_values[param])
49018 return PARAM_VALUE (param);
49019 else
49020 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
49023 /* Return true if SEQ is a good candidate as a replacement for the
49024 if-convertible sequence described in IF_INFO. */
49026 static bool
49027 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
49029 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
49031 int cmov_cnt = 0;
49032 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
49033 Maybe we should allow even more conditional moves as long as they
49034 are used far enough not to stall the CPU, or also consider
49035 IF_INFO->TEST_BB succ edge probabilities. */
49036 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
49038 rtx set = single_set (insn);
49039 if (!set)
49040 continue;
49041 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
49042 continue;
49043 rtx src = SET_SRC (set);
49044 machine_mode mode = GET_MODE (src);
49045 if (GET_MODE_CLASS (mode) != MODE_INT
49046 && GET_MODE_CLASS (mode) != MODE_FLOAT)
49047 continue;
49048 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
49049 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
49050 continue;
49051 /* insn is CMOV or FCMOV. */
49052 if (++cmov_cnt > 1)
49053 return false;
49056 return default_noce_conversion_profitable_p (seq, if_info);
49059 /* Implement targetm.vectorize.init_cost. */
49061 static void *
49062 ix86_init_cost (struct loop *)
49064 unsigned *cost = XNEWVEC (unsigned, 3);
49065 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49066 return cost;
49069 /* Implement targetm.vectorize.add_stmt_cost. */
49071 static unsigned
49072 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49073 struct _stmt_vec_info *stmt_info, int misalign,
49074 enum vect_cost_model_location where)
49076 unsigned *cost = (unsigned *) data;
49077 unsigned retval = 0;
49079 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49080 int stmt_cost = -1;
49082 if ((kind == vector_stmt || kind == scalar_stmt)
49083 && stmt_info
49084 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
49086 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
49087 bool fp = false;
49088 machine_mode mode = TImode;
49090 if (vectype != NULL)
49092 fp = FLOAT_TYPE_P (vectype);
49093 mode = TYPE_MODE (vectype);
49095 /*machine_mode inner_mode = mode;
49096 if (VECTOR_MODE_P (mode))
49097 inner_mode = GET_MODE_INNER (mode);*/
49099 switch (subcode)
49101 case PLUS_EXPR:
49102 case POINTER_PLUS_EXPR:
49103 case MINUS_EXPR:
49104 if (kind == scalar_stmt)
49106 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49107 stmt_cost = ix86_cost->addss;
49108 else if (X87_FLOAT_MODE_P (mode))
49109 stmt_cost = ix86_cost->fadd;
49110 else
49111 stmt_cost = ix86_cost->add;
49113 else
49114 stmt_cost = ix86_vec_cost (mode,
49115 fp ? ix86_cost->addss
49116 : ix86_cost->sse_op,
49117 true);
49118 break;
49120 case MULT_EXPR:
49121 case WIDEN_MULT_EXPR:
49122 case MULT_HIGHPART_EXPR:
49123 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
49124 break;
49125 case FMA_EXPR:
49126 stmt_cost = ix86_vec_cost (mode,
49127 mode == SFmode ? ix86_cost->fmass
49128 : ix86_cost->fmasd,
49129 true);
49130 break;
49131 case NEGATE_EXPR:
49132 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49133 stmt_cost = ix86_cost->sse_op;
49134 else if (X87_FLOAT_MODE_P (mode))
49135 stmt_cost = ix86_cost->fchs;
49136 else if (VECTOR_MODE_P (mode))
49137 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49138 else
49139 stmt_cost = ix86_cost->add;
49140 break;
49141 case TRUNC_DIV_EXPR:
49142 case CEIL_DIV_EXPR:
49143 case FLOOR_DIV_EXPR:
49144 case ROUND_DIV_EXPR:
49145 case TRUNC_MOD_EXPR:
49146 case CEIL_MOD_EXPR:
49147 case FLOOR_MOD_EXPR:
49148 case RDIV_EXPR:
49149 case ROUND_MOD_EXPR:
49150 case EXACT_DIV_EXPR:
49151 stmt_cost = ix86_division_cost (ix86_cost, mode);
49152 break;
49154 case RSHIFT_EXPR:
49155 case LSHIFT_EXPR:
49156 case LROTATE_EXPR:
49157 case RROTATE_EXPR:
49159 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
49160 stmt_cost = ix86_shift_rotate_cost
49161 (ix86_cost, mode,
49162 TREE_CODE (op2) == INTEGER_CST,
49163 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
49164 true, false, false, NULL, NULL);
49166 break;
49167 case NOP_EXPR:
49168 stmt_cost = 0;
49169 break;
49171 case BIT_IOR_EXPR:
49172 case ABS_EXPR:
49173 case MIN_EXPR:
49174 case MAX_EXPR:
49175 case BIT_XOR_EXPR:
49176 case BIT_AND_EXPR:
49177 case BIT_NOT_EXPR:
49178 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49179 stmt_cost = ix86_cost->sse_op;
49180 else if (VECTOR_MODE_P (mode))
49181 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49182 else
49183 stmt_cost = ix86_cost->add;
49184 break;
49185 default:
49186 break;
49189 if (stmt_cost == -1)
49190 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49192 /* Penalize DFmode vector operations for Bonnell. */
49193 if (TARGET_BONNELL && kind == vector_stmt
49194 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49195 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49197 /* Statements in an inner loop relative to the loop being
49198 vectorized are weighted more heavily. The value here is
49199 arbitrary and could potentially be improved with analysis. */
49200 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49201 count *= 50; /* FIXME. */
49203 retval = (unsigned) (count * stmt_cost);
49205 /* We need to multiply all vector stmt cost by 1.7 (estimated cost)
49206 for Silvermont, as it has an out-of-order integer pipeline and can execute
49207 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
49208 if ((TARGET_SILVERMONT || TARGET_INTEL)
49209 && stmt_info && stmt_info->stmt)
49211 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49212 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49213 retval = (retval * 17) / 10;
49216 cost[where] += retval;
49218 return retval;
49221 /* Implement targetm.vectorize.finish_cost. */
49223 static void
49224 ix86_finish_cost (void *data, unsigned *prologue_cost,
49225 unsigned *body_cost, unsigned *epilogue_cost)
49227 unsigned *cost = (unsigned *) data;
49228 *prologue_cost = cost[vect_prologue];
49229 *body_cost = cost[vect_body];
49230 *epilogue_cost = cost[vect_epilogue];
49233 /* Implement targetm.vectorize.destroy_cost_data. */
49235 static void
49236 ix86_destroy_cost_data (void *data)
49238 free (data);
49241 /* Validate target specific memory model bits in VAL. */
49243 static unsigned HOST_WIDE_INT
49244 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49246 enum memmodel model = memmodel_from_int (val);
49247 bool strong;
49249 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49250 |MEMMODEL_MASK)
49251 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49253 warning (OPT_Winvalid_memory_model,
49254 "unknown architecture specific memory model");
49255 return MEMMODEL_SEQ_CST;
49257 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49258 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49260 warning (OPT_Winvalid_memory_model,
49261 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49262 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49264 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49266 warning (OPT_Winvalid_memory_model,
49267 "HLE_RELEASE not used with RELEASE or stronger memory model");
49268 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49270 return val;
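/* As an informal example of what these checks accept, an HLE elided lock
   acquisition might look like

     while (__atomic_exchange_n (&lock, 1,
				 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;

   whereas combining __ATOMIC_HLE_ACQUIRE with __ATOMIC_RELAXED would
   trigger the "HLE_ACQUIRE not used with ACQUIRE or stronger memory
   model" warning above.  */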
49273 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49274 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
49275 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49276 or number of vecsize_mangle variants that should be emitted. */
49278 static int
49279 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49280 struct cgraph_simd_clone *clonei,
49281 tree base_type, int num)
49283 int ret = 1;
49285 if (clonei->simdlen
49286 && (clonei->simdlen < 2
49287 || clonei->simdlen > 1024
49288 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
49290 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49291 "unsupported simdlen %d", clonei->simdlen);
49292 return 0;
49295 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
49296 if (TREE_CODE (ret_type) != VOID_TYPE)
49297 switch (TYPE_MODE (ret_type))
49299 case E_QImode:
49300 case E_HImode:
49301 case E_SImode:
49302 case E_DImode:
49303 case E_SFmode:
49304 case E_DFmode:
49305 /* case E_SCmode: */
49306 /* case E_DCmode: */
49307 break;
49308 default:
49309 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49310 "unsupported return type %qT for simd\n", ret_type);
49311 return 0;
49314 tree t;
49315 int i;
49317 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
49318 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
49319 switch (TYPE_MODE (TREE_TYPE (t)))
49321 case E_QImode:
49322 case E_HImode:
49323 case E_SImode:
49324 case E_DImode:
49325 case E_SFmode:
49326 case E_DFmode:
49327 /* case E_SCmode: */
49328 /* case E_DCmode: */
49329 break;
49330 default:
49331 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49332 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
49333 return 0;
49336 if (!TREE_PUBLIC (node->decl))
49338 /* If the function isn't exported, we can pick up just one ISA
49339 for the clones. */
49340 if (TARGET_AVX512F)
49341 clonei->vecsize_mangle = 'e';
49342 else if (TARGET_AVX2)
49343 clonei->vecsize_mangle = 'd';
49344 else if (TARGET_AVX)
49345 clonei->vecsize_mangle = 'c';
49346 else
49347 clonei->vecsize_mangle = 'b';
49348 ret = 1;
49350 else
49352 clonei->vecsize_mangle = "bcde"[num];
49353 ret = 4;
49355 clonei->mask_mode = VOIDmode;
49356 switch (clonei->vecsize_mangle)
49358 case 'b':
49359 clonei->vecsize_int = 128;
49360 clonei->vecsize_float = 128;
49361 break;
49362 case 'c':
49363 clonei->vecsize_int = 128;
49364 clonei->vecsize_float = 256;
49365 break;
49366 case 'd':
49367 clonei->vecsize_int = 256;
49368 clonei->vecsize_float = 256;
49369 break;
49370 case 'e':
49371 clonei->vecsize_int = 512;
49372 clonei->vecsize_float = 512;
49373 if (TYPE_MODE (base_type) == QImode)
49374 clonei->mask_mode = DImode;
49375 else
49376 clonei->mask_mode = SImode;
49377 break;
49379 if (clonei->simdlen == 0)
49381 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
49382 clonei->simdlen = clonei->vecsize_int;
49383 else
49384 clonei->simdlen = clonei->vecsize_float;
49385 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
49387 else if (clonei->simdlen > 16)
49389 /* For compatibility with ICC, use the same upper bounds
49390 for simdlen. In particular, for CTYPE below, use the return type,
49391 unless the function returns void, in which case use the characteristic
49392 type. If it is possible for given SIMDLEN to pass CTYPE value
49393 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
49394 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
49395 emit corresponding clone. */
49396 tree ctype = ret_type;
49397 if (TREE_CODE (ret_type) == VOID_TYPE)
49398 ctype = base_type;
49399 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
49400 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
49401 cnt /= clonei->vecsize_int;
49402 else
49403 cnt /= clonei->vecsize_float;
49404 if (cnt > (TARGET_64BIT ? 16 : 8))
49406 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49407 "unsupported simdlen %d", clonei->simdlen);
49408 return 0;
49411 return ret;
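/* As an informal example, for an exported clone whose characteristic type
   is float, the four ABI variants work out to simdlen 128/32 == 4 ('b',
   SSE2), 256/32 == 8 ('c', AVX), 256/32 == 8 ('d', AVX2) and
   512/32 == 16 ('e', AVX512F).  */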
49414 /* Add target attribute to SIMD clone NODE if needed. */
49416 static void
49417 ix86_simd_clone_adjust (struct cgraph_node *node)
49419 const char *str = NULL;
49420 gcc_assert (node->decl == cfun->decl);
49421 switch (node->simdclone->vecsize_mangle)
49423 case 'b':
49424 if (!TARGET_SSE2)
49425 str = "sse2";
49426 break;
49427 case 'c':
49428 if (!TARGET_AVX)
49429 str = "avx";
49430 break;
49431 case 'd':
49432 if (!TARGET_AVX2)
49433 str = "avx2";
49434 break;
49435 case 'e':
49436 if (!TARGET_AVX512F)
49437 str = "avx512f";
49438 break;
49439 default:
49440 gcc_unreachable ();
49442 if (str == NULL)
49443 return;
49444 push_cfun (NULL);
49445 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
49446 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
49447 gcc_assert (ok);
49448 pop_cfun ();
49449 ix86_reset_previous_fndecl ();
49450 ix86_set_current_function (node->decl);
49453 /* If SIMD clone NODE can't be used in a vectorized loop
49454 in current function, return -1, otherwise return a badness of using it
49455 (0 if it is most desirable from vecsize_mangle point of view, 1
49456 slightly less desirable, etc.). */
49458 static int
49459 ix86_simd_clone_usable (struct cgraph_node *node)
49461 switch (node->simdclone->vecsize_mangle)
49463 case 'b':
49464 if (!TARGET_SSE2)
49465 return -1;
49466 if (!TARGET_AVX)
49467 return 0;
49468 return TARGET_AVX2 ? 2 : 1;
49469 case 'c':
49470 if (!TARGET_AVX)
49471 return -1;
49472 return TARGET_AVX2 ? 1 : 0;
49473 case 'd':
49474 if (!TARGET_AVX2)
49475 return -1;
49476 return 0;
49477 case 'e':
49478 if (!TARGET_AVX512F)
49479 return -1;
49480 return 0;
49481 default:
49482 gcc_unreachable ();
49486 /* This function adjusts the unroll factor based on
49487 the hardware capabilities. For example, bdver3 has
49488 a loop buffer which makes unrolling of smaller
49489 loops less important. This function decides the
49490 unroll factor using the number of memory references
49491 (value 32 is used) as a heuristic. */
49493 static unsigned
49494 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
49496 basic_block *bbs;
49497 rtx_insn *insn;
49498 unsigned i;
49499 unsigned mem_count = 0;
49501 if (!TARGET_ADJUST_UNROLL)
49502 return nunroll;
49504 /* Count the number of memory references within the loop body.
49505 This value determines the unrolling factor for bdver3 and bdver4
49506 architectures. */
49507 subrtx_iterator::array_type array;
49508 bbs = get_loop_body (loop);
49509 for (i = 0; i < loop->num_nodes; i++)
49510 FOR_BB_INSNS (bbs[i], insn)
49511 if (NONDEBUG_INSN_P (insn))
49512 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
49513 if (const_rtx x = *iter)
49514 if (MEM_P (x))
49516 machine_mode mode = GET_MODE (x);
49517 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
49518 if (n_words > 4)
49519 mem_count += 2;
49520 else
49521 mem_count += 1;
49523 free (bbs);
49525 if (mem_count && mem_count <= 32)
49526 return 32 / mem_count;
49528 return nunroll;
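/* Informally: on a TARGET_ADJUST_UNROLL tune (bdver3/bdver4), a loop body
   with 8 counted memory references gets an unroll factor of 32 / 8 == 4,
   while a body with no references, or more than 32 of them, keeps the
   caller's NUNROLL.  */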
49532 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
49534 static bool
49535 ix86_float_exceptions_rounding_supported_p (void)
49537 /* For x87 floating point with standard excess precision handling,
49538 there is no adddf3 pattern (since x87 floating point only has
49539 XFmode operations) so the default hook implementation gets this
49540 wrong. */
49541 return TARGET_80387 || TARGET_SSE_MATH;
49544 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
49546 static void
49547 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
49549 if (!TARGET_80387 && !TARGET_SSE_MATH)
49550 return;
49551 tree exceptions_var = create_tmp_var_raw (integer_type_node);
49552 if (TARGET_80387)
49554 tree fenv_index_type = build_index_type (size_int (6));
49555 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
49556 tree fenv_var = create_tmp_var_raw (fenv_type);
49557 TREE_ADDRESSABLE (fenv_var) = 1;
49558 tree fenv_ptr = build_pointer_type (fenv_type);
49559 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
49560 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
49561 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
49562 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
49563 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
49564 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
49565 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
49566 tree hold_fnclex = build_call_expr (fnclex, 0);
49567 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
49568 NULL_TREE, NULL_TREE);
49569 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
49570 hold_fnclex);
49571 *clear = build_call_expr (fnclex, 0);
49572 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
49573 tree fnstsw_call = build_call_expr (fnstsw, 0);
49574 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
49575 sw_var, fnstsw_call);
49576 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
49577 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
49578 exceptions_var, exceptions_x87);
49579 *update = build2 (COMPOUND_EXPR, integer_type_node,
49580 sw_mod, update_mod);
49581 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
49582 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
49584 if (TARGET_SSE_MATH)
49586 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
49587 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
49588 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
49589 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
49590 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
49591 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
49592 mxcsr_orig_var, stmxcsr_hold_call);
49593 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
49594 mxcsr_orig_var,
49595 build_int_cst (unsigned_type_node, 0x1f80));
49596 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
49597 build_int_cst (unsigned_type_node, 0xffffffc0));
49598 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
49599 mxcsr_mod_var, hold_mod_val);
49600 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49601 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
49602 hold_assign_orig, hold_assign_mod);
49603 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
49604 ldmxcsr_hold_call);
49605 if (*hold)
49606 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
49607 else
49608 *hold = hold_all;
49609 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49610 if (*clear)
49611 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
49612 ldmxcsr_clear_call);
49613 else
49614 *clear = ldmxcsr_clear_call;
49615 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
49616 tree exceptions_sse = fold_convert (integer_type_node,
49617 stxmcsr_update_call);
49618 if (*update)
49620 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
49621 exceptions_var, exceptions_sse);
49622 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
49623 exceptions_var, exceptions_mod);
49624 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
49625 exceptions_assign);
49627 else
49628 *update = build2 (MODIFY_EXPR, integer_type_node,
49629 exceptions_var, exceptions_sse);
49630 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
49631 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49632 ldmxcsr_update_call);
49634 tree atomic_feraiseexcept
49635 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
49636 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
49637 1, exceptions_var);
49638 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49639 atomic_feraiseexcept_call);
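/* Informal summary of the three sequences built above: *HOLD saves the
   x87 environment (fnstenv) and/or MXCSR (stmxcsr), masks all exceptions
   and clears the sticky exception flags; *CLEAR clears the sticky flags
   again (fnclex / ldmxcsr of the masked value); *UPDATE reads the
   exceptions raised in the protected region (fnstsw / stmxcsr), restores
   the saved environment and re-raises them via __atomic_feraiseexcept.  */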
49642 /* Return mode to be used for bounds or VOIDmode
49643 if bounds are not supported. */
49645 static machine_mode
49646 ix86_mpx_bound_mode ()
49648 /* Do not support pointer checker if MPX
49649 is not enabled. */
49650 if (!TARGET_MPX)
49652 if (flag_check_pointer_bounds)
49653 warning (0, "Pointer Checker requires MPX support on this target."
49654 " Use -mmpx options to enable MPX.");
49655 return VOIDmode;
49658 return BNDmode;
49661 /* Return constant used to statically initialize constant bounds.
49663 This function is used to create special bound values. For now
49664 only INIT bounds and NONE bounds are expected. More special
49665 values may be added later. */
49667 static tree
49668 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
49670 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
49671 : build_zero_cst (pointer_sized_int_node);
49672 tree high = ub ? build_zero_cst (pointer_sized_int_node)
49673 : build_minus_one_cst (pointer_sized_int_node);
49675 /* This function is supposed to be used to create INIT and
49676 NONE bounds only. */
49677 gcc_assert ((lb == 0 && ub == -1)
49678 || (lb == -1 && ub == 0));
49680 return build_complex (NULL, low, high);
49683 /* Generate a list of statements STMTS to initialize pointer bounds
49684 variable VAR with bounds LB and UB. Return the number of generated
49685 statements. */
49687 static int
49688 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
49690 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
49691 tree lhs, modify, var_p;
49693 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
49694 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
49696 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
49697 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
49698 append_to_statement_list (modify, stmts);
49700 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
49701 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
49702 TYPE_SIZE_UNIT (pointer_sized_int_node)));
49703 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
49704 append_to_statement_list (modify, stmts);
49706 return 2;
49709 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
49710 /* For i386, a common symbol is local only for non-PIE binaries. For
49711 x86-64, a common symbol is local only for non-PIE binaries or if the
49712 linker supports copy relocs in PIE binaries. */
49714 static bool
49715 ix86_binds_local_p (const_tree exp)
49717 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
49718 (!flag_pic
49719 || (TARGET_64BIT
49720 && HAVE_LD_PIE_COPYRELOC != 0)));
49722 #endif
49724 /* If MEM is in the form of [base+offset], extract the two parts
49725 of address and set to BASE and OFFSET, otherwise return false. */
49727 static bool
49728 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
49730 rtx addr;
49732 gcc_assert (MEM_P (mem));
49734 addr = XEXP (mem, 0);
49736 if (GET_CODE (addr) == CONST)
49737 addr = XEXP (addr, 0);
49739 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
49741 *base = addr;
49742 *offset = const0_rtx;
49743 return true;
49746 if (GET_CODE (addr) == PLUS
49747 && (REG_P (XEXP (addr, 0))
49748 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49749 && CONST_INT_P (XEXP (addr, 1)))
49751 *base = XEXP (addr, 0);
49752 *offset = XEXP (addr, 1);
49753 return true;
49756 return false;
49759 /* Given OPERANDS of consecutive load/store, check if we can merge
49760 them into move multiple. LOAD is true if they are load instructions.
49761 MODE is the mode of memory operands. */
49763 bool
49764 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
49765 machine_mode mode)
49767 HOST_WIDE_INT offval_1, offval_2, msize;
49768 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
49770 if (load)
49772 mem_1 = operands[1];
49773 mem_2 = operands[3];
49774 reg_1 = operands[0];
49775 reg_2 = operands[2];
49777 else
49779 mem_1 = operands[0];
49780 mem_2 = operands[2];
49781 reg_1 = operands[1];
49782 reg_2 = operands[3];
49785 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
49787 if (REGNO (reg_1) != REGNO (reg_2))
49788 return false;
49790 /* Check if the addresses are in the form of [base+offset]. */
49791 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
49792 return false;
49793 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
49794 return false;
49796 /* Check if the bases are the same. */
49797 if (!rtx_equal_p (base_1, base_2))
49798 return false;
49800 offval_1 = INTVAL (offset_1);
49801 offval_2 = INTVAL (offset_2);
49802 msize = GET_MODE_SIZE (mode);
49803 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
49804 if (offval_1 + msize != offval_2)
49805 return false;
49807 return true;
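/* For example (informally), a DImode pair loading from [base + 0] and
   [base + 8] passes these checks only when both register operands have
   the same REGNO, the bases are rtx_equal_p and the second offset equals
   the first plus GET_MODE_SIZE (DImode) == 8.  */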
49810 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
49812 static bool
49813 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
49814 optimization_type opt_type)
49816 switch (op)
49818 case asin_optab:
49819 case acos_optab:
49820 case log1p_optab:
49821 case exp_optab:
49822 case exp10_optab:
49823 case exp2_optab:
49824 case expm1_optab:
49825 case ldexp_optab:
49826 case scalb_optab:
49827 case round_optab:
49828 return opt_type == OPTIMIZE_FOR_SPEED;
49830 case rint_optab:
49831 if (SSE_FLOAT_MODE_P (mode1)
49832 && TARGET_SSE_MATH
49833 && !flag_trapping_math
49834 && !TARGET_SSE4_1)
49835 return opt_type == OPTIMIZE_FOR_SPEED;
49836 return true;
49838 case floor_optab:
49839 case ceil_optab:
49840 case btrunc_optab:
49841 if (SSE_FLOAT_MODE_P (mode1)
49842 && TARGET_SSE_MATH
49843 && !flag_trapping_math
49844 && TARGET_SSE4_1)
49845 return true;
49846 return opt_type == OPTIMIZE_FOR_SPEED;
49848 case rsqrt_optab:
49849 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
49851 default:
49852 return true;
49856 /* Address space support.
49858 This is not "far pointers" in the 16-bit sense, but an easy way
49859 to use %fs and %gs segment prefixes. Therefore:
49861 (a) All address spaces have the same modes,
49862 (b) All address spaces have the same address forms,
49863 (c) While %fs and %gs are technically subsets of the generic
49864 address space, they are probably not subsets of each other.
49865 (d) Since we have no access to the segment base register values
49866 without resorting to a system call, we cannot convert a
49867 non-default address space to a default address space.
49868 Therefore we do not claim %fs or %gs are subsets of generic.
49870 Therefore we can (mostly) use the default hooks. */
49872 /* All use of segmentation is assumed to make address 0 valid. */
49874 static bool
49875 ix86_addr_space_zero_address_valid (addr_space_t as)
49877 return as != ADDR_SPACE_GENERIC;
49880 static void
49881 ix86_init_libfuncs (void)
49883 if (TARGET_64BIT)
49885 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
49886 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
49888 else
49890 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
49891 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
49894 #if TARGET_MACHO
49895 darwin_rename_builtins ();
49896 #endif
49899 /* Generate call to __divmoddi4. */
49901 static void
49902 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
49903 rtx op0, rtx op1,
49904 rtx *quot_p, rtx *rem_p)
49906 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
49908 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
49909 mode,
49910 op0, GET_MODE (op0),
49911 op1, GET_MODE (op1),
49912 XEXP (rem, 0), Pmode);
49913 *quot_p = quot;
49914 *rem_p = rem;
49917 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
49918 FPU, assume that the fpcw is set to extended precision; when using
49919 only SSE, rounding is correct; when using both SSE and the FPU,
49920 the rounding precision is indeterminate, since either may be chosen
49921 apparently at random. */
49923 static enum flt_eval_method
49924 ix86_excess_precision (enum excess_precision_type type)
49926 switch (type)
49928 case EXCESS_PRECISION_TYPE_FAST:
49929 /* The fastest type to promote to will always be the native type,
49930 whether that occurs with implicit excess precision or
49931 otherwise. */
49932 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49933 case EXCESS_PRECISION_TYPE_STANDARD:
49934 case EXCESS_PRECISION_TYPE_IMPLICIT:
49935 /* Otherwise, the excess precision we want when we are
49936 in a standards compliant mode, and the implicit precision we
49937 provide would be identical were it not for the unpredictable
49938 cases. */
49939 if (!TARGET_80387)
49940 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49941 else if (!TARGET_MIX_SSE_I387)
49943 if (!TARGET_SSE_MATH)
49944 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
49945 else if (TARGET_SSE2)
49946 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49949 /* If we are in standards compliant mode, but we know we will
49950 calculate in unpredictable precision, return
49951 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
49952 excess precision if the target can't guarantee it will honor
49953 it. */
49954 return (type == EXCESS_PRECISION_TYPE_STANDARD
49955 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
49956 : FLT_EVAL_METHOD_UNPREDICTABLE);
49957 default:
49958 gcc_unreachable ();
49961 return FLT_EVAL_METHOD_UNPREDICTABLE;
49964 /* Target-specific selftests. */
49966 #if CHECKING_P
49968 namespace selftest {
49970 /* Verify that hard regs are dumped as expected (in compact mode). */
49972 static void
49973 ix86_test_dumping_hard_regs ()
49975 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
49976 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
49979 /* Test dumping an insn with repeated references to the same SCRATCH,
49980 to verify the rtx_reuse code. */
49982 static void
49983 ix86_test_dumping_memory_blockage ()
49985 set_new_first_and_last_insn (NULL, NULL);
49987 rtx pat = gen_memory_blockage ();
49988 rtx_reuse_manager r;
49989 r.preprocess (pat);
49991 /* Verify that the repeated references to the SCRATCH show the use of
49992 reuse IDs. The first should be prefixed with a reuse ID,
49993 and the second should be dumped as a "reuse_rtx" of that ID.
49994 The expected string assumes Pmode == DImode. */
49995 if (Pmode == DImode)
49996 ASSERT_RTL_DUMP_EQ_WITH_REUSE
49997 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
49998 " (unspec:BLK [\n"
49999 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
50000 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
/* Verify loading an RTL dump; specifically a dump of copying
   a param on x86_64 from a hard reg into the frame.
   This test is target-specific since the dump contains target-specific
   hard reg names.  */

static void
ix86_test_loading_dump_fragment_1 ()
{
  rtl_dump_test t (SELFTEST_LOCATION,
		   locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));

  rtx_insn *insn = get_insn_by_uid (1);

  /* The block structure and indentation here is purely for
     readability; it mirrors the structure of the rtx.  */
  tree mem_expr;
  {
    rtx pat = PATTERN (insn);
    ASSERT_EQ (SET, GET_CODE (pat));
    {
      rtx dest = SET_DEST (pat);
      ASSERT_EQ (MEM, GET_CODE (dest));
      /* Verify the "/c" was parsed.  */
      ASSERT_TRUE (RTX_FLAG (dest, call));
      ASSERT_EQ (SImode, GET_MODE (dest));
      {
	rtx addr = XEXP (dest, 0);
	ASSERT_EQ (PLUS, GET_CODE (addr));
	ASSERT_EQ (DImode, GET_MODE (addr));
	{
	  rtx lhs = XEXP (addr, 0);
	  /* Verify that the "frame" REG was consolidated.  */
	  ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
	}
	{
	  rtx rhs = XEXP (addr, 1);
	  ASSERT_EQ (CONST_INT, GET_CODE (rhs));
	  ASSERT_EQ (-4, INTVAL (rhs));
	}
      }
      /* Verify the "[1 i+0 S4 A32]" was parsed.  */
      ASSERT_EQ (1, MEM_ALIAS_SET (dest));
      /* "i" should have been handled by synthesizing a global int
	 variable named "i".  */
      mem_expr = MEM_EXPR (dest);
      ASSERT_NE (mem_expr, NULL);
      ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
      ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
      ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
      ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
      /* "+0".  */
      ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
      ASSERT_EQ (0, MEM_OFFSET (dest));
      /* "S4".  */
      ASSERT_EQ (4, MEM_SIZE (dest));
      /* "A32".  */
      ASSERT_EQ (32, MEM_ALIGN (dest));
    }
    {
      rtx src = SET_SRC (pat);
      ASSERT_EQ (REG, GET_CODE (src));
      ASSERT_EQ (SImode, GET_MODE (src));
      ASSERT_EQ (5, REGNO (src));
      tree reg_expr = REG_EXPR (src);
      /* "i" here should point to the same var as for the MEM_EXPR.  */
      ASSERT_EQ (reg_expr, mem_expr);
    }
  }
}
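/* Taken together, the assertions above describe an insn whose pattern has
   roughly this shape (an approximation; the authoritative form is the
   .rtl file loaded via locate_file above):

     (set (mem/c:SI (plus:DI (reg/f:DI frame)
                             (const_int -4)) [1 i+0 S4 A32])
          (reg:SI di))

   i.e. a 32-bit value arriving in register 5 (%edi in the x86_64 ABI)
   being stored into a 4-byte frame slot for the variable "i".  */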
/* Verify that the RTL loader copes with a call_insn dump.
   This test is target-specific since the dump contains a target-specific
   hard reg name.  */

static void
ix86_test_loading_call_insn ()
{
  /* The test dump includes register "xmm0", which requires TARGET_SSE
     to exist.  */
  if (!TARGET_SSE)
    return;

  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));

  rtx_insn *insn = get_insns ();
  ASSERT_EQ (CALL_INSN, GET_CODE (insn));

  /* "/j".  */
  ASSERT_TRUE (RTX_FLAG (insn, jump));

  rtx pat = PATTERN (insn);
  ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));

  /* Verify REG_NOTES.  */
  {
    /* "(expr_list:REG_CALL_DECL".  */
    ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
    rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
    ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));

    /* "(expr_list:REG_EH_REGION (const_int 0 [0])".  */
    rtx_expr_list *note1 = note0->next ();
    ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));

    ASSERT_EQ (NULL, note1->next ());
  }

  /* Verify CALL_INSN_FUNCTION_USAGE.  */
  {
    /* "(expr_list:DF (use (reg:DF 21 xmm0))".  */
    rtx_expr_list *usage
      = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
    ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
    ASSERT_EQ (DFmode, GET_MODE (usage));
    ASSERT_EQ (USE, GET_CODE (usage->element ()));
    ASSERT_EQ (NULL, usage->next ());
  }
}
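/* In outline, the call_insn described by the assertions above is a
   sibling call (hence the "/j" flag), whose PATTERN is a SET with a CALL
   as its source, carrying a REG_CALL_DECL and a REG_EH_REGION note, and
   whose CALL_INSN_FUNCTION_USAGE is a single-element list containing
   (use (reg:DF xmm0)) - presumably a DFmode value passed in %xmm0.  */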
/* Verify that the RTL loader copes with a dump from print_rtx_function.
   This test is target-specific since the dump contains target-specific
   hard reg names.  */

static void
ix86_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_7 = get_insn_by_uid (7);
  ASSERT_EQ (INSN, GET_CODE (insn_7));
  ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}
/* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
   In particular, verify that it correctly loads the 2nd operand.
   This test is target-specific since these are machine-specific
   operands (and enums).  */

static void
ix86_test_loading_unspec ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));

  ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  ASSERT_TRUE (cfun);

  /* Test of an UNSPEC.  */
  rtx_insn *insn = get_insns ();
  ASSERT_EQ (INSN, GET_CODE (insn));
  rtx set = single_set (insn);
  ASSERT_NE (NULL, set);
  rtx dst = SET_DEST (set);
  ASSERT_EQ (MEM, GET_CODE (dst));
  rtx src = SET_SRC (set);
  ASSERT_EQ (UNSPEC, GET_CODE (src));
  ASSERT_EQ (BLKmode, GET_MODE (src));
  ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));

  rtx v0 = XVECEXP (src, 0, 0);

  /* Verify that the two uses of the first SCRATCH have pointer
     equality.  */
  rtx scratch_a = XEXP (dst, 0);
  ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));

  rtx scratch_b = XEXP (v0, 0);
  ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));

  ASSERT_EQ (scratch_a, scratch_b);

  /* Verify that the two mems are thus treated as equal.  */
  ASSERT_TRUE (rtx_equal_p (dst, v0));

  /* Verify that the insn is recognized.  */
  ASSERT_NE (-1, recog_memoized (insn));

  /* Test of an UNSPEC_VOLATILE, which has its own enum values.  */
  insn = NEXT_INSN (insn);
  ASSERT_EQ (INSN, GET_CODE (insn));

  set = single_set (insn);
  ASSERT_NE (NULL, set);

  src = SET_SRC (set);
  ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
  ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
}
/* Run all target-specific selftests.  */

static void
ix86_run_selftests (void)
{
  ix86_test_dumping_hard_regs ();
  ix86_test_dumping_memory_blockage ();

  /* Various tests of loading RTL dumps, here because they contain
     ix86-isms (e.g. names of hard regs).  */
  ix86_test_loading_dump_fragment_1 ();
  ix86_test_loading_call_insn ();
  ix86_test_loading_full_dump ();
  ix86_test_loading_unspec ();
}

} // namespace selftest

#endif /* CHECKING_P */
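/* These selftests are wired up via the TARGET_RUN_TARGET_SELFTESTS hook
   defined below and, like the target-independent selftests, are run when
   cc1 is invoked with -fself-test during the build (e.g. via the
   "selftest" make target).  */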
/* Initialize the GCC target structure.  */
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY ix86_return_in_memory

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
#undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
#define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
# undef TARGET_MERGE_DECL_ATTRIBUTES
# define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
#endif

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS ix86_init_builtins
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL ix86_builtin_decl
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN ix86_expand_builtin

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  ix86_builtin_vectorized_function

#undef TARGET_VECTORIZE_BUILTIN_GATHER
#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather

#undef TARGET_VECTORIZE_BUILTIN_SCATTER
#define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue

#undef TARGET_ENCODE_SECTION_INFO
#ifndef SUBTARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
#else
#define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
#endif

#undef TARGET_ASM_OPEN_PAREN
#define TARGET_ASM_OPEN_PAREN ""
#undef TARGET_ASM_CLOSE_PAREN
#define TARGET_ASM_CLOSE_PAREN ""

#undef TARGET_ASM_BYTE_OP
#define TARGET_ASM_BYTE_OP ASM_BYTE

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
#ifdef ASM_QUAD
#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
#endif

#undef TARGET_PROFILE_BEFORE_PROLOGUE
#define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue

#undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
#define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name

#undef TARGET_ASM_UNALIGNED_HI_OP
#define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
#undef TARGET_ASM_UNALIGNED_SI_OP
#define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
#undef TARGET_ASM_UNALIGNED_DI_OP
#define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND ix86_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra

#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  ia32_multipass_dfa_lookahead
#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall

#undef TARGET_MEMMODEL_CHECK
#define TARGET_MEMMODEL_CHECK ix86_memmodel_check

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
#endif
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true

#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address

#undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
#define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p

#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p

#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#else
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P ix86_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM FLAGS_REG
#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION ix86_excess_precision
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
#define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_INIT_PIC_REG
#define TARGET_INIT_PIC_REG ix86_init_pic_reg
#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
#undef TARGET_WARN_FUNC_RETURN
#define TARGET_WARN_FUNC_RETURN ix86_warn_func_return

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload
#undef TARGET_SECONDARY_MEMORY_NEEDED
#define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
#undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
#define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST ix86_vectorize_vec_perm_const
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_POST_STREAM_IN
#define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_COMPUTE_FRAME_LAYOUT
#define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#undef TARGET_CANONICALIZE_COMPARISON
#define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison

#undef TARGET_LOOP_UNROLL_ADJUST
#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust

/* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  ix86_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST \
  ix86_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE \
  ix86_simd_clone_usable

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p

#undef TARGET_MODE_EMIT
#define TARGET_MODE_EMIT ix86_emit_mode_set

#undef TARGET_MODE_NEEDED
#define TARGET_MODE_NEEDED ix86_mode_needed

#undef TARGET_MODE_AFTER
#define TARGET_MODE_AFTER ix86_mode_after

#undef TARGET_MODE_ENTRY
#define TARGET_MODE_ENTRY ix86_mode_entry

#undef TARGET_MODE_EXIT
#define TARGET_MODE_EXIT ix86_mode_exit

#undef TARGET_MODE_PRIORITY
#define TARGET_MODE_PRIORITY ix86_mode_priority

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
#undef TARGET_LOAD_BOUNDS_FOR_ARG
#define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds

#undef TARGET_STORE_BOUNDS_FOR_ARG
#define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds

#undef TARGET_LOAD_RETURNED_BOUNDS
#define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds

#undef TARGET_STORE_RETURNED_BOUNDS
#define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds

#undef TARGET_CHKP_BOUND_MODE
#define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode

#undef TARGET_BUILTIN_CHKP_FUNCTION
#define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function

#undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
#define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds

#undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
#define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant

#undef TARGET_CHKP_INITIALIZE_BOUNDS
#define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds

#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS \
  ix86_offload_options

#undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
#define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p

#undef TARGET_HARD_REGNO_SCRATCH_OK
#define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok

#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1

#undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
#define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p

#undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
#define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS ix86_init_libfuncs

#undef TARGET_EXPAND_DIVMOD_LIBFUNC
#define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc

#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
#define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost

#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  ix86_hard_regno_call_part_clobbered

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class

#undef TARGET_STATIC_RTX_ALIGNMENT
#define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment

#undef TARGET_EMPTY_RECORD_P
#define TARGET_EMPTY_RECORD_P ix86_is_empty_record

#undef TARGET_WARN_PARAMETER_PASSING_ABI
#define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
#endif /* #if CHECKING_P */

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-i386.h"