1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88 #include "symbol-summary.h"
89 #include "ipa-prop.h"
90 #include "ipa-fnsummary.h"
92 /* This file should be included last. */
93 #include "target-def.h"
95 #include "x86-tune-costs.h"
97 static rtx legitimize_dllimport_symbol (rtx, bool);
98 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
99 static rtx legitimize_pe_coff_symbol (rtx, bool);
100 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
101 static bool ix86_save_reg (unsigned int, bool, bool);
102 static bool ix86_function_naked (const_tree);
103 static bool ix86_notrack_prefixed_insn_p (rtx);
104 static void ix86_emit_restore_reg_using_pop (rtx);
107 #ifndef CHECK_STACK_LIMIT
108 #define CHECK_STACK_LIMIT (-1)
109 #endif
111 /* Return index of given mode in mult and division cost tables. */
112 #define MODE_INDEX(mode) \
113 ((mode) == QImode ? 0 \
114 : (mode) == HImode ? 1 \
115 : (mode) == SImode ? 2 \
116 : (mode) == DImode ? 3 \
117 : 4)
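/* Illustrative note (not part of the upstream file): MODE_INDEX selects the
   slot of a scalar integer mode in the per-processor multiply and divide
   cost arrays, e.g. MODE_INDEX (QImode) == 0, MODE_INDEX (SImode) == 2 and
   MODE_INDEX (DImode) == 3, with index 4 used for any other mode.  */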
120 /* Set by -mtune. */
121 const struct processor_costs *ix86_tune_cost = NULL;
123 /* Set by -mtune or -Os. */
124 const struct processor_costs *ix86_cost = NULL;
126 /* Processor feature/optimization bitmasks. */
127 #define m_386 (1U<<PROCESSOR_I386)
128 #define m_486 (1U<<PROCESSOR_I486)
129 #define m_PENT (1U<<PROCESSOR_PENTIUM)
130 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
131 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
132 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
133 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
134 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
135 #define m_CORE2 (1U<<PROCESSOR_CORE2)
136 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
137 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
138 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
139 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
140 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
141 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
142 #define m_KNL (1U<<PROCESSOR_KNL)
143 #define m_KNM (1U<<PROCESSOR_KNM)
144 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
145 #define m_CANNONLAKE (1U<<PROCESSOR_CANNONLAKE)
146 #define m_INTEL (1U<<PROCESSOR_INTEL)
148 #define m_GEODE (1U<<PROCESSOR_GEODE)
149 #define m_K6 (1U<<PROCESSOR_K6)
150 #define m_K6_GEODE (m_K6 | m_GEODE)
151 #define m_K8 (1U<<PROCESSOR_K8)
152 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
153 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
154 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
155 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
156 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
157 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
158 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
159 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
160 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
161 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
162 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
163 #define m_BTVER (m_BTVER1 | m_BTVER2)
164 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
165 | m_ZNVER1)
167 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
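/* Illustrative note (not part of the upstream file): the m_* masks above are
   OR'ed together to form the per-feature selectors in x86-tune.def, e.g. a
   selector such as (m_CORE_ALL | m_GENERIC) enables a tuning for all
   Core-derived processors and for the generic tuning; those selectors are
   expanded into initial_ix86_tune_features below.  */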
169 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
170 #undef DEF_TUNE
171 #define DEF_TUNE(tune, name, selector) name,
172 #include "x86-tune.def"
173 #undef DEF_TUNE
176 /* Feature tests against the various tunings. */
177 unsigned char ix86_tune_features[X86_TUNE_LAST];
179 /* Feature tests against the various tunings used to create ix86_tune_features
180 based on the processor mask. */
181 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
182 #undef DEF_TUNE
183 #define DEF_TUNE(tune, name, selector) selector,
184 #include "x86-tune.def"
185 #undef DEF_TUNE
188 /* Feature tests against the various architecture variations. */
189 unsigned char ix86_arch_features[X86_ARCH_LAST];
191 /* Feature tests against the various architecture variations, used to create
192 ix86_arch_features based on the processor mask. */
193 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
194 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
195 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
197 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
198 ~m_386,
200 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
201 ~(m_386 | m_486),
203 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
204 ~m_386,
206 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
207 ~m_386,
210 /* In case the average insn count for single function invocation is
211 lower than this constant, emit fast (but longer) prologue and
212 epilogue code. */
213 #define FAST_PROLOGUE_INSN_COUNT 20
215 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
216 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
217 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
218 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
220 /* Array of the smallest class containing reg number REGNO, indexed by
221 REGNO. Used by REGNO_REG_CLASS in i386.h. */
223 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
225 /* ax, dx, cx, bx */
226 AREG, DREG, CREG, BREG,
227 /* si, di, bp, sp */
228 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
229 /* FP registers */
230 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
231 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
232 /* arg pointer */
233 NON_Q_REGS,
234 /* flags, fpsr, fpcr, frame */
235 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
236 /* SSE registers */
237 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
238 SSE_REGS, SSE_REGS,
239 /* MMX registers */
240 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
241 MMX_REGS, MMX_REGS,
242 /* REX registers */
243 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
244 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
245 /* SSE REX registers */
246 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
247 SSE_REGS, SSE_REGS,
248 /* AVX-512 SSE registers */
249 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
250 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
251 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
252 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
253 /* Mask registers. */
254 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
255 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
256 /* MPX bound registers */
257 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
260 /* The "default" register map used in 32bit mode. */
262 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
264 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
265 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
266 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
267 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
268 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
269 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
270 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
271 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
272 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
273 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
274 101, 102, 103, 104, /* bound registers */
277 /* The "default" register map used in 64bit mode. */
279 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
281 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
282 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
283 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
284 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
285 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
286 8,9,10,11,12,13,14,15, /* extended integer registers */
287 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
288 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
289 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
290 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
291 126, 127, 128, 129, /* bound registers */
294 /* Define the register numbers to be used in Dwarf debugging information.
295 The SVR4 reference port C compiler uses the following register numbers
296 in its Dwarf output code:
297 0 for %eax (gcc regno = 0)
298 1 for %ecx (gcc regno = 2)
299 2 for %edx (gcc regno = 1)
300 3 for %ebx (gcc regno = 3)
301 4 for %esp (gcc regno = 7)
302 5 for %ebp (gcc regno = 6)
303 6 for %esi (gcc regno = 4)
304 7 for %edi (gcc regno = 5)
305 The following three DWARF register numbers are never generated by
306 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
307 believes these numbers have these meanings.
308 8 for %eip (no gcc equivalent)
309 9 for %eflags (gcc regno = 17)
310 10 for %trapno (no gcc equivalent)
311 It is not at all clear how we should number the FP stack registers
312 for the x86 architecture. If the version of SDB on x86/svr4 were
313 a bit less brain dead with respect to floating-point then we would
314 have a precedent to follow with respect to DWARF register numbers
315 for x86 FP registers, but the SDB on x86/svr4 was so completely
316 broken with respect to FP registers that it is hardly worth thinking
317 of it as something to strive for compatibility with.
318 The version of x86/svr4 SDB I had did (partially)
319 seem to believe that DWARF register number 11 is associated with
320 the x86 register %st(0), but that's about all. Higher DWARF
321 register numbers don't seem to be associated with anything in
322 particular, and even for DWARF regno 11, SDB only seemed to under-
323 stand that it should say that a variable lives in %st(0) (when
324 asked via an `=' command) if we said it was in DWARF regno 11,
325 but SDB still printed garbage when asked for the value of the
326 variable in question (via a `/' command).
327 (Also note that the labels SDB printed for various FP stack regs
328 when doing an `x' command were all wrong.)
329 Note that these problems generally don't affect the native SVR4
330 C compiler because it doesn't allow the use of -O with -g and
331 because when it is *not* optimizing, it allocates a memory
332 location for each floating-point variable, and the memory
333 location is what gets described in the DWARF AT_location
334 attribute for the variable in question.
335 Regardless of the severe mental illness of the x86/svr4 SDB, we
336 do something sensible here and we use the following DWARF
337 register numbers. Note that these are all stack-top-relative
338 numbers.
339 11 for %st(0) (gcc regno = 8)
340 12 for %st(1) (gcc regno = 9)
341 13 for %st(2) (gcc regno = 10)
342 14 for %st(3) (gcc regno = 11)
343 15 for %st(4) (gcc regno = 12)
344 16 for %st(5) (gcc regno = 13)
345 17 for %st(6) (gcc regno = 14)
346 18 for %st(7) (gcc regno = 15)
348 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
350 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
351 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
352 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
353 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
354 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
355 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
356 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
357 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
358 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
359 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
360 101, 102, 103, 104, /* bound registers */
363 /* Define parameter passing and return registers. */
365 static int const x86_64_int_parameter_registers[6] =
367 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
370 static int const x86_64_ms_abi_int_parameter_registers[4] =
372 CX_REG, DX_REG, R8_REG, R9_REG
375 static int const x86_64_int_return_registers[4] =
377 AX_REG, DX_REG, DI_REG, SI_REG
380 /* Additional registers that are clobbered by SYSV calls. */
382 #define NUM_X86_64_MS_CLOBBERED_REGS 12
383 static int const x86_64_ms_sysv_extra_clobbered_registers
384 [NUM_X86_64_MS_CLOBBERED_REGS] =
386 SI_REG, DI_REG,
387 XMM6_REG, XMM7_REG,
388 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
389 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
392 enum xlogue_stub {
393 XLOGUE_STUB_SAVE,
394 XLOGUE_STUB_RESTORE,
395 XLOGUE_STUB_RESTORE_TAIL,
396 XLOGUE_STUB_SAVE_HFP,
397 XLOGUE_STUB_RESTORE_HFP,
398 XLOGUE_STUB_RESTORE_HFP_TAIL,
400 XLOGUE_STUB_COUNT
403 enum xlogue_stub_sets {
404 XLOGUE_SET_ALIGNED,
405 XLOGUE_SET_ALIGNED_PLUS_8,
406 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
407 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
409 XLOGUE_SET_COUNT
412 /* Register save/restore layout used by out-of-line stubs. */
413 class xlogue_layout {
414 public:
415 struct reginfo
417 unsigned regno;
418 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
419 rsi) to where each register is stored. */
422 unsigned get_nregs () const {return m_nregs;}
423 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
425 const reginfo &get_reginfo (unsigned reg) const
427 gcc_assert (reg < m_nregs);
428 return m_regs[reg];
431 static const char *get_stub_name (enum xlogue_stub stub,
432 unsigned n_extra_args);
434 /* Returns an rtx for the stub's symbol based upon
435 1.) the specified stub (save, restore or restore_ret) and
436 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
437 3.) whether or not stack alignment is being performed. */
438 static rtx get_stub_rtx (enum xlogue_stub stub);
440 /* Returns the amount of stack space (including padding) that the stub
441 needs to store registers based upon data in the machine_function. */
442 HOST_WIDE_INT get_stack_space_used () const
444 const struct machine_function *m = cfun->machine;
445 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
447 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
448 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
451 /* Returns the offset for the base pointer used by the stub. */
452 HOST_WIDE_INT get_stub_ptr_offset () const
454 return STUB_INDEX_OFFSET + m_stack_align_off_in;
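/* Illustrative note (not part of the upstream file): with STUB_INDEX_OFFSET
   equal to 0x70, the offset returned above is 0x70 for a 16-byte-aligned
   incoming stack and 0x78 when the incoming stack is offset by 8 bytes,
   the two cases the s_instances below are built for.  */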
457 static const struct xlogue_layout &get_instance ();
458 static unsigned count_stub_managed_regs ();
459 static bool is_stub_managed_reg (unsigned regno, unsigned count);
461 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
462 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
463 static const unsigned MAX_REGS = 18;
464 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
465 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
466 static const unsigned STUB_NAME_MAX_LEN = 20;
467 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
468 static const unsigned REG_ORDER[MAX_REGS];
469 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
471 private:
472 xlogue_layout ();
473 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
474 xlogue_layout (const xlogue_layout &);
476 /* True if hard frame pointer is used. */
477 bool m_hfp;
479 /* Max number of registers this layout manages. */
480 unsigned m_nregs;
482 /* Incoming offset from 16-byte alignment. */
483 HOST_WIDE_INT m_stack_align_off_in;
485 /* Register order and offsets. */
486 struct reginfo m_regs[MAX_REGS];
488 /* Lazy-inited cache of symbol names for stubs. */
489 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
490 [STUB_NAME_MAX_LEN];
492 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
495 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
496 "savms64",
497 "resms64",
498 "resms64x",
499 "savms64f",
500 "resms64f",
501 "resms64fx"
504 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
505 /* The below offset values are where each register is stored for the layout
506 relative to incoming stack pointer. The value of each m_regs[].offset will
507 be relative to the incoming base pointer (rax or rsi) used by the stub.
509 s_instances: 0 1 2 3
510 Offset: realigned or aligned + 8
511 Register aligned aligned + 8 aligned w/HFP w/HFP */
512 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
513 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
514 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
515 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
516 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
517 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
518 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
519 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
520 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
521 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
522 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
523 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
524 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
525 BP_REG, /* 0xc0 0xc8 N/A N/A */
526 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
527 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
528 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
529 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
532 /* Instantiate static const values. */
533 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
534 const unsigned xlogue_layout::MIN_REGS;
535 const unsigned xlogue_layout::MAX_REGS;
536 const unsigned xlogue_layout::MAX_EXTRA_REGS;
537 const unsigned xlogue_layout::VARIANT_COUNT;
538 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
540 /* Initialize xlogue_layout::s_stub_names to zero. */
541 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
542 [STUB_NAME_MAX_LEN];
544 /* Instantiates all xlogue_layout instances. */
545 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
546 xlogue_layout (0, false),
547 xlogue_layout (8, false),
548 xlogue_layout (0, true),
549 xlogue_layout (8, true)
552 /* Return an appropriate const instance of xlogue_layout based upon values
553 in cfun->machine and crtl. */
554 const struct xlogue_layout &
555 xlogue_layout::get_instance ()
557 enum xlogue_stub_sets stub_set;
558 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
560 if (stack_realign_fp)
561 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
562 else if (frame_pointer_needed)
563 stub_set = aligned_plus_8
564 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
565 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
566 else
567 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
569 return s_instances[stub_set];
572 /* Determine how many clobbered registers can be saved by the stub.
573 Returns the count of registers the stub will save and restore. */
574 unsigned
575 xlogue_layout::count_stub_managed_regs ()
577 bool hfp = frame_pointer_needed || stack_realign_fp;
578 unsigned i, count;
579 unsigned regno;
581 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
583 regno = REG_ORDER[i];
584 if (regno == BP_REG && hfp)
585 continue;
586 if (!ix86_save_reg (regno, false, false))
587 break;
588 ++count;
590 return count;
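/* Illustrative note (not part of the upstream file): the count above starts
   at MIN_REGS (the 12 registers the stubs always handle) and grows by one
   for each further register in REG_ORDER that the function needs saved,
   stopping at the first one it does not; BP_REG is skipped when a frame
   pointer is in use.  */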
593 /* Determine if register REGNO is a stub managed register given the
594 total COUNT of stub managed registers. */
595 bool
596 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
598 bool hfp = frame_pointer_needed || stack_realign_fp;
599 unsigned i;
601 for (i = 0; i < count; ++i)
603 gcc_assert (i < MAX_REGS);
604 if (REG_ORDER[i] == BP_REG && hfp)
605 ++count;
606 else if (REG_ORDER[i] == regno)
607 return true;
609 return false;
612 /* Constructor for xlogue_layout. */
613 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
614 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
615 m_stack_align_off_in (stack_align_off_in)
617 HOST_WIDE_INT offset = stack_align_off_in;
618 unsigned i, j;
620 for (i = j = 0; i < MAX_REGS; ++i)
622 unsigned regno = REG_ORDER[i];
624 if (regno == BP_REG && hfp)
625 continue;
626 if (SSE_REGNO_P (regno))
628 offset += 16;
629 /* Verify that SSE regs are always aligned. */
630 gcc_assert (!((stack_align_off_in + offset) & 15));
632 else
633 offset += 8;
635 m_regs[j].regno = regno;
636 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
638 gcc_assert (j == m_nregs);
641 const char *
642 xlogue_layout::get_stub_name (enum xlogue_stub stub,
643 unsigned n_extra_regs)
645 const int have_avx = TARGET_AVX;
646 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
648 /* Lazy init */
649 if (!*name)
651 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
652 (have_avx ? "avx" : "sse"),
653 STUB_BASE_NAMES[stub],
654 MIN_REGS + n_extra_regs);
655 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
658 return name;
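/* Illustrative note (not part of the upstream file): with MIN_REGS == 12,
   the name built above for XLOGUE_STUB_SAVE with no extra registers is
   "__sse_savms64_12" on a non-AVX target and "__avx_savms64_12" when
   TARGET_AVX is set.  */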
661 /* Return rtx of a symbol ref for the entry point (based upon
662 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
664 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
666 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
667 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
668 gcc_assert (stub < XLOGUE_STUB_COUNT);
669 gcc_assert (crtl->stack_realign_finalized);
671 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
674 /* Define the structure for the machine field in struct function. */
676 struct GTY(()) stack_local_entry {
677 unsigned short mode;
678 unsigned short n;
679 rtx rtl;
680 struct stack_local_entry *next;
683 /* Which cpu are we scheduling for. */
684 enum attr_cpu ix86_schedule;
686 /* Which cpu are we optimizing for. */
687 enum processor_type ix86_tune;
689 /* Which instruction set architecture to use. */
690 enum processor_type ix86_arch;
692 /* True if processor has SSE prefetch instruction. */
693 unsigned char x86_prefetch_sse;
695 /* -mstackrealign option */
696 static const char ix86_force_align_arg_pointer_string[]
697 = "force_align_arg_pointer";
699 static rtx (*ix86_gen_leave) (void);
700 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
701 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
702 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
703 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
704 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
705 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
706 static rtx (*ix86_gen_clzero) (rtx);
707 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
708 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
709 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
711 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
714 /* Preferred alignment for stack boundary in bits. */
715 unsigned int ix86_preferred_stack_boundary;
717 /* Alignment for incoming stack boundary in bits specified at
718 command line. */
719 static unsigned int ix86_user_incoming_stack_boundary;
721 /* Default alignment for incoming stack boundary in bits. */
722 static unsigned int ix86_default_incoming_stack_boundary;
724 /* Alignment for incoming stack boundary in bits. */
725 unsigned int ix86_incoming_stack_boundary;
727 /* Calling abi specific va_list type nodes. */
728 static GTY(()) tree sysv_va_list_type_node;
729 static GTY(()) tree ms_va_list_type_node;
731 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
732 char internal_label_prefix[16];
733 int internal_label_prefix_len;
735 /* Fence to use after loop using movnt. */
736 tree x86_mfence;
738 /* Register class used for passing given 64bit part of the argument.
739 These represent classes as documented by the PS ABI, with the exception
740 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
741 use SF or DFmode move instead of DImode to avoid reformatting penalties.
743 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
744 whenever possible (upper half does contain padding). */
745 enum x86_64_reg_class
747 X86_64_NO_CLASS,
748 X86_64_INTEGER_CLASS,
749 X86_64_INTEGERSI_CLASS,
750 X86_64_SSE_CLASS,
751 X86_64_SSESF_CLASS,
752 X86_64_SSEDF_CLASS,
753 X86_64_SSEUP_CLASS,
754 X86_64_X87_CLASS,
755 X86_64_X87UP_CLASS,
756 X86_64_COMPLEX_X87_CLASS,
757 X86_64_MEMORY_CLASS
760 #define MAX_CLASSES 8
762 /* Table of constants used by fldpi, fldln2, etc.... */
763 static REAL_VALUE_TYPE ext_80387_constants_table [5];
764 static bool ext_80387_constants_init;
767 static struct machine_function * ix86_init_machine_status (void);
768 static rtx ix86_function_value (const_tree, const_tree, bool);
769 static bool ix86_function_value_regno_p (const unsigned int);
770 static unsigned int ix86_function_arg_boundary (machine_mode,
771 const_tree);
772 static rtx ix86_static_chain (const_tree, bool);
773 static int ix86_function_regparm (const_tree, const_tree);
774 static void ix86_compute_frame_layout (void);
775 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
776 rtx, rtx, int);
777 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
778 static tree ix86_canonical_va_list_type (tree);
779 static void predict_jump (int);
780 static unsigned int split_stack_prologue_scratch_regno (void);
781 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
783 enum ix86_function_specific_strings
785 IX86_FUNCTION_SPECIFIC_ARCH,
786 IX86_FUNCTION_SPECIFIC_TUNE,
787 IX86_FUNCTION_SPECIFIC_MAX
790 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
791 const char *, const char *, enum fpmath_unit,
792 bool);
793 static void ix86_function_specific_save (struct cl_target_option *,
794 struct gcc_options *opts);
795 static void ix86_function_specific_restore (struct gcc_options *opts,
796 struct cl_target_option *);
797 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
798 static void ix86_function_specific_print (FILE *, int,
799 struct cl_target_option *);
800 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
801 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
802 struct gcc_options *,
803 struct gcc_options *,
804 struct gcc_options *);
805 static bool ix86_can_inline_p (tree, tree);
806 static void ix86_set_current_function (tree);
807 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
809 static enum calling_abi ix86_function_abi (const_tree);
812 #ifndef SUBTARGET32_DEFAULT_CPU
813 #define SUBTARGET32_DEFAULT_CPU "i386"
814 #endif
816 /* Whether -mtune= or -march= were specified */
817 static int ix86_tune_defaulted;
818 static int ix86_arch_specified;
820 /* Vectorization library interface and handlers. */
821 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
823 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
824 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
826 /* Processor target table, indexed by processor number */
827 struct ptt
829 const char *const name; /* processor name */
830 const struct processor_costs *cost; /* Processor costs */
831 const int align_loop; /* Default alignments. */
832 const int align_loop_max_skip;
833 const int align_jump;
834 const int align_jump_max_skip;
835 const int align_func;
838 /* This table must be in sync with enum processor_type in i386.h. */
839 static const struct ptt processor_target_table[PROCESSOR_max] =
841 {"generic", &generic_cost, 16, 10, 16, 10, 16},
842 {"i386", &i386_cost, 4, 3, 4, 3, 4},
843 {"i486", &i486_cost, 16, 15, 16, 15, 16},
844 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
845 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
846 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
847 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
848 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
849 {"core2", &core_cost, 16, 10, 16, 10, 16},
850 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
851 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
852 {"haswell", &core_cost, 16, 10, 16, 10, 16},
853 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
854 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
855 {"knl", &slm_cost, 16, 15, 16, 7, 16},
856 {"knm", &slm_cost, 16, 15, 16, 7, 16},
857 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
858 {"cannonlake", &core_cost, 16, 10, 16, 10, 16},
859 {"intel", &intel_cost, 16, 15, 16, 7, 16},
860 {"geode", &geode_cost, 0, 0, 0, 0, 0},
861 {"k6", &k6_cost, 32, 7, 32, 7, 32},
862 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
863 {"k8", &k8_cost, 16, 7, 16, 7, 16},
864 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
865 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
866 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
867 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
868 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
869 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
870 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
871 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
874 static unsigned int
875 rest_of_handle_insert_vzeroupper (void)
877 int i;
879 /* vzeroupper instructions are inserted immediately after reload to
880 account for possible spills from 256bit or 512bit registers. The pass
881 reuses mode switching infrastructure by re-running mode insertion
882 pass, so disable entities that have already been processed. */
883 for (i = 0; i < MAX_386_ENTITIES; i++)
884 ix86_optimize_mode_switching[i] = 0;
886 ix86_optimize_mode_switching[AVX_U128] = 1;
888 /* Call optimize_mode_switching. */
889 g->get_passes ()->execute_pass_mode_switching ();
890 return 0;
893 /* Return 1 if INSN uses or defines a hard register.
894 Hard register uses in a memory address are ignored.
895 Clobbers and flags definitions are ignored. */
897 static bool
898 has_non_address_hard_reg (rtx_insn *insn)
900 df_ref ref;
901 FOR_EACH_INSN_DEF (ref, insn)
902 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
903 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
904 && DF_REF_REGNO (ref) != FLAGS_REG)
905 return true;
907 FOR_EACH_INSN_USE (ref, insn)
908 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
909 return true;
911 return false;
914 /* Check if comparison INSN may be transformed
915 into vector comparison. Currently we transform
916 zero checks only which look like:
918 (set (reg:CCZ 17 flags)
919 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
920 (subreg:SI (reg:DI x) 0))
921 (const_int 0 [0]))) */
923 static bool
924 convertible_comparison_p (rtx_insn *insn)
926 if (!TARGET_SSE4_1)
927 return false;
929 rtx def_set = single_set (insn);
931 gcc_assert (def_set);
933 rtx src = SET_SRC (def_set);
934 rtx dst = SET_DEST (def_set);
936 gcc_assert (GET_CODE (src) == COMPARE);
938 if (GET_CODE (dst) != REG
939 || REGNO (dst) != FLAGS_REG
940 || GET_MODE (dst) != CCZmode)
941 return false;
943 rtx op1 = XEXP (src, 0);
944 rtx op2 = XEXP (src, 1);
946 if (op2 != CONST0_RTX (GET_MODE (op2)))
947 return false;
949 if (GET_CODE (op1) != IOR)
950 return false;
952 op2 = XEXP (op1, 1);
953 op1 = XEXP (op1, 0);
955 if (!SUBREG_P (op1)
956 || !SUBREG_P (op2)
957 || GET_MODE (op1) != SImode
958 || GET_MODE (op2) != SImode
959 || ((SUBREG_BYTE (op1) != 0
960 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
961 && (SUBREG_BYTE (op2) != 0
962 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
963 return false;
965 op1 = SUBREG_REG (op1);
966 op2 = SUBREG_REG (op2);
968 if (op1 != op2
969 || !REG_P (op1)
970 || GET_MODE (op1) != DImode)
971 return false;
973 return true;
976 /* The DImode version of scalar_to_vector_candidate_p. */
978 static bool
979 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
981 rtx def_set = single_set (insn);
983 if (!def_set)
984 return false;
986 if (has_non_address_hard_reg (insn))
987 return false;
989 rtx src = SET_SRC (def_set);
990 rtx dst = SET_DEST (def_set);
992 if (GET_CODE (src) == COMPARE)
993 return convertible_comparison_p (insn);
995 /* We are interested in DImode promotion only. */
996 if ((GET_MODE (src) != DImode
997 && !CONST_INT_P (src))
998 || GET_MODE (dst) != DImode)
999 return false;
1001 if (!REG_P (dst) && !MEM_P (dst))
1002 return false;
1004 switch (GET_CODE (src))
1006 case ASHIFTRT:
1007 if (!TARGET_AVX512VL)
1008 return false;
1009 /* FALLTHRU */
1011 case ASHIFT:
1012 case LSHIFTRT:
1013 if (!REG_P (XEXP (src, 1))
1014 && (!SUBREG_P (XEXP (src, 1))
1015 || SUBREG_BYTE (XEXP (src, 1)) != 0
1016 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1017 && (!CONST_INT_P (XEXP (src, 1))
1018 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1019 return false;
1021 if (GET_MODE (XEXP (src, 1)) != QImode
1022 && !CONST_INT_P (XEXP (src, 1)))
1023 return false;
1024 break;
1026 case PLUS:
1027 case MINUS:
1028 case IOR:
1029 case XOR:
1030 case AND:
1031 if (!REG_P (XEXP (src, 1))
1032 && !MEM_P (XEXP (src, 1))
1033 && !CONST_INT_P (XEXP (src, 1)))
1034 return false;
1036 if (GET_MODE (XEXP (src, 1)) != DImode
1037 && !CONST_INT_P (XEXP (src, 1)))
1038 return false;
1039 break;
1041 case NEG:
1042 case NOT:
1043 break;
1045 case REG:
1046 return true;
1048 case MEM:
1049 case CONST_INT:
1050 return REG_P (dst);
1052 default:
1053 return false;
1056 if (!REG_P (XEXP (src, 0))
1057 && !MEM_P (XEXP (src, 0))
1058 && !CONST_INT_P (XEXP (src, 0))
1059 /* Check for andnot case. */
1060 && (GET_CODE (src) != AND
1061 || GET_CODE (XEXP (src, 0)) != NOT
1062 || !REG_P (XEXP (XEXP (src, 0), 0))))
1063 return false;
1065 if (GET_MODE (XEXP (src, 0)) != DImode
1066 && !CONST_INT_P (XEXP (src, 0)))
1067 return false;
1069 return true;
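/* Illustrative note (not part of the upstream file): a typical candidate
   accepted above is a DImode ALU insn such as
     (set (reg:DI 90) (plus:DI (reg:DI 91) (mem:DI ...)))
   whereas insns touching hard registers outside addresses, or with
   non-DImode operands, are rejected.  */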
1072 /* The TImode version of scalar_to_vector_candidate_p. */
1074 static bool
1075 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1077 rtx def_set = single_set (insn);
1079 if (!def_set)
1080 return false;
1082 if (has_non_address_hard_reg (insn))
1083 return false;
1085 rtx src = SET_SRC (def_set);
1086 rtx dst = SET_DEST (def_set);
1088 /* Only TImode load and store are allowed. */
1089 if (GET_MODE (dst) != TImode)
1090 return false;
1092 if (MEM_P (dst))
1094 /* Check for a store.  Memory must be aligned, or an unaligned store
1095 must be optimal.  Only support stores from a register, a standard SSE
1096 constant, or a CONST_WIDE_INT generated from a piecewise store.
1098 ??? Verify performance impact before enabling CONST_INT for
1099 __int128 store. */
1100 if (misaligned_operand (dst, TImode)
1101 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1102 return false;
1104 switch (GET_CODE (src))
1106 default:
1107 return false;
1109 case REG:
1110 case CONST_WIDE_INT:
1111 return true;
1113 case CONST_INT:
1114 return standard_sse_constant_p (src, TImode);
1117 else if (MEM_P (src))
1119 /* Check for a load.  Memory must be aligned, or an unaligned load must
1120 be optimal. */
1121 return (REG_P (dst)
1122 && (!misaligned_operand (src, TImode)
1123 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1126 return false;
1129 /* Return 1 if INSN may be converted into a vector
1130 instruction. */
1132 static bool
1133 scalar_to_vector_candidate_p (rtx_insn *insn)
1135 if (TARGET_64BIT)
1136 return timode_scalar_to_vector_candidate_p (insn);
1137 else
1138 return dimode_scalar_to_vector_candidate_p (insn);
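/* Illustrative note (not part of the upstream file): the dispatch above
   means the scalar-to-vector pass considers TImode load/store chains on
   64-bit targets and DImode chains on 32-bit targets.  */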
1141 /* The DImode version of remove_non_convertible_regs. */
1143 static void
1144 dimode_remove_non_convertible_regs (bitmap candidates)
1146 bitmap_iterator bi;
1147 unsigned id;
1148 bitmap regs = BITMAP_ALLOC (NULL);
1150 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1152 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1153 rtx reg = SET_DEST (def_set);
1155 if (!REG_P (reg)
1156 || bitmap_bit_p (regs, REGNO (reg))
1157 || HARD_REGISTER_P (reg))
1158 continue;
1160 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1161 def;
1162 def = DF_REF_NEXT_REG (def))
1164 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1166 if (dump_file)
1167 fprintf (dump_file,
1168 "r%d has non convertible definition in insn %d\n",
1169 REGNO (reg), DF_REF_INSN_UID (def));
1171 bitmap_set_bit (regs, REGNO (reg));
1172 break;
1177 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1179 for (df_ref def = DF_REG_DEF_CHAIN (id);
1180 def;
1181 def = DF_REF_NEXT_REG (def))
1182 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1184 if (dump_file)
1185 fprintf (dump_file, "Removing insn %d from candidates list\n",
1186 DF_REF_INSN_UID (def));
1188 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1192 BITMAP_FREE (regs);
1195 /* For a register REGNO, scan instructions for its defs and uses.
1196 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1198 static void
1199 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1200 unsigned int regno)
1202 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1203 def;
1204 def = DF_REF_NEXT_REG (def))
1206 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1208 if (dump_file)
1209 fprintf (dump_file,
1210 "r%d has non convertible def in insn %d\n",
1211 regno, DF_REF_INSN_UID (def));
1213 bitmap_set_bit (regs, regno);
1214 break;
1218 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1219 ref;
1220 ref = DF_REF_NEXT_REG (ref))
1222 /* Debug instructions are skipped. */
1223 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1224 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1226 if (dump_file)
1227 fprintf (dump_file,
1228 "r%d has non convertible use in insn %d\n",
1229 regno, DF_REF_INSN_UID (ref));
1231 bitmap_set_bit (regs, regno);
1232 break;
1237 /* The TImode version of remove_non_convertible_regs. */
1239 static void
1240 timode_remove_non_convertible_regs (bitmap candidates)
1242 bitmap_iterator bi;
1243 unsigned id;
1244 bitmap regs = BITMAP_ALLOC (NULL);
1246 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1248 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1249 rtx dest = SET_DEST (def_set);
1250 rtx src = SET_SRC (def_set);
1252 if ((!REG_P (dest)
1253 || bitmap_bit_p (regs, REGNO (dest))
1254 || HARD_REGISTER_P (dest))
1255 && (!REG_P (src)
1256 || bitmap_bit_p (regs, REGNO (src))
1257 || HARD_REGISTER_P (src)))
1258 continue;
1260 if (REG_P (dest))
1261 timode_check_non_convertible_regs (candidates, regs,
1262 REGNO (dest));
1264 if (REG_P (src))
1265 timode_check_non_convertible_regs (candidates, regs,
1266 REGNO (src));
1269 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1271 for (df_ref def = DF_REG_DEF_CHAIN (id);
1272 def;
1273 def = DF_REF_NEXT_REG (def))
1274 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1276 if (dump_file)
1277 fprintf (dump_file, "Removing insn %d from candidates list\n",
1278 DF_REF_INSN_UID (def));
1280 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1283 for (df_ref ref = DF_REG_USE_CHAIN (id);
1284 ref;
1285 ref = DF_REF_NEXT_REG (ref))
1286 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1288 if (dump_file)
1289 fprintf (dump_file, "Removing insn %d from candidates list\n",
1290 DF_REF_INSN_UID (ref));
1292 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1296 BITMAP_FREE (regs);
1299 /* For a given bitmap of insn UIDs, scan all instructions and
1300 remove an insn from CANDIDATES in case it has both convertible
1301 and non-convertible definitions.
1303 All insns in a bitmap are conversion candidates according to
1304 scalar_to_vector_candidate_p. Currently it implies all insns
1305 are single_set. */
1307 static void
1308 remove_non_convertible_regs (bitmap candidates)
1310 if (TARGET_64BIT)
1311 timode_remove_non_convertible_regs (candidates);
1312 else
1313 dimode_remove_non_convertible_regs (candidates);
1316 class scalar_chain
1318 public:
1319 scalar_chain ();
1320 virtual ~scalar_chain ();
1322 static unsigned max_id;
1324 /* ID of a chain. */
1325 unsigned int chain_id;
1326 /* A queue of instructions to be included into a chain. */
1327 bitmap queue;
1328 /* Instructions included into a chain. */
1329 bitmap insns;
1330 /* All registers defined by a chain. */
1331 bitmap defs;
1332 /* Registers used in both vector and scalar modes. */
1333 bitmap defs_conv;
1335 void build (bitmap candidates, unsigned insn_uid);
1336 virtual int compute_convert_gain () = 0;
1337 int convert ();
1339 protected:
1340 void add_to_queue (unsigned insn_uid);
1341 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1343 private:
1344 void add_insn (bitmap candidates, unsigned insn_uid);
1345 void analyze_register_chain (bitmap candidates, df_ref ref);
1346 virtual void mark_dual_mode_def (df_ref def) = 0;
1347 virtual void convert_insn (rtx_insn *insn) = 0;
1348 virtual void convert_registers () = 0;
1351 class dimode_scalar_chain : public scalar_chain
1353 public:
1354 int compute_convert_gain ();
1355 private:
1356 void mark_dual_mode_def (df_ref def);
1357 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1358 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1359 void convert_insn (rtx_insn *insn);
1360 void convert_op (rtx *op, rtx_insn *insn);
1361 void convert_reg (unsigned regno);
1362 void make_vector_copies (unsigned regno);
1363 void convert_registers ();
1364 int vector_const_cost (rtx exp);
1367 class timode_scalar_chain : public scalar_chain
1369 public:
1370 /* Converting from TImode to V1TImode is always faster. */
1371 int compute_convert_gain () { return 1; }
1373 private:
1374 void mark_dual_mode_def (df_ref def);
1375 void fix_debug_reg_uses (rtx reg);
1376 void convert_insn (rtx_insn *insn);
1377 /* We don't convert registers to a different size. */
1378 void convert_registers () {}
1381 unsigned scalar_chain::max_id = 0;
1383 /* Initialize new chain. */
1385 scalar_chain::scalar_chain ()
1387 chain_id = ++max_id;
1389 if (dump_file)
1390 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1392 bitmap_obstack_initialize (NULL);
1393 insns = BITMAP_ALLOC (NULL);
1394 defs = BITMAP_ALLOC (NULL);
1395 defs_conv = BITMAP_ALLOC (NULL);
1396 queue = NULL;
1399 /* Free chain's data. */
1401 scalar_chain::~scalar_chain ()
1403 BITMAP_FREE (insns);
1404 BITMAP_FREE (defs);
1405 BITMAP_FREE (defs_conv);
1406 bitmap_obstack_release (NULL);
1409 /* Add instruction into chains' queue. */
1411 void
1412 scalar_chain::add_to_queue (unsigned insn_uid)
1414 if (bitmap_bit_p (insns, insn_uid)
1415 || bitmap_bit_p (queue, insn_uid))
1416 return;
1418 if (dump_file)
1419 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1420 insn_uid, chain_id);
1421 bitmap_set_bit (queue, insn_uid);
1424 /* For DImode conversion, mark register defined by DEF as requiring
1425 conversion. */
1427 void
1428 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1430 gcc_assert (DF_REF_REG_DEF_P (def));
1432 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1433 return;
1435 if (dump_file)
1436 fprintf (dump_file,
1437 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1438 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1440 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1443 /* For TImode conversion, it is unused. */
1445 void
1446 timode_scalar_chain::mark_dual_mode_def (df_ref)
1448 gcc_unreachable ();
1451 /* Check REF's chain to add new insns into a queue
1452 and find registers requiring conversion. */
1454 void
1455 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1457 df_link *chain;
1459 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1460 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1461 add_to_queue (DF_REF_INSN_UID (ref));
1463 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1465 unsigned uid = DF_REF_INSN_UID (chain->ref);
1467 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1468 continue;
1470 if (!DF_REF_REG_MEM_P (chain->ref))
1472 if (bitmap_bit_p (insns, uid))
1473 continue;
1475 if (bitmap_bit_p (candidates, uid))
1477 add_to_queue (uid);
1478 continue;
1482 if (DF_REF_REG_DEF_P (chain->ref))
1484 if (dump_file)
1485 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1486 DF_REF_REGNO (chain->ref), uid);
1487 mark_dual_mode_def (chain->ref);
1489 else
1491 if (dump_file)
1492 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1493 DF_REF_REGNO (chain->ref), uid);
1494 mark_dual_mode_def (ref);
1499 /* Add instruction into a chain. */
1501 void
1502 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1504 if (bitmap_bit_p (insns, insn_uid))
1505 return;
1507 if (dump_file)
1508 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1510 bitmap_set_bit (insns, insn_uid);
1512 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1513 rtx def_set = single_set (insn);
1514 if (def_set && REG_P (SET_DEST (def_set))
1515 && !HARD_REGISTER_P (SET_DEST (def_set)))
1516 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1518 df_ref ref;
1519 df_ref def;
1520 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1521 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1522 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1523 def;
1524 def = DF_REF_NEXT_REG (def))
1525 analyze_register_chain (candidates, def);
1526 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1527 if (!DF_REF_REG_MEM_P (ref))
1528 analyze_register_chain (candidates, ref);
1531 /* Build new chain starting from insn INSN_UID recursively
1532 adding all dependent uses and definitions. */
1534 void
1535 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1537 queue = BITMAP_ALLOC (NULL);
1538 bitmap_set_bit (queue, insn_uid);
1540 if (dump_file)
1541 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1543 while (!bitmap_empty_p (queue))
1545 insn_uid = bitmap_first_set_bit (queue);
1546 bitmap_clear_bit (queue, insn_uid);
1547 bitmap_clear_bit (candidates, insn_uid);
1548 add_insn (candidates, insn_uid);
1551 if (dump_file)
1553 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1554 fprintf (dump_file, " insns: ");
1555 dump_bitmap (dump_file, insns);
1556 if (!bitmap_empty_p (defs_conv))
1558 bitmap_iterator bi;
1559 unsigned id;
1560 const char *comma = "";
1561 fprintf (dump_file, " defs to convert: ");
1562 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1564 fprintf (dump_file, "%sr%d", comma, id);
1565 comma = ", ";
1567 fprintf (dump_file, "\n");
1571 BITMAP_FREE (queue);
1574 /* Return the cost of building a vector constant
1575 instead of using a scalar one. */
1578 dimode_scalar_chain::vector_const_cost (rtx exp)
1580 gcc_assert (CONST_INT_P (exp));
1582 if (standard_sse_constant_p (exp, V2DImode))
1583 return COSTS_N_INSNS (1);
1584 return ix86_cost->sse_load[1];
1587 /* Compute a gain for chain conversion. */
1590 dimode_scalar_chain::compute_convert_gain ()
1592 bitmap_iterator bi;
1593 unsigned insn_uid;
1594 int gain = 0;
1595 int cost = 0;
1597 if (dump_file)
1598 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1600 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1602 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1603 rtx def_set = single_set (insn);
1604 rtx src = SET_SRC (def_set);
1605 rtx dst = SET_DEST (def_set);
1607 if (REG_P (src) && REG_P (dst))
1608 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1609 else if (REG_P (src) && MEM_P (dst))
1610 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1611 else if (MEM_P (src) && REG_P (dst))
1612 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1613 else if (GET_CODE (src) == ASHIFT
1614 || GET_CODE (src) == ASHIFTRT
1615 || GET_CODE (src) == LSHIFTRT)
1617 if (CONST_INT_P (XEXP (src, 0)))
1618 gain -= vector_const_cost (XEXP (src, 0));
1619 if (CONST_INT_P (XEXP (src, 1)))
1621 gain += ix86_cost->shift_const;
1622 if (INTVAL (XEXP (src, 1)) >= 32)
1623 gain -= COSTS_N_INSNS (1);
1625 else
1626 /* Additional gain for omitting two CMOVs. */
1627 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1629 else if (GET_CODE (src) == PLUS
1630 || GET_CODE (src) == MINUS
1631 || GET_CODE (src) == IOR
1632 || GET_CODE (src) == XOR
1633 || GET_CODE (src) == AND)
1635 gain += ix86_cost->add;
1636 /* Additional gain for andnot for targets without BMI. */
1637 if (GET_CODE (XEXP (src, 0)) == NOT
1638 && !TARGET_BMI)
1639 gain += 2 * ix86_cost->add;
1641 if (CONST_INT_P (XEXP (src, 0)))
1642 gain -= vector_const_cost (XEXP (src, 0));
1643 if (CONST_INT_P (XEXP (src, 1)))
1644 gain -= vector_const_cost (XEXP (src, 1));
1646 else if (GET_CODE (src) == NEG
1647 || GET_CODE (src) == NOT)
1648 gain += ix86_cost->add - COSTS_N_INSNS (1);
1649 else if (GET_CODE (src) == COMPARE)
1651 /* Assume comparison cost is the same. */
1653 else if (CONST_INT_P (src))
1655 if (REG_P (dst))
1656 gain += COSTS_N_INSNS (2);
1657 else if (MEM_P (dst))
1658 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1659 gain -= vector_const_cost (src);
1661 else
1662 gcc_unreachable ();
1665 if (dump_file)
1666 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1668 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1669 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1671 if (dump_file)
1672 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1674 gain -= cost;
1676 if (dump_file)
1677 fprintf (dump_file, " Total gain: %d\n", gain);
1679 return gain;
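/* Illustrative example (not part of the upstream file): for a chain that is
   a single DImode register-to-register move, the gain computed above is
   COSTS_N_INSNS (2) - ix86_cost->xmm_move, i.e. roughly the two scalar
   moves saved minus one xmm move, further reduced by mmxsse_to_integer
   costs for every register that must stay available in both modes.  */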
1682 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1685 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1687 if (x == reg)
1688 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1690 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1691 int i, j;
1692 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1694 if (fmt[i] == 'e')
1695 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1696 else if (fmt[i] == 'E')
1697 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1698 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1699 reg, new_reg);
1702 return x;
1705 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1707 void
1708 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1709 rtx reg, rtx new_reg)
1711 replace_with_subreg (single_set (insn), reg, new_reg);
1714 /* Insert generated conversion instruction sequence INSNS
1715 after instruction AFTER.  A new BB may be required in case the
1716 instruction has an EH region attached. */
1718 void
1719 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1721 if (!control_flow_insn_p (after))
1723 emit_insn_after (insns, after);
1724 return;
1727 basic_block bb = BLOCK_FOR_INSN (after);
1728 edge e = find_fallthru_edge (bb->succs);
1729 gcc_assert (e);
1731 basic_block new_bb = split_edge (e);
1732 emit_insn_after (insns, BB_HEAD (new_bb));
1735 /* Make vector copies for all register REGNO definitions
1736 and replace its uses in a chain. */
1738 void
1739 dimode_scalar_chain::make_vector_copies (unsigned regno)
1741 rtx reg = regno_reg_rtx[regno];
1742 rtx vreg = gen_reg_rtx (DImode);
1743 bool count_reg = false;
1744 df_ref ref;
1746 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1747 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1749 df_ref use;
1751 /* Detect the count register of a shift instruction. */
1752 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1753 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1755 rtx_insn *insn = DF_REF_INSN (use);
1756 rtx def_set = single_set (insn);
1758 gcc_assert (def_set);
1760 rtx src = SET_SRC (def_set);
1762 if ((GET_CODE (src) == ASHIFT
1763 || GET_CODE (src) == ASHIFTRT
1764 || GET_CODE (src) == LSHIFTRT)
1765 && !CONST_INT_P (XEXP (src, 1))
1766 && reg_or_subregno (XEXP (src, 1)) == regno)
1767 count_reg = true;
1770 start_sequence ();
1771 if (count_reg)
1773 rtx qreg = gen_lowpart (QImode, reg);
1774 rtx tmp = gen_reg_rtx (SImode);
1776 if (TARGET_ZERO_EXTEND_WITH_AND
1777 && optimize_function_for_speed_p (cfun))
1779 emit_move_insn (tmp, const0_rtx);
1780 emit_insn (gen_movstrictqi
1781 (gen_lowpart (QImode, tmp), qreg));
1783 else
1784 emit_insn (gen_rtx_SET
1785 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1787 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1789 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1790 emit_move_insn (slot, tmp);
1791 tmp = copy_rtx (slot);
1794 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1796 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1798 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1799 emit_move_insn (adjust_address (tmp, SImode, 0),
1800 gen_rtx_SUBREG (SImode, reg, 0));
1801 emit_move_insn (adjust_address (tmp, SImode, 4),
1802 gen_rtx_SUBREG (SImode, reg, 4));
1803 emit_move_insn (vreg, tmp);
1805 else if (TARGET_SSE4_1)
1807 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1808 CONST0_RTX (V4SImode),
1809 gen_rtx_SUBREG (SImode, reg, 0)));
1810 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1811 gen_rtx_SUBREG (V4SImode, vreg, 0),
1812 gen_rtx_SUBREG (SImode, reg, 4),
1813 GEN_INT (2)));
1815 else
1817 rtx tmp = gen_reg_rtx (DImode);
1818 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1819 CONST0_RTX (V4SImode),
1820 gen_rtx_SUBREG (SImode, reg, 0)));
1821 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1822 CONST0_RTX (V4SImode),
1823 gen_rtx_SUBREG (SImode, reg, 4)));
1824 emit_insn (gen_vec_interleave_lowv4si
1825 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1826 gen_rtx_SUBREG (V4SImode, vreg, 0),
1827 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1829 rtx_insn *seq = get_insns ();
1830 end_sequence ();
1831 rtx_insn *insn = DF_REF_INSN (ref);
1832 emit_conversion_insns (seq, insn);
1834 if (dump_file)
1835 fprintf (dump_file,
1836 " Copied r%d to a vector register r%d for insn %d\n",
1837 regno, REGNO (vreg), INSN_UID (insn));
1840 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1841 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1843 rtx_insn *insn = DF_REF_INSN (ref);
1844 if (count_reg)
1846 rtx def_set = single_set (insn);
1847 gcc_assert (def_set);
1849 rtx src = SET_SRC (def_set);
1851 if ((GET_CODE (src) == ASHIFT
1852 || GET_CODE (src) == ASHIFTRT
1853 || GET_CODE (src) == LSHIFTRT)
1854 && !CONST_INT_P (XEXP (src, 1))
1855 && reg_or_subregno (XEXP (src, 1)) == regno)
1856 XEXP (src, 1) = vreg;
1858 else
1859 replace_with_subreg_in_insn (insn, reg, vreg);
1861 if (dump_file)
1862 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1863 regno, REGNO (vreg), INSN_UID (insn));
1867 /* Convert all definitions of register REGNO
1868 and fix its uses. Scalar copies may be created
1869 in case the register is used in a non-convertible insn. */
1871 void
1872 dimode_scalar_chain::convert_reg (unsigned regno)
1874 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1875 rtx reg = regno_reg_rtx[regno];
1876 rtx scopy = NULL_RTX;
1877 df_ref ref;
1878 bitmap conv;
1880 conv = BITMAP_ALLOC (NULL);
1881 bitmap_copy (conv, insns);
1883 if (scalar_copy)
1884 scopy = gen_reg_rtx (DImode);
1886 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1888 rtx_insn *insn = DF_REF_INSN (ref);
1889 rtx def_set = single_set (insn);
1890 rtx src = SET_SRC (def_set);
1891 rtx reg = DF_REF_REG (ref);
1893 if (!MEM_P (src))
1895 replace_with_subreg_in_insn (insn, reg, reg);
1896 bitmap_clear_bit (conv, INSN_UID (insn));
1899 if (scalar_copy)
1901 start_sequence ();
1902 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1904 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1905 emit_move_insn (tmp, reg);
1906 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1907 adjust_address (tmp, SImode, 0));
1908 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1909 adjust_address (tmp, SImode, 4));
1911 else if (TARGET_SSE4_1)
1913 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1914 emit_insn
1915 (gen_rtx_SET
1916 (gen_rtx_SUBREG (SImode, scopy, 0),
1917 gen_rtx_VEC_SELECT (SImode,
1918 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1920 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1921 emit_insn
1922 (gen_rtx_SET
1923 (gen_rtx_SUBREG (SImode, scopy, 4),
1924 gen_rtx_VEC_SELECT (SImode,
1925 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1927 else
1929 rtx vcopy = gen_reg_rtx (V2DImode);
1930 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1931 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1932 gen_rtx_SUBREG (SImode, vcopy, 0));
1933 emit_move_insn (vcopy,
1934 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1935 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1936 gen_rtx_SUBREG (SImode, vcopy, 0));
1938 rtx_insn *seq = get_insns ();
1939 end_sequence ();
1940 emit_conversion_insns (seq, insn);
1942 if (dump_file)
1943 fprintf (dump_file,
1944 " Copied r%d to a scalar register r%d for insn %d\n",
1945 regno, REGNO (scopy), INSN_UID (insn));
1949 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1950 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1952 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1954 rtx_insn *insn = DF_REF_INSN (ref);
1956 rtx def_set = single_set (insn);
1957 gcc_assert (def_set);
1959 rtx src = SET_SRC (def_set);
1960 rtx dst = SET_DEST (def_set);
1962 if ((GET_CODE (src) == ASHIFT
1963 || GET_CODE (src) == ASHIFTRT
1964 || GET_CODE (src) == LSHIFTRT)
1965 && !CONST_INT_P (XEXP (src, 1))
1966 && reg_or_subregno (XEXP (src, 1)) == regno)
1968 rtx tmp2 = gen_reg_rtx (V2DImode);
1970 start_sequence ();
1972 if (TARGET_SSE4_1)
1973 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1974 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1975 else
1977 rtx vec_cst
1978 = gen_rtx_CONST_VECTOR (V2DImode,
1979 gen_rtvec (2, GEN_INT (0xff),
1980 const0_rtx));
1981 vec_cst
1982 = validize_mem (force_const_mem (V2DImode, vec_cst));
1984 emit_insn (gen_rtx_SET
1985 (tmp2,
1986 gen_rtx_AND (V2DImode,
1987 gen_rtx_SUBREG (V2DImode, reg, 0),
1988 vec_cst)));
1990 rtx_insn *seq = get_insns ();
1991 end_sequence ();
1993 emit_insn_before (seq, insn);
1995 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
1997 else if (!MEM_P (dst) || !REG_P (src))
1998 replace_with_subreg_in_insn (insn, reg, reg);
2000 bitmap_clear_bit (conv, INSN_UID (insn));
2003 /* Skip debug insns and uninitialized uses. */
2004 else if (DF_REF_CHAIN (ref)
2005 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2007 gcc_assert (scopy);
2008 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2009 df_insn_rescan (DF_REF_INSN (ref));
2012 BITMAP_FREE (conv);
2015 /* Convert operand OP in INSN.  We should handle
2016 memory operands and uninitialized registers.
2017 All other register uses are converted during
2018 register conversion. */
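/* For illustration (the pseudo register numbers here are made up, not
   taken from a real dump): a memory operand of a chain insn such as
       (mem:DI (reg:SI 100))
   is first preloaded into a fresh DImode pseudo and the use is rewritten
   to
       (subreg:V2DI (reg:DI 101) 0)
   while a (const_int 5) operand becomes the V2DImode constant vector
   { 5, 0 } moved into a DImode pseudo accessed through the same kind of
   subreg.  */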
2020 void
2021 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2023 *op = copy_rtx_if_shared (*op);
2025 if (GET_CODE (*op) == NOT)
2027 convert_op (&XEXP (*op, 0), insn);
2028 PUT_MODE (*op, V2DImode);
2030 else if (MEM_P (*op))
2032 rtx tmp = gen_reg_rtx (DImode);
2034 emit_insn_before (gen_move_insn (tmp, *op), insn);
2035 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2037 if (dump_file)
2038 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2039 INSN_UID (insn), REGNO (tmp));
2041 else if (REG_P (*op))
2043 /* We may not have converted this register use in case
2044 the register has no definition.  Otherwise it
2045 should have been converted in convert_reg. */
2046 df_ref ref;
2047 FOR_EACH_INSN_USE (ref, insn)
2048 if (DF_REF_REGNO (ref) == REGNO (*op))
2050 gcc_assert (!DF_REF_CHAIN (ref));
2051 break;
2053 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2055 else if (CONST_INT_P (*op))
2057 rtx vec_cst;
2058 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2060 /* Prefer all ones vector in case of -1. */
2061 if (constm1_operand (*op, GET_MODE (*op)))
2062 vec_cst = CONSTM1_RTX (V2DImode);
2063 else
2064 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2065 gen_rtvec (2, *op, const0_rtx));
2067 if (!standard_sse_constant_p (vec_cst, V2DImode))
2069 start_sequence ();
2070 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2071 rtx_insn *seq = get_insns ();
2072 end_sequence ();
2073 emit_insn_before (seq, insn);
2076 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2077 *op = tmp;
2079 else
2081 gcc_assert (SUBREG_P (*op));
2082 gcc_assert (GET_MODE (*op) == V2DImode);
2086 /* Convert INSN to vector mode. */
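/* Roughly, once the chain registers have been rewritten by convert_reg
   and convert_op, a DImode addition such as (register numbers are only
   illustrative)
       (set (reg:DI 90) (plus:DI (reg:DI 90) (reg:DI 91)))
   ends up as
       (set (subreg:V2DI (reg:DI 90) 0)
            (plus:V2DI (subreg:V2DI (reg:DI 90) 0)
                       (subreg:V2DI (reg:DI 91) 0)))
   i.e. the operation is simply re-moded to V2DImode and recognized again
   as a vector pattern.  */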
2088 void
2089 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2091 rtx def_set = single_set (insn);
2092 rtx src = SET_SRC (def_set);
2093 rtx dst = SET_DEST (def_set);
2094 rtx subreg;
2096 if (MEM_P (dst) && !REG_P (src))
2098 /* There are no scalar integer instructions and therefore
2099 temporary register usage is required. */
2100 rtx tmp = gen_reg_rtx (DImode);
2101 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2102 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2105 switch (GET_CODE (src))
2107 case ASHIFT:
2108 case ASHIFTRT:
2109 case LSHIFTRT:
2110 convert_op (&XEXP (src, 0), insn);
2111 PUT_MODE (src, V2DImode);
2112 break;
2114 case PLUS:
2115 case MINUS:
2116 case IOR:
2117 case XOR:
2118 case AND:
2119 convert_op (&XEXP (src, 0), insn);
2120 convert_op (&XEXP (src, 1), insn);
2121 PUT_MODE (src, V2DImode);
2122 break;
2124 case NEG:
2125 src = XEXP (src, 0);
2126 convert_op (&src, insn);
2127 subreg = gen_reg_rtx (V2DImode);
2128 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2129 src = gen_rtx_MINUS (V2DImode, subreg, src);
2130 break;
2132 case NOT:
2133 src = XEXP (src, 0);
2134 convert_op (&src, insn);
2135 subreg = gen_reg_rtx (V2DImode);
2136 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2137 src = gen_rtx_XOR (V2DImode, src, subreg);
2138 break;
2140 case MEM:
2141 if (!REG_P (dst))
2142 convert_op (&src, insn);
2143 break;
2145 case REG:
2146 if (!MEM_P (dst))
2147 convert_op (&src, insn);
2148 break;
2150 case SUBREG:
2151 gcc_assert (GET_MODE (src) == V2DImode);
2152 break;
2154 case COMPARE:
2155 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2157 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2158 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2160 if (REG_P (src))
2161 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2162 else
2163 subreg = copy_rtx_if_shared (src);
2164 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2165 copy_rtx_if_shared (subreg),
2166 copy_rtx_if_shared (subreg)),
2167 insn);
2168 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2169 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2170 copy_rtx_if_shared (src)),
2171 UNSPEC_PTEST);
2172 break;
2174 case CONST_INT:
2175 convert_op (&src, insn);
2176 break;
2178 default:
2179 gcc_unreachable ();
2182 SET_SRC (def_set) = src;
2183 SET_DEST (def_set) = dst;
2185 /* Drop possible dead definitions. */
2186 PATTERN (insn) = def_set;
2188 INSN_CODE (insn) = -1;
2189 recog_memoized (insn);
2190 df_insn_rescan (insn);
2193 /* Fix uses of converted REG in debug insns. */
2195 void
2196 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2198 if (!flag_var_tracking)
2199 return;
2201 df_ref ref, next;
2202 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2204 rtx_insn *insn = DF_REF_INSN (ref);
2205 /* Make sure the next ref is for a different instruction,
2206 so that we're not affected by the rescan. */
2207 next = DF_REF_NEXT_REG (ref);
2208 while (next && DF_REF_INSN (next) == insn)
2209 next = DF_REF_NEXT_REG (next);
2211 if (DEBUG_INSN_P (insn))
2213 /* It may be a debug insn with a TImode variable in
2214 register. */
2215 bool changed = false;
2216 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2218 rtx *loc = DF_REF_LOC (ref);
2219 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2221 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2222 changed = true;
2225 if (changed)
2226 df_insn_rescan (insn);
2231 /* Convert INSN from TImode to V1TImode. */
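/* The point of the V1TImode form is that a TImode move, which would
   otherwise typically be split into two 64-bit general-register moves,
   can then be matched by the single 128-bit SSE move patterns.  */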
2233 void
2234 timode_scalar_chain::convert_insn (rtx_insn *insn)
2236 rtx def_set = single_set (insn);
2237 rtx src = SET_SRC (def_set);
2238 rtx dst = SET_DEST (def_set);
2240 switch (GET_CODE (dst))
2242 case REG:
2244 rtx tmp = find_reg_equal_equiv_note (insn);
2245 if (tmp)
2246 PUT_MODE (XEXP (tmp, 0), V1TImode);
2247 PUT_MODE (dst, V1TImode);
2248 fix_debug_reg_uses (dst);
2250 break;
2251 case MEM:
2252 PUT_MODE (dst, V1TImode);
2253 break;
2255 default:
2256 gcc_unreachable ();
2259 switch (GET_CODE (src))
2261 case REG:
2262 PUT_MODE (src, V1TImode);
2263 /* Call fix_debug_reg_uses only if SRC is never defined. */
2264 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2265 fix_debug_reg_uses (src);
2266 break;
2268 case MEM:
2269 PUT_MODE (src, V1TImode);
2270 break;
2272 case CONST_WIDE_INT:
2273 if (NONDEBUG_INSN_P (insn))
2275 /* Since there are no instructions to store a 128-bit constant,
2276 temporary register usage is required. */
2277 rtx tmp = gen_reg_rtx (V1TImode);
2278 start_sequence ();
2279 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2280 src = validize_mem (force_const_mem (V1TImode, src));
2281 rtx_insn *seq = get_insns ();
2282 end_sequence ();
2283 if (seq)
2284 emit_insn_before (seq, insn);
2285 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2286 dst = tmp;
2288 break;
2290 case CONST_INT:
2291 switch (standard_sse_constant_p (src, TImode))
2293 case 1:
2294 src = CONST0_RTX (GET_MODE (dst));
2295 break;
2296 case 2:
2297 src = CONSTM1_RTX (GET_MODE (dst));
2298 break;
2299 default:
2300 gcc_unreachable ();
2302 if (NONDEBUG_INSN_P (insn))
2304 rtx tmp = gen_reg_rtx (V1TImode);
2305 /* Since there are no instructions to store a standard SSE
2306 constant, temporary register usage is required. */
2307 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2308 dst = tmp;
2310 break;
2312 default:
2313 gcc_unreachable ();
2316 SET_SRC (def_set) = src;
2317 SET_DEST (def_set) = dst;
2319 /* Drop possible dead definitions. */
2320 PATTERN (insn) = def_set;
2322 INSN_CODE (insn) = -1;
2323 recog_memoized (insn);
2324 df_insn_rescan (insn);
2327 void
2328 dimode_scalar_chain::convert_registers ()
2330 bitmap_iterator bi;
2331 unsigned id;
2333 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2334 convert_reg (id);
2336 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2337 make_vector_copies (id);
2340 /* Convert whole chain creating required register
2341 conversions and copies. */
2344 scalar_chain::convert ()
2346 bitmap_iterator bi;
2347 unsigned id;
2348 int converted_insns = 0;
2350 if (!dbg_cnt (stv_conversion))
2351 return 0;
2353 if (dump_file)
2354 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2356 convert_registers ();
2358 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2360 convert_insn (DF_INSN_UID_GET (id)->insn);
2361 converted_insns++;
2364 return converted_insns;
2367 /* Main STV pass function. Find and convert scalar
2368 instructions into vector mode when profitable. */
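/* To see what this pass does on a given translation unit, enable the RTL
   dump for the "stv" pass (through GCC's usual -fdump-rtl- options); the
   "insn %d is marked as a candidate", "Converting chain #%d" and
   "Total insns converted" messages below go to that dump file.  */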
2370 static unsigned int
2371 convert_scalars_to_vector ()
2373 basic_block bb;
2374 bitmap candidates;
2375 int converted_insns = 0;
2377 bitmap_obstack_initialize (NULL);
2378 candidates = BITMAP_ALLOC (NULL);
2380 calculate_dominance_info (CDI_DOMINATORS);
2381 df_set_flags (DF_DEFER_INSN_RESCAN);
2382 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2383 df_md_add_problem ();
2384 df_analyze ();
2386 /* Find all instructions we want to convert into vector mode. */
2387 if (dump_file)
2388 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2390 FOR_EACH_BB_FN (bb, cfun)
2392 rtx_insn *insn;
2393 FOR_BB_INSNS (bb, insn)
2394 if (scalar_to_vector_candidate_p (insn))
2396 if (dump_file)
2397 fprintf (dump_file, " insn %d is marked as a candidate\n",
2398 INSN_UID (insn));
2400 bitmap_set_bit (candidates, INSN_UID (insn));
2404 remove_non_convertible_regs (candidates);
2406 if (bitmap_empty_p (candidates))
2407 if (dump_file)
2408 fprintf (dump_file, "There are no candidates for optimization.\n");
2410 while (!bitmap_empty_p (candidates))
2412 unsigned uid = bitmap_first_set_bit (candidates);
2413 scalar_chain *chain;
2415 if (TARGET_64BIT)
2416 chain = new timode_scalar_chain;
2417 else
2418 chain = new dimode_scalar_chain;
2420 /* Find instructions chain we want to convert to vector mode.
2421 Check all uses and definitions to estimate all required
2422 conversions. */
2423 chain->build (candidates, uid);
2425 if (chain->compute_convert_gain () > 0)
2426 converted_insns += chain->convert ();
2427 else
2428 if (dump_file)
2429 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2430 chain->chain_id);
2432 delete chain;
2435 if (dump_file)
2436 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2438 BITMAP_FREE (candidates);
2439 bitmap_obstack_release (NULL);
2440 df_process_deferred_rescans ();
2442 /* Conversion means we may have 128-bit register spills/fills
2443 which require an aligned stack. */
2444 if (converted_insns)
2446 if (crtl->stack_alignment_needed < 128)
2447 crtl->stack_alignment_needed = 128;
2448 if (crtl->stack_alignment_estimated < 128)
2449 crtl->stack_alignment_estimated = 128;
2450 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2451 if (TARGET_64BIT)
2452 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2453 parm; parm = DECL_CHAIN (parm))
2455 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2456 continue;
2457 if (DECL_RTL_SET_P (parm)
2458 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2460 rtx r = DECL_RTL (parm);
2461 if (REG_P (r))
2462 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2464 if (DECL_INCOMING_RTL (parm)
2465 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2467 rtx r = DECL_INCOMING_RTL (parm);
2468 if (REG_P (r))
2469 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2474 return 0;
2477 namespace {
2479 const pass_data pass_data_insert_vzeroupper =
2481 RTL_PASS, /* type */
2482 "vzeroupper", /* name */
2483 OPTGROUP_NONE, /* optinfo_flags */
2484 TV_MACH_DEP, /* tv_id */
2485 0, /* properties_required */
2486 0, /* properties_provided */
2487 0, /* properties_destroyed */
2488 0, /* todo_flags_start */
2489 TODO_df_finish, /* todo_flags_finish */
2492 class pass_insert_vzeroupper : public rtl_opt_pass
2494 public:
2495 pass_insert_vzeroupper(gcc::context *ctxt)
2496 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2499 /* opt_pass methods: */
2500 virtual bool gate (function *)
2502 return TARGET_AVX
2503 && TARGET_VZEROUPPER && flag_expensive_optimizations
2504 && !optimize_size;
2507 virtual unsigned int execute (function *)
2509 return rest_of_handle_insert_vzeroupper ();
2512 }; // class pass_insert_vzeroupper
2514 const pass_data pass_data_stv =
2516 RTL_PASS, /* type */
2517 "stv", /* name */
2518 OPTGROUP_NONE, /* optinfo_flags */
2519 TV_MACH_DEP, /* tv_id */
2520 0, /* properties_required */
2521 0, /* properties_provided */
2522 0, /* properties_destroyed */
2523 0, /* todo_flags_start */
2524 TODO_df_finish, /* todo_flags_finish */
2527 class pass_stv : public rtl_opt_pass
2529 public:
2530 pass_stv (gcc::context *ctxt)
2531 : rtl_opt_pass (pass_data_stv, ctxt),
2532 timode_p (false)
2535 /* opt_pass methods: */
2536 virtual bool gate (function *)
2538 return (timode_p == !!TARGET_64BIT
2539 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2542 virtual unsigned int execute (function *)
2544 return convert_scalars_to_vector ();
2547 opt_pass *clone ()
2549 return new pass_stv (m_ctxt);
2552 void set_pass_param (unsigned int n, bool param)
2554 gcc_assert (n == 0);
2555 timode_p = param;
2558 private:
2559 bool timode_p;
2560 }; // class pass_stv
2562 } // anon namespace
2564 rtl_opt_pass *
2565 make_pass_insert_vzeroupper (gcc::context *ctxt)
2567 return new pass_insert_vzeroupper (ctxt);
2570 rtl_opt_pass *
2571 make_pass_stv (gcc::context *ctxt)
2573 return new pass_stv (ctxt);
2576 /* Inserting ENDBRANCH instructions. */
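/* Summary of the placement done below: ENDBR marker nops are emitted at
   the function entry (skipped when the function has the 'nocf_check'
   attribute or is known to be only called directly), after calls that may
   return more than once (REG_SETJMP, i.e. setjmp-like functions), at the
   targets of switch-table jumps when flag_cet_switch is set, and at
   preserved labels whose address may be taken.  */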
2578 static unsigned int
2579 rest_of_insert_endbranch (void)
2581 timevar_push (TV_MACH_DEP);
2583 rtx cet_eb;
2584 rtx_insn *insn;
2585 basic_block bb;
2587 /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is
2588 absent among the function attributes.  Later an optimization will be
2589 introduced to analyze whether the address of a static function is
2590 taken.  A static function whose address is never taken will get the
2591 nocf_check attribute.  This will allow the number of EBs to be reduced. */
2593 if (!lookup_attribute ("nocf_check",
2594 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2595 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2597 cet_eb = gen_nop_endbr ();
2599 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2600 insn = BB_HEAD (bb);
2601 emit_insn_before (cet_eb, insn);
2604 bb = 0;
2605 FOR_EACH_BB_FN (bb, cfun)
2607 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2608 insn = NEXT_INSN (insn))
2610 if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN)
2612 rtx_insn *next_insn = insn;
2614 while ((next_insn != BB_END (bb))
2615 && (DEBUG_INSN_P (NEXT_INSN (next_insn))
2616 || NOTE_P (NEXT_INSN (next_insn))
2617 || BARRIER_P (NEXT_INSN (next_insn))))
2618 next_insn = NEXT_INSN (next_insn);
2620 /* Generate ENDBRANCH after a CALL that can return more than
2621 once (setjmp-like functions). */
2622 if (find_reg_note (insn, REG_SETJMP, NULL) != NULL)
2624 cet_eb = gen_nop_endbr ();
2625 emit_insn_after (cet_eb, next_insn);
2627 continue;
2630 if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch)
2632 rtx target = JUMP_LABEL (insn);
2633 if (target == NULL_RTX || ANY_RETURN_P (target))
2634 continue;
2636 /* Check whether the jump goes through a switch table. */
2637 rtx_insn *label = as_a<rtx_insn *> (target);
2638 rtx_insn *table = next_insn (label);
2639 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2640 continue;
2642 /* For the indirect jump, find all the places it can jump to and insert
2643 an ENDBRANCH there.  This is done under a special flag that
2644 controls ENDBRANCH generation for switch stmts. */
2645 edge_iterator ei;
2646 edge e;
2647 basic_block dest_blk;
2649 FOR_EACH_EDGE (e, ei, bb->succs)
2651 rtx_insn *insn;
2653 dest_blk = e->dest;
2654 insn = BB_HEAD (dest_blk);
2655 gcc_assert (LABEL_P (insn));
2656 cet_eb = gen_nop_endbr ();
2657 emit_insn_after (cet_eb, insn);
2659 continue;
2662 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2663 || (NOTE_P (insn)
2664 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2665 /* TODO. Check /s bit also. */
2667 cet_eb = gen_nop_endbr ();
2668 emit_insn_after (cet_eb, insn);
2669 continue;
2674 timevar_pop (TV_MACH_DEP);
2675 return 0;
2678 namespace {
2680 const pass_data pass_data_insert_endbranch =
2682 RTL_PASS, /* type. */
2683 "cet", /* name. */
2684 OPTGROUP_NONE, /* optinfo_flags. */
2685 TV_MACH_DEP, /* tv_id. */
2686 0, /* properties_required. */
2687 0, /* properties_provided. */
2688 0, /* properties_destroyed. */
2689 0, /* todo_flags_start. */
2690 0, /* todo_flags_finish. */
2693 class pass_insert_endbranch : public rtl_opt_pass
2695 public:
2696 pass_insert_endbranch (gcc::context *ctxt)
2697 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2700 /* opt_pass methods: */
2701 virtual bool gate (function *)
2703 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2706 virtual unsigned int execute (function *)
2708 return rest_of_insert_endbranch ();
2711 }; // class pass_insert_endbranch
2713 } // anon namespace
2715 rtl_opt_pass *
2716 make_pass_insert_endbranch (gcc::context *ctxt)
2718 return new pass_insert_endbranch (ctxt);
2721 /* Return true if a red-zone is in use. */
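/* (The red zone is the 128-byte area below the stack pointer that the
   SysV x86-64 ABI guarantees will not be clobbered by signal or interrupt
   handlers; the Microsoft x64 ABI provides no such area, hence the
   check below.)  */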
2723 bool
2724 ix86_using_red_zone (void)
2726 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2729 /* Return a string that documents the current -m options. The caller is
2730 responsible for freeing the string. */
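/* The resulting string looks roughly like the following made-up example
   (the exact contents depend on the flags passed in):
       -march=skylake -mtune=generic -m64 -mavx2 ... -mfpmath=sse
   with backslash-newline separators inserted when ADD_NL_P is set and a
   line would grow too long.  */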
2732 static char *
2733 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2734 int flags, int flags2,
2735 const char *arch, const char *tune,
2736 enum fpmath_unit fpmath, bool add_nl_p)
2738 struct ix86_target_opts
2740 const char *option; /* option string */
2741 HOST_WIDE_INT mask; /* isa mask options */
2744 /* This table is ordered so that options like -msse4.2 that imply other
2745 ISAs come first. Target string will be displayed in the same order. */
2746 static struct ix86_target_opts isa2_opts[] =
2748 { "-mmpx", OPTION_MASK_ISA_MPX },
2749 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2750 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2751 { "-mvaes", OPTION_MASK_ISA_VAES },
2752 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2753 { "-msgx", OPTION_MASK_ISA_SGX },
2754 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2755 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2756 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2757 { "-mibt", OPTION_MASK_ISA_IBT },
2758 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2760 static struct ix86_target_opts isa_opts[] =
2762 { "-mgfni", OPTION_MASK_ISA_GFNI },
2763 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2764 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2765 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2766 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2767 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2768 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2769 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2770 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2771 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2772 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2773 { "-mfma", OPTION_MASK_ISA_FMA },
2774 { "-mxop", OPTION_MASK_ISA_XOP },
2775 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2776 { "-mf16c", OPTION_MASK_ISA_F16C },
2777 { "-mavx", OPTION_MASK_ISA_AVX },
2778 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2779 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2780 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2781 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2782 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2783 { "-msse3", OPTION_MASK_ISA_SSE3 },
2784 { "-maes", OPTION_MASK_ISA_AES },
2785 { "-msha", OPTION_MASK_ISA_SHA },
2786 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2787 { "-msse2", OPTION_MASK_ISA_SSE2 },
2788 { "-msse", OPTION_MASK_ISA_SSE },
2789 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2790 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2791 { "-mmmx", OPTION_MASK_ISA_MMX },
2792 { "-mrtm", OPTION_MASK_ISA_RTM },
2793 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2794 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2795 { "-madx", OPTION_MASK_ISA_ADX },
2796 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2797 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2798 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2799 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2800 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2801 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2802 { "-mabm", OPTION_MASK_ISA_ABM },
2803 { "-mbmi", OPTION_MASK_ISA_BMI },
2804 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2805 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2806 { "-mtbm", OPTION_MASK_ISA_TBM },
2807 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2808 { "-mcx16", OPTION_MASK_ISA_CX16 },
2809 { "-msahf", OPTION_MASK_ISA_SAHF },
2810 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2811 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2812 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2813 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2814 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
2815 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2816 { "-mpku", OPTION_MASK_ISA_PKU },
2817 { "-mlwp", OPTION_MASK_ISA_LWP },
2818 { "-mhle", OPTION_MASK_ISA_HLE },
2819 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2820 { "-mclwb", OPTION_MASK_ISA_CLWB }
2823 /* Flag options. */
2824 static struct ix86_target_opts flag_opts[] =
2826 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2827 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2828 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2829 { "-m80387", MASK_80387 },
2830 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2831 { "-malign-double", MASK_ALIGN_DOUBLE },
2832 { "-mcld", MASK_CLD },
2833 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2834 { "-mieee-fp", MASK_IEEE_FP },
2835 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2836 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2837 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2838 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2839 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2840 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2841 { "-mno-red-zone", MASK_NO_RED_ZONE },
2842 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2843 { "-mrecip", MASK_RECIP },
2844 { "-mrtd", MASK_RTD },
2845 { "-msseregparm", MASK_SSEREGPARM },
2846 { "-mstack-arg-probe", MASK_STACK_PROBE },
2847 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2848 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2849 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2850 { "-mvzeroupper", MASK_VZEROUPPER },
2851 { "-mstv", MASK_STV },
2852 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2853 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2854 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2857 /* Additional flag options. */
2858 static struct ix86_target_opts flag2_opts[] =
2860 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2863 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2864 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2866 char isa_other[40];
2867 char isa2_other[40];
2868 char flags_other[40];
2869 char flags2_other[40];
2870 unsigned num = 0;
2871 unsigned i, j;
2872 char *ret;
2873 char *ptr;
2874 size_t len;
2875 size_t line_len;
2876 size_t sep_len;
2877 const char *abi;
2879 memset (opts, '\0', sizeof (opts));
2881 /* Add -march= option. */
2882 if (arch)
2884 opts[num][0] = "-march=";
2885 opts[num++][1] = arch;
2888 /* Add -mtune= option. */
2889 if (tune)
2891 opts[num][0] = "-mtune=";
2892 opts[num++][1] = tune;
2895 /* Add -m32/-m64/-mx32. */
2896 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2898 if ((isa & OPTION_MASK_ABI_64) != 0)
2899 abi = "-m64";
2900 else
2901 abi = "-mx32";
2902 isa &= ~ (OPTION_MASK_ISA_64BIT
2903 | OPTION_MASK_ABI_64
2904 | OPTION_MASK_ABI_X32);
2906 else
2907 abi = "-m32";
2908 opts[num++][0] = abi;
2910 /* Pick out the options in isa2 options. */
2911 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2913 if ((isa2 & isa2_opts[i].mask) != 0)
2915 opts[num++][0] = isa2_opts[i].option;
2916 isa2 &= ~ isa2_opts[i].mask;
2920 if (isa2 && add_nl_p)
2922 opts[num++][0] = isa2_other;
2923 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2926 /* Pick out the options in isa options. */
2927 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2929 if ((isa & isa_opts[i].mask) != 0)
2931 opts[num++][0] = isa_opts[i].option;
2932 isa &= ~ isa_opts[i].mask;
2936 if (isa && add_nl_p)
2938 opts[num++][0] = isa_other;
2939 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2942 /* Add flag options. */
2943 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2945 if ((flags & flag_opts[i].mask) != 0)
2947 opts[num++][0] = flag_opts[i].option;
2948 flags &= ~ flag_opts[i].mask;
2952 if (flags && add_nl_p)
2954 opts[num++][0] = flags_other;
2955 sprintf (flags_other, "(other flags: %#x)", flags);
2958 /* Add additional flag options. */
2959 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2961 if ((flags2 & flag2_opts[i].mask) != 0)
2963 opts[num++][0] = flag2_opts[i].option;
2964 flags2 &= ~ flag2_opts[i].mask;
2968 if (flags2 && add_nl_p)
2970 opts[num++][0] = flags2_other;
2971 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2974 /* Add -fpmath= option. */
2975 if (fpmath)
2977 opts[num][0] = "-mfpmath=";
2978 switch ((int) fpmath)
2980 case FPMATH_387:
2981 opts[num++][1] = "387";
2982 break;
2984 case FPMATH_SSE:
2985 opts[num++][1] = "sse";
2986 break;
2988 case FPMATH_387 | FPMATH_SSE:
2989 opts[num++][1] = "sse+387";
2990 break;
2992 default:
2993 gcc_unreachable ();
2997 /* Any options? */
2998 if (num == 0)
2999 return NULL;
3001 gcc_assert (num < ARRAY_SIZE (opts));
3003 /* Size the string. */
3004 len = 0;
3005 sep_len = (add_nl_p) ? 3 : 1;
3006 for (i = 0; i < num; i++)
3008 len += sep_len;
3009 for (j = 0; j < 2; j++)
3010 if (opts[i][j])
3011 len += strlen (opts[i][j]);
3014 /* Build the string. */
3015 ret = ptr = (char *) xmalloc (len);
3016 line_len = 0;
3018 for (i = 0; i < num; i++)
3020 size_t len2[2];
3022 for (j = 0; j < 2; j++)
3023 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3025 if (i != 0)
3027 *ptr++ = ' ';
3028 line_len++;
3030 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3032 *ptr++ = '\\';
3033 *ptr++ = '\n';
3034 line_len = 0;
3038 for (j = 0; j < 2; j++)
3039 if (opts[i][j])
3041 memcpy (ptr, opts[i][j], len2[j]);
3042 ptr += len2[j];
3043 line_len += len2[j];
3047 *ptr = '\0';
3048 gcc_assert (ret + len >= ptr);
3050 return ret;
3053 /* Return true if profiling code should be emitted before the
3054 prologue, and false otherwise.
3055 Note: for x86 with "hotfix" this is sorried (not supported). */
3056 static bool
3057 ix86_profile_before_prologue (void)
3059 return flag_fentry != 0;
3062 /* Function that is callable from the debugger to print the current
3063 options. */
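/* For example, from gdb while debugging cc1:
       (gdb) call ix86_debug_options ()
   prints the current option string to stderr.  */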
3064 void ATTRIBUTE_UNUSED
3065 ix86_debug_options (void)
3067 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3068 target_flags, ix86_target_flags,
3069 ix86_arch_string,ix86_tune_string,
3070 ix86_fpmath, true);
3072 if (opts)
3074 fprintf (stderr, "%s\n\n", opts);
3075 free (opts);
3077 else
3078 fputs ("<no options>\n\n", stderr);
3080 return;
3083 /* Return true if T is one of the bytes we should avoid with
3084 -fmitigate-rop. */
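/* These byte values are the x86 return opcodes: 0xc3/0xc2 are
   "ret"/"ret imm16" and 0xcb/0xca are "retf"/"retf imm16", i.e. the bytes
   that ROP gadgets typically end with.  */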
3086 static bool
3087 ix86_rop_should_change_byte_p (int t)
3089 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3092 static const char *stringop_alg_names[] = {
3093 #define DEF_ENUM
3094 #define DEF_ALG(alg, name) #name,
3095 #include "stringop.def"
3096 #undef DEF_ENUM
3097 #undef DEF_ALG
3100 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3101 The string is of the following form (or a comma-separated list of such entries):
3103 strategy_alg:max_size:[align|noalign]
3105 where the full size range for the strategy is either [0, max_size] or
3106 [min_size, max_size], in which min_size is the max_size + 1 of the
3107 preceding range. The last size range must have max_size == -1.
3109 Examples:
3112 -mmemcpy-strategy=libcall:-1:noalign
3114 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3118 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3120 This is to tell the compiler to use the following strategy for memset
3121 1) when the expected size is between [1, 16], use rep_8byte strategy;
3122 2) when the size is between [17, 2048], use vector_loop;
3123 3) when the size is > 2048, use libcall. */
3125 struct stringop_size_range
3127 int max;
3128 stringop_alg alg;
3129 bool noalign;
3132 static void
3133 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3135 const struct stringop_algs *default_algs;
3136 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3137 char *curr_range_str, *next_range_str;
3138 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3139 int i = 0, n = 0;
3141 if (is_memset)
3142 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3143 else
3144 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3146 curr_range_str = strategy_str;
3150 int maxs;
3151 char alg_name[128];
3152 char align[16];
3153 next_range_str = strchr (curr_range_str, ',');
3154 if (next_range_str)
3155 *next_range_str++ = '\0';
3157 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
3158 alg_name, &maxs, align))
3160 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3161 return;
3164 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3166 error ("size ranges of option %qs should be increasing", opt);
3167 return;
3170 for (i = 0; i < last_alg; i++)
3171 if (!strcmp (alg_name, stringop_alg_names[i]))
3172 break;
3174 if (i == last_alg)
3176 error ("wrong strategy name %qs specified for option %qs",
3177 alg_name, opt);
3179 auto_vec <const char *> candidates;
3180 for (i = 0; i < last_alg; i++)
3181 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3182 candidates.safe_push (stringop_alg_names[i]);
3184 char *s;
3185 const char *hint
3186 = candidates_list_and_hint (alg_name, s, candidates);
3187 if (hint)
3188 inform (input_location,
3189 "valid arguments to %qs are: %s; did you mean %qs?",
3190 opt, s, hint);
3191 else
3192 inform (input_location, "valid arguments to %qs are: %s",
3193 opt, s);
3194 XDELETEVEC (s);
3195 return;
3198 if ((stringop_alg) i == rep_prefix_8_byte
3199 && !TARGET_64BIT)
3201 /* rep; movq isn't available in 32-bit code. */
3202 error ("strategy name %qs specified for option %qs "
3203 "not supported for 32-bit code", alg_name, opt);
3204 return;
3207 input_ranges[n].max = maxs;
3208 input_ranges[n].alg = (stringop_alg) i;
3209 if (!strcmp (align, "align"))
3210 input_ranges[n].noalign = false;
3211 else if (!strcmp (align, "noalign"))
3212 input_ranges[n].noalign = true;
3213 else
3215 error ("unknown alignment %qs specified for option %qs", align, opt);
3216 return;
3218 n++;
3219 curr_range_str = next_range_str;
3221 while (curr_range_str);
3223 if (input_ranges[n - 1].max != -1)
3225 error ("the max value for the last size range should be -1"
3226 " for option %qs", opt);
3227 return;
3230 if (n > MAX_STRINGOP_ALGS)
3232 error ("too many size ranges specified in option %qs", opt);
3233 return;
3236 /* Now override the default algs array. */
3237 for (i = 0; i < n; i++)
3239 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3240 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3241 = input_ranges[i].alg;
3242 *const_cast<int *>(&default_algs->size[i].noalign)
3243 = input_ranges[i].noalign;
3248 /* Parse the -mtune-ctrl= option.  When DUMP is true,
3249 print the features that are explicitly set. */
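/* The accepted syntax is a comma-separated list of tuning feature names
   (as found in ix86_tune_feature_names), each optionally prefixed with
   '^' to clear the feature instead of setting it, e.g. (feature names
   here are purely illustrative):
       -mtune-ctrl=use_leave,^some_feature  */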
3251 static void
3252 parse_mtune_ctrl_str (bool dump)
3254 if (!ix86_tune_ctrl_string)
3255 return;
3257 char *next_feature_string = NULL;
3258 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3259 char *orig = curr_feature_string;
3260 int i;
3263 bool clear = false;
3265 next_feature_string = strchr (curr_feature_string, ',');
3266 if (next_feature_string)
3267 *next_feature_string++ = '\0';
3268 if (*curr_feature_string == '^')
3270 curr_feature_string++;
3271 clear = true;
3273 for (i = 0; i < X86_TUNE_LAST; i++)
3275 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3277 ix86_tune_features[i] = !clear;
3278 if (dump)
3279 fprintf (stderr, "Explicitly %s feature %s\n",
3280 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3281 break;
3284 if (i == X86_TUNE_LAST)
3285 error ("unknown parameter to option -mtune-ctrl: %s",
3286 clear ? curr_feature_string - 1 : curr_feature_string);
3287 curr_feature_string = next_feature_string;
3289 while (curr_feature_string);
3290 free (orig);
3293 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3294 processor type. */
3296 static void
3297 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3299 unsigned int ix86_tune_mask = 1u << ix86_tune;
3300 int i;
3302 for (i = 0; i < X86_TUNE_LAST; ++i)
3304 if (ix86_tune_no_default)
3305 ix86_tune_features[i] = 0;
3306 else
3307 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3310 if (dump)
3312 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3313 for (i = 0; i < X86_TUNE_LAST; i++)
3314 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3315 ix86_tune_features[i] ? "on" : "off");
3318 parse_mtune_ctrl_str (dump);
3322 /* Default align_* from the processor table. */
3324 static void
3325 ix86_default_align (struct gcc_options *opts)
3327 if (opts->x_align_loops == 0)
3329 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3330 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3332 if (opts->x_align_jumps == 0)
3334 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3335 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3337 if (opts->x_align_functions == 0)
3339 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3343 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3345 static void
3346 ix86_override_options_after_change (void)
3348 ix86_default_align (&global_options);
3351 /* Override various settings based on options.  If MAIN_ARGS_P, the
3352 options are from the command line, otherwise they are from
3353 attributes.  Return false if there is an error related to the march
3354 option. */
3356 static bool
3357 ix86_option_override_internal (bool main_args_p,
3358 struct gcc_options *opts,
3359 struct gcc_options *opts_set)
3361 int i;
3362 unsigned int ix86_arch_mask;
3363 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3365 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3366 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3367 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3368 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3369 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3370 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3371 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3372 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3373 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3374 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3375 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3376 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3377 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3378 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3379 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3380 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3381 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3382 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3383 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3384 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3385 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3386 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3387 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3388 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3389 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3390 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3391 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3392 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3393 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3394 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3395 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3396 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3397 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3398 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3399 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3400 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3401 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3402 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3403 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3404 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3405 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3406 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3407 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3408 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3409 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3410 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
3411 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
3412 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
3413 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
3414 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
3415 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
3416 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
3417 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
3418 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
3419 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
3420 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
3421 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
3422 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
3423 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
3424 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
3425 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
3426 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
3427 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
3428 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
3430 #define PTA_CORE2 \
3431 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
3432 | PTA_CX16 | PTA_FXSR)
3433 #define PTA_NEHALEM \
3434 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
3435 #define PTA_WESTMERE \
3436 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
3437 #define PTA_SANDYBRIDGE \
3438 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
3439 #define PTA_IVYBRIDGE \
3440 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
3441 #define PTA_HASWELL \
3442 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
3443 | PTA_FMA | PTA_MOVBE | PTA_HLE)
3444 #define PTA_BROADWELL \
3445 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
3446 #define PTA_SKYLAKE \
3447 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
3448 #define PTA_SKYLAKE_AVX512 \
3449 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
3450 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU | PTA_CLWB)
3451 #define PTA_CANNONLAKE \
3452 (PTA_SKYLAKE_AVX512 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA)
3453 #define PTA_KNL \
3454 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
3455 #define PTA_BONNELL \
3456 (PTA_CORE2 | PTA_MOVBE)
3457 #define PTA_SILVERMONT \
3458 (PTA_WESTMERE | PTA_MOVBE)
3459 #define PTA_KNM \
3460 (PTA_KNL | PTA_AVX5124VNNIW | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ)
3462 /* If this reaches 64, we need to widen the struct pta flags below. */
3464 static struct pta
3466 const char *const name; /* processor name or nickname. */
3467 const enum processor_type processor;
3468 const enum attr_cpu schedule;
3469 const unsigned HOST_WIDE_INT flags;
3471 const processor_alias_table[] =
3473 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3474 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3475 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3476 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3477 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3478 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3479 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3480 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3481 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3482 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3483 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3484 PTA_MMX | PTA_SSE | PTA_FXSR},
3485 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3486 PTA_MMX | PTA_SSE | PTA_FXSR},
3487 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3488 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3489 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3490 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3491 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3492 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3493 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3494 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3495 PTA_MMX | PTA_SSE | PTA_FXSR},
3496 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3497 PTA_MMX | PTA_SSE | PTA_FXSR},
3498 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3499 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3500 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3501 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3502 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3503 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3504 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3505 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3506 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3507 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3508 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3509 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3510 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3511 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3512 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3513 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3514 PTA_SANDYBRIDGE},
3515 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3516 PTA_SANDYBRIDGE},
3517 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3518 PTA_IVYBRIDGE},
3519 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3520 PTA_IVYBRIDGE},
3521 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3522 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3523 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3524 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3525 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3526 PTA_SKYLAKE_AVX512},
3527 {"cannonlake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_CANNONLAKE},
3528 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3529 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3530 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3531 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3532 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3533 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3534 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3535 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3536 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3537 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3538 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3539 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3540 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3541 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3542 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3543 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3544 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3545 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3546 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3547 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3548 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3549 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3550 {"x86-64", PROCESSOR_K8, CPU_K8,
3551 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3552 {"eden-x2", PROCESSOR_K8, CPU_K8,
3553 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3554 {"nano", PROCESSOR_K8, CPU_K8,
3555 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3556 | PTA_SSSE3 | PTA_FXSR},
3557 {"nano-1000", PROCESSOR_K8, CPU_K8,
3558 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3559 | PTA_SSSE3 | PTA_FXSR},
3560 {"nano-2000", PROCESSOR_K8, CPU_K8,
3561 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3562 | PTA_SSSE3 | PTA_FXSR},
3563 {"nano-3000", PROCESSOR_K8, CPU_K8,
3564 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3565 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3566 {"nano-x2", PROCESSOR_K8, CPU_K8,
3567 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3568 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3569 {"eden-x4", PROCESSOR_K8, CPU_K8,
3570 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3571 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3572 {"nano-x4", PROCESSOR_K8, CPU_K8,
3573 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3574 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3575 {"k8", PROCESSOR_K8, CPU_K8,
3576 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3577 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3578 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3579 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3580 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3581 {"opteron", PROCESSOR_K8, CPU_K8,
3582 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3583 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3584 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3585 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3586 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3587 {"athlon64", PROCESSOR_K8, CPU_K8,
3588 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3589 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3590 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3591 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3592 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3593 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3594 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3595 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3596 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3597 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3598 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3599 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3600 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3601 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3602 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3603 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3604 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3605 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3606 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3607 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3608 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3609 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3610 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3611 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3612 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3613 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3614 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3615 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3616 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3617 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3618 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3619 | PTA_XSAVEOPT | PTA_FSGSBASE},
3620 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3621 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3622 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3623 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3624 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3625 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3626 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3627 | PTA_MOVBE | PTA_MWAITX},
3628 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3629 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3630 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3631 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3632 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3633 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3634 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3635 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3636 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3637 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3638 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3639 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3640 | PTA_FXSR | PTA_XSAVE},
3641 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3642 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3643 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3644 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3645 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3646 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3648 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3649 PTA_64BIT
3650 | PTA_HLE /* flags are only used for -march switch. */ },
3653 /* -mrecip options. */
3654 static struct
3656 const char *string; /* option name */
3657 unsigned int mask; /* mask bits to set */
3659 const recip_options[] =
3661 { "all", RECIP_MASK_ALL },
3662 { "none", RECIP_MASK_NONE },
3663 { "div", RECIP_MASK_DIV },
3664 { "sqrt", RECIP_MASK_SQRT },
3665 { "vec-div", RECIP_MASK_VEC_DIV },
3666 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3669 int const pta_size = ARRAY_SIZE (processor_alias_table);
3671 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3672 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3673 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3674 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3675 #ifdef TARGET_BI_ARCH
3676 else
3678 #if TARGET_BI_ARCH == 1
3679 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3680 is on and OPTION_MASK_ABI_X32 is off. We turn off
3681 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3682 -mx32. */
3683 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3684 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3685 #else
3686 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3687 on and OPTION_MASK_ABI_64 is off. We turn off
3688 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3689 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3690 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3691 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3692 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3693 #endif
3694 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3695 && TARGET_IAMCU_P (opts->x_target_flags))
3696 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3697 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3699 #endif
3701 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3703 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3704 OPTION_MASK_ABI_64 for TARGET_X32. */
3705 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3706 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3708 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3709 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3710 | OPTION_MASK_ABI_X32
3711 | OPTION_MASK_ABI_64);
3712 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3714 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3715 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3716 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3717 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3720 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3721 SUBTARGET_OVERRIDE_OPTIONS;
3722 #endif
3724 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3725 SUBSUBTARGET_OVERRIDE_OPTIONS;
3726 #endif
3728 /* -fPIC is the default for x86_64. */
3729 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3730 opts->x_flag_pic = 2;
3732 /* Need to check -mtune=generic first. */
3733 if (opts->x_ix86_tune_string)
3735 /* As special support for cross compilers we read -mtune=native
3736 as -mtune=generic. With native compilers we won't see the
3737 -mtune=native, as it was changed by the driver. */
3738 if (!strcmp (opts->x_ix86_tune_string, "native"))
3740 opts->x_ix86_tune_string = "generic";
3742 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3743 warning (OPT_Wdeprecated,
3744 main_args_p
3745 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3746 "or %<-mtune=generic%> instead as appropriate")
3747 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3748 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3749 " instead as appropriate"));
3751 else
3753 if (opts->x_ix86_arch_string)
3754 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3755 if (!opts->x_ix86_tune_string)
3757 opts->x_ix86_tune_string
3758 = processor_target_table[TARGET_CPU_DEFAULT].name;
3759 ix86_tune_defaulted = 1;
3762 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3763 or defaulted. We need to use a sensible tune option. */
3764 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3766 opts->x_ix86_tune_string = "generic";
3770 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3771 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3773 /* rep; movq isn't available in 32-bit code. */
3774 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3775 opts->x_ix86_stringop_alg = no_stringop;
3778 if (!opts->x_ix86_arch_string)
3779 opts->x_ix86_arch_string
3780 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3781 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3782 else
3783 ix86_arch_specified = 1;
3785 if (opts_set->x_ix86_pmode)
3787 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3788 && opts->x_ix86_pmode == PMODE_SI)
3789 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3790 && opts->x_ix86_pmode == PMODE_DI))
3791 error ("address mode %qs not supported in the %s bit mode",
3792 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3793 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3795 else
3796 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3797 ? PMODE_DI : PMODE_SI;
3799 if (!opts_set->x_ix86_abi)
3800 opts->x_ix86_abi = DEFAULT_ABI;
3802 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3803 error ("-mabi=ms not supported with X32 ABI");
3804 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3806 /* For targets using the MS ABI enable ms-extensions, if not
3807 explicitly turned off.  For non-MS ABI we turn off this
3808 option. */
3809 if (!opts_set->x_flag_ms_extensions)
3810 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3812 if (opts_set->x_ix86_cmodel)
3814 switch (opts->x_ix86_cmodel)
3816 case CM_SMALL:
3817 case CM_SMALL_PIC:
3818 if (opts->x_flag_pic)
3819 opts->x_ix86_cmodel = CM_SMALL_PIC;
3820 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3821 error ("code model %qs not supported in the %s bit mode",
3822 "small", "32");
3823 break;
3825 case CM_MEDIUM:
3826 case CM_MEDIUM_PIC:
3827 if (opts->x_flag_pic)
3828 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3829 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3830 error ("code model %qs not supported in the %s bit mode",
3831 "medium", "32");
3832 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3833 error ("code model %qs not supported in x32 mode",
3834 "medium");
3835 break;
3837 case CM_LARGE:
3838 case CM_LARGE_PIC:
3839 if (opts->x_flag_pic)
3840 opts->x_ix86_cmodel = CM_LARGE_PIC;
3841 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3842 error ("code model %qs not supported in the %s bit mode",
3843 "large", "32");
3844 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3845 error ("code model %qs not supported in x32 mode",
3846 "large");
3847 break;
3849 case CM_32:
3850 if (opts->x_flag_pic)
3851 error ("code model %s does not support PIC mode", "32");
3852 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3853 error ("code model %qs not supported in the %s bit mode",
3854 "32", "64");
3855 break;
3857 case CM_KERNEL:
3858 if (opts->x_flag_pic)
3860 error ("code model %s does not support PIC mode", "kernel");
3861 opts->x_ix86_cmodel = CM_32;
3863 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3864 error ("code model %qs not supported in the %s bit mode",
3865 "kernel", "32");
3866 break;
3868 default:
3869 gcc_unreachable ();
3872 else
3874 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3875 use of rip-relative addressing. This eliminates fixups that
3876 would otherwise be needed if this object is to be placed in a
3877 DLL, and is essentially just as efficient as direct addressing. */
3878 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3879 && (TARGET_RDOS || TARGET_PECOFF))
3880 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3881 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3882 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3883 else
3884 opts->x_ix86_cmodel = CM_32;
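/* Illustrative defaults following from the logic above (not tied to any
   particular switch): a 64-bit ELF build with -fPIC and no explicit
   -mcmodel= ends up with CM_SMALL_PIC, while a plain -m32 build falls
   back to CM_32.  */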
3886 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3888 error ("-masm=intel not supported in this configuration");
3889 opts->x_ix86_asm_dialect = ASM_ATT;
3891 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3892 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3893 sorry ("%i-bit mode not compiled in",
3894 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3896 for (i = 0; i < pta_size; i++)
3897 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3899 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3901 error (main_args_p
3902 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3903 "switch")
3904 : G_("%<generic%> CPU can be used only for "
3905 "%<target(\"tune=\")%> attribute"));
3906 return false;
3908 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3910 error (main_args_p
3911 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3912 "switch")
3913 : G_("%<intel%> CPU can be used only for "
3914 "%<target(\"tune=\")%> attribute"));
3915 return false;
3918 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3919 && !(processor_alias_table[i].flags & PTA_64BIT))
3921 error ("CPU you selected does not support x86-64 "
3922 "instruction set");
3923 return false;
3926 ix86_schedule = processor_alias_table[i].schedule;
3927 ix86_arch = processor_alias_table[i].processor;
3928 /* Default cpu tuning to the architecture. */
3929 ix86_tune = ix86_arch;
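/* Each PTA_* capability bit recorded for the selected -march entry is
   translated below into the matching OPTION_MASK_ISA_* bit, but only
   when the user did not set that ISA flag explicitly (explicit choices
   are tracked in opts->x_ix86_isa_flags_explicit).  */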
3931 if (processor_alias_table[i].flags & PTA_MMX
3932 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3933 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3934 if (processor_alias_table[i].flags & PTA_3DNOW
3935 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3936 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3937 if (processor_alias_table[i].flags & PTA_3DNOW_A
3938 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3939 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3940 if (processor_alias_table[i].flags & PTA_SSE
3941 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3942 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3943 if (processor_alias_table[i].flags & PTA_SSE2
3944 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3945 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3946 if (processor_alias_table[i].flags & PTA_SSE3
3947 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3948 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3949 if (processor_alias_table[i].flags & PTA_SSSE3
3950 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3951 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3952 if (processor_alias_table[i].flags & PTA_SSE4_1
3953 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3954 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3955 if (processor_alias_table[i].flags & PTA_SSE4_2
3956 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3957 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3958 if (processor_alias_table[i].flags & PTA_AVX
3959 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3960 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3961 if (processor_alias_table[i].flags & PTA_AVX2
3962 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3963 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3964 if (processor_alias_table[i].flags & PTA_FMA
3965 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3966 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3967 if (processor_alias_table[i].flags & PTA_SSE4A
3968 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3969 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3970 if (processor_alias_table[i].flags & PTA_FMA4
3971 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3972 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3973 if (processor_alias_table[i].flags & PTA_XOP
3974 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3976 if (processor_alias_table[i].flags & PTA_LWP
3977 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3978 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3979 if (processor_alias_table[i].flags & PTA_ABM
3980 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3981 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3982 if (processor_alias_table[i].flags & PTA_BMI
3983 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3984 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3985 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3986 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3987 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3988 if (processor_alias_table[i].flags & PTA_TBM
3989 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3990 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3991 if (processor_alias_table[i].flags & PTA_BMI2
3992 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3993 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3994 if (processor_alias_table[i].flags & PTA_CX16
3995 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3996 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3997 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3998 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3999 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4000 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4001 && (processor_alias_table[i].flags & PTA_NO_SAHF))
4002 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4003 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4004 if (processor_alias_table[i].flags & PTA_MOVBE
4005 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
4006 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
4007 if (processor_alias_table[i].flags & PTA_AES
4008 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4009 ix86_isa_flags |= OPTION_MASK_ISA_AES;
4010 if (processor_alias_table[i].flags & PTA_SHA
4011 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4012 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4013 if (processor_alias_table[i].flags & PTA_PCLMUL
4014 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4015 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4016 if (processor_alias_table[i].flags & PTA_FSGSBASE
4017 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4018 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4019 if (processor_alias_table[i].flags & PTA_RDRND
4020 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4021 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4022 if (processor_alias_table[i].flags & PTA_F16C
4023 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4024 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4025 if (processor_alias_table[i].flags & PTA_RTM
4026 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4027 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4028 if (processor_alias_table[i].flags & PTA_HLE
4029 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
4030 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
4031 if (processor_alias_table[i].flags & PTA_PRFCHW
4032 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4033 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4034 if (processor_alias_table[i].flags & PTA_RDSEED
4035 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4036 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4037 if (processor_alias_table[i].flags & PTA_ADX
4038 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4039 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4040 if (processor_alias_table[i].flags & PTA_FXSR
4041 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4042 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4043 if (processor_alias_table[i].flags & PTA_XSAVE
4044 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4045 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4046 if (processor_alias_table[i].flags & PTA_XSAVEOPT
4047 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4048 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4049 if (processor_alias_table[i].flags & PTA_AVX512F
4050 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4051 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4052 if (processor_alias_table[i].flags & PTA_AVX512ER
4053 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4054 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4055 if (processor_alias_table[i].flags & PTA_AVX512PF
4056 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4057 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4058 if (processor_alias_table[i].flags & PTA_AVX512CD
4059 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4060 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4061 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
4062 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4063 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4064 if (processor_alias_table[i].flags & PTA_CLWB
4065 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4066 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4067 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
4068 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4069 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4070 if (processor_alias_table[i].flags & PTA_CLZERO
4071 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
4072 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
4073 if (processor_alias_table[i].flags & PTA_XSAVEC
4074 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4075 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4076 if (processor_alias_table[i].flags & PTA_XSAVES
4077 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4078 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4079 if (processor_alias_table[i].flags & PTA_AVX512DQ
4080 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4081 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4082 if (processor_alias_table[i].flags & PTA_AVX512BW
4083 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4084 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4085 if (processor_alias_table[i].flags & PTA_AVX512VL
4086 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4087 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4088 if (processor_alias_table[i].flags & PTA_MPX
4089 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4090 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4091 if (processor_alias_table[i].flags & PTA_AVX512VBMI
4092 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4093 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4094 if (processor_alias_table[i].flags & PTA_AVX512IFMA
4095 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4096 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4098 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
4099 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
4100 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4101 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
4102 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
4103 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4104 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
4105 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4106 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4107 if (processor_alias_table[i].flags & PTA_SGX
4108 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4109 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4111 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
4112 x86_prefetch_sse = true;
4113 if (processor_alias_table[i].flags & PTA_MWAITX
4114 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
4115 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
4116 if (processor_alias_table[i].flags & PTA_PKU
4117 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4118 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4120 /* Don't enable x87 instructions if only
4121 general registers are allowed. */
4122 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4123 && !(opts_set->x_target_flags & MASK_80387))
4125 if (processor_alias_table[i].flags & PTA_NO_80387)
4126 opts->x_target_flags &= ~MASK_80387;
4127 else
4128 opts->x_target_flags |= MASK_80387;
4130 break;
4133 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4134 error ("Intel MPX does not support x32");
4136 if (TARGET_X32 && (ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4137 error ("Intel MPX does not support x32");
4139 if (i == pta_size)
4141 error (main_args_p
4142 ? G_("bad value (%qs) for %<-march=%> switch")
4143 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4144 opts->x_ix86_arch_string);
4146 auto_vec <const char *> candidates;
4147 for (i = 0; i < pta_size; i++)
4148 if (strcmp (processor_alias_table[i].name, "generic")
4149 && strcmp (processor_alias_table[i].name, "intel")
4150 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4151 || (processor_alias_table[i].flags & PTA_64BIT)))
4152 candidates.safe_push (processor_alias_table[i].name);
4154 char *s;
4155 const char *hint
4156 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4157 if (hint)
4158 inform (input_location,
4159 main_args_p
4160 ? G_("valid arguments to %<-march=%> switch are: "
4161 "%s; did you mean %qs?")
4162 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4163 "%s; did you mean %qs?"), s, hint);
4164 else
4165 inform (input_location,
4166 main_args_p
4167 ? G_("valid arguments to %<-march=%> switch are: %s")
4168 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4169 "are: %s"), s);
4170 XDELETEVEC (s);
4173 ix86_arch_mask = 1u << ix86_arch;
4174 for (i = 0; i < X86_ARCH_LAST; ++i)
4175 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
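/* ix86_arch_mask is a one-hot mask selecting the column of
   initial_ix86_arch_features that belongs to the chosen -march
   processor, so each X86_ARCH_* feature reduces to a single bit test
   in the loop above.  */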
4177 for (i = 0; i < pta_size; i++)
4178 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4180 ix86_schedule = processor_alias_table[i].schedule;
4181 ix86_tune = processor_alias_table[i].processor;
4182 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4184 if (!(processor_alias_table[i].flags & PTA_64BIT))
4186 if (ix86_tune_defaulted)
4188 opts->x_ix86_tune_string = "x86-64";
4189 for (i = 0; i < pta_size; i++)
4190 if (! strcmp (opts->x_ix86_tune_string,
4191 processor_alias_table[i].name))
4192 break;
4193 ix86_schedule = processor_alias_table[i].schedule;
4194 ix86_tune = processor_alias_table[i].processor;
4196 else
4197 error ("CPU you selected does not support x86-64 "
4198 "instruction set");
4201 /* Intel CPUs have always interpreted SSE prefetch instructions as
4202 NOPs; so, we can enable SSE prefetch instructions even when
4203 -mtune (rather than -march) points us to a processor that has them.
4204 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4205 higher processors. */
4206 if (TARGET_CMOV
4207 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
4208 x86_prefetch_sse = true;
4209 break;
4212 if (ix86_tune_specified && i == pta_size)
4214 error (main_args_p
4215 ? G_("bad value (%qs) for %<-mtune=%> switch")
4216 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4217 opts->x_ix86_tune_string);
4219 auto_vec <const char *> candidates;
4220 for (i = 0; i < pta_size; i++)
4221 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4222 || (processor_alias_table[i].flags & PTA_64BIT))
4223 candidates.safe_push (processor_alias_table[i].name);
4225 char *s;
4226 const char *hint
4227 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4228 if (hint)
4229 inform (input_location,
4230 main_args_p
4231 ? G_("valid arguments to %<-mtune=%> switch are: "
4232 "%s; did you mean %qs?")
4233 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4234 "%s; did you mean %qs?"), s, hint);
4235 else
4236 inform (input_location,
4237 main_args_p
4238 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4239 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4240 "are: %s"), s);
4241 XDELETEVEC (s);
4244 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4246 #ifndef USE_IX86_FRAME_POINTER
4247 #define USE_IX86_FRAME_POINTER 0
4248 #endif
4250 #ifndef USE_X86_64_FRAME_POINTER
4251 #define USE_X86_64_FRAME_POINTER 0
4252 #endif
4254 /* Set the default values for switches whose default depends on TARGET_64BIT
4255 in case they weren't overwritten by command line options. */
4256 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4258 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4259 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4260 if (opts->x_flag_asynchronous_unwind_tables
4261 && !opts_set->x_flag_unwind_tables
4262 && TARGET_64BIT_MS_ABI)
4263 opts->x_flag_unwind_tables = 1;
4264 if (opts->x_flag_asynchronous_unwind_tables == 2)
4265 opts->x_flag_unwind_tables
4266 = opts->x_flag_asynchronous_unwind_tables = 1;
4267 if (opts->x_flag_pcc_struct_return == 2)
4268 opts->x_flag_pcc_struct_return = 0;
4270 else
4272 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4273 opts->x_flag_omit_frame_pointer
4274 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4275 if (opts->x_flag_asynchronous_unwind_tables == 2)
4276 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4277 if (opts->x_flag_pcc_struct_return == 2)
4279 /* Intel MCU psABI specifies that -freg-struct-return should
4280 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4281 we check -miamcu so that -freg-struct-return is always
4282 turned on if -miamcu is used. */
4283 if (TARGET_IAMCU_P (opts->x_target_flags))
4284 opts->x_flag_pcc_struct_return = 0;
4285 else
4286 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4290 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4291 /* TODO: ix86_cost should be chosen at instruction or function granularity,
4292 so that for cold code we use size_cost even in !optimize_size compilation. */
4293 if (opts->x_optimize_size)
4294 ix86_cost = &ix86_size_cost;
4295 else
4296 ix86_cost = ix86_tune_cost;
4298 /* Arrange to set up i386_stack_locals for all functions. */
4299 init_machine_status = ix86_init_machine_status;
4301 /* Validate -mregparm= value. */
4302 if (opts_set->x_ix86_regparm)
4304 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4305 warning (0, "-mregparm is ignored in 64-bit mode");
4306 else if (TARGET_IAMCU_P (opts->x_target_flags))
4307 warning (0, "-mregparm is ignored for Intel MCU psABI");
4308 if (opts->x_ix86_regparm > REGPARM_MAX)
4310 error ("-mregparm=%d is not between 0 and %d",
4311 opts->x_ix86_regparm, REGPARM_MAX);
4312 opts->x_ix86_regparm = 0;
4315 if (TARGET_IAMCU_P (opts->x_target_flags)
4316 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4317 opts->x_ix86_regparm = REGPARM_MAX;
4319 /* Default align_* from the processor table. */
4320 ix86_default_align (opts);
4322 /* Provide default for -mbranch-cost= value. */
4323 if (!opts_set->x_ix86_branch_cost)
4324 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4326 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4328 opts->x_target_flags
4329 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4331 /* Enable by default the SSE and MMX builtins. Do allow the user to
4332 explicitly disable any of these. In particular, disabling SSE and
4333 MMX for kernel code is extremely useful. */
4334 if (!ix86_arch_specified)
4335 opts->x_ix86_isa_flags
4336 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4337 | TARGET_SUBTARGET64_ISA_DEFAULT)
4338 & ~opts->x_ix86_isa_flags_explicit);
4340 if (TARGET_RTD_P (opts->x_target_flags))
4341 warning (0,
4342 main_args_p
4343 ? G_("%<-mrtd%> is ignored in 64bit mode")
4344 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4346 else
4348 opts->x_target_flags
4349 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4351 if (!ix86_arch_specified)
4352 opts->x_ix86_isa_flags
4353 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4355 /* The i386 ABI does not specify a red zone.  It still makes sense to use one
4356 when the programmer takes care to keep the stack from being destroyed. */
4357 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4358 opts->x_target_flags |= MASK_NO_RED_ZONE;
4361 /* Keep nonleaf frame pointers. */
4362 if (opts->x_flag_omit_frame_pointer)
4363 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4364 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4365 opts->x_flag_omit_frame_pointer = 1;
4367 /* If we're doing fast math, we don't care about comparison order
4368 wrt NaNs. This lets us use a shorter comparison sequence. */
4369 if (opts->x_flag_finite_math_only)
4370 opts->x_target_flags &= ~MASK_IEEE_FP;
4372 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4373 since the insns won't need emulation. */
4374 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4375 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4377 /* Likewise, if the target doesn't have a 387, or we've specified
4378 software floating point, don't use 387 inline intrinsics. */
4379 if (!TARGET_80387_P (opts->x_target_flags))
4380 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4382 /* Turn on MMX builtins for -msse. */
4383 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4384 opts->x_ix86_isa_flags
4385 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4387 /* Enable SSE prefetch. */
4388 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4389 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4390 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4391 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4392 x86_prefetch_sse = true;
4394 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4395 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4396 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4397 opts->x_ix86_isa_flags
4398 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4400 /* Enable lzcnt instruction for -mabm. */
4401 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
4402 opts->x_ix86_isa_flags
4403 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4405 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4406 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
4407 opts->x_ix86_isa_flags
4408 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4409 & ~opts->x_ix86_isa_flags_explicit);
4411 /* Validate -mpreferred-stack-boundary= value or default it to
4412 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4413 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4414 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4416 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4417 int max = TARGET_SEH ? 4 : 12;
4419 if (opts->x_ix86_preferred_stack_boundary_arg < min
4420 || opts->x_ix86_preferred_stack_boundary_arg > max)
4422 if (min == max)
4423 error ("-mpreferred-stack-boundary is not supported "
4424 "for this target");
4425 else
4426 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4427 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4429 else
4430 ix86_preferred_stack_boundary
4431 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
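/* Worked example: -mpreferred-stack-boundary=4 yields
   (1 << 4) * BITS_PER_UNIT = 16 * 8 = 128 bits, i.e. a 16-byte
   preferred stack boundary.  */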
4434 /* Set the default value for -mstackrealign. */
4435 if (!opts_set->x_ix86_force_align_arg_pointer)
4436 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4438 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4440 /* Validate -mincoming-stack-boundary= value or default it to
4441 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4442 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4443 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4445 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4447 if (opts->x_ix86_incoming_stack_boundary_arg < min
4448 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4449 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4450 opts->x_ix86_incoming_stack_boundary_arg, min);
4451 else
4453 ix86_user_incoming_stack_boundary
4454 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4455 ix86_incoming_stack_boundary
4456 = ix86_user_incoming_stack_boundary;
4460 #ifndef NO_PROFILE_COUNTERS
4461 if (flag_nop_mcount)
4462 error ("-mnop-mcount is not compatible with this target");
4463 #endif
4464 if (flag_nop_mcount && flag_pic)
4465 error ("-mnop-mcount is not implemented for -fPIC");
4467 /* Accept -msseregparm only if at least SSE support is enabled. */
4468 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4469 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4470 error (main_args_p
4471 ? G_("%<-msseregparm%> used without SSE enabled")
4472 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4474 if (opts_set->x_ix86_fpmath)
4476 if (opts->x_ix86_fpmath & FPMATH_SSE)
4478 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4480 if (TARGET_80387_P (opts->x_target_flags))
4482 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4483 opts->x_ix86_fpmath = FPMATH_387;
4486 else if ((opts->x_ix86_fpmath & FPMATH_387)
4487 && !TARGET_80387_P (opts->x_target_flags))
4489 warning (0, "387 instruction set disabled, using SSE arithmetics");
4490 opts->x_ix86_fpmath = FPMATH_SSE;
4494 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4495 -mfpmath=387. The latter is however the default on many targets, since the
4496 extra 80-bit precision of temporaries is considered to be part of the ABI.
4497 Overwrite the default at least for -ffast-math.
4498 TODO: -mfpmath=both seems to produce code that performs the same, with
4499 slightly smaller binaries. It is however not clear if register allocation is
4500 ready for this setting.
4501 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
4502 codegen. We may switch to 387 with -ffast-math for size-optimized
4503 functions. */
4504 else if (fast_math_flags_set_p (&global_options)
4505 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4506 opts->x_ix86_fpmath = FPMATH_SSE;
4507 else
4508 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4510 /* Use external vectorized library in vectorizing intrinsics. */
4511 if (opts_set->x_ix86_veclibabi_type)
4512 switch (opts->x_ix86_veclibabi_type)
4514 case ix86_veclibabi_type_svml:
4515 ix86_veclib_handler = ix86_veclibabi_svml;
4516 break;
4518 case ix86_veclibabi_type_acml:
4519 ix86_veclib_handler = ix86_veclibabi_acml;
4520 break;
4522 default:
4523 gcc_unreachable ();
4526 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4527 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4528 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4530 /* If stack probes are required, the space used for large function
4531 arguments on the stack must also be probed, so enable
4532 -maccumulate-outgoing-args so this happens in the prologue. */
4533 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4534 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4536 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4537 warning (0,
4538 main_args_p
4539 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4540 "for correctness")
4541 : G_("stack probing requires "
4542 "%<target(\"accumulate-outgoing-args\")%> for "
4543 "correctness"));
4544 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4547 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4548 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4549 if (fixed_regs[BP_REG]
4550 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4552 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4553 warning (0,
4554 main_args_p
4555 ? G_("fixed ebp register requires "
4556 "%<-maccumulate-outgoing-args%>")
4557 : G_("fixed ebp register requires "
4558 "%<target(\"accumulate-outgoing-args\")%>"));
4559 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4562 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4564 char *p;
4565 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4566 p = strchr (internal_label_prefix, 'X');
4567 internal_label_prefix_len = p - internal_label_prefix;
4568 *p = '\0';
4571 /* When the scheduling description is not available, disable the scheduler pass
4572 so it won't slow down the compilation and make x87 code slower. */
4573 if (!TARGET_SCHEDULE)
4574 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4576 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4577 ix86_tune_cost->simultaneous_prefetches,
4578 opts->x_param_values,
4579 opts_set->x_param_values);
4580 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4581 ix86_tune_cost->prefetch_block,
4582 opts->x_param_values,
4583 opts_set->x_param_values);
4584 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4585 ix86_tune_cost->l1_cache_size,
4586 opts->x_param_values,
4587 opts_set->x_param_values);
4588 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4589 ix86_tune_cost->l2_cache_size,
4590 opts->x_param_values,
4591 opts_set->x_param_values);
4593 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4594 if (opts->x_flag_prefetch_loop_arrays < 0
4595 && HAVE_prefetch
4596 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4597 && !opts->x_optimize_size
4598 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4599 opts->x_flag_prefetch_loop_arrays = 1;
4601 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4602 can be optimized to ap = __builtin_next_arg (0). */
4603 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4604 targetm.expand_builtin_va_start = NULL;
4606 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4608 ix86_gen_leave = gen_leave_rex64;
4609 if (Pmode == DImode)
4611 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4612 ix86_gen_tls_local_dynamic_base_64
4613 = gen_tls_local_dynamic_base_64_di;
4615 else
4617 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4618 ix86_gen_tls_local_dynamic_base_64
4619 = gen_tls_local_dynamic_base_64_si;
4622 else
4623 ix86_gen_leave = gen_leave;
4625 if (Pmode == DImode)
4627 ix86_gen_add3 = gen_adddi3;
4628 ix86_gen_sub3 = gen_subdi3;
4629 ix86_gen_sub3_carry = gen_subdi3_carry;
4630 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4631 ix86_gen_andsp = gen_anddi3;
4632 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4633 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4634 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4635 ix86_gen_monitor = gen_sse3_monitor_di;
4636 ix86_gen_monitorx = gen_monitorx_di;
4637 ix86_gen_clzero = gen_clzero_di;
4639 else
4641 ix86_gen_add3 = gen_addsi3;
4642 ix86_gen_sub3 = gen_subsi3;
4643 ix86_gen_sub3_carry = gen_subsi3_carry;
4644 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4645 ix86_gen_andsp = gen_andsi3;
4646 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4647 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4648 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4649 ix86_gen_monitor = gen_sse3_monitor_si;
4650 ix86_gen_monitorx = gen_monitorx_si;
4651 ix86_gen_clzero = gen_clzero_si;
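/* The ix86_gen_* hooks are resolved once here so that later code
   (e.g. the prologue/epilogue and builtin expanders) can emit
   pointer-sized operations such as ix86_gen_add3 without re-checking
   whether Pmode is DImode or SImode.  */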
4654 #ifdef USE_IX86_CLD
4655 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4656 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4657 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4658 #endif
4660 /* Set the default value for -mfentry. */
4661 if (!opts_set->x_flag_fentry)
4662 opts->x_flag_fentry = TARGET_SEH;
4663 else
4665 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4666 && opts->x_flag_fentry)
4667 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4668 "with -fpic");
4669 else if (TARGET_SEH && !opts->x_flag_fentry)
4670 sorry ("-mno-fentry isn%'t compatible with SEH");
4673 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4674 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4676 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4677 && TARGET_EMIT_VZEROUPPER)
4678 opts->x_target_flags |= MASK_VZEROUPPER;
4679 if (!(opts_set->x_target_flags & MASK_STV))
4680 opts->x_target_flags |= MASK_STV;
4681 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4682 -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
4683 stack realignment would be an extra cost the pass doesn't take into
4684 account, and the pass can't realign the stack. */
4685 if (ix86_preferred_stack_boundary < 128
4686 || ix86_incoming_stack_boundary < 128
4687 || opts->x_ix86_force_align_arg_pointer)
4688 opts->x_target_flags &= ~MASK_STV;
4689 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4690 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4691 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4692 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4693 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4694 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4696 /* Enable 128-bit AVX instruction generation
4697 for the auto-vectorizer. */
4698 if (TARGET_AVX128_OPTIMAL
4699 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4700 opts->x_prefer_vector_width_type = PVW_AVX128;
4702 /* Use 256-bit AVX instruction generation
4703 in the auto-vectorizer. */
4704 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4705 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4706 opts->x_prefer_vector_width_type = PVW_AVX256;
4708 if (opts->x_ix86_recip_name)
4710 char *p = ASTRDUP (opts->x_ix86_recip_name);
4711 char *q;
4712 unsigned int mask, i;
4713 bool invert;
4715 while ((q = strtok (p, ",")) != NULL)
4717 p = NULL;
4718 if (*q == '!')
4720 invert = true;
4721 q++;
4723 else
4724 invert = false;
4726 if (!strcmp (q, "default"))
4727 mask = RECIP_MASK_ALL;
4728 else
4730 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4731 if (!strcmp (q, recip_options[i].string))
4733 mask = recip_options[i].mask;
4734 break;
4737 if (i == ARRAY_SIZE (recip_options))
4739 error ("unknown option for -mrecip=%s", q);
4740 invert = false;
4741 mask = RECIP_MASK_NONE;
4745 opts->x_recip_mask_explicit |= mask;
4746 if (invert)
4747 opts->x_recip_mask &= ~mask;
4748 else
4749 opts->x_recip_mask |= mask;
4753 if (TARGET_RECIP_P (opts->x_target_flags))
4754 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4755 else if (opts_set->x_target_flags & MASK_RECIP)
4756 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
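/* For illustration (option names other than "default" come from the
   recip_options[] table defined elsewhere in this file): an argument
   such as -mrecip=all,!sqrt would first set every RECIP_MASK_* bit and
   then, because of the '!' prefix, clear the sqrt bit again.  */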
4758 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4759 for 64-bit Bionic. Also default long double to 64-bit for Intel
4760 MCU psABI. */
4761 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4762 && !(opts_set->x_target_flags
4763 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4764 opts->x_target_flags |= (TARGET_64BIT
4765 ? MASK_LONG_DOUBLE_128
4766 : MASK_LONG_DOUBLE_64);
4768 /* Only one of them can be active. */
4769 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4770 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4772 /* Handle stack protector */
4773 if (!opts_set->x_ix86_stack_protector_guard)
4774 opts->x_ix86_stack_protector_guard
4775 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4777 #ifdef TARGET_THREAD_SSP_OFFSET
4778 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4779 #endif
4781 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4783 char *endp;
4784 const char *str = ix86_stack_protector_guard_offset_str;
4786 errno = 0;
4787 int64_t offset;
4789 #if defined(INT64_T_IS_LONG)
4790 offset = strtol (str, &endp, 0);
4791 #else
4792 offset = strtoll (str, &endp, 0);
4793 #endif
4795 if (!*str || *endp || errno)
4796 error ("%qs is not a valid number "
4797 "in -mstack-protector-guard-offset=", str);
4799 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4800 HOST_WIDE_INT_C (0x7fffffff)))
4801 error ("%qs is not a valid offset "
4802 "in -mstack-protector-guard-offset=", str);
4804 ix86_stack_protector_guard_offset = offset;
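/* Example: -mstack-protector-guard-offset=0x28 is parsed with
   strtol/strtoll using base 0 (so 0x/0 prefixes are honored) and must
   fit in the signed 32-bit range checked above.  */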
4807 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4809 /* The kernel uses a different segment register for performance
4810 reasons; a system call would not have to trash the userspace
4811 segment register, which would be expensive. */
4812 if (ix86_cmodel == CM_KERNEL)
4813 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4815 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4817 const char *str = ix86_stack_protector_guard_reg_str;
4818 addr_space_t seg = ADDR_SPACE_GENERIC;
4820 /* Discard optional register prefix. */
4821 if (str[0] == '%')
4822 str++;
4824 if (strlen (str) == 2 && str[1] == 's')
4826 if (str[0] == 'f')
4827 seg = ADDR_SPACE_SEG_FS;
4828 else if (str[0] == 'g')
4829 seg = ADDR_SPACE_SEG_GS;
4832 if (seg == ADDR_SPACE_GENERIC)
4833 error ("%qs is not a valid base register "
4834 "in -mstack-protector-guard-reg=",
4835 ix86_stack_protector_guard_reg_str);
4837 ix86_stack_protector_guard_reg = seg;
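/* Only "fs", "gs", "%fs" and "%gs" are accepted here; anything else
   leaves seg at ADDR_SPACE_GENERIC and is rejected with the error
   above.  */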
4840 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4841 if (opts->x_ix86_tune_memcpy_strategy)
4843 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4844 ix86_parse_stringop_strategy_string (str, false);
4845 free (str);
4848 if (opts->x_ix86_tune_memset_strategy)
4850 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4851 ix86_parse_stringop_strategy_string (str, true);
4852 free (str);
4855 /* Save the initial options in case the user does function specific
4856 options. */
4857 if (main_args_p)
4858 target_option_default_node = target_option_current_node
4859 = build_target_option_node (opts);
4861 /* Do not support control flow instrumentation if CET is not enabled. */
4862 if (opts->x_flag_cf_protection != CF_NONE)
4864 if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2)
4865 || TARGET_SHSTK_P (opts->x_ix86_isa_flags2)))
4867 if (flag_cf_protection == CF_FULL)
4869 error ("%<-fcf-protection=full%> requires CET support "
4870 "on this target. Use -mcet or one of -mibt, "
4871 "-mshstk options to enable CET");
4873 else if (flag_cf_protection == CF_BRANCH)
4875 error ("%<-fcf-protection=branch%> requires CET support "
4876 "on this target. Use -mcet or one of -mibt, "
4877 "-mshstk options to enable CET");
4879 else if (flag_cf_protection == CF_RETURN)
4881 error ("%<-fcf-protection=return%> requires CET support "
4882 "on this target. Use -mcet or one of -mibt, "
4883 "-mshstk options to enable CET");
4885 flag_cf_protection = CF_NONE;
4886 return false;
4888 opts->x_flag_cf_protection =
4889 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4892 return true;
4895 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4897 static void
4898 ix86_option_override (void)
4900 ix86_option_override_internal (true, &global_options, &global_options_set);
4903 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4904 static char *
4905 ix86_offload_options (void)
4907 if (TARGET_LP64)
4908 return xstrdup ("-foffload-abi=lp64");
4909 return xstrdup ("-foffload-abi=ilp32");
4912 /* Update register usage after having seen the compiler flags. */
4914 static void
4915 ix86_conditional_register_usage (void)
4917 int i, c_mask;
4919 /* If there are no caller-saved registers, preserve all registers
4920 except fixed_regs and registers used for the function return value,
4921 since aggregate_value_p checks call_used_regs[regno] on the return
4922 value. */
4923 if (cfun && cfun->machine->no_caller_saved_registers)
4924 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4925 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4926 call_used_regs[i] = 0;
4928 /* For 32-bit targets, squash the REX registers. */
4929 if (! TARGET_64BIT)
4931 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4932 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4933 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4934 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4935 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4936 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4939 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4940 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
4942 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4944 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4946 /* Set/reset conditionally defined registers from
4947 CALL_USED_REGISTERS initializer. */
4948 if (call_used_regs[i] > 1)
4949 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4951 /* Calculate registers of CLOBBERED_REGS register set
4952 as call used registers from GENERAL_REGS register set. */
4953 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4954 && call_used_regs[i])
4955 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4958 /* If MMX is disabled, squash the registers. */
4959 if (! TARGET_MMX)
4960 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4961 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4962 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4964 /* If SSE is disabled, squash the registers. */
4965 if (! TARGET_SSE)
4966 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4967 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4968 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4970 /* If the FPU is disabled, squash the registers. */
4971 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4972 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4973 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4974 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4976 /* If AVX512F is disabled, squash the registers. */
4977 if (! TARGET_AVX512F)
4979 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4980 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4982 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4983 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4986 /* If MPX is disabled, squash the registers. */
4987 if (! TARGET_MPX)
4988 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4989 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4992 /* Canonicalize a comparison from one we don't have to one we do have. */
4994 static void
4995 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
4996 bool op0_preserve_value)
4998 /* The order of operands in an x87 ficom compare is forced by combine in
4999 the simplify_comparison () function.  A float operator is treated as RTX_OBJ
5000 with precedence over other operators and is always put in the first
5001 place.  Swap the condition and operands to match the ficom instruction. */
5002 if (!op0_preserve_value
5003 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5005 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5007 /* We are called only for compares that are split to SAHF instruction.
5008 Ensure that we have setcc/jcc insn for the swapped condition. */
5009 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5011 std::swap (*op0, *op1);
5012 *code = (int) scode;
5017 /* Save the current options */
5019 static void
5020 ix86_function_specific_save (struct cl_target_option *ptr,
5021 struct gcc_options *opts)
5023 ptr->arch = ix86_arch;
5024 ptr->schedule = ix86_schedule;
5025 ptr->prefetch_sse = x86_prefetch_sse;
5026 ptr->tune = ix86_tune;
5027 ptr->branch_cost = ix86_branch_cost;
5028 ptr->tune_defaulted = ix86_tune_defaulted;
5029 ptr->arch_specified = ix86_arch_specified;
5030 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5031 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5032 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5033 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5034 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5035 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5036 ptr->x_ix86_abi = opts->x_ix86_abi;
5037 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5038 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5039 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5040 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5041 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5042 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5043 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5044 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5045 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5046 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5047 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5048 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5049 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5050 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5051 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5052 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5053 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5054 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5055 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5056 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5058 /* The fields are char but the variables are not; make sure the
5059 values fit in the fields. */
5060 gcc_assert (ptr->arch == ix86_arch);
5061 gcc_assert (ptr->schedule == ix86_schedule);
5062 gcc_assert (ptr->tune == ix86_tune);
5063 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5066 /* Restore the current options */
5068 static void
5069 ix86_function_specific_restore (struct gcc_options *opts,
5070 struct cl_target_option *ptr)
5072 enum processor_type old_tune = ix86_tune;
5073 enum processor_type old_arch = ix86_arch;
5074 unsigned int ix86_arch_mask;
5075 int i;
5077 /* We don't change -fPIC. */
5078 opts->x_flag_pic = flag_pic;
5080 ix86_arch = (enum processor_type) ptr->arch;
5081 ix86_schedule = (enum attr_cpu) ptr->schedule;
5082 ix86_tune = (enum processor_type) ptr->tune;
5083 x86_prefetch_sse = ptr->prefetch_sse;
5084 opts->x_ix86_branch_cost = ptr->branch_cost;
5085 ix86_tune_defaulted = ptr->tune_defaulted;
5086 ix86_arch_specified = ptr->arch_specified;
5087 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5088 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5089 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5090 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5091 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5092 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5093 opts->x_ix86_abi = ptr->x_ix86_abi;
5094 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5095 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5096 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5097 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5098 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5099 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5100 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5101 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5102 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5103 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5104 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5105 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5106 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5107 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5108 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5109 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5110 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5111 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5112 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5113 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5114 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5115 /* TODO: ix86_cost should be chosen at instruction or function granularity,
5116 so that for cold code we use size_cost even in !optimize_size compilation. */
5117 if (opts->x_optimize_size)
5118 ix86_cost = &ix86_size_cost;
5119 else
5120 ix86_cost = ix86_tune_cost;
5122 /* Recreate the arch feature tests if the arch changed */
5123 if (old_arch != ix86_arch)
5125 ix86_arch_mask = 1u << ix86_arch;
5126 for (i = 0; i < X86_ARCH_LAST; ++i)
5127 ix86_arch_features[i]
5128 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5131 /* Recreate the tune optimization tests */
5132 if (old_tune != ix86_tune)
5133 set_ix86_tune_features (ix86_tune, false);
5136 /* Adjust target options after streaming them in. This is mainly about
5137 reconciling them with global options. */
5139 static void
5140 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5142 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5143 partly computed from flag_pic.  If flag_pic is on, adjust x_ix86_cmodel
5144 for PIC, or error out. */
5145 if (flag_pic)
5146 switch (ptr->x_ix86_cmodel)
5148 case CM_SMALL:
5149 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5150 break;
5152 case CM_MEDIUM:
5153 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5154 break;
5156 case CM_LARGE:
5157 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5158 break;
5160 case CM_KERNEL:
5161 error ("code model %s does not support PIC mode", "kernel");
5162 break;
5164 default:
5165 break;
5167 else
5168 switch (ptr->x_ix86_cmodel)
5170 case CM_SMALL_PIC:
5171 ptr->x_ix86_cmodel = CM_SMALL;
5172 break;
5174 case CM_MEDIUM_PIC:
5175 ptr->x_ix86_cmodel = CM_MEDIUM;
5176 break;
5178 case CM_LARGE_PIC:
5179 ptr->x_ix86_cmodel = CM_LARGE;
5180 break;
5182 default:
5183 break;
5187 /* Print the current options */
5189 static void
5190 ix86_function_specific_print (FILE *file, int indent,
5191 struct cl_target_option *ptr)
5193 char *target_string
5194 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5195 ptr->x_target_flags, ptr->x_ix86_target_flags,
5196 NULL, NULL, ptr->x_ix86_fpmath, false);
5198 gcc_assert (ptr->arch < PROCESSOR_max);
5199 fprintf (file, "%*sarch = %d (%s)\n",
5200 indent, "",
5201 ptr->arch, processor_target_table[ptr->arch].name);
5203 gcc_assert (ptr->tune < PROCESSOR_max);
5204 fprintf (file, "%*stune = %d (%s)\n",
5205 indent, "",
5206 ptr->tune, processor_target_table[ptr->tune].name);
5208 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5210 if (target_string)
5212 fprintf (file, "%*s%s\n", indent, "", target_string);
5213 free (target_string);
5218 /* Inner function to process attribute((target(...))): take an argument and
5219 set the current options from that argument. If we have a list, recursively go
5220 over the list. */
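/* Illustrative user-level example (a sketch, not text processed in this
   file verbatim): a declaration such as

     __attribute__((target ("arch=haswell,no-avx2,fpmath=sse")))
     int foo (void);

   reaches this function as one comma-separated string: "arch=" is a
   string option, "no-avx2" is an ISA option negated by its "no-"
   prefix, and "fpmath=" is an enum option.  */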
5222 static bool
5223 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5224 struct gcc_options *opts,
5225 struct gcc_options *opts_set,
5226 struct gcc_options *enum_opts_set)
5228 char *next_optstr;
5229 bool ret = true;
5231 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5232 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5233 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5234 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5235 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5237 enum ix86_opt_type
5239 ix86_opt_unknown,
5240 ix86_opt_yes,
5241 ix86_opt_no,
5242 ix86_opt_str,
5243 ix86_opt_enum,
5244 ix86_opt_isa
5247 static const struct
5249 const char *string;
5250 size_t len;
5251 enum ix86_opt_type type;
5252 int opt;
5253 int mask;
5254 } attrs[] = {
5255 /* isa options */
5256 IX86_ATTR_ISA ("sgx", OPT_msgx),
5257 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5258 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5259 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5260 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5261 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5263 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5264 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5265 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5266 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5267 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5268 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5269 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5270 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5271 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5272 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5273 IX86_ATTR_ISA ("fma", OPT_mfma),
5274 IX86_ATTR_ISA ("xop", OPT_mxop),
5275 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5276 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5277 IX86_ATTR_ISA ("avx", OPT_mavx),
5278 IX86_ATTR_ISA ("sse4", OPT_msse4),
5279 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5280 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5281 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5282 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5283 IX86_ATTR_ISA ("sse3", OPT_msse3),
5284 IX86_ATTR_ISA ("aes", OPT_maes),
5285 IX86_ATTR_ISA ("sha", OPT_msha),
5286 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5287 IX86_ATTR_ISA ("sse2", OPT_msse2),
5288 IX86_ATTR_ISA ("sse", OPT_msse),
5289 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5290 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5291 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5292 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5293 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5294 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5295 IX86_ATTR_ISA ("adx", OPT_madx),
5296 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5297 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5298 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5299 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5300 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5301 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5302 IX86_ATTR_ISA ("abm", OPT_mabm),
5303 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5304 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5305 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5306 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5307 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5308 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5309 IX86_ATTR_ISA ("sahf", OPT_msahf),
5310 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5311 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5312 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5313 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5314 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5315 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5316 IX86_ATTR_ISA ("pku", OPT_mpku),
5317 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5318 IX86_ATTR_ISA ("hle", OPT_mhle),
5319 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5320 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5321 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5322 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5323 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5324 IX86_ATTR_ISA ("ibt", OPT_mibt),
5325 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5326 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5328 /* enum options */
5329 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5331 /* string options */
5332 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5333 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5335 /* flag options */
5336 IX86_ATTR_YES ("cld",
5337 OPT_mcld,
5338 MASK_CLD),
5340 IX86_ATTR_NO ("fancy-math-387",
5341 OPT_mfancy_math_387,
5342 MASK_NO_FANCY_MATH_387),
5344 IX86_ATTR_YES ("ieee-fp",
5345 OPT_mieee_fp,
5346 MASK_IEEE_FP),
5348 IX86_ATTR_YES ("inline-all-stringops",
5349 OPT_minline_all_stringops,
5350 MASK_INLINE_ALL_STRINGOPS),
5352 IX86_ATTR_YES ("inline-stringops-dynamically",
5353 OPT_minline_stringops_dynamically,
5354 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5356 IX86_ATTR_NO ("align-stringops",
5357 OPT_mno_align_stringops,
5358 MASK_NO_ALIGN_STRINGOPS),
5360 IX86_ATTR_YES ("recip",
5361 OPT_mrecip,
5362 MASK_RECIP),
5366 /* If this is a list, recurse to get the options. */
5367 if (TREE_CODE (args) == TREE_LIST)
5369 bool ret = true;
5371 for (; args; args = TREE_CHAIN (args))
5372 if (TREE_VALUE (args)
5373 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5374 p_strings, opts, opts_set,
5375 enum_opts_set))
5376 ret = false;
5378 return ret;
5381 else if (TREE_CODE (args) != STRING_CST)
5383 error ("attribute %<target%> argument not a string");
5384 return false;
5387 /* Handle multiple arguments separated by commas. */
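/* For example, an attribute string such as "no-avx2,arch=haswell"
   (illustrative only) is split at each ',' below; a "no-" prefix clears
   the option, ISA names toggle the corresponding isa flags, and
   "arch="/"tune=" keep the text after the '=' as a string value.  */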
5388 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5390 while (next_optstr && *next_optstr != '\0')
5392 char *p = next_optstr;
5393 char *orig_p = p;
5394 char *comma = strchr (next_optstr, ',');
5395 const char *opt_string;
5396 size_t len, opt_len;
5397 int opt;
5398 bool opt_set_p;
5399 char ch;
5400 unsigned i;
5401 enum ix86_opt_type type = ix86_opt_unknown;
5402 int mask = 0;
5404 if (comma)
5406 *comma = '\0';
5407 len = comma - next_optstr;
5408 next_optstr = comma + 1;
5410 else
5412 len = strlen (p);
5413 next_optstr = NULL;
5416 /* Recognize no-xxx. */
5417 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5419 opt_set_p = false;
5420 p += 3;
5421 len -= 3;
5423 else
5424 opt_set_p = true;
5426 /* Find the option. */
5427 ch = *p;
5428 opt = N_OPTS;
5429 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5431 type = attrs[i].type;
5432 opt_len = attrs[i].len;
5433 if (ch == attrs[i].string[0]
5434 && ((type != ix86_opt_str && type != ix86_opt_enum)
5435 ? len == opt_len
5436 : len > opt_len)
5437 && memcmp (p, attrs[i].string, opt_len) == 0)
5439 opt = attrs[i].opt;
5440 mask = attrs[i].mask;
5441 opt_string = attrs[i].string;
5442 break;
5446 /* Process the option. */
5447 if (opt == N_OPTS)
5449 error ("attribute(target(\"%s\")) is unknown", orig_p);
5450 ret = false;
5453 else if (type == ix86_opt_isa)
5455 struct cl_decoded_option decoded;
5457 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5458 ix86_handle_option (opts, opts_set,
5459 &decoded, input_location);
5462 else if (type == ix86_opt_yes || type == ix86_opt_no)
5464 if (type == ix86_opt_no)
5465 opt_set_p = !opt_set_p;
5467 if (opt_set_p)
5468 opts->x_target_flags |= mask;
5469 else
5470 opts->x_target_flags &= ~mask;
5473 else if (type == ix86_opt_str)
5475 if (p_strings[opt])
5477 error ("option(\"%s\") was already specified", opt_string);
5478 ret = false;
5480 else
5481 p_strings[opt] = xstrdup (p + opt_len);
5484 else if (type == ix86_opt_enum)
5486 bool arg_ok;
5487 int value;
5489 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5490 if (arg_ok)
5491 set_option (opts, enum_opts_set, opt, value,
5492 p + opt_len, DK_UNSPECIFIED, input_location,
5493 global_dc);
5494 else
5496 error ("attribute(target(\"%s\")) is unknown", orig_p);
5497 ret = false;
5501 else
5502 gcc_unreachable ();
5505 return ret;
5508 /* Release allocated strings. */
5509 static void
5510 release_options_strings (char **option_strings)
5512 /* Free up memory allocated to hold the strings */
5513 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5514 free (option_strings[i]);
5517 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5519 tree
5520 ix86_valid_target_attribute_tree (tree args,
5521 struct gcc_options *opts,
5522 struct gcc_options *opts_set)
5524 const char *orig_arch_string = opts->x_ix86_arch_string;
5525 const char *orig_tune_string = opts->x_ix86_tune_string;
5526 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5527 int orig_tune_defaulted = ix86_tune_defaulted;
5528 int orig_arch_specified = ix86_arch_specified;
5529 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5530 tree t = NULL_TREE;
5531 struct cl_target_option *def
5532 = TREE_TARGET_OPTION (target_option_default_node);
5533 struct gcc_options enum_opts_set;
5535 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5537 /* Process each of the options on the chain. */
5538 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5539 opts_set, &enum_opts_set))
5540 return error_mark_node;
5542 /* If the changed options are different from the default, rerun
5543 ix86_option_override_internal, and then save the options away.
5544 The string options are attribute options, and will be undone
5545 when we copy the save structure. */
5546 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5547 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5548 || opts->x_target_flags != def->x_target_flags
5549 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5550 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5551 || enum_opts_set.x_ix86_fpmath)
5553 /* If we are using the default tune= or arch=, undo the string assigned,
5554 and use the default. */
5555 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5557 opts->x_ix86_arch_string
5558 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5560 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5561 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5562 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5563 | OPTION_MASK_ABI_64
5564 | OPTION_MASK_ABI_X32
5565 | OPTION_MASK_CODE16);
5566 opts->x_ix86_isa_flags2 = 0;
5568 else if (!orig_arch_specified)
5569 opts->x_ix86_arch_string = NULL;
5571 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5572 opts->x_ix86_tune_string
5573 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5574 else if (orig_tune_defaulted)
5575 opts->x_ix86_tune_string = NULL;
5577 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5578 if (enum_opts_set.x_ix86_fpmath)
5579 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5581 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5582 bool r = ix86_option_override_internal (false, opts, opts_set);
5583 if (!r)
5585 release_options_strings (option_strings);
5586 return error_mark_node;
5589 /* Add any builtin functions with the new isa if any. */
5590 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5592 /* Save the current options unless we are validating options for
5593 #pragma. */
5594 t = build_target_option_node (opts);
5596 opts->x_ix86_arch_string = orig_arch_string;
5597 opts->x_ix86_tune_string = orig_tune_string;
5598 opts_set->x_ix86_fpmath = orig_fpmath_set;
5600 release_options_strings (option_strings);
5603 return t;
5606 /* Hook to validate attribute((target("string"))). */
5608 static bool
5609 ix86_valid_target_attribute_p (tree fndecl,
5610 tree ARG_UNUSED (name),
5611 tree args,
5612 int ARG_UNUSED (flags))
5614 struct gcc_options func_options;
5615 tree new_target, new_optimize;
5616 bool ret = true;
5618 /* attribute((target("default"))) does nothing, beyond
5619 affecting multi-versioning. */
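/* Illustrative use with function multi-versioning (assumed syntax of the
   GCC extension, not taken from this file):
     __attribute__((target ("default"))) int foo (void);
     __attribute__((target ("avx2")))    int foo (void);
   The "default" version carries no extra target options, so we return
   early below.  */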
5620 if (TREE_VALUE (args)
5621 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5622 && TREE_CHAIN (args) == NULL_TREE
5623 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5624 return true;
5626 tree old_optimize = build_optimization_node (&global_options);
5628 /* Get the optimization options of the current function. */
5629 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5631 if (!func_optimize)
5632 func_optimize = old_optimize;
5634 /* Init func_options. */
5635 memset (&func_options, 0, sizeof (func_options));
5636 init_options_struct (&func_options, NULL);
5637 lang_hooks.init_options_struct (&func_options);
5639 cl_optimization_restore (&func_options,
5640 TREE_OPTIMIZATION (func_optimize));
5642 /* Initialize func_options to the default before its target options can
5643 be set. */
5644 cl_target_option_restore (&func_options,
5645 TREE_TARGET_OPTION (target_option_default_node));
5647 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5648 &global_options_set);
5650 new_optimize = build_optimization_node (&func_options);
5652 if (new_target == error_mark_node)
5653 ret = false;
5655 else if (fndecl && new_target)
5657 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5659 if (old_optimize != new_optimize)
5660 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5663 finalize_options_struct (&func_options);
5665 return ret;
5669 /* Hook to determine if one function can safely inline another. */
5671 static bool
5672 ix86_can_inline_p (tree caller, tree callee)
5674 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5675 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5676 if (!callee_tree)
5677 callee_tree = target_option_default_node;
5678 if (!caller_tree)
5679 caller_tree = target_option_default_node;
5680 if (callee_tree == caller_tree)
5681 return true;
5683 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5684 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5685 bool ret = false;
5687 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
5688 function can inline an SSE2 function, but an SSE2 function can't inline
5689 an SSE4 function. */
5690 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5691 != callee_opts->x_ix86_isa_flags)
5692 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5693 != callee_opts->x_ix86_isa_flags2))
5694 ret = false;
5696 /* See if we have the same non-isa options. */
5697 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5698 ret = false;
5700 /* See if arch, tune, etc. are the same. */
5701 else if (caller_opts->arch != callee_opts->arch)
5702 ret = false;
5704 else if (caller_opts->tune != callee_opts->tune)
5705 ret = false;
5707 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5708 /* If the callee doesn't use FP expressions, differences in
5709 ix86_fpmath can be ignored. We are called from FEs
5710 for multi-versioning call optimization, so beware of
5711 ipa_fn_summaries not available. */
5712 && (! ipa_fn_summaries
5713 || ipa_fn_summaries->get
5714 (cgraph_node::get (callee))->fp_expressions))
5715 ret = false;
5717 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5718 ret = false;
5720 else
5721 ret = true;
5723 return ret;
5727 /* Remember the last target of ix86_set_current_function. */
5728 static GTY(()) tree ix86_previous_fndecl;
5730 /* Set target globals to the default (or current #pragma GCC target
5731 if active). Invalidate the ix86_previous_fndecl cache. */
5733 void
5734 ix86_reset_previous_fndecl (void)
5736 tree new_tree = target_option_current_node;
5737 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5738 if (TREE_TARGET_GLOBALS (new_tree))
5739 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5740 else if (new_tree == target_option_default_node)
5741 restore_target_globals (&default_target_globals);
5742 else
5743 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5744 ix86_previous_fndecl = NULL_TREE;
5747 /* Set the func_type field from the function FNDECL. */
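/* A handler declared with two parameters, e.g. (illustrative prototype)
     void __attribute__((interrupt)) fault (struct interrupt_frame *f, uword_t error);
   is treated as TYPE_EXCEPTION (it receives a hardware error code), while a
   one-parameter handler is treated as TYPE_INTERRUPT; see nargs below.  */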
5749 static void
5750 ix86_set_func_type (tree fndecl)
5752 if (cfun->machine->func_type == TYPE_UNKNOWN)
5754 if (lookup_attribute ("interrupt",
5755 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5757 if (ix86_function_naked (fndecl))
5758 error_at (DECL_SOURCE_LOCATION (fndecl),
5759 "interrupt and naked attributes are not compatible");
5761 int nargs = 0;
5762 for (tree arg = DECL_ARGUMENTS (fndecl);
5763 arg;
5764 arg = TREE_CHAIN (arg))
5765 nargs++;
5766 cfun->machine->no_caller_saved_registers = true;
5767 cfun->machine->func_type
5768 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5770 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5772 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5773 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5774 sorry ("Only DWARF debug format is supported for interrupt "
5775 "service routine.");
5777 else
5779 cfun->machine->func_type = TYPE_NORMAL;
5780 if (lookup_attribute ("no_caller_saved_registers",
5781 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5782 cfun->machine->no_caller_saved_registers = true;
5787 /* Establish appropriate back-end context for processing the function
5788 FNDECL. The argument might be NULL to indicate processing at top
5789 level, outside of any function scope. */
5790 static void
5791 ix86_set_current_function (tree fndecl)
5793 /* Only change the context if the function changes. This hook is called
5794 several times in the course of compiling a function, and we don't want to
5795 slow things down too much or call target_reinit when it isn't safe. */
5796 if (fndecl == ix86_previous_fndecl)
5798 /* There may be 2 function bodies for the same function FNDECL,
5799 one is extern inline and one isn't. Call ix86_set_func_type
5800 to set the func_type field. */
5801 if (fndecl != NULL_TREE)
5802 ix86_set_func_type (fndecl);
5803 return;
5806 tree old_tree;
5807 if (ix86_previous_fndecl == NULL_TREE)
5808 old_tree = target_option_current_node;
5809 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5810 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5811 else
5812 old_tree = target_option_default_node;
5814 if (fndecl == NULL_TREE)
5816 if (old_tree != target_option_current_node)
5817 ix86_reset_previous_fndecl ();
5818 return;
5821 ix86_set_func_type (fndecl);
5823 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5824 if (new_tree == NULL_TREE)
5825 new_tree = target_option_default_node;
5827 if (old_tree != new_tree)
5829 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5830 if (TREE_TARGET_GLOBALS (new_tree))
5831 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5832 else if (new_tree == target_option_default_node)
5833 restore_target_globals (&default_target_globals);
5834 else
5835 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5837 ix86_previous_fndecl = fndecl;
5839 static bool prev_no_caller_saved_registers;
5841 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5842 Avoid expensive re-initialization of init_regs each time we switch
5843 function context. */
5844 if (TARGET_64BIT
5845 && (call_used_regs[SI_REG]
5846 == (cfun->machine->call_abi == MS_ABI)))
5847 reinit_regs ();
5848 /* Need to re-initialize init_regs if caller-saved registers are
5849 changed. */
5850 else if (prev_no_caller_saved_registers
5851 != cfun->machine->no_caller_saved_registers)
5852 reinit_regs ();
5854 if (cfun->machine->func_type != TYPE_NORMAL
5855 || cfun->machine->no_caller_saved_registers)
5857 /* Don't allow MPX, SSE, MMX or x87 instructions since they
5858 may change processor state. */
5859 const char *isa;
5860 if (TARGET_MPX)
5861 isa = "MPX";
5862 else if (TARGET_SSE)
5863 isa = "SSE";
5864 else if (TARGET_MMX)
5865 isa = "MMX/3Dnow";
5866 else if (TARGET_80387)
5867 isa = "80387";
5868 else
5869 isa = NULL;
5870 if (isa != NULL)
5872 if (cfun->machine->func_type != TYPE_NORMAL)
5873 sorry ("%s instructions aren't allowed in %s service routine",
5874 isa, (cfun->machine->func_type == TYPE_EXCEPTION
5875 ? "exception" : "interrupt"));
5876 else
5877 sorry ("%s instructions aren't allowed in function with "
5878 "no_caller_saved_registers attribute", isa);
5879 /* Don't issue the same error twice. */
5880 cfun->machine->func_type = TYPE_NORMAL;
5881 cfun->machine->no_caller_saved_registers = false;
5885 prev_no_caller_saved_registers
5886 = cfun->machine->no_caller_saved_registers;
5890 /* Return true if this goes in large data/bss. */
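/* Only the medium code models split objects between normal and "large"
   sections; objects bigger than ix86_section_threshold (the
   -mlarge-data-threshold value) end up in .ldata/.lbss, as checked
   below.  */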
5892 static bool
5893 ix86_in_large_data_p (tree exp)
5895 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
5896 return false;
5898 if (exp == NULL_TREE)
5899 return false;
5901 /* Functions are never large data. */
5902 if (TREE_CODE (exp) == FUNCTION_DECL)
5903 return false;
5905 /* Automatic variables are never large data. */
5906 if (VAR_P (exp) && !is_global_var (exp))
5907 return false;
5909 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
5911 const char *section = DECL_SECTION_NAME (exp);
5912 if (strcmp (section, ".ldata") == 0
5913 || strcmp (section, ".lbss") == 0)
5914 return true;
5915 return false;
5917 else
5919 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
5921 /* If this is an incomplete type with size 0, then we can't put it
5922 in data because it might be too big when completed. Also,
5923 int_size_in_bytes returns -1 if the size can vary or is larger than
5924 an integer, in which case it is also safer to assume that it goes in
5925 large data. */
5926 if (size <= 0 || size > ix86_section_threshold)
5927 return true;
5930 return false;
5933 /* i386-specific section flag to mark large sections. */
5934 #define SECTION_LARGE SECTION_MACH_DEP
5936 /* Switch to the appropriate section for output of DECL.
5937 DECL is either a `VAR_DECL' node or a constant of some sort.
5938 RELOC indicates whether forming the initial value of DECL requires
5939 link-time relocations. */
5941 ATTRIBUTE_UNUSED static section *
5942 x86_64_elf_select_section (tree decl, int reloc,
5943 unsigned HOST_WIDE_INT align)
5945 if (ix86_in_large_data_p (decl))
5947 const char *sname = NULL;
5948 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
5949 switch (categorize_decl_for_section (decl, reloc))
5951 case SECCAT_DATA:
5952 sname = ".ldata";
5953 break;
5954 case SECCAT_DATA_REL:
5955 sname = ".ldata.rel";
5956 break;
5957 case SECCAT_DATA_REL_LOCAL:
5958 sname = ".ldata.rel.local";
5959 break;
5960 case SECCAT_DATA_REL_RO:
5961 sname = ".ldata.rel.ro";
5962 break;
5963 case SECCAT_DATA_REL_RO_LOCAL:
5964 sname = ".ldata.rel.ro.local";
5965 break;
5966 case SECCAT_BSS:
5967 sname = ".lbss";
5968 flags |= SECTION_BSS;
5969 break;
5970 case SECCAT_RODATA:
5971 case SECCAT_RODATA_MERGE_STR:
5972 case SECCAT_RODATA_MERGE_STR_INIT:
5973 case SECCAT_RODATA_MERGE_CONST:
5974 sname = ".lrodata";
5975 flags &= ~SECTION_WRITE;
5976 break;
5977 case SECCAT_SRODATA:
5978 case SECCAT_SDATA:
5979 case SECCAT_SBSS:
5980 gcc_unreachable ();
5981 case SECCAT_TEXT:
5982 case SECCAT_TDATA:
5983 case SECCAT_TBSS:
5984 /* We don't split these for the medium model. Place them into
5985 default sections and hope for the best. */
5986 break;
5988 if (sname)
5990 /* We might get called with string constants, but get_named_section
5991 doesn't like them as they are not DECLs. Also, we need to set
5992 flags in that case. */
5993 if (!DECL_P (decl))
5994 return get_section (sname, flags, NULL);
5995 return get_named_section (decl, sname, reloc);
5998 return default_elf_select_section (decl, reloc, align);
6001 /* Select a set of attributes for section NAME based on the properties
6002 of DECL and whether or not RELOC indicates that DECL's initializer
6003 might contain runtime relocations. */
6005 static unsigned int ATTRIBUTE_UNUSED
6006 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6008 unsigned int flags = default_section_type_flags (decl, name, reloc);
6010 if (ix86_in_large_data_p (decl))
6011 flags |= SECTION_LARGE;
6013 if (decl == NULL_TREE
6014 && (strcmp (name, ".ldata.rel.ro") == 0
6015 || strcmp (name, ".ldata.rel.ro.local") == 0))
6016 flags |= SECTION_RELRO;
6018 if (strcmp (name, ".lbss") == 0
6019 || strncmp (name, ".lbss.", 5) == 0
6020 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
6021 flags |= SECTION_BSS;
6023 return flags;
6026 /* Build up a unique section name, expressed as a
6027 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6028 RELOC indicates whether the initial value of DECL requires
6029 link-time relocations. */
6031 static void ATTRIBUTE_UNUSED
6032 x86_64_elf_unique_section (tree decl, int reloc)
6034 if (ix86_in_large_data_p (decl))
6036 const char *prefix = NULL;
6037 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6038 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6040 switch (categorize_decl_for_section (decl, reloc))
6042 case SECCAT_DATA:
6043 case SECCAT_DATA_REL:
6044 case SECCAT_DATA_REL_LOCAL:
6045 case SECCAT_DATA_REL_RO:
6046 case SECCAT_DATA_REL_RO_LOCAL:
6047 prefix = one_only ? ".ld" : ".ldata";
6048 break;
6049 case SECCAT_BSS:
6050 prefix = one_only ? ".lb" : ".lbss";
6051 break;
6052 case SECCAT_RODATA:
6053 case SECCAT_RODATA_MERGE_STR:
6054 case SECCAT_RODATA_MERGE_STR_INIT:
6055 case SECCAT_RODATA_MERGE_CONST:
6056 prefix = one_only ? ".lr" : ".lrodata";
6057 break;
6058 case SECCAT_SRODATA:
6059 case SECCAT_SDATA:
6060 case SECCAT_SBSS:
6061 gcc_unreachable ();
6062 case SECCAT_TEXT:
6063 case SECCAT_TDATA:
6064 case SECCAT_TBSS:
6065 /* We don't split these for the medium model. Place them into
6066 default sections and hope for the best. */
6067 break;
6069 if (prefix)
6071 const char *name, *linkonce;
6072 char *string;
6074 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6075 name = targetm.strip_name_encoding (name);
6077 /* If we're using one_only, then there needs to be a .gnu.linkonce
6078 prefix to the section name. */
6079 linkonce = one_only ? ".gnu.linkonce" : "";
6081 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6083 set_decl_section_name (decl, string);
6084 return;
6087 default_unique_section (decl, reloc);
6090 #ifdef COMMON_ASM_OP
6092 #ifndef LARGECOMM_SECTION_ASM_OP
6093 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6094 #endif
6096 /* This says how to output assembler code to declare an
6097 uninitialized external linkage data object.
6099 For the x86-64 medium model we need to use the LARGECOMM_SECTION_ASM_OP
6100 directive for large objects. */
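/* The output has the form (schematic example)
	.largecomm	sym,SIZE,ALIGN
   or ".comm sym,SIZE,ALIGN" for small objects, with ALIGN in bytes.  */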
6101 void
6102 x86_elf_aligned_decl_common (FILE *file, tree decl,
6103 const char *name, unsigned HOST_WIDE_INT size,
6104 int align)
6106 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6107 && size > (unsigned int)ix86_section_threshold)
6109 switch_to_section (get_named_section (decl, ".lbss", 0));
6110 fputs (LARGECOMM_SECTION_ASM_OP, file);
6112 else
6113 fputs (COMMON_ASM_OP, file);
6114 assemble_name (file, name);
6115 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6116 size, align / BITS_PER_UNIT);
6118 #endif
6120 /* Utility function for targets to use in implementing
6121 ASM_OUTPUT_ALIGNED_BSS. */
6123 void
6124 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6125 unsigned HOST_WIDE_INT size, int align)
6127 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6128 && size > (unsigned int)ix86_section_threshold)
6129 switch_to_section (get_named_section (decl, ".lbss", 0));
6130 else
6131 switch_to_section (bss_section);
6132 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6133 #ifdef ASM_DECLARE_OBJECT_NAME
6134 last_assemble_variable_decl = decl;
6135 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6136 #else
6137 /* Standard thing is just output label for the object. */
6138 ASM_OUTPUT_LABEL (file, name);
6139 #endif /* ASM_DECLARE_OBJECT_NAME */
6140 ASM_OUTPUT_SKIP (file, size ? size : 1);
6143 /* Decide whether we must probe the stack before any space allocation
6144 on this target. It's essentially TARGET_STACK_PROBE except when
6145 -fstack-check causes the stack to be already probed differently. */
6147 bool
6148 ix86_target_stack_probe (void)
6150 /* Do not probe the stack twice if static stack checking is enabled. */
6151 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6152 return false;
6154 return TARGET_STACK_PROBE;
6157 /* Decide whether we can make a sibling call to a function. DECL is the
6158 declaration of the function being targeted by the call and EXP is the
6159 CALL_EXPR representing the call. */
6161 static bool
6162 ix86_function_ok_for_sibcall (tree decl, tree exp)
6164 tree type, decl_or_type;
6165 rtx a, b;
6166 bool bind_global = decl && !targetm.binds_local_p (decl);
6168 if (ix86_function_naked (current_function_decl))
6169 return false;
6171 /* Sibling call isn't OK if there are no caller-saved registers
6172 since all registers must be preserved before return. */
6173 if (cfun->machine->no_caller_saved_registers)
6174 return false;
6176 /* If we are generating position-independent code, we cannot sibcall
6177 optimize direct calls to global functions, as the PLT requires
6178 %ebx be live. (Darwin does not have a PLT.) */
6179 if (!TARGET_MACHO
6180 && !TARGET_64BIT
6181 && flag_pic
6182 && flag_plt
6183 && bind_global)
6184 return false;
6186 /* If we need to align the outgoing stack, then sibcalling would
6187 unalign the stack, which may break the called function. */
6188 if (ix86_minimum_incoming_stack_boundary (true)
6189 < PREFERRED_STACK_BOUNDARY)
6190 return false;
6192 if (decl)
6194 decl_or_type = decl;
6195 type = TREE_TYPE (decl);
6197 else
6199 /* We're looking at the CALL_EXPR, we need the type of the function. */
6200 type = CALL_EXPR_FN (exp); /* pointer expression */
6201 type = TREE_TYPE (type); /* pointer type */
6202 type = TREE_TYPE (type); /* function type */
6203 decl_or_type = type;
6206 /* Check that the return value locations are the same. Like
6207 if we are returning floats on the 80387 register stack, we cannot
6208 make a sibcall from a function that doesn't return a float to a
6209 function that does or, conversely, from a function that does return
6210 a float to a function that doesn't; the necessary stack adjustment
6211 would not be executed. This is also the place we notice
6212 differences in the return value ABI. Note that it is ok for one
6213 of the functions to have void return type as long as the return
6214 value of the other is passed in a register. */
6215 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6216 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6217 cfun->decl, false);
6218 if (STACK_REG_P (a) || STACK_REG_P (b))
6220 if (!rtx_equal_p (a, b))
6221 return false;
6223 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6225 else if (!rtx_equal_p (a, b))
6226 return false;
6228 if (TARGET_64BIT)
6230 /* The SYSV ABI has more call-clobbered registers;
6231 disallow sibcalls from MS to SYSV. */
6232 if (cfun->machine->call_abi == MS_ABI
6233 && ix86_function_type_abi (type) == SYSV_ABI)
6234 return false;
6236 else
6238 /* If this call is indirect, we'll need to be able to use a
6239 call-clobbered register for the address of the target function.
6240 Make sure that all such registers are not used for passing
6241 parameters. Note that DLLIMPORT functions and call to global
6242 function via GOT slot are indirect. */
6243 if (!decl
6244 || (bind_global && flag_pic && !flag_plt)
6245 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
6247 /* Check if regparm >= 3 since arg_reg_available is set to
6248 false if regparm == 0. If regparm is 1 or 2, there is
6249 always a call-clobbered register available.
6251 ??? The symbol indirect call doesn't need a call-clobbered
6252 register. But we don't know if this is a symbol indirect
6253 call or not here. */
6254 if (ix86_function_regparm (type, NULL) >= 3
6255 && !cfun->machine->arg_reg_available)
6256 return false;
6260 /* Otherwise okay. That also includes certain types of indirect calls. */
6261 return true;
6264 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6265 and "sseregparm" calling convention attributes;
6266 arguments as in struct attribute_spec.handler. */
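/* Typical uses (illustrative declarations, not from this file):
     int __attribute__((regparm (3))) f (int, int, int);
     int __attribute__((fastcall))    g (int, int);
   regparm combines with most of these attributes but not with fastcall
   or thiscall, as diagnosed below.  */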
6268 static tree
6269 ix86_handle_cconv_attribute (tree *node, tree name,
6270 tree args,
6271 int,
6272 bool *no_add_attrs)
6274 if (TREE_CODE (*node) != FUNCTION_TYPE
6275 && TREE_CODE (*node) != METHOD_TYPE
6276 && TREE_CODE (*node) != FIELD_DECL
6277 && TREE_CODE (*node) != TYPE_DECL)
6279 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6280 name);
6281 *no_add_attrs = true;
6282 return NULL_TREE;
6285 /* Can combine regparm with all attributes but fastcall and thiscall. */
6286 if (is_attribute_p ("regparm", name))
6288 tree cst;
6290 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6292 error ("fastcall and regparm attributes are not compatible");
6295 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6297 error ("regparam and thiscall attributes are not compatible");
6300 cst = TREE_VALUE (args);
6301 if (TREE_CODE (cst) != INTEGER_CST)
6303 warning (OPT_Wattributes,
6304 "%qE attribute requires an integer constant argument",
6305 name);
6306 *no_add_attrs = true;
6308 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6310 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6311 name, REGPARM_MAX);
6312 *no_add_attrs = true;
6315 return NULL_TREE;
6318 if (TARGET_64BIT)
6320 /* Do not warn when emulating the MS ABI. */
6321 if ((TREE_CODE (*node) != FUNCTION_TYPE
6322 && TREE_CODE (*node) != METHOD_TYPE)
6323 || ix86_function_type_abi (*node) != MS_ABI)
6324 warning (OPT_Wattributes, "%qE attribute ignored",
6325 name);
6326 *no_add_attrs = true;
6327 return NULL_TREE;
6330 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6331 if (is_attribute_p ("fastcall", name))
6333 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6335 error ("fastcall and cdecl attributes are not compatible");
6337 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6339 error ("fastcall and stdcall attributes are not compatible");
6341 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6343 error ("fastcall and regparm attributes are not compatible");
6345 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6347 error ("fastcall and thiscall attributes are not compatible");
6351 /* Can combine stdcall with fastcall (redundant), regparm and
6352 sseregparm. */
6353 else if (is_attribute_p ("stdcall", name))
6355 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6357 error ("stdcall and cdecl attributes are not compatible");
6359 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6361 error ("stdcall and fastcall attributes are not compatible");
6363 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6365 error ("stdcall and thiscall attributes are not compatible");
6369 /* Can combine cdecl with regparm and sseregparm. */
6370 else if (is_attribute_p ("cdecl", name))
6372 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6374 error ("stdcall and cdecl attributes are not compatible");
6376 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6378 error ("fastcall and cdecl attributes are not compatible");
6380 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6382 error ("cdecl and thiscall attributes are not compatible");
6385 else if (is_attribute_p ("thiscall", name))
6387 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6388 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6389 name);
6390 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6392 error ("stdcall and thiscall attributes are not compatible");
6394 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6396 error ("fastcall and thiscall attributes are not compatible");
6398 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6400 error ("cdecl and thiscall attributes are not compatible");
6404 /* Can combine sseregparm with all attributes. */
6406 return NULL_TREE;
6409 /* The transactional memory builtins are implicitly regparm or fastcall
6410 depending on the ABI. Override the generic do-nothing attribute that
6411 these builtins were declared with, and replace it with one of the two
6412 attributes that we expect elsewhere. */
6414 static tree
6415 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6416 int flags, bool *no_add_attrs)
6418 tree alt;
6420 /* In no case do we want to add the placeholder attribute. */
6421 *no_add_attrs = true;
6423 /* The 64-bit ABI is unchanged for transactional memory. */
6424 if (TARGET_64BIT)
6425 return NULL_TREE;
6427 /* ??? Is there a better way to validate 32-bit windows? We have
6428 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6429 if (CHECK_STACK_LIMIT > 0)
6430 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6431 else
6433 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6434 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6436 decl_attributes (node, alt, flags);
6438 return NULL_TREE;
6441 /* This function determines from TYPE the calling-convention. */
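/* The result is a bitmask of IX86_CALLCVT_* flags.  For example
   (illustrative), a 32-bit stdcall function yields IX86_CALLCVT_STDCALL,
   and -mrtd makes stdcall the default for non-stdarg functions, as
   handled below.  */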
6443 unsigned int
6444 ix86_get_callcvt (const_tree type)
6446 unsigned int ret = 0;
6447 bool is_stdarg;
6448 tree attrs;
6450 if (TARGET_64BIT)
6451 return IX86_CALLCVT_CDECL;
6453 attrs = TYPE_ATTRIBUTES (type);
6454 if (attrs != NULL_TREE)
6456 if (lookup_attribute ("cdecl", attrs))
6457 ret |= IX86_CALLCVT_CDECL;
6458 else if (lookup_attribute ("stdcall", attrs))
6459 ret |= IX86_CALLCVT_STDCALL;
6460 else if (lookup_attribute ("fastcall", attrs))
6461 ret |= IX86_CALLCVT_FASTCALL;
6462 else if (lookup_attribute ("thiscall", attrs))
6463 ret |= IX86_CALLCVT_THISCALL;
6465 /* Regparm isn't allowed for thiscall and fastcall. */
6466 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6468 if (lookup_attribute ("regparm", attrs))
6469 ret |= IX86_CALLCVT_REGPARM;
6470 if (lookup_attribute ("sseregparm", attrs))
6471 ret |= IX86_CALLCVT_SSEREGPARM;
6474 if (IX86_BASE_CALLCVT(ret) != 0)
6475 return ret;
6478 is_stdarg = stdarg_p (type);
6479 if (TARGET_RTD && !is_stdarg)
6480 return IX86_CALLCVT_STDCALL | ret;
6482 if (ret != 0
6483 || is_stdarg
6484 || TREE_CODE (type) != METHOD_TYPE
6485 || ix86_function_type_abi (type) != MS_ABI)
6486 return IX86_CALLCVT_CDECL | ret;
6488 return IX86_CALLCVT_THISCALL;
6491 /* Return 0 if the attributes for two types are incompatible, 1 if they
6492 are compatible, and 2 if they are nearly compatible (which causes a
6493 warning to be generated). */
6495 static int
6496 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6498 unsigned int ccvt1, ccvt2;
6500 if (TREE_CODE (type1) != FUNCTION_TYPE
6501 && TREE_CODE (type1) != METHOD_TYPE)
6502 return 1;
6504 ccvt1 = ix86_get_callcvt (type1);
6505 ccvt2 = ix86_get_callcvt (type2);
6506 if (ccvt1 != ccvt2)
6507 return 0;
6508 if (ix86_function_regparm (type1, NULL)
6509 != ix86_function_regparm (type2, NULL))
6510 return 0;
6512 return 1;
6515 /* Return the regparm value for a function with the indicated TYPE and DECL.
6516 DECL may be NULL when calling function indirectly
6517 or considering a libcall. */
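/* E.g. regparm (3) passes the first three integral arguments in
   %eax, %edx and %ecx (illustrative); fastcall implies 2 registers and
   thiscall 1, which is what the ccvt checks below return.  */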
6519 static int
6520 ix86_function_regparm (const_tree type, const_tree decl)
6522 tree attr;
6523 int regparm;
6524 unsigned int ccvt;
6526 if (TARGET_64BIT)
6527 return (ix86_function_type_abi (type) == SYSV_ABI
6528 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6529 ccvt = ix86_get_callcvt (type);
6530 regparm = ix86_regparm;
6532 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6534 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6535 if (attr)
6537 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6538 return regparm;
6541 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6542 return 2;
6543 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6544 return 1;
6546 /* Use register calling convention for local functions when possible. */
6547 if (decl
6548 && TREE_CODE (decl) == FUNCTION_DECL)
6550 cgraph_node *target = cgraph_node::get (decl);
6551 if (target)
6552 target = target->function_symbol ();
6554 /* Caller and callee must agree on the calling convention, so
6555 checking just "optimize" here would mean that with
6556 __attribute__((optimize (...))) the caller could use the regparm
6557 convention and the callee not, or vice versa. Instead look at
6558 whether the callee itself is optimized or not. */
6559 if (target && opt_for_fn (target->decl, optimize)
6560 && !(profile_flag && !flag_fentry))
6562 cgraph_local_info *i = &target->local;
6563 if (i && i->local && i->can_change_signature)
6565 int local_regparm, globals = 0, regno;
6567 /* Make sure no regparm register is taken by a
6568 fixed register variable. */
6569 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6570 local_regparm++)
6571 if (fixed_regs[local_regparm])
6572 break;
6574 /* We don't want to use regparm(3) for nested functions as
6575 these use a static chain pointer in the third argument. */
6576 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6577 local_regparm = 2;
6579 /* Save a register for the split stack. */
6580 if (flag_split_stack)
6582 if (local_regparm == 3)
6583 local_regparm = 2;
6584 else if (local_regparm == 2
6585 && DECL_STATIC_CHAIN (target->decl))
6586 local_regparm = 1;
6589 /* Each fixed register usage increases register pressure,
6590 so fewer registers should be used for argument passing.
6591 This functionality can be overridden by an explicit
6592 regparm value. */
6593 for (regno = AX_REG; regno <= DI_REG; regno++)
6594 if (fixed_regs[regno])
6595 globals++;
6597 local_regparm
6598 = globals < local_regparm ? local_regparm - globals : 0;
6600 if (local_regparm > regparm)
6601 regparm = local_regparm;
6606 return regparm;
6609 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6610 DFmode (2) arguments in SSE registers for a function with the
6611 indicated TYPE and DECL. DECL may be NULL when calling function
6612 indirectly or considering a libcall. Return -1 if any FP parameter
6613 should be rejected by error. This is used in situations where we imply the
6614 SSE calling convention but the function is called from another function with
6615 SSE disabled. Otherwise return 0. */
6617 static int
6618 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6620 gcc_assert (!TARGET_64BIT);
6622 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6623 by the sseregparm attribute. */
6624 if (TARGET_SSEREGPARM
6625 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6627 if (!TARGET_SSE)
6629 if (warn)
6631 if (decl)
6632 error ("calling %qD with attribute sseregparm without "
6633 "SSE/SSE2 enabled", decl);
6634 else
6635 error ("calling %qT with attribute sseregparm without "
6636 "SSE/SSE2 enabled", type);
6638 return 0;
6641 return 2;
6644 if (!decl)
6645 return 0;
6647 cgraph_node *target = cgraph_node::get (decl);
6648 if (target)
6649 target = target->function_symbol ();
6651 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6652 (and DFmode for SSE2) arguments in SSE registers. */
6653 if (target
6654 /* TARGET_SSE_MATH */
6655 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6656 && opt_for_fn (target->decl, optimize)
6657 && !(profile_flag && !flag_fentry))
6659 cgraph_local_info *i = &target->local;
6660 if (i && i->local && i->can_change_signature)
6662 /* Refuse to produce wrong code when a local function with SSE enabled
6663 is called from an SSE-disabled function.
6664 FIXME: We need a way to detect these cases across ltrans partitions
6665 and avoid using SSE calling conventions on local functions called
6666 from functions with SSE disabled. For now at least delay the
6667 warning until we know we are going to produce wrong code.
6668 See PR66047. */
6669 if (!TARGET_SSE && warn)
6670 return -1;
6671 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6672 ->x_ix86_isa_flags) ? 2 : 1;
6676 return 0;
6679 /* Return true if EAX is live at the start of the function. Used by
6680 ix86_expand_prologue to determine if we need special help before
6681 calling allocate_stack_worker. */
6683 static bool
6684 ix86_eax_live_at_start_p (void)
6686 /* Cheat. Don't bother working forward from ix86_function_regparm
6687 to the function type to whether an actual argument is located in
6688 eax. Instead just look at cfg info, which is still close enough
6689 to correct at this point. This gives false positives for broken
6690 functions that might use uninitialized data that happens to be
6691 allocated in eax, but who cares? */
6692 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6695 static bool
6696 ix86_keep_aggregate_return_pointer (tree fntype)
6698 tree attr;
6700 if (!TARGET_64BIT)
6702 attr = lookup_attribute ("callee_pop_aggregate_return",
6703 TYPE_ATTRIBUTES (fntype));
6704 if (attr)
6705 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6707 /* For 32-bit MS-ABI the default is to keep aggregate
6708 return pointer. */
6709 if (ix86_function_type_abi (fntype) == MS_ABI)
6710 return true;
6712 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6715 /* Value is the number of bytes of arguments automatically
6716 popped when returning from a subroutine call.
6717 FUNDECL is the declaration node of the function (as a tree),
6718 FUNTYPE is the data type of the function (as a tree),
6719 or for a library call it is an identifier node for the subroutine name.
6720 SIZE is the number of bytes of arguments passed on the stack.
6722 On the 80386, the RTD insn may be used to pop them if the number
6723 of args is fixed, but if the number is variable then the caller
6724 must pop them all. RTD can't be used for library calls now
6725 because the library is compiled with the Unix compiler.
6726 Use of RTD is a selectable option, since it is incompatible with
6727 standard Unix calling sequences. If the option is not selected,
6728 the caller must always pop the args.
6730 The attribute stdcall is equivalent to RTD on a per module basis. */
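/* E.g. a stdcall function taking two int arguments pops its 8 bytes of
   stack arguments itself, returning with "ret $8" (illustrative).  */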
6732 static int
6733 ix86_return_pops_args (tree fundecl, tree funtype, int size)
6735 unsigned int ccvt;
6737 /* None of the 64-bit ABIs pop arguments. */
6738 if (TARGET_64BIT)
6739 return 0;
6741 ccvt = ix86_get_callcvt (funtype);
6743 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6744 | IX86_CALLCVT_THISCALL)) != 0
6745 && ! stdarg_p (funtype))
6746 return size;
6748 /* Lose any fake structure return argument if it is passed on the stack. */
6749 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6750 && !ix86_keep_aggregate_return_pointer (funtype))
6752 int nregs = ix86_function_regparm (funtype, fundecl);
6753 if (nregs == 0)
6754 return GET_MODE_SIZE (Pmode);
6757 return 0;
6760 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6762 static bool
6763 ix86_legitimate_combined_insn (rtx_insn *insn)
6765 int i;
6767 /* Check operand constraints in case hard registers were propagated
6768 into insn pattern. This check prevents combine pass from
6769 generating insn patterns with invalid hard register operands.
6770 These invalid insns can eventually confuse reload to error out
6771 with a spill failure. See also PRs 46829 and 46843. */
6773 gcc_assert (INSN_CODE (insn) >= 0);
6775 extract_insn (insn);
6776 preprocess_constraints (insn);
6778 int n_operands = recog_data.n_operands;
6779 int n_alternatives = recog_data.n_alternatives;
6780 for (i = 0; i < n_operands; i++)
6782 rtx op = recog_data.operand[i];
6783 machine_mode mode = GET_MODE (op);
6784 const operand_alternative *op_alt;
6785 int offset = 0;
6786 bool win;
6787 int j;
6789 /* A unary operator may be accepted by the predicate, but it
6790 is irrelevant for matching constraints. */
6791 if (UNARY_P (op))
6792 op = XEXP (op, 0);
6794 if (SUBREG_P (op))
6796 if (REG_P (SUBREG_REG (op))
6797 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6798 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6799 GET_MODE (SUBREG_REG (op)),
6800 SUBREG_BYTE (op),
6801 GET_MODE (op));
6802 op = SUBREG_REG (op);
6805 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6806 continue;
6808 op_alt = recog_op_alt;
6810 /* Operand has no constraints, anything is OK. */
6811 win = !n_alternatives;
6813 alternative_mask preferred = get_preferred_alternatives (insn);
6814 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6816 if (!TEST_BIT (preferred, j))
6817 continue;
6818 if (op_alt[i].anything_ok
6819 || (op_alt[i].matches != -1
6820 && operands_match_p
6821 (recog_data.operand[i],
6822 recog_data.operand[op_alt[i].matches]))
6823 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6825 win = true;
6826 break;
6830 if (!win)
6831 return false;
6834 return true;
6837 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
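/* AddressSanitizer computes Shadow = (Addr >> 3) + Offset; the offsets
   returned below match libsanitizer's platform defaults (stated here as
   a reminder, not derived from this file).  */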
6839 static unsigned HOST_WIDE_INT
6840 ix86_asan_shadow_offset (void)
6842 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6843 : HOST_WIDE_INT_C (0x7fff8000))
6844 : (HOST_WIDE_INT_1 << 29);
6847 /* Argument support functions. */
6849 /* Return true when register may be used to pass function parameters. */
6850 bool
6851 ix86_function_arg_regno_p (int regno)
6853 int i;
6854 enum calling_abi call_abi;
6855 const int *parm_regs;
6857 if (TARGET_MPX && BND_REGNO_P (regno))
6858 return true;
6860 if (!TARGET_64BIT)
6862 if (TARGET_MACHO)
6863 return (regno < REGPARM_MAX
6864 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
6865 else
6866 return (regno < REGPARM_MAX
6867 || (TARGET_MMX && MMX_REGNO_P (regno)
6868 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
6869 || (TARGET_SSE && SSE_REGNO_P (regno)
6870 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
6873 if (TARGET_SSE && SSE_REGNO_P (regno)
6874 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
6875 return true;
6877 /* TODO: The function should depend on current function ABI but
6878 builtins.c would need updating then. Therefore we use the
6879 default ABI. */
6880 call_abi = ix86_cfun_abi ();
6882 /* RAX is used as hidden argument to va_arg functions. */
6883 if (call_abi == SYSV_ABI && regno == AX_REG)
6884 return true;
6886 if (call_abi == MS_ABI)
6887 parm_regs = x86_64_ms_abi_int_parameter_registers;
6888 else
6889 parm_regs = x86_64_int_parameter_registers;
6891 for (i = 0; i < (call_abi == MS_ABI
6892 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
6893 if (regno == parm_regs[i])
6894 return true;
6895 return false;
6898 /* Return if we do not know how to pass TYPE solely in registers. */
6900 static bool
6901 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
6903 if (must_pass_in_stack_var_size_or_pad (mode, type))
6904 return true;
6906 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
6907 The layout_type routine is crafty and tries to trick us into passing
6908 currently unsupported vector types on the stack by using TImode. */
6909 return (!TARGET_64BIT && mode == TImode
6910 && type && TREE_CODE (type) != VECTOR_TYPE);
6913 /* Return the size, in bytes, of the area reserved for arguments passed
6914 in registers for the function represented by FNDECL, which depends on the
6915 ABI used. */
6917 ix86_reg_parm_stack_space (const_tree fndecl)
6919 enum calling_abi call_abi = SYSV_ABI;
6920 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
6921 call_abi = ix86_function_abi (fndecl);
6922 else
6923 call_abi = ix86_function_type_abi (fndecl);
6924 if (TARGET_64BIT && call_abi == MS_ABI)
6925 return 32;
6926 return 0;
6929 /* We add this as a workaround in order to use libc_has_function
6930 hook in i386.md. */
6931 bool
6932 ix86_libc_has_function (enum function_class fn_class)
6934 return targetm.libc_has_function (fn_class);
6937 /* Returns value SYSV_ABI, MS_ABI dependent on fntype,
6938 specifying the call abi used. */
6939 enum calling_abi
6940 ix86_function_type_abi (const_tree fntype)
6942 enum calling_abi abi = ix86_abi;
6944 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
6945 return abi;
6947 if (abi == SYSV_ABI
6948 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
6950 static int warned;
6951 if (TARGET_X32 && !warned)
6953 error ("X32 does not support ms_abi attribute");
6954 warned = 1;
6957 abi = MS_ABI;
6959 else if (abi == MS_ABI
6960 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
6961 abi = SYSV_ABI;
6963 return abi;
6966 static enum calling_abi
6967 ix86_function_abi (const_tree fndecl)
6969 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
6972 /* Returns value SYSV_ABI, MS_ABI dependent on cfun,
6973 specifying the call abi used. */
6974 enum calling_abi
6975 ix86_cfun_abi (void)
6977 return cfun ? cfun->machine->call_abi : ix86_abi;
6980 static bool
6981 ix86_function_ms_hook_prologue (const_tree fn)
6983 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
6985 if (decl_function_context (fn) != NULL_TREE)
6986 error_at (DECL_SOURCE_LOCATION (fn),
6987 "ms_hook_prologue is not compatible with nested function");
6988 else
6989 return true;
6991 return false;
6994 static bool
6995 ix86_function_naked (const_tree fn)
6997 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
6998 return true;
7000 return false;
7003 /* Write the extra assembler code needed to declare a function properly. */
7005 void
7006 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7007 tree decl)
7009 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7011 if (is_ms_hook)
7013 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7014 unsigned int filler_cc = 0xcccccccc;
7016 for (i = 0; i < filler_count; i += 4)
7017 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7020 #ifdef SUBTARGET_ASM_UNWIND_INIT
7021 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7022 #endif
7024 ASM_OUTPUT_LABEL (asm_out_file, fname);
7026 /* Output magic byte marker, if hot-patch attribute is set. */
7027 if (is_ms_hook)
7029 if (TARGET_64BIT)
7031 /* leaq [%rsp + 0], %rsp */
7032 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7033 asm_out_file);
7035 else
7037 /* movl.s %edi, %edi
7038 push %ebp
7039 movl.s %esp, %ebp */
7040 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
7045 /* Implementation of call abi switching target hook. Specific to FNDECL
7046 the specific call register sets are set. See also
7047 ix86_conditional_register_usage for more details. */
7048 void
7049 ix86_call_abi_override (const_tree fndecl)
7051 cfun->machine->call_abi = ix86_function_abi (fndecl);
7054 /* Return 1 if pseudo register should be created and used to hold
7055 GOT address for PIC code. */
7056 bool
7057 ix86_use_pseudo_pic_reg (void)
7059 if ((TARGET_64BIT
7060 && (ix86_cmodel == CM_SMALL_PIC
7061 || TARGET_PECOFF))
7062 || !flag_pic)
7063 return false;
7064 return true;
7067 /* Initialize large model PIC register. */
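/* For -mcmodel=large with PIC this expands to, schematically,
     label:  lea     label(%rip), %pic
	     movabs  $_GLOBAL_OFFSET_TABLE_-label, %tmp
	     add     %tmp, %pic
   so the GOT base is formed without assuming it is within +/-2GB of the
   code (register names illustrative).  */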
7069 static void
7070 ix86_init_large_pic_reg (unsigned int tmp_regno)
7072 rtx_code_label *label;
7073 rtx tmp_reg;
7075 gcc_assert (Pmode == DImode);
7076 label = gen_label_rtx ();
7077 emit_label (label);
7078 LABEL_PRESERVE_P (label) = 1;
7079 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7080 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7081 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7082 label));
7083 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7084 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7085 pic_offset_table_rtx, tmp_reg));
7086 const char *name = LABEL_NAME (label);
7087 PUT_CODE (label, NOTE);
7088 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7089 NOTE_DELETED_LABEL_NAME (label) = name;
7092 /* Create and initialize PIC register if required. */
7093 static void
7094 ix86_init_pic_reg (void)
7096 edge entry_edge;
7097 rtx_insn *seq;
7099 if (!ix86_use_pseudo_pic_reg ())
7100 return;
7102 start_sequence ();
7104 if (TARGET_64BIT)
7106 if (ix86_cmodel == CM_LARGE_PIC)
7107 ix86_init_large_pic_reg (R11_REG);
7108 else
7109 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7111 else
7113 /* If there is a future mcount call in the function, it is more profitable
7114 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7115 rtx reg = crtl->profile
7116 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7117 : pic_offset_table_rtx;
7118 rtx_insn *insn = emit_insn (gen_set_got (reg));
7119 RTX_FRAME_RELATED_P (insn) = 1;
7120 if (crtl->profile)
7121 emit_move_insn (pic_offset_table_rtx, reg);
7122 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7125 seq = get_insns ();
7126 end_sequence ();
7128 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7129 insert_insn_on_edge (seq, entry_edge);
7130 commit_one_edge_insertion (entry_edge);
7133 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7134 for a call to a function whose data type is FNTYPE.
7135 For a library call, FNTYPE is 0. */
7137 void
7138 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7139 tree fntype, /* tree ptr for function decl */
7140 rtx libname, /* SYMBOL_REF of library name or 0 */
7141 tree fndecl,
7142 int caller)
7144 struct cgraph_local_info *i = NULL;
7145 struct cgraph_node *target = NULL;
7147 memset (cum, 0, sizeof (*cum));
7149 if (fndecl)
7151 target = cgraph_node::get (fndecl);
7152 if (target)
7154 target = target->function_symbol ();
7155 i = cgraph_node::local_info (target->decl);
7156 cum->call_abi = ix86_function_abi (target->decl);
7158 else
7159 cum->call_abi = ix86_function_abi (fndecl);
7161 else
7162 cum->call_abi = ix86_function_type_abi (fntype);
7164 cum->caller = caller;
7166 /* Set up the number of registers to use for passing arguments. */
7167 cum->nregs = ix86_regparm;
7168 if (TARGET_64BIT)
7170 cum->nregs = (cum->call_abi == SYSV_ABI
7171 ? X86_64_REGPARM_MAX
7172 : X86_64_MS_REGPARM_MAX);
7174 if (TARGET_SSE)
7176 cum->sse_nregs = SSE_REGPARM_MAX;
7177 if (TARGET_64BIT)
7179 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7180 ? X86_64_SSE_REGPARM_MAX
7181 : X86_64_MS_SSE_REGPARM_MAX);
7184 if (TARGET_MMX)
7185 cum->mmx_nregs = MMX_REGPARM_MAX;
7186 cum->warn_avx512f = true;
7187 cum->warn_avx = true;
7188 cum->warn_sse = true;
7189 cum->warn_mmx = true;
7191 /* Because the type might mismatch between caller and callee, we need to
7192 use the actual type of the function for local calls.
7193 FIXME: cgraph_analyze can be told to actually record if a function uses
7194 va_start, so for local functions maybe_vaarg can be made more aggressive,
7195 helping K&R code.
7196 FIXME: once the type system is fixed, we won't need this code anymore. */
7197 if (i && i->local && i->can_change_signature)
7198 fntype = TREE_TYPE (target->decl);
7199 cum->stdarg = stdarg_p (fntype);
7200 cum->maybe_vaarg = (fntype
7201 ? (!prototype_p (fntype) || stdarg_p (fntype))
7202 : !libname);
7204 cum->bnd_regno = FIRST_BND_REG;
7205 cum->bnds_in_bt = 0;
7206 cum->force_bnd_pass = 0;
7207 cum->decl = fndecl;
7209 cum->warn_empty = !warn_abi || cum->stdarg;
7210 if (!cum->warn_empty && fntype)
7212 function_args_iterator iter;
7213 tree argtype;
7214 bool seen_empty_type = false;
7215 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7217 if (VOID_TYPE_P (argtype))
7218 break;
7219 if (TYPE_EMPTY_P (argtype))
7220 seen_empty_type = true;
7221 else if (seen_empty_type)
7223 cum->warn_empty = true;
7224 break;
7229 if (!TARGET_64BIT)
7231 /* If there are variable arguments, then we won't pass anything
7232 in registers in 32-bit mode. */
7233 if (stdarg_p (fntype))
7235 cum->nregs = 0;
7236 /* Since in 32-bit, variable arguments are always passed on
7237 stack, there is scratch register available for indirect
7238 sibcall. */
7239 cfun->machine->arg_reg_available = true;
7240 cum->sse_nregs = 0;
7241 cum->mmx_nregs = 0;
7242 cum->warn_avx512f = false;
7243 cum->warn_avx = false;
7244 cum->warn_sse = false;
7245 cum->warn_mmx = false;
7246 return;
7249 /* Use ecx and edx registers if function has fastcall attribute,
7250 else look for regparm information. */
7251 if (fntype)
7253 unsigned int ccvt = ix86_get_callcvt (fntype);
7254 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7256 cum->nregs = 1;
7257 cum->fastcall = 1; /* Same first register as in fastcall. */
7259 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7261 cum->nregs = 2;
7262 cum->fastcall = 1;
7264 else
7265 cum->nregs = ix86_function_regparm (fntype, fndecl);
7268 /* Set up the number of SSE registers used for passing SFmode
7269 and DFmode arguments. Warn for mismatching ABI. */
7270 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7273 cfun->machine->arg_reg_available = (cum->nregs > 0);
7276 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7277 But in the case of vector types, it is some vector mode.
7279 When we have only some of our vector isa extensions enabled, then there
7280 are some modes for which vector_mode_supported_p is false. For these
7281 modes, the generic vector support in gcc will choose some non-vector mode
7282 in order to implement the type. By computing the natural mode, we'll
7283 select the proper ABI location for the operand and not depend on whatever
7284 the middle-end decides to do with these vector types.
7286 The middle-end can't deal with vector types > 16 bytes. In this
7287 case, we return the original mode and warn about the ABI change if CUM isn't
7288 NULL.
7290 If IN_RETURN is true, warn about the ABI change if the vector mode isn't
7291 available for function return value. */
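/* Illustrative sketch (not part of the original sources): with
     typedef int v8si __attribute__ ((vector_size (32)));
     v8si f (v8si x) { return x; }
   compiled without AVX enabled, both the argument and the return value
   hit the -Wpsabi warnings emitted below, because the 32-byte vector can
   no longer be passed or returned in a YMM register.  */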
7293 static machine_mode
7294 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7295 bool in_return)
7297 machine_mode mode = TYPE_MODE (type);
7299 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7301 HOST_WIDE_INT size = int_size_in_bytes (type);
7302 if ((size == 8 || size == 16 || size == 32 || size == 64)
7303 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7304 && TYPE_VECTOR_SUBPARTS (type) > 1)
7306 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7308 /* There are no XFmode vector modes. */
7309 if (innermode == XFmode)
7310 return mode;
7312 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7313 mode = MIN_MODE_VECTOR_FLOAT;
7314 else
7315 mode = MIN_MODE_VECTOR_INT;
7317 /* Get the mode which has this inner mode and number of units. */
7318 FOR_EACH_MODE_FROM (mode, mode)
7319 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7320 && GET_MODE_INNER (mode) == innermode)
7322 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7324 static bool warnedavx512f;
7325 static bool warnedavx512f_ret;
7327 if (cum && cum->warn_avx512f && !warnedavx512f)
7329 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7330 "without AVX512F enabled changes the ABI"))
7331 warnedavx512f = true;
7333 else if (in_return && !warnedavx512f_ret)
7335 if (warning (OPT_Wpsabi, "AVX512F vector return "
7336 "without AVX512F enabled changes the ABI"))
7337 warnedavx512f_ret = true;
7340 return TYPE_MODE (type);
7342 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7344 static bool warnedavx;
7345 static bool warnedavx_ret;
7347 if (cum && cum->warn_avx && !warnedavx)
7349 if (warning (OPT_Wpsabi, "AVX vector argument "
7350 "without AVX enabled changes the ABI"))
7351 warnedavx = true;
7353 else if (in_return && !warnedavx_ret)
7355 if (warning (OPT_Wpsabi, "AVX vector return "
7356 "without AVX enabled changes the ABI"))
7357 warnedavx_ret = true;
7360 return TYPE_MODE (type);
7362 else if (((size == 8 && TARGET_64BIT) || size == 16)
7363 && !TARGET_SSE
7364 && !TARGET_IAMCU)
7366 static bool warnedsse;
7367 static bool warnedsse_ret;
7369 if (cum && cum->warn_sse && !warnedsse)
7371 if (warning (OPT_Wpsabi, "SSE vector argument "
7372 "without SSE enabled changes the ABI"))
7373 warnedsse = true;
7375 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7377 if (warning (OPT_Wpsabi, "SSE vector return "
7378 "without SSE enabled changes the ABI"))
7379 warnedsse_ret = true;
7382 else if ((size == 8 && !TARGET_64BIT)
7383 && (!cfun
7384 || cfun->machine->func_type == TYPE_NORMAL)
7385 && !TARGET_MMX
7386 && !TARGET_IAMCU)
7388 static bool warnedmmx;
7389 static bool warnedmmx_ret;
7391 if (cum && cum->warn_mmx && !warnedmmx)
7393 if (warning (OPT_Wpsabi, "MMX vector argument "
7394 "without MMX enabled changes the ABI"))
7395 warnedmmx = true;
7397 else if (in_return && !warnedmmx_ret)
7399 if (warning (OPT_Wpsabi, "MMX vector return "
7400 "without MMX enabled changes the ABI"))
7401 warnedmmx_ret = true;
7404 return mode;
7407 gcc_unreachable ();
7411 return mode;
7414 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7415 this may not agree with the mode that the type system has chosen for the
7416 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7417 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7419 static rtx
7420 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7421 unsigned int regno)
7423 rtx tmp;
7425 if (orig_mode != BLKmode)
7426 tmp = gen_rtx_REG (orig_mode, regno);
7427 else
7429 tmp = gen_rtx_REG (mode, regno);
7430 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7431 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7434 return tmp;
7437 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
7438 of this code is to classify each 8bytes of incoming argument by the register
7439 class and assign registers accordingly. */
7441 /* Return the union class of CLASS1 and CLASS2.
7442 See the x86-64 PS ABI for details. */
7444 static enum x86_64_reg_class
7445 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7447 /* Rule #1: If both classes are equal, this is the resulting class. */
7448 if (class1 == class2)
7449 return class1;
7451 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7452 the other class. */
7453 if (class1 == X86_64_NO_CLASS)
7454 return class2;
7455 if (class2 == X86_64_NO_CLASS)
7456 return class1;
7458 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7459 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7460 return X86_64_MEMORY_CLASS;
7462 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7463 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7464 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7465 return X86_64_INTEGERSI_CLASS;
7466 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7467 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7468 return X86_64_INTEGER_CLASS;
7470 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7471 MEMORY is used. */
7472 if (class1 == X86_64_X87_CLASS
7473 || class1 == X86_64_X87UP_CLASS
7474 || class1 == X86_64_COMPLEX_X87_CLASS
7475 || class2 == X86_64_X87_CLASS
7476 || class2 == X86_64_X87UP_CLASS
7477 || class2 == X86_64_COMPLEX_X87_CLASS)
7478 return X86_64_MEMORY_CLASS;
7480 /* Rule #6: Otherwise class SSE is used. */
7481 return X86_64_SSE_CLASS;
7484 /* Classify the argument of type TYPE and mode MODE.
7485 CLASSES will be filled by the register class used to pass each word
7486 of the operand. The number of words is returned. In case the parameter
7487 should be passed in memory, 0 is returned. As a special case for zero
7488 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7490 BIT_OFFSET is used internally for handling records and specifies the
7491 offset of the field in bits modulo 512 to avoid overflow cases.
7493 See the x86-64 PS ABI for details.
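/* A worked example (illustrative only): for
     struct s { int i; int j; double d; };
   which is 16 bytes, the first eightbyte (I and J) is classified
   X86_64_INTEGER_CLASS and the second (D) X86_64_SSEDF_CLASS, so the
   function returns 2 and the struct travels in one general-purpose and
   one SSE register.  */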
7496 static int
7497 classify_argument (machine_mode mode, const_tree type,
7498 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7500 HOST_WIDE_INT bytes =
7501 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7502 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7504 /* Variable sized entities are always passed/returned in memory. */
7505 if (bytes < 0)
7506 return 0;
7508 if (mode != VOIDmode
7509 && targetm.calls.must_pass_in_stack (mode, type))
7510 return 0;
7512 if (type && AGGREGATE_TYPE_P (type))
7514 int i;
7515 tree field;
7516 enum x86_64_reg_class subclasses[MAX_CLASSES];
7518 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7519 if (bytes > 64)
7520 return 0;
7522 for (i = 0; i < words; i++)
7523 classes[i] = X86_64_NO_CLASS;
7525 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
7526 signal the memory class, so handle it as a special case. */
7527 if (!words)
7529 classes[0] = X86_64_NO_CLASS;
7530 return 1;
7533 /* Classify each field of record and merge classes. */
7534 switch (TREE_CODE (type))
7536 case RECORD_TYPE:
7537 /* And now merge the fields of structure. */
7538 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7540 if (TREE_CODE (field) == FIELD_DECL)
7542 int num;
7544 if (TREE_TYPE (field) == error_mark_node)
7545 continue;
7547 /* Bitfields are always classified as integer. Handle them
7548 early, since later code would consider them to be
7549 misaligned integers. */
7550 if (DECL_BIT_FIELD (field))
7552 for (i = (int_bit_position (field)
7553 + (bit_offset % 64)) / 8 / 8;
7554 i < ((int_bit_position (field) + (bit_offset % 64))
7555 + tree_to_shwi (DECL_SIZE (field))
7556 + 63) / 8 / 8; i++)
7557 classes[i] =
7558 merge_classes (X86_64_INTEGER_CLASS,
7559 classes[i]);
7561 else
7563 int pos;
7565 type = TREE_TYPE (field);
7567 /* Flexible array member is ignored. */
7568 if (TYPE_MODE (type) == BLKmode
7569 && TREE_CODE (type) == ARRAY_TYPE
7570 && TYPE_SIZE (type) == NULL_TREE
7571 && TYPE_DOMAIN (type) != NULL_TREE
7572 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7573 == NULL_TREE))
7575 static bool warned;
7577 if (!warned && warn_psabi)
7579 warned = true;
7580 inform (input_location,
7581 "the ABI of passing struct with"
7582 " a flexible array member has"
7583 " changed in GCC 4.4");
7585 continue;
7587 num = classify_argument (TYPE_MODE (type), type,
7588 subclasses,
7589 (int_bit_position (field)
7590 + bit_offset) % 512);
7591 if (!num)
7592 return 0;
7593 pos = (int_bit_position (field)
7594 + (bit_offset % 64)) / 8 / 8;
7595 for (i = 0; i < num && (i + pos) < words; i++)
7596 classes[i + pos] =
7597 merge_classes (subclasses[i], classes[i + pos]);
7601 break;
7603 case ARRAY_TYPE:
7604 /* Arrays are handled as small records. */
7606 int num;
7607 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7608 TREE_TYPE (type), subclasses, bit_offset);
7609 if (!num)
7610 return 0;
7612 /* The partial classes are now full classes. */
7613 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7614 subclasses[0] = X86_64_SSE_CLASS;
7615 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7616 && !((bit_offset % 64) == 0 && bytes == 4))
7617 subclasses[0] = X86_64_INTEGER_CLASS;
7619 for (i = 0; i < words; i++)
7620 classes[i] = subclasses[i % num];
7622 break;
7624 case UNION_TYPE:
7625 case QUAL_UNION_TYPE:
7626 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
7628 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7630 if (TREE_CODE (field) == FIELD_DECL)
7632 int num;
7634 if (TREE_TYPE (field) == error_mark_node)
7635 continue;
7637 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7638 TREE_TYPE (field), subclasses,
7639 bit_offset);
7640 if (!num)
7641 return 0;
7642 for (i = 0; i < num && i < words; i++)
7643 classes[i] = merge_classes (subclasses[i], classes[i]);
7646 break;
7648 default:
7649 gcc_unreachable ();
7652 if (words > 2)
7654 /* When size > 16 bytes, if the first eightbyte isn't
7655 X86_64_SSE_CLASS or any of the others isn't
7656 X86_64_SSEUP_CLASS, everything should be passed in
7657 memory. */
7658 if (classes[0] != X86_64_SSE_CLASS)
7659 return 0;
7661 for (i = 1; i < words; i++)
7662 if (classes[i] != X86_64_SSEUP_CLASS)
7663 return 0;
7666 /* Final merger cleanup. */
7667 for (i = 0; i < words; i++)
7669 /* If one class is MEMORY, everything should be passed in
7670 memory. */
7671 if (classes[i] == X86_64_MEMORY_CLASS)
7672 return 0;
7674 /* The X86_64_SSEUP_CLASS should always be preceded by
7675 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7676 if (classes[i] == X86_64_SSEUP_CLASS
7677 && classes[i - 1] != X86_64_SSE_CLASS
7678 && classes[i - 1] != X86_64_SSEUP_CLASS)
7680 /* The first one should never be X86_64_SSEUP_CLASS. */
7681 gcc_assert (i != 0);
7682 classes[i] = X86_64_SSE_CLASS;
7685 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7686 everything should be passed in memory. */
7687 if (classes[i] == X86_64_X87UP_CLASS
7688 && (classes[i - 1] != X86_64_X87_CLASS))
7690 static bool warned;
7692 /* The first one should never be X86_64_X87UP_CLASS. */
7693 gcc_assert (i != 0);
7694 if (!warned && warn_psabi)
7696 warned = true;
7697 inform (input_location,
7698 "the ABI of passing union with long double"
7699 " has changed in GCC 4.4");
7701 return 0;
7704 return words;
7707 /* Compute the alignment needed. We align all types to their natural boundaries,
7708 with the exception of XFmode, which is aligned to 64 bits. */
7709 if (mode != VOIDmode && mode != BLKmode)
7711 int mode_alignment = GET_MODE_BITSIZE (mode);
7713 if (mode == XFmode)
7714 mode_alignment = 128;
7715 else if (mode == XCmode)
7716 mode_alignment = 256;
7717 if (COMPLEX_MODE_P (mode))
7718 mode_alignment /= 2;
7719 /* Misaligned fields are always returned in memory. */
7720 if (bit_offset % mode_alignment)
7721 return 0;
7724 /* For V1xx modes, just use the base mode. */
7725 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7726 && GET_MODE_UNIT_SIZE (mode) == bytes)
7727 mode = GET_MODE_INNER (mode);
7729 /* Classification of atomic types. */
7730 switch (mode)
7732 case E_SDmode:
7733 case E_DDmode:
7734 classes[0] = X86_64_SSE_CLASS;
7735 return 1;
7736 case E_TDmode:
7737 classes[0] = X86_64_SSE_CLASS;
7738 classes[1] = X86_64_SSEUP_CLASS;
7739 return 2;
7740 case E_DImode:
7741 case E_SImode:
7742 case E_HImode:
7743 case E_QImode:
7744 case E_CSImode:
7745 case E_CHImode:
7746 case E_CQImode:
7748 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7750 /* Analyze last 128 bits only. */
7751 size = (size - 1) & 0x7f;
7753 if (size < 32)
7755 classes[0] = X86_64_INTEGERSI_CLASS;
7756 return 1;
7758 else if (size < 64)
7760 classes[0] = X86_64_INTEGER_CLASS;
7761 return 1;
7763 else if (size < 64+32)
7765 classes[0] = X86_64_INTEGER_CLASS;
7766 classes[1] = X86_64_INTEGERSI_CLASS;
7767 return 2;
7769 else if (size < 64+64)
7771 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7772 return 2;
7774 else
7775 gcc_unreachable ();
7777 case E_CDImode:
7778 case E_TImode:
7779 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7780 return 2;
7781 case E_COImode:
7782 case E_OImode:
7783 /* OImode shouldn't be used directly. */
7784 gcc_unreachable ();
7785 case E_CTImode:
7786 return 0;
7787 case E_SFmode:
7788 if (!(bit_offset % 64))
7789 classes[0] = X86_64_SSESF_CLASS;
7790 else
7791 classes[0] = X86_64_SSE_CLASS;
7792 return 1;
7793 case E_DFmode:
7794 classes[0] = X86_64_SSEDF_CLASS;
7795 return 1;
7796 case E_XFmode:
7797 classes[0] = X86_64_X87_CLASS;
7798 classes[1] = X86_64_X87UP_CLASS;
7799 return 2;
7800 case E_TFmode:
7801 classes[0] = X86_64_SSE_CLASS;
7802 classes[1] = X86_64_SSEUP_CLASS;
7803 return 2;
7804 case E_SCmode:
7805 classes[0] = X86_64_SSE_CLASS;
7806 if (!(bit_offset % 64))
7807 return 1;
7808 else
7810 static bool warned;
7812 if (!warned && warn_psabi)
7814 warned = true;
7815 inform (input_location,
7816 "the ABI of passing structure with complex float"
7817 " member has changed in GCC 4.4");
7819 classes[1] = X86_64_SSESF_CLASS;
7820 return 2;
7822 case E_DCmode:
7823 classes[0] = X86_64_SSEDF_CLASS;
7824 classes[1] = X86_64_SSEDF_CLASS;
7825 return 2;
7826 case E_XCmode:
7827 classes[0] = X86_64_COMPLEX_X87_CLASS;
7828 return 1;
7829 case E_TCmode:
7830 /* This mode is larger than 16 bytes. */
7831 return 0;
7832 case E_V8SFmode:
7833 case E_V8SImode:
7834 case E_V32QImode:
7835 case E_V16HImode:
7836 case E_V4DFmode:
7837 case E_V4DImode:
7838 classes[0] = X86_64_SSE_CLASS;
7839 classes[1] = X86_64_SSEUP_CLASS;
7840 classes[2] = X86_64_SSEUP_CLASS;
7841 classes[3] = X86_64_SSEUP_CLASS;
7842 return 4;
7843 case E_V8DFmode:
7844 case E_V16SFmode:
7845 case E_V8DImode:
7846 case E_V16SImode:
7847 case E_V32HImode:
7848 case E_V64QImode:
7849 classes[0] = X86_64_SSE_CLASS;
7850 classes[1] = X86_64_SSEUP_CLASS;
7851 classes[2] = X86_64_SSEUP_CLASS;
7852 classes[3] = X86_64_SSEUP_CLASS;
7853 classes[4] = X86_64_SSEUP_CLASS;
7854 classes[5] = X86_64_SSEUP_CLASS;
7855 classes[6] = X86_64_SSEUP_CLASS;
7856 classes[7] = X86_64_SSEUP_CLASS;
7857 return 8;
7858 case E_V4SFmode:
7859 case E_V4SImode:
7860 case E_V16QImode:
7861 case E_V8HImode:
7862 case E_V2DFmode:
7863 case E_V2DImode:
7864 classes[0] = X86_64_SSE_CLASS;
7865 classes[1] = X86_64_SSEUP_CLASS;
7866 return 2;
7867 case E_V1TImode:
7868 case E_V1DImode:
7869 case E_V2SFmode:
7870 case E_V2SImode:
7871 case E_V4HImode:
7872 case E_V8QImode:
7873 classes[0] = X86_64_SSE_CLASS;
7874 return 1;
7875 case E_BLKmode:
7876 case E_VOIDmode:
7877 return 0;
7878 default:
7879 gcc_assert (VECTOR_MODE_P (mode));
7881 if (bytes > 16)
7882 return 0;
7884 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
7886 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
7887 classes[0] = X86_64_INTEGERSI_CLASS;
7888 else
7889 classes[0] = X86_64_INTEGER_CLASS;
7890 classes[1] = X86_64_INTEGER_CLASS;
7891 return 1 + (bytes > 8);
7895 /* Examine the argument and set the number of registers required in each
7896 class. Return true iff the parameter should be passed in memory. */
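/* E.g. (illustrative): for __int128 this sets *INT_NREGS to 2 and
   *SSE_NREGS to 0; for an argument classified as memory it returns true
   and the value goes on the stack.  */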
7898 static bool
7899 examine_argument (machine_mode mode, const_tree type, int in_return,
7900 int *int_nregs, int *sse_nregs)
7902 enum x86_64_reg_class regclass[MAX_CLASSES];
7903 int n = classify_argument (mode, type, regclass, 0);
7905 *int_nregs = 0;
7906 *sse_nregs = 0;
7908 if (!n)
7909 return true;
7910 for (n--; n >= 0; n--)
7911 switch (regclass[n])
7913 case X86_64_INTEGER_CLASS:
7914 case X86_64_INTEGERSI_CLASS:
7915 (*int_nregs)++;
7916 break;
7917 case X86_64_SSE_CLASS:
7918 case X86_64_SSESF_CLASS:
7919 case X86_64_SSEDF_CLASS:
7920 (*sse_nregs)++;
7921 break;
7922 case X86_64_NO_CLASS:
7923 case X86_64_SSEUP_CLASS:
7924 break;
7925 case X86_64_X87_CLASS:
7926 case X86_64_X87UP_CLASS:
7927 case X86_64_COMPLEX_X87_CLASS:
7928 if (!in_return)
7929 return true;
7930 break;
7931 case X86_64_MEMORY_CLASS:
7932 gcc_unreachable ();
7935 return false;
7938 /* Construct container for the argument used by GCC interface. See
7939 FUNCTION_ARG for the detailed description. */
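/* A sketch of the result (illustrative, not normative): a type classified
   as { X86_64_INTEGER_CLASS, X86_64_SSEDF_CLASS } yields a PARALLEL
   holding a DImode general register at byte offset 0 and a DFmode SSE
   register at byte offset 8, while an __m128 value, classified
   { SSE, SSEUP }, collapses to a single XMM REG via gen_reg_or_parallel.  */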
7941 static rtx
7942 construct_container (machine_mode mode, machine_mode orig_mode,
7943 const_tree type, int in_return, int nintregs, int nsseregs,
7944 const int *intreg, int sse_regno)
7946 /* The following variables hold the static issued_error state. */
7947 static bool issued_sse_arg_error;
7948 static bool issued_sse_ret_error;
7949 static bool issued_x87_ret_error;
7951 machine_mode tmpmode;
7952 int bytes =
7953 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7954 enum x86_64_reg_class regclass[MAX_CLASSES];
7955 int n;
7956 int i;
7957 int nexps = 0;
7958 int needed_sseregs, needed_intregs;
7959 rtx exp[MAX_CLASSES];
7960 rtx ret;
7962 n = classify_argument (mode, type, regclass, 0);
7963 if (!n)
7964 return NULL;
7965 if (examine_argument (mode, type, in_return, &needed_intregs,
7966 &needed_sseregs))
7967 return NULL;
7968 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
7969 return NULL;
7971 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
7972 some less clueful developer tries to use floating-point anyway. */
7973 if (needed_sseregs && !TARGET_SSE)
7975 if (in_return)
7977 if (!issued_sse_ret_error)
7979 error ("SSE register return with SSE disabled");
7980 issued_sse_ret_error = true;
7983 else if (!issued_sse_arg_error)
7985 error ("SSE register argument with SSE disabled");
7986 issued_sse_arg_error = true;
7988 return NULL;
7991 /* Likewise, error if the ABI requires us to return values in the
7992 x87 registers and the user specified -mno-80387. */
7993 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
7994 for (i = 0; i < n; i++)
7995 if (regclass[i] == X86_64_X87_CLASS
7996 || regclass[i] == X86_64_X87UP_CLASS
7997 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
7999 if (!issued_x87_ret_error)
8001 error ("x87 register return with x87 disabled");
8002 issued_x87_ret_error = true;
8004 return NULL;
8007 /* First construct simple cases. Avoid SCmode, since we want to use
8008 a single register to pass this type. */
8009 if (n == 1 && mode != SCmode)
8010 switch (regclass[0])
8012 case X86_64_INTEGER_CLASS:
8013 case X86_64_INTEGERSI_CLASS:
8014 return gen_rtx_REG (mode, intreg[0]);
8015 case X86_64_SSE_CLASS:
8016 case X86_64_SSESF_CLASS:
8017 case X86_64_SSEDF_CLASS:
8018 if (mode != BLKmode)
8019 return gen_reg_or_parallel (mode, orig_mode,
8020 SSE_REGNO (sse_regno));
8021 break;
8022 case X86_64_X87_CLASS:
8023 case X86_64_COMPLEX_X87_CLASS:
8024 return gen_rtx_REG (mode, FIRST_STACK_REG);
8025 case X86_64_NO_CLASS:
8026 /* Zero sized array, struct or class. */
8027 return NULL;
8028 default:
8029 gcc_unreachable ();
8031 if (n == 2
8032 && regclass[0] == X86_64_SSE_CLASS
8033 && regclass[1] == X86_64_SSEUP_CLASS
8034 && mode != BLKmode)
8035 return gen_reg_or_parallel (mode, orig_mode,
8036 SSE_REGNO (sse_regno));
8037 if (n == 4
8038 && regclass[0] == X86_64_SSE_CLASS
8039 && regclass[1] == X86_64_SSEUP_CLASS
8040 && regclass[2] == X86_64_SSEUP_CLASS
8041 && regclass[3] == X86_64_SSEUP_CLASS
8042 && mode != BLKmode)
8043 return gen_reg_or_parallel (mode, orig_mode,
8044 SSE_REGNO (sse_regno));
8045 if (n == 8
8046 && regclass[0] == X86_64_SSE_CLASS
8047 && regclass[1] == X86_64_SSEUP_CLASS
8048 && regclass[2] == X86_64_SSEUP_CLASS
8049 && regclass[3] == X86_64_SSEUP_CLASS
8050 && regclass[4] == X86_64_SSEUP_CLASS
8051 && regclass[5] == X86_64_SSEUP_CLASS
8052 && regclass[6] == X86_64_SSEUP_CLASS
8053 && regclass[7] == X86_64_SSEUP_CLASS
8054 && mode != BLKmode)
8055 return gen_reg_or_parallel (mode, orig_mode,
8056 SSE_REGNO (sse_regno));
8057 if (n == 2
8058 && regclass[0] == X86_64_X87_CLASS
8059 && regclass[1] == X86_64_X87UP_CLASS)
8060 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8062 if (n == 2
8063 && regclass[0] == X86_64_INTEGER_CLASS
8064 && regclass[1] == X86_64_INTEGER_CLASS
8065 && (mode == CDImode || mode == TImode)
8066 && intreg[0] + 1 == intreg[1])
8067 return gen_rtx_REG (mode, intreg[0]);
8069 /* Otherwise figure out the entries of the PARALLEL. */
8070 for (i = 0; i < n; i++)
8072 int pos;
8074 switch (regclass[i])
8076 case X86_64_NO_CLASS:
8077 break;
8078 case X86_64_INTEGER_CLASS:
8079 case X86_64_INTEGERSI_CLASS:
8080 /* Merge TImodes on aligned occasions here too. */
8081 if (i * 8 + 8 > bytes)
8083 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8084 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8085 /* We've requested 24 bytes for which we
8086 don't have a mode. Use DImode. */
8087 tmpmode = DImode;
8089 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8090 tmpmode = SImode;
8091 else
8092 tmpmode = DImode;
8093 exp [nexps++]
8094 = gen_rtx_EXPR_LIST (VOIDmode,
8095 gen_rtx_REG (tmpmode, *intreg),
8096 GEN_INT (i*8));
8097 intreg++;
8098 break;
8099 case X86_64_SSESF_CLASS:
8100 exp [nexps++]
8101 = gen_rtx_EXPR_LIST (VOIDmode,
8102 gen_rtx_REG (SFmode,
8103 SSE_REGNO (sse_regno)),
8104 GEN_INT (i*8));
8105 sse_regno++;
8106 break;
8107 case X86_64_SSEDF_CLASS:
8108 exp [nexps++]
8109 = gen_rtx_EXPR_LIST (VOIDmode,
8110 gen_rtx_REG (DFmode,
8111 SSE_REGNO (sse_regno)),
8112 GEN_INT (i*8));
8113 sse_regno++;
8114 break;
8115 case X86_64_SSE_CLASS:
8116 pos = i;
8117 switch (n)
8119 case 1:
8120 tmpmode = DImode;
8121 break;
8122 case 2:
8123 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8125 tmpmode = TImode;
8126 i++;
8128 else
8129 tmpmode = DImode;
8130 break;
8131 case 4:
8132 gcc_assert (i == 0
8133 && regclass[1] == X86_64_SSEUP_CLASS
8134 && regclass[2] == X86_64_SSEUP_CLASS
8135 && regclass[3] == X86_64_SSEUP_CLASS);
8136 tmpmode = OImode;
8137 i += 3;
8138 break;
8139 case 8:
8140 gcc_assert (i == 0
8141 && regclass[1] == X86_64_SSEUP_CLASS
8142 && regclass[2] == X86_64_SSEUP_CLASS
8143 && regclass[3] == X86_64_SSEUP_CLASS
8144 && regclass[4] == X86_64_SSEUP_CLASS
8145 && regclass[5] == X86_64_SSEUP_CLASS
8146 && regclass[6] == X86_64_SSEUP_CLASS
8147 && regclass[7] == X86_64_SSEUP_CLASS);
8148 tmpmode = XImode;
8149 i += 7;
8150 break;
8151 default:
8152 gcc_unreachable ();
8154 exp [nexps++]
8155 = gen_rtx_EXPR_LIST (VOIDmode,
8156 gen_rtx_REG (tmpmode,
8157 SSE_REGNO (sse_regno)),
8158 GEN_INT (pos*8));
8159 sse_regno++;
8160 break;
8161 default:
8162 gcc_unreachable ();
8166 /* Empty aligned struct, union or class. */
8167 if (nexps == 0)
8168 return NULL;
8170 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8171 for (i = 0; i < nexps; i++)
8172 XVECEXP (ret, 0, i) = exp [i];
8173 return ret;
8176 /* Update the data in CUM to advance over an argument of mode MODE
8177 and data type TYPE. (TYPE is null for libcalls where that information
8178 may not be available.)
8180 Return the number of integer registers advanced over. */
8182 static int
8183 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8184 const_tree type, HOST_WIDE_INT bytes,
8185 HOST_WIDE_INT words)
8187 int res = 0;
8188 bool error_p = false;
8190 if (TARGET_IAMCU)
8192 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8193 bytes in registers. */
8194 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8195 goto pass_in_reg;
8196 return res;
8199 switch (mode)
8201 default:
8202 break;
8204 case E_BLKmode:
8205 if (bytes < 0)
8206 break;
8207 /* FALLTHRU */
8209 case E_DImode:
8210 case E_SImode:
8211 case E_HImode:
8212 case E_QImode:
8213 pass_in_reg:
8214 cum->words += words;
8215 cum->nregs -= words;
8216 cum->regno += words;
8217 if (cum->nregs >= 0)
8218 res = words;
8219 if (cum->nregs <= 0)
8221 cum->nregs = 0;
8222 cfun->machine->arg_reg_available = false;
8223 cum->regno = 0;
8225 break;
8227 case E_OImode:
8228 /* OImode shouldn't be used directly. */
8229 gcc_unreachable ();
8231 case E_DFmode:
8232 if (cum->float_in_sse == -1)
8233 error_p = true;
8234 if (cum->float_in_sse < 2)
8235 break;
8236 /* FALLTHRU */
8237 case E_SFmode:
8238 if (cum->float_in_sse == -1)
8239 error_p = true;
8240 if (cum->float_in_sse < 1)
8241 break;
8242 /* FALLTHRU */
8244 case E_V8SFmode:
8245 case E_V8SImode:
8246 case E_V64QImode:
8247 case E_V32HImode:
8248 case E_V16SImode:
8249 case E_V8DImode:
8250 case E_V16SFmode:
8251 case E_V8DFmode:
8252 case E_V32QImode:
8253 case E_V16HImode:
8254 case E_V4DFmode:
8255 case E_V4DImode:
8256 case E_TImode:
8257 case E_V16QImode:
8258 case E_V8HImode:
8259 case E_V4SImode:
8260 case E_V2DImode:
8261 case E_V4SFmode:
8262 case E_V2DFmode:
8263 if (!type || !AGGREGATE_TYPE_P (type))
8265 cum->sse_words += words;
8266 cum->sse_nregs -= 1;
8267 cum->sse_regno += 1;
8268 if (cum->sse_nregs <= 0)
8270 cum->sse_nregs = 0;
8271 cum->sse_regno = 0;
8274 break;
8276 case E_V8QImode:
8277 case E_V4HImode:
8278 case E_V2SImode:
8279 case E_V2SFmode:
8280 case E_V1TImode:
8281 case E_V1DImode:
8282 if (!type || !AGGREGATE_TYPE_P (type))
8284 cum->mmx_words += words;
8285 cum->mmx_nregs -= 1;
8286 cum->mmx_regno += 1;
8287 if (cum->mmx_nregs <= 0)
8289 cum->mmx_nregs = 0;
8290 cum->mmx_regno = 0;
8293 break;
8295 if (error_p)
8297 cum->float_in_sse = 0;
8298 error ("calling %qD with SSE calling convention without "
8299 "SSE/SSE2 enabled", cum->decl);
8300 sorry ("this is a GCC bug that can be worked around by adding "
8301 "attribute used to function called");
8304 return res;
8307 static int
8308 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8309 const_tree type, HOST_WIDE_INT words, bool named)
8311 int int_nregs, sse_nregs;
8313 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
8314 if (!named && (VALID_AVX512F_REG_MODE (mode)
8315 || VALID_AVX256_REG_MODE (mode)))
8316 return 0;
8318 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8319 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8321 cum->nregs -= int_nregs;
8322 cum->sse_nregs -= sse_nregs;
8323 cum->regno += int_nregs;
8324 cum->sse_regno += sse_nregs;
8325 return int_nregs;
8327 else
8329 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8330 cum->words = ROUND_UP (cum->words, align);
8331 cum->words += words;
8332 return 0;
8336 static int
8337 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8338 HOST_WIDE_INT words)
8340 /* Otherwise, this should be passed indirect. */
8341 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8343 cum->words += words;
8344 if (cum->nregs > 0)
8346 cum->nregs -= 1;
8347 cum->regno += 1;
8348 return 1;
8350 return 0;
8353 /* Update the data in CUM to advance over an argument of mode MODE and
8354 data type TYPE. (TYPE is null for libcalls where that information
8355 may not be available.) */
8357 static void
8358 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8359 const_tree type, bool named)
8361 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8362 HOST_WIDE_INT bytes, words;
8363 int nregs;
8365 /* The argument of interrupt handler is a special case and is
8366 handled in ix86_function_arg. */
8367 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8368 return;
8370 if (mode == BLKmode)
8371 bytes = int_size_in_bytes (type);
8372 else
8373 bytes = GET_MODE_SIZE (mode);
8374 words = CEIL (bytes, UNITS_PER_WORD);
8376 if (type)
8377 mode = type_natural_mode (type, NULL, false);
8379 if ((type && POINTER_BOUNDS_TYPE_P (type))
8380 || POINTER_BOUNDS_MODE_P (mode))
8382 /* If we pass bounds in BT then just update remained bounds count. */
8383 if (cum->bnds_in_bt)
8385 cum->bnds_in_bt--;
8386 return;
8389 /* Update remained number of bounds to force. */
8390 if (cum->force_bnd_pass)
8391 cum->force_bnd_pass--;
8393 cum->bnd_regno++;
8395 return;
8398 /* The first arg not going to Bounds Tables resets this counter. */
8399 cum->bnds_in_bt = 0;
8400 /* For unnamed args we always pass bounds to avoid bounds mess when
8401 passed and received types do not match. If bounds do not follow
8402 unnamed arg, still pretend required number of bounds were passed. */
8403 if (cum->force_bnd_pass)
8405 cum->bnd_regno += cum->force_bnd_pass;
8406 cum->force_bnd_pass = 0;
8409 if (TARGET_64BIT)
8411 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8413 if (call_abi == MS_ABI)
8414 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8415 else
8416 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8418 else
8419 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8421 /* For stdarg we expect bounds to be passed for each value passed
8422 in register. */
8423 if (cum->stdarg)
8424 cum->force_bnd_pass = nregs;
8425 /* For pointers passed in memory we expect bounds passed in Bounds
8426 Table. */
8427 if (!nregs)
8429 /* Track if there are outgoing arguments on stack. */
8430 if (cum->caller)
8431 cfun->machine->outgoing_args_on_stack = true;
8433 cum->bnds_in_bt = chkp_type_bounds_count (type);
8437 /* Define where to put the arguments to a function.
8438 Value is zero to push the argument on the stack,
8439 or a hard register in which to store the argument.
8441 MODE is the argument's machine mode.
8442 TYPE is the data type of the argument (as a tree).
8443 This is null for libcalls where that information may
8444 not be available.
8445 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8446 the preceding args and about the function being called.
8447 NAMED is nonzero if this argument is a named parameter
8448 (otherwise it is an extra parameter matching an ellipsis). */
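/* Concrete illustration (not exhaustive): on ia32,
     int __attribute__ ((regparm (3))) f (int a, int b, int c);
   receives A, B and C in %eax, %edx and %ecx, while
     int __attribute__ ((fastcall)) g (int a, int b);
   receives A and B in %ecx and %edx; arguments beyond the available
   registers go on the stack as usual.  */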
8450 static rtx
8451 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8452 machine_mode orig_mode, const_tree type,
8453 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8455 bool error_p = false;
8457 /* Avoid the AL settings for the Unix64 ABI. */
8458 if (mode == VOIDmode)
8459 return constm1_rtx;
8461 if (TARGET_IAMCU)
8463 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8464 bytes in registers. */
8465 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8466 goto pass_in_reg;
8467 return NULL_RTX;
8470 switch (mode)
8472 default:
8473 break;
8475 case E_BLKmode:
8476 if (bytes < 0)
8477 break;
8478 /* FALLTHRU */
8479 case E_DImode:
8480 case E_SImode:
8481 case E_HImode:
8482 case E_QImode:
8483 pass_in_reg:
8484 if (words <= cum->nregs)
8486 int regno = cum->regno;
8488 /* Fastcall allocates the first two DWORD (SImode) or
8489 smaller arguments to ECX and EDX if it isn't an
8490 aggregate type. */
8491 if (cum->fastcall)
8493 if (mode == BLKmode
8494 || mode == DImode
8495 || (type && AGGREGATE_TYPE_P (type)))
8496 break;
8498 /* ECX not EAX is the first allocated register. */
8499 if (regno == AX_REG)
8500 regno = CX_REG;
8502 return gen_rtx_REG (mode, regno);
8504 break;
8506 case E_DFmode:
8507 if (cum->float_in_sse == -1)
8508 error_p = true;
8509 if (cum->float_in_sse < 2)
8510 break;
8511 /* FALLTHRU */
8512 case E_SFmode:
8513 if (cum->float_in_sse == -1)
8514 error_p = true;
8515 if (cum->float_in_sse < 1)
8516 break;
8517 /* FALLTHRU */
8518 case E_TImode:
8519 /* In 32bit, we pass TImode in xmm registers. */
8520 case E_V16QImode:
8521 case E_V8HImode:
8522 case E_V4SImode:
8523 case E_V2DImode:
8524 case E_V4SFmode:
8525 case E_V2DFmode:
8526 if (!type || !AGGREGATE_TYPE_P (type))
8528 if (cum->sse_nregs)
8529 return gen_reg_or_parallel (mode, orig_mode,
8530 cum->sse_regno + FIRST_SSE_REG);
8532 break;
8534 case E_OImode:
8535 case E_XImode:
8536 /* OImode and XImode shouldn't be used directly. */
8537 gcc_unreachable ();
8539 case E_V64QImode:
8540 case E_V32HImode:
8541 case E_V16SImode:
8542 case E_V8DImode:
8543 case E_V16SFmode:
8544 case E_V8DFmode:
8545 case E_V8SFmode:
8546 case E_V8SImode:
8547 case E_V32QImode:
8548 case E_V16HImode:
8549 case E_V4DFmode:
8550 case E_V4DImode:
8551 if (!type || !AGGREGATE_TYPE_P (type))
8553 if (cum->sse_nregs)
8554 return gen_reg_or_parallel (mode, orig_mode,
8555 cum->sse_regno + FIRST_SSE_REG);
8557 break;
8559 case E_V8QImode:
8560 case E_V4HImode:
8561 case E_V2SImode:
8562 case E_V2SFmode:
8563 case E_V1TImode:
8564 case E_V1DImode:
8565 if (!type || !AGGREGATE_TYPE_P (type))
8567 if (cum->mmx_nregs)
8568 return gen_reg_or_parallel (mode, orig_mode,
8569 cum->mmx_regno + FIRST_MMX_REG);
8571 break;
8573 if (error_p)
8575 cum->float_in_sse = 0;
8576 error ("calling %qD with SSE calling convention without "
8577 "SSE/SSE2 enabled", cum->decl);
8578 sorry ("this is a GCC bug that can be worked around by adding "
8579 "attribute used to function called");
8582 return NULL_RTX;
8585 static rtx
8586 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8587 machine_mode orig_mode, const_tree type, bool named)
8589 /* Handle a hidden AL argument containing number of registers
8590 for varargs x86-64 functions. */
8591 if (mode == VOIDmode)
8592 return GEN_INT (cum->maybe_vaarg
8593 ? (cum->sse_nregs < 0
8594 ? X86_64_SSE_REGPARM_MAX
8595 : cum->sse_regno)
8596 : -1);
8598 switch (mode)
8600 default:
8601 break;
8603 case E_V8SFmode:
8604 case E_V8SImode:
8605 case E_V32QImode:
8606 case E_V16HImode:
8607 case E_V4DFmode:
8608 case E_V4DImode:
8609 case E_V16SFmode:
8610 case E_V16SImode:
8611 case E_V64QImode:
8612 case E_V32HImode:
8613 case E_V8DFmode:
8614 case E_V8DImode:
8615 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
8616 if (!named)
8617 return NULL;
8618 break;
8621 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8622 cum->sse_nregs,
8623 &x86_64_int_parameter_registers [cum->regno],
8624 cum->sse_regno);
8627 static rtx
8628 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8629 machine_mode orig_mode, bool named,
8630 HOST_WIDE_INT bytes)
8632 unsigned int regno;
8634 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
8635 We use the value -2 to specify that the current function call is MSABI. */
8636 if (mode == VOIDmode)
8637 return GEN_INT (-2);
8639 /* If we've run out of registers, it goes on the stack. */
8640 if (cum->nregs == 0)
8641 return NULL_RTX;
8643 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8645 /* Only floating point modes are passed in anything but integer regs. */
8646 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8648 if (named)
8649 regno = cum->regno + FIRST_SSE_REG;
8650 else
8652 rtx t1, t2;
8654 /* Unnamed floating parameters are passed in both the
8655 SSE and integer registers. */
8656 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8657 t2 = gen_rtx_REG (mode, regno);
8658 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8659 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8660 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8663 /* Handle aggregate types passed in registers. */
8664 if (orig_mode == BLKmode)
8666 if (bytes > 0 && bytes <= 8)
8667 mode = (bytes > 4 ? DImode : SImode);
8668 if (mode == BLKmode)
8669 mode = DImode;
8672 return gen_reg_or_parallel (mode, orig_mode, regno);
8675 /* Return where to put the arguments to a function.
8676 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8678 MODE is the argument's machine mode. TYPE is the data type of the
8679 argument. It is null for libcalls where that information may not be
8680 available. CUM gives information about the preceding args and about
8681 the function being called. NAMED is nonzero if this argument is a
8682 named parameter (otherwise it is an extra parameter matching an
8683 ellipsis). */
8685 static rtx
8686 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8687 const_tree type, bool named)
8689 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8690 machine_mode mode = omode;
8691 HOST_WIDE_INT bytes, words;
8692 rtx arg;
8694 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8696 gcc_assert (type != NULL_TREE);
8697 if (POINTER_TYPE_P (type))
8699 /* This is the pointer argument. */
8700 gcc_assert (TYPE_MODE (type) == Pmode);
8701 /* It is at -WORD(AP) in the current frame in interrupt and
8702 exception handlers. */
8703 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8705 else
8707 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8708 && TREE_CODE (type) == INTEGER_TYPE
8709 && TYPE_MODE (type) == word_mode);
8710 /* The error code is the word-mode integer argument at
8711 -2 * WORD(AP) in the current frame of the exception
8712 handler. */
8713 arg = gen_rtx_MEM (word_mode,
8714 plus_constant (Pmode,
8715 arg_pointer_rtx,
8716 -2 * UNITS_PER_WORD));
8718 return arg;
8721 /* All pointer bounds arguments are handled separately here. */
8722 if ((type && POINTER_BOUNDS_TYPE_P (type))
8723 || POINTER_BOUNDS_MODE_P (mode))
8725 /* Return NULL if bounds are forced to go in Bounds Table. */
8726 if (cum->bnds_in_bt)
8727 arg = NULL;
8728 /* Return the next available bound reg if any. */
8729 else if (cum->bnd_regno <= LAST_BND_REG)
8730 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8731 /* Return the next special slot number otherwise. */
8732 else
8733 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8735 return arg;
8738 if (mode == BLKmode)
8739 bytes = int_size_in_bytes (type);
8740 else
8741 bytes = GET_MODE_SIZE (mode);
8742 words = CEIL (bytes, UNITS_PER_WORD);
8744 /* To simplify the code below, represent vector types with a vector mode
8745 even if MMX/SSE are not active. */
8746 if (type && TREE_CODE (type) == VECTOR_TYPE)
8747 mode = type_natural_mode (type, cum, false);
8749 if (TARGET_64BIT)
8751 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8753 if (call_abi == MS_ABI)
8754 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8755 else
8756 arg = function_arg_64 (cum, mode, omode, type, named);
8758 else
8759 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8761 /* Track if there are outgoing arguments on stack. */
8762 if (arg == NULL_RTX && cum->caller)
8763 cfun->machine->outgoing_args_on_stack = true;
8765 return arg;
8768 /* A C expression that indicates when an argument must be passed by
8769 reference. If nonzero for an argument, a copy of that argument is
8770 made in memory and a pointer to the argument is passed instead of
8771 the argument itself. The pointer is passed in whatever way is
8772 appropriate for passing a pointer to that type. */
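/* Example of the MS x64 rule handled below (illustrative): a 24-byte
   struct or an __m128 argument is passed by reference, i.e. the caller
   passes a pointer in the parameter slot, whereas an 8-byte struct is
   passed by value.  */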
8774 static bool
8775 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8776 const_tree type, bool)
8778 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8780 /* Bounds are never passed by reference. */
8781 if ((type && POINTER_BOUNDS_TYPE_P (type))
8782 || POINTER_BOUNDS_MODE_P (mode))
8783 return false;
8785 if (TARGET_64BIT)
8787 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8789 /* See Windows x64 Software Convention. */
8790 if (call_abi == MS_ABI)
8792 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8794 if (type)
8796 /* Arrays are passed by reference. */
8797 if (TREE_CODE (type) == ARRAY_TYPE)
8798 return true;
8800 if (RECORD_OR_UNION_TYPE_P (type))
8802 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8803 are passed by reference. */
8804 msize = int_size_in_bytes (type);
8808 /* __m128 is passed by reference. */
8809 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8811 else if (type && int_size_in_bytes (type) == -1)
8812 return true;
8815 return false;
8818 /* Return true when TYPE should be 128bit aligned for 32bit argument
8819 passing ABI. XXX: This function is obsolete and is only used for
8820 checking psABI compatibility with previous versions of GCC. */
8822 static bool
8823 ix86_compat_aligned_value_p (const_tree type)
8825 machine_mode mode = TYPE_MODE (type);
8826 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8827 || mode == TDmode
8828 || mode == TFmode
8829 || mode == TCmode)
8830 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8831 return true;
8832 if (TYPE_ALIGN (type) < 128)
8833 return false;
8835 if (AGGREGATE_TYPE_P (type))
8837 /* Walk the aggregates recursively. */
8838 switch (TREE_CODE (type))
8840 case RECORD_TYPE:
8841 case UNION_TYPE:
8842 case QUAL_UNION_TYPE:
8844 tree field;
8846 /* Walk all the structure fields. */
8847 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8849 if (TREE_CODE (field) == FIELD_DECL
8850 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8851 return true;
8853 break;
8856 case ARRAY_TYPE:
8857 /* Just for use if some languages pass arrays by value. */
8858 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8859 return true;
8860 break;
8862 default:
8863 gcc_unreachable ();
8866 return false;
8869 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8870 XXX: This function is obsolete and is only used for checking psABI
8871 compatibility with previous versions of GCC. */
8873 static unsigned int
8874 ix86_compat_function_arg_boundary (machine_mode mode,
8875 const_tree type, unsigned int align)
8877 /* In 32bit, only _Decimal128 and __float128 are aligned to their
8878 natural boundaries. */
8879 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
8881 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
8882 make an exception for SSE modes since these require 128bit
8883 alignment.
8885 The handling here differs from field_alignment. ICC aligns MMX
8886 arguments to 4 byte boundaries, while structure fields are aligned
8887 to 8 byte boundaries. */
8888 if (!type)
8890 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
8891 align = PARM_BOUNDARY;
8893 else
8895 if (!ix86_compat_aligned_value_p (type))
8896 align = PARM_BOUNDARY;
8899 if (align > BIGGEST_ALIGNMENT)
8900 align = BIGGEST_ALIGNMENT;
8901 return align;
8904 /* Return true when TYPE should be 128bit aligned for 32bit argument
8905 passing ABI. */
8907 static bool
8908 ix86_contains_aligned_value_p (const_tree type)
8910 machine_mode mode = TYPE_MODE (type);
8912 if (mode == XFmode || mode == XCmode)
8913 return false;
8915 if (TYPE_ALIGN (type) < 128)
8916 return false;
8918 if (AGGREGATE_TYPE_P (type))
8920 /* Walk the aggregates recursively. */
8921 switch (TREE_CODE (type))
8923 case RECORD_TYPE:
8924 case UNION_TYPE:
8925 case QUAL_UNION_TYPE:
8927 tree field;
8929 /* Walk all the structure fields. */
8930 for (field = TYPE_FIELDS (type);
8931 field;
8932 field = DECL_CHAIN (field))
8934 if (TREE_CODE (field) == FIELD_DECL
8935 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
8936 return true;
8938 break;
8941 case ARRAY_TYPE:
8942 /* Just for use if some languages pass arrays by value. */
8943 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
8944 return true;
8945 break;
8947 default:
8948 gcc_unreachable ();
8951 else
8952 return TYPE_ALIGN (type) >= 128;
8954 return false;
8957 /* Gives the alignment boundary, in bits, of an argument with the
8958 specified mode and type. */
8960 static unsigned int
8961 ix86_function_arg_boundary (machine_mode mode, const_tree type)
8963 unsigned int align;
8964 if (type)
8966 /* Since the main variant type is used for the call, convert the type
8967 to its main variant. */
8968 type = TYPE_MAIN_VARIANT (type);
8969 align = TYPE_ALIGN (type);
8971 else
8972 align = GET_MODE_ALIGNMENT (mode);
8973 if (align < PARM_BOUNDARY)
8974 align = PARM_BOUNDARY;
8975 else
8977 static bool warned;
8978 unsigned int saved_align = align;
8980 if (!TARGET_64BIT)
8982 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
8983 if (!type)
8985 if (mode == XFmode || mode == XCmode)
8986 align = PARM_BOUNDARY;
8988 else if (!ix86_contains_aligned_value_p (type))
8989 align = PARM_BOUNDARY;
8991 if (align < 128)
8992 align = PARM_BOUNDARY;
8995 if (warn_psabi
8996 && !warned
8997 && align != ix86_compat_function_arg_boundary (mode, type,
8998 saved_align))
9000 warned = true;
9001 inform (input_location,
9002 "The ABI for passing parameters with %d-byte"
9003 " alignment has changed in GCC 4.6",
9004 align / BITS_PER_UNIT);
9008 return align;
9011 /* Return true if N is a possible register number of function value. */
9013 static bool
9014 ix86_function_value_regno_p (const unsigned int regno)
9016 switch (regno)
9018 case AX_REG:
9019 return true;
9020 case DX_REG:
9021 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9022 case DI_REG:
9023 case SI_REG:
9024 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9026 case BND0_REG:
9027 case BND1_REG:
9028 return chkp_function_instrumented_p (current_function_decl);
9030 /* Complex values are returned in %st(0)/%st(1) pair. */
9031 case ST0_REG:
9032 case ST1_REG:
9033 /* TODO: The function should depend on current function ABI but
9034 builtins.c would need updating then. Therefore we use the
9035 default ABI. */
9036 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9037 return false;
9038 return TARGET_FLOAT_RETURNS_IN_80387;
9040 /* Complex values are returned in %xmm0/%xmm1 pair. */
9041 case XMM0_REG:
9042 case XMM1_REG:
9043 return TARGET_SSE;
9045 case MM0_REG:
9046 if (TARGET_MACHO || TARGET_64BIT)
9047 return false;
9048 return TARGET_MMX;
9051 return false;
9054 /* Define how to find the value returned by a function.
9055 VALTYPE is the data type of the value (as a tree).
9056 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9057 otherwise, FUNC is 0. */
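/* For example (illustrative): on ia32 an int is returned in %eax, a
   double in %st(0) unless -mno-fp-ret-in-387, and a 16-byte vector such
   as __m128 in %xmm0, matching the regno selection below.  */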
9059 static rtx
9060 function_value_32 (machine_mode orig_mode, machine_mode mode,
9061 const_tree fntype, const_tree fn)
9063 unsigned int regno;
9065 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9066 we normally prevent this case when mmx is not available. However
9067 some ABIs may require the result to be returned like DImode. */
9068 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9069 regno = FIRST_MMX_REG;
9071 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9072 we prevent this case when sse is not available. However some ABIs
9073 may require the result to be returned like integer TImode. */
9074 else if (mode == TImode
9075 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9076 regno = FIRST_SSE_REG;
9078 /* 32-byte vector modes in %ymm0. */
9079 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9080 regno = FIRST_SSE_REG;
9082 /* 64-byte vector modes in %zmm0. */
9083 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9084 regno = FIRST_SSE_REG;
9086 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9087 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9088 regno = FIRST_FLOAT_REG;
9089 else
9090 /* Most things go in %eax. */
9091 regno = AX_REG;
9093 /* Override FP return register with %xmm0 for local functions when
9094 SSE math is enabled or for functions with sseregparm attribute. */
9095 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9097 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9098 if (sse_level == -1)
9100 error ("calling %qD with SSE calling convention without "
9101 "SSE/SSE2 enabled", fn);
9102 sorry ("this is a GCC bug that can be worked around by adding "
9103 "attribute used to function called");
9105 else if ((sse_level >= 1 && mode == SFmode)
9106 || (sse_level == 2 && mode == DFmode))
9107 regno = FIRST_SSE_REG;
9110 /* OImode shouldn't be used directly. */
9111 gcc_assert (mode != OImode);
9113 return gen_rtx_REG (orig_mode, regno);
9116 static rtx
9117 function_value_64 (machine_mode orig_mode, machine_mode mode,
9118 const_tree valtype)
9120 rtx ret;
9122 /* Handle libcalls, which don't provide a type node. */
9123 if (valtype == NULL)
9125 unsigned int regno;
9127 switch (mode)
9129 case E_SFmode:
9130 case E_SCmode:
9131 case E_DFmode:
9132 case E_DCmode:
9133 case E_TFmode:
9134 case E_SDmode:
9135 case E_DDmode:
9136 case E_TDmode:
9137 regno = FIRST_SSE_REG;
9138 break;
9139 case E_XFmode:
9140 case E_XCmode:
9141 regno = FIRST_FLOAT_REG;
9142 break;
9143 case E_TCmode:
9144 return NULL;
9145 default:
9146 regno = AX_REG;
9149 return gen_rtx_REG (mode, regno);
9151 else if (POINTER_TYPE_P (valtype))
9153 /* Pointers are always returned in word_mode. */
9154 mode = word_mode;
9157 ret = construct_container (mode, orig_mode, valtype, 1,
9158 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9159 x86_64_int_return_registers, 0);
9161 /* For zero sized structures, construct_container returns NULL, but we
9162 need to keep the rest of the compiler happy by returning a meaningful value. */
9163 if (!ret)
9164 ret = gen_rtx_REG (orig_mode, AX_REG);
9166 return ret;
9169 static rtx
9170 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9171 const_tree valtype)
9173 unsigned int regno = AX_REG;
9175 if (TARGET_SSE)
9177 switch (GET_MODE_SIZE (mode))
9179 case 16:
9180 if (valtype != NULL_TREE
9181 && !VECTOR_INTEGER_TYPE_P (valtype)
9183 && !INTEGRAL_TYPE_P (valtype)
9184 && !VECTOR_FLOAT_TYPE_P (valtype))
9185 break;
9186 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9187 && !COMPLEX_MODE_P (mode))
9188 regno = FIRST_SSE_REG;
9189 break;
9190 case 8:
9191 case 4:
9192 if (mode == SFmode || mode == DFmode)
9193 regno = FIRST_SSE_REG;
9194 break;
9195 default:
9196 break;
9199 return gen_rtx_REG (orig_mode, regno);
9202 static rtx
9203 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9204 machine_mode orig_mode, machine_mode mode)
9206 const_tree fn, fntype;
9208 fn = NULL_TREE;
9209 if (fntype_or_decl && DECL_P (fntype_or_decl))
9210 fn = fntype_or_decl;
9211 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9213 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9214 || POINTER_BOUNDS_MODE_P (mode))
9215 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9216 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9217 return function_value_ms_64 (orig_mode, mode, valtype);
9218 else if (TARGET_64BIT)
9219 return function_value_64 (orig_mode, mode, valtype);
9220 else
9221 return function_value_32 (orig_mode, mode, fntype, fn);
9224 static rtx
9225 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9227 machine_mode mode, orig_mode;
9229 orig_mode = TYPE_MODE (valtype);
9230 mode = type_natural_mode (valtype, NULL, true);
9231 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9234 /* Return an RTX representing a place where a function returns
9235 or receives pointer bounds, or NULL if no bounds are returned.
9237 VALTYPE is a data type of a value returned by the function.
9239 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9240 or FUNCTION_TYPE of the function.
9242 If OUTGOING is false, return a place in which the caller will
9243 see the return value. Otherwise, return a place where a
9244 function returns a value. */
9246 static rtx
9247 ix86_function_value_bounds (const_tree valtype,
9248 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9249 bool outgoing ATTRIBUTE_UNUSED)
9251 rtx res = NULL_RTX;
9253 if (BOUNDED_TYPE_P (valtype))
9254 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9255 else if (chkp_type_has_pointer (valtype))
9257 bitmap slots;
9258 rtx bounds[2];
9259 bitmap_iterator bi;
9260 unsigned i, bnd_no = 0;
9262 bitmap_obstack_initialize (NULL);
9263 slots = BITMAP_ALLOC (NULL);
9264 chkp_find_bound_slots (valtype, slots);
9266 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9268 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9269 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9270 gcc_assert (bnd_no < 2);
9271 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9274 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9276 BITMAP_FREE (slots);
9277 bitmap_obstack_release (NULL);
9279 else
9280 res = NULL_RTX;
9282 return res;
9285 /* Pointer function arguments and return values are promoted to
9286 word_mode for normal functions. */
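/* This matters mainly for -mx32, where pointers are 32 bits wide but
   word_mode is DImode: a pointer argument or return value is then
   zero-extended to 64 bits (POINTERS_EXTEND_UNSIGNED) rather than kept
   in SImode.  */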
9288 static machine_mode
9289 ix86_promote_function_mode (const_tree type, machine_mode mode,
9290 int *punsignedp, const_tree fntype,
9291 int for_return)
9293 if (cfun->machine->func_type == TYPE_NORMAL
9294 && type != NULL_TREE
9295 && POINTER_TYPE_P (type))
9297 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9298 return word_mode;
9300 return default_promote_function_mode (type, mode, punsignedp, fntype,
9301 for_return);
9304 /* Return true if a structure, union or array with MODE containing FIELD
9305 should be accessed using BLKmode. */
9307 static bool
9308 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9310 /* Union with XFmode must be in BLKmode. */
9311 return (mode == XFmode
9312 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9313 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9317 ix86_libcall_value (machine_mode mode)
9319 return ix86_function_value_1 (NULL, NULL, mode, mode);
9322 /* Return true iff type is returned in memory. */
9324 static bool
9325 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9327 #ifdef SUBTARGET_RETURN_IN_MEMORY
9328 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9329 #else
9330 const machine_mode mode = type_natural_mode (type, NULL, true);
9331 HOST_WIDE_INT size;
9333 if (POINTER_BOUNDS_TYPE_P (type))
9334 return false;
9336 if (TARGET_64BIT)
9338 if (ix86_function_type_abi (fntype) == MS_ABI)
9340 size = int_size_in_bytes (type);
9342 /* __m128 is returned in xmm0. */
9343 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9344 || INTEGRAL_TYPE_P (type)
9345 || VECTOR_FLOAT_TYPE_P (type))
9346 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9347 && !COMPLEX_MODE_P (mode)
9348 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9349 return false;
9351 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
9352 return size != 1 && size != 2 && size != 4 && size != 8;
9354 else
9356 int needed_intregs, needed_sseregs;
9358 return examine_argument (mode, type, 1,
9359 &needed_intregs, &needed_sseregs);
9362 else
9364 size = int_size_in_bytes (type);
9366 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9367 bytes in registers. */
9368 if (TARGET_IAMCU)
9369 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9371 if (mode == BLKmode)
9372 return true;
9374 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9375 return false;
9377 if (VECTOR_MODE_P (mode) || mode == TImode)
9379 /* User-created vectors small enough to fit in EAX. */
9380 if (size < 8)
9381 return false;
9383 /* Unless the ABI prescribes otherwise,
9384 MMX/3dNow values are returned in MM0 if available. */
9386 if (size == 8)
9387 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9389 /* SSE values are returned in XMM0 if available. */
9390 if (size == 16)
9391 return !TARGET_SSE;
9393 /* AVX values are returned in YMM0 if available. */
9394 if (size == 32)
9395 return !TARGET_AVX;
9397 /* AVX512F values are returned in ZMM0 if available. */
9398 if (size == 64)
9399 return !TARGET_AVX512F;
9402 if (mode == XFmode)
9403 return false;
9405 if (size > 12)
9406 return true;
9408 /* OImode shouldn't be used directly. */
9409 gcc_assert (mode != OImode);
9411 return false;
9413 #endif
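/* For example (illustrative, default ia32 rules): a struct of four ints
   has BLKmode and is therefore returned in memory through a hidden
   pointer, while a 16-byte vector type is returned in %xmm0 when SSE is
   enabled (the size == 16 check above) and in memory otherwise.  */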
9417 /* Create the va_list data type. */
9419 static tree
9420 ix86_build_builtin_va_list_64 (void)
9422 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9424 record = lang_hooks.types.make_type (RECORD_TYPE);
9425 type_decl = build_decl (BUILTINS_LOCATION,
9426 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9428 f_gpr = build_decl (BUILTINS_LOCATION,
9429 FIELD_DECL, get_identifier ("gp_offset"),
9430 unsigned_type_node);
9431 f_fpr = build_decl (BUILTINS_LOCATION,
9432 FIELD_DECL, get_identifier ("fp_offset"),
9433 unsigned_type_node);
9434 f_ovf = build_decl (BUILTINS_LOCATION,
9435 FIELD_DECL, get_identifier ("overflow_arg_area"),
9436 ptr_type_node);
9437 f_sav = build_decl (BUILTINS_LOCATION,
9438 FIELD_DECL, get_identifier ("reg_save_area"),
9439 ptr_type_node);
9441 va_list_gpr_counter_field = f_gpr;
9442 va_list_fpr_counter_field = f_fpr;
9444 DECL_FIELD_CONTEXT (f_gpr) = record;
9445 DECL_FIELD_CONTEXT (f_fpr) = record;
9446 DECL_FIELD_CONTEXT (f_ovf) = record;
9447 DECL_FIELD_CONTEXT (f_sav) = record;
9449 TYPE_STUB_DECL (record) = type_decl;
9450 TYPE_NAME (record) = type_decl;
9451 TYPE_FIELDS (record) = f_gpr;
9452 DECL_CHAIN (f_gpr) = f_fpr;
9453 DECL_CHAIN (f_fpr) = f_ovf;
9454 DECL_CHAIN (f_ovf) = f_sav;
9456 layout_type (record);
9458 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9459 NULL_TREE, TYPE_ATTRIBUTES (record));
9461 /* The correct type is an array type of one element. */
9462 return build_array_type (record, build_index_type (size_zero_node));
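/* For reference, the record laid out above matches the x86-64 psABI
   definition of the SysV va_list:

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } va_list[1];  */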
9465 /* Setup the builtin va_list data type and for 64-bit the additional
9466 calling convention specific va_list data types. */
9468 static tree
9469 ix86_build_builtin_va_list (void)
9471 if (TARGET_64BIT)
9473 /* Initialize ABI specific va_list builtin types.
9475 In lto1, we can encounter two va_list types:
9476 - one as a result of the type-merge across TUs, and
9477 - the one constructed here.
9478 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9479 a type identity check in canonical_va_list_type based on
9480 TYPE_MAIN_VARIANT (which we used to have) will not work.
9481 Instead, we tag each va_list_type_node with its unique attribute, and
9482 look for the attribute in the type identity check in
9483 canonical_va_list_type.
9485 Tagging sysv_va_list_type_node directly with the attribute is
9486 problematic since it's an array of one record, which will degrade into a
9487 pointer to record when used as parameter (see build_va_arg comments for
9488 an example), dropping the attribute in the process. So we tag the
9489 record instead. */
9491 /* For SYSV_ABI we use an array of one record. */
9492 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9494 /* For MS_ABI we use plain pointer to argument area. */
9495 tree char_ptr_type = build_pointer_type (char_type_node);
9496 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9497 TYPE_ATTRIBUTES (char_ptr_type));
9498 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9500 return ((ix86_abi == MS_ABI)
9501 ? ms_va_list_type_node
9502 : sysv_va_list_type_node);
9504 else
9506 /* For i386 we use plain pointer to argument area. */
9507 return build_pointer_type (char_type_node);
9511 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9513 static void
9514 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9516 rtx save_area, mem;
9517 alias_set_type set;
9518 int i, max;
9520 /* GPR size of varargs save area. */
9521 if (cfun->va_list_gpr_size)
9522 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9523 else
9524 ix86_varargs_gpr_size = 0;
9526 /* FPR size of varargs save area. We don't need it if we don't pass
9527 anything in SSE registers. */
9528 if (TARGET_SSE && cfun->va_list_fpr_size)
9529 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9530 else
9531 ix86_varargs_fpr_size = 0;
9533 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9534 return;
9536 save_area = frame_pointer_rtx;
9537 set = get_varargs_alias_set ();
9539 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9540 if (max > X86_64_REGPARM_MAX)
9541 max = X86_64_REGPARM_MAX;
9543 for (i = cum->regno; i < max; i++)
9545 mem = gen_rtx_MEM (word_mode,
9546 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9547 MEM_NOTRAP_P (mem) = 1;
9548 set_mem_alias_set (mem, set);
9549 emit_move_insn (mem,
9550 gen_rtx_REG (word_mode,
9551 x86_64_int_parameter_registers[i]));
9554 if (ix86_varargs_fpr_size)
9556 machine_mode smode;
9557 rtx_code_label *label;
9558 rtx test;
9560 /* Now emit code to save SSE registers. The AX parameter contains number
9561 of SSE parameter registers used to call this function, though all we
9562 actually check here is the zero/non-zero status. */
9564 label = gen_label_rtx ();
9565 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9566 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9567 label));
9569 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9570 we used movdqa (i.e. TImode) instead? Perhaps even better would
9571 be if we could determine the real mode of the data, via a hook
9572 into pass_stdarg. Ignore all that for now. */
9573 smode = V4SFmode;
9574 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9575 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9577 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9578 if (max > X86_64_SSE_REGPARM_MAX)
9579 max = X86_64_SSE_REGPARM_MAX;
9581 for (i = cum->sse_regno; i < max; ++i)
9583 mem = plus_constant (Pmode, save_area,
9584 i * 16 + ix86_varargs_gpr_size);
9585 mem = gen_rtx_MEM (smode, mem);
9586 MEM_NOTRAP_P (mem) = 1;
9587 set_mem_alias_set (mem, set);
9588 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9590 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9593 emit_label (label);
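/* With GCC's default limits (X86_64_REGPARM_MAX == 6 and
   X86_64_SSE_REGPARM_MAX == 8), the save area filled in above is at most
   6 * 8 == 48 bytes of GPR slots followed by 8 * 16 == 128 bytes of SSE
   slots, i.e. the 176 byte register save area described by the psABI.  */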
9597 static void
9598 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9600 alias_set_type set = get_varargs_alias_set ();
9601 int i;
9603 /* Reset to zero, as there might have been a sysv va_arg used
9604 before. */
9605 ix86_varargs_gpr_size = 0;
9606 ix86_varargs_fpr_size = 0;
9608 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9610 rtx reg, mem;
9612 mem = gen_rtx_MEM (Pmode,
9613 plus_constant (Pmode, virtual_incoming_args_rtx,
9614 i * UNITS_PER_WORD));
9615 MEM_NOTRAP_P (mem) = 1;
9616 set_mem_alias_set (mem, set);
9618 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9619 emit_move_insn (mem, reg);
9623 static void
9624 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9625 tree type, int *, int no_rtl)
9627 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9628 CUMULATIVE_ARGS next_cum;
9629 tree fntype;
9631 /* This argument doesn't appear to be used anymore. Which is good,
9632 because the old code here didn't suppress rtl generation. */
9633 gcc_assert (!no_rtl);
9635 if (!TARGET_64BIT)
9636 return;
9638 fntype = TREE_TYPE (current_function_decl);
9640 /* For varargs, we do not want to skip the dummy va_dcl argument.
9641 For stdargs, we do want to skip the last named argument. */
9642 next_cum = *cum;
9643 if (stdarg_p (fntype))
9644 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9645 true);
9647 if (cum->call_abi == MS_ABI)
9648 setup_incoming_varargs_ms_64 (&next_cum);
9649 else
9650 setup_incoming_varargs_64 (&next_cum);
9653 static void
9654 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9655 machine_mode mode,
9656 tree type,
9657 int *pretend_size ATTRIBUTE_UNUSED,
9658 int no_rtl)
9660 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9661 CUMULATIVE_ARGS next_cum;
9662 tree fntype;
9663 rtx save_area;
9664 int bnd_reg, i, max;
9666 gcc_assert (!no_rtl);
9668 /* Do nothing if we use plain pointer to argument area. */
9669 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9670 return;
9672 fntype = TREE_TYPE (current_function_decl);
9674 /* For varargs, we do not want to skip the dummy va_dcl argument.
9675 For stdargs, we do want to skip the last named argument. */
9676 next_cum = *cum;
9677 if (stdarg_p (fntype))
9678 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9679 true);
9680 save_area = frame_pointer_rtx;
9682 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9683 if (max > X86_64_REGPARM_MAX)
9684 max = X86_64_REGPARM_MAX;
9686 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9687 if (chkp_function_instrumented_p (current_function_decl))
9688 for (i = cum->regno; i < max; i++)
9690 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9691 rtx ptr = gen_rtx_REG (Pmode,
9692 x86_64_int_parameter_registers[i]);
9693 rtx bounds;
9695 if (bnd_reg <= LAST_BND_REG)
9696 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9697 else
9699 rtx ldx_addr =
9700 plus_constant (Pmode, arg_pointer_rtx,
9701 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9702 bounds = gen_reg_rtx (BNDmode);
9703 emit_insn (BNDmode == BND64mode
9704 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9705 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9708 emit_insn (BNDmode == BND64mode
9709 ? gen_bnd64_stx (addr, ptr, bounds)
9710 : gen_bnd32_stx (addr, ptr, bounds));
9712 bnd_reg++;
9717 /* Checks if TYPE is of kind va_list char *. */
9719 static bool
9720 is_va_list_char_pointer (tree type)
9722 tree canonic;
9724 /* For 32-bit it is always true. */
9725 if (!TARGET_64BIT)
9726 return true;
9727 canonic = ix86_canonical_va_list_type (type);
9728 return (canonic == ms_va_list_type_node
9729 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9732 /* Implement va_start. */
9734 static void
9735 ix86_va_start (tree valist, rtx nextarg)
9737 HOST_WIDE_INT words, n_gpr, n_fpr;
9738 tree f_gpr, f_fpr, f_ovf, f_sav;
9739 tree gpr, fpr, ovf, sav, t;
9740 tree type;
9741 rtx ovf_rtx;
9743 if (flag_split_stack
9744 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9746 unsigned int scratch_regno;
9748 /* When we are splitting the stack, we can't refer to the stack
9749 arguments using internal_arg_pointer, because they may be on
9750 the old stack. The split stack prologue will arrange to
9751 leave a pointer to the old stack arguments in a scratch
9752 register, which we here copy to a pseudo-register. The split
9753 stack prologue can't set the pseudo-register directly because
9754 it (the prologue) runs before any registers have been saved. */
9756 scratch_regno = split_stack_prologue_scratch_regno ();
9757 if (scratch_regno != INVALID_REGNUM)
9759 rtx reg;
9760 rtx_insn *seq;
9762 reg = gen_reg_rtx (Pmode);
9763 cfun->machine->split_stack_varargs_pointer = reg;
9765 start_sequence ();
9766 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9767 seq = get_insns ();
9768 end_sequence ();
9770 push_topmost_sequence ();
9771 emit_insn_after (seq, entry_of_function ());
9772 pop_topmost_sequence ();
9776 /* Only 64bit target needs something special. */
9777 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9779 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9780 std_expand_builtin_va_start (valist, nextarg);
9781 else
9783 rtx va_r, next;
9785 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9786 next = expand_binop (ptr_mode, add_optab,
9787 cfun->machine->split_stack_varargs_pointer,
9788 crtl->args.arg_offset_rtx,
9789 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9790 convert_move (va_r, next, 0);
9792 /* Store zero bounds for va_list. */
9793 if (chkp_function_instrumented_p (current_function_decl))
9794 chkp_expand_bounds_reset_for_mem (valist,
9795 make_tree (TREE_TYPE (valist),
9796 next));
9799 return;
9802 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9803 f_fpr = DECL_CHAIN (f_gpr);
9804 f_ovf = DECL_CHAIN (f_fpr);
9805 f_sav = DECL_CHAIN (f_ovf);
9807 valist = build_simple_mem_ref (valist);
9808 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9809 /* The following should be folded into the MEM_REF offset. */
9810 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9811 f_gpr, NULL_TREE);
9812 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9813 f_fpr, NULL_TREE);
9814 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9815 f_ovf, NULL_TREE);
9816 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9817 f_sav, NULL_TREE);
9819 /* Count number of gp and fp argument registers used. */
9820 words = crtl->args.info.words;
9821 n_gpr = crtl->args.info.regno;
9822 n_fpr = crtl->args.info.sse_regno;
9824 if (cfun->va_list_gpr_size)
9826 type = TREE_TYPE (gpr);
9827 t = build2 (MODIFY_EXPR, type,
9828 gpr, build_int_cst (type, n_gpr * 8));
9829 TREE_SIDE_EFFECTS (t) = 1;
9830 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9833 if (TARGET_SSE && cfun->va_list_fpr_size)
9835 type = TREE_TYPE (fpr);
9836 t = build2 (MODIFY_EXPR, type, fpr,
9837 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9838 TREE_SIDE_EFFECTS (t) = 1;
9839 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
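  /* Note that fp_offset is biased by the full 8 * X86_64_REGPARM_MAX
     (48 byte) GPR area, so reg_save_area + fp_offset addresses the first
     unconsumed SSE slot no matter how many GPR slots were used.  */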
9842 /* Find the overflow area. */
9843 type = TREE_TYPE (ovf);
9844 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9845 ovf_rtx = crtl->args.internal_arg_pointer;
9846 else
9847 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9848 t = make_tree (type, ovf_rtx);
9849 if (words != 0)
9850 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9852 /* Store zero bounds for overflow area pointer. */
9853 if (chkp_function_instrumented_p (current_function_decl))
9854 chkp_expand_bounds_reset_for_mem (ovf, t);
9856 t = build2 (MODIFY_EXPR, type, ovf, t);
9857 TREE_SIDE_EFFECTS (t) = 1;
9858 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9860 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9862 /* Find the register save area.
9863 The function prologue saves it right above the stack frame. */
9864 type = TREE_TYPE (sav);
9865 t = make_tree (type, frame_pointer_rtx);
9866 if (!ix86_varargs_gpr_size)
9867 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
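      /* The adjustment above compensates for fp_offset (set earlier in
	 this function) always carrying the 48 byte GPR bias: when no GPR
	 slots were saved, the SSE slots start right at the frame pointer,
	 so the save area pointer is backed up by 48 bytes to keep
	 reg_save_area + fp_offset pointing at the SSE slots.  */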
9869 /* Store zero bounds for save area pointer. */
9870 if (chkp_function_instrumented_p (current_function_decl))
9871 chkp_expand_bounds_reset_for_mem (sav, t);
9873 t = build2 (MODIFY_EXPR, type, sav, t);
9874 TREE_SIDE_EFFECTS (t) = 1;
9875 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9879 /* Implement va_arg. */
9881 static tree
9882 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9883 gimple_seq *post_p)
9885 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
9886 tree f_gpr, f_fpr, f_ovf, f_sav;
9887 tree gpr, fpr, ovf, sav, t;
9888 int size, rsize;
9889 tree lab_false, lab_over = NULL_TREE;
9890 tree addr, t2;
9891 rtx container;
9892 int indirect_p = 0;
9893 tree ptrtype;
9894 machine_mode nat_mode;
9895 unsigned int arg_boundary;
9897 /* Only 64bit target needs something special. */
9898 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9899 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
9901 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9902 f_fpr = DECL_CHAIN (f_gpr);
9903 f_ovf = DECL_CHAIN (f_fpr);
9904 f_sav = DECL_CHAIN (f_ovf);
9906 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
9907 valist, f_gpr, NULL_TREE);
9909 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
9910 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
9911 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
9913 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9914 if (indirect_p)
9915 type = build_pointer_type (type);
9916 size = arg_int_size_in_bytes (type);
9917 rsize = CEIL (size, UNITS_PER_WORD);
9919 nat_mode = type_natural_mode (type, NULL, false);
9920 switch (nat_mode)
9922 case E_V8SFmode:
9923 case E_V8SImode:
9924 case E_V32QImode:
9925 case E_V16HImode:
9926 case E_V4DFmode:
9927 case E_V4DImode:
9928 case E_V16SFmode:
9929 case E_V16SImode:
9930 case E_V64QImode:
9931 case E_V32HImode:
9932 case E_V8DFmode:
9933 case E_V8DImode:
9934 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
9935 if (!TARGET_64BIT_MS_ABI)
9937 container = NULL;
9938 break;
9940 /* FALLTHRU */
9942 default:
9943 container = construct_container (nat_mode, TYPE_MODE (type),
9944 type, 0, X86_64_REGPARM_MAX,
9945 X86_64_SSE_REGPARM_MAX, intreg,
9947 break;
9950 /* Pull the value out of the saved registers. */
9952 addr = create_tmp_var (ptr_type_node, "addr");
9954 if (container)
9956 int needed_intregs, needed_sseregs;
9957 bool need_temp;
9958 tree int_addr, sse_addr;
9960 lab_false = create_artificial_label (UNKNOWN_LOCATION);
9961 lab_over = create_artificial_label (UNKNOWN_LOCATION);
9963 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
9965 need_temp = (!REG_P (container)
9966 && ((needed_intregs && TYPE_ALIGN (type) > 64)
9967 || TYPE_ALIGN (type) > 128));
9969 /* In case we are passing a structure, verify that it is a consecutive block
9970 on the register save area.  If not, we need to do moves. */
9971 if (!need_temp && !REG_P (container))
9973 /* Verify that all registers are strictly consecutive */
9974 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
9976 int i;
9978 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9980 rtx slot = XVECEXP (container, 0, i);
9981 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
9982 || INTVAL (XEXP (slot, 1)) != i * 16)
9983 need_temp = true;
9986 else
9988 int i;
9990 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9992 rtx slot = XVECEXP (container, 0, i);
9993 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
9994 || INTVAL (XEXP (slot, 1)) != i * 8)
9995 need_temp = true;
9999 if (!need_temp)
10001 int_addr = addr;
10002 sse_addr = addr;
10004 else
10006 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10007 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10010 /* First ensure that we fit completely in registers. */
10011 if (needed_intregs)
10013 t = build_int_cst (TREE_TYPE (gpr),
10014 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10015 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10016 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10017 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10018 gimplify_and_add (t, pre_p);
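	  /* For example, a single integer argument gives a threshold of
	     (6 - 1 + 1) * 8 == 48, so the branch to lab_false is taken
	     exactly when gp_offset says all six GPR slots are consumed.  */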
10020 if (needed_sseregs)
10022 t = build_int_cst (TREE_TYPE (fpr),
10023 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10024 + X86_64_REGPARM_MAX * 8);
10025 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10026 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10027 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10028 gimplify_and_add (t, pre_p);
10031 /* Compute index to start of area used for integer regs. */
10032 if (needed_intregs)
10034 /* int_addr = gpr + sav; */
10035 t = fold_build_pointer_plus (sav, gpr);
10036 gimplify_assign (int_addr, t, pre_p);
10038 if (needed_sseregs)
10040 /* sse_addr = fpr + sav; */
10041 t = fold_build_pointer_plus (sav, fpr);
10042 gimplify_assign (sse_addr, t, pre_p);
10044 if (need_temp)
10046 int i, prev_size = 0;
10047 tree temp = create_tmp_var (type, "va_arg_tmp");
10049 /* addr = &temp; */
10050 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10051 gimplify_assign (addr, t, pre_p);
10053 for (i = 0; i < XVECLEN (container, 0); i++)
10055 rtx slot = XVECEXP (container, 0, i);
10056 rtx reg = XEXP (slot, 0);
10057 machine_mode mode = GET_MODE (reg);
10058 tree piece_type;
10059 tree addr_type;
10060 tree daddr_type;
10061 tree src_addr, src;
10062 int src_offset;
10063 tree dest_addr, dest;
10064 int cur_size = GET_MODE_SIZE (mode);
10066 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10067 prev_size = INTVAL (XEXP (slot, 1));
10068 if (prev_size + cur_size > size)
10070 cur_size = size - prev_size;
10071 unsigned int nbits = cur_size * BITS_PER_UNIT;
10072 if (!int_mode_for_size (nbits, 1).exists (&mode))
10073 mode = QImode;
10075 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10076 if (mode == GET_MODE (reg))
10077 addr_type = build_pointer_type (piece_type);
10078 else
10079 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10080 true);
10081 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10082 true);
10084 if (SSE_REGNO_P (REGNO (reg)))
10086 src_addr = sse_addr;
10087 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10089 else
10091 src_addr = int_addr;
10092 src_offset = REGNO (reg) * 8;
10094 src_addr = fold_convert (addr_type, src_addr);
10095 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10097 dest_addr = fold_convert (daddr_type, addr);
10098 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10099 if (cur_size == GET_MODE_SIZE (mode))
10101 src = build_va_arg_indirect_ref (src_addr);
10102 dest = build_va_arg_indirect_ref (dest_addr);
10104 gimplify_assign (dest, src, pre_p);
10106 else
10108 tree copy
10109 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10110 3, dest_addr, src_addr,
10111 size_int (cur_size));
10112 gimplify_and_add (copy, pre_p);
10114 prev_size += cur_size;
10118 if (needed_intregs)
10120 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10121 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10122 gimplify_assign (gpr, t, pre_p);
10125 if (needed_sseregs)
10127 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10128 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10129 gimplify_assign (unshare_expr (fpr), t, pre_p);
10132 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10134 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10137 /* ... otherwise out of the overflow area. */
10139 /* When we align a parameter on the stack for the caller, if its
10140 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
10141 aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  We match the callee
10142 here with the caller. */
10143 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10144 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10145 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10147 /* Care for on-stack alignment if needed. */
10148 if (arg_boundary <= 64 || size == 0)
10149 t = ovf;
10150 else
10152 HOST_WIDE_INT align = arg_boundary / 8;
10153 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10154 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10155 build_int_cst (TREE_TYPE (t), -align));
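      /* This is the usual alignment idiom, addr = (ovf + align - 1) & -align,
	 with align expressed in bytes.  */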
10158 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10159 gimplify_assign (addr, t, pre_p);
10161 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10162 gimplify_assign (unshare_expr (ovf), t, pre_p);
10164 if (container)
10165 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10167 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10168 addr = fold_convert (ptrtype, addr);
10170 if (indirect_p)
10171 addr = build_va_arg_indirect_ref (addr);
10172 return build_va_arg_indirect_ref (addr);
10175 /* Return true if OPNUM's MEM should be matched
10176 in movabs* patterns. */
10178 bool
10179 ix86_check_movabs (rtx insn, int opnum)
10181 rtx set, mem;
10183 set = PATTERN (insn);
10184 if (GET_CODE (set) == PARALLEL)
10185 set = XVECEXP (set, 0, 0);
10186 gcc_assert (GET_CODE (set) == SET);
10187 mem = XEXP (set, opnum);
10188 while (SUBREG_P (mem))
10189 mem = SUBREG_REG (mem);
10190 gcc_assert (MEM_P (mem));
10191 return volatile_ok || !MEM_VOLATILE_P (mem);
10194 /* Return false if INSN contains a MEM with a non-default address space. */
10195 bool
10196 ix86_check_no_addr_space (rtx insn)
10198 subrtx_var_iterator::array_type array;
10199 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10201 rtx x = *iter;
10202 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10203 return false;
10205 return true;
10208 /* Initialize the table of extra 80387 mathematical constants. */
10210 static void
10211 init_ext_80387_constants (void)
10213 static const char * cst[5] =
10215 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10216 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10217 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10218 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10219 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10221 int i;
10223 for (i = 0; i < 5; i++)
10225 real_from_string (&ext_80387_constants_table[i], cst[i]);
10226 /* Ensure each constant is rounded to XFmode precision. */
10227 real_convert (&ext_80387_constants_table[i],
10228 XFmode, &ext_80387_constants_table[i]);
10231 ext_80387_constants_init = 1;
10234 /* Return non-zero if the constant is something that
10235 can be loaded with a special instruction. */
10238 standard_80387_constant_p (rtx x)
10240 machine_mode mode = GET_MODE (x);
10242 const REAL_VALUE_TYPE *r;
10244 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10245 return -1;
10247 if (x == CONST0_RTX (mode))
10248 return 1;
10249 if (x == CONST1_RTX (mode))
10250 return 2;
10252 r = CONST_DOUBLE_REAL_VALUE (x);
10254 /* For XFmode constants, try to find a special 80387 instruction when
10255 optimizing for size or on those CPUs that benefit from them. */
10256 if (mode == XFmode
10257 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10259 int i;
10261 if (! ext_80387_constants_init)
10262 init_ext_80387_constants ();
10264 for (i = 0; i < 5; i++)
10265 if (real_identical (r, &ext_80387_constants_table[i]))
10266 return i + 3;
10269 /* Load of the constant -0.0 or -1.0 will be split as
10270 fldz;fchs or fld1;fchs sequence. */
10271 if (real_isnegzero (r))
10272 return 8;
10273 if (real_identical (r, &dconstm1))
10274 return 9;
10276 return 0;
10279 /* Return the opcode of the special instruction to be used to load
10280 the constant X. */
10282 const char *
10283 standard_80387_constant_opcode (rtx x)
10285 switch (standard_80387_constant_p (x))
10287 case 1:
10288 return "fldz";
10289 case 2:
10290 return "fld1";
10291 case 3:
10292 return "fldlg2";
10293 case 4:
10294 return "fldln2";
10295 case 5:
10296 return "fldl2e";
10297 case 6:
10298 return "fldl2t";
10299 case 7:
10300 return "fldpi";
10301 case 8:
10302 case 9:
10303 return "#";
10304 default:
10305 gcc_unreachable ();
10309 /* Return the CONST_DOUBLE representing the 80387 constant that is
10310 loaded by the specified special instruction. The argument IDX
10311 matches the return value from standard_80387_constant_p. */
10314 standard_80387_constant_rtx (int idx)
10316 int i;
10318 if (! ext_80387_constants_init)
10319 init_ext_80387_constants ();
10321 switch (idx)
10323 case 3:
10324 case 4:
10325 case 5:
10326 case 6:
10327 case 7:
10328 i = idx - 3;
10329 break;
10331 default:
10332 gcc_unreachable ();
10335 return const_double_from_real_value (ext_80387_constants_table[i],
10336 XFmode);
10339 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
10340 in supported SSE/AVX vector mode. */
10343 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10345 machine_mode mode;
10347 if (!TARGET_SSE)
10348 return 0;
10350 mode = GET_MODE (x);
10352 if (x == const0_rtx || const0_operand (x, mode))
10353 return 1;
10355 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10357 /* VOIDmode integer constant, get mode from the predicate. */
10358 if (mode == VOIDmode)
10359 mode = pred_mode;
10361 switch (GET_MODE_SIZE (mode))
10363 case 64:
10364 if (TARGET_AVX512F)
10365 return 2;
10366 break;
10367 case 32:
10368 if (TARGET_AVX2)
10369 return 2;
10370 break;
10371 case 16:
10372 if (TARGET_SSE2)
10373 return 2;
10374 break;
10375 case 0:
10376 /* VOIDmode */
10377 gcc_unreachable ();
10378 default:
10379 break;
10383 return 0;
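/* So the return value encodes: 0 - not a standard SSE constant,
   1 - all-zeros (loadable with a register-clearing xor), 2 - all-ones
   (loadable with pcmpeqd/vpternlogd), matching the cases handled by
   standard_sse_constant_opcode below.  */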
10386 /* Return the opcode of the special instruction to be used to load
10387 the constant operands[1] into operands[0]. */
10389 const char *
10390 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10392 machine_mode mode;
10393 rtx x = operands[1];
10395 gcc_assert (TARGET_SSE);
10397 mode = GET_MODE (x);
10399 if (x == const0_rtx || const0_operand (x, mode))
10401 switch (get_attr_mode (insn))
10403 case MODE_TI:
10404 if (!EXT_REX_SSE_REG_P (operands[0]))
10405 return "%vpxor\t%0, %d0";
10406 /* FALLTHRU */
10407 case MODE_XI:
10408 case MODE_OI:
10409 if (EXT_REX_SSE_REG_P (operands[0]))
10410 return (TARGET_AVX512VL
10411 ? "vpxord\t%x0, %x0, %x0"
10412 : "vpxord\t%g0, %g0, %g0");
10413 return "vpxor\t%x0, %x0, %x0";
10415 case MODE_V2DF:
10416 if (!EXT_REX_SSE_REG_P (operands[0]))
10417 return "%vxorpd\t%0, %d0";
10418 /* FALLTHRU */
10419 case MODE_V8DF:
10420 case MODE_V4DF:
10421 if (!EXT_REX_SSE_REG_P (operands[0]))
10422 return "vxorpd\t%x0, %x0, %x0";
10423 else if (TARGET_AVX512DQ)
10424 return (TARGET_AVX512VL
10425 ? "vxorpd\t%x0, %x0, %x0"
10426 : "vxorpd\t%g0, %g0, %g0");
10427 else
10428 return (TARGET_AVX512VL
10429 ? "vpxorq\t%x0, %x0, %x0"
10430 : "vpxorq\t%g0, %g0, %g0");
10432 case MODE_V4SF:
10433 if (!EXT_REX_SSE_REG_P (operands[0]))
10434 return "%vxorps\t%0, %d0";
10435 /* FALLTHRU */
10436 case MODE_V16SF:
10437 case MODE_V8SF:
10438 if (!EXT_REX_SSE_REG_P (operands[0]))
10439 return "vxorps\t%x0, %x0, %x0";
10440 else if (TARGET_AVX512DQ)
10441 return (TARGET_AVX512VL
10442 ? "vxorps\t%x0, %x0, %x0"
10443 : "vxorps\t%g0, %g0, %g0");
10444 else
10445 return (TARGET_AVX512VL
10446 ? "vpxord\t%x0, %x0, %x0"
10447 : "vpxord\t%g0, %g0, %g0");
10449 default:
10450 gcc_unreachable ();
10453 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10455 enum attr_mode insn_mode = get_attr_mode (insn);
10457 switch (insn_mode)
10459 case MODE_XI:
10460 case MODE_V8DF:
10461 case MODE_V16SF:
10462 gcc_assert (TARGET_AVX512F);
10463 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10465 case MODE_OI:
10466 case MODE_V4DF:
10467 case MODE_V8SF:
10468 gcc_assert (TARGET_AVX2);
10469 /* FALLTHRU */
10470 case MODE_TI:
10471 case MODE_V2DF:
10472 case MODE_V4SF:
10473 gcc_assert (TARGET_SSE2);
10474 if (!EXT_REX_SSE_REG_P (operands[0]))
10475 return (TARGET_AVX
10476 ? "vpcmpeqd\t%0, %0, %0"
10477 : "pcmpeqd\t%0, %0");
10478 else if (TARGET_AVX512VL)
10479 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10480 else
10481 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10483 default:
10484 gcc_unreachable ();
10488 gcc_unreachable ();
10491 /* Returns true if INSN can be transformed from a memory load
10492 to a supported FP constant load. */
10494 bool
10495 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10497 rtx src = find_constant_src (insn);
10499 gcc_assert (REG_P (dst));
10501 if (src == NULL
10502 || (SSE_REGNO_P (REGNO (dst))
10503 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10504 || (STACK_REGNO_P (REGNO (dst))
10505 && standard_80387_constant_p (src) < 1))
10506 return false;
10508 return true;
10511 /* Returns true if OP contains a symbol reference */
10513 bool
10514 symbolic_reference_mentioned_p (rtx op)
10516 const char *fmt;
10517 int i;
10519 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10520 return true;
10522 fmt = GET_RTX_FORMAT (GET_CODE (op));
10523 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10525 if (fmt[i] == 'E')
10527 int j;
10529 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10530 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10531 return true;
10534 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10535 return true;
10538 return false;
10541 /* Return true if it is appropriate to emit `ret' instructions in the
10542 body of a function. Do this only if the epilogue is simple, needing a
10543 couple of insns. Prior to reloading, we can't tell how many registers
10544 must be saved, so return false then. Return false if there is no frame
10545 marker to de-allocate. */
10547 bool
10548 ix86_can_use_return_insn_p (void)
10550 if (ix86_function_naked (current_function_decl))
10551 return false;
10553 /* Don't use `ret' instruction in interrupt handler. */
10554 if (! reload_completed
10555 || frame_pointer_needed
10556 || cfun->machine->func_type != TYPE_NORMAL)
10557 return 0;
10559 /* Don't allow more than 32k pop, since that's all we can do
10560 with one instruction. */
10561 if (crtl->args.pops_args && crtl->args.size >= 32768)
10562 return 0;
10564 struct ix86_frame &frame = cfun->machine->frame;
10565 return (frame.stack_pointer_offset == UNITS_PER_WORD
10566 && (frame.nregs + frame.nsseregs) == 0);
10569 /* Value should be nonzero if functions must have frame pointers.
10570 Zero means the frame pointer need not be set up (and parms may
10571 be accessed via the stack pointer) in functions that seem suitable. */
10573 static bool
10574 ix86_frame_pointer_required (void)
10576 /* If we accessed previous frames, then the generated code expects
10577 to be able to access the saved ebp value in our frame. */
10578 if (cfun->machine->accesses_prev_frame)
10579 return true;
10581 /* Several x86 OSes need a frame pointer for other reasons,
10582 usually pertaining to setjmp. */
10583 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10584 return true;
10586 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
10587 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10588 return true;
10590 /* Under Win64 SEH, very large frames need a frame pointer since the
10591 maximum stack allocation is 4GB. */
10592 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10593 return true;
10595 /* SSE saves require frame-pointer when stack is misaligned. */
10596 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10597 return true;
10599 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10600 turns off the frame pointer by default. Turn it back on now if
10601 we've not got a leaf function. */
10602 if (TARGET_OMIT_LEAF_FRAME_POINTER
10603 && (!crtl->is_leaf
10604 || ix86_current_function_calls_tls_descriptor))
10605 return true;
10607 if (crtl->profile && !flag_fentry)
10608 return true;
10610 return false;
10613 /* Record that the current function accesses previous call frames. */
10615 void
10616 ix86_setup_frame_addresses (void)
10618 cfun->machine->accesses_prev_frame = 1;
10621 #ifndef USE_HIDDEN_LINKONCE
10622 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10623 # define USE_HIDDEN_LINKONCE 1
10624 # else
10625 # define USE_HIDDEN_LINKONCE 0
10626 # endif
10627 #endif
10629 static int pic_labels_used;
10631 /* Fills in the label name that should be used for a pc thunk for
10632 the given register. */
10634 static void
10635 get_pc_thunk_name (char name[32], unsigned int regno)
10637 gcc_assert (!TARGET_64BIT);
10639 if (USE_HIDDEN_LINKONCE)
10640 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
10641 else
10642 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
10646 /* This function generates code for -fpic that loads %ebx with
10647 the return address of the caller and then returns. */
10649 static void
10650 ix86_code_end (void)
10652 rtx xops[2];
10653 int regno;
10655 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
10657 char name[32];
10658 tree decl;
10660 if (!(pic_labels_used & (1 << regno)))
10661 continue;
10663 get_pc_thunk_name (name, regno);
10665 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10666 get_identifier (name),
10667 build_function_type_list (void_type_node, NULL_TREE));
10668 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10669 NULL_TREE, void_type_node);
10670 TREE_PUBLIC (decl) = 1;
10671 TREE_STATIC (decl) = 1;
10672 DECL_IGNORED_P (decl) = 1;
10674 #if TARGET_MACHO
10675 if (TARGET_MACHO)
10677 switch_to_section (darwin_sections[picbase_thunk_section]);
10678 fputs ("\t.weak_definition\t", asm_out_file);
10679 assemble_name (asm_out_file, name);
10680 fputs ("\n\t.private_extern\t", asm_out_file);
10681 assemble_name (asm_out_file, name);
10682 putc ('\n', asm_out_file);
10683 ASM_OUTPUT_LABEL (asm_out_file, name);
10684 DECL_WEAK (decl) = 1;
10686 else
10687 #endif
10688 if (USE_HIDDEN_LINKONCE)
10690 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10692 targetm.asm_out.unique_section (decl, 0);
10693 switch_to_section (get_named_section (decl, NULL, 0));
10695 targetm.asm_out.globalize_label (asm_out_file, name);
10696 fputs ("\t.hidden\t", asm_out_file);
10697 assemble_name (asm_out_file, name);
10698 putc ('\n', asm_out_file);
10699 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10701 else
10703 switch_to_section (text_section);
10704 ASM_OUTPUT_LABEL (asm_out_file, name);
10707 DECL_INITIAL (decl) = make_node (BLOCK);
10708 current_function_decl = decl;
10709 allocate_struct_function (decl, false);
10710 init_function_start (decl);
10711 /* We're about to hide the function body from callees of final_* by
10712 emitting it directly; tell them we're a thunk, if they care. */
10713 cfun->is_thunk = true;
10714 first_function_block_is_cold = false;
10715 /* Make sure unwind info is emitted for the thunk if needed. */
10716 final_start_function (emit_barrier (), asm_out_file, 1);
10718 /* Pad stack IP move with 4 instructions (two NOPs count
10719 as one instruction). */
10720 if (TARGET_PAD_SHORT_FUNCTION)
10722 int i = 8;
10724 while (i--)
10725 fputs ("\tnop\n", asm_out_file);
10728 xops[0] = gen_rtx_REG (Pmode, regno);
10729 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
10730 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
10731 output_asm_insn ("%!ret", NULL);
10732 final_end_function ();
10733 init_insn_lengths ();
10734 free_after_compilation (cfun);
10735 set_cfun (NULL);
10736 current_function_decl = NULL;
10739 if (flag_split_stack)
10740 file_end_indicate_split_stack ();
10743 /* Emit code for the SET_GOT patterns. */
10745 const char *
10746 output_set_got (rtx dest, rtx label)
10748 rtx xops[3];
10750 xops[0] = dest;
10752 if (TARGET_VXWORKS_RTP && flag_pic)
10754 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
10755 xops[2] = gen_rtx_MEM (Pmode,
10756 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
10757 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
10759 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
10760 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
10761 an unadorned address. */
10762 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
10763 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
10764 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
10765 return "";
10768 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
10770 if (flag_pic)
10772 char name[32];
10773 get_pc_thunk_name (name, REGNO (dest));
10774 pic_labels_used |= 1 << REGNO (dest);
10776 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
10777 xops[2] = gen_rtx_MEM (QImode, xops[2]);
10778 output_asm_insn ("%!call\t%X2", xops);
10780 #if TARGET_MACHO
10781 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
10782 This is what will be referenced by the Mach-O PIC subsystem. */
10783 if (machopic_should_output_picbase_label () || !label)
10784 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
10786 /* When we are restoring the pic base at the site of a nonlocal label,
10787 and we decided to emit the pic base above, we will still output a
10788 local label used for calculating the correction offset (even though
10789 the offset will be 0 in that case). */
10790 if (label)
10791 targetm.asm_out.internal_label (asm_out_file, "L",
10792 CODE_LABEL_NUMBER (label));
10793 #endif
10795 else
10797 if (TARGET_MACHO)
10798 /* We don't need a pic base, we're not producing pic. */
10799 gcc_unreachable ();
10801 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
10802 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
10803 targetm.asm_out.internal_label (asm_out_file, "L",
10804 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
10807 if (!TARGET_MACHO)
10808 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
10810 return "";
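/* For the common ia32 -fpic case with %ebx as the PIC register, the
   sequence emitted above is typically

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   with the thunk itself emitted by ix86_code_end above.  */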
10813 /* Generate a "push" pattern for input ARG. */
10815 static rtx
10816 gen_push (rtx arg)
10818 struct machine_function *m = cfun->machine;
10820 if (m->fs.cfa_reg == stack_pointer_rtx)
10821 m->fs.cfa_offset += UNITS_PER_WORD;
10822 m->fs.sp_offset += UNITS_PER_WORD;
10824 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10825 arg = gen_rtx_REG (word_mode, REGNO (arg));
10827 return gen_rtx_SET (gen_rtx_MEM (word_mode,
10828 gen_rtx_PRE_DEC (Pmode,
10829 stack_pointer_rtx)),
10830 arg);
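/* The pattern produced is simply

     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))

   on 64-bit targets (word_mode == DImode), with the m->fs bookkeeping
   above keeping the frame state in sync with the push.  */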
10833 /* Generate a "pop" pattern for input ARG. */
10835 static rtx
10836 gen_pop (rtx arg)
10838 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10839 arg = gen_rtx_REG (word_mode, REGNO (arg));
10841 return gen_rtx_SET (arg,
10842 gen_rtx_MEM (word_mode,
10843 gen_rtx_POST_INC (Pmode,
10844 stack_pointer_rtx)));
10847 /* Return >= 0 if there is an unused call-clobbered register available
10848 for the entire function. */
10850 static unsigned int
10851 ix86_select_alt_pic_regnum (void)
10853 if (ix86_use_pseudo_pic_reg ())
10854 return INVALID_REGNUM;
10856 if (crtl->is_leaf
10857 && !crtl->profile
10858 && !ix86_current_function_calls_tls_descriptor)
10860 int i, drap;
10861 /* Can't use the same register for both PIC and DRAP. */
10862 if (crtl->drap_reg)
10863 drap = REGNO (crtl->drap_reg);
10864 else
10865 drap = -1;
10866 for (i = 2; i >= 0; --i)
10867 if (i != drap && !df_regs_ever_live_p (i))
10868 return i;
10871 return INVALID_REGNUM;
10874 /* Return true if REGNO is used by the epilogue. */
10876 bool
10877 ix86_epilogue_uses (int regno)
10879 /* If there are no caller-saved registers, we preserve all registers,
10880 except for MMX and x87 registers which aren't supported when saving
10881 and restoring registers. Don't explicitly save SP register since
10882 it is always preserved. */
10883 return (epilogue_completed
10884 && cfun->machine->no_caller_saved_registers
10885 && !fixed_regs[regno]
10886 && !STACK_REGNO_P (regno)
10887 && !MMX_REGNO_P (regno));
10890 /* Return nonzero if register REGNO can be used as a scratch register
10891 in peephole2. */
10893 static bool
10894 ix86_hard_regno_scratch_ok (unsigned int regno)
10896 /* If there are no caller-saved registers, we can't use any register
10897 as a scratch register after epilogue and use REGNO as scratch
10898 register only if it has been used before to avoid saving and
10899 restoring it. */
10900 return (!cfun->machine->no_caller_saved_registers
10901 || (!epilogue_completed
10902 && df_regs_ever_live_p (regno)));
10905 /* Return true if register class CL should be an additional allocno
10906 class. */
10908 static bool
10909 ix86_additional_allocno_class_p (reg_class_t cl)
10911 return cl == MOD4_SSE_REGS;
10914 /* Return TRUE if we need to save REGNO. */
10916 static bool
10917 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
10919 /* If there are no caller-saved registers, we preserve all registers,
10920 except for MMX and x87 registers which aren't supported when saving
10921 and restoring registers. Don't explicitly save SP register since
10922 it is always preserved. */
10923 if (cfun->machine->no_caller_saved_registers)
10925 /* Don't preserve registers used for function return value. */
10926 rtx reg = crtl->return_rtx;
10927 if (reg)
10929 unsigned int i = REGNO (reg);
10930 unsigned int nregs = REG_NREGS (reg);
10931 while (nregs-- > 0)
10932 if ((i + nregs) == regno)
10933 return false;
10935 reg = crtl->return_bnd;
10936 if (reg)
10938 i = REGNO (reg);
10939 nregs = REG_NREGS (reg);
10940 while (nregs-- > 0)
10941 if ((i + nregs) == regno)
10942 return false;
10946 return (df_regs_ever_live_p (regno)
10947 && !fixed_regs[regno]
10948 && !STACK_REGNO_P (regno)
10949 && !MMX_REGNO_P (regno)
10950 && (regno != HARD_FRAME_POINTER_REGNUM
10951 || !frame_pointer_needed));
10954 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
10955 && pic_offset_table_rtx)
10957 if (ix86_use_pseudo_pic_reg ())
10959 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
10960 _mcount in prologue. */
10961 if (!TARGET_64BIT && flag_pic && crtl->profile)
10962 return true;
10964 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10965 || crtl->profile
10966 || crtl->calls_eh_return
10967 || crtl->uses_const_pool
10968 || cfun->has_nonlocal_label)
10969 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
10972 if (crtl->calls_eh_return && maybe_eh_return)
10974 unsigned i;
10975 for (i = 0; ; i++)
10977 unsigned test = EH_RETURN_DATA_REGNO (i);
10978 if (test == INVALID_REGNUM)
10979 break;
10980 if (test == regno)
10981 return true;
10985 if (ignore_outlined && cfun->machine->call_ms2sysv)
10987 unsigned count = cfun->machine->call_ms2sysv_extra_regs
10988 + xlogue_layout::MIN_REGS;
10989 if (xlogue_layout::is_stub_managed_reg (regno, count))
10990 return false;
10993 if (crtl->drap_reg
10994 && regno == REGNO (crtl->drap_reg)
10995 && !cfun->machine->no_drap_save_restore)
10996 return true;
10998 return (df_regs_ever_live_p (regno)
10999 && !call_used_regs[regno]
11000 && !fixed_regs[regno]
11001 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11004 /* Return number of saved general purpose registers. */
11006 static int
11007 ix86_nsaved_regs (void)
11009 int nregs = 0;
11010 int regno;
11012 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11013 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11014 nregs ++;
11015 return nregs;
11018 /* Return number of saved SSE registers. */
11020 static int
11021 ix86_nsaved_sseregs (void)
11023 int nregs = 0;
11024 int regno;
11026 if (!TARGET_64BIT_MS_ABI)
11027 return 0;
11028 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11029 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11030 nregs ++;
11031 return nregs;
11034 /* Given FROM and TO register numbers, say whether this elimination is
11035 allowed. If stack alignment is needed, we can only replace argument
11036 pointer with hard frame pointer, or replace frame pointer with stack
11037 pointer. Otherwise, frame pointer elimination is automatically
11038 handled and all other eliminations are valid. */
11040 static bool
11041 ix86_can_eliminate (const int from, const int to)
11043 if (stack_realign_fp)
11044 return ((from == ARG_POINTER_REGNUM
11045 && to == HARD_FRAME_POINTER_REGNUM)
11046 || (from == FRAME_POINTER_REGNUM
11047 && to == STACK_POINTER_REGNUM));
11048 else
11049 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11052 /* Return the offset between two registers, one to be eliminated, and the other
11053 its replacement, at the start of a routine. */
11055 HOST_WIDE_INT
11056 ix86_initial_elimination_offset (int from, int to)
11058 struct ix86_frame &frame = cfun->machine->frame;
11060 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11061 return frame.hard_frame_pointer_offset;
11062 else if (from == FRAME_POINTER_REGNUM
11063 && to == HARD_FRAME_POINTER_REGNUM)
11064 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11065 else
11067 gcc_assert (to == STACK_POINTER_REGNUM);
11069 if (from == ARG_POINTER_REGNUM)
11070 return frame.stack_pointer_offset;
11072 gcc_assert (from == FRAME_POINTER_REGNUM);
11073 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11077 /* In a dynamically-aligned function, we can't know the offset from
11078 stack pointer to frame pointer, so we must ensure that setjmp
11079 eliminates fp against the hard fp (%ebp) rather than trying to
11080 index from %esp up to the top of the frame across a gap that is
11081 of unknown (at compile-time) size. */
11082 static rtx
11083 ix86_builtin_setjmp_frame_value (void)
11085 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11088 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11089 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11091 static bool warned_once = false;
11092 if (!warned_once)
11094 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11095 feature);
11096 warned_once = true;
11100 /* When using -fsplit-stack, the allocation routines set a field in
11101 the TCB to the bottom of the stack plus this much space, measured
11102 in bytes. */
11104 #define SPLIT_STACK_AVAILABLE 256
11106 /* Fill structure ix86_frame about frame of currently computed function. */
11108 static void
11109 ix86_compute_frame_layout (void)
11111 struct ix86_frame *frame = &cfun->machine->frame;
11112 struct machine_function *m = cfun->machine;
11113 unsigned HOST_WIDE_INT stack_alignment_needed;
11114 HOST_WIDE_INT offset;
11115 unsigned HOST_WIDE_INT preferred_alignment;
11116 HOST_WIDE_INT size = get_frame_size ();
11117 HOST_WIDE_INT to_allocate;
11119 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11120 * ms_abi functions that call a sysv function. We now need to prune away
11121 * cases where it should be disabled. */
11122 if (TARGET_64BIT && m->call_ms2sysv)
11124 gcc_assert (TARGET_64BIT_MS_ABI);
11125 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11126 gcc_assert (!TARGET_SEH);
11127 gcc_assert (TARGET_SSE);
11128 gcc_assert (!ix86_using_red_zone ());
11130 if (crtl->calls_eh_return)
11132 gcc_assert (!reload_completed);
11133 m->call_ms2sysv = false;
11134 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11137 else if (ix86_static_chain_on_stack)
11139 gcc_assert (!reload_completed);
11140 m->call_ms2sysv = false;
11141 warn_once_call_ms2sysv_xlogues ("static call chains");
11144 /* Finally, compute which registers the stub will manage. */
11145 else
11147 unsigned count = xlogue_layout::count_stub_managed_regs ();
11148 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11149 m->call_ms2sysv_pad_in = 0;
11153 frame->nregs = ix86_nsaved_regs ();
11154 frame->nsseregs = ix86_nsaved_sseregs ();
11156 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11157 except for function prologues, leaf functions and when the default
11158 incoming stack boundary is overridden at the command line or via the
11159 force_align_arg_pointer attribute. */
11160 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11161 && (!crtl->is_leaf || cfun->calls_alloca != 0
11162 || ix86_current_function_calls_tls_descriptor
11163 || ix86_incoming_stack_boundary < 128))
11165 crtl->preferred_stack_boundary = 128;
11166 crtl->stack_alignment_needed = 128;
11169 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11170 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11172 gcc_assert (!size || stack_alignment_needed);
11173 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11174 gcc_assert (preferred_alignment <= stack_alignment_needed);
11176 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11177 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11178 if (TARGET_64BIT && m->call_ms2sysv)
11180 gcc_assert (stack_alignment_needed >= 16);
11181 gcc_assert (!frame->nsseregs);
11184 /* For SEH we have to limit the amount of code movement into the prologue.
11185 At present we do this via a BLOCKAGE, at which point there's very little
11186 scheduling that can be done, which means that there's very little point
11187 in doing anything except PUSHs. */
11188 if (TARGET_SEH)
11189 m->use_fast_prologue_epilogue = false;
11190 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11192 int count = frame->nregs;
11193 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11195 /* The fast prologue uses move instead of push to save registers. This
11196 is significantly longer, but also executes faster as modern hardware
11197 can execute the moves in parallel, but can't do that for push/pop.
11199 Be careful about choosing what prologue to emit: When function takes
11200 many instructions to execute we may use slow version as well as in
11201 case function is known to be outside hot spot (this is known with
11202 feedback only). Weight the size of function by number of registers
11203 to save as it is cheap to use one or two push instructions but very
11204 slow to use many of them. */
11205 if (count)
11206 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11207 if (node->frequency < NODE_FREQUENCY_NORMAL
11208 || (flag_branch_probabilities
11209 && node->frequency < NODE_FREQUENCY_HOT))
11210 m->use_fast_prologue_epilogue = false;
11211 else
11212 m->use_fast_prologue_epilogue
11213 = !expensive_function_p (count);
11216 frame->save_regs_using_mov
11217 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11218 /* If static stack checking is enabled and done with probes,
11219 the registers need to be saved before allocating the frame. */
11220 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11222 /* Skip return address and error code in exception handler. */
11223 offset = INCOMING_FRAME_SP_OFFSET;
11225 /* Skip pushed static chain. */
11226 if (ix86_static_chain_on_stack)
11227 offset += UNITS_PER_WORD;
11229 /* Skip saved base pointer. */
11230 if (frame_pointer_needed)
11231 offset += UNITS_PER_WORD;
11232 frame->hfp_save_offset = offset;
11234 /* The traditional frame pointer location is at the top of the frame. */
11235 frame->hard_frame_pointer_offset = offset;
11237 /* Register save area */
11238 offset += frame->nregs * UNITS_PER_WORD;
11239 frame->reg_save_offset = offset;
11241 /* On SEH target, registers are pushed just before the frame pointer
11242 location. */
11243 if (TARGET_SEH)
11244 frame->hard_frame_pointer_offset = offset;
11246 /* Calculate the size of the va-arg area (not including padding, if any). */
11247 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11249 if (stack_realign_fp)
11251 /* We may need a 16-byte aligned stack for the remainder of the
11252 register save area, but the stack frame for the local function
11253 may require a greater alignment if using AVX/2/512. In order
11254 to avoid wasting space, we first calculate the space needed for
11255 the rest of the register saves, add that to the stack pointer,
11256 and then realign the stack to the boundary of the start of the
11257 frame for the local function. */
11258 HOST_WIDE_INT space_needed = 0;
11259 HOST_WIDE_INT sse_reg_space_needed = 0;
11261 if (TARGET_64BIT)
11263 if (m->call_ms2sysv)
11265 m->call_ms2sysv_pad_in = 0;
11266 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11269 else if (frame->nsseregs)
11270 /* The only ABI that has saved SSE registers (Win64) also has a
11271 16-byte aligned default stack. However, many programs violate
11272 the ABI, and Wine64 forces stack realignment to compensate. */
11273 space_needed = frame->nsseregs * 16;
11275 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11277 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11278 we round anyway to be pedantic. */
11279 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11281 else
11282 space_needed = frame->va_arg_size;
11284 /* Record the allocation size required prior to the realignment AND. */
11285 frame->stack_realign_allocate = space_needed;
11287 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11288 before this point are not directly comparable with values below
11289 this point. Use sp_valid_at to determine if the stack pointer is
11290 valid for a given offset, fp_valid_at for the frame pointer, or
11291 choose_baseaddr to have a base register chosen for you.
11293 Note that the result of (frame->stack_realign_offset
11294 & (stack_alignment_needed - 1)) may not equal zero. */
11295 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11296 frame->stack_realign_offset = offset - space_needed;
11297 frame->sse_reg_save_offset = frame->stack_realign_offset
11298 + sse_reg_space_needed;
11300 else
11302 frame->stack_realign_offset = offset;
11304 if (TARGET_64BIT && m->call_ms2sysv)
11306 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11307 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11310 /* Align and set SSE register save area. */
11311 else if (frame->nsseregs)
11313 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11314 required and the DRAP re-alignment boundary is at least 16 bytes,
11315 then we want the SSE register save area properly aligned. */
11316 if (ix86_incoming_stack_boundary >= 128
11317 || (stack_realign_drap && stack_alignment_needed >= 16))
11318 offset = ROUND_UP (offset, 16);
11319 offset += frame->nsseregs * 16;
11321 frame->sse_reg_save_offset = offset;
11322 offset += frame->va_arg_size;
11325 /* Align start of frame for local function. */
11326 if (m->call_ms2sysv
11327 || frame->va_arg_size != 0
11328 || size != 0
11329 || !crtl->is_leaf
11330 || cfun->calls_alloca
11331 || ix86_current_function_calls_tls_descriptor)
11332 offset = ROUND_UP (offset, stack_alignment_needed);
11334 /* Frame pointer points here. */
11335 frame->frame_pointer_offset = offset;
11337 offset += size;
11339 /* Add outgoing arguments area. Can be skipped if we eliminated
11340 all the function calls as dead code.
11341 Skipping is however impossible when function calls alloca. Alloca
11342 expander assumes that last crtl->outgoing_args_size
11343 of stack frame are unused. */
11344 if (ACCUMULATE_OUTGOING_ARGS
11345 && (!crtl->is_leaf || cfun->calls_alloca
11346 || ix86_current_function_calls_tls_descriptor))
11348 offset += crtl->outgoing_args_size;
11349 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11351 else
11352 frame->outgoing_arguments_size = 0;
11354 /* Align stack boundary. Only needed if we're calling another function
11355 or using alloca. */
11356 if (!crtl->is_leaf || cfun->calls_alloca
11357 || ix86_current_function_calls_tls_descriptor)
11358 offset = ROUND_UP (offset, preferred_alignment);
11360 /* We've reached end of stack frame. */
11361 frame->stack_pointer_offset = offset;
11363 /* Size prologue needs to allocate. */
11364 to_allocate = offset - frame->sse_reg_save_offset;
11366 if ((!to_allocate && frame->nregs <= 1)
11367 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
11368 frame->save_regs_using_mov = false;
11370 if (ix86_using_red_zone ()
11371 && crtl->sp_is_unchanging
11372 && crtl->is_leaf
11373 && !ix86_pc_thunk_call_expanded
11374 && !ix86_current_function_calls_tls_descriptor)
11376 frame->red_zone_size = to_allocate;
11377 if (frame->save_regs_using_mov)
11378 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11379 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11380 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11382 else
11383 frame->red_zone_size = 0;
11384 frame->stack_pointer_offset -= frame->red_zone_size;
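/* Worked example (illustrative, not part of the original source): the x86-64
   red zone is the 128 bytes below the stack pointer. For a leaf function
   with to_allocate == 40 that saves two registers with moves,
   red_zone_size becomes 40 + 2*8 = 56 (below the cap above), and
   stack_pointer_offset shrinks by 56, so that data can live in the red
   zone without the prologue having to adjust %rsp for it. */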
11386 /* The SEH frame pointer location is near the bottom of the frame.
11387 This is enforced by the fact that the difference between the
11388 stack pointer and the frame pointer is limited to 240 bytes in
11389 the unwind data structure. */
11390 if (TARGET_SEH)
11392 HOST_WIDE_INT diff;
11394 /* If we can leave the frame pointer where it is, do so; leaving it in place
11395 also returns the establisher frame for __builtin_frame_address (0). */
11396 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11397 if (diff <= SEH_MAX_FRAME_SIZE
11398 && (diff > 240 || (diff & 15) != 0)
11399 && !crtl->accesses_prior_frames)
11401 /* Ideally we'd determine what portion of the local stack frame
11402 (within the constraint of the lowest 240) is most heavily used.
11403 But without that complication, simply bias the frame pointer
11404 by 128 bytes so as to maximize the amount of the local stack
11405 frame that is addressable with 8-bit offsets. */
11406 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
11411 /* This is semi-inlined memory_address_length, but simplified
11412 since we know that we're always dealing with reg+offset, and
11413 to avoid having to create and discard all that rtl. */
11415 static inline int
11416 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11418 int len = 4;
11420 if (offset == 0)
11422 /* EBP and R13 cannot be encoded without an offset. */
11423 len = (regno == BP_REG || regno == R13_REG);
11425 else if (IN_RANGE (offset, -128, 127))
11426 len = 1;
11428 /* ESP and R12 must be encoded with a SIB byte. */
11429 if (regno == SP_REG || regno == R12_REG)
11430 len++;
11432 return len;
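/* Worked example (illustrative, not part of the original source): with the
   rules above, (%rbp, 0) costs 1 byte because %rbp cannot be encoded
   without a displacement, (%rax, 0) costs 0, (%rsp, 8) costs 2 (a disp8
   plus the mandatory SIB byte), and (%r12, 512) costs 5 (a disp32 plus
   the SIB byte). choose_basereg below uses these lengths to pick the
   cheapest base register when the fast prologue/epilogue is not in use. */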
11435 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11436 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11438 static bool
11439 sp_valid_at (HOST_WIDE_INT cfa_offset)
11441 const struct machine_frame_state &fs = cfun->machine->fs;
11442 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11444 /* Validate that the cfa_offset isn't in a "no-man's land". */
11445 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11446 return false;
11448 return fs.sp_valid;
11451 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11452 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11454 static inline bool
11455 fp_valid_at (HOST_WIDE_INT cfa_offset)
11457 const struct machine_frame_state &fs = cfun->machine->fs;
11458 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11460 /* Validate that the cfa_offset isn't in a "no-man's land". */
11461 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11462 return false;
11464 return fs.fp_valid;
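/* Illustrative summary (not from the original source): once stack_realign_fp
   has re-aligned the stack, CFA offsets at or below sp_realigned_fp_last
   are reachable only through the frame pointer, offsets above
   sp_realigned_offset only through the realigned stack pointer, and the
   gap in between is the "no-man's land" that the asserts above guard
   against. choose_baseaddr below hides this distinction from callers. */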
11467 /* Choose a base register based upon alignment requested, speed and/or
11468 size. */
11470 static void
11471 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11472 HOST_WIDE_INT &base_offset,
11473 unsigned int align_requested, unsigned int *align)
11475 const struct machine_function *m = cfun->machine;
11476 unsigned int hfp_align;
11477 unsigned int drap_align;
11478 unsigned int sp_align;
11479 bool hfp_ok = fp_valid_at (cfa_offset);
11480 bool drap_ok = m->fs.drap_valid;
11481 bool sp_ok = sp_valid_at (cfa_offset);
11483 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11485 /* Filter out any registers that don't meet the requested alignment
11486 criteria. */
11487 if (align_requested)
11489 if (m->fs.realigned)
11490 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11491 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11492 notes (which we would need in order to use a realigned stack pointer),
11493 so disable on SEH targets. */
11494 else if (m->fs.sp_realigned)
11495 sp_align = crtl->stack_alignment_needed;
11497 hfp_ok = hfp_ok && hfp_align >= align_requested;
11498 drap_ok = drap_ok && drap_align >= align_requested;
11499 sp_ok = sp_ok && sp_align >= align_requested;
11502 if (m->use_fast_prologue_epilogue)
11504 /* Choose the base register most likely to allow the most scheduling
11505 opportunities. Generally FP is valid throughout the function,
11506 while DRAP must be reloaded within the epilogue. But choose either
11507 over the SP due to increased encoding size. */
11509 if (hfp_ok)
11511 base_reg = hard_frame_pointer_rtx;
11512 base_offset = m->fs.fp_offset - cfa_offset;
11514 else if (drap_ok)
11516 base_reg = crtl->drap_reg;
11517 base_offset = 0 - cfa_offset;
11519 else if (sp_ok)
11521 base_reg = stack_pointer_rtx;
11522 base_offset = m->fs.sp_offset - cfa_offset;
11525 else
11527 HOST_WIDE_INT toffset;
11528 int len = 16, tlen;
11530 /* Choose the base register with the smallest address encoding.
11531 With a tie, choose FP > DRAP > SP. */
11532 if (sp_ok)
11534 base_reg = stack_pointer_rtx;
11535 base_offset = m->fs.sp_offset - cfa_offset;
11536 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
11538 if (drap_ok)
11540 toffset = 0 - cfa_offset;
11541 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
11542 if (tlen <= len)
11544 base_reg = crtl->drap_reg;
11545 base_offset = toffset;
11546 len = tlen;
11549 if (hfp_ok)
11551 toffset = m->fs.fp_offset - cfa_offset;
11552 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
11553 if (tlen <= len)
11555 base_reg = hard_frame_pointer_rtx;
11556 base_offset = toffset;
11557 len = tlen;
11562 /* Set the align return value. */
11563 if (align)
11565 if (base_reg == stack_pointer_rtx)
11566 *align = sp_align;
11567 else if (base_reg == crtl->drap_reg)
11568 *align = drap_align;
11569 else if (base_reg == hard_frame_pointer_rtx)
11570 *align = hfp_align;
11574 /* Return an RTX that points to CFA_OFFSET within the stack frame and
11575 the alignment of address. If ALIGN is non-null, it should point to
11576 an alignment value (in bits) that is preferred or zero and will
11577 receive the alignment of the base register that was selected,
11578 irrespective of whether or not CFA_OFFSET is a multiple of that
11579 alignment value. If it is possible for the base register offset to be
11580 non-immediate then SCRATCH_REGNO should specify a scratch register to
11581 use.
11583 The valid base registers are taken from CFUN->MACHINE->FS. */
11585 static rtx
11586 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
11587 unsigned int scratch_regno = INVALID_REGNUM)
11589 rtx base_reg = NULL;
11590 HOST_WIDE_INT base_offset = 0;
11592 /* If a specific alignment is requested, try to get a base register
11593 with that alignment first. */
11594 if (align && *align)
11595 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
11597 if (!base_reg)
11598 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
11600 gcc_assert (base_reg != NULL);
11602 rtx base_offset_rtx = GEN_INT (base_offset);
11604 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
11606 gcc_assert (scratch_regno != INVALID_REGNUM);
11608 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11609 emit_move_insn (scratch_reg, base_offset_rtx);
11611 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
11614 return plus_constant (Pmode, base_reg, base_offset);
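/* Illustrative note (not from the original source): the scratch-register
   path above is only taken when BASE_OFFSET does not fit in a
   sign-extended 32-bit immediate, i.e. on 64-bit targets with frames
   beyond the +/-2 GiB range; in the common case the offset is simply
   folded into a single base + displacement address. */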
11617 /* Emit code to save registers in the prologue. */
11619 static void
11620 ix86_emit_save_regs (void)
11622 unsigned int regno;
11623 rtx_insn *insn;
11625 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
11626 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11628 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
11629 RTX_FRAME_RELATED_P (insn) = 1;
11633 /* Emit a single register save at CFA - CFA_OFFSET. */
11635 static void
11636 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
11637 HOST_WIDE_INT cfa_offset)
11639 struct machine_function *m = cfun->machine;
11640 rtx reg = gen_rtx_REG (mode, regno);
11641 rtx mem, addr, base, insn;
11642 unsigned int align = GET_MODE_ALIGNMENT (mode);
11644 addr = choose_baseaddr (cfa_offset, &align);
11645 mem = gen_frame_mem (mode, addr);
11647 /* The location alignment depends upon the base register. */
11648 align = MIN (GET_MODE_ALIGNMENT (mode), align);
11649 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
11650 set_mem_align (mem, align);
11652 insn = emit_insn (gen_rtx_SET (mem, reg));
11653 RTX_FRAME_RELATED_P (insn) = 1;
11655 base = addr;
11656 if (GET_CODE (base) == PLUS)
11657 base = XEXP (base, 0);
11658 gcc_checking_assert (REG_P (base));
11660 /* When saving registers into a re-aligned local stack frame, avoid
11661 any tricky guessing by dwarf2out. */
11662 if (m->fs.realigned)
11664 gcc_checking_assert (stack_realign_drap);
11666 if (regno == REGNO (crtl->drap_reg))
11668 /* A bit of a hack. We force the DRAP register to be saved in
11669 the re-aligned stack frame, which provides us with a copy
11670 of the CFA that will last past the prologue. Install it. */
11671 gcc_checking_assert (cfun->machine->fs.fp_valid);
11672 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11673 cfun->machine->fs.fp_offset - cfa_offset);
11674 mem = gen_rtx_MEM (mode, addr);
11675 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
11677 else
11679 /* The frame pointer is a stable reference within the
11680 aligned frame. Use it. */
11681 gcc_checking_assert (cfun->machine->fs.fp_valid);
11682 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11683 cfun->machine->fs.fp_offset - cfa_offset);
11684 mem = gen_rtx_MEM (mode, addr);
11685 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11689 else if (base == stack_pointer_rtx && m->fs.sp_realigned
11690 && cfa_offset >= m->fs.sp_realigned_offset)
11692 gcc_checking_assert (stack_realign_fp);
11693 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11696 /* The memory may not be relative to the current CFA register,
11697 which means that we may need to generate a new pattern for
11698 use by the unwind info. */
11699 else if (base != m->fs.cfa_reg)
11701 addr = plus_constant (Pmode, m->fs.cfa_reg,
11702 m->fs.cfa_offset - cfa_offset);
11703 mem = gen_rtx_MEM (mode, addr);
11704 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
11708 /* Emit code to save registers using MOV insns.
11709 First register is stored at CFA - CFA_OFFSET. */
11710 static void
11711 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
11713 unsigned int regno;
11715 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11716 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11718 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
11719 cfa_offset -= UNITS_PER_WORD;
11723 /* Emit code to save SSE registers using MOV insns.
11724 First register is stored at CFA - CFA_OFFSET. */
11725 static void
11726 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
11728 unsigned int regno;
11730 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11731 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11733 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
11734 cfa_offset -= GET_MODE_SIZE (V4SFmode);
11738 static GTY(()) rtx queued_cfa_restores;
11740 /* Add a REG_CFA_RESTORE REG note to INSN or queue it until the next stack
11741 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
11742 Don't add the note if the previously saved value will be left untouched
11743 within stack red-zone till return, as unwinders can find the same value
11744 in the register and on the stack. */
11746 static void
11747 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
11749 if (!crtl->shrink_wrapped
11750 && cfa_offset <= cfun->machine->fs.red_zone_offset)
11751 return;
11753 if (insn)
11755 add_reg_note (insn, REG_CFA_RESTORE, reg);
11756 RTX_FRAME_RELATED_P (insn) = 1;
11758 else
11759 queued_cfa_restores
11760 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
11763 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
11765 static void
11766 ix86_add_queued_cfa_restore_notes (rtx insn)
11768 rtx last;
11769 if (!queued_cfa_restores)
11770 return;
11771 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
11773 XEXP (last, 1) = REG_NOTES (insn);
11774 REG_NOTES (insn) = queued_cfa_restores;
11775 queued_cfa_restores = NULL_RTX;
11776 RTX_FRAME_RELATED_P (insn) = 1;
11779 /* Expand prologue or epilogue stack adjustment.
11780 The pattern exists to put a dependency on all ebp-based memory accesses.
11781 STYLE should be negative if instructions should be marked as frame related,
11782 zero if %r11 register is live and cannot be freely used and positive
11783 otherwise. */
11785 static rtx
11786 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
11787 int style, bool set_cfa)
11789 struct machine_function *m = cfun->machine;
11790 rtx insn;
11791 bool add_frame_related_expr = false;
11793 if (Pmode == SImode)
11794 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
11795 else if (x86_64_immediate_operand (offset, DImode))
11796 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
11797 else
11799 rtx tmp;
11800 /* r11 is used by indirect sibcall return as well, set before the
11801 epilogue and used after the epilogue. */
11802 if (style)
11803 tmp = gen_rtx_REG (DImode, R11_REG);
11804 else
11806 gcc_assert (src != hard_frame_pointer_rtx
11807 && dest != hard_frame_pointer_rtx);
11808 tmp = hard_frame_pointer_rtx;
11810 insn = emit_insn (gen_rtx_SET (tmp, offset));
11811 if (style < 0)
11812 add_frame_related_expr = true;
11814 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
11817 insn = emit_insn (insn);
11818 if (style >= 0)
11819 ix86_add_queued_cfa_restore_notes (insn);
11821 if (set_cfa)
11823 rtx r;
11825 gcc_assert (m->fs.cfa_reg == src);
11826 m->fs.cfa_offset += INTVAL (offset);
11827 m->fs.cfa_reg = dest;
11829 r = gen_rtx_PLUS (Pmode, src, offset);
11830 r = gen_rtx_SET (dest, r);
11831 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
11832 RTX_FRAME_RELATED_P (insn) = 1;
11834 else if (style < 0)
11836 RTX_FRAME_RELATED_P (insn) = 1;
11837 if (add_frame_related_expr)
11839 rtx r = gen_rtx_PLUS (Pmode, src, offset);
11840 r = gen_rtx_SET (dest, r);
11841 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
11845 if (dest == stack_pointer_rtx)
11847 HOST_WIDE_INT ooffset = m->fs.sp_offset;
11848 bool valid = m->fs.sp_valid;
11849 bool realigned = m->fs.sp_realigned;
11851 if (src == hard_frame_pointer_rtx)
11853 valid = m->fs.fp_valid;
11854 realigned = false;
11855 ooffset = m->fs.fp_offset;
11857 else if (src == crtl->drap_reg)
11859 valid = m->fs.drap_valid;
11860 realigned = false;
11861 ooffset = 0;
11863 else
11865 /* Else there are two possibilities: SP itself, which we set
11866 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
11867 taken care of by hand along the eh_return path. */
11868 gcc_checking_assert (src == stack_pointer_rtx
11869 || offset == const0_rtx);
11872 m->fs.sp_offset = ooffset - INTVAL (offset);
11873 m->fs.sp_valid = valid;
11874 m->fs.sp_realigned = realigned;
11876 return insn;
11879 /* Find an available register to be used as dynamic realign argument
11880 pointer register. Such a register will be written in the prologue and
11881 used at the beginning of the body, so it must not be
11882 1. parameter passing register.
11883 2. GOT pointer.
11884 We reuse static-chain register if it is available. Otherwise, we
11885 use DI for i386 and R13 for x86-64. We chose R13 since it has
11886 shorter encoding.
11888 Return: the regno of chosen register. */
11890 static unsigned int
11891 find_drap_reg (void)
11893 tree decl = cfun->decl;
11895 /* Always use callee-saved register if there are no caller-saved
11896 registers. */
11897 if (TARGET_64BIT)
11899 /* Use R13 for nested functions or functions that need a static chain.
11900 Since a function with a tail call may use any caller-saved
11901 register in the epilogue, DRAP must not use a caller-saved
11902 register in that case. */
11903 if (DECL_STATIC_CHAIN (decl)
11904 || cfun->machine->no_caller_saved_registers
11905 || crtl->tail_call_emit)
11906 return R13_REG;
11908 return R10_REG;
11910 else
11912 /* Use DI for nested functions or functions that need a static chain.
11913 Since a function with a tail call may use any caller-saved
11914 register in the epilogue, DRAP must not use a caller-saved
11915 register in that case. */
11916 if (DECL_STATIC_CHAIN (decl)
11917 || cfun->machine->no_caller_saved_registers
11918 || crtl->tail_call_emit)
11919 return DI_REG;
11921 /* Reuse static chain register if it isn't used for parameter
11922 passing. */
11923 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
11925 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
11926 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
11927 return CX_REG;
11929 return DI_REG;
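/* Illustrative summary (not from the original source): on x86-64 the DRAP
   register is normally %r10, replaced by the callee-saved %r13 when a
   static chain or a tail call makes a caller-saved choice unsafe; on
   32-bit targets it is %ecx when regparm leaves it free and the calling
   convention does not already claim it, otherwise %edi. */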
11933 /* Handle a "force_align_arg_pointer" attribute. */
11935 static tree
11936 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
11937 tree, int, bool *no_add_attrs)
11939 if (TREE_CODE (*node) != FUNCTION_TYPE
11940 && TREE_CODE (*node) != METHOD_TYPE
11941 && TREE_CODE (*node) != FIELD_DECL
11942 && TREE_CODE (*node) != TYPE_DECL)
11944 warning (OPT_Wattributes, "%qE attribute only applies to functions",
11945 name);
11946 *no_add_attrs = true;
11949 return NULL_TREE;
11952 /* Return minimum incoming stack alignment. */
11954 static unsigned int
11955 ix86_minimum_incoming_stack_boundary (bool sibcall)
11957 unsigned int incoming_stack_boundary;
11959 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
11960 if (cfun->machine->func_type != TYPE_NORMAL)
11961 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
11962 /* Prefer the one specified at command line. */
11963 else if (ix86_user_incoming_stack_boundary)
11964 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
11965 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
11966 if -mstackrealign is used, this isn't a sibcall check, and the
11967 estimated stack alignment is 128 bits. */
11968 else if (!sibcall
11969 && ix86_force_align_arg_pointer
11970 && crtl->stack_alignment_estimated == 128)
11971 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11972 else
11973 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
11975 /* Incoming stack alignment can be changed on individual functions
11976 via force_align_arg_pointer attribute. We use the smallest
11977 incoming stack boundary. */
11978 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
11979 && lookup_attribute (ix86_force_align_arg_pointer_string,
11980 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
11981 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11983 /* The incoming stack frame has to be aligned at least at
11984 parm_stack_boundary. */
11985 if (incoming_stack_boundary < crtl->parm_stack_boundary)
11986 incoming_stack_boundary = crtl->parm_stack_boundary;
11988 /* Stack at entrance of main is aligned by runtime. We use the
11989 smallest incoming stack boundary. */
11990 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
11991 && DECL_NAME (current_function_decl)
11992 && MAIN_NAME_P (DECL_NAME (current_function_decl))
11993 && DECL_FILE_SCOPE_P (current_function_decl))
11994 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
11996 return incoming_stack_boundary;
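/* Illustrative example (not from the original source): on x86-64 with
   default options a normal function keeps the default 128-bit incoming
   boundary; tagging it with __attribute__((force_align_arg_pointer))
   drops the assumed incoming boundary to MIN_STACK_BOUNDARY (the word
   size in bits), which is what later makes the prologue realign the
   stack or set up DRAP. */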
11999 /* Update incoming stack boundary and estimated stack alignment. */
12001 static void
12002 ix86_update_stack_boundary (void)
12004 ix86_incoming_stack_boundary
12005 = ix86_minimum_incoming_stack_boundary (false);
12007 /* x86_64 vararg needs 16byte stack alignment for register save
12008 area. */
12009 if (TARGET_64BIT
12010 && cfun->stdarg
12011 && crtl->stack_alignment_estimated < 128)
12012 crtl->stack_alignment_estimated = 128;
12014 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12015 if (ix86_tls_descriptor_calls_expanded_in_cfun
12016 && crtl->preferred_stack_boundary < 128)
12017 crtl->preferred_stack_boundary = 128;
12020 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12021 needed or an rtx for DRAP otherwise. */
12023 static rtx
12024 ix86_get_drap_rtx (void)
12026 /* We must use DRAP if there are outgoing arguments on stack and
12027 ACCUMULATE_OUTGOING_ARGS is false. */
12028 if (ix86_force_drap
12029 || (cfun->machine->outgoing_args_on_stack
12030 && !ACCUMULATE_OUTGOING_ARGS))
12031 crtl->need_drap = true;
12033 if (stack_realign_drap)
12035 /* Assign DRAP to vDRAP and return vDRAP. */
12036 unsigned int regno = find_drap_reg ();
12037 rtx drap_vreg;
12038 rtx arg_ptr;
12039 rtx_insn *seq, *insn;
12041 arg_ptr = gen_rtx_REG (Pmode, regno);
12042 crtl->drap_reg = arg_ptr;
12044 start_sequence ();
12045 drap_vreg = copy_to_reg (arg_ptr);
12046 seq = get_insns ();
12047 end_sequence ();
12049 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12050 if (!optimize)
12052 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12053 RTX_FRAME_RELATED_P (insn) = 1;
12055 return drap_vreg;
12057 else
12058 return NULL;
12061 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12063 static rtx
12064 ix86_internal_arg_pointer (void)
12066 return virtual_incoming_args_rtx;
12069 struct scratch_reg {
12070 rtx reg;
12071 bool saved;
12074 /* Return a short-lived scratch register for use on function entry.
12075 In 32-bit mode, it is valid only after the registers are saved
12076 in the prologue. This register must be released by means of
12077 release_scratch_register_on_entry once it is dead. */
12079 static void
12080 get_scratch_register_on_entry (struct scratch_reg *sr)
12082 int regno;
12084 sr->saved = false;
12086 if (TARGET_64BIT)
12088 /* We always use R11 in 64-bit mode. */
12089 regno = R11_REG;
12091 else
12093 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12094 bool fastcall_p
12095 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12096 bool thiscall_p
12097 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12098 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12099 int regparm = ix86_function_regparm (fntype, decl);
12100 int drap_regno
12101 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12103 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12104 for the static chain register. */
12105 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12106 && drap_regno != AX_REG)
12107 regno = AX_REG;
12108 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12109 for the static chain register. */
12110 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12111 regno = AX_REG;
12112 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12113 regno = DX_REG;
12114 /* ecx is the static chain register. */
12115 else if (regparm < 3 && !fastcall_p && !thiscall_p
12116 && !static_chain_p
12117 && drap_regno != CX_REG)
12118 regno = CX_REG;
12119 else if (ix86_save_reg (BX_REG, true, false))
12120 regno = BX_REG;
12121 /* esi is the static chain register. */
12122 else if (!(regparm == 3 && static_chain_p)
12123 && ix86_save_reg (SI_REG, true, false))
12124 regno = SI_REG;
12125 else if (ix86_save_reg (DI_REG, true, false))
12126 regno = DI_REG;
12127 else
12129 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12130 sr->saved = true;
12134 sr->reg = gen_rtx_REG (Pmode, regno);
12135 if (sr->saved)
12137 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12138 RTX_FRAME_RELATED_P (insn) = 1;
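/* Illustrative note (not from the original source): in 64-bit mode the
   scratch is always %r11 and never needs saving. In 32-bit mode the
   cascade above prefers a provably dead argument register (%eax, %edx
   or %ecx), then a callee-saved register that the prologue saves
   anyway, and only as a last resort pushes and later pops a register
   around the probing code (sr->saved). */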
12142 /* Release a scratch register obtained from the preceding function. */
12144 static void
12145 release_scratch_register_on_entry (struct scratch_reg *sr)
12147 if (sr->saved)
12149 struct machine_function *m = cfun->machine;
12150 rtx x, insn = emit_insn (gen_pop (sr->reg));
12152 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12153 RTX_FRAME_RELATED_P (insn) = 1;
12154 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12155 x = gen_rtx_SET (stack_pointer_rtx, x);
12156 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12157 m->fs.sp_offset -= UNITS_PER_WORD;
12161 /* Return the probing interval for -fstack-clash-protection. */
12163 static HOST_WIDE_INT
12164 get_probe_interval (void)
12166 if (flag_stack_clash_protection)
12167 return (HOST_WIDE_INT_1U
12168 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
12169 else
12170 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
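/* Illustrative note (not from the original source): both exponents
   typically default to 12, so the usual probing interval is
   1 << 12 = 4096 bytes -- one probe per page on common x86
   configurations. */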
12173 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12175 This differs from the next routine in that it tries hard to prevent
12176 attacks that jump the stack guard. Thus it is never allowed to allocate
12177 more than PROBE_INTERVAL bytes of stack space without a suitable
12178 probe. */
12180 static void
12181 ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
12183 struct machine_function *m = cfun->machine;
12185 /* If this function does not statically allocate stack space, then
12186 no probes are needed. */
12187 if (!size)
12189 /* However, the allocation of space via pushes for register
12190 saves could be viewed as allocating space, but without the
12191 need to probe. */
12192 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12193 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12194 else
12195 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12196 return;
12199 /* If we are a noreturn function, then we have to consider the
12200 possibility that we're called via a jump rather than a call.
12202 Thus we don't have the implicit probe generated by saving the
12203 return address into the stack at the call. Thus, the stack
12204 pointer could be anywhere in the guard page. The safe thing
12205 to do is emit a probe now.
12207 ?!? This should be revamped to work like aarch64 and s390 where
12208 we track the offset from the most recent probe. Normally that
12209 offset would be zero. For a noreturn function we would reset
12210 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12211 we just probe when we cross PROBE_INTERVAL. */
12212 if (TREE_THIS_VOLATILE (cfun->decl))
12214 /* We can safely use any register here since we're just going to push
12215 its value and immediately pop it back. But we do try and avoid
12216 argument passing registers so as not to introduce dependencies in
12217 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12218 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12219 rtx_insn *insn = emit_insn (gen_push (dummy_reg));
12220 RTX_FRAME_RELATED_P (insn) = 1;
12221 ix86_emit_restore_reg_using_pop (dummy_reg);
12222 emit_insn (gen_blockage ());
12225 /* If we allocate less than the size of the guard statically,
12226 then no probing is necessary, but we do need to allocate
12227 the stack. */
12228 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12230 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12231 GEN_INT (-size), -1,
12232 m->fs.cfa_reg == stack_pointer_rtx);
12233 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12234 return;
12237 /* We're allocating a large enough stack frame that we need to
12238 emit probes. Either emit them inline or in a loop depending
12239 on the size. */
12240 HOST_WIDE_INT probe_interval = get_probe_interval ();
12241 if (size <= 4 * probe_interval)
12243 HOST_WIDE_INT i;
12244 for (i = probe_interval; i <= size; i += probe_interval)
12246 /* Allocate PROBE_INTERVAL bytes. */
12247 rtx insn
12248 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12249 GEN_INT (-probe_interval), -1,
12250 m->fs.cfa_reg == stack_pointer_rtx);
12251 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12253 /* And probe at *sp. */
12254 emit_stack_probe (stack_pointer_rtx);
12255 emit_insn (gen_blockage ());
12258 /* We need to allocate space for the residual, but we do not need
12259 to probe the residual. */
12260 HOST_WIDE_INT residual = (i - probe_interval - size);
12261 if (residual)
12262 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12263 GEN_INT (residual), -1,
12264 m->fs.cfa_reg == stack_pointer_rtx);
12265 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12267 else
12269 struct scratch_reg sr;
12270 get_scratch_register_on_entry (&sr);
12272 /* Step 1: round SIZE down to a multiple of the interval. */
12273 HOST_WIDE_INT rounded_size = size & -probe_interval;
12275 /* Step 2: compute final value of the loop counter. Use lea if
12276 possible. */
12277 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12278 rtx insn;
12279 if (address_no_seg_operand (addr, Pmode))
12280 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12281 else
12283 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12284 insn = emit_insn (gen_rtx_SET (sr.reg,
12285 gen_rtx_PLUS (Pmode, sr.reg,
12286 stack_pointer_rtx)));
12288 if (m->fs.cfa_reg == stack_pointer_rtx)
12290 add_reg_note (insn, REG_CFA_DEF_CFA,
12291 plus_constant (Pmode, sr.reg,
12292 m->fs.cfa_offset + rounded_size));
12293 RTX_FRAME_RELATED_P (insn) = 1;
12296 /* Step 3: the loop. */
12297 rtx size_rtx = GEN_INT (rounded_size);
12298 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12299 size_rtx));
12300 if (m->fs.cfa_reg == stack_pointer_rtx)
12302 m->fs.cfa_offset += rounded_size;
12303 add_reg_note (insn, REG_CFA_DEF_CFA,
12304 plus_constant (Pmode, stack_pointer_rtx,
12305 m->fs.cfa_offset));
12306 RTX_FRAME_RELATED_P (insn) = 1;
12308 m->fs.sp_offset += rounded_size;
12309 emit_insn (gen_blockage ());
12311 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12312 is equal to ROUNDED_SIZE. */
12314 if (size != rounded_size)
12315 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12316 GEN_INT (rounded_size - size), -1,
12317 m->fs.cfa_reg == stack_pointer_rtx);
12318 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12320 release_scratch_register_on_entry (&sr);
12323 /* Make sure nothing is scheduled before we are done. */
12324 emit_insn (gen_blockage ());
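/* Worked example (illustrative, not from the original source): assuming
   the default 4096-byte probe interval, a 10000-byte frame takes the
   inline path above (10000 <= 4 * 4096): two 4096-byte allocations,
   each followed by a probe at the new stack pointer, then an unprobed
   allocation of the remaining 1808 bytes. A frame larger than 16384
   bytes uses the scratch-register loop instead. */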
12327 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
12329 static void
12330 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
12332 /* We skip the probe for the first interval + a small dope of 4 words and
12333 probe that many bytes past the specified size to maintain a protection
12334 area at the bottom of the stack. */
12335 const int dope = 4 * UNITS_PER_WORD;
12336 rtx size_rtx = GEN_INT (size), last;
12338 /* See if we have a constant small number of probes to generate. If so,
12339 that's the easy case. The run-time loop is made up of 9 insns in the
12340 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12341 for n # of intervals. */
12342 if (size <= 4 * get_probe_interval ())
12344 HOST_WIDE_INT i, adjust;
12345 bool first_probe = true;
12347 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12348 values of N from 1 until it exceeds SIZE. If only one probe is
12349 needed, this will not generate any code. Then adjust and probe
12350 to PROBE_INTERVAL + SIZE. */
12351 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12353 if (first_probe)
12355 adjust = 2 * get_probe_interval () + dope;
12356 first_probe = false;
12358 else
12359 adjust = get_probe_interval ();
12361 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12362 plus_constant (Pmode, stack_pointer_rtx,
12363 -adjust)));
12364 emit_stack_probe (stack_pointer_rtx);
12367 if (first_probe)
12368 adjust = size + get_probe_interval () + dope;
12369 else
12370 adjust = size + get_probe_interval () - i;
12372 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12373 plus_constant (Pmode, stack_pointer_rtx,
12374 -adjust)));
12375 emit_stack_probe (stack_pointer_rtx);
12377 /* Adjust back to account for the additional first interval. */
12378 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12379 plus_constant (Pmode, stack_pointer_rtx,
12380 (get_probe_interval ()
12381 + dope))));
12384 /* Otherwise, do the same as above, but in a loop. Note that we must be
12385 extra careful with variables wrapping around because we might be at
12386 the very top (or the very bottom) of the address space and we have
12387 to be able to handle this case properly; in particular, we use an
12388 equality test for the loop condition. */
12389 else
12391 HOST_WIDE_INT rounded_size;
12392 struct scratch_reg sr;
12394 get_scratch_register_on_entry (&sr);
12397 /* Step 1: round SIZE to the previous multiple of the interval. */
12399 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12402 /* Step 2: compute initial and final value of the loop counter. */
12404 /* SP = SP_0 + PROBE_INTERVAL. */
12405 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12406 plus_constant (Pmode, stack_pointer_rtx,
12407 - (get_probe_interval () + dope))));
12409 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12410 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12411 emit_insn (gen_rtx_SET (sr.reg,
12412 plus_constant (Pmode, stack_pointer_rtx,
12413 -rounded_size)));
12414 else
12416 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12417 emit_insn (gen_rtx_SET (sr.reg,
12418 gen_rtx_PLUS (Pmode, sr.reg,
12419 stack_pointer_rtx)));
12423 /* Step 3: the loop
12427 SP = SP + PROBE_INTERVAL
12428 probe at SP
12430 while (SP != LAST_ADDR)
12432 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12433 values of N from 1 until it is equal to ROUNDED_SIZE. */
12435 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12438 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12439 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12441 if (size != rounded_size)
12443 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12444 plus_constant (Pmode, stack_pointer_rtx,
12445 rounded_size - size)));
12446 emit_stack_probe (stack_pointer_rtx);
12449 /* Adjust back to account for the additional first interval. */
12450 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12451 plus_constant (Pmode, stack_pointer_rtx,
12452 (get_probe_interval ()
12453 + dope))));
12455 release_scratch_register_on_entry (&sr);
12458 /* Even if the stack pointer isn't the CFA register, we need to correctly
12459 describe the adjustments made to it, in particular differentiate the
12460 frame-related ones from the frame-unrelated ones. */
12461 if (size > 0)
12463 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12464 XVECEXP (expr, 0, 0)
12465 = gen_rtx_SET (stack_pointer_rtx,
12466 plus_constant (Pmode, stack_pointer_rtx, -size));
12467 XVECEXP (expr, 0, 1)
12468 = gen_rtx_SET (stack_pointer_rtx,
12469 plus_constant (Pmode, stack_pointer_rtx,
12470 get_probe_interval () + dope + size));
12471 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12472 RTX_FRAME_RELATED_P (last) = 1;
12474 cfun->machine->fs.sp_offset += size;
12477 /* Make sure nothing is scheduled before we are done. */
12478 emit_insn (gen_blockage ());
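/* Worked example (illustrative, not from the original source): with the
   default 4096-byte interval and the 32-byte dope of a 64-bit target, a
   6000-byte frame first drops the stack pointer by 2*4096 + 32 = 8224
   and probes, then by another 6000 + 4096 - 8192 = 1904 and probes, and
   finally adds back 4096 + 32 = 4128, leaving a net allocation of
   exactly 6000 bytes while keeping consecutive probes within one
   interval of each other. */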
12481 /* Adjust the stack pointer up to REG while probing it. */
12483 const char *
12484 output_adjust_stack_and_probe (rtx reg)
12486 static int labelno = 0;
12487 char loop_lab[32];
12488 rtx xops[2];
12490 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12492 /* Loop. */
12493 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12495 /* SP = SP + PROBE_INTERVAL. */
12496 xops[0] = stack_pointer_rtx;
12497 xops[1] = GEN_INT (get_probe_interval ());
12498 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12500 /* Probe at SP. */
12501 xops[1] = const0_rtx;
12502 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
12504 /* Test if SP == LAST_ADDR. */
12505 xops[0] = stack_pointer_rtx;
12506 xops[1] = reg;
12507 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12509 /* Branch. */
12510 fputs ("\tjne\t", asm_out_file);
12511 assemble_name_raw (asm_out_file, loop_lab);
12512 fputc ('\n', asm_out_file);
12514 return "";
12517 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
12518 inclusive. These are offsets from the current stack pointer. */
12520 static void
12521 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
12523 /* See if we have a constant small number of probes to generate. If so,
12524 that's the easy case. The run-time loop is made up of 6 insns in the
12525 generic case while the compile-time loop is made up of n insns for n #
12526 of intervals. */
12527 if (size <= 6 * get_probe_interval ())
12529 HOST_WIDE_INT i;
12531 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
12532 it exceeds SIZE. If only one probe is needed, this will not
12533 generate any code. Then probe at FIRST + SIZE. */
12534 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12535 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12536 -(first + i)));
12538 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12539 -(first + size)));
12542 /* Otherwise, do the same as above, but in a loop. Note that we must be
12543 extra careful with variables wrapping around because we might be at
12544 the very top (or the very bottom) of the address space and we have
12545 to be able to handle this case properly; in particular, we use an
12546 equality test for the loop condition. */
12547 else
12549 HOST_WIDE_INT rounded_size, last;
12550 struct scratch_reg sr;
12552 get_scratch_register_on_entry (&sr);
12555 /* Step 1: round SIZE to the previous multiple of the interval. */
12557 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12560 /* Step 2: compute initial and final value of the loop counter. */
12562 /* TEST_OFFSET = FIRST. */
12563 emit_move_insn (sr.reg, GEN_INT (-first));
12565 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
12566 last = first + rounded_size;
12569 /* Step 3: the loop
12573 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
12574 probe at TEST_ADDR
12576 while (TEST_ADDR != LAST_ADDR)
12578 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
12579 until it is equal to ROUNDED_SIZE. */
12581 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
12584 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
12585 that SIZE is equal to ROUNDED_SIZE. */
12587 if (size != rounded_size)
12588 emit_stack_probe (plus_constant (Pmode,
12589 gen_rtx_PLUS (Pmode,
12590 stack_pointer_rtx,
12591 sr.reg),
12592 rounded_size - size));
12594 release_scratch_register_on_entry (&sr);
12597 /* Make sure nothing is scheduled before we are done. */
12598 emit_insn (gen_blockage ());
12601 /* Probe a range of stack addresses from REG to END, inclusive. These are
12602 offsets from the current stack pointer. */
12604 const char *
12605 output_probe_stack_range (rtx reg, rtx end)
12607 static int labelno = 0;
12608 char loop_lab[32];
12609 rtx xops[3];
12611 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12613 /* Loop. */
12614 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12616 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
12617 xops[0] = reg;
12618 xops[1] = GEN_INT (get_probe_interval ());
12619 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12621 /* Probe at TEST_ADDR. */
12622 xops[0] = stack_pointer_rtx;
12623 xops[1] = reg;
12624 xops[2] = const0_rtx;
12625 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
12627 /* Test if TEST_ADDR == LAST_ADDR. */
12628 xops[0] = reg;
12629 xops[1] = end;
12630 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12632 /* Branch. */
12633 fputs ("\tjne\t", asm_out_file);
12634 assemble_name_raw (asm_out_file, loop_lab);
12635 fputc ('\n', asm_out_file);
12637 return "";
12640 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
12641 will guide prologue/epilogue to be generated in correct form. */
12643 static void
12644 ix86_finalize_stack_frame_flags (void)
12646 /* Check if stack realignment is really needed after reload, and
12647 store the result in cfun. */
12648 unsigned int incoming_stack_boundary
12649 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
12650 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
12651 unsigned int stack_alignment
12652 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
12653 ? crtl->max_used_stack_slot_alignment
12654 : crtl->stack_alignment_needed);
12655 unsigned int stack_realign
12656 = (incoming_stack_boundary < stack_alignment);
12657 bool recompute_frame_layout_p = false;
12659 if (crtl->stack_realign_finalized)
12661 /* After stack_realign_needed is finalized, we can no longer
12662 change it. */
12663 gcc_assert (crtl->stack_realign_needed == stack_realign);
12664 return;
12667 /* If the only reason for frame_pointer_needed is that we conservatively
12668 assumed stack realignment might be needed or -fno-omit-frame-pointer
12669 is used, but in the end nothing that needed the stack alignment had
12670 been spilled and there was no stack access, clear frame_pointer_needed and say we
12671 don't need stack realignment. */
12672 if ((stack_realign || !flag_omit_frame_pointer)
12673 && frame_pointer_needed
12674 && crtl->is_leaf
12675 && crtl->sp_is_unchanging
12676 && !ix86_current_function_calls_tls_descriptor
12677 && !crtl->accesses_prior_frames
12678 && !cfun->calls_alloca
12679 && !crtl->calls_eh_return
12680 /* See ira_setup_eliminable_regset for the rationale. */
12681 && !(STACK_CHECK_MOVING_SP
12682 && flag_stack_check
12683 && flag_exceptions
12684 && cfun->can_throw_non_call_exceptions)
12685 && !ix86_frame_pointer_required ()
12686 && get_frame_size () == 0
12687 && ix86_nsaved_sseregs () == 0
12688 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
12690 HARD_REG_SET set_up_by_prologue, prologue_used;
12691 basic_block bb;
12693 CLEAR_HARD_REG_SET (prologue_used);
12694 CLEAR_HARD_REG_SET (set_up_by_prologue);
12695 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
12696 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
12697 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
12698 HARD_FRAME_POINTER_REGNUM);
12700 /* The preferred stack alignment is the minimum stack alignment. */
12701 if (stack_alignment > crtl->preferred_stack_boundary)
12702 stack_alignment = crtl->preferred_stack_boundary;
12704 bool require_stack_frame = false;
12706 FOR_EACH_BB_FN (bb, cfun)
12708 rtx_insn *insn;
12709 FOR_BB_INSNS (bb, insn)
12710 if (NONDEBUG_INSN_P (insn)
12711 && requires_stack_frame_p (insn, prologue_used,
12712 set_up_by_prologue))
12714 require_stack_frame = true;
12716 if (stack_realign)
12718 /* Find the maximum stack alignment. */
12719 subrtx_iterator::array_type array;
12720 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
12721 if (MEM_P (*iter)
12722 && (reg_mentioned_p (stack_pointer_rtx,
12723 *iter)
12724 || reg_mentioned_p (frame_pointer_rtx,
12725 *iter)))
12727 unsigned int alignment = MEM_ALIGN (*iter);
12728 if (alignment > stack_alignment)
12729 stack_alignment = alignment;
12735 if (require_stack_frame)
12737 /* Stack frame is required. If stack alignment needed is less
12738 than incoming stack boundary, don't realign stack. */
12739 stack_realign = incoming_stack_boundary < stack_alignment;
12740 if (!stack_realign)
12742 crtl->max_used_stack_slot_alignment
12743 = incoming_stack_boundary;
12744 crtl->stack_alignment_needed
12745 = incoming_stack_boundary;
12746 /* Also update preferred_stack_boundary for leaf
12747 functions. */
12748 crtl->preferred_stack_boundary
12749 = incoming_stack_boundary;
12752 else
12754 /* If drap has been set, but it actually isn't live at the
12755 start of the function, there is no reason to set it up. */
12756 if (crtl->drap_reg)
12758 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12759 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
12760 REGNO (crtl->drap_reg)))
12762 crtl->drap_reg = NULL_RTX;
12763 crtl->need_drap = false;
12766 else
12767 cfun->machine->no_drap_save_restore = true;
12769 frame_pointer_needed = false;
12770 stack_realign = false;
12771 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
12772 crtl->stack_alignment_needed = incoming_stack_boundary;
12773 crtl->stack_alignment_estimated = incoming_stack_boundary;
12774 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
12775 crtl->preferred_stack_boundary = incoming_stack_boundary;
12776 df_finish_pass (true);
12777 df_scan_alloc (NULL);
12778 df_scan_blocks ();
12779 df_compute_regs_ever_live (true);
12780 df_analyze ();
12782 if (flag_var_tracking)
12784 /* Since frame pointer is no longer available, replace it with
12785 stack pointer - UNITS_PER_WORD in debug insns. */
12786 df_ref ref, next;
12787 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
12788 ref; ref = next)
12790 next = DF_REF_NEXT_REG (ref);
12791 if (!DF_REF_INSN_INFO (ref))
12792 continue;
12794 /* Make sure the next ref is for a different instruction,
12795 so that we're not affected by the rescan. */
12796 rtx_insn *insn = DF_REF_INSN (ref);
12797 while (next && DF_REF_INSN (next) == insn)
12798 next = DF_REF_NEXT_REG (next);
12800 if (DEBUG_INSN_P (insn))
12802 bool changed = false;
12803 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
12805 rtx *loc = DF_REF_LOC (ref);
12806 if (*loc == hard_frame_pointer_rtx)
12808 *loc = plus_constant (Pmode,
12809 stack_pointer_rtx,
12810 -UNITS_PER_WORD);
12811 changed = true;
12814 if (changed)
12815 df_insn_rescan (insn);
12820 recompute_frame_layout_p = true;
12824 if (crtl->stack_realign_needed != stack_realign)
12825 recompute_frame_layout_p = true;
12826 crtl->stack_realign_needed = stack_realign;
12827 crtl->stack_realign_finalized = true;
12828 if (recompute_frame_layout_p)
12829 ix86_compute_frame_layout ();
12832 /* Delete SET_GOT right after entry block if it is allocated to reg. */
12834 static void
12835 ix86_elim_entry_set_got (rtx reg)
12837 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12838 rtx_insn *c_insn = BB_HEAD (bb);
12839 if (!NONDEBUG_INSN_P (c_insn))
12840 c_insn = next_nonnote_nondebug_insn (c_insn);
12841 if (c_insn && NONJUMP_INSN_P (c_insn))
12843 rtx pat = PATTERN (c_insn);
12844 if (GET_CODE (pat) == PARALLEL)
12846 rtx vec = XVECEXP (pat, 0, 0);
12847 if (GET_CODE (vec) == SET
12848 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
12849 && REGNO (XEXP (vec, 0)) == REGNO (reg))
12850 delete_insn (c_insn);
12855 static rtx
12856 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
12858 rtx addr, mem;
12860 if (offset)
12861 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
12862 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
12863 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
12866 static inline rtx
12867 gen_frame_load (rtx reg, rtx frame_reg, int offset)
12869 return gen_frame_set (reg, frame_reg, offset, false);
12872 static inline rtx
12873 gen_frame_store (rtx reg, rtx frame_reg, int offset)
12875 return gen_frame_set (reg, frame_reg, offset, true);
12878 static void
12879 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
12881 struct machine_function *m = cfun->machine;
12882 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
12883 + m->call_ms2sysv_extra_regs;
12884 rtvec v = rtvec_alloc (ncregs + 1);
12885 unsigned int align, i, vi = 0;
12886 rtx_insn *insn;
12887 rtx sym, addr;
12888 rtx rax = gen_rtx_REG (word_mode, AX_REG);
12889 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12891 /* AL should only be live with sysv_abi. */
12892 gcc_assert (!ix86_eax_live_at_start_p ());
12893 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
12895 /* Set up RAX as the stub's base pointer. We use stack_realign_offset whether
12896 we've actually realigned the stack or not. */
12897 align = GET_MODE_ALIGNMENT (V4SFmode);
12898 addr = choose_baseaddr (frame.stack_realign_offset
12899 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
12900 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
12902 emit_insn (gen_rtx_SET (rax, addr));
12904 /* Get the stub symbol. */
12905 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
12906 : XLOGUE_STUB_SAVE);
12907 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
12909 for (i = 0; i < ncregs; ++i)
12911 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
12912 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
12913 r.regno);
12914 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
12917 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
12919 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
12920 RTX_FRAME_RELATED_P (insn) = true;
12923 /* Expand the prologue into a bunch of separate insns. */
12925 void
12926 ix86_expand_prologue (void)
12928 struct machine_function *m = cfun->machine;
12929 rtx insn, t;
12930 struct ix86_frame frame;
12931 HOST_WIDE_INT allocate;
12932 bool int_registers_saved;
12933 bool sse_registers_saved;
12934 bool save_stub_call_needed;
12935 rtx static_chain = NULL_RTX;
12937 if (ix86_function_naked (current_function_decl))
12938 return;
12940 ix86_finalize_stack_frame_flags ();
12942 /* DRAP should not coexist with stack_realign_fp */
12943 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
12945 memset (&m->fs, 0, sizeof (m->fs));
12947 /* Initialize CFA state for before the prologue. */
12948 m->fs.cfa_reg = stack_pointer_rtx;
12949 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
12951 /* Track SP offset to the CFA. We continue tracking this after we've
12952 swapped the CFA register away from SP. In the case of re-alignment
12953 this is fudged; we're interested in offsets within the local frame. */
12954 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
12955 m->fs.sp_valid = true;
12956 m->fs.sp_realigned = false;
12958 frame = m->frame;
12960 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
12962 /* We should have already generated an error for any use of
12963 ms_hook on a nested function. */
12964 gcc_checking_assert (!ix86_static_chain_on_stack);
12966 /* Check if profiling is active and we shall use the profiling-before-
12967 prologue variant; if so, sorry. */
12968 if (crtl->profile && flag_fentry != 0)
12969 sorry ("ms_hook_prologue attribute isn%'t compatible "
12970 "with -mfentry for 32-bit");
12972 /* In ix86_asm_output_function_label we emitted:
12973 8b ff movl.s %edi,%edi
12974 55 push %ebp
12975 8b ec movl.s %esp,%ebp
12977 This matches the hookable function prologue in Win32 API
12978 functions in Microsoft Windows XP Service Pack 2 and newer.
12979 Wine uses this to enable Windows apps to hook the Win32 API
12980 functions provided by Wine.
12982 What that means is that we've already set up the frame pointer. */
12984 if (frame_pointer_needed
12985 && !(crtl->drap_reg && crtl->stack_realign_needed))
12987 rtx push, mov;
12989 /* We've decided to use the frame pointer already set up.
12990 Describe this to the unwinder by pretending that both
12991 push and mov insns happen right here.
12993 Putting the unwind info here at the end of the ms_hook
12994 is done so that we can make absolutely certain we get
12995 the required byte sequence at the start of the function,
12996 rather than relying on an assembler that can produce
12997 the exact encoding required.
12999 However it does mean (in the unpatched case) that we have
13000 a 1 insn window where the asynchronous unwind info is
13001 incorrect. However, if we placed the unwind info at
13002 its correct location we would have incorrect unwind info
13003 in the patched case. Which is probably all moot since
13004 I don't expect Wine generates dwarf2 unwind info for the
13005 system libraries that use this feature. */
13007 insn = emit_insn (gen_blockage ());
13009 push = gen_push (hard_frame_pointer_rtx);
13010 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13011 stack_pointer_rtx);
13012 RTX_FRAME_RELATED_P (push) = 1;
13013 RTX_FRAME_RELATED_P (mov) = 1;
13015 RTX_FRAME_RELATED_P (insn) = 1;
13016 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13017 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13019 /* Note that gen_push incremented m->fs.cfa_offset, even
13020 though we didn't emit the push insn here. */
13021 m->fs.cfa_reg = hard_frame_pointer_rtx;
13022 m->fs.fp_offset = m->fs.cfa_offset;
13023 m->fs.fp_valid = true;
13025 else
13027 /* The frame pointer is not needed so pop %ebp again.
13028 This leaves us with a pristine state. */
13029 emit_insn (gen_pop (hard_frame_pointer_rtx));
13033 /* The first insn of a function that accepts its static chain on the
13034 stack is to push the register that would be filled in by a direct
13035 call. This insn will be skipped by the trampoline. */
13036 else if (ix86_static_chain_on_stack)
13038 static_chain = ix86_static_chain (cfun->decl, false);
13039 insn = emit_insn (gen_push (static_chain));
13040 emit_insn (gen_blockage ());
13042 /* We don't want to interpret this push insn as a register save,
13043 only as a stack adjustment. The real copy of the register as
13044 a save will be done later, if needed. */
13045 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13046 t = gen_rtx_SET (stack_pointer_rtx, t);
13047 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13048 RTX_FRAME_RELATED_P (insn) = 1;
13051 /* Emit prologue code to adjust stack alignment and setup DRAP, in case
13052 DRAP is needed and stack realignment is really needed after reload. */
13053 if (stack_realign_drap)
13055 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13057 /* Can't use DRAP in interrupt function. */
13058 if (cfun->machine->func_type != TYPE_NORMAL)
13059 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13060 "in interrupt service routine. This may be worked "
13061 "around by avoiding functions with aggregate return.");
13063 /* Only need to push the parameter pointer reg if it is callee-saved. */
13064 if (!call_used_regs[REGNO (crtl->drap_reg)])
13066 /* Push arg pointer reg */
13067 insn = emit_insn (gen_push (crtl->drap_reg));
13068 RTX_FRAME_RELATED_P (insn) = 1;
13071 /* Grab the argument pointer. */
13072 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13073 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13074 RTX_FRAME_RELATED_P (insn) = 1;
13075 m->fs.cfa_reg = crtl->drap_reg;
13076 m->fs.cfa_offset = 0;
13078 /* Align the stack. */
13079 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13080 stack_pointer_rtx,
13081 GEN_INT (-align_bytes)));
13082 RTX_FRAME_RELATED_P (insn) = 1;
13084 /* Replicate the return address on the stack so that return
13085 address can be reached via (argp - 1) slot. This is needed
13086 to implement macro RETURN_ADDR_RTX and intrinsic function
13087 expand_builtin_return_addr etc. */
13088 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13089 t = gen_frame_mem (word_mode, t);
13090 insn = emit_insn (gen_push (t));
13091 RTX_FRAME_RELATED_P (insn) = 1;
13093 /* For the purposes of frame and register save area addressing,
13094 we've started over with a new frame. */
13095 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13096 m->fs.realigned = true;
13098 if (static_chain)
13100 /* Replicate static chain on the stack so that static chain
13101 can be reached via (argp - 2) slot. This is needed for
13102 nested function with stack realignment. */
13103 insn = emit_insn (gen_push (static_chain));
13104 RTX_FRAME_RELATED_P (insn) = 1;
13108 int_registers_saved = (frame.nregs == 0);
13109 sse_registers_saved = (frame.nsseregs == 0);
13110 save_stub_call_needed = (m->call_ms2sysv);
13111 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13113 if (frame_pointer_needed && !m->fs.fp_valid)
13115 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13116 slower on all targets. Also sdb didn't like it. */
13117 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13118 RTX_FRAME_RELATED_P (insn) = 1;
13120 /* Push registers now, before setting the frame pointer
13121 on SEH target. */
13122 if (!int_registers_saved
13123 && TARGET_SEH
13124 && !frame.save_regs_using_mov)
13126 ix86_emit_save_regs ();
13127 int_registers_saved = true;
13128 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13131 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13133 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13134 RTX_FRAME_RELATED_P (insn) = 1;
13136 if (m->fs.cfa_reg == stack_pointer_rtx)
13137 m->fs.cfa_reg = hard_frame_pointer_rtx;
13138 m->fs.fp_offset = m->fs.sp_offset;
13139 m->fs.fp_valid = true;
13143 if (!int_registers_saved)
13145 /* If saving registers via PUSH, do so now. */
13146 if (!frame.save_regs_using_mov)
13148 ix86_emit_save_regs ();
13149 int_registers_saved = true;
13150 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13153 /* When using the red zone we may start register saving before allocating
13154 the stack frame, saving one cycle of the prologue. However, avoid
13155 doing this if we have to probe the stack; at least on x86_64 the
13156 stack probe can turn into a call that clobbers a red zone location. */
13157 else if (ix86_using_red_zone ()
13158 && (! TARGET_STACK_PROBE
13159 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13161 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13162 int_registers_saved = true;
13166 if (stack_realign_fp)
13168 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13169 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13171 /* Record last valid frame pointer offset. */
13172 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13174 /* The computation of the size of the re-aligned stack frame means
13175 that we must allocate the size of the register save area before
13176 performing the actual alignment. Otherwise we cannot guarantee
13177 that there's enough storage above the realignment point. */
13178 allocate = frame.reg_save_offset - m->fs.sp_offset
13179 + frame.stack_realign_allocate;
13180 if (allocate)
13181 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13182 GEN_INT (-allocate), -1, false);
13184 /* Align the stack. */
13185 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13186 stack_pointer_rtx,
13187 GEN_INT (-align_bytes)));
13188 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13189 m->fs.sp_realigned_offset = m->fs.sp_offset
13190 - frame.stack_realign_allocate;
13191 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13192 Beyond this point, stack access should be done via choose_baseaddr or
13193 by using sp_valid_at and fp_valid_at to determine the correct base
13194 register. Henceforth, any CFA offset should be thought of as logical
13195 and not physical. */
13196 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13197 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13198 m->fs.sp_realigned = true;
13200 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13201 is needed to describe where a register is saved using a realigned
13202 stack pointer, so we need to invalidate the stack pointer for that
13203 target. */
13204 if (TARGET_SEH)
13205 m->fs.sp_valid = false;
13207 /* If SP offset is non-immediate after allocation of the stack frame,
13208 then emit SSE saves or stub call prior to allocating the rest of the
13209 stack frame. This is less efficient for the out-of-line stub because
13210 we can't combine allocations across the call barrier, but it's better
13211 than using a scratch register. */
13212 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13213 - m->fs.sp_realigned_offset),
13214 Pmode))
13216 if (!sse_registers_saved)
13218 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13219 sse_registers_saved = true;
13221 else if (save_stub_call_needed)
13223 ix86_emit_outlined_ms2sysv_save (frame);
13224 save_stub_call_needed = false;
13229 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13231 if (flag_stack_usage_info)
13233 /* We start to count from ARG_POINTER. */
13234 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13236 /* If it was realigned, take into account the fake frame. */
13237 if (stack_realign_drap)
13239 if (ix86_static_chain_on_stack)
13240 stack_size += UNITS_PER_WORD;
13242 if (!call_used_regs[REGNO (crtl->drap_reg)])
13243 stack_size += UNITS_PER_WORD;
13245 /* This over-estimates by 1 minimal-stack-alignment-unit but
13246 mitigates that by counting in the new return address slot. */
13247 current_function_dynamic_stack_size
13248 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13251 current_function_static_stack_size = stack_size;
13254 /* On SEH target with very large frame size, allocate an area to save
13255 SSE registers (as the very large allocation won't be described). */
13256 if (TARGET_SEH
13257 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13258 && !sse_registers_saved)
13260 HOST_WIDE_INT sse_size =
13261 frame.sse_reg_save_offset - frame.reg_save_offset;
13263 gcc_assert (int_registers_saved);
13265 /* No need to do stack checking as the area will be immediately
13266 written. */
13267 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13268 GEN_INT (-sse_size), -1,
13269 m->fs.cfa_reg == stack_pointer_rtx);
13270 allocate -= sse_size;
13271 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13272 sse_registers_saved = true;
13275 /* The stack has already been decremented by the instruction calling us
13276 so probe if the size is non-negative to preserve the protection area. */
13277 if (allocate >= 0
13278 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13279 || flag_stack_clash_protection))
13281 /* This assert wants to verify that integer registers were saved
13282 prior to probing. This is necessary when probing may be implemented
13283 as a function call (Windows). It is not necessary for stack clash
13284 protection probing. */
13285 if (!flag_stack_clash_protection)
13286 gcc_assert (int_registers_saved);
13288 if (flag_stack_clash_protection)
13290 ix86_adjust_stack_and_probe_stack_clash (allocate);
13291 allocate = 0;
13293 else if (STACK_CHECK_MOVING_SP)
13295 if (!(crtl->is_leaf && !cfun->calls_alloca
13296 && allocate <= get_probe_interval ()))
13298 ix86_adjust_stack_and_probe (allocate);
13299 allocate = 0;
13302 else
13304 HOST_WIDE_INT size = allocate;
13306 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13307 size = 0x80000000 - get_stack_check_protect () - 1;
13309 if (TARGET_STACK_PROBE)
13311 if (crtl->is_leaf && !cfun->calls_alloca)
13313 if (size > get_probe_interval ())
13314 ix86_emit_probe_stack_range (0, size);
13316 else
13317 ix86_emit_probe_stack_range (0,
13318 size + get_stack_check_protect ());
13320 else
13322 if (crtl->is_leaf && !cfun->calls_alloca)
13324 if (size > get_probe_interval ()
13325 && size > get_stack_check_protect ())
13326 ix86_emit_probe_stack_range (get_stack_check_protect (),
13327 size - get_stack_check_protect ());
13329 else
13330 ix86_emit_probe_stack_range (get_stack_check_protect (), size);
13335 if (allocate == 0)
13337 else if (!ix86_target_stack_probe ()
13338 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13340 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13341 GEN_INT (-allocate), -1,
13342 m->fs.cfa_reg == stack_pointer_rtx);
13344 else
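/* A stack probe is required for an allocation this large.  EAX (and R10
   when the static chain is live) are used as scratch registers; if their
   values are live they are first pushed and later reloaded from the new
   stack top.  The allocate_stack worker probes the pages, and the stack
   pointer is then adjusted by the amount held in EAX.  */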
13346 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13347 rtx r10 = NULL;
13348 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13349 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13350 bool eax_live = ix86_eax_live_at_start_p ();
13351 bool r10_live = false;
13353 if (TARGET_64BIT)
13354 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13356 if (eax_live)
13358 insn = emit_insn (gen_push (eax));
13359 allocate -= UNITS_PER_WORD;
13360 /* Note that SEH directives need to continue tracking the stack
13361 pointer even after the frame pointer has been set up. */
13362 if (sp_is_cfa_reg || TARGET_SEH)
13364 if (sp_is_cfa_reg)
13365 m->fs.cfa_offset += UNITS_PER_WORD;
13366 RTX_FRAME_RELATED_P (insn) = 1;
13367 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13368 gen_rtx_SET (stack_pointer_rtx,
13369 plus_constant (Pmode, stack_pointer_rtx,
13370 -UNITS_PER_WORD)));
13374 if (r10_live)
13376 r10 = gen_rtx_REG (Pmode, R10_REG);
13377 insn = emit_insn (gen_push (r10));
13378 allocate -= UNITS_PER_WORD;
13379 if (sp_is_cfa_reg || TARGET_SEH)
13381 if (sp_is_cfa_reg)
13382 m->fs.cfa_offset += UNITS_PER_WORD;
13383 RTX_FRAME_RELATED_P (insn) = 1;
13384 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13385 gen_rtx_SET (stack_pointer_rtx,
13386 plus_constant (Pmode, stack_pointer_rtx,
13387 -UNITS_PER_WORD)));
13391 emit_move_insn (eax, GEN_INT (allocate));
13392 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13394 /* Use the fact that AX still contains ALLOCATE. */
13395 adjust_stack_insn = (Pmode == DImode
13396 ? gen_pro_epilogue_adjust_stack_di_sub
13397 : gen_pro_epilogue_adjust_stack_si_sub);
13399 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13400 stack_pointer_rtx, eax));
13402 if (sp_is_cfa_reg || TARGET_SEH)
13404 if (sp_is_cfa_reg)
13405 m->fs.cfa_offset += allocate;
13406 RTX_FRAME_RELATED_P (insn) = 1;
13407 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13408 gen_rtx_SET (stack_pointer_rtx,
13409 plus_constant (Pmode, stack_pointer_rtx,
13410 -allocate)));
13412 m->fs.sp_offset += allocate;
13414 /* Use stack_pointer_rtx for relative addressing so that code
13415 works for realigned stack, too. */
13416 if (r10_live && eax_live)
13418 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13419 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13420 gen_frame_mem (word_mode, t));
13421 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13422 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13423 gen_frame_mem (word_mode, t));
13425 else if (eax_live || r10_live)
13427 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13428 emit_move_insn (gen_rtx_REG (word_mode,
13429 (eax_live ? AX_REG : R10_REG)),
13430 gen_frame_mem (word_mode, t));
13433 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13435 /* If we haven't already set up the frame pointer, do so now. */
13436 if (frame_pointer_needed && !m->fs.fp_valid)
13438 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13439 GEN_INT (frame.stack_pointer_offset
13440 - frame.hard_frame_pointer_offset));
13441 insn = emit_insn (insn);
13442 RTX_FRAME_RELATED_P (insn) = 1;
13443 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13445 if (m->fs.cfa_reg == stack_pointer_rtx)
13446 m->fs.cfa_reg = hard_frame_pointer_rtx;
13447 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13448 m->fs.fp_valid = true;
13451 if (!int_registers_saved)
13452 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13453 if (!sse_registers_saved)
13454 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13455 else if (save_stub_call_needed)
13456 ix86_emit_outlined_ms2sysv_save (frame);
13458 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
13459 in the prologue. */
13460 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13462 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13463 insn = emit_insn (gen_set_got (pic));
13464 RTX_FRAME_RELATED_P (insn) = 1;
13465 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13466 emit_insn (gen_prologue_use (pic));
13467 /* Delete the already emitted SET_GOT, if it exists and is allocated to
13468 REAL_PIC_OFFSET_TABLE_REGNUM. */
13469 ix86_elim_entry_set_got (pic);
13472 if (crtl->drap_reg && !crtl->stack_realign_needed)
13474 /* vDRAP is set up, but after reload it turns out stack realignment
13475 isn't necessary; emit the prologue code to set up DRAP
13476 without the stack realignment adjustment. */
13477 t = choose_baseaddr (0, NULL);
13478 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13481 /* Prevent instructions from being scheduled into register save push
13482 sequence when access to the redzone area is done through frame pointer.
13483 The offset between the frame pointer and the stack pointer is calculated
13484 relative to the value of the stack pointer at the end of the function
13485 prologue, and moving instructions that access redzone area via frame
13486 pointer inside push sequence violates this assumption. */
13487 if (frame_pointer_needed && frame.red_zone_size)
13488 emit_insn (gen_memory_blockage ());
13490 /* SEH requires that the prologue end within 256 bytes of the start of
13491 the function. Prevent instruction schedules that would extend that.
13492 Further, prevent alloca modifications to the stack pointer from being
13493 combined with prologue modifications. */
13494 if (TARGET_SEH)
13495 emit_insn (gen_prologue_use (stack_pointer_rtx));
13498 /* Emit code to restore REG using a POP insn. */
13500 static void
13501 ix86_emit_restore_reg_using_pop (rtx reg)
13503 struct machine_function *m = cfun->machine;
13504 rtx_insn *insn = emit_insn (gen_pop (reg));
13506 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
13507 m->fs.sp_offset -= UNITS_PER_WORD;
13509 if (m->fs.cfa_reg == crtl->drap_reg
13510 && REGNO (reg) == REGNO (crtl->drap_reg))
13512 /* Previously we'd represented the CFA as an expression
13513 like *(%ebp - 8). We've just popped that value from
13514 the stack, which means we need to reset the CFA to
13515 the drap register. This will remain until we restore
13516 the stack pointer. */
13517 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13518 RTX_FRAME_RELATED_P (insn) = 1;
13520 /* This means that the DRAP register is valid for addressing too. */
13521 m->fs.drap_valid = true;
13522 return;
13525 if (m->fs.cfa_reg == stack_pointer_rtx)
13527 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
13528 x = gen_rtx_SET (stack_pointer_rtx, x);
13529 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
13530 RTX_FRAME_RELATED_P (insn) = 1;
13532 m->fs.cfa_offset -= UNITS_PER_WORD;
13535 /* When the frame pointer is the CFA, and we pop it, we are
13536 swapping back to the stack pointer as the CFA. This happens
13537 for stack frames that don't allocate other data, so we assume
13538 the stack pointer is now pointing at the return address, i.e.
13539 the function entry state, which makes the offset be 1 word. */
13540 if (reg == hard_frame_pointer_rtx)
13542 m->fs.fp_valid = false;
13543 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13545 m->fs.cfa_reg = stack_pointer_rtx;
13546 m->fs.cfa_offset -= UNITS_PER_WORD;
13548 add_reg_note (insn, REG_CFA_DEF_CFA,
13549 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
13550 GEN_INT (m->fs.cfa_offset)));
13551 RTX_FRAME_RELATED_P (insn) = 1;
13556 /* Emit code to restore saved registers using POP insns. */
13558 static void
13559 ix86_emit_restore_regs_using_pop (void)
13561 unsigned int regno;
13563 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13564 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
13565 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
13568 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
13569 omits the emit and only attaches the notes. */
13571 static void
13572 ix86_emit_leave (rtx_insn *insn)
13574 struct machine_function *m = cfun->machine;
13575 if (!insn)
13576 insn = emit_insn (ix86_gen_leave ());
13578 ix86_add_queued_cfa_restore_notes (insn);
13580 gcc_assert (m->fs.fp_valid);
13581 m->fs.sp_valid = true;
13582 m->fs.sp_realigned = false;
13583 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
13584 m->fs.fp_valid = false;
13586 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13588 m->fs.cfa_reg = stack_pointer_rtx;
13589 m->fs.cfa_offset = m->fs.sp_offset;
13591 add_reg_note (insn, REG_CFA_DEF_CFA,
13592 plus_constant (Pmode, stack_pointer_rtx,
13593 m->fs.sp_offset));
13594 RTX_FRAME_RELATED_P (insn) = 1;
13596 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
13597 m->fs.fp_offset);
13600 /* Emit code to restore saved registers using MOV insns.
13601 First register is restored from CFA - CFA_OFFSET. */
13602 static void
13603 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
13604 bool maybe_eh_return)
13606 struct machine_function *m = cfun->machine;
13607 unsigned int regno;
13609 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13610 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13612 rtx reg = gen_rtx_REG (word_mode, regno);
13613 rtx mem;
13614 rtx_insn *insn;
13616 mem = choose_baseaddr (cfa_offset, NULL);
13617 mem = gen_frame_mem (word_mode, mem);
13618 insn = emit_move_insn (reg, mem);
13620 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
13622 /* Previously we'd represented the CFA as an expression
13623 like *(%ebp - 8). We've just popped that value from
13624 the stack, which means we need to reset the CFA to
13625 the drap register. This will remain until we restore
13626 the stack pointer. */
13627 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13628 RTX_FRAME_RELATED_P (insn) = 1;
13630 /* This means that the DRAP register is valid for addressing. */
13631 m->fs.drap_valid = true;
13633 else
13634 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13636 cfa_offset -= UNITS_PER_WORD;
13640 /* Emit code to restore saved SSE registers using MOV insns.
13641 First register is restored from CFA - CFA_OFFSET. */
13642 static void
13643 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
13644 bool maybe_eh_return)
13646 unsigned int regno;
13648 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13649 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13651 rtx reg = gen_rtx_REG (V4SFmode, regno);
13652 rtx mem;
13653 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
13655 mem = choose_baseaddr (cfa_offset, &align);
13656 mem = gen_rtx_MEM (V4SFmode, mem);
13658 /* The location alignment depends upon the base register. */
13659 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
13660 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13661 set_mem_align (mem, align);
13662 emit_insn (gen_rtx_SET (reg, mem));
13664 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13666 cfa_offset -= GET_MODE_SIZE (V4SFmode);
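/* Emit the epilogue restore of the clobbered SysV registers via the
   out-of-line ms2sysv stub.  FRAME describes the frame layout, USE_CALL
   chooses a call to the stub instead of a tail jump to it, and STYLE is
   passed on to pro_epilogue_adjust_stack for any stack adjustment that
   is still needed afterwards.  */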
13670 static void
13671 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
13672 bool use_call, int style)
13674 struct machine_function *m = cfun->machine;
13675 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13676 + m->call_ms2sysv_extra_regs;
13677 rtvec v;
13678 unsigned int elems_needed, align, i, vi = 0;
13679 rtx_insn *insn;
13680 rtx sym, tmp;
13681 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
13682 rtx r10 = NULL_RTX;
13683 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13684 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
13685 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
13686 rtx rsi_frame_load = NULL_RTX;
13687 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
13688 enum xlogue_stub stub;
13690 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
13692 /* If using a realigned stack, we should never start with padding. */
13693 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
13695 /* Set up RSI as the stub's base pointer. */
13696 align = GET_MODE_ALIGNMENT (V4SFmode);
13697 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
13698 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13700 emit_insn (gen_rtx_SET (rsi, tmp));
13702 /* Get a symbol for the stub. */
13703 if (frame_pointer_needed)
13704 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
13705 : XLOGUE_STUB_RESTORE_HFP_TAIL;
13706 else
13707 stub = use_call ? XLOGUE_STUB_RESTORE
13708 : XLOGUE_STUB_RESTORE_TAIL;
13709 sym = xlogue.get_stub_rtx (stub);
13711 elems_needed = ncregs;
13712 if (use_call)
13713 elems_needed += 1;
13714 else
13715 elems_needed += frame_pointer_needed ? 5 : 3;
13716 v = rtvec_alloc (elems_needed);
13718 /* We call the epilogue stub when we need to pop incoming args or we are
13719 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
13720 epilogue stub and it is the tail-call. */
13721 if (use_call)
13722 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13723 else
13725 RTVEC_ELT (v, vi++) = ret_rtx;
13726 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13727 if (frame_pointer_needed)
13729 rtx rbp = gen_rtx_REG (DImode, BP_REG);
13730 gcc_assert (m->fs.fp_valid);
13731 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
13733 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
13734 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
13735 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
13736 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
13737 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
13739 else
13741 /* If no hard frame pointer, we set R10 to the SP restore value. */
13742 gcc_assert (!m->fs.fp_valid);
13743 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
13744 gcc_assert (m->fs.sp_valid);
13746 r10 = gen_rtx_REG (DImode, R10_REG);
13747 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
13748 emit_insn (gen_rtx_SET (r10, tmp));
13750 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
13754 /* Generate frame load insns and restore notes. */
13755 for (i = 0; i < ncregs; ++i)
13757 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13758 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
13759 rtx reg, frame_load;
13761 reg = gen_rtx_REG (mode, r.regno);
13762 frame_load = gen_frame_load (reg, rsi, r.offset);
13764 /* Save RSI frame load insn & note to add last. */
13765 if (r.regno == SI_REG)
13767 gcc_assert (!rsi_frame_load);
13768 rsi_frame_load = frame_load;
13769 rsi_restore_offset = r.offset;
13771 else
13773 RTVEC_ELT (v, vi++) = frame_load;
13774 ix86_add_cfa_restore_note (NULL, reg, r.offset);
13778 /* Add RSI frame load & restore note at the end. */
13779 gcc_assert (rsi_frame_load);
13780 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
13781 RTVEC_ELT (v, vi++) = rsi_frame_load;
13782 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
13783 rsi_restore_offset);
13785 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
13786 if (!use_call && !frame_pointer_needed)
13788 gcc_assert (m->fs.sp_valid);
13789 gcc_assert (!m->fs.sp_realigned);
13791 /* At this point, R10 should point to frame.stack_realign_offset. */
13792 if (m->fs.cfa_reg == stack_pointer_rtx)
13793 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
13794 m->fs.sp_offset = frame.stack_realign_offset;
13797 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
13798 tmp = gen_rtx_PARALLEL (VOIDmode, v);
13799 if (use_call)
13800 insn = emit_insn (tmp);
13801 else
13803 insn = emit_jump_insn (tmp);
13804 JUMP_LABEL (insn) = ret_rtx;
13806 if (frame_pointer_needed)
13807 ix86_emit_leave (insn);
13808 else
13810 /* Need CFA adjust note. */
13811 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
13812 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
13816 RTX_FRAME_RELATED_P (insn) = true;
13817 ix86_add_queued_cfa_restore_notes (insn);
13819 /* If we're not doing a tail-call, we need to adjust the stack. */
13820 if (use_call && m->fs.sp_valid)
13822 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
13823 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13824 GEN_INT (dealloc), style,
13825 m->fs.cfa_reg == stack_pointer_rtx);
13829 /* Restore function stack, frame, and registers. */
13831 void
13832 ix86_expand_epilogue (int style)
13834 struct machine_function *m = cfun->machine;
13835 struct machine_frame_state frame_state_save = m->fs;
13836 struct ix86_frame frame;
13837 bool restore_regs_via_mov;
13838 bool using_drap;
13839 bool restore_stub_is_tail = false;
13841 if (ix86_function_naked (current_function_decl))
13843 /* The program should not reach this point. */
13844 emit_insn (gen_ud2 ());
13845 return;
13848 ix86_finalize_stack_frame_flags ();
13849 frame = m->frame;
13851 m->fs.sp_realigned = stack_realign_fp;
13852 m->fs.sp_valid = stack_realign_fp
13853 || !frame_pointer_needed
13854 || crtl->sp_is_unchanging;
13855 gcc_assert (!m->fs.sp_valid
13856 || m->fs.sp_offset == frame.stack_pointer_offset);
13858 /* The FP must be valid if the frame pointer is present. */
13859 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
13860 gcc_assert (!m->fs.fp_valid
13861 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
13863 /* We must have *some* valid pointer to the stack frame. */
13864 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
13866 /* The DRAP is never valid at this point. */
13867 gcc_assert (!m->fs.drap_valid);
13869 /* See the comment about red zone and frame
13870 pointer usage in ix86_expand_prologue. */
13871 if (frame_pointer_needed && frame.red_zone_size)
13872 emit_insn (gen_memory_blockage ());
13874 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
13875 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
13877 /* Determine the CFA offset of the end of the red-zone. */
13878 m->fs.red_zone_offset = 0;
13879 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
13881 /* The red-zone begins below the return address and error code in
13882 an exception handler. */
13883 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
13885 /* When the register save area is in the aligned portion of
13886 the stack, determine the maximum runtime displacement that
13887 matches up with the aligned frame. */
13888 if (stack_realign_drap)
13889 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
13890 + UNITS_PER_WORD);
13893 /* Special care must be taken for the normal return case of a function
13894 using eh_return: the eax and edx registers are marked as saved, but
13895 not restored along this path. Adjust the save location to match. */
13896 if (crtl->calls_eh_return && style != 2)
13897 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
13899 /* EH_RETURN requires the use of moves to function properly. */
13900 if (crtl->calls_eh_return)
13901 restore_regs_via_mov = true;
13902 /* SEH requires the use of pops to identify the epilogue. */
13903 else if (TARGET_SEH)
13904 restore_regs_via_mov = false;
13905 /* If we're only restoring one register and sp cannot be used then
13906 use a move instruction to restore the register since it's
13907 less work than reloading sp and popping the register. */
13908 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
13909 restore_regs_via_mov = true;
13910 else if (TARGET_EPILOGUE_USING_MOVE
13911 && cfun->machine->use_fast_prologue_epilogue
13912 && (frame.nregs > 1
13913 || m->fs.sp_offset != frame.reg_save_offset))
13914 restore_regs_via_mov = true;
13915 else if (frame_pointer_needed
13916 && !frame.nregs
13917 && m->fs.sp_offset != frame.reg_save_offset)
13918 restore_regs_via_mov = true;
13919 else if (frame_pointer_needed
13920 && TARGET_USE_LEAVE
13921 && cfun->machine->use_fast_prologue_epilogue
13922 && frame.nregs == 1)
13923 restore_regs_via_mov = true;
13924 else
13925 restore_regs_via_mov = false;
13927 if (restore_regs_via_mov || frame.nsseregs)
13929 /* Ensure that the entire register save area is addressable via
13930 the stack pointer, if we will restore SSE regs via sp. */
13931 if (TARGET_64BIT
13932 && m->fs.sp_offset > 0x7fffffff
13933 && sp_valid_at (frame.stack_realign_offset + 1)
13934 && (frame.nsseregs + frame.nregs) != 0)
13936 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13937 GEN_INT (m->fs.sp_offset
13938 - frame.sse_reg_save_offset),
13939 style,
13940 m->fs.cfa_reg == stack_pointer_rtx);
13944 /* If there are any SSE registers to restore, then we have to do it
13945 via moves, since there's obviously no pop for SSE regs. */
13946 if (frame.nsseregs)
13947 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
13948 style == 2);
13950 if (m->call_ms2sysv)
13952 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
13954 /* We cannot use a tail-call for the stub if:
13955 1. We have to pop incoming args,
13956 2. We have additional int regs to restore, or
13957 3. A sibling call will be the tail-call, or
13958 4. We are emitting an eh_return_internal epilogue.
13960 TODO: Item 4 has not yet been tested!
13962 If any of the above are true, we will call the stub rather than
13963 jump to it. */
13964 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
13965 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
13968 /* If using an out-of-line stub that is a tail-call, then... */
13969 if (m->call_ms2sysv && restore_stub_is_tail)
13971 /* TODO: paranoid tests. (remove eventually) */
13972 gcc_assert (m->fs.sp_valid);
13973 gcc_assert (!m->fs.sp_realigned);
13974 gcc_assert (!m->fs.fp_valid);
13975 gcc_assert (!m->fs.realigned);
13976 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
13977 gcc_assert (!crtl->drap_reg);
13978 gcc_assert (!frame.nregs);
13980 else if (restore_regs_via_mov)
13982 rtx t;
13984 if (frame.nregs)
13985 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
13987 /* eh_return epilogues need %ecx added to the stack pointer. */
13988 if (style == 2)
13990 rtx sa = EH_RETURN_STACKADJ_RTX;
13991 rtx_insn *insn;
13993 /* %ecx can't be used for both DRAP register and eh_return. */
13994 if (crtl->drap_reg)
13995 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
13997 /* regparm nested functions don't work with eh_return. */
13998 gcc_assert (!ix86_static_chain_on_stack);
14000 if (frame_pointer_needed)
14002 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14003 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14004 emit_insn (gen_rtx_SET (sa, t));
14006 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14007 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14009 /* Note that we use SA as a temporary CFA, as the return
14010 address is at the proper place relative to it. We
14011 pretend this happens at the FP restore insn because
14012 prior to this insn the FP would be stored at the wrong
14013 offset relative to SA, and after this insn we have no
14014 other reasonable register to use for the CFA. We don't
14015 bother resetting the CFA to the SP for the duration of
14016 the return insn, unless the control flow instrumentation
14017 is done. In this case the SP is used later and we have
14018 to reset CFA to SP. */
14019 add_reg_note (insn, REG_CFA_DEF_CFA,
14020 plus_constant (Pmode, sa, UNITS_PER_WORD));
14021 ix86_add_queued_cfa_restore_notes (insn);
14022 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14023 RTX_FRAME_RELATED_P (insn) = 1;
14025 m->fs.cfa_reg = sa;
14026 m->fs.cfa_offset = UNITS_PER_WORD;
14027 m->fs.fp_valid = false;
14029 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14030 const0_rtx, style,
14031 flag_cf_protection);
14033 else
14035 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14036 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14037 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14038 ix86_add_queued_cfa_restore_notes (insn);
14040 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14041 if (m->fs.cfa_offset != UNITS_PER_WORD)
14043 m->fs.cfa_offset = UNITS_PER_WORD;
14044 add_reg_note (insn, REG_CFA_DEF_CFA,
14045 plus_constant (Pmode, stack_pointer_rtx,
14046 UNITS_PER_WORD));
14047 RTX_FRAME_RELATED_P (insn) = 1;
14050 m->fs.sp_offset = UNITS_PER_WORD;
14051 m->fs.sp_valid = true;
14052 m->fs.sp_realigned = false;
14055 else
14057 /* SEH requires that the function end with (1) a stack adjustment
14058 if necessary, (2) a sequence of pops, and (3) a return or
14059 jump instruction. Prevent insns from the function body from
14060 being scheduled into this sequence. */
14061 if (TARGET_SEH)
14063 /* Prevent a catch region from being adjacent to the standard
14064 epilogue sequence. Unfortunately crtl->uses_eh_lsda and
14065 several other flags that would be interesting to test are
14066 not yet set up. */
14067 if (flag_non_call_exceptions)
14068 emit_insn (gen_nops (const1_rtx));
14069 else
14070 emit_insn (gen_blockage ());
14073 /* First step is to deallocate the stack frame so that we can
14074 pop the registers. If the stack pointer was realigned, it needs
14075 to be restored now. Also do it on SEH target for very large
14076 frame as the emitted instructions aren't allowed by the ABI
14077 in epilogues. */
14078 if (!m->fs.sp_valid || m->fs.sp_realigned
14079 || (TARGET_SEH
14080 && (m->fs.sp_offset - frame.reg_save_offset
14081 >= SEH_MAX_FRAME_SIZE)))
14083 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14084 GEN_INT (m->fs.fp_offset
14085 - frame.reg_save_offset),
14086 style, false);
14088 else if (m->fs.sp_offset != frame.reg_save_offset)
14090 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14091 GEN_INT (m->fs.sp_offset
14092 - frame.reg_save_offset),
14093 style,
14094 m->fs.cfa_reg == stack_pointer_rtx);
14097 ix86_emit_restore_regs_using_pop ();
14100 /* If we used a frame pointer and haven't already got rid of it,
14101 then do so now. */
14102 if (m->fs.fp_valid)
14104 /* If the stack pointer is valid and pointing at the frame
14105 pointer store address, then we only need a pop. */
14106 if (sp_valid_at (frame.hfp_save_offset)
14107 && m->fs.sp_offset == frame.hfp_save_offset)
14108 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14109 /* Leave results in shorter dependency chains on CPUs that are
14110 able to grok it fast. */
14111 else if (TARGET_USE_LEAVE
14112 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14113 || !cfun->machine->use_fast_prologue_epilogue)
14114 ix86_emit_leave (NULL);
14115 else
14117 pro_epilogue_adjust_stack (stack_pointer_rtx,
14118 hard_frame_pointer_rtx,
14119 const0_rtx, style, !using_drap);
14120 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14124 if (using_drap)
14126 int param_ptr_offset = UNITS_PER_WORD;
14127 rtx_insn *insn;
14129 gcc_assert (stack_realign_drap);
14131 if (ix86_static_chain_on_stack)
14132 param_ptr_offset += UNITS_PER_WORD;
14133 if (!call_used_regs[REGNO (crtl->drap_reg)])
14134 param_ptr_offset += UNITS_PER_WORD;
14136 insn = emit_insn (gen_rtx_SET
14137 (stack_pointer_rtx,
14138 gen_rtx_PLUS (Pmode,
14139 crtl->drap_reg,
14140 GEN_INT (-param_ptr_offset))));
14141 m->fs.cfa_reg = stack_pointer_rtx;
14142 m->fs.cfa_offset = param_ptr_offset;
14143 m->fs.sp_offset = param_ptr_offset;
14144 m->fs.realigned = false;
14146 add_reg_note (insn, REG_CFA_DEF_CFA,
14147 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14148 GEN_INT (param_ptr_offset)));
14149 RTX_FRAME_RELATED_P (insn) = 1;
14151 if (!call_used_regs[REGNO (crtl->drap_reg)])
14152 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14155 /* At this point the stack pointer must be valid, and we must have
14156 restored all of the registers. We may not have deallocated the
14157 entire stack frame. We've delayed this until now because it may
14158 be possible to merge the local stack deallocation with the
14159 deallocation forced by ix86_static_chain_on_stack. */
14160 gcc_assert (m->fs.sp_valid);
14161 gcc_assert (!m->fs.sp_realigned);
14162 gcc_assert (!m->fs.fp_valid);
14163 gcc_assert (!m->fs.realigned);
14164 if (m->fs.sp_offset != UNITS_PER_WORD)
14166 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14167 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14168 style, true);
14170 else
14171 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14173 /* Sibcall epilogues don't want a return instruction. */
14174 if (style == 0)
14176 m->fs = frame_state_save;
14177 return;
14180 if (cfun->machine->func_type != TYPE_NORMAL)
14181 emit_jump_insn (gen_interrupt_return ());
14182 else if (crtl->args.pops_args && crtl->args.size)
14184 rtx popc = GEN_INT (crtl->args.pops_args);
14186 /* i386 can only pop 64K bytes. If asked to pop more, pop return
14187 address, do explicit add, and jump indirectly to the caller. */
14189 if (crtl->args.pops_args >= 65536)
14191 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14192 rtx_insn *insn;
14194 /* There is no "pascal" calling convention in any 64bit ABI. */
14195 gcc_assert (!TARGET_64BIT);
14197 insn = emit_insn (gen_pop (ecx));
14198 m->fs.cfa_offset -= UNITS_PER_WORD;
14199 m->fs.sp_offset -= UNITS_PER_WORD;
14201 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14202 x = gen_rtx_SET (stack_pointer_rtx, x);
14203 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14204 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14205 RTX_FRAME_RELATED_P (insn) = 1;
14207 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14208 popc, -1, true);
14209 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14211 else
14212 emit_jump_insn (gen_simple_return_pop_internal (popc));
14214 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14216 /* In case of return from EH a simple return cannot be used
14217 as a return address will be compared with a shadow stack
14218 return address. Use indirect jump instead. */
14219 if (style == 2 && flag_cf_protection)
14221 /* Register used in indirect jump must be in word_mode. But
14222 Pmode may not be the same as word_mode for x32. */
14223 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14224 rtx_insn *insn;
14226 insn = emit_insn (gen_pop (ecx));
14227 m->fs.cfa_offset -= UNITS_PER_WORD;
14228 m->fs.sp_offset -= UNITS_PER_WORD;
14230 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14231 x = gen_rtx_SET (stack_pointer_rtx, x);
14232 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14233 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14234 RTX_FRAME_RELATED_P (insn) = 1;
14236 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14238 else
14239 emit_jump_insn (gen_simple_return_internal ());
14242 /* Restore the state back to the state from the prologue,
14243 so that it's correct for the next epilogue. */
14244 m->fs = frame_state_save;
14247 /* Reset from the function's potential modifications. */
14249 static void
14250 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14252 if (pic_offset_table_rtx
14253 && !ix86_use_pseudo_pic_reg ())
14254 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14256 if (TARGET_MACHO)
14258 rtx_insn *insn = get_last_insn ();
14259 rtx_insn *deleted_debug_label = NULL;
14261 /* Mach-O doesn't support labels at the end of objects, so if
14262 it looks like we might want one, take special action.
14263 First, collect any sequence of deleted debug labels. */
14264 while (insn
14265 && NOTE_P (insn)
14266 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14268 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14269 notes only, instead set their CODE_LABEL_NUMBER to -1,
14270 otherwise there would be code generation differences
14271 between -g and -g0. */
14272 if (NOTE_P (insn) && NOTE_KIND (insn)
14273 == NOTE_INSN_DELETED_DEBUG_LABEL)
14274 deleted_debug_label = insn;
14275 insn = PREV_INSN (insn);
14278 /* If we have:
14279 label:
14280 barrier
14281 then this needs to be detected, so skip past the barrier. */
14283 if (insn && BARRIER_P (insn))
14284 insn = PREV_INSN (insn);
14286 /* Up to now we've only seen notes or barriers. */
14287 if (insn)
14289 if (LABEL_P (insn)
14290 || (NOTE_P (insn)
14291 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14292 /* Trailing label. */
14293 fputs ("\tnop\n", file);
14294 else if (cfun && ! cfun->is_thunk)
14296 /* See if we have a completely empty function body, skipping
14297 the special case of the picbase thunk emitted as asm. */
14298 while (insn && ! INSN_P (insn))
14299 insn = PREV_INSN (insn);
14300 /* If we don't find any insns, we've got an empty function body;
14301 i.e. completely empty - without a return or branch. This is
14302 taken as the case where a function body has been removed
14303 because it contains an inline __builtin_unreachable(). GCC
14304 declares that reaching __builtin_unreachable() means UB so
14305 we're not obliged to do anything special; however, we want
14306 non-zero-sized function bodies. To meet this, and help the
14307 user out, let's trap the case. */
14308 if (insn == NULL)
14309 fputs ("\tud2\n", file);
14312 else if (deleted_debug_label)
14313 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14314 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14315 CODE_LABEL_NUMBER (insn) = -1;
14319 /* Return a scratch register to use in the split stack prologue. The
14320 split stack prologue is used for -fsplit-stack. It is the first
14321 instructions in the function, even before the regular prologue.
14322 The scratch register can be any caller-saved register which is not
14323 used for parameters or for the static chain. */
14325 static unsigned int
14326 split_stack_prologue_scratch_regno (void)
14328 if (TARGET_64BIT)
14329 return R11_REG;
14330 else
14332 bool is_fastcall, is_thiscall;
14333 int regparm;
14335 is_fastcall = (lookup_attribute ("fastcall",
14336 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14337 != NULL);
14338 is_thiscall = (lookup_attribute ("thiscall",
14339 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14340 != NULL);
14341 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14343 if (is_fastcall)
14345 if (DECL_STATIC_CHAIN (cfun->decl))
14347 sorry ("-fsplit-stack does not support fastcall with "
14348 "nested function");
14349 return INVALID_REGNUM;
14351 return AX_REG;
14353 else if (is_thiscall)
14355 if (!DECL_STATIC_CHAIN (cfun->decl))
14356 return DX_REG;
14357 return AX_REG;
14359 else if (regparm < 3)
14361 if (!DECL_STATIC_CHAIN (cfun->decl))
14362 return CX_REG;
14363 else
14365 if (regparm >= 2)
14367 sorry ("-fsplit-stack does not support 2 register "
14368 "parameters for a nested function");
14369 return INVALID_REGNUM;
14371 return DX_REG;
14374 else
14376 /* FIXME: We could make this work by pushing a register
14377 around the addition and comparison. */
14378 sorry ("-fsplit-stack does not support 3 register parameters");
14379 return INVALID_REGNUM;
14384 /* A SYMBOL_REF for the function which allocates new stack space for
14385 -fsplit-stack. */
14387 static GTY(()) rtx split_stack_fn;
14389 /* A SYMBOL_REF for the more stack function when using the large
14390 model. */
14392 static GTY(()) rtx split_stack_fn_large;
14394 /* Return location of the stack guard value in the TLS block. */
14397 ix86_split_stack_guard (void)
14399 int offset;
14400 addr_space_t as = DEFAULT_TLS_SEG_REG;
14401 rtx r;
14403 gcc_assert (flag_split_stack);
14405 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14406 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14407 #else
14408 gcc_unreachable ();
14409 #endif
14411 r = GEN_INT (offset);
14412 r = gen_const_mem (Pmode, r);
14413 set_mem_addr_space (r, as);
14415 return r;
14418 /* Handle -fsplit-stack. These are the first instructions in the
14419 function, even before the regular prologue. */
14421 void
14422 ix86_expand_split_stack_prologue (void)
14424 HOST_WIDE_INT allocate;
14425 unsigned HOST_WIDE_INT args_size;
14426 rtx_code_label *label;
14427 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14428 rtx scratch_reg = NULL_RTX;
14429 rtx_code_label *varargs_label = NULL;
14430 rtx fn;
14432 gcc_assert (flag_split_stack && reload_completed);
14434 ix86_finalize_stack_frame_flags ();
14435 struct ix86_frame &frame = cfun->machine->frame;
14436 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14438 /* This is the label we will branch to if we have enough stack
14439 space. We expect the basic block reordering pass to reverse this
14440 branch if optimizing, so that we branch in the unlikely case. */
14441 label = gen_label_rtx ();
14443 /* We need to compare the stack pointer minus the frame size with
14444 the stack boundary in the TCB. The stack boundary always gives
14445 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14446 can compare directly. Otherwise we need to do an addition. */
14448 limit = ix86_split_stack_guard ();
14450 if (allocate < SPLIT_STACK_AVAILABLE)
14451 current = stack_pointer_rtx;
14452 else
14454 unsigned int scratch_regno;
14455 rtx offset;
14457 /* We need a scratch register to hold the stack pointer minus
14458 the required frame size. Since this is the very start of the
14459 function, the scratch register can be any caller-saved
14460 register which is not used for parameters. */
14461 offset = GEN_INT (- allocate);
14462 scratch_regno = split_stack_prologue_scratch_regno ();
14463 if (scratch_regno == INVALID_REGNUM)
14464 return;
14465 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14466 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14468 /* We don't use ix86_gen_add3 in this case because it will
14469 want to split to lea, but when not optimizing the insn
14470 will not be split after this point. */
14471 emit_insn (gen_rtx_SET (scratch_reg,
14472 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14473 offset)));
14475 else
14477 emit_move_insn (scratch_reg, offset);
14478 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14479 stack_pointer_rtx));
14481 current = scratch_reg;
14484 ix86_expand_branch (GEU, current, limit, label);
14485 rtx_insn *jump_insn = get_last_insn ();
14486 JUMP_LABEL (jump_insn) = label;
14488 /* Mark the jump as very likely to be taken. */
14489 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
14491 if (split_stack_fn == NULL_RTX)
14493 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14494 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14496 fn = split_stack_fn;
14498 /* Get more stack space. We pass in the desired stack space and the
14499 size of the arguments to copy to the new stack. In 32-bit mode
14500 we push the parameters; __morestack will return on a new stack
14501 anyhow. In 64-bit mode we pass the parameters in r10 and
14502 r11. */
14503 allocate_rtx = GEN_INT (allocate);
14504 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14505 call_fusage = NULL_RTX;
14506 rtx pop = NULL_RTX;
14507 if (TARGET_64BIT)
14509 rtx reg10, reg11;
14511 reg10 = gen_rtx_REG (Pmode, R10_REG);
14512 reg11 = gen_rtx_REG (Pmode, R11_REG);
14514 /* If this function uses a static chain, it will be in %r10.
14515 Preserve it across the call to __morestack. */
14516 if (DECL_STATIC_CHAIN (cfun->decl))
14518 rtx rax;
14520 rax = gen_rtx_REG (word_mode, AX_REG);
14521 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14522 use_reg (&call_fusage, rax);
14525 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14526 && !TARGET_PECOFF)
14528 HOST_WIDE_INT argval;
14530 gcc_assert (Pmode == DImode);
14531 /* When using the large model we need to load the address
14532 into a register, and we've run out of registers. So we
14533 switch to a different calling convention, and we call a
14534 different function: __morestack_large. We pass the
14535 argument size in the upper 32 bits of r10 and pass the
14536 frame size in the lower 32 bits. */
14537 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14538 gcc_assert ((args_size & 0xffffffff) == args_size);
14540 if (split_stack_fn_large == NULL_RTX)
14542 split_stack_fn_large =
14543 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14544 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14546 if (ix86_cmodel == CM_LARGE_PIC)
14548 rtx_code_label *label;
14549 rtx x;
14551 label = gen_label_rtx ();
14552 emit_label (label);
14553 LABEL_PRESERVE_P (label) = 1;
14554 emit_insn (gen_set_rip_rex64 (reg10, label));
14555 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14556 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14557 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14558 UNSPEC_GOT);
14559 x = gen_rtx_CONST (Pmode, x);
14560 emit_move_insn (reg11, x);
14561 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14562 x = gen_const_mem (Pmode, x);
14563 emit_move_insn (reg11, x);
14565 else
14566 emit_move_insn (reg11, split_stack_fn_large);
14568 fn = reg11;
14570 argval = ((args_size << 16) << 16) + allocate;
14571 emit_move_insn (reg10, GEN_INT (argval));
14573 else
14575 emit_move_insn (reg10, allocate_rtx);
14576 emit_move_insn (reg11, GEN_INT (args_size));
14577 use_reg (&call_fusage, reg11);
14580 use_reg (&call_fusage, reg10);
14582 else
14584 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
14585 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
14586 insn = emit_insn (gen_push (allocate_rtx));
14587 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
14588 pop = GEN_INT (2 * UNITS_PER_WORD);
14590 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14591 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14592 pop, false);
14593 add_function_usage_to (call_insn, call_fusage);
14594 if (!TARGET_64BIT)
14595 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
14596 /* Indicate that this function can't jump to non-local gotos. */
14597 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
14599 /* In order to make call/return prediction work right, we now need
14600 to execute a return instruction. See
14601 libgcc/config/i386/morestack.S for the details on how this works.
14603 For flow purposes gcc must not see this as a return
14604 instruction--we need control flow to continue at the subsequent
14605 label. Therefore, we use an unspec. */
14606 gcc_assert (crtl->args.pops_args < 65536);
14607 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14609 /* If we are in 64-bit mode and this function uses a static chain,
14610 we saved %r10 in %rax before calling __morestack. */
14611 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14612 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14613 gen_rtx_REG (word_mode, AX_REG));
14615 /* If this function calls va_start, we need to store a pointer to
14616 the arguments on the old stack, because they may not have been
14617 all copied to the new stack. At this point the old stack can be
14618 found at the frame pointer value used by __morestack, because
14619 __morestack has set that up before calling back to us. Here we
14620 store that pointer in a scratch register, and in
14621 ix86_expand_prologue we store the scratch register in a stack
14622 slot. */
14623 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14625 unsigned int scratch_regno;
14626 rtx frame_reg;
14627 int words;
14629 scratch_regno = split_stack_prologue_scratch_regno ();
14630 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14631 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14633 /* 64-bit:
14634 fp -> old fp value
14635 return address within this function
14636 return address of caller of this function
14637 stack arguments
14638 So we add three words to get to the stack arguments.
14640 32-bit:
14641 fp -> old fp value
14642 return address within this function
14643 first argument to __morestack
14644 second argument to __morestack
14645 return address of caller of this function
14646 stack arguments
14647 So we add five words to get to the stack arguments.
14649 words = TARGET_64BIT ? 3 : 5;
14650 emit_insn (gen_rtx_SET (scratch_reg,
14651 gen_rtx_PLUS (Pmode, frame_reg,
14652 GEN_INT (words * UNITS_PER_WORD))));
14654 varargs_label = gen_label_rtx ();
14655 emit_jump_insn (gen_jump (varargs_label));
14656 JUMP_LABEL (get_last_insn ()) = varargs_label;
14658 emit_barrier ();
14661 emit_label (label);
14662 LABEL_NUSES (label) = 1;
14664 /* If this function calls va_start, we now have to set the scratch
14665 register for the case where we do not call __morestack. In this
14666 case we need to set it based on the stack pointer. */
14667 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14669 emit_insn (gen_rtx_SET (scratch_reg,
14670 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14671 GEN_INT (UNITS_PER_WORD))));
14673 emit_label (varargs_label);
14674 LABEL_NUSES (varargs_label) = 1;
14678 /* We may have to tell the dataflow pass that the split stack prologue
14679 is initializing a scratch register. */
14681 static void
14682 ix86_live_on_entry (bitmap regs)
14684 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14686 gcc_assert (flag_split_stack);
14687 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14691 /* Extract the parts of an RTL expression that is a valid memory address
14692 for an instruction. Return 0 if the structure of the address is
14693 grossly off. Return -1 if the address contains ASHIFT, so it is not
14694 strictly valid, but still used for computing length of lea instruction. */
14697 ix86_decompose_address (rtx addr, struct ix86_address *out)
14699 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14700 rtx base_reg, index_reg;
14701 HOST_WIDE_INT scale = 1;
14702 rtx scale_rtx = NULL_RTX;
14703 rtx tmp;
14704 int retval = 1;
14705 addr_space_t seg = ADDR_SPACE_GENERIC;
14707 /* Allow zero-extended SImode addresses,
14708 they will be emitted with addr32 prefix. */
14709 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14711 if (GET_CODE (addr) == ZERO_EXTEND
14712 && GET_MODE (XEXP (addr, 0)) == SImode)
14714 addr = XEXP (addr, 0);
14715 if (CONST_INT_P (addr))
14716 return 0;
14718 else if (GET_CODE (addr) == AND
14719 && const_32bit_mask (XEXP (addr, 1), DImode))
14721 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14722 if (addr == NULL_RTX)
14723 return 0;
14725 if (CONST_INT_P (addr))
14726 return 0;
14730 /* Allow SImode subregs of DImode addresses,
14731 they will be emitted with addr32 prefix. */
14732 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14734 if (SUBREG_P (addr)
14735 && GET_MODE (SUBREG_REG (addr)) == DImode)
14737 addr = SUBREG_REG (addr);
14738 if (CONST_INT_P (addr))
14739 return 0;
14743 if (REG_P (addr))
14744 base = addr;
14745 else if (SUBREG_P (addr))
14747 if (REG_P (SUBREG_REG (addr)))
14748 base = addr;
14749 else
14750 return 0;
14752 else if (GET_CODE (addr) == PLUS)
14754 rtx addends[4], op;
14755 int n = 0, i;
14757 op = addr;
14760 if (n >= 4)
14761 return 0;
14762 addends[n++] = XEXP (op, 1);
14763 op = XEXP (op, 0);
14765 while (GET_CODE (op) == PLUS);
14766 if (n >= 4)
14767 return 0;
14768 addends[n] = op;
14770 for (i = n; i >= 0; --i)
14772 op = addends[i];
14773 switch (GET_CODE (op))
14775 case MULT:
14776 if (index)
14777 return 0;
14778 index = XEXP (op, 0);
14779 scale_rtx = XEXP (op, 1);
14780 break;
14782 case ASHIFT:
14783 if (index)
14784 return 0;
14785 index = XEXP (op, 0);
14786 tmp = XEXP (op, 1);
14787 if (!CONST_INT_P (tmp))
14788 return 0;
14789 scale = INTVAL (tmp);
14790 if ((unsigned HOST_WIDE_INT) scale > 3)
14791 return 0;
14792 scale = 1 << scale;
14793 break;
14795 case ZERO_EXTEND:
14796 op = XEXP (op, 0);
14797 if (GET_CODE (op) != UNSPEC)
14798 return 0;
14799 /* FALLTHRU */
14801 case UNSPEC:
14802 if (XINT (op, 1) == UNSPEC_TP
14803 && TARGET_TLS_DIRECT_SEG_REFS
14804 && seg == ADDR_SPACE_GENERIC)
14805 seg = DEFAULT_TLS_SEG_REG;
14806 else
14807 return 0;
14808 break;
14810 case SUBREG:
14811 if (!REG_P (SUBREG_REG (op)))
14812 return 0;
14813 /* FALLTHRU */
14815 case REG:
14816 if (!base)
14817 base = op;
14818 else if (!index)
14819 index = op;
14820 else
14821 return 0;
14822 break;
14824 case CONST:
14825 case CONST_INT:
14826 case SYMBOL_REF:
14827 case LABEL_REF:
14828 if (disp)
14829 return 0;
14830 disp = op;
14831 break;
14833 default:
14834 return 0;
14838 else if (GET_CODE (addr) == MULT)
14840 index = XEXP (addr, 0); /* index*scale */
14841 scale_rtx = XEXP (addr, 1);
14843 else if (GET_CODE (addr) == ASHIFT)
14845 /* We're called for lea too, which implements ashift on occasion. */
14846 index = XEXP (addr, 0);
14847 tmp = XEXP (addr, 1);
14848 if (!CONST_INT_P (tmp))
14849 return 0;
14850 scale = INTVAL (tmp);
14851 if ((unsigned HOST_WIDE_INT) scale > 3)
14852 return 0;
14853 scale = 1 << scale;
14854 retval = -1;
14856 else
14857 disp = addr; /* displacement */
14859 if (index)
14861 if (REG_P (index))
14863 else if (SUBREG_P (index)
14864 && REG_P (SUBREG_REG (index)))
14866 else
14867 return 0;
14870 /* Extract the integral value of scale. */
14871 if (scale_rtx)
14873 if (!CONST_INT_P (scale_rtx))
14874 return 0;
14875 scale = INTVAL (scale_rtx);
14878 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
14879 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
14881 /* Avoid useless 0 displacement. */
14882 if (disp == const0_rtx && (base || index))
14883 disp = NULL_RTX;
14885 /* Allow the arg pointer and stack pointer as the index if there is no scaling. */
14886 if (base_reg && index_reg && scale == 1
14887 && (REGNO (index_reg) == ARG_POINTER_REGNUM
14888 || REGNO (index_reg) == FRAME_POINTER_REGNUM
14889 || REGNO (index_reg) == SP_REG))
14891 std::swap (base, index);
14892 std::swap (base_reg, index_reg);
14895 /* Special case: %ebp cannot be encoded as a base without a displacement.
14896 Similarly %r13. */
14897 if (!disp && base_reg
14898 && (REGNO (base_reg) == ARG_POINTER_REGNUM
14899 || REGNO (base_reg) == FRAME_POINTER_REGNUM
14900 || REGNO (base_reg) == BP_REG
14901 || REGNO (base_reg) == R13_REG))
14902 disp = const0_rtx;
14904 /* Special case: on K6, [%esi] makes the instruction vector decoded.
14905 Avoid this by transforming to [%esi+0].
14906 Reload calls address legitimization without cfun defined, so we need
14907 to test cfun for being non-NULL. */
14908 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
14909 && base_reg && !index_reg && !disp
14910 && REGNO (base_reg) == SI_REG)
14911 disp = const0_rtx;
14913 /* Special case: encode reg+reg instead of reg*2. */
14914 if (!base && index && scale == 2)
14915 base = index, base_reg = index_reg, scale = 1;
14917 /* Special case: scaling cannot be encoded without base or displacement. */
14918 if (!base && !disp && index && scale != 1)
14919 disp = const0_rtx;
14921 out->base = base;
14922 out->index = index;
14923 out->disp = disp;
14924 out->scale = scale;
14925 out->seg = seg;
14927 return retval;
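/* For example, ix86_decompose_address splits the Pmode address
   (plus (reg %ebx) (mult (reg %ecx) (const_int 4))) into base = %ebx,
   index = %ecx, scale = 4 and a NULL displacement, while the lea-only
   form (ashift (reg %ecx) (const_int 2)) yields index = %ecx, scale = 4,
   a zero displacement and a return value of -1.  */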
14930 /* Return the cost of the memory address X.
14931 For i386, it is better to use a complex address than to let gcc copy
14932 the address into a reg and make a new pseudo, but not if the address
14933 requires two regs - that would mean more pseudos with longer
14934 lifetimes. */
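/* As a rough illustration of the heuristic below: an address built from
   two pseudo registers (base and index) gets cost 3, an address using a
   single pseudo gets cost 2, and a plain symbolic or hard-register address
   keeps the base cost of 1, plus the K6 penalty where it applies.  */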
14935 static int
14936 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
14938 struct ix86_address parts;
14939 int cost = 1;
14940 int ok = ix86_decompose_address (x, &parts);
14942 gcc_assert (ok);
14944 if (parts.base && SUBREG_P (parts.base))
14945 parts.base = SUBREG_REG (parts.base);
14946 if (parts.index && SUBREG_P (parts.index))
14947 parts.index = SUBREG_REG (parts.index);
14949 /* Attempt to minimize the number of registers in the address by increasing
14950 the address cost for each register used. We don't increase the address
14951 cost for "pic_offset_table_rtx". When a memory operand involving
14952 "pic_offset_table_rtx" is not invariant itself, it most likely means that
14953 the base or index is not invariant either. Therefore only
14954 "pic_offset_table_rtx" could be hoisted out, which is not profitable for x86. */
14955 if (parts.base
14956 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
14957 && (current_pass->type == GIMPLE_PASS
14958 || !pic_offset_table_rtx
14959 || !REG_P (parts.base)
14960 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
14961 cost++;
14963 if (parts.index
14964 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
14965 && (current_pass->type == GIMPLE_PASS
14966 || !pic_offset_table_rtx
14967 || !REG_P (parts.index)
14968 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
14969 cost++;
14971 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
14972 since its predecode logic can't detect the length of such instructions
14973 and decoding degenerates to vector decoding. Increase the cost of such
14974 addresses here. The penalty is at least 2 cycles. It may be worthwhile
14975 to split such addresses or even to refuse them entirely.
14977 The following addressing modes are affected:
14978 [base+scale*index]
14979 [scale*index+disp]
14980 [base+index]
14982 The first and last cases may be avoidable by explicitly coding the zero
14983 into the memory address, but I don't have an AMD-K6 machine handy to check
14984 this theory. */
14986 if (TARGET_K6
14987 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
14988 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
14989 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
14990 cost += 10;
14992 return cost;
14995 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
14996 this is used to form addresses to local data when -fPIC is in
14997 use. */
14999 static bool
15000 darwin_local_data_pic (rtx disp)
15002 return (GET_CODE (disp) == UNSPEC
15003 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15006 /* True if operand X should be loaded from GOT. */
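/* For instance, with -fno-plt and without -fpic, a call to an external
   function is forced through its GOT slot (e.g. call *foo@GOTPCREL(%rip)
   on x86-64) instead of going through the PLT; this predicate identifies
   those SYMBOL_REFs.  */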
15008 bool
15009 ix86_force_load_from_GOT_p (rtx x)
15011 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15012 && !TARGET_PECOFF && !TARGET_MACHO
15013 && !flag_plt && !flag_pic
15014 && ix86_cmodel != CM_LARGE
15015 && GET_CODE (x) == SYMBOL_REF
15016 && SYMBOL_REF_FUNCTION_P (x)
15017 && !SYMBOL_REF_LOCAL_P (x));
15020 /* Determine if a given RTX is a valid constant. We already know this
15021 satisfies CONSTANT_P. */
15023 static bool
15024 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15026 /* Pointer bounds constants are not valid. */
15027 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15028 return false;
15030 switch (GET_CODE (x))
15032 case CONST:
15033 x = XEXP (x, 0);
15035 if (GET_CODE (x) == PLUS)
15037 if (!CONST_INT_P (XEXP (x, 1)))
15038 return false;
15039 x = XEXP (x, 0);
15042 if (TARGET_MACHO && darwin_local_data_pic (x))
15043 return true;
15045 /* Only some unspecs are valid as "constants". */
15046 if (GET_CODE (x) == UNSPEC)
15047 switch (XINT (x, 1))
15049 case UNSPEC_GOT:
15050 case UNSPEC_GOTOFF:
15051 case UNSPEC_PLTOFF:
15052 return TARGET_64BIT;
15053 case UNSPEC_TPOFF:
15054 case UNSPEC_NTPOFF:
15055 x = XVECEXP (x, 0, 0);
15056 return (GET_CODE (x) == SYMBOL_REF
15057 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15058 case UNSPEC_DTPOFF:
15059 x = XVECEXP (x, 0, 0);
15060 return (GET_CODE (x) == SYMBOL_REF
15061 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15062 default:
15063 return false;
15066 /* We must have drilled down to a symbol. */
15067 if (GET_CODE (x) == LABEL_REF)
15068 return true;
15069 if (GET_CODE (x) != SYMBOL_REF)
15070 return false;
15071 /* FALLTHRU */
15073 case SYMBOL_REF:
15074 /* TLS symbols are never valid. */
15075 if (SYMBOL_REF_TLS_MODEL (x))
15076 return false;
15078 /* DLLIMPORT symbols are never valid. */
15079 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15080 && SYMBOL_REF_DLLIMPORT_P (x))
15081 return false;
15083 #if TARGET_MACHO
15084 /* mdynamic-no-pic */
15085 if (MACHO_DYNAMIC_NO_PIC_P)
15086 return machopic_symbol_defined_p (x);
15087 #endif
15089 /* External function address should be loaded
15090 via the GOT slot to avoid PLT. */
15091 if (ix86_force_load_from_GOT_p (x))
15092 return false;
15094 break;
15096 CASE_CONST_SCALAR_INT:
15097 switch (mode)
15099 case E_TImode:
15100 if (TARGET_64BIT)
15101 return true;
15102 /* FALLTHRU */
15103 case E_OImode:
15104 case E_XImode:
15105 if (!standard_sse_constant_p (x, mode))
15106 return false;
15107 default:
15108 break;
15110 break;
15112 case CONST_VECTOR:
15113 if (!standard_sse_constant_p (x, mode))
15114 return false;
15116 default:
15117 break;
15120 /* Otherwise we handle everything else in the move patterns. */
15121 return true;
15124 /* Determine if it's legal to put X into the constant pool. This
15125 is not possible for the address of thread-local symbols, which
15126 is checked above. */
15128 static bool
15129 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15131 /* We can put any immediate constant in memory. */
15132 switch (GET_CODE (x))
15134 CASE_CONST_ANY:
15135 return false;
15137 default:
15138 break;
15141 return !ix86_legitimate_constant_p (mode, x);
15144 /* Nonzero if the symbol is marked as dllimport, or as a stub-variable,
15145 otherwise zero. */
15147 static bool
15148 is_imported_p (rtx x)
15150 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15151 || GET_CODE (x) != SYMBOL_REF)
15152 return false;
15154 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15158 /* Nonzero if the constant value X is a legitimate general operand
15159 when generating PIC code. It is given that flag_pic is on and
15160 that X satisfies CONSTANT_P. */
15162 bool
15163 legitimate_pic_operand_p (rtx x)
15165 rtx inner;
15167 switch (GET_CODE (x))
15169 case CONST:
15170 inner = XEXP (x, 0);
15171 if (GET_CODE (inner) == PLUS
15172 && CONST_INT_P (XEXP (inner, 1)))
15173 inner = XEXP (inner, 0);
15175 /* Only some unspecs are valid as "constants". */
15176 if (GET_CODE (inner) == UNSPEC)
15177 switch (XINT (inner, 1))
15179 case UNSPEC_GOT:
15180 case UNSPEC_GOTOFF:
15181 case UNSPEC_PLTOFF:
15182 return TARGET_64BIT;
15183 case UNSPEC_TPOFF:
15184 x = XVECEXP (inner, 0, 0);
15185 return (GET_CODE (x) == SYMBOL_REF
15186 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15187 case UNSPEC_MACHOPIC_OFFSET:
15188 return legitimate_pic_address_disp_p (x);
15189 default:
15190 return false;
15192 /* FALLTHRU */
15194 case SYMBOL_REF:
15195 case LABEL_REF:
15196 return legitimate_pic_address_disp_p (x);
15198 default:
15199 return true;
15203 /* Determine if a given CONST RTX is a valid memory displacement
15204 in PIC mode. */
15206 bool
15207 legitimate_pic_address_disp_p (rtx disp)
15209 bool saw_plus;
15211 /* In 64bit mode we can allow direct addresses of symbols and labels
15212 when they are not dynamic symbols. */
15213 if (TARGET_64BIT)
15215 rtx op0 = disp, op1;
15217 switch (GET_CODE (disp))
15219 case LABEL_REF:
15220 return true;
15222 case CONST:
15223 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15224 break;
15225 op0 = XEXP (XEXP (disp, 0), 0);
15226 op1 = XEXP (XEXP (disp, 0), 1);
15227 if (!CONST_INT_P (op1))
15228 break;
15229 if (GET_CODE (op0) == UNSPEC
15230 && (XINT (op0, 1) == UNSPEC_DTPOFF
15231 || XINT (op0, 1) == UNSPEC_NTPOFF)
15232 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15233 return true;
15234 if (INTVAL (op1) >= 16*1024*1024
15235 || INTVAL (op1) < -16*1024*1024)
15236 break;
15237 if (GET_CODE (op0) == LABEL_REF)
15238 return true;
15239 if (GET_CODE (op0) == CONST
15240 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15241 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15242 return true;
15243 if (GET_CODE (op0) == UNSPEC
15244 && XINT (op0, 1) == UNSPEC_PCREL)
15245 return true;
15246 if (GET_CODE (op0) != SYMBOL_REF)
15247 break;
15248 /* FALLTHRU */
15250 case SYMBOL_REF:
15251 /* TLS references should always be enclosed in UNSPEC.
15252 The dllimported symbol always needs to be resolved. */
15253 if (SYMBOL_REF_TLS_MODEL (op0)
15254 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15255 return false;
15257 if (TARGET_PECOFF)
15259 if (is_imported_p (op0))
15260 return true;
15262 if (SYMBOL_REF_FAR_ADDR_P (op0)
15263 || !SYMBOL_REF_LOCAL_P (op0))
15264 break;
15266 /* Function symbols need to be resolved only for
15267 the large model.
15268 For the small model we don't need to resolve anything
15269 here. */
15270 if ((ix86_cmodel != CM_LARGE_PIC
15271 && SYMBOL_REF_FUNCTION_P (op0))
15272 || ix86_cmodel == CM_SMALL_PIC)
15273 return true;
15274 /* Non-external symbols don't need to be resolved for
15275 the large and medium models. */
15276 if ((ix86_cmodel == CM_LARGE_PIC
15277 || ix86_cmodel == CM_MEDIUM_PIC)
15278 && !SYMBOL_REF_EXTERNAL_P (op0))
15279 return true;
15281 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15282 && (SYMBOL_REF_LOCAL_P (op0)
15283 || (HAVE_LD_PIE_COPYRELOC
15284 && flag_pie
15285 && !SYMBOL_REF_WEAK (op0)
15286 && !SYMBOL_REF_FUNCTION_P (op0)))
15287 && ix86_cmodel != CM_LARGE_PIC)
15288 return true;
15289 break;
15291 default:
15292 break;
15295 if (GET_CODE (disp) != CONST)
15296 return false;
15297 disp = XEXP (disp, 0);
15299 if (TARGET_64BIT)
15301 /* It is unsafe to allow PLUS expressions here. This limits the allowed
15302 distance of GOT tables. We should not need these anyway. */
15303 if (GET_CODE (disp) != UNSPEC
15304 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15305 && XINT (disp, 1) != UNSPEC_GOTOFF
15306 && XINT (disp, 1) != UNSPEC_PCREL
15307 && XINT (disp, 1) != UNSPEC_PLTOFF))
15308 return false;
15310 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15311 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15312 return false;
15313 return true;
15316 saw_plus = false;
15317 if (GET_CODE (disp) == PLUS)
15319 if (!CONST_INT_P (XEXP (disp, 1)))
15320 return false;
15321 disp = XEXP (disp, 0);
15322 saw_plus = true;
15325 if (TARGET_MACHO && darwin_local_data_pic (disp))
15326 return true;
15328 if (GET_CODE (disp) != UNSPEC)
15329 return false;
15331 switch (XINT (disp, 1))
15333 case UNSPEC_GOT:
15334 if (saw_plus)
15335 return false;
15336 /* We need to check for both symbols and labels because VxWorks loads
15337 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15338 details. */
15339 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15340 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15341 case UNSPEC_GOTOFF:
15342 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15343 While the ABI also specifies a 32bit relocation, we don't produce it in
15344 the small PIC model at all. */
15345 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15346 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15347 && !TARGET_64BIT)
15348 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15349 return false;
15350 case UNSPEC_GOTTPOFF:
15351 case UNSPEC_GOTNTPOFF:
15352 case UNSPEC_INDNTPOFF:
15353 if (saw_plus)
15354 return false;
15355 disp = XVECEXP (disp, 0, 0);
15356 return (GET_CODE (disp) == SYMBOL_REF
15357 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15358 case UNSPEC_NTPOFF:
15359 disp = XVECEXP (disp, 0, 0);
15360 return (GET_CODE (disp) == SYMBOL_REF
15361 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15362 case UNSPEC_DTPOFF:
15363 disp = XVECEXP (disp, 0, 0);
15364 return (GET_CODE (disp) == SYMBOL_REF
15365 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15368 return false;
15371 /* Determine if OP is a suitable RTX for an address register.
15372 Return the naked register if a register or a register subreg is
15373 found, otherwise return NULL_RTX. */
15375 static rtx
15376 ix86_validate_address_register (rtx op)
15378 machine_mode mode = GET_MODE (op);
15380 /* Only SImode or DImode registers can form the address. */
15381 if (mode != SImode && mode != DImode)
15382 return NULL_RTX;
15384 if (REG_P (op))
15385 return op;
15386 else if (SUBREG_P (op))
15388 rtx reg = SUBREG_REG (op);
15390 if (!REG_P (reg))
15391 return NULL_RTX;
15393 mode = GET_MODE (reg);
15395 /* Don't allow SUBREGs that span more than a word. It can
15396 lead to spill failures when the register is one word out
15397 of a two word structure. */
15398 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15399 return NULL_RTX;
15401 /* Allow only SUBREGs of non-eliminable hard registers. */
15402 if (register_no_elim_operand (reg, mode))
15403 return reg;
15406 /* Op is not a register. */
15407 return NULL_RTX;
15410 /* Recognizes RTL expressions that are valid memory addresses for an
15411 instruction. The MODE argument is the machine mode for the MEM
15412 expression that wants to use this address.
15414 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15415 convert common non-canonical forms to canonical form so that they will
15416 be recognized. */
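/* For example, (plus (reg) (mult (reg) (const_int 4))) together with an
   optional constant displacement is accepted, whereas a scale factor other
   than 1, 2, 4 or 8, or base and index registers of different modes, is
   rejected below.  */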
15418 static bool
15419 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15421 struct ix86_address parts;
15422 rtx base, index, disp;
15423 HOST_WIDE_INT scale;
15424 addr_space_t seg;
15426 if (ix86_decompose_address (addr, &parts) <= 0)
15427 /* Decomposition failed. */
15428 return false;
15430 base = parts.base;
15431 index = parts.index;
15432 disp = parts.disp;
15433 scale = parts.scale;
15434 seg = parts.seg;
15436 /* Validate base register. */
15437 if (base)
15439 rtx reg = ix86_validate_address_register (base);
15441 if (reg == NULL_RTX)
15442 return false;
15444 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15445 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15446 /* Base is not valid. */
15447 return false;
15450 /* Validate index register. */
15451 if (index)
15453 rtx reg = ix86_validate_address_register (index);
15455 if (reg == NULL_RTX)
15456 return false;
15458 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15459 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15460 /* Index is not valid. */
15461 return false;
15464 /* Index and base should have the same mode. */
15465 if (base && index
15466 && GET_MODE (base) != GET_MODE (index))
15467 return false;
15469 /* Address override works only on the (%reg) part of %fs:(%reg). */
15470 if (seg != ADDR_SPACE_GENERIC
15471 && ((base && GET_MODE (base) != word_mode)
15472 || (index && GET_MODE (index) != word_mode)))
15473 return false;
15475 /* Validate scale factor. */
15476 if (scale != 1)
15478 if (!index)
15479 /* Scale without index. */
15480 return false;
15482 if (scale != 2 && scale != 4 && scale != 8)
15483 /* Scale is not a valid multiplier. */
15484 return false;
15487 /* Validate displacement. */
15488 if (disp)
15490 if (GET_CODE (disp) == CONST
15491 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15492 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15493 switch (XINT (XEXP (disp, 0), 1))
15495 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15496 when used. While the ABI also specifies 32bit relocations, we
15497 don't produce them at all and use IP-relative addressing instead.
15498 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15499 should be loaded via the GOT. */
15500 case UNSPEC_GOT:
15501 if (!TARGET_64BIT
15502 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15503 goto is_legitimate_pic;
15504 /* FALLTHRU */
15505 case UNSPEC_GOTOFF:
15506 gcc_assert (flag_pic);
15507 if (!TARGET_64BIT)
15508 goto is_legitimate_pic;
15510 /* 64bit address unspec. */
15511 return false;
15513 case UNSPEC_GOTPCREL:
15514 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15515 goto is_legitimate_pic;
15516 /* FALLTHRU */
15517 case UNSPEC_PCREL:
15518 gcc_assert (flag_pic);
15519 goto is_legitimate_pic;
15521 case UNSPEC_GOTTPOFF:
15522 case UNSPEC_GOTNTPOFF:
15523 case UNSPEC_INDNTPOFF:
15524 case UNSPEC_NTPOFF:
15525 case UNSPEC_DTPOFF:
15526 break;
15528 default:
15529 /* Invalid address unspec. */
15530 return false;
15533 else if (SYMBOLIC_CONST (disp)
15534 && (flag_pic
15535 || (TARGET_MACHO
15536 #if TARGET_MACHO
15537 && MACHOPIC_INDIRECT
15538 && !machopic_operand_p (disp)
15539 #endif
15543 is_legitimate_pic:
15544 if (TARGET_64BIT && (index || base))
15546 /* foo@dtpoff(%rX) is ok. */
15547 if (GET_CODE (disp) != CONST
15548 || GET_CODE (XEXP (disp, 0)) != PLUS
15549 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15550 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15551 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15552 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15553 /* Non-constant pic memory reference. */
15554 return false;
15556 else if ((!TARGET_MACHO || flag_pic)
15557 && ! legitimate_pic_address_disp_p (disp))
15558 /* Displacement is an invalid pic construct. */
15559 return false;
15560 #if TARGET_MACHO
15561 else if (MACHO_DYNAMIC_NO_PIC_P
15562 && !ix86_legitimate_constant_p (Pmode, disp))
15563 /* Displacement must be referenced via a non_lazy_pointer. */
15564 return false;
15565 #endif
15567 /* This code used to verify that a symbolic pic displacement
15568 includes the pic_offset_table_rtx register.
15570 While this is a good idea, unfortunately these constructs may
15571 be created by the "adds using lea" optimization for incorrect
15572 code like:
15574 int a;
15575 int foo (int i)
15577 { return *(&a + i); }
15580 This code is nonsensical, but results in addressing the
15581 GOT table with a pic_offset_table_rtx base. We can't
15582 just refuse it easily, since it gets matched by the
15583 "addsi3" pattern, which later gets split to lea in case
15584 the output register differs from the input. While this
15585 could be handled by a separate addsi pattern for this case
15586 that never results in lea, disabling this test seems to be
15587 the easier and correct fix for the crash. */
15589 else if (GET_CODE (disp) != LABEL_REF
15590 && !CONST_INT_P (disp)
15591 && (GET_CODE (disp) != CONST
15592 || !ix86_legitimate_constant_p (Pmode, disp))
15593 && (GET_CODE (disp) != SYMBOL_REF
15594 || !ix86_legitimate_constant_p (Pmode, disp)))
15595 /* Displacement is not constant. */
15596 return false;
15597 else if (TARGET_64BIT
15598 && !x86_64_immediate_operand (disp, VOIDmode))
15599 /* Displacement is out of range. */
15600 return false;
15601 /* In x32 mode, constant addresses are sign extended to 64bit, so we
15602 have to reject addresses in the range 0x80000000 to 0xffffffff. */
15603 else if (TARGET_X32 && !(index || base)
15604 && CONST_INT_P (disp)
15605 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15606 return false;
15609 /* Everything looks valid. */
15610 return true;
15613 /* Determine if a given RTX is a valid constant address. */
15615 bool
15616 constant_address_p (rtx x)
15618 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15621 /* Return a unique alias set for the GOT. */
15623 static alias_set_type
15624 ix86_GOT_alias_set (void)
15626 static alias_set_type set = -1;
15627 if (set == -1)
15628 set = new_alias_set ();
15629 return set;
15632 /* Return a legitimate reference for ORIG (an address) using the
15633 register REG. If REG is 0, a new pseudo is generated.
15635 There are two types of references that must be handled:
15637 1. Global data references must load the address from the GOT, via
15638 the PIC reg. An insn is emitted to do this load, and the reg is
15639 returned.
15641 2. Static data references, constant pool addresses, and code labels
15642 compute the address as an offset from the GOT, whose base is in
15643 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15644 differentiate them from global data objects. The returned
15645 address is the PIC reg + an unspec constant.
15647 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15648 reg also appears in the address. */
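/* As a sketch of the two cases on 32-bit ELF: a global symbol FOO becomes
   a load from %ebx + FOO@GOT (a mem of an UNSPEC_GOT constant), while a
   local symbol or label becomes the address %ebx + FOO@GOTOFF (an
   UNSPEC_GOTOFF constant added to the PIC register).  */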
15650 static rtx
15651 legitimize_pic_address (rtx orig, rtx reg)
15653 rtx addr = orig;
15654 rtx new_rtx = orig;
15656 #if TARGET_MACHO
15657 if (TARGET_MACHO && !TARGET_64BIT)
15659 if (reg == 0)
15660 reg = gen_reg_rtx (Pmode);
15661 /* Use the generic Mach-O PIC machinery. */
15662 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15664 #endif
15666 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15668 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15669 if (tmp)
15670 return tmp;
15673 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15674 new_rtx = addr;
15675 else if ((!TARGET_64BIT
15676 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15677 && !TARGET_PECOFF
15678 && gotoff_operand (addr, Pmode))
15680 /* This symbol may be referenced via a displacement
15681 from the PIC base address (@GOTOFF). */
15682 if (GET_CODE (addr) == CONST)
15683 addr = XEXP (addr, 0);
15685 if (GET_CODE (addr) == PLUS)
15687 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15688 UNSPEC_GOTOFF);
15689 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15691 else
15692 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15694 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15696 if (TARGET_64BIT)
15697 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15699 if (reg != 0)
15701 gcc_assert (REG_P (reg));
15702 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15703 new_rtx, reg, 1, OPTAB_DIRECT);
15705 else
15706 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15708 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15709 /* We can't use @GOTOFF for text labels
15710 on VxWorks, see gotoff_operand. */
15711 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15713 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15714 if (tmp)
15715 return tmp;
15717 /* For x64 PE-COFF there is no GOT table,
15718 so we use the address directly. */
15719 if (TARGET_64BIT && TARGET_PECOFF)
15721 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15722 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15724 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15726 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15727 UNSPEC_GOTPCREL);
15728 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15729 new_rtx = gen_const_mem (Pmode, new_rtx);
15730 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15732 else
15734 /* This symbol must be referenced via a load
15735 from the Global Offset Table (@GOT). */
15736 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15737 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15738 if (TARGET_64BIT)
15739 new_rtx = force_reg (Pmode, new_rtx);
15740 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15741 new_rtx = gen_const_mem (Pmode, new_rtx);
15742 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15745 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15747 else
15749 if (CONST_INT_P (addr)
15750 && !x86_64_immediate_operand (addr, VOIDmode))
15751 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
15752 else if (GET_CODE (addr) == CONST)
15754 addr = XEXP (addr, 0);
15756 /* We must match stuff we generate before. Assume the only
15757 unspecs that can get here are ours. Not that we could do
15758 anything with them anyway.... */
15759 if (GET_CODE (addr) == UNSPEC
15760 || (GET_CODE (addr) == PLUS
15761 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
15762 return orig;
15763 gcc_assert (GET_CODE (addr) == PLUS);
15766 if (GET_CODE (addr) == PLUS)
15768 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
15770 /* Check first to see if this is a constant
15771 offset from a @GOTOFF symbol reference. */
15772 if (!TARGET_PECOFF
15773 && gotoff_operand (op0, Pmode)
15774 && CONST_INT_P (op1))
15776 if (!TARGET_64BIT)
15778 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
15779 UNSPEC_GOTOFF);
15780 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
15781 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15783 if (reg != 0)
15785 gcc_assert (REG_P (reg));
15786 new_rtx = expand_simple_binop (Pmode, PLUS,
15787 pic_offset_table_rtx,
15788 new_rtx, reg, 1,
15789 OPTAB_DIRECT);
15791 else
15792 new_rtx
15793 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15795 else
15797 if (INTVAL (op1) < -16*1024*1024
15798 || INTVAL (op1) >= 16*1024*1024)
15800 if (!x86_64_immediate_operand (op1, Pmode))
15801 op1 = force_reg (Pmode, op1);
15803 new_rtx
15804 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
15808 else
15810 rtx base = legitimize_pic_address (op0, reg);
15811 machine_mode mode = GET_MODE (base);
15812 new_rtx
15813 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
15815 if (CONST_INT_P (new_rtx))
15817 if (INTVAL (new_rtx) < -16*1024*1024
15818 || INTVAL (new_rtx) >= 16*1024*1024)
15820 if (!x86_64_immediate_operand (new_rtx, mode))
15821 new_rtx = force_reg (mode, new_rtx);
15823 new_rtx
15824 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
15826 else
15827 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
15829 else
15831 /* For %rip addressing, we have to use
15832 just disp32, not base nor index. */
15833 if (TARGET_64BIT
15834 && (GET_CODE (base) == SYMBOL_REF
15835 || GET_CODE (base) == LABEL_REF))
15836 base = force_reg (mode, base);
15837 if (GET_CODE (new_rtx) == PLUS
15838 && CONSTANT_P (XEXP (new_rtx, 1)))
15840 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
15841 new_rtx = XEXP (new_rtx, 1);
15843 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
15848 return new_rtx;
15851 /* Load the thread pointer. If TO_REG is true, force it into a register. */
15853 static rtx
15854 get_thread_pointer (machine_mode tp_mode, bool to_reg)
15856 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
15858 if (GET_MODE (tp) != tp_mode)
15860 gcc_assert (GET_MODE (tp) == SImode);
15861 gcc_assert (tp_mode == DImode);
15863 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
15866 if (to_reg)
15867 tp = copy_to_mode_reg (tp_mode, tp);
15869 return tp;
15872 /* Construct the SYMBOL_REF for the tls_get_addr function. */
15874 static GTY(()) rtx ix86_tls_symbol;
15876 static rtx
15877 ix86_tls_get_addr (void)
15879 if (!ix86_tls_symbol)
15881 const char *sym
15882 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
15883 ? "___tls_get_addr" : "__tls_get_addr");
15885 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
15888 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
15890 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
15891 UNSPEC_PLTOFF);
15892 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
15893 gen_rtx_CONST (Pmode, unspec));
15896 return ix86_tls_symbol;
15899 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
15901 static GTY(()) rtx ix86_tls_module_base_symbol;
15904 ix86_tls_module_base (void)
15906 if (!ix86_tls_module_base_symbol)
15908 ix86_tls_module_base_symbol
15909 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
15911 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
15912 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
15915 return ix86_tls_module_base_symbol;
15918 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
15919 false if we expect this to be used for a memory address and true if
15920 we expect to load the address into a register. */
15922 static rtx
15923 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
15925 rtx dest, base, off;
15926 rtx pic = NULL_RTX, tp = NULL_RTX;
15927 machine_mode tp_mode = Pmode;
15928 int type;
15930 /* Fall back to the global dynamic model if the toolchain cannot support
15931 local dynamic. */
15932 if (TARGET_SUN_TLS && !TARGET_64BIT
15933 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
15934 && model == TLS_MODEL_LOCAL_DYNAMIC)
15935 model = TLS_MODEL_GLOBAL_DYNAMIC;
15937 switch (model)
15939 case TLS_MODEL_GLOBAL_DYNAMIC:
15940 dest = gen_reg_rtx (Pmode);
15942 if (!TARGET_64BIT)
15944 if (flag_pic && !TARGET_PECOFF)
15945 pic = pic_offset_table_rtx;
15946 else
15948 pic = gen_reg_rtx (Pmode);
15949 emit_insn (gen_set_got (pic));
15953 if (TARGET_GNU2_TLS)
15955 if (TARGET_64BIT)
15956 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
15957 else
15958 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
15960 tp = get_thread_pointer (Pmode, true);
15961 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
15963 if (GET_MODE (x) != Pmode)
15964 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15966 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
15968 else
15970 rtx caddr = ix86_tls_get_addr ();
15972 if (TARGET_64BIT)
15974 rtx rax = gen_rtx_REG (Pmode, AX_REG);
15975 rtx_insn *insns;
15977 start_sequence ();
15978 emit_call_insn
15979 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
15980 insns = get_insns ();
15981 end_sequence ();
15983 if (GET_MODE (x) != Pmode)
15984 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15986 RTL_CONST_CALL_P (insns) = 1;
15987 emit_libcall_block (insns, dest, rax, x);
15989 else
15990 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
15992 break;
15994 case TLS_MODEL_LOCAL_DYNAMIC:
15995 base = gen_reg_rtx (Pmode);
15997 if (!TARGET_64BIT)
15999 if (flag_pic)
16000 pic = pic_offset_table_rtx;
16001 else
16003 pic = gen_reg_rtx (Pmode);
16004 emit_insn (gen_set_got (pic));
16008 if (TARGET_GNU2_TLS)
16010 rtx tmp = ix86_tls_module_base ();
16012 if (TARGET_64BIT)
16013 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16014 else
16015 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16017 tp = get_thread_pointer (Pmode, true);
16018 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16019 gen_rtx_MINUS (Pmode, tmp, tp));
16021 else
16023 rtx caddr = ix86_tls_get_addr ();
16025 if (TARGET_64BIT)
16027 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16028 rtx_insn *insns;
16029 rtx eqv;
16031 start_sequence ();
16032 emit_call_insn
16033 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16034 insns = get_insns ();
16035 end_sequence ();
16037 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16038 share the LD_BASE result with other LD model accesses. */
16039 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16040 UNSPEC_TLS_LD_BASE);
16042 RTL_CONST_CALL_P (insns) = 1;
16043 emit_libcall_block (insns, base, rax, eqv);
16045 else
16046 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16049 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16050 off = gen_rtx_CONST (Pmode, off);
16052 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16054 if (TARGET_GNU2_TLS)
16056 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16058 if (GET_MODE (x) != Pmode)
16059 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16061 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16063 break;
16065 case TLS_MODEL_INITIAL_EXEC:
16066 if (TARGET_64BIT)
16068 if (TARGET_SUN_TLS && !TARGET_X32)
16070 /* The Sun linker took the AMD64 TLS spec literally
16071 and can only handle %rax as destination of the
16072 initial executable code sequence. */
16074 dest = gen_reg_rtx (DImode);
16075 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16076 return dest;
16079 /* Generate DImode references to avoid %fs:(%reg32)
16080 problems and the linker IE->LE relaxation bug. */
16081 tp_mode = DImode;
16082 pic = NULL;
16083 type = UNSPEC_GOTNTPOFF;
16085 else if (flag_pic)
16087 pic = pic_offset_table_rtx;
16088 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16090 else if (!TARGET_ANY_GNU_TLS)
16092 pic = gen_reg_rtx (Pmode);
16093 emit_insn (gen_set_got (pic));
16094 type = UNSPEC_GOTTPOFF;
16096 else
16098 pic = NULL;
16099 type = UNSPEC_INDNTPOFF;
16102 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16103 off = gen_rtx_CONST (tp_mode, off);
16104 if (pic)
16105 off = gen_rtx_PLUS (tp_mode, pic, off);
16106 off = gen_const_mem (tp_mode, off);
16107 set_mem_alias_set (off, ix86_GOT_alias_set ());
16109 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16111 base = get_thread_pointer (tp_mode,
16112 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16113 off = force_reg (tp_mode, off);
16114 dest = gen_rtx_PLUS (tp_mode, base, off);
16115 if (tp_mode != Pmode)
16116 dest = convert_to_mode (Pmode, dest, 1);
16118 else
16120 base = get_thread_pointer (Pmode, true);
16121 dest = gen_reg_rtx (Pmode);
16122 emit_insn (ix86_gen_sub3 (dest, base, off));
16124 break;
16126 case TLS_MODEL_LOCAL_EXEC:
16127 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16128 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16129 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16130 off = gen_rtx_CONST (Pmode, off);
16132 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16134 base = get_thread_pointer (Pmode,
16135 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16136 return gen_rtx_PLUS (Pmode, base, off);
16138 else
16140 base = get_thread_pointer (Pmode, true);
16141 dest = gen_reg_rtx (Pmode);
16142 emit_insn (ix86_gen_sub3 (dest, base, off));
16144 break;
16146 default:
16147 gcc_unreachable ();
16150 return dest;
16153 /* Return true if OP refers to a TLS address. */
16154 bool
16155 ix86_tls_address_pattern_p (rtx op)
16157 subrtx_var_iterator::array_type array;
16158 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16160 rtx op = *iter;
16161 if (MEM_P (op))
16163 rtx *x = &XEXP (op, 0);
16164 while (GET_CODE (*x) == PLUS)
16166 int i;
16167 for (i = 0; i < 2; i++)
16169 rtx u = XEXP (*x, i);
16170 if (GET_CODE (u) == ZERO_EXTEND)
16171 u = XEXP (u, 0);
16172 if (GET_CODE (u) == UNSPEC
16173 && XINT (u, 1) == UNSPEC_TP)
16174 return true;
16176 x = &XEXP (*x, 0);
16179 iter.skip_subrtxes ();
16183 return false;
16187 /* Rewrite *LOC so that it refers to the default TLS address space. */
16187 void
16188 ix86_rewrite_tls_address_1 (rtx *loc)
16190 subrtx_ptr_iterator::array_type array;
16191 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16193 rtx *loc = *iter;
16194 if (MEM_P (*loc))
16196 rtx addr = XEXP (*loc, 0);
16197 rtx *x = &addr;
16198 while (GET_CODE (*x) == PLUS)
16200 int i;
16201 for (i = 0; i < 2; i++)
16203 rtx u = XEXP (*x, i);
16204 if (GET_CODE (u) == ZERO_EXTEND)
16205 u = XEXP (u, 0);
16206 if (GET_CODE (u) == UNSPEC
16207 && XINT (u, 1) == UNSPEC_TP)
16209 addr_space_t as = DEFAULT_TLS_SEG_REG;
16211 *x = XEXP (*x, 1 - i);
16213 *loc = replace_equiv_address_nv (*loc, addr, true);
16214 set_mem_addr_space (*loc, as);
16215 return;
16218 x = &XEXP (*x, 0);
16221 iter.skip_subrtxes ();
16226 /* Rewrite an instruction pattern involving a TLS address
16227 so that it refers to the default TLS address space. */
16229 ix86_rewrite_tls_address (rtx pattern)
16231 pattern = copy_insn (pattern);
16232 ix86_rewrite_tls_address_1 (&pattern);
16233 return pattern;
16236 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16237 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16238 unique refptr-DECL symbol corresponding to symbol DECL. */
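/* For a decl named FOO, the dllimport case below yields a read-only mem of
   the symbol "*__imp_FOO" (or "*__imp__FOO" when an underscore user label
   prefix is in use), and the refptr case likewise yields "*.refptr.FOO" or
   "*refptr.FOO" depending on the user label prefix.  */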
16240 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16242 static inline hashval_t hash (tree_map *m) { return m->hash; }
16243 static inline bool
16244 equal (tree_map *a, tree_map *b)
16246 return a->base.from == b->base.from;
16249 static int
16250 keep_cache_entry (tree_map *&m)
16252 return ggc_marked_p (m->base.from);
16256 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16258 static tree
16259 get_dllimport_decl (tree decl, bool beimport)
16261 struct tree_map *h, in;
16262 const char *name;
16263 const char *prefix;
16264 size_t namelen, prefixlen;
16265 char *imp_name;
16266 tree to;
16267 rtx rtl;
16269 if (!dllimport_map)
16270 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16272 in.hash = htab_hash_pointer (decl);
16273 in.base.from = decl;
16274 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16275 h = *loc;
16276 if (h)
16277 return h->to;
16279 *loc = h = ggc_alloc<tree_map> ();
16280 h->hash = in.hash;
16281 h->base.from = decl;
16282 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16283 VAR_DECL, NULL, ptr_type_node);
16284 DECL_ARTIFICIAL (to) = 1;
16285 DECL_IGNORED_P (to) = 1;
16286 DECL_EXTERNAL (to) = 1;
16287 TREE_READONLY (to) = 1;
16289 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16290 name = targetm.strip_name_encoding (name);
16291 if (beimport)
16292 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16293 ? "*__imp_" : "*__imp__";
16294 else
16295 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16296 namelen = strlen (name);
16297 prefixlen = strlen (prefix);
16298 imp_name = (char *) alloca (namelen + prefixlen + 1);
16299 memcpy (imp_name, prefix, prefixlen);
16300 memcpy (imp_name + prefixlen, name, namelen + 1);
16302 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16303 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16304 SET_SYMBOL_REF_DECL (rtl, to);
16305 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16306 if (!beimport)
16308 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16309 #ifdef SUB_TARGET_RECORD_STUB
16310 SUB_TARGET_RECORD_STUB (name);
16311 #endif
16314 rtl = gen_const_mem (Pmode, rtl);
16315 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16317 SET_DECL_RTL (to, rtl);
16318 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16320 return to;
16323 /* Expand SYMBOL into its corresponding far-address symbol.
16324 WANT_REG is true if we require the result be a register. */
16326 static rtx
16327 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16329 tree imp_decl;
16330 rtx x;
16332 gcc_assert (SYMBOL_REF_DECL (symbol));
16333 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16335 x = DECL_RTL (imp_decl);
16336 if (want_reg)
16337 x = force_reg (Pmode, x);
16338 return x;
16341 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16342 true if we require the result be a register. */
16344 static rtx
16345 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16347 tree imp_decl;
16348 rtx x;
16350 gcc_assert (SYMBOL_REF_DECL (symbol));
16351 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16353 x = DECL_RTL (imp_decl);
16354 if (want_reg)
16355 x = force_reg (Pmode, x);
16356 return x;
16359 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16360 is true if we require the result be a register. */
16362 static rtx
16363 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16365 if (!TARGET_PECOFF)
16366 return NULL_RTX;
16368 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16370 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16371 return legitimize_dllimport_symbol (addr, inreg);
16372 if (GET_CODE (addr) == CONST
16373 && GET_CODE (XEXP (addr, 0)) == PLUS
16374 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16375 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16377 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16378 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16382 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16383 return NULL_RTX;
16384 if (GET_CODE (addr) == SYMBOL_REF
16385 && !is_imported_p (addr)
16386 && SYMBOL_REF_EXTERNAL_P (addr)
16387 && SYMBOL_REF_DECL (addr))
16388 return legitimize_pe_coff_extern_decl (addr, inreg);
16390 if (GET_CODE (addr) == CONST
16391 && GET_CODE (XEXP (addr, 0)) == PLUS
16392 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16393 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16394 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16395 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16397 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16398 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16400 return NULL_RTX;
16403 /* Try machine-dependent ways of modifying an illegitimate address
16404 to be legitimate. If we find one, return the new, valid address.
16405 This macro is used in only one place: `memory_address' in explow.c.
16407 OLDX is the address as it was before break_out_memory_refs was called.
16408 In some cases it is useful to look at this to decide what needs to be done.
16410 It is always safe for this macro to do nothing. It exists to recognize
16411 opportunities to optimize the output.
16413 For the 80386, we handle X+REG by loading X into a register R and
16414 using R+REG. R will go in a general reg and indexing will be used.
16415 However, if REG is a broken-out memory address or multiplication,
16416 nothing needs to be done because REG can certainly go in a general reg.
16418 When -fpic is used, special handling is needed for symbolic references.
16419 See comments by legitimize_pic_address in i386.c for details. */
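/* For instance, an address such as (plus (reg) (ashift (reg) (const_int 2)))
   is first canonicalized below into (plus (mult (reg) (const_int 4)) (reg)),
   which ix86_legitimate_address_p can then accept as a base plus index*4
   address.  */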
16421 static rtx
16422 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16424 bool changed = false;
16425 unsigned log;
16427 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16428 if (log)
16429 return legitimize_tls_address (x, (enum tls_model) log, false);
16430 if (GET_CODE (x) == CONST
16431 && GET_CODE (XEXP (x, 0)) == PLUS
16432 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16433 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16435 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16436 (enum tls_model) log, false);
16437 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16440 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16442 rtx tmp = legitimize_pe_coff_symbol (x, true);
16443 if (tmp)
16444 return tmp;
16447 if (flag_pic && SYMBOLIC_CONST (x))
16448 return legitimize_pic_address (x, 0);
16450 #if TARGET_MACHO
16451 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16452 return machopic_indirect_data_reference (x, 0);
16453 #endif
16455 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
16456 if (GET_CODE (x) == ASHIFT
16457 && CONST_INT_P (XEXP (x, 1))
16458 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16460 changed = true;
16461 log = INTVAL (XEXP (x, 1));
16462 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16463 GEN_INT (1 << log));
16466 if (GET_CODE (x) == PLUS)
16468 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16470 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16471 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16472 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16474 changed = true;
16475 log = INTVAL (XEXP (XEXP (x, 0), 1));
16476 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16477 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16478 GEN_INT (1 << log));
16481 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16482 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16483 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16485 changed = true;
16486 log = INTVAL (XEXP (XEXP (x, 1), 1));
16487 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16488 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16489 GEN_INT (1 << log));
16492 /* Put multiply first if it isn't already. */
16493 if (GET_CODE (XEXP (x, 1)) == MULT)
16495 std::swap (XEXP (x, 0), XEXP (x, 1));
16496 changed = true;
16499 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16500 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16501 created by virtual register instantiation, register elimination, and
16502 similar optimizations. */
16503 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16505 changed = true;
16506 x = gen_rtx_PLUS (Pmode,
16507 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16508 XEXP (XEXP (x, 1), 0)),
16509 XEXP (XEXP (x, 1), 1));
16512 /* Canonicalize
16513 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16514 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16515 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16516 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16517 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16518 && CONSTANT_P (XEXP (x, 1)))
16520 rtx constant;
16521 rtx other = NULL_RTX;
16523 if (CONST_INT_P (XEXP (x, 1)))
16525 constant = XEXP (x, 1);
16526 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16528 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16530 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16531 other = XEXP (x, 1);
16533 else
16534 constant = 0;
16536 if (constant)
16538 changed = true;
16539 x = gen_rtx_PLUS (Pmode,
16540 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16541 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16542 plus_constant (Pmode, other,
16543 INTVAL (constant)));
16547 if (changed && ix86_legitimate_address_p (mode, x, false))
16548 return x;
16550 if (GET_CODE (XEXP (x, 0)) == MULT)
16552 changed = true;
16553 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16556 if (GET_CODE (XEXP (x, 1)) == MULT)
16558 changed = true;
16559 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16562 if (changed
16563 && REG_P (XEXP (x, 1))
16564 && REG_P (XEXP (x, 0)))
16565 return x;
16567 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16569 changed = true;
16570 x = legitimize_pic_address (x, 0);
16573 if (changed && ix86_legitimate_address_p (mode, x, false))
16574 return x;
16576 if (REG_P (XEXP (x, 0)))
16578 rtx temp = gen_reg_rtx (Pmode);
16579 rtx val = force_operand (XEXP (x, 1), temp);
16580 if (val != temp)
16582 val = convert_to_mode (Pmode, val, 1);
16583 emit_move_insn (temp, val);
16586 XEXP (x, 1) = temp;
16587 return x;
16590 else if (REG_P (XEXP (x, 1)))
16592 rtx temp = gen_reg_rtx (Pmode);
16593 rtx val = force_operand (XEXP (x, 0), temp);
16594 if (val != temp)
16596 val = convert_to_mode (Pmode, val, 1);
16597 emit_move_insn (temp, val);
16600 XEXP (x, 0) = temp;
16601 return x;
16605 return x;
16608 /* Print an integer constant expression in assembler syntax. Addition
16609 and subtraction are the only arithmetic that may appear in these
16610 expressions. FILE is the stdio stream to write to, X is the rtx, and
16611 CODE is the operand print code from the output string. */
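/* For example, (const (unspec [foo] UNSPEC_GOTOFF)) is printed as
   "foo@GOTOFF", and (const (unspec [foo] UNSPEC_GOTPCREL)) as
   "foo@GOTPCREL(%rip)" in AT&T syntax.  */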
16613 static void
16614 output_pic_addr_const (FILE *file, rtx x, int code)
16616 char buf[256];
16618 switch (GET_CODE (x))
16620 case PC:
16621 gcc_assert (flag_pic);
16622 putc ('.', file);
16623 break;
16625 case SYMBOL_REF:
16626 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16627 output_addr_const (file, x);
16628 else
16630 const char *name = XSTR (x, 0);
16632 /* Mark the decl as referenced so that cgraph will
16633 output the function. */
16634 if (SYMBOL_REF_DECL (x))
16635 mark_decl_referenced (SYMBOL_REF_DECL (x));
16637 #if TARGET_MACHO
16638 if (MACHOPIC_INDIRECT
16639 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16640 name = machopic_indirection_name (x, /*stub_p=*/true);
16641 #endif
16642 assemble_name (file, name);
16644 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16645 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16646 fputs ("@PLT", file);
16647 break;
16649 case LABEL_REF:
16650 x = XEXP (x, 0);
16651 /* FALLTHRU */
16652 case CODE_LABEL:
16653 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16654 assemble_name (asm_out_file, buf);
16655 break;
16657 case CONST_INT:
16658 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16659 break;
16661 case CONST:
16662 /* This used to output parentheses around the expression,
16663 but that does not work on the 386 (either ATT or BSD assembler). */
16664 output_pic_addr_const (file, XEXP (x, 0), code);
16665 break;
16667 case CONST_DOUBLE:
16668 /* We can't handle floating point constants;
16669 TARGET_PRINT_OPERAND must handle them. */
16670 output_operand_lossage ("floating constant misused");
16671 break;
16673 case PLUS:
16674 /* Some assemblers need integer constants to appear first. */
16675 if (CONST_INT_P (XEXP (x, 0)))
16677 output_pic_addr_const (file, XEXP (x, 0), code);
16678 putc ('+', file);
16679 output_pic_addr_const (file, XEXP (x, 1), code);
16681 else
16683 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16684 output_pic_addr_const (file, XEXP (x, 1), code);
16685 putc ('+', file);
16686 output_pic_addr_const (file, XEXP (x, 0), code);
16688 break;
16690 case MINUS:
16691 if (!TARGET_MACHO)
16692 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16693 output_pic_addr_const (file, XEXP (x, 0), code);
16694 putc ('-', file);
16695 output_pic_addr_const (file, XEXP (x, 1), code);
16696 if (!TARGET_MACHO)
16697 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16698 break;
16700 case UNSPEC:
16701 gcc_assert (XVECLEN (x, 0) == 1);
16702 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16703 switch (XINT (x, 1))
16705 case UNSPEC_GOT:
16706 fputs ("@GOT", file);
16707 break;
16708 case UNSPEC_GOTOFF:
16709 fputs ("@GOTOFF", file);
16710 break;
16711 case UNSPEC_PLTOFF:
16712 fputs ("@PLTOFF", file);
16713 break;
16714 case UNSPEC_PCREL:
16715 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16716 "(%rip)" : "[rip]", file);
16717 break;
16718 case UNSPEC_GOTPCREL:
16719 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16720 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16721 break;
16722 case UNSPEC_GOTTPOFF:
16723 /* FIXME: This might be @TPOFF in Sun ld too. */
16724 fputs ("@gottpoff", file);
16725 break;
16726 case UNSPEC_TPOFF:
16727 fputs ("@tpoff", file);
16728 break;
16729 case UNSPEC_NTPOFF:
16730 if (TARGET_64BIT)
16731 fputs ("@tpoff", file);
16732 else
16733 fputs ("@ntpoff", file);
16734 break;
16735 case UNSPEC_DTPOFF:
16736 fputs ("@dtpoff", file);
16737 break;
16738 case UNSPEC_GOTNTPOFF:
16739 if (TARGET_64BIT)
16740 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16741 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16742 else
16743 fputs ("@gotntpoff", file);
16744 break;
16745 case UNSPEC_INDNTPOFF:
16746 fputs ("@indntpoff", file);
16747 break;
16748 #if TARGET_MACHO
16749 case UNSPEC_MACHOPIC_OFFSET:
16750 putc ('-', file);
16751 machopic_output_function_base_name (file);
16752 break;
16753 #endif
16754 default:
16755 output_operand_lossage ("invalid UNSPEC as operand");
16756 break;
16758 break;
16760 default:
16761 output_operand_lossage ("invalid expression as operand");
16765 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16766 We need to emit DTP-relative relocations. */
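/* E.g. for a 4-byte slot this emits ASM_LONG followed by "x@dtpoff"; for
   an 8-byte slot a ", 0" is appended to fill the upper half.  */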
16768 static void ATTRIBUTE_UNUSED
16769 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16771 fputs (ASM_LONG, file);
16772 output_addr_const (file, x);
16773 fputs ("@dtpoff", file);
16774 switch (size)
16776 case 4:
16777 break;
16778 case 8:
16779 fputs (", 0", file);
16780 break;
16781 default:
16782 gcc_unreachable ();
16786 /* Return true if X is a representation of the PIC register. This copes
16787 with calls from ix86_find_base_term, where the register might have
16788 been replaced by a cselib value. */
16790 static bool
16791 ix86_pic_register_p (rtx x)
16793 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16794 return (pic_offset_table_rtx
16795 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16796 else if (!REG_P (x))
16797 return false;
16798 else if (pic_offset_table_rtx)
16800 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16801 return true;
16802 if (HARD_REGISTER_P (x)
16803 && !HARD_REGISTER_P (pic_offset_table_rtx)
16804 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16805 return true;
16806 return false;
16808 else
16809 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16812 /* Helper function for ix86_delegitimize_address.
16813 Attempt to delegitimize TLS local-exec accesses. */
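/* E.g. a memory reference whose address decomposes to the TLS segment
   register with a (const (unspec [foo] UNSPEC_NTPOFF)) displacement is
   rewritten back to plain FOO, plus any base, index and constant offset
   that were part of the original address.  */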
16815 static rtx
16816 ix86_delegitimize_tls_address (rtx orig_x)
16818 rtx x = orig_x, unspec;
16819 struct ix86_address addr;
16821 if (!TARGET_TLS_DIRECT_SEG_REFS)
16822 return orig_x;
16823 if (MEM_P (x))
16824 x = XEXP (x, 0);
16825 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
16826 return orig_x;
16827 if (ix86_decompose_address (x, &addr) == 0
16828 || addr.seg != DEFAULT_TLS_SEG_REG
16829 || addr.disp == NULL_RTX
16830 || GET_CODE (addr.disp) != CONST)
16831 return orig_x;
16832 unspec = XEXP (addr.disp, 0);
16833 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
16834 unspec = XEXP (unspec, 0);
16835 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
16836 return orig_x;
16837 x = XVECEXP (unspec, 0, 0);
16838 gcc_assert (GET_CODE (x) == SYMBOL_REF);
16839 if (unspec != XEXP (addr.disp, 0))
16840 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
16841 if (addr.index)
16843 rtx idx = addr.index;
16844 if (addr.scale != 1)
16845 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
16846 x = gen_rtx_PLUS (Pmode, idx, x);
16848 if (addr.base)
16849 x = gen_rtx_PLUS (Pmode, addr.base, x);
16850 if (MEM_P (orig_x))
16851 x = replace_equiv_address_nv (orig_x, x);
16852 return x;
16855 /* In the name of slightly smaller debug output, and to cater to
16856 general assembler lossage, recognize PIC+GOTOFF and turn it back
16857 into a direct symbol reference.
16859 On Darwin, this is necessary to avoid a crash, because Darwin
16860 has a different PIC label for each routine but the DWARF debugging
16861 information is not associated with any particular routine, so it's
16862 necessary to remove references to the PIC label from RTL stored by
16863 the DWARF output code.
16865 This helper is used in the normal ix86_delegitimize_address
16866 entrypoint (e.g. used in the target delegitimization hook) and
16867 in ix86_find_base_term. As compile time memory optimization, we
16868 avoid allocating rtxes that will not change anything on the outcome
16869 of the callers (find_base_value and find_base_term). */
16871 static inline rtx
16872 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
16874 rtx orig_x = delegitimize_mem_from_attrs (x);
16875 /* addend is NULL or some rtx if x is something+GOTOFF where
16876 something doesn't include the PIC register. */
16877 rtx addend = NULL_RTX;
16878 /* reg_addend is NULL or a multiple of some register. */
16879 rtx reg_addend = NULL_RTX;
16880 /* const_addend is NULL or a const_int. */
16881 rtx const_addend = NULL_RTX;
16882 /* This is the result, or NULL. */
16883 rtx result = NULL_RTX;
16885 x = orig_x;
16887 if (MEM_P (x))
16888 x = XEXP (x, 0);
16890 if (TARGET_64BIT)
16892 if (GET_CODE (x) == CONST
16893 && GET_CODE (XEXP (x, 0)) == PLUS
16894 && GET_MODE (XEXP (x, 0)) == Pmode
16895 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16896 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
16897 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
16899 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
16900 base. A CONST can't be arg_pointer_rtx based. */
16901 if (base_term_p && MEM_P (orig_x))
16902 return orig_x;
16903 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
16904 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
16905 if (MEM_P (orig_x))
16906 x = replace_equiv_address_nv (orig_x, x);
16907 return x;
16910 if (GET_CODE (x) == CONST
16911 && GET_CODE (XEXP (x, 0)) == UNSPEC
16912 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
16913 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
16914 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
16916 x = XVECEXP (XEXP (x, 0), 0, 0);
16917 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
16919 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
16920 if (x == NULL_RTX)
16921 return orig_x;
16923 return x;
16926 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
16927 return ix86_delegitimize_tls_address (orig_x);
16929 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
16930 and -mcmodel=medium -fpic. */
16933 if (GET_CODE (x) != PLUS
16934 || GET_CODE (XEXP (x, 1)) != CONST)
16935 return ix86_delegitimize_tls_address (orig_x);
16937 if (ix86_pic_register_p (XEXP (x, 0)))
16938 /* %ebx + GOT/GOTOFF */
16940 else if (GET_CODE (XEXP (x, 0)) == PLUS)
16942 /* %ebx + %reg * scale + GOT/GOTOFF */
16943 reg_addend = XEXP (x, 0);
16944 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
16945 reg_addend = XEXP (reg_addend, 1);
16946 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
16947 reg_addend = XEXP (reg_addend, 0);
16948 else
16950 reg_addend = NULL_RTX;
16951 addend = XEXP (x, 0);
16954 else
16955 addend = XEXP (x, 0);
16957 x = XEXP (XEXP (x, 1), 0);
16958 if (GET_CODE (x) == PLUS
16959 && CONST_INT_P (XEXP (x, 1)))
16961 const_addend = XEXP (x, 1);
16962 x = XEXP (x, 0);
16965 if (GET_CODE (x) == UNSPEC
16966 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
16967 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
16968 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
16969 && !MEM_P (orig_x) && !addend)))
16970 result = XVECEXP (x, 0, 0);
16972 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
16973 && !MEM_P (orig_x))
16974 result = XVECEXP (x, 0, 0);
16976 if (! result)
16977 return ix86_delegitimize_tls_address (orig_x);
16979 /* For (PLUS something CONST_INT) both find_base_{value,term} just
16980 recurse on the first operand. */
16981 if (const_addend && !base_term_p)
16982 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
16983 if (reg_addend)
16984 result = gen_rtx_PLUS (Pmode, reg_addend, result);
16985 if (addend)
16987 /* If the rest of original X doesn't involve the PIC register, add
16988 addend and subtract pic_offset_table_rtx. This can happen e.g.
16989 for code like:
16990 leal (%ebx, %ecx, 4), %ecx
16992 movl foo@GOTOFF(%ecx), %edx
16993 in which case we return (%ecx - %ebx) + foo
16994 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
16995 and reload has completed. Don't do the latter for debug,
16996 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
16997 if (pic_offset_table_rtx
16998 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
16999 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17000 pic_offset_table_rtx),
17001 result);
17002 else if (base_term_p
17003 && pic_offset_table_rtx
17004 && !TARGET_MACHO
17005 && !TARGET_VXWORKS_RTP)
17007 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17008 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17009 result = gen_rtx_PLUS (Pmode, tmp, result);
17011 else
17012 return orig_x;
17014 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17016 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17017 if (result == NULL_RTX)
17018 return orig_x;
17020 return result;
17023 /* The normal instantiation of the above template. */
17025 static rtx
17026 ix86_delegitimize_address (rtx x)
17028 return ix86_delegitimize_address_1 (x, false);
17031 /* If X is a machine specific address (i.e. a symbol or label being
17032 referenced as a displacement from the GOT implemented using an
17033 UNSPEC), then return the base term. Otherwise return X. */
17036 ix86_find_base_term (rtx x)
17038 rtx term;
17040 if (TARGET_64BIT)
17042 if (GET_CODE (x) != CONST)
17043 return x;
17044 term = XEXP (x, 0);
17045 if (GET_CODE (term) == PLUS
17046 && CONST_INT_P (XEXP (term, 1)))
17047 term = XEXP (term, 0);
17048 if (GET_CODE (term) != UNSPEC
17049 || (XINT (term, 1) != UNSPEC_GOTPCREL
17050 && XINT (term, 1) != UNSPEC_PCREL))
17051 return x;
17053 return XVECEXP (term, 0, 0);
17056 return ix86_delegitimize_address_1 (x, true);
17059 /* Return true if X shouldn't be emitted into the debug info.
17060 Disallow UNSPECs other than @gotoff - we can't easily emit the
17061 _GLOBAL_OFFSET_TABLE_ symbol into the .debug_info section, so we don't
17062 delegitimize it, but instead assemble it as @gotoff.
17063 Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17064 assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */
17066 static bool
17067 ix86_const_not_ok_for_debug_p (rtx x)
17069 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17070 return true;
17072 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17073 return true;
17075 return false;
17078 static void
17079 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17080 bool fp, FILE *file)
17082 const char *suffix;
17084 if (mode == CCFPmode)
17086 code = ix86_fp_compare_code_to_integer (code);
17087 mode = CCmode;
17089 if (reverse)
17090 code = reverse_condition (code);
17092 switch (code)
17094 case EQ:
17095 gcc_assert (mode != CCGZmode);
17096 switch (mode)
17098 case E_CCAmode:
17099 suffix = "a";
17100 break;
17101 case E_CCCmode:
17102 suffix = "c";
17103 break;
17104 case E_CCOmode:
17105 suffix = "o";
17106 break;
17107 case E_CCPmode:
17108 suffix = "p";
17109 break;
17110 case E_CCSmode:
17111 suffix = "s";
17112 break;
17113 default:
17114 suffix = "e";
17115 break;
17117 break;
17118 case NE:
17119 gcc_assert (mode != CCGZmode);
17120 switch (mode)
17122 case E_CCAmode:
17123 suffix = "na";
17124 break;
17125 case E_CCCmode:
17126 suffix = "nc";
17127 break;
17128 case E_CCOmode:
17129 suffix = "no";
17130 break;
17131 case E_CCPmode:
17132 suffix = "np";
17133 break;
17134 case E_CCSmode:
17135 suffix = "ns";
17136 break;
17137 default:
17138 suffix = "ne";
17139 break;
17141 break;
17142 case GT:
17143 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17144 suffix = "g";
17145 break;
17146 case GTU:
17147 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17148 Those same assemblers have the same but opposite lossage on cmov. */
17149 if (mode == CCmode)
17150 suffix = fp ? "nbe" : "a";
17151 else
17152 gcc_unreachable ();
17153 break;
17154 case LT:
17155 switch (mode)
17157 case E_CCNOmode:
17158 case E_CCGOCmode:
17159 suffix = "s";
17160 break;
17162 case E_CCmode:
17163 case E_CCGCmode:
17164 case E_CCGZmode:
17165 suffix = "l";
17166 break;
17168 default:
17169 gcc_unreachable ();
17171 break;
17172 case LTU:
17173 if (mode == CCmode || mode == CCGZmode)
17174 suffix = "b";
17175 else if (mode == CCCmode)
17176 suffix = fp ? "b" : "c";
17177 else
17178 gcc_unreachable ();
17179 break;
17180 case GE:
17181 switch (mode)
17183 case E_CCNOmode:
17184 case E_CCGOCmode:
17185 suffix = "ns";
17186 break;
17188 case E_CCmode:
17189 case E_CCGCmode:
17190 case E_CCGZmode:
17191 suffix = "ge";
17192 break;
17194 default:
17195 gcc_unreachable ();
17197 break;
17198 case GEU:
17199 if (mode == CCmode || mode == CCGZmode)
17200 suffix = "nb";
17201 else if (mode == CCCmode)
17202 suffix = fp ? "nb" : "nc";
17203 else
17204 gcc_unreachable ();
17205 break;
17206 case LE:
17207 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17208 suffix = "le";
17209 break;
17210 case LEU:
17211 if (mode == CCmode)
17212 suffix = "be";
17213 else
17214 gcc_unreachable ();
17215 break;
17216 case UNORDERED:
17217 suffix = fp ? "u" : "p";
17218 break;
17219 case ORDERED:
17220 suffix = fp ? "nu" : "np";
17221 break;
17222 default:
17223 gcc_unreachable ();
17225 fputs (suffix, file);
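/* Illustrative example: for code == EQ in CCmode the suffix is "e", so a
   "set%C0"-style template prints "sete" and a conditional jump prints "je";
   the reversed variant ('c') yields "setne" / "jne" instead.  */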
17228 /* Print the name of register X to FILE based on its machine mode and number.
17229 If CODE is 'w', pretend the mode is HImode.
17230 If CODE is 'b', pretend the mode is QImode.
17231 If CODE is 'k', pretend the mode is SImode.
17232 If CODE is 'q', pretend the mode is DImode.
17233 If CODE is 'x', pretend the mode is V4SFmode.
17234 If CODE is 't', pretend the mode is V8SFmode.
17235 If CODE is 'g', pretend the mode is V16SFmode.
17236 If CODE is 'h', pretend the reg is the 'high' byte register.
17237 If CODE is 'y', print "st(0)" instead of "st" if the reg is the stack top.
17238 If CODE is 'd', duplicate the operand for AVX instruction. */
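/* For instance (illustrative only), if operands[0] is the SImode AX
   register, "%q0" prints %rax (on a 64-bit target), "%k0" %eax,
   "%w0" %ax, "%b0" %al and "%h0" %ah in AT&T syntax.  */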
17241 void
17242 print_reg (rtx x, int code, FILE *file)
17244 const char *reg;
17245 int msize;
17246 unsigned int regno;
17247 bool duplicated;
17249 if (ASSEMBLER_DIALECT == ASM_ATT)
17250 putc ('%', file);
17252 if (x == pc_rtx)
17254 gcc_assert (TARGET_64BIT);
17255 fputs ("rip", file);
17256 return;
17259 if (code == 'y' && STACK_TOP_P (x))
17261 fputs ("st(0)", file);
17262 return;
17265 if (code == 'w')
17266 msize = 2;
17267 else if (code == 'b')
17268 msize = 1;
17269 else if (code == 'k')
17270 msize = 4;
17271 else if (code == 'q')
17272 msize = 8;
17273 else if (code == 'h')
17274 msize = 0;
17275 else if (code == 'x')
17276 msize = 16;
17277 else if (code == 't')
17278 msize = 32;
17279 else if (code == 'g')
17280 msize = 64;
17281 else
17282 msize = GET_MODE_SIZE (GET_MODE (x));
17284 regno = REGNO (x);
17286 if (regno == ARG_POINTER_REGNUM
17287 || regno == FRAME_POINTER_REGNUM
17288 || regno == FPSR_REG
17289 || regno == FPCR_REG)
17291 output_operand_lossage
17292 ("invalid use of register '%s'", reg_names[regno]);
17293 return;
17295 else if (regno == FLAGS_REG)
17297 output_operand_lossage ("invalid use of asm flag output");
17298 return;
17301 duplicated = code == 'd' && TARGET_AVX;
17303 switch (msize)
17305 case 16:
17306 case 12:
17307 case 8:
17308 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17309 warning (0, "unsupported size for integer register");
17310 /* FALLTHRU */
17311 case 4:
17312 if (LEGACY_INT_REGNO_P (regno))
17313 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17314 /* FALLTHRU */
17315 case 2:
17316 normal:
17317 reg = hi_reg_name[regno];
17318 break;
17319 case 1:
17320 if (regno >= ARRAY_SIZE (qi_reg_name))
17321 goto normal;
17322 if (!ANY_QI_REGNO_P (regno))
17323 error ("unsupported size for integer register");
17324 reg = qi_reg_name[regno];
17325 break;
17326 case 0:
17327 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17328 goto normal;
17329 reg = qi_high_reg_name[regno];
17330 break;
17331 case 32:
17332 case 64:
17333 if (SSE_REGNO_P (regno))
17335 gcc_assert (!duplicated);
17336 putc (msize == 32 ? 'y' : 'z', file);
17337 reg = hi_reg_name[regno] + 1;
17338 break;
17340 goto normal;
17341 default:
17342 gcc_unreachable ();
17345 fputs (reg, file);
17347 /* Irritatingly, AMD extended registers use a
17348 different naming convention: "r%d[bwd]". */
17349 if (REX_INT_REGNO_P (regno))
17351 gcc_assert (TARGET_64BIT);
17352 switch (msize)
17354 case 0:
17355 error ("extended registers have no high halves");
17356 break;
17357 case 1:
17358 putc ('b', file);
17359 break;
17360 case 2:
17361 putc ('w', file);
17362 break;
17363 case 4:
17364 putc ('d', file);
17365 break;
17366 case 8:
17367 /* no suffix */
17368 break;
17369 default:
17370 error ("unsupported operand size for extended register");
17371 break;
17373 return;
17376 if (duplicated)
17378 if (ASSEMBLER_DIALECT == ASM_ATT)
17379 fprintf (file, ", %%%s", reg);
17380 else
17381 fprintf (file, ", %s", reg);
17385 /* Meaning of CODE:
17386 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17387 C -- print opcode suffix for set/cmov insn.
17388 c -- like C, but print reversed condition
17389 F,f -- likewise, but for floating-point.
17390 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17391 otherwise nothing
17392 R -- print embedded rounding and sae.
17393 r -- print only sae.
17394 z -- print the opcode suffix for the size of the current operand.
17395 Z -- likewise, with special suffixes for x87 instructions.
17396 * -- print a star (in certain assembler syntax)
17397 A -- print an absolute memory reference.
17398 E -- print address with DImode register names if TARGET_64BIT.
17399 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17400 s -- print a shift double count, followed by the assembler's argument
17401 delimiter.
17402 b -- print the QImode name of the register for the indicated operand.
17403 %b0 would print %al if operands[0] is reg 0.
17404 w -- likewise, print the HImode name of the register.
17405 k -- likewise, print the SImode name of the register.
17406 q -- likewise, print the DImode name of the register.
17407 x -- likewise, print the V4SFmode name of the register.
17408 t -- likewise, print the V8SFmode name of the register.
17409 g -- likewise, print the V16SFmode name of the register.
17410 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17411 y -- print "st(0)" instead of "st" as a register.
17412 d -- print duplicated register operand for AVX instruction.
17413 D -- print condition for SSE cmp instruction.
17414 P -- if PIC, print an @PLT suffix.
17415 p -- print raw symbol name.
17416 X -- don't print any sort of PIC '@' suffix for a symbol.
17417 & -- print some in-use local-dynamic symbol name.
17418 H -- print a memory address offset by 8; used for sse high-parts
17419 Y -- print condition for XOP pcom* instruction.
17420 + -- print a branch hint as 'cs' or 'ds' prefix
17421 ; -- print a semicolon (after prefixes due to bug in older gas).
17422 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17423 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17424 ! -- print MPX prefix for jxx/call/ret instructions if required. */
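/* A small illustrative example of how these codes are used in insn
   templates: for a DImode register destination, "mov%z0\t{%1, %0|%0, %1}"
   expands to "movq" plus the operands in the dialect-appropriate order,
   while "%k0" would force the 32-bit (%e..) register name instead.  */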
17427 void
17428 ix86_print_operand (FILE *file, rtx x, int code)
17430 if (code)
17432 switch (code)
17434 case 'A':
17435 switch (ASSEMBLER_DIALECT)
17437 case ASM_ATT:
17438 putc ('*', file);
17439 break;
17441 case ASM_INTEL:
17442 /* Intel syntax. For absolute addresses, registers should not
17443 be surrounded by braces. */
17444 if (!REG_P (x))
17446 putc ('[', file);
17447 ix86_print_operand (file, x, 0);
17448 putc (']', file);
17449 return;
17451 break;
17453 default:
17454 gcc_unreachable ();
17457 ix86_print_operand (file, x, 0);
17458 return;
17460 case 'E':
17461 /* Wrap address in an UNSPEC to declare special handling. */
17462 if (TARGET_64BIT)
17463 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17465 output_address (VOIDmode, x);
17466 return;
17468 case 'L':
17469 if (ASSEMBLER_DIALECT == ASM_ATT)
17470 putc ('l', file);
17471 return;
17473 case 'W':
17474 if (ASSEMBLER_DIALECT == ASM_ATT)
17475 putc ('w', file);
17476 return;
17478 case 'B':
17479 if (ASSEMBLER_DIALECT == ASM_ATT)
17480 putc ('b', file);
17481 return;
17483 case 'Q':
17484 if (ASSEMBLER_DIALECT == ASM_ATT)
17485 putc ('l', file);
17486 return;
17488 case 'S':
17489 if (ASSEMBLER_DIALECT == ASM_ATT)
17490 putc ('s', file);
17491 return;
17493 case 'T':
17494 if (ASSEMBLER_DIALECT == ASM_ATT)
17495 putc ('t', file);
17496 return;
17498 case 'O':
17499 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17500 if (ASSEMBLER_DIALECT != ASM_ATT)
17501 return;
17503 switch (GET_MODE_SIZE (GET_MODE (x)))
17505 case 2:
17506 putc ('w', file);
17507 break;
17509 case 4:
17510 putc ('l', file);
17511 break;
17513 case 8:
17514 putc ('q', file);
17515 break;
17517 default:
17518 output_operand_lossage ("invalid operand size for operand "
17519 "code 'O'");
17520 return;
17523 putc ('.', file);
17524 #endif
17525 return;
17527 case 'z':
17528 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17530 /* Opcodes don't get size suffixes when using Intel syntax. */
17531 if (ASSEMBLER_DIALECT == ASM_INTEL)
17532 return;
17534 switch (GET_MODE_SIZE (GET_MODE (x)))
17536 case 1:
17537 putc ('b', file);
17538 return;
17540 case 2:
17541 putc ('w', file);
17542 return;
17544 case 4:
17545 putc ('l', file);
17546 return;
17548 case 8:
17549 putc ('q', file);
17550 return;
17552 default:
17553 output_operand_lossage ("invalid operand size for operand "
17554 "code 'z'");
17555 return;
17559 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17560 warning (0, "non-integer operand used with operand code 'z'");
17561 /* FALLTHRU */
17563 case 'Z':
17564 /* 387 opcodes don't get size suffixes when using Intel syntax. */
17565 if (ASSEMBLER_DIALECT == ASM_INTEL)
17566 return;
17568 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17570 switch (GET_MODE_SIZE (GET_MODE (x)))
17572 case 2:
17573 #ifdef HAVE_AS_IX86_FILDS
17574 putc ('s', file);
17575 #endif
17576 return;
17578 case 4:
17579 putc ('l', file);
17580 return;
17582 case 8:
17583 #ifdef HAVE_AS_IX86_FILDQ
17584 putc ('q', file);
17585 #else
17586 fputs ("ll", file);
17587 #endif
17588 return;
17590 default:
17591 break;
17594 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17596 /* 387 opcodes don't get size suffixes
17597 if the operands are registers. */
17598 if (STACK_REG_P (x))
17599 return;
17601 switch (GET_MODE_SIZE (GET_MODE (x)))
17603 case 4:
17604 putc ('s', file);
17605 return;
17607 case 8:
17608 putc ('l', file);
17609 return;
17611 case 12:
17612 case 16:
17613 putc ('t', file);
17614 return;
17616 default:
17617 break;
17620 else
17622 output_operand_lossage ("invalid operand type used with "
17623 "operand code 'Z'");
17624 return;
17627 output_operand_lossage ("invalid operand size for operand code 'Z'");
17628 return;
17630 case 'd':
17631 case 'b':
17632 case 'w':
17633 case 'k':
17634 case 'q':
17635 case 'h':
17636 case 't':
17637 case 'g':
17638 case 'y':
17639 case 'x':
17640 case 'X':
17641 case 'P':
17642 case 'p':
17643 break;
17645 case 's':
17646 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17648 ix86_print_operand (file, x, 0);
17649 fputs (", ", file);
17651 return;
17653 case 'Y':
17654 switch (GET_CODE (x))
17656 case NE:
17657 fputs ("neq", file);
17658 break;
17659 case EQ:
17660 fputs ("eq", file);
17661 break;
17662 case GE:
17663 case GEU:
17664 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17665 break;
17666 case GT:
17667 case GTU:
17668 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17669 break;
17670 case LE:
17671 case LEU:
17672 fputs ("le", file);
17673 break;
17674 case LT:
17675 case LTU:
17676 fputs ("lt", file);
17677 break;
17678 case UNORDERED:
17679 fputs ("unord", file);
17680 break;
17681 case ORDERED:
17682 fputs ("ord", file);
17683 break;
17684 case UNEQ:
17685 fputs ("ueq", file);
17686 break;
17687 case UNGE:
17688 fputs ("nlt", file);
17689 break;
17690 case UNGT:
17691 fputs ("nle", file);
17692 break;
17693 case UNLE:
17694 fputs ("ule", file);
17695 break;
17696 case UNLT:
17697 fputs ("ult", file);
17698 break;
17699 case LTGT:
17700 fputs ("une", file);
17701 break;
17702 default:
17703 output_operand_lossage ("operand is not a condition code, "
17704 "invalid operand code 'Y'");
17705 return;
17707 return;
17709 case 'D':
17710 /* A little bit of brain damage here. The SSE compare instructions
17711 use completely different names for the comparisons than the
17712 fp conditional moves do. */
17713 switch (GET_CODE (x))
17715 case UNEQ:
17716 if (TARGET_AVX)
17718 fputs ("eq_us", file);
17719 break;
17721 /* FALLTHRU */
17722 case EQ:
17723 fputs ("eq", file);
17724 break;
17725 case UNLT:
17726 if (TARGET_AVX)
17728 fputs ("nge", file);
17729 break;
17731 /* FALLTHRU */
17732 case LT:
17733 fputs ("lt", file);
17734 break;
17735 case UNLE:
17736 if (TARGET_AVX)
17738 fputs ("ngt", file);
17739 break;
17741 /* FALLTHRU */
17742 case LE:
17743 fputs ("le", file);
17744 break;
17745 case UNORDERED:
17746 fputs ("unord", file);
17747 break;
17748 case LTGT:
17749 if (TARGET_AVX)
17751 fputs ("neq_oq", file);
17752 break;
17754 /* FALLTHRU */
17755 case NE:
17756 fputs ("neq", file);
17757 break;
17758 case GE:
17759 if (TARGET_AVX)
17761 fputs ("ge", file);
17762 break;
17764 /* FALLTHRU */
17765 case UNGE:
17766 fputs ("nlt", file);
17767 break;
17768 case GT:
17769 if (TARGET_AVX)
17771 fputs ("gt", file);
17772 break;
17774 /* FALLTHRU */
17775 case UNGT:
17776 fputs ("nle", file);
17777 break;
17778 case ORDERED:
17779 fputs ("ord", file);
17780 break;
17781 default:
17782 output_operand_lossage ("operand is not a condition code, "
17783 "invalid operand code 'D'");
17784 return;
17786 return;
17788 case 'F':
17789 case 'f':
17790 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17791 if (ASSEMBLER_DIALECT == ASM_ATT)
17792 putc ('.', file);
17793 gcc_fallthrough ();
17794 #endif
17796 case 'C':
17797 case 'c':
17798 if (!COMPARISON_P (x))
17800 output_operand_lossage ("operand is not a condition code, "
17801 "invalid operand code '%c'", code);
17802 return;
17804 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17805 code == 'c' || code == 'f',
17806 code == 'F' || code == 'f',
17807 file);
17808 return;
17810 case 'H':
17811 if (!offsettable_memref_p (x))
17813 output_operand_lossage ("operand is not an offsettable memory "
17814 "reference, invalid operand code 'H'");
17815 return;
17817 /* It doesn't actually matter what mode we use here, as we're
17818 only going to use this for printing. */
17819 x = adjust_address_nv (x, DImode, 8);
17820 /* Output 'qword ptr' for intel assembler dialect. */
17821 if (ASSEMBLER_DIALECT == ASM_INTEL)
17822 code = 'q';
17823 break;
17825 case 'K':
17826 if (!CONST_INT_P (x))
17828 output_operand_lossage ("operand is not an integer, invalid "
17829 "operand code 'K'");
17830 return;
17833 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17834 #ifdef HAVE_AS_IX86_HLE
17835 fputs ("xacquire ", file);
17836 #else
17837 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17838 #endif
17839 else if (INTVAL (x) & IX86_HLE_RELEASE)
17840 #ifdef HAVE_AS_IX86_HLE
17841 fputs ("xrelease ", file);
17842 #else
17843 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17844 #endif
17845 /* We do not want to print the value of the operand. */
17846 return;
17848 case 'N':
17849 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17850 fputs ("{z}", file);
17851 return;
17853 case 'r':
17854 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
17856 output_operand_lossage ("operand is not a specific integer, "
17857 "invalid operand code 'r'");
17858 return;
17861 if (ASSEMBLER_DIALECT == ASM_INTEL)
17862 fputs (", ", file);
17864 fputs ("{sae}", file);
17866 if (ASSEMBLER_DIALECT == ASM_ATT)
17867 fputs (", ", file);
17869 return;
17871 case 'R':
17872 if (!CONST_INT_P (x))
17874 output_operand_lossage ("operand is not an integer, invalid "
17875 "operand code 'R'");
17876 return;
17879 if (ASSEMBLER_DIALECT == ASM_INTEL)
17880 fputs (", ", file);
17882 switch (INTVAL (x))
17884 case ROUND_NEAREST_INT | ROUND_SAE:
17885 fputs ("{rn-sae}", file);
17886 break;
17887 case ROUND_NEG_INF | ROUND_SAE:
17888 fputs ("{rd-sae}", file);
17889 break;
17890 case ROUND_POS_INF | ROUND_SAE:
17891 fputs ("{ru-sae}", file);
17892 break;
17893 case ROUND_ZERO | ROUND_SAE:
17894 fputs ("{rz-sae}", file);
17895 break;
17896 default:
17897 output_operand_lossage ("operand is not a specific integer, "
17898 "invalid operand code 'R'");
17901 if (ASSEMBLER_DIALECT == ASM_ATT)
17902 fputs (", ", file);
17904 return;
17906 case '*':
17907 if (ASSEMBLER_DIALECT == ASM_ATT)
17908 putc ('*', file);
17909 return;
17911 case '&':
17913 const char *name = get_some_local_dynamic_name ();
17914 if (name == NULL)
17915 output_operand_lossage ("'%%&' used without any "
17916 "local dynamic TLS references");
17917 else
17918 assemble_name (file, name);
17919 return;
17922 case '+':
17924 rtx x;
17926 if (!optimize
17927 || optimize_function_for_size_p (cfun)
17928 || !TARGET_BRANCH_PREDICTION_HINTS)
17929 return;
17931 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
17932 if (x)
17934 int pred_val = profile_probability::from_reg_br_prob_note
17935 (XINT (x, 0)).to_reg_br_prob_base ();
17937 if (pred_val < REG_BR_PROB_BASE * 45 / 100
17938 || pred_val > REG_BR_PROB_BASE * 55 / 100)
17940 bool taken = pred_val > REG_BR_PROB_BASE / 2;
17941 bool cputaken
17942 = final_forward_branch_p (current_output_insn) == 0;
17944 /* Emit hints only in the case where the default branch prediction
17945 heuristics would fail. */
17946 if (taken != cputaken)
17948 /* We use 3e (DS) prefix for taken branches and
17949 2e (CS) prefix for not taken branches. */
17950 if (taken)
17951 fputs ("ds ; ", file);
17952 else
17953 fputs ("cs ; ", file);
17957 return;
17960 case ';':
17961 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
17962 putc (';', file);
17963 #endif
17964 return;
17966 case '~':
17967 putc (TARGET_AVX2 ? 'i' : 'f', file);
17968 return;
17970 case '^':
17971 if (TARGET_64BIT && Pmode != word_mode)
17972 fputs ("addr32 ", file);
17973 return;
17975 case '!':
17976 if (ix86_bnd_prefixed_insn_p (current_output_insn))
17977 fputs ("bnd ", file);
17978 if (ix86_notrack_prefixed_insn_p (current_output_insn))
17979 fputs ("notrack ", file);
17980 return;
17982 default:
17983 output_operand_lossage ("invalid operand code '%c'", code);
17987 if (REG_P (x))
17988 print_reg (x, code, file);
17990 else if (MEM_P (x))
17992 rtx addr = XEXP (x, 0);
17994 /* No `byte ptr' prefix for call instructions ... */
17995 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
17997 machine_mode mode = GET_MODE (x);
17998 const char *size;
18000 /* Check for explicit size override codes. */
18001 if (code == 'b')
18002 size = "BYTE";
18003 else if (code == 'w')
18004 size = "WORD";
18005 else if (code == 'k')
18006 size = "DWORD";
18007 else if (code == 'q')
18008 size = "QWORD";
18009 else if (code == 'x')
18010 size = "XMMWORD";
18011 else if (code == 't')
18012 size = "YMMWORD";
18013 else if (code == 'g')
18014 size = "ZMMWORD";
18015 else if (mode == BLKmode)
18016 /* ... or BLKmode operands, when not overridden. */
18017 size = NULL;
18018 else
18019 switch (GET_MODE_SIZE (mode))
18021 case 1: size = "BYTE"; break;
18022 case 2: size = "WORD"; break;
18023 case 4: size = "DWORD"; break;
18024 case 8: size = "QWORD"; break;
18025 case 12: size = "TBYTE"; break;
18026 case 16:
18027 if (mode == XFmode)
18028 size = "TBYTE";
18029 else
18030 size = "XMMWORD";
18031 break;
18032 case 32: size = "YMMWORD"; break;
18033 case 64: size = "ZMMWORD"; break;
18034 default:
18035 gcc_unreachable ();
18037 if (size)
18039 fputs (size, file);
18040 fputs (" PTR ", file);
18044 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18045 output_operand_lossage ("invalid constraints for operand");
18046 else
18047 ix86_print_operand_address_as
18048 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18051 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18053 long l;
18055 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18057 if (ASSEMBLER_DIALECT == ASM_ATT)
18058 putc ('$', file);
18059 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18060 if (code == 'q')
18061 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18062 (unsigned long long) (int) l);
18063 else
18064 fprintf (file, "0x%08x", (unsigned int) l);
18067 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18069 long l[2];
18071 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18073 if (ASSEMBLER_DIALECT == ASM_ATT)
18074 putc ('$', file);
18075 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18078 /* These float cases don't actually occur as immediate operands. */
18079 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18081 char dstr[30];
18083 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18084 fputs (dstr, file);
18087 else
18089 /* We have patterns that allow zero sets of memory, for instance.
18090 In 64-bit mode, we should probably support all 8-byte vectors,
18091 since we can in fact encode that into an immediate. */
18092 if (GET_CODE (x) == CONST_VECTOR)
18094 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18095 x = const0_rtx;
18098 if (code != 'P' && code != 'p')
18100 if (CONST_INT_P (x))
18102 if (ASSEMBLER_DIALECT == ASM_ATT)
18103 putc ('$', file);
18105 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18106 || GET_CODE (x) == LABEL_REF)
18108 if (ASSEMBLER_DIALECT == ASM_ATT)
18109 putc ('$', file);
18110 else
18111 fputs ("OFFSET FLAT:", file);
18114 if (CONST_INT_P (x))
18115 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18116 else if (flag_pic || MACHOPIC_INDIRECT)
18117 output_pic_addr_const (file, x, code);
18118 else
18119 output_addr_const (file, x);
18123 static bool
18124 ix86_print_operand_punct_valid_p (unsigned char code)
18126 return (code == '*' || code == '+' || code == '&' || code == ';'
18127 || code == '~' || code == '^' || code == '!');
18130 /* Print a memory operand whose address is ADDR. */
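/* Illustrative example of the two dialects handled below: an address with
   base %eax, index %ebx, scale 4 and displacement 8 is printed as
   "8(%eax,%ebx,4)" in AT&T syntax and as "[eax+8+ebx*4]" in Intel
   syntax.  */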
18132 static void
18133 ix86_print_operand_address_as (FILE *file, rtx addr,
18134 addr_space_t as, bool no_rip)
18136 struct ix86_address parts;
18137 rtx base, index, disp;
18138 int scale;
18139 int ok;
18140 bool vsib = false;
18141 int code = 0;
18143 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18145 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18146 gcc_assert (parts.index == NULL_RTX);
18147 parts.index = XVECEXP (addr, 0, 1);
18148 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18149 addr = XVECEXP (addr, 0, 0);
18150 vsib = true;
18152 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18154 gcc_assert (TARGET_64BIT);
18155 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18156 code = 'q';
18158 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18160 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18161 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18162 if (parts.base != NULL_RTX)
18164 parts.index = parts.base;
18165 parts.scale = 1;
18167 parts.base = XVECEXP (addr, 0, 0);
18168 addr = XVECEXP (addr, 0, 0);
18170 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18172 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18173 gcc_assert (parts.index == NULL_RTX);
18174 parts.index = XVECEXP (addr, 0, 1);
18175 addr = XVECEXP (addr, 0, 0);
18177 else
18178 ok = ix86_decompose_address (addr, &parts);
18180 gcc_assert (ok);
18182 base = parts.base;
18183 index = parts.index;
18184 disp = parts.disp;
18185 scale = parts.scale;
18187 if (ADDR_SPACE_GENERIC_P (as))
18188 as = parts.seg;
18189 else
18190 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18192 if (!ADDR_SPACE_GENERIC_P (as))
18194 const char *string;
18196 if (as == ADDR_SPACE_SEG_FS)
18197 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18198 else if (as == ADDR_SPACE_SEG_GS)
18199 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18200 else
18201 gcc_unreachable ();
18202 fputs (string, file);
18205 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18206 if (TARGET_64BIT && !base && !index && !no_rip)
18208 rtx symbol = disp;
18210 if (GET_CODE (disp) == CONST
18211 && GET_CODE (XEXP (disp, 0)) == PLUS
18212 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18213 symbol = XEXP (XEXP (disp, 0), 0);
18215 if (GET_CODE (symbol) == LABEL_REF
18216 || (GET_CODE (symbol) == SYMBOL_REF
18217 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18218 base = pc_rtx;
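/* The effect of the above (illustrative example): for a non-TLS symbol,
   "movl foo(%rip), %eax" is emitted instead of the absolute
   "movl foo, %eax", saving one byte of encoding in 64-bit code.  */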
18221 if (!base && !index)
18223 /* A displacement-only address requires special attention. */
18224 if (CONST_INT_P (disp))
18226 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18227 fputs ("ds:", file);
18228 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18230 /* Load the external function address via the GOT slot to avoid PLT. */
18231 else if (GET_CODE (disp) == CONST
18232 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18233 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18234 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18235 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18236 output_pic_addr_const (file, disp, 0);
18237 else if (flag_pic)
18238 output_pic_addr_const (file, disp, 0);
18239 else
18240 output_addr_const (file, disp);
18242 else
18244 /* Print SImode register names to force addr32 prefix. */
18245 if (SImode_address_operand (addr, VOIDmode))
18247 if (flag_checking)
18249 gcc_assert (TARGET_64BIT);
18250 switch (GET_CODE (addr))
18252 case SUBREG:
18253 gcc_assert (GET_MODE (addr) == SImode);
18254 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18255 break;
18256 case ZERO_EXTEND:
18257 case AND:
18258 gcc_assert (GET_MODE (addr) == DImode);
18259 break;
18260 default:
18261 gcc_unreachable ();
18264 gcc_assert (!code);
18265 code = 'k';
18267 else if (code == 0
18268 && TARGET_X32
18269 && disp
18270 && CONST_INT_P (disp)
18271 && INTVAL (disp) < -16*1024*1024)
18273 /* X32 runs in 64-bit mode, where displacement, DISP, in
18274 address DISP(%r64), is encoded as 32-bit immediate sign-
18275 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18276 address is %r64 + 0xffffffffbffffd00. When %r64 <
18277 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18278 which is invalid for x32. The correct address is %r64
18279 - 0x40000300 == 0xf7ffdd64. To properly encode
18280 -0x40000300(%r64) for x32, we zero-extend negative
18281 displacement by forcing addr32 prefix which truncates
18282 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18283 zero-extend all negative displacements, including -1(%rsp).
18284 However, for small negative displacements, sign-extension
18285 won't cause overflow. We only zero-extend negative
18286 displacements if they < -16*1024*1024, which is also used
18287 to check legitimate address displacements for PIC. */
18288 code = 'k';
18291 /* Since the upper 32 bits of RSP are always zero for x32,
18292 we can encode %esp as %rsp to avoid 0x67 prefix if
18293 there is no index register. */
18294 if (TARGET_X32 && Pmode == SImode
18295 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18296 code = 'q';
18298 if (ASSEMBLER_DIALECT == ASM_ATT)
18300 if (disp)
18302 if (flag_pic)
18303 output_pic_addr_const (file, disp, 0);
18304 else if (GET_CODE (disp) == LABEL_REF)
18305 output_asm_label (disp);
18306 else
18307 output_addr_const (file, disp);
18310 putc ('(', file);
18311 if (base)
18312 print_reg (base, code, file);
18313 if (index)
18315 putc (',', file);
18316 print_reg (index, vsib ? 0 : code, file);
18317 if (scale != 1 || vsib)
18318 fprintf (file, ",%d", scale);
18320 putc (')', file);
18322 else
18324 rtx offset = NULL_RTX;
18326 if (disp)
18328 /* Pull out the offset of a symbol; print any symbol itself. */
18329 if (GET_CODE (disp) == CONST
18330 && GET_CODE (XEXP (disp, 0)) == PLUS
18331 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18333 offset = XEXP (XEXP (disp, 0), 1);
18334 disp = gen_rtx_CONST (VOIDmode,
18335 XEXP (XEXP (disp, 0), 0));
18338 if (flag_pic)
18339 output_pic_addr_const (file, disp, 0);
18340 else if (GET_CODE (disp) == LABEL_REF)
18341 output_asm_label (disp);
18342 else if (CONST_INT_P (disp))
18343 offset = disp;
18344 else
18345 output_addr_const (file, disp);
18348 putc ('[', file);
18349 if (base)
18351 print_reg (base, code, file);
18352 if (offset)
18354 if (INTVAL (offset) >= 0)
18355 putc ('+', file);
18356 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18359 else if (offset)
18360 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18361 else
18362 putc ('0', file);
18364 if (index)
18366 putc ('+', file);
18367 print_reg (index, vsib ? 0 : code, file);
18368 if (scale != 1 || vsib)
18369 fprintf (file, "*%d", scale);
18371 putc (']', file);
18376 static void
18377 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18379 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18382 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
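/* Illustrative example: (unspec [(symbol_ref ("foo"))] UNSPEC_GOTOFF) is
   emitted as "foo@gotoff", and UNSPEC_NTPOFF as "foo@tpoff" on 64-bit
   or "foo@ntpoff" on 32-bit, per the cases below.  */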
18384 static bool
18385 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18387 rtx op;
18389 if (GET_CODE (x) != UNSPEC)
18390 return false;
18392 op = XVECEXP (x, 0, 0);
18393 switch (XINT (x, 1))
18395 case UNSPEC_GOTOFF:
18396 output_addr_const (file, op);
18397 fputs ("@gotoff", file);
18398 break;
18399 case UNSPEC_GOTTPOFF:
18400 output_addr_const (file, op);
18401 /* FIXME: This might be @TPOFF in Sun ld. */
18402 fputs ("@gottpoff", file);
18403 break;
18404 case UNSPEC_TPOFF:
18405 output_addr_const (file, op);
18406 fputs ("@tpoff", file);
18407 break;
18408 case UNSPEC_NTPOFF:
18409 output_addr_const (file, op);
18410 if (TARGET_64BIT)
18411 fputs ("@tpoff", file);
18412 else
18413 fputs ("@ntpoff", file);
18414 break;
18415 case UNSPEC_DTPOFF:
18416 output_addr_const (file, op);
18417 fputs ("@dtpoff", file);
18418 break;
18419 case UNSPEC_GOTNTPOFF:
18420 output_addr_const (file, op);
18421 if (TARGET_64BIT)
18422 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18423 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18424 else
18425 fputs ("@gotntpoff", file);
18426 break;
18427 case UNSPEC_INDNTPOFF:
18428 output_addr_const (file, op);
18429 fputs ("@indntpoff", file);
18430 break;
18431 #if TARGET_MACHO
18432 case UNSPEC_MACHOPIC_OFFSET:
18433 output_addr_const (file, op);
18434 putc ('-', file);
18435 machopic_output_function_base_name (file);
18436 break;
18437 #endif
18439 default:
18440 return false;
18443 return true;
18446 /* Split one or more double-mode RTL references into pairs of half-mode
18447 references. The RTL can be REG, offsettable MEM, integer constant, or
18448 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18449 split and "num" is its length. lo_half and hi_half are output arrays
18450 that parallel "operands". */
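/* Illustrative example: splitting a DImode MEM on a 32-bit target yields
   lo_half = adjust_address (op, SImode, 0) and
   hi_half = adjust_address (op, SImode, 4); TImode operands split into
   DImode halves at offsets 0 and 8 in the same way.  */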
18452 void
18453 split_double_mode (machine_mode mode, rtx operands[],
18454 int num, rtx lo_half[], rtx hi_half[])
18456 machine_mode half_mode;
18457 unsigned int byte;
18459 switch (mode)
18461 case E_TImode:
18462 half_mode = DImode;
18463 break;
18464 case E_DImode:
18465 half_mode = SImode;
18466 break;
18467 default:
18468 gcc_unreachable ();
18471 byte = GET_MODE_SIZE (half_mode);
18473 while (num--)
18475 rtx op = operands[num];
18477 /* simplify_subreg refuses to split volatile memory addresses,
18478 but we still have to handle them. */
18479 if (MEM_P (op))
18481 lo_half[num] = adjust_address (op, half_mode, 0);
18482 hi_half[num] = adjust_address (op, half_mode, byte);
18484 else
18486 lo_half[num] = simplify_gen_subreg (half_mode, op,
18487 GET_MODE (op) == VOIDmode
18488 ? mode : GET_MODE (op), 0);
18489 hi_half[num] = simplify_gen_subreg (half_mode, op,
18490 GET_MODE (op) == VOIDmode
18491 ? mode : GET_MODE (op), byte);
18496 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18497 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18498 is the expression of the binary operation. The output may either be
18499 emitted here, or returned to the caller, like all output_* functions.
18501 There is no guarantee that the operands are the same mode, as they
18502 might be within FLOAT or FLOAT_EXTEND expressions. */
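/* Illustrative example of the strings built below: adding two SFmode SSE
   registers produces "%vaddss\t{%2, %0|%0, %2}" (or the three-operand
   AVX form "%vaddss\t{%2, %1, %0|%0, %1, %2}" when TARGET_AVX), while a
   plain x87 DFmode add becomes one of the "fadd"/"faddp" variants chosen
   further down.  */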
18504 #ifndef SYSV386_COMPAT
18505 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18506 wants to fix the assemblers because that causes incompatibility
18507 with gcc. No-one wants to fix gcc because that causes
18508 incompatibility with assemblers... You can use the option of
18509 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18510 #define SYSV386_COMPAT 1
18511 #endif
18513 const char *
18514 output_387_binary_op (rtx_insn *insn, rtx *operands)
18516 static char buf[40];
18517 const char *p;
18518 bool is_sse
18519 = (SSE_REG_P (operands[0])
18520 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
18522 if (is_sse)
18523 p = "%v";
18524 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18525 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18526 p = "fi";
18527 else
18528 p = "f";
18530 strcpy (buf, p);
18532 switch (GET_CODE (operands[3]))
18534 case PLUS:
18535 p = "add"; break;
18536 case MINUS:
18537 p = "sub"; break;
18538 case MULT:
18539 p = "mul"; break;
18540 case DIV:
18541 p = "div"; break;
18542 default:
18543 gcc_unreachable ();
18546 strcat (buf, p);
18548 if (is_sse)
18550 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
18551 strcat (buf, p);
18553 if (TARGET_AVX)
18554 p = "\t{%2, %1, %0|%0, %1, %2}";
18555 else
18556 p = "\t{%2, %0|%0, %2}";
18558 strcat (buf, p);
18559 return buf;
18562 /* Even if we do not want to check the inputs, this documents the input
18563 constraints, which helps in understanding the following code. */
18564 if (flag_checking)
18566 if (STACK_REG_P (operands[0])
18567 && ((REG_P (operands[1])
18568 && REGNO (operands[0]) == REGNO (operands[1])
18569 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18570 || (REG_P (operands[2])
18571 && REGNO (operands[0]) == REGNO (operands[2])
18572 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18573 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18574 ; /* ok */
18575 else
18576 gcc_unreachable ();
18579 switch (GET_CODE (operands[3]))
18581 case MULT:
18582 case PLUS:
18583 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18584 std::swap (operands[1], operands[2]);
18586 /* Now we know operands[0] == operands[1]. */
18588 if (MEM_P (operands[2]))
18590 p = "%Z2\t%2";
18591 break;
18594 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18596 if (STACK_TOP_P (operands[0]))
18597 /* How is it that we are storing to a dead operand[2]?
18598 Well, presumably operands[1] is dead too. We can't
18599 store the result to st(0) as st(0) gets popped on this
18600 instruction. Instead store to operands[2] (which I
18601 think has to be st(1)). st(1) will be popped later.
18602 gcc <= 2.8.1 didn't have this check and generated
18603 assembly code that the Unixware assembler rejected. */
18604 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18605 else
18606 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18607 break;
18610 if (STACK_TOP_P (operands[0]))
18611 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18612 else
18613 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18614 break;
18616 case MINUS:
18617 case DIV:
18618 if (MEM_P (operands[1]))
18620 p = "r%Z1\t%1";
18621 break;
18624 if (MEM_P (operands[2]))
18626 p = "%Z2\t%2";
18627 break;
18630 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18632 #if SYSV386_COMPAT
18633 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18634 derived assemblers, confusingly reverse the direction of
18635 the operation for fsub{r} and fdiv{r} when the
18636 destination register is not st(0). The Intel assembler
18637 doesn't have this brain damage. Read !SYSV386_COMPAT to
18638 figure out what the hardware really does. */
18639 if (STACK_TOP_P (operands[0]))
18640 p = "{p\t%0, %2|rp\t%2, %0}";
18641 else
18642 p = "{rp\t%2, %0|p\t%0, %2}";
18643 #else
18644 if (STACK_TOP_P (operands[0]))
18645 /* As above for fmul/fadd, we can't store to st(0). */
18646 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18647 else
18648 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18649 #endif
18650 break;
18653 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18655 #if SYSV386_COMPAT
18656 if (STACK_TOP_P (operands[0]))
18657 p = "{rp\t%0, %1|p\t%1, %0}";
18658 else
18659 p = "{p\t%1, %0|rp\t%0, %1}";
18660 #else
18661 if (STACK_TOP_P (operands[0]))
18662 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18663 else
18664 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18665 #endif
18666 break;
18669 if (STACK_TOP_P (operands[0]))
18671 if (STACK_TOP_P (operands[1]))
18672 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18673 else
18674 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18675 break;
18677 else if (STACK_TOP_P (operands[1]))
18679 #if SYSV386_COMPAT
18680 p = "{\t%1, %0|r\t%0, %1}";
18681 #else
18682 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18683 #endif
18685 else
18687 #if SYSV386_COMPAT
18688 p = "{r\t%2, %0|\t%0, %2}";
18689 #else
18690 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18691 #endif
18693 break;
18695 default:
18696 gcc_unreachable ();
18699 strcat (buf, p);
18700 return buf;
18703 /* Return needed mode for entity in optimize_mode_switching pass. */
18705 static int
18706 ix86_dirflag_mode_needed (rtx_insn *insn)
18708 if (CALL_P (insn))
18710 if (cfun->machine->func_type == TYPE_NORMAL)
18711 return X86_DIRFLAG_ANY;
18712 else
18713 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18714 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18717 if (recog_memoized (insn) < 0)
18718 return X86_DIRFLAG_ANY;
18720 if (get_attr_type (insn) == TYPE_STR)
18722 /* Emit a cld instruction if stringops are used in the function. */
18723 if (cfun->machine->func_type == TYPE_NORMAL)
18724 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18725 else
18726 return X86_DIRFLAG_RESET;
18729 return X86_DIRFLAG_ANY;
18732 /* Check if a 256bit or 512bit AVX register is referenced inside EXP. */
18734 static bool
18735 ix86_check_avx_upper_register (const_rtx exp)
18737 if (SUBREG_P (exp))
18738 exp = SUBREG_REG (exp);
18740 return (REG_P (exp)
18741 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
18742 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
18745 /* Return needed mode for entity in optimize_mode_switching pass. */
18747 static int
18748 ix86_avx_u128_mode_needed (rtx_insn *insn)
18750 if (CALL_P (insn))
18752 rtx link;
18754 /* Needed mode is set to AVX_U128_CLEAN if there are
18755 no 256bit or 512bit modes used in function arguments. */
18756 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18757 link;
18758 link = XEXP (link, 1))
18760 if (GET_CODE (XEXP (link, 0)) == USE)
18762 rtx arg = XEXP (XEXP (link, 0), 0);
18764 if (ix86_check_avx_upper_register (arg))
18765 return AVX_U128_DIRTY;
18769 return AVX_U128_CLEAN;
18772 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
18773 Hardware changes state only when a 256bit register is written to,
18774 but we need to prevent the compiler from moving the optimal insertion
18775 point above an eventual read from a 256bit or 512bit register. */
18776 subrtx_iterator::array_type array;
18777 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18778 if (ix86_check_avx_upper_register (*iter))
18779 return AVX_U128_DIRTY;
18781 return AVX_U128_ANY;
18784 /* Return mode that i387 must be switched into
18785 prior to the execution of insn. */
18787 static int
18788 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18790 enum attr_i387_cw mode;
18792 /* The mode UNINITIALIZED is used to store the control word after a
18793 function call or ASM pattern. The mode ANY specifies that the function
18794 has no requirements on the control word and makes no changes in the
18795 bits we are interested in. */
18797 if (CALL_P (insn)
18798 || (NONJUMP_INSN_P (insn)
18799 && (asm_noperands (PATTERN (insn)) >= 0
18800 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18801 return I387_CW_UNINITIALIZED;
18803 if (recog_memoized (insn) < 0)
18804 return I387_CW_ANY;
18806 mode = get_attr_i387_cw (insn);
18808 switch (entity)
18810 case I387_TRUNC:
18811 if (mode == I387_CW_TRUNC)
18812 return mode;
18813 break;
18815 case I387_FLOOR:
18816 if (mode == I387_CW_FLOOR)
18817 return mode;
18818 break;
18820 case I387_CEIL:
18821 if (mode == I387_CW_CEIL)
18822 return mode;
18823 break;
18825 case I387_MASK_PM:
18826 if (mode == I387_CW_MASK_PM)
18827 return mode;
18828 break;
18830 default:
18831 gcc_unreachable ();
18834 return I387_CW_ANY;
18837 /* Return mode that entity must be switched into
18838 prior to the execution of insn. */
18840 static int
18841 ix86_mode_needed (int entity, rtx_insn *insn)
18843 switch (entity)
18845 case X86_DIRFLAG:
18846 return ix86_dirflag_mode_needed (insn);
18847 case AVX_U128:
18848 return ix86_avx_u128_mode_needed (insn);
18849 case I387_TRUNC:
18850 case I387_FLOOR:
18851 case I387_CEIL:
18852 case I387_MASK_PM:
18853 return ix86_i387_mode_needed (entity, insn);
18854 default:
18855 gcc_unreachable ();
18857 return 0;
18860 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
18862 static void
18863 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
18865 if (ix86_check_avx_upper_register (dest))
18867 bool *used = (bool *) data;
18868 *used = true;
18872 /* Calculate mode of upper 128bit AVX registers after the insn. */
18874 static int
18875 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
18877 rtx pat = PATTERN (insn);
18879 if (vzeroupper_operation (pat, VOIDmode)
18880 || vzeroall_operation (pat, VOIDmode))
18881 return AVX_U128_CLEAN;
18883 /* We know that the state is clean after a CALL insn if no
18884 256bit or 512bit registers are used in the function return register. */
18885 if (CALL_P (insn))
18887 bool avx_upper_reg_found = false;
18888 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
18890 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
18893 /* Otherwise, return the current mode. Remember that if the insn
18894 references AVX 256bit or 512bit registers, the mode was already
18895 changed to DIRTY from MODE_NEEDED. */
18896 return mode;
18899 /* Return the mode that an insn results in. */
18901 static int
18902 ix86_mode_after (int entity, int mode, rtx_insn *insn)
18904 switch (entity)
18906 case X86_DIRFLAG:
18907 return mode;
18908 case AVX_U128:
18909 return ix86_avx_u128_mode_after (mode, insn);
18910 case I387_TRUNC:
18911 case I387_FLOOR:
18912 case I387_CEIL:
18913 case I387_MASK_PM:
18914 return mode;
18915 default:
18916 gcc_unreachable ();
18920 static int
18921 ix86_dirflag_mode_entry (void)
18923 /* For TARGET_CLD or in the interrupt handler we can't assume
18924 the direction flag state at function entry. */
18925 if (TARGET_CLD
18926 || cfun->machine->func_type != TYPE_NORMAL)
18927 return X86_DIRFLAG_ANY;
18929 return X86_DIRFLAG_RESET;
18932 static int
18933 ix86_avx_u128_mode_entry (void)
18935 tree arg;
18937 /* Entry mode is set to AVX_U128_DIRTY if there are
18938 256bit or 512bit modes used in function arguments. */
18939 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
18940 arg = TREE_CHAIN (arg))
18942 rtx incoming = DECL_INCOMING_RTL (arg);
18944 if (incoming && ix86_check_avx_upper_register (incoming))
18945 return AVX_U128_DIRTY;
18948 return AVX_U128_CLEAN;
18951 /* Return a mode that ENTITY is assumed to be
18952 switched to at function entry. */
18954 static int
18955 ix86_mode_entry (int entity)
18957 switch (entity)
18959 case X86_DIRFLAG:
18960 return ix86_dirflag_mode_entry ();
18961 case AVX_U128:
18962 return ix86_avx_u128_mode_entry ();
18963 case I387_TRUNC:
18964 case I387_FLOOR:
18965 case I387_CEIL:
18966 case I387_MASK_PM:
18967 return I387_CW_ANY;
18968 default:
18969 gcc_unreachable ();
18973 static int
18974 ix86_avx_u128_mode_exit (void)
18976 rtx reg = crtl->return_rtx;
18978 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
18979 or 512bit modes used in the function return register. */
18980 if (reg && ix86_check_avx_upper_register (reg))
18981 return AVX_U128_DIRTY;
18983 return AVX_U128_CLEAN;
18986 /* Return a mode that ENTITY is assumed to be
18987 switched to at function exit. */
18989 static int
18990 ix86_mode_exit (int entity)
18992 switch (entity)
18994 case X86_DIRFLAG:
18995 return X86_DIRFLAG_ANY;
18996 case AVX_U128:
18997 return ix86_avx_u128_mode_exit ();
18998 case I387_TRUNC:
18999 case I387_FLOOR:
19000 case I387_CEIL:
19001 case I387_MASK_PM:
19002 return I387_CW_ANY;
19003 default:
19004 gcc_unreachable ();
19008 static int
19009 ix86_mode_priority (int, int n)
19011 return n;
19014 /* Output code to initialize control word copies used by trunc?f?i and
19015 rounding patterns. CURRENT_MODE is set to the current control word,
19016 while NEW_MODE is set to the new control word. */
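/* For reference (x87 control word layout, stated here as background):
   bits 10-11 form the rounding-control field -- 0x0000 nearest, 0x0400
   down, 0x0800 up, 0x0c00 truncate -- and bit 5 (0x0020) masks the
   precision exception, matching the constants used below.  */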
19018 static void
19019 emit_i387_cw_initialization (int mode)
19021 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19022 rtx new_mode;
19024 enum ix86_stack_slot slot;
19026 rtx reg = gen_reg_rtx (HImode);
19028 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19029 emit_move_insn (reg, copy_rtx (stored_mode));
19031 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19032 || optimize_insn_for_size_p ())
19034 switch (mode)
19036 case I387_CW_TRUNC:
19037 /* round toward zero (truncate) */
19038 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19039 slot = SLOT_CW_TRUNC;
19040 break;
19042 case I387_CW_FLOOR:
19043 /* round down toward -oo */
19044 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19045 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19046 slot = SLOT_CW_FLOOR;
19047 break;
19049 case I387_CW_CEIL:
19050 /* round up toward +oo */
19051 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19052 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19053 slot = SLOT_CW_CEIL;
19054 break;
19056 case I387_CW_MASK_PM:
19057 /* mask precision exception for nearbyint() */
19058 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19059 slot = SLOT_CW_MASK_PM;
19060 break;
19062 default:
19063 gcc_unreachable ();
19066 else
19068 switch (mode)
19070 case I387_CW_TRUNC:
19071 /* round toward zero (truncate) */
19072 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19073 slot = SLOT_CW_TRUNC;
19074 break;
19076 case I387_CW_FLOOR:
19077 /* round down toward -oo */
19078 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19079 slot = SLOT_CW_FLOOR;
19080 break;
19082 case I387_CW_CEIL:
19083 /* round up toward +oo */
19084 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19085 slot = SLOT_CW_CEIL;
19086 break;
19088 case I387_CW_MASK_PM:
19089 /* mask precision exception for nearbyint() */
19090 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19091 slot = SLOT_CW_MASK_PM;
19092 break;
19094 default:
19095 gcc_unreachable ();
19099 gcc_assert (slot < MAX_386_STACK_LOCALS);
19101 new_mode = assign_386_stack_local (HImode, slot);
19102 emit_move_insn (new_mode, reg);
19105 /* Emit vzeroupper. */
19107 void
19108 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19110 int i;
19112 /* Cancel automatic vzeroupper insertion if there are
19113 live call-saved SSE registers at the insertion point. */
19115 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19116 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19117 return;
19119 if (TARGET_64BIT)
19120 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19121 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19122 return;
19124 emit_insn (gen_avx_vzeroupper ());
19129 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19130 is the set of hard registers live at the point where the insn(s)
19131 are to be inserted. */
19133 static void
19134 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19135 HARD_REG_SET regs_live)
19137 switch (entity)
19139 case X86_DIRFLAG:
19140 if (mode == X86_DIRFLAG_RESET)
19141 emit_insn (gen_cld ());
19142 break;
19143 case AVX_U128:
19144 if (mode == AVX_U128_CLEAN)
19145 ix86_avx_emit_vzeroupper (regs_live);
19146 break;
19147 case I387_TRUNC:
19148 case I387_FLOOR:
19149 case I387_CEIL:
19150 case I387_MASK_PM:
19151 if (mode != I387_CW_ANY
19152 && mode != I387_CW_UNINITIALIZED)
19153 emit_i387_cw_initialization (mode);
19154 break;
19155 default:
19156 gcc_unreachable ();
19160 /* Output code for INSN to convert a float to a signed int. OPERANDS
19161 are the insn operands. The output may be [HSD]Imode and the input
19162 operand may be [SDX]Fmode. */
19164 const char *
19165 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19167 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19168 bool dimode_p = GET_MODE (operands[0]) == DImode;
19169 int round_mode = get_attr_i387_cw (insn);
19171 static char buf[40];
19172 const char *p;
19174 /* Jump through a hoop or two for DImode, since the hardware has no
19175 non-popping instruction. We used to do this a different way, but
19176 that was somewhat fragile and broke with post-reload splitters. */
19177 if ((dimode_p || fisttp) && !stack_top_dies)
19178 output_asm_insn ("fld\t%y1", operands);
19180 gcc_assert (STACK_TOP_P (operands[1]));
19181 gcc_assert (MEM_P (operands[0]));
19182 gcc_assert (GET_MODE (operands[1]) != TFmode);
19184 if (fisttp)
19185 return "fisttp%Z0\t%0";
19187 strcpy (buf, "fist");
19189 if (round_mode != I387_CW_ANY)
19190 output_asm_insn ("fldcw\t%3", operands);
19192 p = "p%Z0\t%0";
19193 strcat (buf, p + !(stack_top_dies || dimode_p));
19195 output_asm_insn (buf, operands);
19197 if (round_mode != I387_CW_ANY)
19198 output_asm_insn ("fldcw\t%2", operands);
19200 return "";
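/* The resulting text is roughly of this shape (an illustrative
   sketch; the exact suffixes depend on the operand modes):

       fldcw   new_cw          # only when a special rounding mode is needed
       fist{p} dest            # or fisttp when available
       fldcw   saved_cw        # restore the original control word

   with an extra leading "fld %st(0)" when the value must survive a
   DImode or fisttp store.  */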
19203 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19204 have the values zero or one, indicates the ffreep insn's operand
19205 from the OPERANDS array. */
19207 static const char *
19208 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19210 if (TARGET_USE_FFREEP)
19211 #ifdef HAVE_AS_IX86_FFREEP
19212 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19213 #else
19215 static char retval[32];
19216 int regno = REGNO (operands[opno]);
19218 gcc_assert (STACK_REGNO_P (regno));
19220 regno -= FIRST_STACK_REG;
19222 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19223 return retval;
19225 #endif
19227 return opno ? "fstp\t%y1" : "fstp\t%y0";
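/* In the fallback branch above the instruction is emitted as raw
   bytes: for %st(N) the string becomes e.g. ".value 0xc0df" (N = 0),
   and since x86 is little-endian this lays down the bytes 0xdf 0xc0,
   which is the encoding of "ffreep %st(0)".  (Illustration assuming
   ASM_SHORT expands to ".value"; some configurations use ".word".)  */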
19231 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19232 should be used. UNORDERED_P is true when fucom should be used. */
19234 const char *
19235 output_fp_compare (rtx_insn *insn, rtx *operands,
19236 bool eflags_p, bool unordered_p)
19238 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19239 bool stack_top_dies;
19241 static char buf[40];
19242 const char *p;
19244 gcc_assert (STACK_TOP_P (xops[0]));
19246 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19248 if (eflags_p)
19250 p = unordered_p ? "fucomi" : "fcomi";
19251 strcpy (buf, p);
19253 p = "p\t{%y1, %0|%0, %y1}";
19254 strcat (buf, p + !stack_top_dies);
19256 return buf;
19259 if (STACK_REG_P (xops[1])
19260 && stack_top_dies
19261 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19263 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19265 /* If both the top of the 387 stack and the other operand (itself a
19266 stack register) die, then this must be a `fcompp' float
19267 compare. */
19268 p = unordered_p ? "fucompp" : "fcompp";
19269 strcpy (buf, p);
19271 else if (const0_operand (xops[1], VOIDmode))
19273 gcc_assert (!unordered_p);
19274 strcpy (buf, "ftst");
19276 else
19278 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19280 gcc_assert (!unordered_p);
19281 p = "ficom";
19283 else
19284 p = unordered_p ? "fucom" : "fcom";
19286 strcpy (buf, p);
19288 p = "p%Z2\t%y2";
19289 strcat (buf, p + !stack_top_dies);
19292 output_asm_insn (buf, operands);
19293 return "fnstsw\t%0";
19296 void
19297 ix86_output_addr_vec_elt (FILE *file, int value)
19299 const char *directive = ASM_LONG;
19301 #ifdef ASM_QUAD
19302 if (TARGET_LP64)
19303 directive = ASM_QUAD;
19304 #else
19305 gcc_assert (!TARGET_64BIT);
19306 #endif
19308 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19311 void
19312 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19314 const char *directive = ASM_LONG;
19316 #ifdef ASM_QUAD
19317 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19318 directive = ASM_QUAD;
19319 #else
19320 gcc_assert (!TARGET_64BIT);
19321 #endif
19322 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19323 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19324 fprintf (file, "%s%s%d-%s%d\n",
19325 directive, LPREFIX, value, LPREFIX, rel);
19326 else if (HAVE_AS_GOTOFF_IN_DATA)
19327 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19328 #if TARGET_MACHO
19329 else if (TARGET_MACHO)
19331 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19332 machopic_output_function_base_name (file);
19333 putc ('\n', file);
19335 #endif
19336 else
19337 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19338 GOT_SYMBOL_NAME, LPREFIX, value);
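/* On a typical ELF target this emits directives along these lines
   (illustrative, label numbers invented): ".quad .L5-.L3" or
   ".long .L5-.L3" for 64-bit and VxWorks RTP, ".long .L5@GOTOFF"
   when the assembler accepts @GOTOFF in data, and otherwise
   ".long _GLOBAL_OFFSET_TABLE_+[.-.L5]".  */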
19341 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19342 for the target. */
19344 void
19345 ix86_expand_clear (rtx dest)
19347 rtx tmp;
19349 /* We play register width games, which are only valid after reload. */
19350 gcc_assert (reload_completed);
19352 /* Avoid HImode and its attendant prefix byte. */
19353 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19354 dest = gen_rtx_REG (SImode, REGNO (dest));
19355 tmp = gen_rtx_SET (dest, const0_rtx);
19357 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19359 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19360 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19363 emit_insn (tmp);
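/* As an illustration (hand-written, not literal compiler output),
   clearing %eax normally expands to

       xorl    %eax, %eax

   together with a clobber of the flags register, and only falls back
   to

       movl    $0, %eax

   when TARGET_USE_MOV0 is set and the insn is not being optimized
   for size.  */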
19366 void
19367 ix86_expand_move (machine_mode mode, rtx operands[])
19369 rtx op0, op1;
19370 rtx tmp, addend = NULL_RTX;
19371 enum tls_model model;
19373 op0 = operands[0];
19374 op1 = operands[1];
19376 switch (GET_CODE (op1))
19378 case CONST:
19379 tmp = XEXP (op1, 0);
19381 if (GET_CODE (tmp) != PLUS
19382 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19383 break;
19385 op1 = XEXP (tmp, 0);
19386 addend = XEXP (tmp, 1);
19387 /* FALLTHRU */
19389 case SYMBOL_REF:
19390 model = SYMBOL_REF_TLS_MODEL (op1);
19392 if (model)
19393 op1 = legitimize_tls_address (op1, model, true);
19394 else if (ix86_force_load_from_GOT_p (op1))
19396 /* Load the external function address via GOT slot to avoid PLT. */
19397 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19398 (TARGET_64BIT
19399 ? UNSPEC_GOTPCREL
19400 : UNSPEC_GOT));
19401 op1 = gen_rtx_CONST (Pmode, op1);
19402 op1 = gen_const_mem (Pmode, op1);
19403 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19405 else
19407 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19408 if (tmp)
19410 op1 = tmp;
19411 if (!addend)
19412 break;
19414 else
19416 op1 = operands[1];
19417 break;
19421 if (addend)
19423 op1 = force_operand (op1, NULL_RTX);
19424 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19425 op0, 1, OPTAB_DIRECT);
19427 else
19428 op1 = force_operand (op1, op0);
19430 if (op1 == op0)
19431 return;
19433 op1 = convert_to_mode (mode, op1, 1);
19435 default:
19436 break;
19439 if ((flag_pic || MACHOPIC_INDIRECT)
19440 && symbolic_operand (op1, mode))
19442 if (TARGET_MACHO && !TARGET_64BIT)
19444 #if TARGET_MACHO
19445 /* dynamic-no-pic */
19446 if (MACHOPIC_INDIRECT)
19448 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19449 ? op0 : gen_reg_rtx (Pmode);
19450 op1 = machopic_indirect_data_reference (op1, temp);
19451 if (MACHOPIC_PURE)
19452 op1 = machopic_legitimize_pic_address (op1, mode,
19453 temp == op1 ? 0 : temp);
19455 if (op0 != op1 && GET_CODE (op0) != MEM)
19457 rtx insn = gen_rtx_SET (op0, op1);
19458 emit_insn (insn);
19459 return;
19461 if (GET_CODE (op0) == MEM)
19462 op1 = force_reg (Pmode, op1);
19463 else
19465 rtx temp = op0;
19466 if (GET_CODE (temp) != REG)
19467 temp = gen_reg_rtx (Pmode);
19468 temp = legitimize_pic_address (op1, temp);
19469 if (temp == op0)
19470 return;
19471 op1 = temp;
19473 /* dynamic-no-pic */
19474 #endif
19476 else
19478 if (MEM_P (op0))
19479 op1 = force_reg (mode, op1);
19480 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19482 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19483 op1 = legitimize_pic_address (op1, reg);
19484 if (op0 == op1)
19485 return;
19486 op1 = convert_to_mode (mode, op1, 1);
19490 else
19492 if (MEM_P (op0)
19493 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19494 || !push_operand (op0, mode))
19495 && MEM_P (op1))
19496 op1 = force_reg (mode, op1);
19498 if (push_operand (op0, mode)
19499 && ! general_no_elim_operand (op1, mode))
19500 op1 = copy_to_mode_reg (mode, op1);
19502 /* Force large constants in 64bit compilation into register
19503 to get them CSEed. */
19504 if (can_create_pseudo_p ()
19505 && (mode == DImode) && TARGET_64BIT
19506 && immediate_operand (op1, mode)
19507 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19508 && !register_operand (op0, mode)
19509 && optimize)
19510 op1 = copy_to_mode_reg (mode, op1);
19512 if (can_create_pseudo_p ()
19513 && CONST_DOUBLE_P (op1))
19515 /* If we are loading a floating point constant to a register,
19516 force the value to memory now, since we'll get better code
19517 out the back end. */
19519 op1 = validize_mem (force_const_mem (mode, op1));
19520 if (!register_operand (op0, mode))
19522 rtx temp = gen_reg_rtx (mode);
19523 emit_insn (gen_rtx_SET (temp, op1));
19524 emit_move_insn (op0, temp);
19525 return;
19530 emit_insn (gen_rtx_SET (op0, op1));
19533 void
19534 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19536 rtx op0 = operands[0], op1 = operands[1];
19537 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
19538 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
19539 unsigned int align = (TARGET_IAMCU
19540 ? GET_MODE_BITSIZE (mode)
19541 : GET_MODE_ALIGNMENT (mode));
19543 if (push_operand (op0, VOIDmode))
19544 op0 = emit_move_resolve_push (mode, op0);
19546 /* Force constants other than zero into memory. We do not know how
19547 the instructions used to build constants modify the upper 64 bits
19548 of the register; once we have that information we may be able
19549 to handle some of them more efficiently. */
19550 if (can_create_pseudo_p ()
19551 && (CONSTANT_P (op1)
19552 || (SUBREG_P (op1)
19553 && CONSTANT_P (SUBREG_REG (op1))))
19554 && ((register_operand (op0, mode)
19555 && !standard_sse_constant_p (op1, mode))
19556 /* ix86_expand_vector_move_misalign() does not like constants. */
19557 || (SSE_REG_MODE_P (mode)
19558 && MEM_P (op0)
19559 && MEM_ALIGN (op0) < align)))
19561 if (SUBREG_P (op1))
19563 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19564 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19565 if (r)
19566 r = validize_mem (r);
19567 else
19568 r = force_reg (imode, SUBREG_REG (op1));
19569 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19571 else
19572 op1 = validize_mem (force_const_mem (mode, op1));
19575 /* We need to check memory alignment for SSE mode since attribute
19576 can make operands unaligned. */
19577 if (can_create_pseudo_p ()
19578 && SSE_REG_MODE_P (mode)
19579 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19580 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19582 rtx tmp[2];
19584 /* ix86_expand_vector_move_misalign() does not like both
19585 arguments in memory. */
19586 if (!register_operand (op0, mode)
19587 && !register_operand (op1, mode))
19588 op1 = force_reg (mode, op1);
19590 tmp[0] = op0; tmp[1] = op1;
19591 ix86_expand_vector_move_misalign (mode, tmp);
19592 return;
19595 /* Make operand1 a register if it isn't already. */
19596 if (can_create_pseudo_p ()
19597 && !register_operand (op0, mode)
19598 && !register_operand (op1, mode))
19600 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19601 return;
19604 emit_insn (gen_rtx_SET (op0, op1));
19607 /* Split 32-byte AVX unaligned load and store if needed. */
19609 static void
19610 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19612 rtx m;
19613 rtx (*extract) (rtx, rtx, rtx);
19614 machine_mode mode;
19616 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19617 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19619 emit_insn (gen_rtx_SET (op0, op1));
19620 return;
19623 rtx orig_op0 = NULL_RTX;
19624 mode = GET_MODE (op0);
19625 switch (GET_MODE_CLASS (mode))
19627 case MODE_VECTOR_INT:
19628 case MODE_INT:
19629 if (mode != V32QImode)
19631 if (!MEM_P (op0))
19633 orig_op0 = op0;
19634 op0 = gen_reg_rtx (V32QImode);
19636 else
19637 op0 = gen_lowpart (V32QImode, op0);
19638 op1 = gen_lowpart (V32QImode, op1);
19639 mode = V32QImode;
19641 break;
19642 case MODE_VECTOR_FLOAT:
19643 break;
19644 default:
19645 gcc_unreachable ();
19648 switch (mode)
19650 default:
19651 gcc_unreachable ();
19652 case E_V32QImode:
19653 extract = gen_avx_vextractf128v32qi;
19654 mode = V16QImode;
19655 break;
19656 case E_V8SFmode:
19657 extract = gen_avx_vextractf128v8sf;
19658 mode = V4SFmode;
19659 break;
19660 case E_V4DFmode:
19661 extract = gen_avx_vextractf128v4df;
19662 mode = V2DFmode;
19663 break;
19666 if (MEM_P (op1))
19668 rtx r = gen_reg_rtx (mode);
19669 m = adjust_address (op1, mode, 0);
19670 emit_move_insn (r, m);
19671 m = adjust_address (op1, mode, 16);
19672 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19673 emit_move_insn (op0, r);
19675 else if (MEM_P (op0))
19677 m = adjust_address (op0, mode, 0);
19678 emit_insn (extract (m, op1, const0_rtx));
19679 m = adjust_address (op0, mode, 16);
19680 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19682 else
19683 gcc_unreachable ();
19685 if (orig_op0)
19686 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
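/* Illustrative output for the split (hand-written; register and
   address choices vary): an unaligned 32-byte load becomes roughly

       vmovupd         (%rax), %xmm0
       vinsertf128     $1, 16(%rax), %ymm0, %ymm0

   and an unaligned 32-byte store becomes roughly

       vextractf128    $0, %ymm0, (%rax)
       vextractf128    $1, %ymm0, 16(%rax)

   so each half is accessed with a 16-byte move.  */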
19689 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19690 straight to ix86_expand_vector_move. */
19691 /* Code generation for scalar reg-reg moves of single and double precision data:
19692 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19693 movaps reg, reg
19694 else
19695 movss reg, reg
19696 if (x86_sse_partial_reg_dependency == true)
19697 movapd reg, reg
19698 else
19699 movsd reg, reg
19701 Code generation for scalar loads of double precision data:
19702 if (x86_sse_split_regs == true)
19703 movlpd mem, reg (gas syntax)
19704 else
19705 movsd mem, reg
19707 Code generation for unaligned packed loads of single precision data
19708 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19709 if (x86_sse_unaligned_move_optimal)
19710 movups mem, reg
19712 if (x86_sse_partial_reg_dependency == true)
19714 xorps reg, reg
19715 movlps mem, reg
19716 movhps mem+8, reg
19718 else
19720 movlps mem, reg
19721 movhps mem+8, reg
19724 Code generation for unaligned packed loads of double precision data
19725 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19726 if (x86_sse_unaligned_move_optimal)
19727 movupd mem, reg
19729 if (x86_sse_split_regs == true)
19731 movlpd mem, reg
19732 movhpd mem+8, reg
19734 else
19736 movsd mem, reg
19737 movhpd mem+8, reg
19741 void
19742 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19744 rtx op0, op1, m;
19746 op0 = operands[0];
19747 op1 = operands[1];
19749 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19750 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19752 emit_insn (gen_rtx_SET (op0, op1));
19753 return;
19756 if (TARGET_AVX)
19758 if (GET_MODE_SIZE (mode) == 32)
19759 ix86_avx256_split_vector_move_misalign (op0, op1);
19760 else
19761 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19762 emit_insn (gen_rtx_SET (op0, op1));
19763 return;
19766 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19767 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19769 emit_insn (gen_rtx_SET (op0, op1));
19770 return;
19773 /* ??? If we have typed data, then it would appear that using
19774 movdqu is the only way to get unaligned data loaded with
19775 integer type. */
19776 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19778 emit_insn (gen_rtx_SET (op0, op1));
19779 return;
19782 if (MEM_P (op1))
19784 if (TARGET_SSE2 && mode == V2DFmode)
19786 rtx zero;
19788 /* When SSE registers are split into halves, we can avoid
19789 writing to the top half twice. */
19790 if (TARGET_SSE_SPLIT_REGS)
19792 emit_clobber (op0);
19793 zero = op0;
19795 else
19797 /* ??? Not sure about the best option for the Intel chips.
19798 The following would seem to satisfy; the register is
19799 entirely cleared, breaking the dependency chain. We
19800 then store to the upper half, with a dependency depth
19801 of one. A rumor has it that Intel recommends two movsd
19802 followed by an unpacklpd, but this is unconfirmed. And
19803 given that the dependency depth of the unpacklpd would
19804 still be one, I'm not sure why this would be better. */
19805 zero = CONST0_RTX (V2DFmode);
19808 m = adjust_address (op1, DFmode, 0);
19809 emit_insn (gen_sse2_loadlpd (op0, zero, m));
19810 m = adjust_address (op1, DFmode, 8);
19811 emit_insn (gen_sse2_loadhpd (op0, op0, m));
19813 else
19815 rtx t;
19817 if (mode != V4SFmode)
19818 t = gen_reg_rtx (V4SFmode);
19819 else
19820 t = op0;
19822 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
19823 emit_move_insn (t, CONST0_RTX (V4SFmode));
19824 else
19825 emit_clobber (t);
19827 m = adjust_address (op1, V2SFmode, 0);
19828 emit_insn (gen_sse_loadlps (t, t, m));
19829 m = adjust_address (op1, V2SFmode, 8);
19830 emit_insn (gen_sse_loadhps (t, t, m));
19831 if (mode != V4SFmode)
19832 emit_move_insn (op0, gen_lowpart (mode, t));
19835 else if (MEM_P (op0))
19837 if (TARGET_SSE2 && mode == V2DFmode)
19839 m = adjust_address (op0, DFmode, 0);
19840 emit_insn (gen_sse2_storelpd (m, op1));
19841 m = adjust_address (op0, DFmode, 8);
19842 emit_insn (gen_sse2_storehpd (m, op1));
19844 else
19846 if (mode != V4SFmode)
19847 op1 = gen_lowpart (V4SFmode, op1);
19849 m = adjust_address (op0, V2SFmode, 0);
19850 emit_insn (gen_sse_storelps (m, op1));
19851 m = adjust_address (op0, V2SFmode, 8);
19852 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
19855 else
19856 gcc_unreachable ();
19859 /* Helper function of ix86_fixup_binary_operands to canonicalize
19860 operand order. Returns true if the operands should be swapped. */
19862 static bool
19863 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
19864 rtx operands[])
19866 rtx dst = operands[0];
19867 rtx src1 = operands[1];
19868 rtx src2 = operands[2];
19870 /* If the operation is not commutative, we can't do anything. */
19871 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
19872 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
19873 return false;
19875 /* Highest priority is that src1 should match dst. */
19876 if (rtx_equal_p (dst, src1))
19877 return false;
19878 if (rtx_equal_p (dst, src2))
19879 return true;
19881 /* Next highest priority is that immediate constants come second. */
19882 if (immediate_operand (src2, mode))
19883 return false;
19884 if (immediate_operand (src1, mode))
19885 return true;
19887 /* Lowest priority is that memory references should come second. */
19888 if (MEM_P (src2))
19889 return false;
19890 if (MEM_P (src1))
19891 return true;
19893 return false;
19897 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
19898 destination to use for the operation. If different from the true
19899 destination in operands[0], a copy operation will be required. */
19902 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
19903 rtx operands[])
19905 rtx dst = operands[0];
19906 rtx src1 = operands[1];
19907 rtx src2 = operands[2];
19909 /* Canonicalize operand order. */
19910 if (ix86_swap_binary_operands_p (code, mode, operands))
19912 /* It is invalid to swap operands of different modes. */
19913 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
19915 std::swap (src1, src2);
19918 /* Both source operands cannot be in memory. */
19919 if (MEM_P (src1) && MEM_P (src2))
19921 /* Optimization: Only read from memory once. */
19922 if (rtx_equal_p (src1, src2))
19924 src2 = force_reg (mode, src2);
19925 src1 = src2;
19927 else if (rtx_equal_p (dst, src1))
19928 src2 = force_reg (mode, src2);
19929 else
19930 src1 = force_reg (mode, src1);
19933 /* If the destination is memory, and we do not have matching source
19934 operands, do things in registers. */
19935 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
19936 dst = gen_reg_rtx (mode);
19938 /* Source 1 cannot be a constant. */
19939 if (CONSTANT_P (src1))
19940 src1 = force_reg (mode, src1);
19942 /* Source 1 cannot be a non-matching memory. */
19943 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
19944 src1 = force_reg (mode, src1);
19946 /* Improve address combine. */
19947 if (code == PLUS
19948 && GET_MODE_CLASS (mode) == MODE_INT
19949 && MEM_P (src2))
19950 src2 = force_reg (mode, src2);
19952 operands[1] = src1;
19953 operands[2] = src2;
19954 return dst;
19957 /* Similarly, but assume that the destination has already been
19958 set up properly. */
19960 void
19961 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
19962 machine_mode mode, rtx operands[])
19964 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
19965 gcc_assert (dst == operands[0]);
19968 /* Attempt to expand a binary operator. Make the expansion closer to the
19969 actual machine, than just general_operand, which will allow 3 separate
19970 memory references (one output, two input) in a single insn. */
19972 void
19973 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
19974 rtx operands[])
19976 rtx src1, src2, dst, op, clob;
19978 dst = ix86_fixup_binary_operands (code, mode, operands);
19979 src1 = operands[1];
19980 src2 = operands[2];
19982 /* Emit the instruction. */
19984 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
19986 if (reload_completed
19987 && code == PLUS
19988 && !rtx_equal_p (dst, src1))
19990 /* This is going to be an LEA; avoid splitting it later. */
19991 emit_insn (op);
19993 else
19995 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19996 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
19999 /* Fix up the destination if needed. */
20000 if (dst != operands[0])
20001 emit_move_insn (operands[0], dst);
20004 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20005 the given OPERANDS. */
20007 void
20008 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20009 rtx operands[])
20011 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20012 if (SUBREG_P (operands[1]))
20014 op1 = operands[1];
20015 op2 = operands[2];
20017 else if (SUBREG_P (operands[2]))
20019 op1 = operands[2];
20020 op2 = operands[1];
20022 /* Optimize (__m128i) d | (__m128i) e and similar code
20023 when d and e are float vectors into float vector logical
20024 insn. In C/C++ without using intrinsics there is no other way
20025 to express vector logical operation on float vectors than
20026 to cast them temporarily to integer vectors. */
20027 if (op1
20028 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20029 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20030 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20031 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20032 && SUBREG_BYTE (op1) == 0
20033 && (GET_CODE (op2) == CONST_VECTOR
20034 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20035 && SUBREG_BYTE (op2) == 0))
20036 && can_create_pseudo_p ())
20038 rtx dst;
20039 switch (GET_MODE (SUBREG_REG (op1)))
20041 case E_V4SFmode:
20042 case E_V8SFmode:
20043 case E_V16SFmode:
20044 case E_V2DFmode:
20045 case E_V4DFmode:
20046 case E_V8DFmode:
20047 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20048 if (GET_CODE (op2) == CONST_VECTOR)
20050 op2 = gen_lowpart (GET_MODE (dst), op2);
20051 op2 = force_reg (GET_MODE (dst), op2);
20053 else
20055 op1 = operands[1];
20056 op2 = SUBREG_REG (operands[2]);
20057 if (!vector_operand (op2, GET_MODE (dst)))
20058 op2 = force_reg (GET_MODE (dst), op2);
20060 op1 = SUBREG_REG (op1);
20061 if (!vector_operand (op1, GET_MODE (dst)))
20062 op1 = force_reg (GET_MODE (dst), op1);
20063 emit_insn (gen_rtx_SET (dst,
20064 gen_rtx_fmt_ee (code, GET_MODE (dst),
20065 op1, op2)));
20066 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20067 return;
20068 default:
20069 break;
20072 if (!vector_operand (operands[1], mode))
20073 operands[1] = force_reg (mode, operands[1]);
20074 if (!vector_operand (operands[2], mode))
20075 operands[2] = force_reg (mode, operands[2]);
20076 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20077 emit_insn (gen_rtx_SET (operands[0],
20078 gen_rtx_fmt_ee (code, mode, operands[1],
20079 operands[2])));
20082 /* Return TRUE or FALSE depending on whether the binary operator meets the
20083 appropriate constraints. */
20085 bool
20086 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20087 rtx operands[3])
20089 rtx dst = operands[0];
20090 rtx src1 = operands[1];
20091 rtx src2 = operands[2];
20093 /* Both source operands cannot be in memory. */
20094 if (MEM_P (src1) && MEM_P (src2))
20095 return false;
20097 /* Canonicalize operand order for commutative operators. */
20098 if (ix86_swap_binary_operands_p (code, mode, operands))
20099 std::swap (src1, src2);
20101 /* If the destination is memory, we must have a matching source operand. */
20102 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20103 return false;
20105 /* Source 1 cannot be a constant. */
20106 if (CONSTANT_P (src1))
20107 return false;
20109 /* Source 1 cannot be a non-matching memory. */
20110 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20111 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20112 return (code == AND
20113 && (mode == HImode
20114 || mode == SImode
20115 || (TARGET_64BIT && mode == DImode))
20116 && satisfies_constraint_L (src2));
20118 return true;
20121 /* Attempt to expand a unary operator. Make the expansion closer to the
20122 actual machine, than just general_operand, which will allow 2 separate
20123 memory references (one output, one input) in a single insn. */
20125 void
20126 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20127 rtx operands[])
20129 bool matching_memory = false;
20130 rtx src, dst, op, clob;
20132 dst = operands[0];
20133 src = operands[1];
20135 /* If the destination is memory, and we do not have matching source
20136 operands, do things in registers. */
20137 if (MEM_P (dst))
20139 if (rtx_equal_p (dst, src))
20140 matching_memory = true;
20141 else
20142 dst = gen_reg_rtx (mode);
20145 /* When source operand is memory, destination must match. */
20146 if (MEM_P (src) && !matching_memory)
20147 src = force_reg (mode, src);
20149 /* Emit the instruction. */
20151 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20153 if (code == NOT)
20154 emit_insn (op);
20155 else
20157 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20158 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20161 /* Fix up the destination if needed. */
20162 if (dst != operands[0])
20163 emit_move_insn (operands[0], dst);
20166 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20167 divisor are within the range [0-255]. */
20169 void
20170 ix86_split_idivmod (machine_mode mode, rtx operands[],
20171 bool signed_p)
20173 rtx_code_label *end_label, *qimode_label;
20174 rtx div, mod;
20175 rtx_insn *insn;
20176 rtx scratch, tmp0, tmp1, tmp2;
20177 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20178 rtx (*gen_zero_extend) (rtx, rtx);
20179 rtx (*gen_test_ccno_1) (rtx, rtx);
20181 switch (mode)
20183 case E_SImode:
20184 if (GET_MODE (operands[0]) == SImode)
20186 if (GET_MODE (operands[1]) == SImode)
20187 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20188 else
20189 gen_divmod4_1
20190 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20191 gen_zero_extend = gen_zero_extendqisi2;
20193 else
20195 gen_divmod4_1
20196 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20197 gen_zero_extend = gen_zero_extendqidi2;
20199 gen_test_ccno_1 = gen_testsi_ccno_1;
20200 break;
20201 case E_DImode:
20202 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20203 gen_test_ccno_1 = gen_testdi_ccno_1;
20204 gen_zero_extend = gen_zero_extendqidi2;
20205 break;
20206 default:
20207 gcc_unreachable ();
20210 end_label = gen_label_rtx ();
20211 qimode_label = gen_label_rtx ();
20213 scratch = gen_reg_rtx (mode);
20215 /* Use 8bit unsigned divmod if dividend and divisor are within
20216 the range [0-255]. */
20217 emit_move_insn (scratch, operands[2]);
20218 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20219 scratch, 1, OPTAB_DIRECT);
20220 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20221 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20222 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20223 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20224 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20225 pc_rtx);
20226 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20227 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20228 JUMP_LABEL (insn) = qimode_label;
20230 /* Generate original signed/unsigned divmod. */
20231 div = gen_divmod4_1 (operands[0], operands[1],
20232 operands[2], operands[3]);
20233 emit_insn (div);
20235 /* Branch to the end. */
20236 emit_jump_insn (gen_jump (end_label));
20237 emit_barrier ();
20239 /* Generate 8bit unsigned divide. */
20240 emit_label (qimode_label);
20241 /* Don't use operands[0] for result of 8bit divide since not all
20242 registers support QImode ZERO_EXTRACT. */
20243 tmp0 = lowpart_subreg (HImode, scratch, mode);
20244 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20245 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20246 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20248 if (signed_p)
20250 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20251 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20253 else
20255 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20256 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20258 if (mode == SImode)
20260 if (GET_MODE (operands[0]) != SImode)
20261 div = gen_rtx_ZERO_EXTEND (DImode, div);
20262 if (GET_MODE (operands[1]) != SImode)
20263 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20266 /* Extract remainder from AH. */
20267 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20268 tmp0, GEN_INT (8), GEN_INT (8));
20269 if (REG_P (operands[1]))
20270 insn = emit_move_insn (operands[1], tmp1);
20271 else
20273 /* Need a new scratch register since the old one has result
20274 of 8bit divide. */
20275 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20276 emit_move_insn (scratch, tmp1);
20277 insn = emit_move_insn (operands[1], scratch);
20279 set_unique_reg_note (insn, REG_EQUAL, mod);
20281 /* Zero extend quotient from AL. */
20282 tmp1 = gen_lowpart (QImode, tmp0);
20283 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20284 set_unique_reg_note (insn, REG_EQUAL, div);
20286 emit_label (end_label);
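/* The emitted code has roughly this shape (an illustrative sketch,
   not literal output):

       movl    dividend, scratch
       orl     divisor, scratch
       testl   $-256, scratch
       je      .Lqimode
       ... full 32/64-bit (i)div sequence ...
       jmp     .Lend
   .Lqimode:
       ... 8-bit div; quotient in %al, remainder in %ah ...
   .Lend:

   so the cheap 8-bit divide is used whenever neither operand has any
   bits set above bit 7.  */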
20289 #define LEA_MAX_STALL (3)
20290 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20292 /* Increase given DISTANCE in half-cycles according to
20293 dependencies between PREV and NEXT instructions.
20294 Add 1 half-cycle if there is no dependency and
20295 go to the next cycle if there is some dependency. */
20297 static unsigned int
20298 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20300 df_ref def, use;
20302 if (!prev || !next)
20303 return distance + (distance & 1) + 2;
20305 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20306 return distance + 1;
20308 FOR_EACH_INSN_USE (use, next)
20309 FOR_EACH_INSN_DEF (def, prev)
20310 if (!DF_REF_IS_ARTIFICIAL (def)
20311 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20312 return distance + (distance & 1) + 2;
20314 return distance + 1;
20317 /* Function checks if instruction INSN defines register number
20318 REGNO1 or REGNO2. */
20320 static bool
20321 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20322 rtx_insn *insn)
20324 df_ref def;
20326 FOR_EACH_INSN_DEF (def, insn)
20327 if (DF_REF_REG_DEF_P (def)
20328 && !DF_REF_IS_ARTIFICIAL (def)
20329 && (regno1 == DF_REF_REGNO (def)
20330 || regno2 == DF_REF_REGNO (def)))
20331 return true;
20333 return false;
20336 /* Function checks if instruction INSN uses register number
20337 REGNO as a part of address expression. */
20339 static bool
20340 insn_uses_reg_mem (unsigned int regno, rtx insn)
20342 df_ref use;
20344 FOR_EACH_INSN_USE (use, insn)
20345 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20346 return true;
20348 return false;
20351 /* Search backward for non-agu definition of register number REGNO1
20352 or register number REGNO2 in basic block starting from instruction
20353 START up to head of basic block or instruction INSN.
20355 Function puts true value into *FOUND var if definition was found
20356 and false otherwise.
20358 Distance in half-cycles between START and found instruction or head
20359 of BB is added to DISTANCE and returned. */
20361 static int
20362 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20363 rtx_insn *insn, int distance,
20364 rtx_insn *start, bool *found)
20366 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20367 rtx_insn *prev = start;
20368 rtx_insn *next = NULL;
20370 *found = false;
20372 while (prev
20373 && prev != insn
20374 && distance < LEA_SEARCH_THRESHOLD)
20376 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20378 distance = increase_distance (prev, next, distance);
20379 if (insn_defines_reg (regno1, regno2, prev))
20381 if (recog_memoized (prev) < 0
20382 || get_attr_type (prev) != TYPE_LEA)
20384 *found = true;
20385 return distance;
20389 next = prev;
20391 if (prev == BB_HEAD (bb))
20392 break;
20394 prev = PREV_INSN (prev);
20397 return distance;
20400 /* Search backward for non-agu definition of register number REGNO1
20401 or register number REGNO2 in INSN's basic block until
20402 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20403 2. Reach neighbor BBs boundary, or
20404 3. Reach agu definition.
20405 Returns the distance between the non-agu definition point and INSN.
20406 If no definition point, returns -1. */
20408 static int
20409 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20410 rtx_insn *insn)
20412 basic_block bb = BLOCK_FOR_INSN (insn);
20413 int distance = 0;
20414 bool found = false;
20416 if (insn != BB_HEAD (bb))
20417 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20418 distance, PREV_INSN (insn),
20419 &found);
20421 if (!found && distance < LEA_SEARCH_THRESHOLD)
20423 edge e;
20424 edge_iterator ei;
20425 bool simple_loop = false;
20427 FOR_EACH_EDGE (e, ei, bb->preds)
20428 if (e->src == bb)
20430 simple_loop = true;
20431 break;
20434 if (simple_loop)
20435 distance = distance_non_agu_define_in_bb (regno1, regno2,
20436 insn, distance,
20437 BB_END (bb), &found);
20438 else
20440 int shortest_dist = -1;
20441 bool found_in_bb = false;
20443 FOR_EACH_EDGE (e, ei, bb->preds)
20445 int bb_dist
20446 = distance_non_agu_define_in_bb (regno1, regno2,
20447 insn, distance,
20448 BB_END (e->src),
20449 &found_in_bb);
20450 if (found_in_bb)
20452 if (shortest_dist < 0)
20453 shortest_dist = bb_dist;
20454 else if (bb_dist > 0)
20455 shortest_dist = MIN (bb_dist, shortest_dist);
20457 found = true;
20461 distance = shortest_dist;
20465 /* get_attr_type may modify recog data. We want to make sure
20466 that recog data is valid for instruction INSN, on which
20467 distance_non_agu_define is called. INSN is unchanged here. */
20468 extract_insn_cached (insn);
20470 if (!found)
20471 return -1;
20473 return distance >> 1;
20476 /* Return the distance in half-cycles between INSN and the next
20477 insn that uses register number REGNO in a memory address, added
20478 to DISTANCE. Return -1 if REGNO is set.
20480 Put true value into *FOUND if register usage was found and
20481 false otherwise.
20482 Put true value into *REDEFINED if register redefinition was
20483 found and false otherwise. */
20485 static int
20486 distance_agu_use_in_bb (unsigned int regno,
20487 rtx_insn *insn, int distance, rtx_insn *start,
20488 bool *found, bool *redefined)
20490 basic_block bb = NULL;
20491 rtx_insn *next = start;
20492 rtx_insn *prev = NULL;
20494 *found = false;
20495 *redefined = false;
20497 if (start != NULL_RTX)
20499 bb = BLOCK_FOR_INSN (start);
20500 if (start != BB_HEAD (bb))
20501 /* If insn and start belong to the same bb, set prev to insn,
20502 so the call to increase_distance will increase the distance
20503 between insns by 1. */
20504 prev = insn;
20507 while (next
20508 && next != insn
20509 && distance < LEA_SEARCH_THRESHOLD)
20511 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20513 distance = increase_distance(prev, next, distance);
20514 if (insn_uses_reg_mem (regno, next))
20516 /* Return DISTANCE if OP0 is used in memory
20517 address in NEXT. */
20518 *found = true;
20519 return distance;
20522 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20524 /* Return -1 if OP0 is set in NEXT. */
20525 *redefined = true;
20526 return -1;
20529 prev = next;
20532 if (next == BB_END (bb))
20533 break;
20535 next = NEXT_INSN (next);
20538 return distance;
20541 /* Return the distance between INSN and the next insn that uses
20542 register number REGNO0 in a memory address. Return -1 if no such
20543 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20545 static int
20546 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20548 basic_block bb = BLOCK_FOR_INSN (insn);
20549 int distance = 0;
20550 bool found = false;
20551 bool redefined = false;
20553 if (insn != BB_END (bb))
20554 distance = distance_agu_use_in_bb (regno0, insn, distance,
20555 NEXT_INSN (insn),
20556 &found, &redefined);
20558 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20560 edge e;
20561 edge_iterator ei;
20562 bool simple_loop = false;
20564 FOR_EACH_EDGE (e, ei, bb->succs)
20565 if (e->dest == bb)
20567 simple_loop = true;
20568 break;
20571 if (simple_loop)
20572 distance = distance_agu_use_in_bb (regno0, insn,
20573 distance, BB_HEAD (bb),
20574 &found, &redefined);
20575 else
20577 int shortest_dist = -1;
20578 bool found_in_bb = false;
20579 bool redefined_in_bb = false;
20581 FOR_EACH_EDGE (e, ei, bb->succs)
20583 int bb_dist
20584 = distance_agu_use_in_bb (regno0, insn,
20585 distance, BB_HEAD (e->dest),
20586 &found_in_bb, &redefined_in_bb);
20587 if (found_in_bb)
20589 if (shortest_dist < 0)
20590 shortest_dist = bb_dist;
20591 else if (bb_dist > 0)
20592 shortest_dist = MIN (bb_dist, shortest_dist);
20594 found = true;
20598 distance = shortest_dist;
20602 if (!found || redefined)
20603 return -1;
20605 return distance >> 1;
20608 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20609 there is a dilemma of choosing LEA or ADD.
20610 Negative value: ADD is preferred over LEA
20611 Zero: Neutral
20612 Positive value: LEA is preferred over ADD. */
20613 #define IX86_LEA_PRIORITY 0
20615 /* Return true if usage of lea INSN has performance advantage
20616 over a sequence of instructions. The instruction sequence has
20617 SPLIT_COST cycles higher latency than the lea. */
20619 static bool
20620 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20621 unsigned int regno2, int split_cost, bool has_scale)
20623 int dist_define, dist_use;
20625 /* For Silvermont, if a 2-source or 3-source LEA is used for a
20626 non-destructive destination, or because the ability to use a
20627 SCALE is wanted, the use of LEA is justified. */
20628 if (TARGET_SILVERMONT || TARGET_INTEL)
20630 if (has_scale)
20631 return true;
20632 if (split_cost < 1)
20633 return false;
20634 if (regno0 == regno1 || regno0 == regno2)
20635 return false;
20636 return true;
20639 dist_define = distance_non_agu_define (regno1, regno2, insn);
20640 dist_use = distance_agu_use (regno0, insn);
20642 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20644 /* If there is no non-AGU operand definition, no AGU
20645 operand usage and the split cost is 0, then both the lea
20646 and non-lea variants have the same priority. Currently
20647 we prefer lea for 64-bit code and non-lea for 32-bit
20648 code. */
20649 if (dist_use < 0 && split_cost == 0)
20650 return TARGET_64BIT || IX86_LEA_PRIORITY;
20651 else
20652 return true;
20655 /* With a longer definition distance, lea is more preferable.
20656 Here we adjust the distance to take into account the splitting
20657 cost and lea priority. */
20658 dist_define += split_cost + IX86_LEA_PRIORITY;
20660 /* If there is no use in a memory address then we just check
20661 that the split cost exceeds the AGU stall. */
20662 if (dist_use < 0)
20663 return dist_define > LEA_MAX_STALL;
20665 /* If this insn has both backward non-agu dependence and forward
20666 agu dependence, the one with short distance takes effect. */
20667 return dist_define >= dist_use;
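/* A worked example of the heuristic above (numbers chosen purely for
   illustration): if an input register was produced by a non-AGU
   instruction 1 half-cycle before the lea (dist_define = 1), the
   replacement sequence costs one extra cycle (split_cost = 1) and the
   lea result feeds an address 4 half-cycles later (dist_use = 4),
   then 1 + 1 < 4, the lea is considered not to outperform the ALU
   sequence, and the callers will split it.  */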
20670 /* Return true if it is legal to clobber flags by INSN and
20671 false otherwise. */
20673 static bool
20674 ix86_ok_to_clobber_flags (rtx_insn *insn)
20676 basic_block bb = BLOCK_FOR_INSN (insn);
20677 df_ref use;
20678 bitmap live;
20680 while (insn)
20682 if (NONDEBUG_INSN_P (insn))
20684 FOR_EACH_INSN_USE (use, insn)
20685 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20686 return false;
20688 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20689 return true;
20692 if (insn == BB_END (bb))
20693 break;
20695 insn = NEXT_INSN (insn);
20698 live = df_get_live_out(bb);
20699 return !REGNO_REG_SET_P (live, FLAGS_REG);
20702 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20703 move and add to avoid AGU stalls. */
20705 bool
20706 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20708 unsigned int regno0, regno1, regno2;
20710 /* Check if we need to optimize. */
20711 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20712 return false;
20714 /* Check it is correct to split here. */
20715 if (!ix86_ok_to_clobber_flags(insn))
20716 return false;
20718 regno0 = true_regnum (operands[0]);
20719 regno1 = true_regnum (operands[1]);
20720 regno2 = true_regnum (operands[2]);
20722 /* We need to split only adds with a non-destructive
20723 destination operand. */
20724 if (regno0 == regno1 || regno0 == regno2)
20725 return false;
20726 else
20727 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20730 /* Return true if we should emit lea instruction instead of mov
20731 instruction. */
20733 bool
20734 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20736 unsigned int regno0, regno1;
20738 /* Check if we need to optimize. */
20739 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20740 return false;
20742 /* Use lea for reg to reg moves only. */
20743 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20744 return false;
20746 regno0 = true_regnum (operands[0]);
20747 regno1 = true_regnum (operands[1]);
20749 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20752 /* Return true if we need to split lea into a sequence of
20753 instructions to avoid AGU stalls. */
20755 bool
20756 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20758 unsigned int regno0, regno1, regno2;
20759 int split_cost;
20760 struct ix86_address parts;
20761 int ok;
20763 /* Check we need to optimize. */
20764 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20765 return false;
20767 /* The "at least two components" test below might not catch simple
20768 move or zero extension insns if parts.base is non-NULL and parts.disp
20769 is const0_rtx as the only components in the address, e.g. if the
20770 register is %rbp or %r13. As this test is much cheaper and moves or
20771 zero extensions are the common case, do this check first. */
20772 if (REG_P (operands[1])
20773 || (SImode_address_operand (operands[1], VOIDmode)
20774 && REG_P (XEXP (operands[1], 0))))
20775 return false;
20777 /* Check if it is OK to split here. */
20778 if (!ix86_ok_to_clobber_flags (insn))
20779 return false;
20781 ok = ix86_decompose_address (operands[1], &parts);
20782 gcc_assert (ok);
20784 /* There should be at least two components in the address. */
20785 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20786 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20787 return false;
20789 /* We should not split into add if a non-legitimate pic
20790 operand is used as the displacement. */
20791 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20792 return false;
20794 regno0 = true_regnum (operands[0]);
20795 regno1 = INVALID_REGNUM;
20796 regno2 = INVALID_REGNUM;
20798 if (parts.base)
20799 regno1 = true_regnum (parts.base);
20800 if (parts.index)
20801 regno2 = true_regnum (parts.index);
20803 split_cost = 0;
20805 /* Compute how many cycles we will add to the execution time
20806 if we split the lea into a sequence of instructions. */
20807 if (parts.base || parts.index)
20809 /* Have to use a mov instruction if the non-destructive
20810 destination form is used. */
20811 if (regno1 != regno0 && regno2 != regno0)
20812 split_cost += 1;
20814 /* Have to add index to base if both exist. */
20815 if (parts.base && parts.index)
20816 split_cost += 1;
20818 /* Have to use shift and adds if scale is 2 or greater. */
20819 if (parts.scale > 1)
20821 if (regno0 != regno1)
20822 split_cost += 1;
20823 else if (regno2 == regno0)
20824 split_cost += 4;
20825 else
20826 split_cost += parts.scale;
20829 /* Have to use add instruction with immediate if
20830 disp is non zero. */
20831 if (parts.disp && parts.disp != const0_rtx)
20832 split_cost += 1;
20834 /* Subtract the price of lea. */
20835 split_cost -= 1;
20838 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
20839 parts.scale > 1);
20842 /* Emit x86 binary operand CODE in mode MODE, where the first operand
20843 matches destination. RTX includes clobber of FLAGS_REG. */
20845 static void
20846 ix86_emit_binop (enum rtx_code code, machine_mode mode,
20847 rtx dst, rtx src)
20849 rtx op, clob;
20851 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
20852 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20854 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20857 /* Return true if regno1 def is nearest to the insn. */
20859 static bool
20860 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
20862 rtx_insn *prev = insn;
20863 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
20865 if (insn == start)
20866 return false;
20867 while (prev && prev != start)
20869 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
20871 prev = PREV_INSN (prev);
20872 continue;
20874 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
20875 return true;
20876 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
20877 return false;
20878 prev = PREV_INSN (prev);
20881 /* None of the regs is defined in the bb. */
20882 return false;
20885 /* Split lea instructions into a sequence of instructions
20886 which are executed on the ALU to avoid AGU stalls.
20887 It is assumed that the flags register may be clobbered
20888 at the lea position. */
20890 void
20891 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
20893 unsigned int regno0, regno1, regno2;
20894 struct ix86_address parts;
20895 rtx target, tmp;
20896 int ok, adds;
20898 ok = ix86_decompose_address (operands[1], &parts);
20899 gcc_assert (ok);
20901 target = gen_lowpart (mode, operands[0]);
20903 regno0 = true_regnum (target);
20904 regno1 = INVALID_REGNUM;
20905 regno2 = INVALID_REGNUM;
20907 if (parts.base)
20909 parts.base = gen_lowpart (mode, parts.base);
20910 regno1 = true_regnum (parts.base);
20913 if (parts.index)
20915 parts.index = gen_lowpart (mode, parts.index);
20916 regno2 = true_regnum (parts.index);
20919 if (parts.disp)
20920 parts.disp = gen_lowpart (mode, parts.disp);
20922 if (parts.scale > 1)
20924 /* Case r1 = r1 + ... */
20925 if (regno1 == regno0)
20927 /* If we have a case r1 = r1 + C * r2 then we
20928 should use multiplication which is very
20929 expensive. Assume cost model is wrong if we
20930 have such case here. */
20931 gcc_assert (regno2 != regno0);
20933 for (adds = parts.scale; adds > 0; adds--)
20934 ix86_emit_binop (PLUS, mode, target, parts.index);
20936 else
20938 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
20939 if (regno0 != regno2)
20940 emit_insn (gen_rtx_SET (target, parts.index));
20942 /* Use shift for scaling. */
20943 ix86_emit_binop (ASHIFT, mode, target,
20944 GEN_INT (exact_log2 (parts.scale)));
20946 if (parts.base)
20947 ix86_emit_binop (PLUS, mode, target, parts.base);
20949 if (parts.disp && parts.disp != const0_rtx)
20950 ix86_emit_binop (PLUS, mode, target, parts.disp);
20953 else if (!parts.base && !parts.index)
20955 gcc_assert(parts.disp);
20956 emit_insn (gen_rtx_SET (target, parts.disp));
20958 else
20960 if (!parts.base)
20962 if (regno0 != regno2)
20963 emit_insn (gen_rtx_SET (target, parts.index));
20965 else if (!parts.index)
20967 if (regno0 != regno1)
20968 emit_insn (gen_rtx_SET (target, parts.base));
20970 else
20972 if (regno0 == regno1)
20973 tmp = parts.index;
20974 else if (regno0 == regno2)
20975 tmp = parts.base;
20976 else
20978 rtx tmp1;
20980 /* Find better operand for SET instruction, depending
20981 on which definition is farther from the insn. */
20982 if (find_nearest_reg_def (insn, regno1, regno2))
20983 tmp = parts.index, tmp1 = parts.base;
20984 else
20985 tmp = parts.base, tmp1 = parts.index;
20987 emit_insn (gen_rtx_SET (target, tmp));
20989 if (parts.disp && parts.disp != const0_rtx)
20990 ix86_emit_binop (PLUS, mode, target, parts.disp);
20992 ix86_emit_binop (PLUS, mode, target, tmp1);
20993 return;
20996 ix86_emit_binop (PLUS, mode, target, tmp);
20999 if (parts.disp && parts.disp != const0_rtx)
21000 ix86_emit_binop (PLUS, mode, target, parts.disp);
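/* For example (an illustrative sketch, not literal output), a lea
   such as

       leal    8(%ebx,%ecx,4), %eax

   may be replaced by a sequence along the lines of

       movl    %ecx, %eax
       sall    $2, %eax
       addl    %ebx, %eax
       addl    $8, %eax

   keeping all of the work on the ALU.  */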
21004 /* Return true if it is ok to optimize an ADD operation to LEA
21005 operation to avoid flag register consumption. For most processors,
21006 ADD is faster than LEA. For processors like BONNELL, if the
21007 destination register of LEA holds an actual address which will be
21008 used soon, LEA is better; otherwise ADD is better. */
21010 bool
21011 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21013 unsigned int regno0 = true_regnum (operands[0]);
21014 unsigned int regno1 = true_regnum (operands[1]);
21015 unsigned int regno2 = true_regnum (operands[2]);
21017 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21018 if (regno0 != regno1 && regno0 != regno2)
21019 return true;
21021 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21022 return false;
21024 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21027 /* Return true if destination reg of SET_BODY is shift count of
21028 USE_BODY. */
21030 static bool
21031 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21033 rtx set_dest;
21034 rtx shift_rtx;
21035 int i;
21037 /* Retrieve destination of SET_BODY. */
21038 switch (GET_CODE (set_body))
21040 case SET:
21041 set_dest = SET_DEST (set_body);
21042 if (!set_dest || !REG_P (set_dest))
21043 return false;
21044 break;
21045 case PARALLEL:
21046 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21047 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21048 use_body))
21049 return true;
21050 /* FALLTHROUGH */
21051 default:
21052 return false;
21055 /* Retrieve shift count of USE_BODY. */
21056 switch (GET_CODE (use_body))
21058 case SET:
21059 shift_rtx = XEXP (use_body, 1);
21060 break;
21061 case PARALLEL:
21062 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21063 if (ix86_dep_by_shift_count_body (set_body,
21064 XVECEXP (use_body, 0, i)))
21065 return true;
21066 /* FALLTHROUGH */
21067 default:
21068 return false;
21071 if (shift_rtx
21072 && (GET_CODE (shift_rtx) == ASHIFT
21073 || GET_CODE (shift_rtx) == LSHIFTRT
21074 || GET_CODE (shift_rtx) == ASHIFTRT
21075 || GET_CODE (shift_rtx) == ROTATE
21076 || GET_CODE (shift_rtx) == ROTATERT))
21078 rtx shift_count = XEXP (shift_rtx, 1);
21080 /* Return true if shift count is dest of SET_BODY. */
21081 if (REG_P (shift_count))
21083 /* Add a check since it can be invoked before register
21084 allocation in the pre-reload scheduler. */
21085 if (reload_completed
21086 && true_regnum (set_dest) == true_regnum (shift_count))
21087 return true;
21088 else if (REGNO(set_dest) == REGNO(shift_count))
21089 return true;
21093 return false;
21096 /* Return true if destination reg of SET_INSN is shift count of
21097 USE_INSN. */
21099 bool
21100 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21102 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21103 PATTERN (use_insn));
21106 /* Return TRUE or FALSE depending on whether the unary operator meets the
21107 appropriate constraints. */
21109 bool
21110 ix86_unary_operator_ok (enum rtx_code,
21111 machine_mode,
21112 rtx operands[2])
21114 /* If one of operands is memory, source and destination must match. */
21115 if ((MEM_P (operands[0])
21116 || MEM_P (operands[1]))
21117 && ! rtx_equal_p (operands[0], operands[1]))
21118 return false;
21119 return true;
21122 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21123 are ok, keeping in mind the possible movddup alternative. */
21125 bool
21126 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21128 if (MEM_P (operands[0]))
21129 return rtx_equal_p (operands[0], operands[1 + high]);
21130 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21131 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21132 return true;
21135 /* Post-reload splitter for converting an SF or DFmode value in an
21136 SSE register into an unsigned SImode. */
21138 void
21139 ix86_split_convert_uns_si_sse (rtx operands[])
21141 machine_mode vecmode;
21142 rtx value, large, zero_or_two31, input, two31, x;
21144 large = operands[1];
21145 zero_or_two31 = operands[2];
21146 input = operands[3];
21147 two31 = operands[4];
21148 vecmode = GET_MODE (large);
21149 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21151 /* Load up the value into the low element. We must ensure that the other
21152 elements are valid floats -- zero is the easiest such value. */
21153 if (MEM_P (input))
21155 if (vecmode == V4SFmode)
21156 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21157 else
21158 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21160 else
21162 input = gen_rtx_REG (vecmode, REGNO (input));
21163 emit_move_insn (value, CONST0_RTX (vecmode));
21164 if (vecmode == V4SFmode)
21165 emit_insn (gen_sse_movss (value, value, input));
21166 else
21167 emit_insn (gen_sse2_movsd (value, value, input));
21170 emit_move_insn (large, two31);
21171 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21173 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21174 emit_insn (gen_rtx_SET (large, x));
21176 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21177 emit_insn (gen_rtx_SET (zero_or_two31, x));
21179 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21180 emit_insn (gen_rtx_SET (value, x));
21182 large = gen_rtx_REG (V4SImode, REGNO (large));
21183 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21185 x = gen_rtx_REG (V4SImode, REGNO (value));
21186 if (vecmode == V4SFmode)
21187 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21188 else
21189 emit_insn (gen_sse2_cvttpd2dq (x, value));
21190 value = x;
21192 emit_insn (gen_xorv4si3 (value, value, large));
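/* A worked example of the split above (illustrative numbers only):
   for the input 3000000000.0 the LARGE mask is true (the value is
   >= 2^31), so 2^31 is subtracted giving 852516352.0, the signed
   cvtt conversion yields 852516352, and the final xor with
   LARGE << 31 (0x80000000) restores the high bit, producing
   0xb2d05e00 == 3000000000.  */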
21195 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21196 Expects the 64-bit DImode to be supplied in a pair of integral
21197 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21198 -mfpmath=sse, !optimize_size only. */
21200 void
21201 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21203 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21204 rtx int_xmm, fp_xmm;
21205 rtx biases, exponents;
21206 rtx x;
21208 int_xmm = gen_reg_rtx (V4SImode);
21209 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21210 emit_insn (gen_movdi_to_sse (int_xmm, input));
21211 else if (TARGET_SSE_SPLIT_REGS)
21213 emit_clobber (int_xmm);
21214 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21216 else
21218 x = gen_reg_rtx (V2DImode);
21219 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21220 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21223 x = gen_rtx_CONST_VECTOR (V4SImode,
21224 gen_rtvec (4, GEN_INT (0x43300000UL),
21225 GEN_INT (0x45300000UL),
21226 const0_rtx, const0_rtx));
21227 exponents = validize_mem (force_const_mem (V4SImode, x));
21229 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21230 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21232 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21233 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21234 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21235 (0x1.0p84 + double(fp_value_hi_xmm)).
21236 Note these exponents differ by 32. */
21238 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21240 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21241 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21242 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21243 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21244 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21245 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21246 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21247 biases = validize_mem (force_const_mem (V2DFmode, biases));
21248 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21250 /* Add the upper and lower DFmode values together. */
21251 if (TARGET_SSE3)
21252 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21253 else
21255 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21256 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21257 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21260 ix86_expand_vector_extract (false, target, fp_xmm, 0);
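/* A scalar model of the bias trick above (an editorial sketch
   assuming IEEE double arithmetic; "[h : l]" denotes the double whose
   high and low 32-bit words are h and l):

       dlo = [0x43300000 : lo32]  =  0x1.0p52 + lo32
       dhi = [0x45300000 : hi32]  =  0x1.0p84 + hi32 * 0x1.0p32

       result = (dlo - 0x1.0p52) + (dhi - 0x1.0p84)
              =  lo32 + hi32 * 2^32

   Both subtractions are exact, so the only rounding happens in the
   final addition and the result is the correctly rounded double
   value of the 64-bit input.  */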
21263 /* Not used, but eases macroization of patterns. */
21264 void
21265 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21267 gcc_unreachable ();
21270 /* Convert an unsigned SImode value into a DFmode. Only currently used
21271 for SSE, but applicable anywhere. */
21273 void
21274 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21276 REAL_VALUE_TYPE TWO31r;
21277 rtx x, fp;
21279 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21280 NULL, 1, OPTAB_DIRECT);
21282 fp = gen_reg_rtx (DFmode);
21283 emit_insn (gen_floatsidf2 (fp, x));
21285 real_ldexp (&TWO31r, &dconst1, 31);
21286 x = const_double_from_real_value (TWO31r, DFmode);
21288 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21289 if (x != target)
21290 emit_move_insn (target, x);
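/* Illustrative sketch of the identity used above, as a hypothetical helper
   (not the back end's code).  Biasing by -2^31 maps the unsigned value into
   signed int range; converting that and adding back 0x1.0p31 recovers
   (double) U exactly.  Assumes 32-bit int with two's-complement wraparound,
   which the RTL expansion relies on as well.  */
#include <stdint.h>

static double
uns32_to_double_sketch (uint32_t u)
{
  int32_t biased = (int32_t) (u + 0x80000000u);   /* u - 2^31, wrapped  */
  return (double) biased + 0x1.0p31;
}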
21293 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21294 32-bit mode; otherwise we have a direct convert instruction. */
21296 void
21297 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21299 REAL_VALUE_TYPE TWO32r;
21300 rtx fp_lo, fp_hi, x;
21302 fp_lo = gen_reg_rtx (DFmode);
21303 fp_hi = gen_reg_rtx (DFmode);
21305 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21307 real_ldexp (&TWO32r, &dconst1, 32);
21308 x = const_double_from_real_value (TWO32r, DFmode);
21309 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21311 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21313 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21314 0, OPTAB_DIRECT);
21315 if (x != target)
21316 emit_move_insn (target, x);
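/* Illustrative sketch (hypothetical helper, not the back end's code) of the
   decomposition performed above: scale the signed high word by 0x1.0p32,
   convert the unsigned low word, and add.  Both partial values are exact, so
   the final addition carries the single rounding of a correct int64->double
   conversion.  Assumes an arithmetic right shift of negative values, as GCC
   provides.  */
#include <stdint.h>

static double
sign64_to_double_sketch (int64_t x)
{
  double hi = (double) (int32_t) (x >> 32) * 0x1.0p32;
  double lo = (double) (uint32_t) x;
  return hi + lo;
}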
21319 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21320 For x86_32, -mfpmath=sse, !optimize_size only. */
21321 void
21322 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21324 REAL_VALUE_TYPE ONE16r;
21325 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21327 real_ldexp (&ONE16r, &dconst1, 16);
21328 x = const_double_from_real_value (ONE16r, SFmode);
21329 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21330 NULL, 0, OPTAB_DIRECT);
21331 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21332 NULL, 0, OPTAB_DIRECT);
21333 fp_hi = gen_reg_rtx (SFmode);
21334 fp_lo = gen_reg_rtx (SFmode);
21335 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21336 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21337 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21338 0, OPTAB_DIRECT);
21339 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21340 0, OPTAB_DIRECT);
21341 if (!rtx_equal_p (target, fp_hi))
21342 emit_move_insn (target, fp_hi);
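/* Illustrative sketch (hypothetical helper, not the back end's code) of the
   split used above, assuming IEEE-754 binary32: each 16-bit half converts to
   float exactly and the scale by 0x1.0p16 is exact, so the final addition is
   the single rounding of the uint32->float conversion.  */
#include <stdint.h>

static float
uns32_to_float_sketch (uint32_t u)
{
  float hi = (float) (u >> 16) * 0x1.0p16f;
  float lo = (float) (u & 0xffff);
  return hi + lo;
}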
21345 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21346 a vector of unsigned ints VAL to vector of floats TARGET. */
21348 void
21349 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21351 rtx tmp[8];
21352 REAL_VALUE_TYPE TWO16r;
21353 machine_mode intmode = GET_MODE (val);
21354 machine_mode fltmode = GET_MODE (target);
21355 rtx (*cvt) (rtx, rtx);
21357 if (intmode == V4SImode)
21358 cvt = gen_floatv4siv4sf2;
21359 else
21360 cvt = gen_floatv8siv8sf2;
21361 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21362 tmp[0] = force_reg (intmode, tmp[0]);
21363 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21364 OPTAB_DIRECT);
21365 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21366 NULL_RTX, 1, OPTAB_DIRECT);
21367 tmp[3] = gen_reg_rtx (fltmode);
21368 emit_insn (cvt (tmp[3], tmp[1]));
21369 tmp[4] = gen_reg_rtx (fltmode);
21370 emit_insn (cvt (tmp[4], tmp[2]));
21371 real_ldexp (&TWO16r, &dconst1, 16);
21372 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21373 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21374 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21375 OPTAB_DIRECT);
21376 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21377 OPTAB_DIRECT);
21378 if (tmp[7] != target)
21379 emit_move_insn (target, tmp[7]);
21382 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21383 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21384 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21385 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21387 rtx
21388 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)

21390 REAL_VALUE_TYPE TWO31r;
21391 rtx two31r, tmp[4];
21392 machine_mode mode = GET_MODE (val);
21393 machine_mode scalarmode = GET_MODE_INNER (mode);
21394 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21395 rtx (*cmp) (rtx, rtx, rtx, rtx);
21396 int i;
21398 for (i = 0; i < 3; i++)
21399 tmp[i] = gen_reg_rtx (mode);
21400 real_ldexp (&TWO31r, &dconst1, 31);
21401 two31r = const_double_from_real_value (TWO31r, scalarmode);
21402 two31r = ix86_build_const_vector (mode, 1, two31r);
21403 two31r = force_reg (mode, two31r);
21404 switch (mode)
21406 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21407 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21408 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21409 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21410 default: gcc_unreachable ();
21412 tmp[3] = gen_rtx_LE (mode, two31r, val);
21413 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21414 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21415 0, OPTAB_DIRECT);
21416 if (intmode == V4SImode || TARGET_AVX2)
21417 *xorp = expand_simple_binop (intmode, ASHIFT,
21418 gen_lowpart (intmode, tmp[0]),
21419 GEN_INT (31), NULL_RTX, 0,
21420 OPTAB_DIRECT);
21421 else
21423 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21424 two31 = ix86_build_const_vector (intmode, 1, two31);
21425 *xorp = expand_simple_binop (intmode, AND,
21426 gen_lowpart (intmode, tmp[0]),
21427 two31, NULL_RTX, 0,
21428 OPTAB_DIRECT);
21430 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21431 0, OPTAB_DIRECT);
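/* Illustrative sketch (hypothetical helper, not the back end's code) of the
   scalar identity behind the adjustment above: values below 0x1.0p31 truncate
   directly with the signed pattern, larger ones are reduced by 0x1.0p31 first
   and the sign bit is xor-ed back into the integer result.  Assumes the input
   is finite and in [0, 0x1.0p32).  */
#include <stdint.h>

static uint32_t
fix_trunc_uns_sketch (double x)
{
  if (x < 0x1.0p31)
    return (uint32_t) (int32_t) x;
  return (uint32_t) (int32_t) (x - 0x1.0p31) ^ 0x80000000u;
}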
21434 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21435 then replicate the value for all elements of the vector
21436 register. */
21438 rtx
21439 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21441 int i, n_elt;
21442 rtvec v;
21443 machine_mode scalar_mode;
21445 switch (mode)
21447 case E_V64QImode:
21448 case E_V32QImode:
21449 case E_V16QImode:
21450 case E_V32HImode:
21451 case E_V16HImode:
21452 case E_V8HImode:
21453 case E_V16SImode:
21454 case E_V8SImode:
21455 case E_V4SImode:
21456 case E_V8DImode:
21457 case E_V4DImode:
21458 case E_V2DImode:
21459 gcc_assert (vect);
21460 /* FALLTHRU */
21461 case E_V16SFmode:
21462 case E_V8SFmode:
21463 case E_V4SFmode:
21464 case E_V8DFmode:
21465 case E_V4DFmode:
21466 case E_V2DFmode:
21467 n_elt = GET_MODE_NUNITS (mode);
21468 v = rtvec_alloc (n_elt);
21469 scalar_mode = GET_MODE_INNER (mode);
21471 RTVEC_ELT (v, 0) = value;
21473 for (i = 1; i < n_elt; ++i)
21474 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21476 return gen_rtx_CONST_VECTOR (mode, v);
21478 default:
21479 gcc_unreachable ();
21483 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21484 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21485 for an SSE register. If VECT is true, then replicate the mask for
21486 all elements of the vector register. If INVERT is true, then create
21487 a mask excluding the sign bit. */
21489 rtx
21490 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21492 machine_mode vec_mode, imode;
21493 wide_int w;
21494 rtx mask, v;
21496 switch (mode)
21498 case E_V16SImode:
21499 case E_V16SFmode:
21500 case E_V8SImode:
21501 case E_V4SImode:
21502 case E_V8SFmode:
21503 case E_V4SFmode:
21504 vec_mode = mode;
21505 imode = SImode;
21506 break;
21508 case E_V8DImode:
21509 case E_V4DImode:
21510 case E_V2DImode:
21511 case E_V8DFmode:
21512 case E_V4DFmode:
21513 case E_V2DFmode:
21514 vec_mode = mode;
21515 imode = DImode;
21516 break;
21518 case E_TImode:
21519 case E_TFmode:
21520 vec_mode = VOIDmode;
21521 imode = TImode;
21522 break;
21524 default:
21525 gcc_unreachable ();
21528 machine_mode inner_mode = GET_MODE_INNER (mode);
21529 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21530 GET_MODE_BITSIZE (inner_mode));
21531 if (invert)
21532 w = wi::bit_not (w);
21534 /* Force this value into the low part of a fp vector constant. */
21535 mask = immed_wide_int_const (w, imode);
21536 mask = gen_lowpart (inner_mode, mask);
21538 if (vec_mode == VOIDmode)
21539 return force_reg (inner_mode, mask);
21541 v = ix86_build_const_vector (vec_mode, vect, mask);
21542 return force_reg (vec_mode, v);
21545 /* Generate code for floating point ABS or NEG. */
21547 void
21548 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21549 rtx operands[])
21551 rtx mask, set, dst, src;
21552 bool use_sse = false;
21553 bool vector_mode = VECTOR_MODE_P (mode);
21554 machine_mode vmode = mode;
21556 if (vector_mode)
21557 use_sse = true;
21558 else if (mode == TFmode)
21559 use_sse = true;
21560 else if (TARGET_SSE_MATH)
21562 use_sse = SSE_FLOAT_MODE_P (mode);
21563 if (mode == SFmode)
21564 vmode = V4SFmode;
21565 else if (mode == DFmode)
21566 vmode = V2DFmode;
21569 /* NEG and ABS performed with SSE use bitwise mask operations.
21570 Create the appropriate mask now. */
21571 if (use_sse)
21572 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21573 else
21574 mask = NULL_RTX;
21576 dst = operands[0];
21577 src = operands[1];
21579 set = gen_rtx_fmt_e (code, mode, src);
21580 set = gen_rtx_SET (dst, set);
21582 if (mask)
21584 rtx use, clob;
21585 rtvec par;
21587 use = gen_rtx_USE (VOIDmode, mask);
21588 if (vector_mode)
21589 par = gen_rtvec (2, set, use);
21590 else
21592 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21593 par = gen_rtvec (3, set, use, clob);
21595 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21597 else
21598 emit_insn (set);
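/* Illustrative sketch (hypothetical helpers, not the back end's code) of what
   the masked SSE patterns above compute, assuming IEEE-754 binary32 layout:
   ABS clears the sign bit with an AND of the inverted sign-bit mask, NEG
   flips it with an XOR.  */
#include <stdint.h>
#include <string.h>

static float
abs_sketch (float x)
{
  uint32_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits &= ~0x80000000u;                  /* ABS: clear the sign bit.  */
  memcpy (&x, &bits, sizeof x);
  return x;
}

static float
neg_sketch (float x)
{
  uint32_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits ^= 0x80000000u;                   /* NEG: flip the sign bit.  */
  memcpy (&x, &bits, sizeof x);
  return x;
}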
21601 /* Expand a copysign operation. Special case operand 0 being a constant. */
21603 void
21604 ix86_expand_copysign (rtx operands[])
21606 machine_mode mode, vmode;
21607 rtx dest, op0, op1, mask, nmask;
21609 dest = operands[0];
21610 op0 = operands[1];
21611 op1 = operands[2];
21613 mode = GET_MODE (dest);
21615 if (mode == SFmode)
21616 vmode = V4SFmode;
21617 else if (mode == DFmode)
21618 vmode = V2DFmode;
21619 else
21620 vmode = mode;
21622 if (CONST_DOUBLE_P (op0))
21624 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21626 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21627 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21629 if (mode == SFmode || mode == DFmode)
21631 if (op0 == CONST0_RTX (mode))
21632 op0 = CONST0_RTX (vmode);
21633 else
21635 rtx v = ix86_build_const_vector (vmode, false, op0);
21637 op0 = force_reg (vmode, v);
21640 else if (op0 != CONST0_RTX (mode))
21641 op0 = force_reg (mode, op0);
21643 mask = ix86_build_signbit_mask (vmode, 0, 0);
21645 if (mode == SFmode)
21646 copysign_insn = gen_copysignsf3_const;
21647 else if (mode == DFmode)
21648 copysign_insn = gen_copysigndf3_const;
21649 else
21650 copysign_insn = gen_copysigntf3_const;
21652 emit_insn (copysign_insn (dest, op0, op1, mask));
21654 else
21656 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21658 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21659 mask = ix86_build_signbit_mask (vmode, 0, 0);
21661 if (mode == SFmode)
21662 copysign_insn = gen_copysignsf3_var;
21663 else if (mode == DFmode)
21664 copysign_insn = gen_copysigndf3_var;
21665 else
21666 copysign_insn = gen_copysigntf3_var;
21668 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
21672 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21673 be a constant, and so has already been expanded into a vector constant. */
21675 void
21676 ix86_split_copysign_const (rtx operands[])
21678 machine_mode mode, vmode;
21679 rtx dest, op0, mask, x;
21681 dest = operands[0];
21682 op0 = operands[1];
21683 mask = operands[3];
21685 mode = GET_MODE (dest);
21686 vmode = GET_MODE (mask);
21688 dest = lowpart_subreg (vmode, dest, mode);
21689 x = gen_rtx_AND (vmode, dest, mask);
21690 emit_insn (gen_rtx_SET (dest, x));
21692 if (op0 != CONST0_RTX (vmode))
21694 x = gen_rtx_IOR (vmode, dest, op0);
21695 emit_insn (gen_rtx_SET (dest, x));
21699 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21700 so we have to do two masks. */
21702 void
21703 ix86_split_copysign_var (rtx operands[])
21705 machine_mode mode, vmode;
21706 rtx dest, scratch, op0, op1, mask, nmask, x;
21708 dest = operands[0];
21709 scratch = operands[1];
21710 op0 = operands[2];
21711 op1 = operands[3];
21712 nmask = operands[4];
21713 mask = operands[5];
21715 mode = GET_MODE (dest);
21716 vmode = GET_MODE (mask);
21718 if (rtx_equal_p (op0, op1))
21720 /* Shouldn't happen often (it's useless, obviously), but when it does
21721 we'd generate incorrect code if we continue below. */
21722 emit_move_insn (dest, op0);
21723 return;
21726 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21728 gcc_assert (REGNO (op1) == REGNO (scratch));
21730 x = gen_rtx_AND (vmode, scratch, mask);
21731 emit_insn (gen_rtx_SET (scratch, x));
21733 dest = mask;
21734 op0 = lowpart_subreg (vmode, op0, mode);
21735 x = gen_rtx_NOT (vmode, dest);
21736 x = gen_rtx_AND (vmode, x, op0);
21737 emit_insn (gen_rtx_SET (dest, x));
21739 else
21741 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21743 x = gen_rtx_AND (vmode, scratch, mask);
21745 else /* alternative 2,4 */
21747 gcc_assert (REGNO (mask) == REGNO (scratch));
21748 op1 = lowpart_subreg (vmode, op1, mode);
21749 x = gen_rtx_AND (vmode, scratch, op1);
21751 emit_insn (gen_rtx_SET (scratch, x));
21753 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21755 dest = lowpart_subreg (vmode, op0, mode);
21756 x = gen_rtx_AND (vmode, dest, nmask);
21758 else /* alternative 3,4 */
21760 gcc_assert (REGNO (nmask) == REGNO (dest));
21761 dest = nmask;
21762 op0 = lowpart_subreg (vmode, op0, mode);
21763 x = gen_rtx_AND (vmode, dest, op0);
21765 emit_insn (gen_rtx_SET (dest, x));
21768 x = gen_rtx_IOR (vmode, dest, scratch);
21769 emit_insn (gen_rtx_SET (dest, x));
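/* Illustrative sketch (hypothetical helper, not the back end's code) of the
   two-mask split above, assuming IEEE-754 binary64 layout: keep the magnitude
   bits of X, take the sign bit from Y, and OR the two pieces together.  */
#include <stdint.h>
#include <string.h>

static double
copysign_sketch (double x, double y)
{
  const uint64_t sign = 0x8000000000000000ull;
  uint64_t xb, yb;

  memcpy (&xb, &x, sizeof xb);
  memcpy (&yb, &y, sizeof yb);
  xb = (xb & ~sign) | (yb & sign);       /* magnitude of X, sign of Y  */
  memcpy (&x, &xb, sizeof x);
  return x;
}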
21772 /* Return TRUE or FALSE depending on whether the first SET in INSN
21773 has source and destination with matching CC modes, and that the
21774 CC mode is at least as constrained as REQ_MODE. */
21776 bool
21777 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21779 rtx set;
21780 machine_mode set_mode;
21782 set = PATTERN (insn);
21783 if (GET_CODE (set) == PARALLEL)
21784 set = XVECEXP (set, 0, 0);
21785 gcc_assert (GET_CODE (set) == SET);
21786 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21788 set_mode = GET_MODE (SET_DEST (set));
21789 switch (set_mode)
21791 case E_CCNOmode:
21792 if (req_mode != CCNOmode
21793 && (req_mode != CCmode
21794 || XEXP (SET_SRC (set), 1) != const0_rtx))
21795 return false;
21796 break;
21797 case E_CCmode:
21798 if (req_mode == CCGCmode)
21799 return false;
21800 /* FALLTHRU */
21801 case E_CCGCmode:
21802 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21803 return false;
21804 /* FALLTHRU */
21805 case E_CCGOCmode:
21806 if (req_mode == CCZmode)
21807 return false;
21808 /* FALLTHRU */
21809 case E_CCZmode:
21810 break;
21812 case E_CCGZmode:
21814 case E_CCAmode:
21815 case E_CCCmode:
21816 case E_CCOmode:
21817 case E_CCPmode:
21818 case E_CCSmode:
21819 if (set_mode != req_mode)
21820 return false;
21821 break;
21823 default:
21824 gcc_unreachable ();
21827 return GET_MODE (SET_SRC (set)) == set_mode;
21830 /* Generate insn patterns to do an integer compare of OPERANDS. */
21832 static rtx
21833 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
21835 machine_mode cmpmode;
21836 rtx tmp, flags;
21838 cmpmode = SELECT_CC_MODE (code, op0, op1);
21839 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
21841 /* This is very simple, but making the interface the same as in the
21842 FP case makes the rest of the code easier. */
21843 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
21844 emit_insn (gen_rtx_SET (flags, tmp));
21846 /* Return the test that should be put into the flags user, i.e.
21847 the bcc, scc, or cmov instruction. */
21848 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
21851 /* Figure out whether to use unordered fp comparisons. */
21853 static bool
21854 ix86_unordered_fp_compare (enum rtx_code code)
21856 if (!TARGET_IEEE_FP)
21857 return false;
21859 switch (code)
21861 case GT:
21862 case GE:
21863 case LT:
21864 case LE:
21865 return false;
21867 case EQ:
21868 case NE:
21870 case LTGT:
21871 case UNORDERED:
21872 case ORDERED:
21873 case UNLT:
21874 case UNLE:
21875 case UNGT:
21876 case UNGE:
21877 case UNEQ:
21878 return true;
21880 default:
21881 gcc_unreachable ();
21885 machine_mode
21886 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
21888 machine_mode mode = GET_MODE (op0);
21890 if (SCALAR_FLOAT_MODE_P (mode))
21892 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
21893 return CCFPmode;
21896 switch (code)
21898 /* Only zero flag is needed. */
21899 case EQ: /* ZF=0 */
21900 case NE: /* ZF!=0 */
21901 return CCZmode;
21902 /* Codes needing carry flag. */
21903 case GEU: /* CF=0 */
21904 case LTU: /* CF=1 */
21905 /* Detect overflow checks. They need just the carry flag. */
21906 if (GET_CODE (op0) == PLUS
21907 && (rtx_equal_p (op1, XEXP (op0, 0))
21908 || rtx_equal_p (op1, XEXP (op0, 1))))
21909 return CCCmode;
21910 else
21911 return CCmode;
21912 case GTU: /* CF=0 & ZF=0 */
21913 case LEU: /* CF=1 | ZF=1 */
21914 return CCmode;
21915 /* Codes possibly doable only with sign flag when
21916 comparing against zero. */
21917 case GE: /* SF=OF or SF=0 */
21918 case LT: /* SF<>OF or SF=1 */
21919 if (op1 == const0_rtx)
21920 return CCGOCmode;
21921 else
21922 /* For other cases Carry flag is not required. */
21923 return CCGCmode;
21924 /* Codes doable only with the sign flag when comparing
21925 against zero, but we lack a jump instruction for that,
21926 so we need to use relational tests against overflow,
21927 which thus needs to be zero. */
21928 case GT: /* ZF=0 & SF=OF */
21929 case LE: /* ZF=1 | SF<>OF */
21930 if (op1 == const0_rtx)
21931 return CCNOmode;
21932 else
21933 return CCGCmode;
21934 /* The strcmp pattern does (use flags), and combine may ask us for the
21935 proper mode. */
21936 case USE:
21937 return CCmode;
21938 default:
21939 gcc_unreachable ();
21943 /* Return the fixed registers used for condition codes. */
21945 static bool
21946 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
21948 *p1 = FLAGS_REG;
21949 *p2 = FPSR_REG;
21950 return true;
21953 /* If two condition code modes are compatible, return a condition code
21954 mode which is compatible with both. Otherwise, return
21955 VOIDmode. */
21957 static machine_mode
21958 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
21960 if (m1 == m2)
21961 return m1;
21963 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
21964 return VOIDmode;
21966 if ((m1 == CCGCmode && m2 == CCGOCmode)
21967 || (m1 == CCGOCmode && m2 == CCGCmode))
21968 return CCGCmode;
21970 if ((m1 == CCNOmode && m2 == CCGOCmode)
21971 || (m1 == CCGOCmode && m2 == CCNOmode))
21972 return CCNOmode;
21974 if (m1 == CCZmode
21975 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
21976 return m2;
21977 else if (m2 == CCZmode
21978 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
21979 return m1;
21981 switch (m1)
21983 default:
21984 gcc_unreachable ();
21986 case E_CCmode:
21987 case E_CCGCmode:
21988 case E_CCGOCmode:
21989 case E_CCNOmode:
21990 case E_CCAmode:
21991 case E_CCCmode:
21992 case E_CCOmode:
21993 case E_CCPmode:
21994 case E_CCSmode:
21995 case E_CCZmode:
21996 switch (m2)
21998 default:
21999 return VOIDmode;
22001 case E_CCmode:
22002 case E_CCGCmode:
22003 case E_CCGOCmode:
22004 case E_CCNOmode:
22005 case E_CCAmode:
22006 case E_CCCmode:
22007 case E_CCOmode:
22008 case E_CCPmode:
22009 case E_CCSmode:
22010 case E_CCZmode:
22011 return CCmode;
22014 case E_CCFPmode:
22015 /* These are only compatible with themselves, which we already
22016 checked above. */
22017 return VOIDmode;
22022 /* Return a comparison we can do and that it is equivalent to
22023 swap_condition (code) apart possibly from orderedness.
22024 But, never change orderedness if TARGET_IEEE_FP, returning
22025 UNKNOWN in that case if necessary. */
22027 static enum rtx_code
22028 ix86_fp_swap_condition (enum rtx_code code)
22030 switch (code)
22032 case GT: /* GTU - CF=0 & ZF=0 */
22033 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22034 case GE: /* GEU - CF=0 */
22035 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22036 case UNLT: /* LTU - CF=1 */
22037 return TARGET_IEEE_FP ? UNKNOWN : GT;
22038 case UNLE: /* LEU - CF=1 | ZF=1 */
22039 return TARGET_IEEE_FP ? UNKNOWN : GE;
22040 default:
22041 return swap_condition (code);
22045 /* Return the cost of comparison CODE using the best strategy for performance.
22046 All of the following functions use the number of instructions as the cost metric.
22047 In the future this should be tweaked to compute bytes for optimize_size and
22048 to take into account the performance of various instructions on various CPUs. */
22050 static int
22051 ix86_fp_comparison_cost (enum rtx_code code)
22053 int arith_cost;
22055 /* The cost of code using bit-twiddling on %ah. */
22056 switch (code)
22058 case UNLE:
22059 case UNLT:
22060 case LTGT:
22061 case GT:
22062 case GE:
22063 case UNORDERED:
22064 case ORDERED:
22065 case UNEQ:
22066 arith_cost = 4;
22067 break;
22068 case LT:
22069 case NE:
22070 case EQ:
22071 case UNGE:
22072 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22073 break;
22074 case LE:
22075 case UNGT:
22076 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22077 break;
22078 default:
22079 gcc_unreachable ();
22082 switch (ix86_fp_comparison_strategy (code))
22084 case IX86_FPCMP_COMI:
22085 return arith_cost > 4 ? 3 : 2;
22086 case IX86_FPCMP_SAHF:
22087 return arith_cost > 4 ? 4 : 3;
22088 default:
22089 return arith_cost;
22093 /* Return the strategy to use for a floating-point comparison.  We assume that
22094 fcomi is always preferable where available, since that is also true when looking at size
22095 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22097 enum ix86_fpcmp_strategy
22098 ix86_fp_comparison_strategy (enum rtx_code)
22100 /* Do fcomi/sahf based test when profitable. */
22102 if (TARGET_CMOVE)
22103 return IX86_FPCMP_COMI;
22105 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22106 return IX86_FPCMP_SAHF;
22108 return IX86_FPCMP_ARITH;
22111 /* Swap, force into registers, or otherwise massage the two operands
22112 to a fp comparison. The operands are updated in place; the new
22113 comparison code is returned. */
22115 static enum rtx_code
22116 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22118 bool unordered_compare = ix86_unordered_fp_compare (code);
22119 rtx op0 = *pop0, op1 = *pop1;
22120 machine_mode op_mode = GET_MODE (op0);
22121 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22123 /* All of the unordered compare instructions only work on registers.
22124 The same is true of the fcomi compare instructions. The XFmode
22125 compare instructions require registers except when comparing
22126 against zero or when converting operand 1 from fixed point to
22127 floating point. */
22129 if (!is_sse
22130 && (unordered_compare
22131 || (op_mode == XFmode
22132 && ! (standard_80387_constant_p (op0) == 1
22133 || standard_80387_constant_p (op1) == 1)
22134 && GET_CODE (op1) != FLOAT)
22135 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22137 op0 = force_reg (op_mode, op0);
22138 op1 = force_reg (op_mode, op1);
22140 else
22142 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22143 things around if they appear profitable, otherwise force op0
22144 into a register. */
22146 if (standard_80387_constant_p (op0) == 0
22147 || (MEM_P (op0)
22148 && ! (standard_80387_constant_p (op1) == 0
22149 || MEM_P (op1))))
22151 enum rtx_code new_code = ix86_fp_swap_condition (code);
22152 if (new_code != UNKNOWN)
22154 std::swap (op0, op1);
22155 code = new_code;
22159 if (!REG_P (op0))
22160 op0 = force_reg (op_mode, op0);
22162 if (CONSTANT_P (op1))
22164 int tmp = standard_80387_constant_p (op1);
22165 if (tmp == 0)
22166 op1 = validize_mem (force_const_mem (op_mode, op1));
22167 else if (tmp == 1)
22169 if (TARGET_CMOVE)
22170 op1 = force_reg (op_mode, op1);
22172 else
22173 op1 = force_reg (op_mode, op1);
22177 /* Try to rearrange the comparison to make it cheaper. */
22178 if (ix86_fp_comparison_cost (code)
22179 > ix86_fp_comparison_cost (swap_condition (code))
22180 && (REG_P (op1) || can_create_pseudo_p ()))
22182 std::swap (op0, op1);
22183 code = swap_condition (code);
22184 if (!REG_P (op0))
22185 op0 = force_reg (op_mode, op0);
22188 *pop0 = op0;
22189 *pop1 = op1;
22190 return code;
22193 /* Convert comparison codes we use to represent FP comparison to integer
22194 code that will result in proper branch. Return UNKNOWN if no such code
22195 is available. */
22197 enum rtx_code
22198 ix86_fp_compare_code_to_integer (enum rtx_code code)
22200 switch (code)
22202 case GT:
22203 return GTU;
22204 case GE:
22205 return GEU;
22206 case ORDERED:
22207 case UNORDERED:
22208 return code;
22209 case UNEQ:
22210 return EQ;
22211 case UNLT:
22212 return LTU;
22213 case UNLE:
22214 return LEU;
22215 case LTGT:
22216 return NE;
22217 default:
22218 return UNKNOWN;
22222 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22224 static rtx
22225 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22227 bool unordered_compare = ix86_unordered_fp_compare (code);
22228 machine_mode intcmp_mode;
22229 rtx tmp, tmp2;
22231 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22233 /* Do fcomi/sahf based test when profitable. */
22234 switch (ix86_fp_comparison_strategy (code))
22236 case IX86_FPCMP_COMI:
22237 intcmp_mode = CCFPmode;
22238 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22239 if (unordered_compare)
22240 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22241 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22242 break;
22244 case IX86_FPCMP_SAHF:
22245 intcmp_mode = CCFPmode;
22246 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22247 if (unordered_compare)
22248 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22249 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22250 if (!scratch)
22251 scratch = gen_reg_rtx (HImode);
22252 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22253 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22254 break;
22256 case IX86_FPCMP_ARITH:
22257 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22258 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22259 if (unordered_compare)
22260 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22261 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22262 if (!scratch)
22263 scratch = gen_reg_rtx (HImode);
22264 emit_insn (gen_rtx_SET (scratch, tmp));
22266 /* In the unordered case, we have to check C2 for NaNs, which
22267 doesn't happen to work out to anything nice combination-wise.
22268 So do some bit twiddling on the value we've got in AH to come
22269 up with an appropriate set of condition codes. */
22271 intcmp_mode = CCNOmode;
22272 switch (code)
22274 case GT:
22275 case UNGT:
22276 if (code == GT || !TARGET_IEEE_FP)
22278 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22279 code = EQ;
22281 else
22283 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22284 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22285 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22286 intcmp_mode = CCmode;
22287 code = GEU;
22289 break;
22290 case LT:
22291 case UNLT:
22292 if (code == LT && TARGET_IEEE_FP)
22294 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22295 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22296 intcmp_mode = CCmode;
22297 code = EQ;
22299 else
22301 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22302 code = NE;
22304 break;
22305 case GE:
22306 case UNGE:
22307 if (code == GE || !TARGET_IEEE_FP)
22309 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22310 code = EQ;
22312 else
22314 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22315 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22316 code = NE;
22318 break;
22319 case LE:
22320 case UNLE:
22321 if (code == LE && TARGET_IEEE_FP)
22323 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22324 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22325 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22326 intcmp_mode = CCmode;
22327 code = LTU;
22329 else
22331 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22332 code = NE;
22334 break;
22335 case EQ:
22336 case UNEQ:
22337 if (code == EQ && TARGET_IEEE_FP)
22339 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22340 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22341 intcmp_mode = CCmode;
22342 code = EQ;
22344 else
22346 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22347 code = NE;
22349 break;
22350 case NE:
22351 case LTGT:
22352 if (code == NE && TARGET_IEEE_FP)
22354 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22355 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22356 GEN_INT (0x40)));
22357 code = NE;
22359 else
22361 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22362 code = EQ;
22364 break;
22366 case UNORDERED:
22367 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22368 code = NE;
22369 break;
22370 case ORDERED:
22371 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22372 code = EQ;
22373 break;
22375 default:
22376 gcc_unreachable ();
22378 break;
22380 default:
22381 gcc_unreachable();
22384 /* Return the test that should be put into the flags user, i.e.
22385 the bcc, scc, or cmov instruction. */
22386 return gen_rtx_fmt_ee (code, VOIDmode,
22387 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22388 const0_rtx);
22391 static rtx
22392 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22394 rtx ret;
22396 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22397 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22399 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22401 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22402 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22404 else
22405 ret = ix86_expand_int_compare (code, op0, op1);
22407 return ret;
22410 void
22411 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22413 machine_mode mode = GET_MODE (op0);
22414 rtx tmp;
22416 /* Handle the special case of a vector comparison with a boolean result;
22417 transform it using the ptest instruction. */
22418 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22420 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22421 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22423 gcc_assert (code == EQ || code == NE);
22424 /* Generate XOR since we can't check that one operand is zero vector. */
22425 tmp = gen_reg_rtx (mode);
22426 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22427 tmp = gen_lowpart (p_mode, tmp);
22428 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22429 gen_rtx_UNSPEC (CCmode,
22430 gen_rtvec (2, tmp, tmp),
22431 UNSPEC_PTEST)));
22432 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22433 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22434 gen_rtx_LABEL_REF (VOIDmode, label),
22435 pc_rtx);
22436 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22437 return;
22440 switch (mode)
22442 case E_SFmode:
22443 case E_DFmode:
22444 case E_XFmode:
22445 case E_QImode:
22446 case E_HImode:
22447 case E_SImode:
22448 simple:
22449 tmp = ix86_expand_compare (code, op0, op1);
22450 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22451 gen_rtx_LABEL_REF (VOIDmode, label),
22452 pc_rtx);
22453 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22454 return;
22456 case E_DImode:
22457 if (TARGET_64BIT)
22458 goto simple;
22459 /* For a 32-bit target, a DImode comparison may be performed in
22460 SSE registers.  To allow this we should avoid splitting
22461 into SImode, which is achieved by doing the xor in DImode
22462 and then comparing with zero (which the STV pass
22463 recognizes).  We don't compare using xor when optimizing
22464 for size. */
22465 if (!optimize_insn_for_size_p ()
22466 && TARGET_STV
22467 && (code == EQ || code == NE))
22469 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22470 op1 = const0_rtx;
22472 /* FALLTHRU */
22473 case E_TImode:
22474 /* Expand DImode branch into multiple compare+branch. */
22476 rtx lo[2], hi[2];
22477 rtx_code_label *label2;
22478 enum rtx_code code1, code2, code3;
22479 machine_mode submode;
22481 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22483 std::swap (op0, op1);
22484 code = swap_condition (code);
22487 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22488 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22490 submode = mode == DImode ? SImode : DImode;
22492 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22493 avoid two branches. This costs one extra insn, so disable when
22494 optimizing for size. */
22496 if ((code == EQ || code == NE)
22497 && (!optimize_insn_for_size_p ()
22498 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22500 rtx xor0, xor1;
22502 xor1 = hi[0];
22503 if (hi[1] != const0_rtx)
22504 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22505 NULL_RTX, 0, OPTAB_WIDEN);
22507 xor0 = lo[0];
22508 if (lo[1] != const0_rtx)
22509 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22510 NULL_RTX, 0, OPTAB_WIDEN);
22512 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22513 NULL_RTX, 0, OPTAB_WIDEN);
22515 ix86_expand_branch (code, tmp, const0_rtx, label);
22516 return;
22519 /* Otherwise, if we are doing less-than or greater-or-equal-than,
22520 op1 is a constant and the low word is zero, then we can just
22521 examine the high word. Similarly for low word -1 and
22522 less-or-equal-than or greater-than. */
22524 if (CONST_INT_P (hi[1]))
22525 switch (code)
22527 case LT: case LTU: case GE: case GEU:
22528 if (lo[1] == const0_rtx)
22530 ix86_expand_branch (code, hi[0], hi[1], label);
22531 return;
22533 break;
22534 case LE: case LEU: case GT: case GTU:
22535 if (lo[1] == constm1_rtx)
22537 ix86_expand_branch (code, hi[0], hi[1], label);
22538 return;
22540 break;
22541 default:
22542 break;
22545 /* Emulate comparisons that do not depend on Zero flag with
22546 double-word subtraction. Note that only Overflow, Sign
22547 and Carry flags are valid, so swap arguments and condition
22548 of comparisons that would otherwise test Zero flag. */
22550 switch (code)
22552 case LE: case LEU: case GT: case GTU:
22553 std::swap (lo[0], lo[1]);
22554 std::swap (hi[0], hi[1]);
22555 code = swap_condition (code);
22556 /* FALLTHRU */
22558 case LT: case LTU: case GE: case GEU:
22560 rtx (*cmp_insn) (rtx, rtx);
22561 rtx (*sbb_insn) (rtx, rtx, rtx);
22562 bool uns = (code == LTU || code == GEU);
22564 if (TARGET_64BIT)
22566 cmp_insn = gen_cmpdi_1;
22567 sbb_insn
22568 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
22570 else
22572 cmp_insn = gen_cmpsi_1;
22573 sbb_insn
22574 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
22577 if (!nonimmediate_operand (lo[0], submode))
22578 lo[0] = force_reg (submode, lo[0]);
22579 if (!x86_64_general_operand (lo[1], submode))
22580 lo[1] = force_reg (submode, lo[1]);
22582 if (!register_operand (hi[0], submode))
22583 hi[0] = force_reg (submode, hi[0]);
22584 if ((uns && !nonimmediate_operand (hi[1], submode))
22585 || (!uns && !x86_64_general_operand (hi[1], submode)))
22586 hi[1] = force_reg (submode, hi[1]);
22588 emit_insn (cmp_insn (lo[0], lo[1]));
22589 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
22591 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
22593 ix86_expand_branch (code, tmp, const0_rtx, label);
22594 return;
22597 default:
22598 break;
22601 /* Otherwise, we need two or three jumps. */
22603 label2 = gen_label_rtx ();
22605 code1 = code;
22606 code2 = swap_condition (code);
22607 code3 = unsigned_condition (code);
22609 switch (code)
22611 case LT: case GT: case LTU: case GTU:
22612 break;
22614 case LE: code1 = LT; code2 = GT; break;
22615 case GE: code1 = GT; code2 = LT; break;
22616 case LEU: code1 = LTU; code2 = GTU; break;
22617 case GEU: code1 = GTU; code2 = LTU; break;
22619 case EQ: code1 = UNKNOWN; code2 = NE; break;
22620 case NE: code2 = UNKNOWN; break;
22622 default:
22623 gcc_unreachable ();
22627 * a < b =>
22628 * if (hi(a) < hi(b)) goto true;
22629 * if (hi(a) > hi(b)) goto false;
22630 * if (lo(a) < lo(b)) goto true;
22631 * false:
22634 if (code1 != UNKNOWN)
22635 ix86_expand_branch (code1, hi[0], hi[1], label);
22636 if (code2 != UNKNOWN)
22637 ix86_expand_branch (code2, hi[0], hi[1], label2);
22639 ix86_expand_branch (code3, lo[0], lo[1], label);
22641 if (code2 != UNKNOWN)
22642 emit_label (label2);
22643 return;
22646 default:
22647 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22648 goto simple;
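/* Illustrative sketch (hypothetical helper, not the back end's code) of the
   double-word equality rewrite described above: on a 32-bit target a 64-bit
   a == b becomes ((hi(a) ^ hi(b)) | (lo(a) ^ lo(b))) == 0, one test instead
   of two compare-and-branch sequences.  */
#include <stdint.h>

static int
double_word_eq_sketch (uint64_t a, uint64_t b)
{
  uint32_t hi = (uint32_t) (a >> 32) ^ (uint32_t) (b >> 32);
  uint32_t lo = (uint32_t) a ^ (uint32_t) b;
  return (hi | lo) == 0;
}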
22652 void
22653 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22655 rtx ret;
22657 gcc_assert (GET_MODE (dest) == QImode);
22659 ret = ix86_expand_compare (code, op0, op1);
22660 PUT_MODE (ret, QImode);
22661 emit_insn (gen_rtx_SET (dest, ret));
22664 /* Expand comparison setting or clearing carry flag. Return true when
22665 successful and set pop for the operation. */
22666 static bool
22667 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22669 machine_mode mode =
22670 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22672 /* Do not handle double-mode compares that go through special path. */
22673 if (mode == (TARGET_64BIT ? TImode : DImode))
22674 return false;
22676 if (SCALAR_FLOAT_MODE_P (mode))
22678 rtx compare_op;
22679 rtx_insn *compare_seq;
22681 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22683 /* Shortcut: following common codes never translate
22684 into carry flag compares. */
22685 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22686 || code == ORDERED || code == UNORDERED)
22687 return false;
22689 /* These comparisons require zero flag; swap operands so they won't. */
22690 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22691 && !TARGET_IEEE_FP)
22693 std::swap (op0, op1);
22694 code = swap_condition (code);
22697 /* Try to expand the comparison and verify that we end up with
22698 a carry flag based comparison.  This fails to be true only when
22699 we decide to expand the comparison using arithmetic, which is not
22700 a common scenario. */
22701 start_sequence ();
22702 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22703 compare_seq = get_insns ();
22704 end_sequence ();
22706 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
22707 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22708 else
22709 code = GET_CODE (compare_op);
22711 if (code != LTU && code != GEU)
22712 return false;
22714 emit_insn (compare_seq);
22715 *pop = compare_op;
22716 return true;
22719 if (!INTEGRAL_MODE_P (mode))
22720 return false;
22722 switch (code)
22724 case LTU:
22725 case GEU:
22726 break;
22728 /* Convert a==0 into (unsigned)a<1. */
22729 case EQ:
22730 case NE:
22731 if (op1 != const0_rtx)
22732 return false;
22733 op1 = const1_rtx;
22734 code = (code == EQ ? LTU : GEU);
22735 break;
22737 /* Convert a>b into b<a or a>=b-1. */
22738 case GTU:
22739 case LEU:
22740 if (CONST_INT_P (op1))
22742 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22743 /* Bail out on overflow. We still can swap operands but that
22744 would force loading of the constant into register. */
22745 if (op1 == const0_rtx
22746 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22747 return false;
22748 code = (code == GTU ? GEU : LTU);
22750 else
22752 std::swap (op0, op1);
22753 code = (code == GTU ? LTU : GEU);
22755 break;
22757 /* Convert a>=0 into (unsigned)a<0x80000000. */
22758 case LT:
22759 case GE:
22760 if (mode == DImode || op1 != const0_rtx)
22761 return false;
22762 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22763 code = (code == LT ? GEU : LTU);
22764 break;
22765 case LE:
22766 case GT:
22767 if (mode == DImode || op1 != constm1_rtx)
22768 return false;
22769 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22770 code = (code == LE ? GEU : LTU);
22771 break;
22773 default:
22774 return false;
22776 /* Swapping operands may cause constant to appear as first operand. */
22777 if (!nonimmediate_operand (op0, VOIDmode))
22779 if (!can_create_pseudo_p ())
22780 return false;
22781 op0 = force_reg (mode, op0);
22783 *pop = ix86_expand_compare (code, op0, op1);
22784 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22785 return true;
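/* Illustrative sketch (hypothetical helper, not the back end's code) of the
   rewrites the function above applies so that only the carry flag is needed,
   assuming 32-bit int.  */
#include <assert.h>

static void
carry_compare_identities (int a, int b)
{
  /* a == 0  <=>  (unsigned) a < 1.  */
  assert ((a == 0) == ((unsigned) a < 1u));
  /* a >= 0  <=>  (unsigned) a < 0x80000000.  */
  assert ((a >= 0) == ((unsigned) a < 0x80000000u));
  /* Unsigned a > b  <=>  a >= b + 1, valid while b + 1 does not wrap.  */
  if (b != -1)
    assert (((unsigned) a > (unsigned) b)
            == ((unsigned) a >= (unsigned) b + 1u));
}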
22788 bool
22789 ix86_expand_int_movcc (rtx operands[])
22791 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22792 rtx_insn *compare_seq;
22793 rtx compare_op;
22794 machine_mode mode = GET_MODE (operands[0]);
22795 bool sign_bit_compare_p = false;
22796 rtx op0 = XEXP (operands[1], 0);
22797 rtx op1 = XEXP (operands[1], 1);
22799 if (GET_MODE (op0) == TImode
22800 || (GET_MODE (op0) == DImode
22801 && !TARGET_64BIT))
22802 return false;
22804 start_sequence ();
22805 compare_op = ix86_expand_compare (code, op0, op1);
22806 compare_seq = get_insns ();
22807 end_sequence ();
22809 compare_code = GET_CODE (compare_op);
22811 if ((op1 == const0_rtx && (code == GE || code == LT))
22812 || (op1 == constm1_rtx && (code == GT || code == LE)))
22813 sign_bit_compare_p = true;
22815 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22816 HImode insns, we'd be swallowed in word prefix ops. */
22818 if ((mode != HImode || TARGET_FAST_PREFIX)
22819 && (mode != (TARGET_64BIT ? TImode : DImode))
22820 && CONST_INT_P (operands[2])
22821 && CONST_INT_P (operands[3]))
22823 rtx out = operands[0];
22824 HOST_WIDE_INT ct = INTVAL (operands[2]);
22825 HOST_WIDE_INT cf = INTVAL (operands[3]);
22826 HOST_WIDE_INT diff;
22828 diff = ct - cf;
22829 /* Sign bit compares are better done using shifts than by using
22830 sbb. */
22831 if (sign_bit_compare_p
22832 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22834 /* Detect overlap between destination and compare sources. */
22835 rtx tmp = out;
22837 if (!sign_bit_compare_p)
22839 rtx flags;
22840 bool fpcmp = false;
22842 compare_code = GET_CODE (compare_op);
22844 flags = XEXP (compare_op, 0);
22846 if (GET_MODE (flags) == CCFPmode)
22848 fpcmp = true;
22849 compare_code
22850 = ix86_fp_compare_code_to_integer (compare_code);
22853 /* To simplify rest of code, restrict to the GEU case. */
22854 if (compare_code == LTU)
22856 std::swap (ct, cf);
22857 compare_code = reverse_condition (compare_code);
22858 code = reverse_condition (code);
22860 else
22862 if (fpcmp)
22863 PUT_CODE (compare_op,
22864 reverse_condition_maybe_unordered
22865 (GET_CODE (compare_op)));
22866 else
22867 PUT_CODE (compare_op,
22868 reverse_condition (GET_CODE (compare_op)));
22870 diff = ct - cf;
22872 if (reg_overlap_mentioned_p (out, op0)
22873 || reg_overlap_mentioned_p (out, op1))
22874 tmp = gen_reg_rtx (mode);
22876 if (mode == DImode)
22877 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
22878 else
22879 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
22880 flags, compare_op));
22882 else
22884 if (code == GT || code == GE)
22885 code = reverse_condition (code);
22886 else
22888 std::swap (ct, cf);
22889 diff = ct - cf;
22891 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
22894 if (diff == 1)
22897 * cmpl op0,op1
22898 * sbbl dest,dest
22899 * [addl dest, ct]
22901 * Size 5 - 8.
22903 if (ct)
22904 tmp = expand_simple_binop (mode, PLUS,
22905 tmp, GEN_INT (ct),
22906 copy_rtx (tmp), 1, OPTAB_DIRECT);
22908 else if (cf == -1)
22911 * cmpl op0,op1
22912 * sbbl dest,dest
22913 * orl $ct, dest
22915 * Size 8.
22917 tmp = expand_simple_binop (mode, IOR,
22918 tmp, GEN_INT (ct),
22919 copy_rtx (tmp), 1, OPTAB_DIRECT);
22921 else if (diff == -1 && ct)
22924 * cmpl op0,op1
22925 * sbbl dest,dest
22926 * notl dest
22927 * [addl dest, cf]
22929 * Size 8 - 11.
22931 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22932 if (cf)
22933 tmp = expand_simple_binop (mode, PLUS,
22934 copy_rtx (tmp), GEN_INT (cf),
22935 copy_rtx (tmp), 1, OPTAB_DIRECT);
22937 else
22940 * cmpl op0,op1
22941 * sbbl dest,dest
22942 * [notl dest]
22943 * andl cf - ct, dest
22944 * [addl dest, ct]
22946 * Size 8 - 11.
22949 if (cf == 0)
22951 cf = ct;
22952 ct = 0;
22953 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22956 tmp = expand_simple_binop (mode, AND,
22957 copy_rtx (tmp),
22958 gen_int_mode (cf - ct, mode),
22959 copy_rtx (tmp), 1, OPTAB_DIRECT);
22960 if (ct)
22961 tmp = expand_simple_binop (mode, PLUS,
22962 copy_rtx (tmp), GEN_INT (ct),
22963 copy_rtx (tmp), 1, OPTAB_DIRECT);
22966 if (!rtx_equal_p (tmp, out))
22967 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
22969 return true;
22972 if (diff < 0)
22974 machine_mode cmp_mode = GET_MODE (op0);
22975 enum rtx_code new_code;
22977 if (SCALAR_FLOAT_MODE_P (cmp_mode))
22979 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
22981 /* We may be reversing unordered compare to normal compare, that
22982 is not valid in general (we may convert non-trapping condition
22983 to trapping one), however on i386 we currently emit all
22984 comparisons unordered. */
22985 new_code = reverse_condition_maybe_unordered (code);
22987 else
22988 new_code = ix86_reverse_condition (code, cmp_mode);
22989 if (new_code != UNKNOWN)
22991 std::swap (ct, cf);
22992 diff = -diff;
22993 code = new_code;
22997 compare_code = UNKNOWN;
22998 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
22999 && CONST_INT_P (op1))
23001 if (op1 == const0_rtx
23002 && (code == LT || code == GE))
23003 compare_code = code;
23004 else if (op1 == constm1_rtx)
23006 if (code == LE)
23007 compare_code = LT;
23008 else if (code == GT)
23009 compare_code = GE;
23013 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23014 if (compare_code != UNKNOWN
23015 && GET_MODE (op0) == GET_MODE (out)
23016 && (cf == -1 || ct == -1))
23018 /* If lea code below could be used, only optimize
23019 if it results in a 2 insn sequence. */
23021 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23022 || diff == 3 || diff == 5 || diff == 9)
23023 || (compare_code == LT && ct == -1)
23024 || (compare_code == GE && cf == -1))
23027 * notl op1 (if necessary)
23028 * sarl $31, op1
23029 * orl cf, op1
23031 if (ct != -1)
23033 cf = ct;
23034 ct = -1;
23035 code = reverse_condition (code);
23038 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23040 out = expand_simple_binop (mode, IOR,
23041 out, GEN_INT (cf),
23042 out, 1, OPTAB_DIRECT);
23043 if (out != operands[0])
23044 emit_move_insn (operands[0], out);
23046 return true;
23051 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23052 || diff == 3 || diff == 5 || diff == 9)
23053 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23054 && (mode != DImode
23055 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23058 * xorl dest,dest
23059 * cmpl op1,op2
23060 * setcc dest
23061 * lea cf(dest*(ct-cf)),dest
23063 * Size 14.
23065 * This also catches the degenerate setcc-only case.
23068 rtx tmp;
23069 int nops;
23071 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23073 nops = 0;
23074 /* On x86_64 the lea instruction operates on Pmode, so we need
23075 to get the arithmetic done in the proper mode to match. */
23076 if (diff == 1)
23077 tmp = copy_rtx (out);
23078 else
23080 rtx out1;
23081 out1 = copy_rtx (out);
23082 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23083 nops++;
23084 if (diff & 1)
23086 tmp = gen_rtx_PLUS (mode, tmp, out1);
23087 nops++;
23090 if (cf != 0)
23092 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23093 nops++;
23095 if (!rtx_equal_p (tmp, out))
23097 if (nops == 1)
23098 out = force_operand (tmp, copy_rtx (out));
23099 else
23100 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23102 if (!rtx_equal_p (out, operands[0]))
23103 emit_move_insn (operands[0], copy_rtx (out));
23105 return true;
23109 * General case: Jumpful:
23110 * xorl dest,dest cmpl op1, op2
23111 * cmpl op1, op2 movl ct, dest
23112 * setcc dest jcc 1f
23113 * decl dest movl cf, dest
23114 * andl (cf-ct),dest 1:
23115 * addl ct,dest
23117 * Size 20. Size 14.
23119 * This is reasonably steep, but branch mispredict costs are
23120 * high on modern cpus, so consider failing only if optimizing
23121 * for space.
23124 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23125 && BRANCH_COST (optimize_insn_for_speed_p (),
23126 false) >= 2)
23128 if (cf == 0)
23130 machine_mode cmp_mode = GET_MODE (op0);
23131 enum rtx_code new_code;
23133 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23135 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23137 /* We may be reversing unordered compare to normal compare,
23138 that is not valid in general (we may convert non-trapping
23139 condition to trapping one), however on i386 we currently
23140 emit all comparisons unordered. */
23141 new_code = reverse_condition_maybe_unordered (code);
23143 else
23145 new_code = ix86_reverse_condition (code, cmp_mode);
23146 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23147 compare_code = reverse_condition (compare_code);
23150 if (new_code != UNKNOWN)
23152 cf = ct;
23153 ct = 0;
23154 code = new_code;
23158 if (compare_code != UNKNOWN)
23160 /* notl op1 (if needed)
23161 sarl $31, op1
23162 andl (cf-ct), op1
23163 addl ct, op1
23165 For x < 0 (resp. x <= -1) there will be no notl,
23166 so if possible swap the constants to get rid of the
23167 complement.
23168 True/false will be -1/0 while code below (store flag
23169 followed by decrement) is 0/-1, so the constants need
23170 to be exchanged once more. */
23172 if (compare_code == GE || !cf)
23174 code = reverse_condition (code);
23175 compare_code = LT;
23177 else
23178 std::swap (ct, cf);
23180 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23182 else
23184 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23186 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23187 constm1_rtx,
23188 copy_rtx (out), 1, OPTAB_DIRECT);
23191 out = expand_simple_binop (mode, AND, copy_rtx (out),
23192 gen_int_mode (cf - ct, mode),
23193 copy_rtx (out), 1, OPTAB_DIRECT);
23194 if (ct)
23195 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23196 copy_rtx (out), 1, OPTAB_DIRECT);
23197 if (!rtx_equal_p (out, operands[0]))
23198 emit_move_insn (operands[0], copy_rtx (out));
23200 return true;
23204 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23206 /* Try a few things more with specific constants and a variable. */
23208 optab op;
23209 rtx var, orig_out, out, tmp;
23211 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23212 return false;
23214 /* If one of the two operands is an interesting constant, load a
23215 constant with the above and mask it in with a logical operation. */
23217 if (CONST_INT_P (operands[2]))
23219 var = operands[3];
23220 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23221 operands[3] = constm1_rtx, op = and_optab;
23222 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23223 operands[3] = const0_rtx, op = ior_optab;
23224 else
23225 return false;
23227 else if (CONST_INT_P (operands[3]))
23229 var = operands[2];
23230 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23231 operands[2] = constm1_rtx, op = and_optab;
23232 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
23233 operands[2] = const0_rtx, op = ior_optab;
23234 else
23235 return false;
23237 else
23238 return false;
23240 orig_out = operands[0];
23241 tmp = gen_reg_rtx (mode);
23242 operands[0] = tmp;
23244 /* Recurse to get the constant loaded. */
23245 if (!ix86_expand_int_movcc (operands))
23246 return false;
23248 /* Mask in the interesting variable. */
23249 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23250 OPTAB_WIDEN);
23251 if (!rtx_equal_p (out, orig_out))
23252 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23254 return true;
23258 * For comparison with above,
23260 * movl cf,dest
23261 * movl ct,tmp
23262 * cmpl op1,op2
23263 * cmovcc tmp,dest
23265 * Size 15.
23268 if (! nonimmediate_operand (operands[2], mode))
23269 operands[2] = force_reg (mode, operands[2]);
23270 if (! nonimmediate_operand (operands[3], mode))
23271 operands[3] = force_reg (mode, operands[3]);
23273 if (! register_operand (operands[2], VOIDmode)
23274 && (mode == QImode
23275 || ! register_operand (operands[3], VOIDmode)))
23276 operands[2] = force_reg (mode, operands[2]);
23278 if (mode == QImode
23279 && ! register_operand (operands[3], VOIDmode))
23280 operands[3] = force_reg (mode, operands[3]);
23282 emit_insn (compare_seq);
23283 emit_insn (gen_rtx_SET (operands[0],
23284 gen_rtx_IF_THEN_ELSE (mode,
23285 compare_op, operands[2],
23286 operands[3])));
23287 return true;
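/* Illustrative sketch (hypothetical helper, not the back end's code) of the
   branchless select that the sbb-based sequences above implement: MASK is
   all-ones when the condition holds (what "sbbl dest,dest" leaves after the
   compare) and the two constants are blended with AND and ADD instead of a
   branch.  */
#include <stdint.h>

static uint32_t
movcc_const_sketch (uint32_t a, uint32_t b, uint32_t ct, uint32_t cf)
{
  uint32_t mask = - (uint32_t) (a < b);   /* all-ones if a < b, else zero  */
  return cf + (mask & (ct - cf));         /* a < b ? ct : cf  */
}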
23290 /* Swap, force into registers, or otherwise massage the two operands
23291 to an sse comparison with a mask result. Thus we differ a bit from
23292 ix86_prepare_fp_compare_args which expects to produce a flags result.
23294 The DEST operand exists to help determine whether to commute commutative
23295 operators. The POP0/POP1 operands are updated in place. The new
23296 comparison code is returned, or UNKNOWN if not implementable. */
23298 static enum rtx_code
23299 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23300 rtx *pop0, rtx *pop1)
23302 switch (code)
23304 case LTGT:
23305 case UNEQ:
23306 /* AVX supports all the needed comparisons. */
23307 if (TARGET_AVX)
23308 break;
23309 /* We have no LTGT as an operator. We could implement it with
23310 NE & ORDERED, but this requires an extra temporary. It's
23311 not clear that it's worth it. */
23312 return UNKNOWN;
23314 case LT:
23315 case LE:
23316 case UNGT:
23317 case UNGE:
23318 /* These are supported directly. */
23319 break;
23321 case EQ:
23322 case NE:
23323 case UNORDERED:
23324 case ORDERED:
23325 /* AVX has 3 operand comparisons, no need to swap anything. */
23326 if (TARGET_AVX)
23327 break;
23328 /* For commutative operators, try to canonicalize the destination
23329 operand to be first in the comparison - this helps reload to
23330 avoid extra moves. */
23331 if (!dest || !rtx_equal_p (dest, *pop1))
23332 break;
23333 /* FALLTHRU */
23335 case GE:
23336 case GT:
23337 case UNLE:
23338 case UNLT:
23339 /* These are not supported directly before AVX, and furthermore
23340 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23341 comparison operands to transform into something that is
23342 supported. */
23343 std::swap (*pop0, *pop1);
23344 code = swap_condition (code);
23345 break;
23347 default:
23348 gcc_unreachable ();
23351 return code;
23354 /* Detect conditional moves that exactly match min/max operational
23355 semantics. Note that this is IEEE safe, as long as we don't
23356 interchange the operands.
23358 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23359 and TRUE if the operation is successful and instructions are emitted. */
23361 static bool
23362 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23363 rtx cmp_op1, rtx if_true, rtx if_false)
23365 machine_mode mode;
23366 bool is_min;
23367 rtx tmp;
23369 if (code == LT)
23371 else if (code == UNGE)
23372 std::swap (if_true, if_false);
23373 else
23374 return false;
23376 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23377 is_min = true;
23378 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23379 is_min = false;
23380 else
23381 return false;
23383 mode = GET_MODE (dest);
23385 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23386 but MODE may be a vector mode and thus not appropriate. */
23387 if (!flag_finite_math_only || flag_signed_zeros)
23389 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23390 rtvec v;
23392 if_true = force_reg (mode, if_true);
23393 v = gen_rtvec (2, if_true, if_false);
23394 tmp = gen_rtx_UNSPEC (mode, v, u);
23396 else
23398 code = is_min ? SMIN : SMAX;
23399 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23402 emit_insn (gen_rtx_SET (dest, tmp));
23403 return true;
23406 /* Expand an sse vector comparison. Return the register with the result. */
23408 static rtx
23409 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23410 rtx op_true, rtx op_false)
23412 machine_mode mode = GET_MODE (dest);
23413 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23415 /* In the general case the result of a comparison can differ from the operands' mode. */
23416 machine_mode cmp_mode;
23418 /* In AVX512F the result of comparison is an integer mask. */
23419 bool maskcmp = false;
23420 rtx x;
23422 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23424 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23425 cmp_mode = int_mode_for_size (nbits, 0).require ();
23426 maskcmp = true;
23428 else
23429 cmp_mode = cmp_ops_mode;
23432 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23433 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23434 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23436 if (optimize
23437 || (maskcmp && cmp_mode != mode)
23438 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23439 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23440 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23442 /* Compare patterns for int modes are unspec in AVX512F only. */
23443 if (maskcmp && (code == GT || code == EQ))
23445 rtx (*gen)(rtx, rtx, rtx);
23447 switch (cmp_ops_mode)
23449 case E_V64QImode:
23450 gcc_assert (TARGET_AVX512BW);
23451 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23452 break;
23453 case E_V32HImode:
23454 gcc_assert (TARGET_AVX512BW);
23455 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23456 break;
23457 case E_V16SImode:
23458 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23459 break;
23460 case E_V8DImode:
23461 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23462 break;
23463 default:
23464 gen = NULL;
23467 if (gen)
23469 emit_insn (gen (dest, cmp_op0, cmp_op1));
23470 return dest;
23473 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23475 if (cmp_mode != mode && !maskcmp)
23477 x = force_reg (cmp_ops_mode, x);
23478 convert_move (dest, x, false);
23480 else
23481 emit_insn (gen_rtx_SET (dest, x));
23483 return dest;
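/* On pre-AVX512 targets the value returned above is a vector of
   per-element masks, e.g. roughly

     __m128 m = _mm_cmplt_ps (a, b);   -- each lane 0xffffffff or 0x0

   which ix86_expand_sse_movcc then turns into a blend.  For 64-byte
   operands the maskcmp path instead produces a k-register whose integer
   mode has one bit per element.  */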
23486 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23487 operations. This is used for both scalar and vector conditional moves. */
23489 void
23490 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23492 machine_mode mode = GET_MODE (dest);
23493 machine_mode cmpmode = GET_MODE (cmp);
23495 /* In AVX512F the result of comparison is an integer mask. */
23496 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23498 rtx t2, t3, x;
23500 /* If we have an integer mask and an FP value then we need
23501 to cast the mask to FP mode. */
23502 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23504 cmp = force_reg (cmpmode, cmp);
23505 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23508 if (vector_all_ones_operand (op_true, mode)
23509 && rtx_equal_p (op_false, CONST0_RTX (mode))
23510 && !maskcmp)
23512 emit_insn (gen_rtx_SET (dest, cmp));
23514 else if (op_false == CONST0_RTX (mode)
23515 && !maskcmp)
23517 op_true = force_reg (mode, op_true);
23518 x = gen_rtx_AND (mode, cmp, op_true);
23519 emit_insn (gen_rtx_SET (dest, x));
23521 else if (op_true == CONST0_RTX (mode)
23522 && !maskcmp)
23524 op_false = force_reg (mode, op_false);
23525 x = gen_rtx_NOT (mode, cmp);
23526 x = gen_rtx_AND (mode, x, op_false);
23527 emit_insn (gen_rtx_SET (dest, x));
23529 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23530 && !maskcmp)
23532 op_false = force_reg (mode, op_false);
23533 x = gen_rtx_IOR (mode, cmp, op_false);
23534 emit_insn (gen_rtx_SET (dest, x));
23536 else if (TARGET_XOP
23537 && !maskcmp)
23539 op_true = force_reg (mode, op_true);
23541 if (!nonimmediate_operand (op_false, mode))
23542 op_false = force_reg (mode, op_false);
23544 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23545 op_true,
23546 op_false)));
23548 else
23550 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23551 rtx d = dest;
23553 if (!nonimmediate_operand (op_true, mode))
23554 op_true = force_reg (mode, op_true);
23556 op_false = force_reg (mode, op_false);
23558 switch (mode)
23560 case E_V4SFmode:
23561 if (TARGET_SSE4_1)
23562 gen = gen_sse4_1_blendvps;
23563 break;
23564 case E_V2DFmode:
23565 if (TARGET_SSE4_1)
23566 gen = gen_sse4_1_blendvpd;
23567 break;
23568 case E_V16QImode:
23569 case E_V8HImode:
23570 case E_V4SImode:
23571 case E_V2DImode:
23572 if (TARGET_SSE4_1)
23574 gen = gen_sse4_1_pblendvb;
23575 if (mode != V16QImode)
23576 d = gen_reg_rtx (V16QImode);
23577 op_false = gen_lowpart (V16QImode, op_false);
23578 op_true = gen_lowpart (V16QImode, op_true);
23579 cmp = gen_lowpart (V16QImode, cmp);
23581 break;
23582 case E_V8SFmode:
23583 if (TARGET_AVX)
23584 gen = gen_avx_blendvps256;
23585 break;
23586 case E_V4DFmode:
23587 if (TARGET_AVX)
23588 gen = gen_avx_blendvpd256;
23589 break;
23590 case E_V32QImode:
23591 case E_V16HImode:
23592 case E_V8SImode:
23593 case E_V4DImode:
23594 if (TARGET_AVX2)
23596 gen = gen_avx2_pblendvb;
23597 if (mode != V32QImode)
23598 d = gen_reg_rtx (V32QImode);
23599 op_false = gen_lowpart (V32QImode, op_false);
23600 op_true = gen_lowpart (V32QImode, op_true);
23601 cmp = gen_lowpart (V32QImode, cmp);
23603 break;
23605 case E_V64QImode:
23606 gen = gen_avx512bw_blendmv64qi;
23607 break;
23608 case E_V32HImode:
23609 gen = gen_avx512bw_blendmv32hi;
23610 break;
23611 case E_V16SImode:
23612 gen = gen_avx512f_blendmv16si;
23613 break;
23614 case E_V8DImode:
23615 gen = gen_avx512f_blendmv8di;
23616 break;
23617 case E_V8DFmode:
23618 gen = gen_avx512f_blendmv8df;
23619 break;
23620 case E_V16SFmode:
23621 gen = gen_avx512f_blendmv16sf;
23622 break;
23624 default:
23625 break;
23628 if (gen != NULL)
23630 emit_insn (gen (d, op_false, op_true, cmp));
23631 if (d != dest)
23632 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23634 else
23636 op_true = force_reg (mode, op_true);
23638 t2 = gen_reg_rtx (mode);
23639 if (optimize)
23640 t3 = gen_reg_rtx (mode);
23641 else
23642 t3 = dest;
23644 x = gen_rtx_AND (mode, op_true, cmp);
23645 emit_insn (gen_rtx_SET (t2, x));
23647 x = gen_rtx_NOT (mode, cmp);
23648 x = gen_rtx_AND (mode, x, op_false);
23649 emit_insn (gen_rtx_SET (t3, x));
23651 x = gen_rtx_IOR (mode, t3, t2);
23652 emit_insn (gen_rtx_SET (dest, x));
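/* The final fallback above is the classic three-instruction select; as a
   rough intrinsics-level sketch for V4SFmode (illustrative only):

     t2   = _mm_and_ps (cmp, op_true);      -- true lanes
     t3   = _mm_andnot_ps (cmp, op_false);  -- false lanes
     dest = _mm_or_ps (t3, t2);

   SSE4.1 blendv* and the AVX-512 vblendm* patterns handled earlier in the
   switch collapse this into a single instruction when available.  */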
23657 /* Expand a floating-point conditional move. Return true if successful. */
23659 bool
23660 ix86_expand_fp_movcc (rtx operands[])
23662 machine_mode mode = GET_MODE (operands[0]);
23663 enum rtx_code code = GET_CODE (operands[1]);
23664 rtx tmp, compare_op;
23665 rtx op0 = XEXP (operands[1], 0);
23666 rtx op1 = XEXP (operands[1], 1);
23668 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23670 machine_mode cmode;
23672 /* Since we've no cmove for sse registers, don't force bad register
23673 allocation just to gain access to it. Deny movcc when the
23674 comparison mode doesn't match the move mode. */
23675 cmode = GET_MODE (op0);
23676 if (cmode == VOIDmode)
23677 cmode = GET_MODE (op1);
23678 if (cmode != mode)
23679 return false;
23681 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23682 if (code == UNKNOWN)
23683 return false;
23685 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23686 operands[2], operands[3]))
23687 return true;
23689 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23690 operands[2], operands[3]);
23691 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23692 return true;
23695 if (GET_MODE (op0) == TImode
23696 || (GET_MODE (op0) == DImode
23697 && !TARGET_64BIT))
23698 return false;
23700 /* The floating point conditional move instructions don't directly
23701 support conditions resulting from a signed integer comparison. */
23703 compare_op = ix86_expand_compare (code, op0, op1);
23704 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23706 tmp = gen_reg_rtx (QImode);
23707 ix86_expand_setcc (tmp, code, op0, op1);
23709 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23712 emit_insn (gen_rtx_SET (operands[0],
23713 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23714 operands[2], operands[3])));
23716 return true;
23719 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
23721 static int
23722 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23724 switch (code)
23726 case EQ:
23727 return 0;
23728 case LT:
23729 case LTU:
23730 return 1;
23731 case LE:
23732 case LEU:
23733 return 2;
23734 case NE:
23735 return 4;
23736 case GE:
23737 case GEU:
23738 return 5;
23739 case GT:
23740 case GTU:
23741 return 6;
23742 default:
23743 gcc_unreachable ();
23747 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
23749 static int
23750 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23752 switch (code)
23754 case EQ:
23755 return 0x00;
23756 case NE:
23757 return 0x04;
23758 case GT:
23759 return 0x0e;
23760 case LE:
23761 return 0x02;
23762 case GE:
23763 return 0x0d;
23764 case LT:
23765 return 0x01;
23766 case UNLE:
23767 return 0x0a;
23768 case UNLT:
23769 return 0x09;
23770 case UNGE:
23771 return 0x05;
23772 case UNGT:
23773 return 0x06;
23774 case UNEQ:
23775 return 0x18;
23776 case LTGT:
23777 return 0x0c;
23778 case ORDERED:
23779 return 0x07;
23780 case UNORDERED:
23781 return 0x03;
23782 default:
23783 gcc_unreachable ();
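/* These immediates are the VCMPPS/VCMPPD predicate encodings that end up
   in the UNSPEC_PCMP comparison; for instance 0x00 is _CMP_EQ_OQ, 0x0e is
   _CMP_GT_OS, 0x03 is _CMP_UNORD_Q and 0x18 is _CMP_EQ_US, matching the
   EQ, GT, UNORDERED and UNEQ cases above.  */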
23787 /* Return immediate value to be used in UNSPEC_PCMP
23788 for comparison CODE in MODE. */
23790 static int
23791 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23793 if (FLOAT_MODE_P (mode))
23794 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23795 return ix86_int_cmp_code_to_pcmp_immediate (code);
23798 /* Expand AVX-512 vector comparison. */
23800 bool
23801 ix86_expand_mask_vec_cmp (rtx operands[])
23803 machine_mode mask_mode = GET_MODE (operands[0]);
23804 machine_mode cmp_mode = GET_MODE (operands[2]);
23805 enum rtx_code code = GET_CODE (operands[1]);
23806 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23807 int unspec_code;
23808 rtx unspec;
23810 switch (code)
23812 case LEU:
23813 case GTU:
23814 case GEU:
23815 case LTU:
23816 unspec_code = UNSPEC_UNSIGNED_PCMP;
23817 break;
23819 default:
23820 unspec_code = UNSPEC_PCMP;
23823 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23824 operands[3], imm),
23825 unspec_code);
23826 emit_insn (gen_rtx_SET (operands[0], unspec));
23828 return true;
23831 /* Expand fp vector comparison. */
23833 bool
23834 ix86_expand_fp_vec_cmp (rtx operands[])
23836 enum rtx_code code = GET_CODE (operands[1]);
23837 rtx cmp;
23839 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23840 &operands[2], &operands[3]);
23841 if (code == UNKNOWN)
23843 rtx temp;
23844 switch (GET_CODE (operands[1]))
23846 case LTGT:
23847 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23848 operands[3], NULL, NULL);
23849 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23850 operands[3], NULL, NULL);
23851 code = AND;
23852 break;
23853 case UNEQ:
23854 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23855 operands[3], NULL, NULL);
23856 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23857 operands[3], NULL, NULL);
23858 code = IOR;
23859 break;
23860 default:
23861 gcc_unreachable ();
23863 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23864 OPTAB_DIRECT);
23866 else
23867 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
23868 operands[1], operands[2]);
23870 if (operands[0] != cmp)
23871 emit_move_insn (operands[0], cmp);
23873 return true;
23876 static rtx
23877 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
23878 rtx op_true, rtx op_false, bool *negate)
23880 machine_mode data_mode = GET_MODE (dest);
23881 machine_mode mode = GET_MODE (cop0);
23882 rtx x;
23884 *negate = false;
23886 /* XOP supports all of the comparisons on all 128-bit vector int types. */
23887 if (TARGET_XOP
23888 && (mode == V16QImode || mode == V8HImode
23889 || mode == V4SImode || mode == V2DImode))
23891 else
23893 /* Canonicalize the comparison to EQ, GT, GTU. */
23894 switch (code)
23896 case EQ:
23897 case GT:
23898 case GTU:
23899 break;
23901 case NE:
23902 case LE:
23903 case LEU:
23904 code = reverse_condition (code);
23905 *negate = true;
23906 break;
23908 case GE:
23909 case GEU:
23910 code = reverse_condition (code);
23911 *negate = true;
23912 /* FALLTHRU */
23914 case LT:
23915 case LTU:
23916 std::swap (cop0, cop1);
23917 code = swap_condition (code);
23918 break;
23920 default:
23921 gcc_unreachable ();
23924 /* Only SSE4.1/SSE4.2 supports V2DImode. */
23925 if (mode == V2DImode)
23927 switch (code)
23929 case EQ:
23930 /* SSE4.1 supports EQ. */
23931 if (!TARGET_SSE4_1)
23932 return NULL;
23933 break;
23935 case GT:
23936 case GTU:
23937 /* SSE4.2 supports GT/GTU. */
23938 if (!TARGET_SSE4_2)
23939 return NULL;
23940 break;
23942 default:
23943 gcc_unreachable ();
23947 /* Unsigned parallel compare is not supported by the hardware.
23948 Play some tricks to turn this into a signed comparison
23949 that the hardware does support. */
23950 if (code == GTU)
23952 cop0 = force_reg (mode, cop0);
23954 switch (mode)
23956 case E_V16SImode:
23957 case E_V8DImode:
23958 case E_V8SImode:
23959 case E_V4DImode:
23960 case E_V4SImode:
23961 case E_V2DImode:
23963 rtx t1, t2, mask;
23964 rtx (*gen_sub3) (rtx, rtx, rtx);
23966 switch (mode)
23968 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
23969 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
23970 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
23971 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
23972 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
23973 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
23974 default:
23975 gcc_unreachable ();
23977 /* Subtract (-(INT MAX) - 1) from both operands to make
23978 them signed. */
23979 mask = ix86_build_signbit_mask (mode, true, false);
23980 t1 = gen_reg_rtx (mode);
23981 emit_insn (gen_sub3 (t1, cop0, mask));
23983 t2 = gen_reg_rtx (mode);
23984 emit_insn (gen_sub3 (t2, cop1, mask));
23986 cop0 = t1;
23987 cop1 = t2;
23988 code = GT;
23990 break;
23992 case E_V64QImode:
23993 case E_V32HImode:
23994 case E_V32QImode:
23995 case E_V16HImode:
23996 case E_V16QImode:
23997 case E_V8HImode:
23998 /* Perform a parallel unsigned saturating subtraction. */
23999 x = gen_reg_rtx (mode);
24000 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24001 cop1)));
24003 cop0 = x;
24004 cop1 = CONST0_RTX (mode);
24005 code = EQ;
24006 *negate = !*negate;
24007 break;
24009 default:
24010 gcc_unreachable ();
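/* A worked example of the unsigned-compare tricks above (a sketch of the
   rtl the expander builds, written as scalar pseudo-code):

     dword/qword path:  a >u b   iff   (a - 0x80000000) >s (b - 0x80000000)
                        -- bias both operands, then PCMPGTD/PCMPGTQ

     byte/word path:    a >u b   iff   (a -us b) != 0
                        -- PSUBUSB/PSUBUSW, compare EQ against zero, and
                           let *negate invert the result.  */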
24015 if (*negate)
24016 std::swap (op_true, op_false);
24018 /* Allow the comparison to be done in one mode, but the movcc to
24019 happen in another mode. */
24020 if (data_mode == mode)
24022 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24023 op_true, op_false);
24025 else
24027 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24028 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24029 op_true, op_false);
24030 if (GET_MODE (x) == mode)
24031 x = gen_lowpart (data_mode, x);
24034 return x;
24037 /* Expand integer vector comparison. */
24039 bool
24040 ix86_expand_int_vec_cmp (rtx operands[])
24042 rtx_code code = GET_CODE (operands[1]);
24043 bool negate = false;
24044 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24045 operands[3], NULL, NULL, &negate);
24047 if (!cmp)
24048 return false;
24050 if (negate)
24051 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24052 CONST0_RTX (GET_MODE (cmp)),
24053 NULL, NULL, &negate);
24055 gcc_assert (!negate);
24057 if (operands[0] != cmp)
24058 emit_move_insn (operands[0], cmp);
24060 return true;
24063 /* Expand a floating-point vector conditional move; a vcond operation
24064 rather than a movcc operation. */
24066 bool
24067 ix86_expand_fp_vcond (rtx operands[])
24069 enum rtx_code code = GET_CODE (operands[3]);
24070 rtx cmp;
24072 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24073 &operands[4], &operands[5]);
24074 if (code == UNKNOWN)
24076 rtx temp;
24077 switch (GET_CODE (operands[3]))
24079 case LTGT:
24080 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24081 operands[5], operands[0], operands[0]);
24082 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24083 operands[5], operands[1], operands[2]);
24084 code = AND;
24085 break;
24086 case UNEQ:
24087 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24088 operands[5], operands[0], operands[0]);
24089 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24090 operands[5], operands[1], operands[2]);
24091 code = IOR;
24092 break;
24093 default:
24094 gcc_unreachable ();
24096 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24097 OPTAB_DIRECT);
24098 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24099 return true;
24102 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24103 operands[5], operands[1], operands[2]))
24104 return true;
24106 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24107 operands[1], operands[2]);
24108 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24109 return true;
24112 /* Expand a signed/unsigned integral vector conditional move. */
24114 bool
24115 ix86_expand_int_vcond (rtx operands[])
24117 machine_mode data_mode = GET_MODE (operands[0]);
24118 machine_mode mode = GET_MODE (operands[4]);
24119 enum rtx_code code = GET_CODE (operands[3]);
24120 bool negate = false;
24121 rtx x, cop0, cop1;
24123 cop0 = operands[4];
24124 cop1 = operands[5];
24126 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24127 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24128 if ((code == LT || code == GE)
24129 && data_mode == mode
24130 && cop1 == CONST0_RTX (mode)
24131 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24132 && GET_MODE_UNIT_SIZE (data_mode) > 1
24133 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24134 && (GET_MODE_SIZE (data_mode) == 16
24135 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24137 rtx negop = operands[2 - (code == LT)];
24138 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24139 if (negop == CONST1_RTX (data_mode))
24141 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24142 operands[0], 1, OPTAB_DIRECT);
24143 if (res != operands[0])
24144 emit_move_insn (operands[0], res);
24145 return true;
24147 else if (GET_MODE_INNER (data_mode) != DImode
24148 && vector_all_ones_operand (negop, data_mode))
24150 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24151 operands[0], 0, OPTAB_DIRECT);
24152 if (res != operands[0])
24153 emit_move_insn (operands[0], res);
24154 return true;
24158 if (!nonimmediate_operand (cop1, mode))
24159 cop1 = force_reg (mode, cop1);
24160 if (!general_operand (operands[1], data_mode))
24161 operands[1] = force_reg (data_mode, operands[1]);
24162 if (!general_operand (operands[2], data_mode))
24163 operands[2] = force_reg (data_mode, operands[2]);
24165 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24166 operands[1], operands[2], &negate);
24168 if (!x)
24169 return false;
24171 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24172 operands[2-negate]);
24173 return true;
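/* The early special case above means e.g. "v < 0 ? -1 : 0" on V8SImode
   needs no compare at all: it becomes an arithmetic shift of the sign bit,
   roughly

     r = v >> 31;              -- vpsrad $31

   and "v < 0 ? 1 : 0" a logical shift,

     r = (unsigned) v >> 31;   -- vpsrld $31

   (a sketch; the actual shift count is GET_MODE_UNIT_BITSIZE - 1).  */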
24176 /* AVX512F does support 64-byte integer vector operations,
24177 thus the longest vector we are faced with is V64QImode. */
24178 #define MAX_VECT_LEN 64
24180 struct expand_vec_perm_d
24182 rtx target, op0, op1;
24183 unsigned char perm[MAX_VECT_LEN];
24184 machine_mode vmode;
24185 unsigned char nelt;
24186 bool one_operand_p;
24187 bool testing_p;
24190 static bool
24191 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24192 struct expand_vec_perm_d *d)
24194 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24195 expander, so args are either in d, or in op0, op1 etc. */
24196 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24197 machine_mode maskmode = mode;
24198 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24200 switch (mode)
24202 case E_V8HImode:
24203 if (TARGET_AVX512VL && TARGET_AVX512BW)
24204 gen = gen_avx512vl_vpermt2varv8hi3;
24205 break;
24206 case E_V16HImode:
24207 if (TARGET_AVX512VL && TARGET_AVX512BW)
24208 gen = gen_avx512vl_vpermt2varv16hi3;
24209 break;
24210 case E_V64QImode:
24211 if (TARGET_AVX512VBMI)
24212 gen = gen_avx512bw_vpermt2varv64qi3;
24213 break;
24214 case E_V32HImode:
24215 if (TARGET_AVX512BW)
24216 gen = gen_avx512bw_vpermt2varv32hi3;
24217 break;
24218 case E_V4SImode:
24219 if (TARGET_AVX512VL)
24220 gen = gen_avx512vl_vpermt2varv4si3;
24221 break;
24222 case E_V8SImode:
24223 if (TARGET_AVX512VL)
24224 gen = gen_avx512vl_vpermt2varv8si3;
24225 break;
24226 case E_V16SImode:
24227 if (TARGET_AVX512F)
24228 gen = gen_avx512f_vpermt2varv16si3;
24229 break;
24230 case E_V4SFmode:
24231 if (TARGET_AVX512VL)
24233 gen = gen_avx512vl_vpermt2varv4sf3;
24234 maskmode = V4SImode;
24236 break;
24237 case E_V8SFmode:
24238 if (TARGET_AVX512VL)
24240 gen = gen_avx512vl_vpermt2varv8sf3;
24241 maskmode = V8SImode;
24243 break;
24244 case E_V16SFmode:
24245 if (TARGET_AVX512F)
24247 gen = gen_avx512f_vpermt2varv16sf3;
24248 maskmode = V16SImode;
24250 break;
24251 case E_V2DImode:
24252 if (TARGET_AVX512VL)
24253 gen = gen_avx512vl_vpermt2varv2di3;
24254 break;
24255 case E_V4DImode:
24256 if (TARGET_AVX512VL)
24257 gen = gen_avx512vl_vpermt2varv4di3;
24258 break;
24259 case E_V8DImode:
24260 if (TARGET_AVX512F)
24261 gen = gen_avx512f_vpermt2varv8di3;
24262 break;
24263 case E_V2DFmode:
24264 if (TARGET_AVX512VL)
24266 gen = gen_avx512vl_vpermt2varv2df3;
24267 maskmode = V2DImode;
24269 break;
24270 case E_V4DFmode:
24271 if (TARGET_AVX512VL)
24273 gen = gen_avx512vl_vpermt2varv4df3;
24274 maskmode = V4DImode;
24276 break;
24277 case E_V8DFmode:
24278 if (TARGET_AVX512F)
24280 gen = gen_avx512f_vpermt2varv8df3;
24281 maskmode = V8DImode;
24283 break;
24284 default:
24285 break;
24288 if (gen == NULL)
24289 return false;
24291 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24292 expander, so args are either in d, or in op0, op1 etc. */
24293 if (d)
24295 rtx vec[64];
24296 target = d->target;
24297 op0 = d->op0;
24298 op1 = d->op1;
24299 for (int i = 0; i < d->nelt; ++i)
24300 vec[i] = GEN_INT (d->perm[i]);
24301 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24304 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24305 return true;
24308 /* Expand a variable vector permutation. */
24310 void
24311 ix86_expand_vec_perm (rtx operands[])
24313 rtx target = operands[0];
24314 rtx op0 = operands[1];
24315 rtx op1 = operands[2];
24316 rtx mask = operands[3];
24317 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24318 machine_mode mode = GET_MODE (op0);
24319 machine_mode maskmode = GET_MODE (mask);
24320 int w, e, i;
24321 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24323 /* Number of elements in the vector. */
24324 w = GET_MODE_NUNITS (mode);
24325 e = GET_MODE_UNIT_SIZE (mode);
24326 gcc_assert (w <= 64);
24328 if (TARGET_AVX512F && one_operand_shuffle)
24330 rtx (*gen) (rtx, rtx, rtx) = NULL;
24331 switch (mode)
24333 case E_V16SImode:
24334 gen = gen_avx512f_permvarv16si;
24335 break;
24336 case E_V16SFmode:
24337 gen = gen_avx512f_permvarv16sf;
24338 break;
24339 case E_V8DImode:
24340 gen = gen_avx512f_permvarv8di;
24341 break;
24342 case E_V8DFmode:
24343 gen = gen_avx512f_permvarv8df;
24344 break;
24345 default:
24346 break;
24348 if (gen != NULL)
24350 emit_insn (gen (target, op0, mask));
24351 return;
24355 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24356 return;
24358 if (TARGET_AVX2)
24360 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24362 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24363 a constant shuffle operand. With a tiny bit of effort we can
24364 use VPERMD instead. A re-interpretation stall for V4DFmode is
24365 unfortunate but there's no avoiding it.
24366 Similarly for V16HImode we don't have instructions for variable
24367 shuffling, while for V32QImode we can, after preparing suitable
24368 masks, use vpshufb; vpshufb; vpermq; vpor. */
24370 if (mode == V16HImode)
24372 maskmode = mode = V32QImode;
24373 w = 32;
24374 e = 1;
24376 else
24378 maskmode = mode = V8SImode;
24379 w = 8;
24380 e = 4;
24382 t1 = gen_reg_rtx (maskmode);
24384 /* Replicate the low bits of the V4DImode mask into V8SImode:
24385 mask = { A B C D }
24386 t1 = { A A B B C C D D }. */
24387 for (i = 0; i < w / 2; ++i)
24388 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24389 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24390 vt = force_reg (maskmode, vt);
24391 mask = gen_lowpart (maskmode, mask);
24392 if (maskmode == V8SImode)
24393 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24394 else
24395 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24397 /* Multiply the shuffle indices by two. */
24398 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24399 OPTAB_DIRECT);
24401 /* Add one to the odd shuffle indices:
24402 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24403 for (i = 0; i < w / 2; ++i)
24405 vec[i * 2] = const0_rtx;
24406 vec[i * 2 + 1] = const1_rtx;
24408 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24409 vt = validize_mem (force_const_mem (maskmode, vt));
24410 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24411 OPTAB_DIRECT);
24413 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24414 operands[3] = mask = t1;
24415 target = gen_reg_rtx (mode);
24416 op0 = gen_lowpart (mode, op0);
24417 op1 = gen_lowpart (mode, op1);
24420 switch (mode)
24422 case E_V8SImode:
24423 /* The VPERMD and VPERMPS instructions already properly ignore
24424 the high bits of the shuffle elements. No need for us to
24425 perform an AND ourselves. */
24426 if (one_operand_shuffle)
24428 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24429 if (target != operands[0])
24430 emit_move_insn (operands[0],
24431 gen_lowpart (GET_MODE (operands[0]), target));
24433 else
24435 t1 = gen_reg_rtx (V8SImode);
24436 t2 = gen_reg_rtx (V8SImode);
24437 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24438 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24439 goto merge_two;
24441 return;
24443 case E_V8SFmode:
24444 mask = gen_lowpart (V8SImode, mask);
24445 if (one_operand_shuffle)
24446 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24447 else
24449 t1 = gen_reg_rtx (V8SFmode);
24450 t2 = gen_reg_rtx (V8SFmode);
24451 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24452 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24453 goto merge_two;
24455 return;
24457 case E_V4SImode:
24458 /* By combining the two 128-bit input vectors into one 256-bit
24459 input vector, we can use VPERMD and VPERMPS for the full
24460 two-operand shuffle. */
24461 t1 = gen_reg_rtx (V8SImode);
24462 t2 = gen_reg_rtx (V8SImode);
24463 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24464 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24465 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24466 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24467 return;
24469 case E_V4SFmode:
24470 t1 = gen_reg_rtx (V8SFmode);
24471 t2 = gen_reg_rtx (V8SImode);
24472 mask = gen_lowpart (V4SImode, mask);
24473 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24474 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24475 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24476 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24477 return;
24479 case E_V32QImode:
24480 t1 = gen_reg_rtx (V32QImode);
24481 t2 = gen_reg_rtx (V32QImode);
24482 t3 = gen_reg_rtx (V32QImode);
24483 vt2 = GEN_INT (-128);
24484 vt = gen_const_vec_duplicate (V32QImode, vt2);
24485 vt = force_reg (V32QImode, vt);
24486 for (i = 0; i < 32; i++)
24487 vec[i] = i < 16 ? vt2 : const0_rtx;
24488 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24489 vt2 = force_reg (V32QImode, vt2);
24490 /* From mask create two adjusted masks, which contain the same
24491 bits as mask in the low 7 bits of each vector element.
24492 The first mask will have the most significant bit clear
24493 if it requests element from the same 128-bit lane
24494 and MSB set if it requests element from the other 128-bit lane.
24495 The second mask will have the opposite values of the MSB,
24496 and additionally will have its 128-bit lanes swapped.
24497 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24498 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24499 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24500 stands for other 12 bytes. */
24501 /* The bit that tells whether an element comes from the same lane or the
24502 other lane is bit 4, so shift it up by 3 to the MSB position. */
24503 t5 = gen_reg_rtx (V4DImode);
24504 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24505 GEN_INT (3)));
24506 /* Clear MSB bits from the mask just in case it had them set. */
24507 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24508 /* After this t1 will have MSB set for elements from other lane. */
24509 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24510 /* Clear bits other than MSB. */
24511 emit_insn (gen_andv32qi3 (t1, t1, vt));
24512 /* Or in the lower bits from mask into t3. */
24513 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24514 /* And invert MSB bits in t1, so MSB is set for elements from the same
24515 lane. */
24516 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24517 /* Swap 128-bit lanes in t3. */
24518 t6 = gen_reg_rtx (V4DImode);
24519 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24520 const2_rtx, GEN_INT (3),
24521 const0_rtx, const1_rtx));
24522 /* And or in the lower bits from mask into t1. */
24523 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24524 if (one_operand_shuffle)
24526 /* Each of these shuffles will put 0s in places where an
24527 element from the other 128-bit lane is needed; otherwise
24528 it will shuffle in the requested value. */
24529 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24530 gen_lowpart (V32QImode, t6)));
24531 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24532 /* For t3 the 128-bit lanes are swapped again. */
24533 t7 = gen_reg_rtx (V4DImode);
24534 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24535 const2_rtx, GEN_INT (3),
24536 const0_rtx, const1_rtx));
24537 /* And ORing both together produces the result. */
24538 emit_insn (gen_iorv32qi3 (target, t1,
24539 gen_lowpart (V32QImode, t7)));
24540 if (target != operands[0])
24541 emit_move_insn (operands[0],
24542 gen_lowpart (GET_MODE (operands[0]), target));
24543 return;
24546 t4 = gen_reg_rtx (V32QImode);
24547 /* Similar to the above one_operand_shuffle code,
24548 just repeated twice for each operand. The merge_two:
24549 code below will merge the two results together. */
24550 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24551 gen_lowpart (V32QImode, t6)));
24552 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24553 gen_lowpart (V32QImode, t6)));
24554 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24555 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24556 t7 = gen_reg_rtx (V4DImode);
24557 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24558 const2_rtx, GEN_INT (3),
24559 const0_rtx, const1_rtx));
24560 t8 = gen_reg_rtx (V4DImode);
24561 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24562 const2_rtx, GEN_INT (3),
24563 const0_rtx, const1_rtx));
24564 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24565 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24566 t1 = t4;
24567 t2 = t3;
24568 goto merge_two;
24570 default:
24571 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24572 break;
24576 if (TARGET_XOP)
24578 /* The XOP VPPERM insn supports three inputs. By ignoring the
24579 one_operand_shuffle special case, we avoid creating another
24580 set of constant vectors in memory. */
24581 one_operand_shuffle = false;
24583 /* mask = mask & {2*w-1, ...} */
24584 vt = GEN_INT (2*w - 1);
24586 else
24588 /* mask = mask & {w-1, ...} */
24589 vt = GEN_INT (w - 1);
24592 vt = gen_const_vec_duplicate (maskmode, vt);
24593 mask = expand_simple_binop (maskmode, AND, mask, vt,
24594 NULL_RTX, 0, OPTAB_DIRECT);
24596 /* For non-QImode operations, convert the word permutation control
24597 into a byte permutation control. */
24598 if (mode != V16QImode)
24600 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24601 GEN_INT (exact_log2 (e)),
24602 NULL_RTX, 0, OPTAB_DIRECT);
24604 /* Convert mask to vector of chars. */
24605 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24607 /* Replicate each of the input bytes into byte positions:
24608 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24609 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24610 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24611 for (i = 0; i < 16; ++i)
24612 vec[i] = GEN_INT (i/e * e);
24613 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24614 vt = validize_mem (force_const_mem (V16QImode, vt));
24615 if (TARGET_XOP)
24616 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24617 else
24618 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24620 /* Convert it into the byte positions by doing
24621 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24622 for (i = 0; i < 16; ++i)
24623 vec[i] = GEN_INT (i % e);
24624 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24625 vt = validize_mem (force_const_mem (V16QImode, vt));
24626 emit_insn (gen_addv16qi3 (mask, mask, vt));
24629 /* The actual shuffle operations all operate on V16QImode. */
24630 op0 = gen_lowpart (V16QImode, op0);
24631 op1 = gen_lowpart (V16QImode, op1);
24633 if (TARGET_XOP)
24635 if (GET_MODE (target) != V16QImode)
24636 target = gen_reg_rtx (V16QImode);
24637 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24638 if (target != operands[0])
24639 emit_move_insn (operands[0],
24640 gen_lowpart (GET_MODE (operands[0]), target));
24642 else if (one_operand_shuffle)
24644 if (GET_MODE (target) != V16QImode)
24645 target = gen_reg_rtx (V16QImode);
24646 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24647 if (target != operands[0])
24648 emit_move_insn (operands[0],
24649 gen_lowpart (GET_MODE (operands[0]), target));
24651 else
24653 rtx xops[6];
24654 bool ok;
24656 /* Shuffle the two input vectors independently. */
24657 t1 = gen_reg_rtx (V16QImode);
24658 t2 = gen_reg_rtx (V16QImode);
24659 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24660 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24662 merge_two:
24663 /* Then merge them together. The key is whether any given control
24664 element contained a bit set that indicates the second word. */
24665 mask = operands[3];
24666 vt = GEN_INT (w);
24667 if (maskmode == V2DImode && !TARGET_SSE4_1)
24669 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24670 more shuffle to convert the V2DI input mask into a V4SI
24671 input mask, at which point the masking that expand_int_vcond
24672 performs will work as desired. */
24673 rtx t3 = gen_reg_rtx (V4SImode);
24674 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24675 const0_rtx, const0_rtx,
24676 const2_rtx, const2_rtx));
24677 mask = t3;
24678 maskmode = V4SImode;
24679 e = w = 4;
24682 vt = gen_const_vec_duplicate (maskmode, vt);
24683 vt = force_reg (maskmode, vt);
24684 mask = expand_simple_binop (maskmode, AND, mask, vt,
24685 NULL_RTX, 0, OPTAB_DIRECT);
24687 if (GET_MODE (target) != mode)
24688 target = gen_reg_rtx (mode);
24689 xops[0] = target;
24690 xops[1] = gen_lowpart (mode, t2);
24691 xops[2] = gen_lowpart (mode, t1);
24692 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24693 xops[4] = mask;
24694 xops[5] = vt;
24695 ok = ix86_expand_int_vcond (xops);
24696 gcc_assert (ok);
24697 if (target != operands[0])
24698 emit_move_insn (operands[0],
24699 gen_lowpart (GET_MODE (operands[0]), target));
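/* At the SSSE3 level the tail of this function reduces the variable
   permutation to byte shuffles plus a select; a rough sketch of the
   two-operand case (illustrative, using the byte mask built above):

     t1  = pshufb (op0, mask);                 -- gather from first input
     t2  = pshufb (op1, mask);                 -- gather from second input
     sel = per-element mask of (index & w) == w
     dest = sel ? t2 : t1                      -- the merge_two vcond

   XOP's vpperm performs the whole three-input selection in one insn.  */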
24703 /* Unpack SRC into the next wider integer vector type in DEST. UNSIGNED_P is
24704 true if we should do zero extension, else sign extension. HIGH_P is
24705 true if we want the N/2 high elements, else the low elements. */
24707 void
24708 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24710 machine_mode imode = GET_MODE (src);
24711 rtx tmp;
24713 if (TARGET_SSE4_1)
24715 rtx (*unpack)(rtx, rtx);
24716 rtx (*extract)(rtx, rtx) = NULL;
24717 machine_mode halfmode = BLKmode;
24719 switch (imode)
24721 case E_V64QImode:
24722 if (unsigned_p)
24723 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24724 else
24725 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24726 halfmode = V32QImode;
24727 extract
24728 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24729 break;
24730 case E_V32QImode:
24731 if (unsigned_p)
24732 unpack = gen_avx2_zero_extendv16qiv16hi2;
24733 else
24734 unpack = gen_avx2_sign_extendv16qiv16hi2;
24735 halfmode = V16QImode;
24736 extract
24737 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24738 break;
24739 case E_V32HImode:
24740 if (unsigned_p)
24741 unpack = gen_avx512f_zero_extendv16hiv16si2;
24742 else
24743 unpack = gen_avx512f_sign_extendv16hiv16si2;
24744 halfmode = V16HImode;
24745 extract
24746 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24747 break;
24748 case E_V16HImode:
24749 if (unsigned_p)
24750 unpack = gen_avx2_zero_extendv8hiv8si2;
24751 else
24752 unpack = gen_avx2_sign_extendv8hiv8si2;
24753 halfmode = V8HImode;
24754 extract
24755 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24756 break;
24757 case E_V16SImode:
24758 if (unsigned_p)
24759 unpack = gen_avx512f_zero_extendv8siv8di2;
24760 else
24761 unpack = gen_avx512f_sign_extendv8siv8di2;
24762 halfmode = V8SImode;
24763 extract
24764 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24765 break;
24766 case E_V8SImode:
24767 if (unsigned_p)
24768 unpack = gen_avx2_zero_extendv4siv4di2;
24769 else
24770 unpack = gen_avx2_sign_extendv4siv4di2;
24771 halfmode = V4SImode;
24772 extract
24773 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24774 break;
24775 case E_V16QImode:
24776 if (unsigned_p)
24777 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24778 else
24779 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24780 break;
24781 case E_V8HImode:
24782 if (unsigned_p)
24783 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24784 else
24785 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24786 break;
24787 case E_V4SImode:
24788 if (unsigned_p)
24789 unpack = gen_sse4_1_zero_extendv2siv2di2;
24790 else
24791 unpack = gen_sse4_1_sign_extendv2siv2di2;
24792 break;
24793 default:
24794 gcc_unreachable ();
24797 if (GET_MODE_SIZE (imode) >= 32)
24799 tmp = gen_reg_rtx (halfmode);
24800 emit_insn (extract (tmp, src));
24802 else if (high_p)
24804 /* Shift higher 8 bytes to lower 8 bytes. */
24805 tmp = gen_reg_rtx (V1TImode);
24806 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24807 GEN_INT (64)));
24808 tmp = gen_lowpart (imode, tmp);
24810 else
24811 tmp = src;
24813 emit_insn (unpack (dest, tmp));
24815 else
24817 rtx (*unpack)(rtx, rtx, rtx);
24819 switch (imode)
24821 case E_V16QImode:
24822 if (high_p)
24823 unpack = gen_vec_interleave_highv16qi;
24824 else
24825 unpack = gen_vec_interleave_lowv16qi;
24826 break;
24827 case E_V8HImode:
24828 if (high_p)
24829 unpack = gen_vec_interleave_highv8hi;
24830 else
24831 unpack = gen_vec_interleave_lowv8hi;
24832 break;
24833 case E_V4SImode:
24834 if (high_p)
24835 unpack = gen_vec_interleave_highv4si;
24836 else
24837 unpack = gen_vec_interleave_lowv4si;
24838 break;
24839 default:
24840 gcc_unreachable ();
24843 if (unsigned_p)
24844 tmp = force_reg (imode, CONST0_RTX (imode));
24845 else
24846 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24847 src, pc_rtx, pc_rtx);
24849 rtx tmp2 = gen_reg_rtx (imode);
24850 emit_insn (unpack (tmp2, src, tmp));
24851 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
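/* The pre-SSE4.1 branch above synthesizes the widening from an interleave:
   the second interleave operand is either zero (zero extension) or a
   vector of sign copies obtained from the GT comparison against SRC.  For
   V16QImode, roughly:

     sign = (0 > src) per byte;            -- pcmpgtb
     dest = interleave_low (src, sign);    -- punpcklbw, giving V8HImode

   which mirrors what pmovsxbw/pmovzxbw do directly on SSE4.1 targets.  */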
24855 /* Expand conditional increment or decrement using adc/sbb instructions.
24856 The default case using setcc followed by the conditional move can be
24857 done by generic code. */
24858 bool
24859 ix86_expand_int_addcc (rtx operands[])
24861 enum rtx_code code = GET_CODE (operands[1]);
24862 rtx flags;
24863 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
24864 rtx compare_op;
24865 rtx val = const0_rtx;
24866 bool fpcmp = false;
24867 machine_mode mode;
24868 rtx op0 = XEXP (operands[1], 0);
24869 rtx op1 = XEXP (operands[1], 1);
24871 if (operands[3] != const1_rtx
24872 && operands[3] != constm1_rtx)
24873 return false;
24874 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24875 return false;
24876 code = GET_CODE (compare_op);
24878 flags = XEXP (compare_op, 0);
24880 if (GET_MODE (flags) == CCFPmode)
24882 fpcmp = true;
24883 code = ix86_fp_compare_code_to_integer (code);
24886 if (code != LTU)
24888 val = constm1_rtx;
24889 if (fpcmp)
24890 PUT_CODE (compare_op,
24891 reverse_condition_maybe_unordered
24892 (GET_CODE (compare_op)));
24893 else
24894 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
24897 mode = GET_MODE (operands[0]);
24899 /* Construct either adc or sbb insn. */
24900 if ((code == LTU) == (operands[3] == constm1_rtx))
24902 switch (mode)
24904 case E_QImode:
24905 insn = gen_subqi3_carry;
24906 break;
24907 case E_HImode:
24908 insn = gen_subhi3_carry;
24909 break;
24910 case E_SImode:
24911 insn = gen_subsi3_carry;
24912 break;
24913 case E_DImode:
24914 insn = gen_subdi3_carry;
24915 break;
24916 default:
24917 gcc_unreachable ();
24920 else
24922 switch (mode)
24924 case E_QImode:
24925 insn = gen_addqi3_carry;
24926 break;
24927 case E_HImode:
24928 insn = gen_addhi3_carry;
24929 break;
24930 case E_SImode:
24931 insn = gen_addsi3_carry;
24932 break;
24933 case E_DImode:
24934 insn = gen_adddi3_carry;
24935 break;
24936 default:
24937 gcc_unreachable ();
24940 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
24942 return true;
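/* A typical use of this adc/sbb expansion: with unsigned operands,

     x += (a < b);       roughly:  cmpl %ebx, %eax
                                   adcl $0, %ecx

   i.e. the LTU comparison leaves its result in the carry flag and the
   conditional increment folds into an add-with-carry, avoiding a
   setcc/cmov sequence (register choices here are only illustrative).  */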
24946 /* Split OPERAND into parts stored in PARTS, similar to split_double_mode,
24947 but working for floating point parameters and non-offsettable memories.
24948 For pushes, it returns just stack offsets; the values will be saved
24949 in the right order. At most four parts are generated. */
24951 static int
24952 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
24954 int size;
24956 if (!TARGET_64BIT)
24957 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
24958 else
24959 size = (GET_MODE_SIZE (mode) + 4) / 8;
24961 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
24962 gcc_assert (size >= 2 && size <= 4);
24964 /* Optimize constant pool references to immediates. This is used by fp
24965 moves, which force all constants to memory to allow combining. */
24966 if (MEM_P (operand) && MEM_READONLY_P (operand))
24967 operand = avoid_constant_pool_reference (operand);
24969 if (MEM_P (operand) && !offsettable_memref_p (operand))
24971 /* The only non-offsettable memories we handle are pushes. */
24972 int ok = push_operand (operand, VOIDmode);
24974 gcc_assert (ok);
24976 operand = copy_rtx (operand);
24977 PUT_MODE (operand, word_mode);
24978 parts[0] = parts[1] = parts[2] = parts[3] = operand;
24979 return size;
24982 if (GET_CODE (operand) == CONST_VECTOR)
24984 scalar_int_mode imode = int_mode_for_mode (mode).require ();
24985 /* Caution: if we looked through a constant pool memory above,
24986 the operand may actually have a different mode now. That's
24987 ok, since we want to pun this all the way back to an integer. */
24988 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
24989 gcc_assert (operand != NULL);
24990 mode = imode;
24993 if (!TARGET_64BIT)
24995 if (mode == DImode)
24996 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
24997 else
24999 int i;
25001 if (REG_P (operand))
25003 gcc_assert (reload_completed);
25004 for (i = 0; i < size; i++)
25005 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25007 else if (offsettable_memref_p (operand))
25009 operand = adjust_address (operand, SImode, 0);
25010 parts[0] = operand;
25011 for (i = 1; i < size; i++)
25012 parts[i] = adjust_address (operand, SImode, 4 * i);
25014 else if (CONST_DOUBLE_P (operand))
25016 const REAL_VALUE_TYPE *r;
25017 long l[4];
25019 r = CONST_DOUBLE_REAL_VALUE (operand);
25020 switch (mode)
25022 case E_TFmode:
25023 real_to_target (l, r, mode);
25024 parts[3] = gen_int_mode (l[3], SImode);
25025 parts[2] = gen_int_mode (l[2], SImode);
25026 break;
25027 case E_XFmode:
25028 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25029 long double may not be 80-bit. */
25030 real_to_target (l, r, mode);
25031 parts[2] = gen_int_mode (l[2], SImode);
25032 break;
25033 case E_DFmode:
25034 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25035 break;
25036 default:
25037 gcc_unreachable ();
25039 parts[1] = gen_int_mode (l[1], SImode);
25040 parts[0] = gen_int_mode (l[0], SImode);
25042 else
25043 gcc_unreachable ();
25046 else
25048 if (mode == TImode)
25049 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25050 if (mode == XFmode || mode == TFmode)
25052 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25053 if (REG_P (operand))
25055 gcc_assert (reload_completed);
25056 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25057 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25059 else if (offsettable_memref_p (operand))
25061 operand = adjust_address (operand, DImode, 0);
25062 parts[0] = operand;
25063 parts[1] = adjust_address (operand, upper_mode, 8);
25065 else if (CONST_DOUBLE_P (operand))
25067 long l[4];
25069 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25071 /* real_to_target puts 32-bit pieces in each long. */
25072 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25073 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25074 << 32), DImode);
25076 if (upper_mode == SImode)
25077 parts[1] = gen_int_mode (l[2], SImode);
25078 else
25079 parts[1]
25080 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25081 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25082 << 32), DImode);
25084 else
25085 gcc_unreachable ();
25089 return size;
25092 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25093 The value is split into word-sized parts; operands 2-5 are used as
25094 scratch slots for the destination parts and operands 6-9 for the
25095 source parts, in the correct order. */
25097 void
25098 ix86_split_long_move (rtx operands[])
25100 rtx part[2][4];
25101 int nparts, i, j;
25102 int push = 0;
25103 int collisions = 0;
25104 machine_mode mode = GET_MODE (operands[0]);
25105 bool collisionparts[4];
25107 /* The DFmode expanders may ask us to move a double.
25108 For a 64-bit target this is a single move. By hiding that fact
25109 here we simplify the i386.md splitters. */
25110 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25112 /* Optimize constant pool references to immediates. This is used by
25113 fp moves, which force all constants to memory to allow combining. */
25115 if (MEM_P (operands[1])
25116 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25117 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25118 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25119 if (push_operand (operands[0], VOIDmode))
25121 operands[0] = copy_rtx (operands[0]);
25122 PUT_MODE (operands[0], word_mode);
25124 else
25125 operands[0] = gen_lowpart (DImode, operands[0]);
25126 operands[1] = gen_lowpart (DImode, operands[1]);
25127 emit_move_insn (operands[0], operands[1]);
25128 return;
25131 /* The only non-offsettable memory we handle is push. */
25132 if (push_operand (operands[0], VOIDmode))
25133 push = 1;
25134 else
25135 gcc_assert (!MEM_P (operands[0])
25136 || offsettable_memref_p (operands[0]));
25138 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25139 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25141 /* When emitting a push, take care with source operands on the stack. */
25142 if (push && MEM_P (operands[1])
25143 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25145 rtx src_base = XEXP (part[1][nparts - 1], 0);
25147 /* Compensate for the stack decrement by 4. */
25148 if (!TARGET_64BIT && nparts == 3
25149 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25150 src_base = plus_constant (Pmode, src_base, 4);
25152 /* src_base refers to the stack pointer and is
25153 automatically decreased by each emitted push. */
25154 for (i = 0; i < nparts; i++)
25155 part[1][i] = change_address (part[1][i],
25156 GET_MODE (part[1][i]), src_base);
25159 /* We need to do the copy in the right order in case an address register
25160 of the source overlaps the destination. */
25161 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25163 rtx tmp;
25165 for (i = 0; i < nparts; i++)
25167 collisionparts[i]
25168 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25169 if (collisionparts[i])
25170 collisions++;
25173 /* Collision in the middle part can be handled by reordering. */
25174 if (collisions == 1 && nparts == 3 && collisionparts [1])
25176 std::swap (part[0][1], part[0][2]);
25177 std::swap (part[1][1], part[1][2]);
25179 else if (collisions == 1
25180 && nparts == 4
25181 && (collisionparts [1] || collisionparts [2]))
25183 if (collisionparts [1])
25185 std::swap (part[0][1], part[0][2]);
25186 std::swap (part[1][1], part[1][2]);
25188 else
25190 std::swap (part[0][2], part[0][3]);
25191 std::swap (part[1][2], part[1][3]);
25195 /* If there are more collisions, we can't handle them by reordering.
25196 Do an lea to the last part and use only one colliding move. */
25197 else if (collisions > 1)
25199 rtx base, addr;
25201 collisions = 1;
25203 base = part[0][nparts - 1];
25205 /* Handle the case when the last part isn't valid for lea.
25206 Happens in 64-bit mode storing the 12-byte XFmode. */
25207 if (GET_MODE (base) != Pmode)
25208 base = gen_rtx_REG (Pmode, REGNO (base));
25210 addr = XEXP (part[1][0], 0);
25211 if (TARGET_TLS_DIRECT_SEG_REFS)
25213 struct ix86_address parts;
25214 int ok = ix86_decompose_address (addr, &parts);
25215 gcc_assert (ok);
25216 /* It is not valid to use %gs: or %fs: in lea. */
25217 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25219 emit_insn (gen_rtx_SET (base, addr));
25220 part[1][0] = replace_equiv_address (part[1][0], base);
25221 for (i = 1; i < nparts; i++)
25223 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25224 part[1][i] = replace_equiv_address (part[1][i], tmp);
25229 if (push)
25231 if (!TARGET_64BIT)
25233 if (nparts == 3)
25235 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25236 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25237 stack_pointer_rtx, GEN_INT (-4)));
25238 emit_move_insn (part[0][2], part[1][2]);
25240 else if (nparts == 4)
25242 emit_move_insn (part[0][3], part[1][3]);
25243 emit_move_insn (part[0][2], part[1][2]);
25246 else
25248 /* In 64-bit mode we don't have a 32-bit push available. If this is a
25249 register, that is OK - we will just use the larger counterpart. We also
25250 retype memories - these come from an attempt to avoid a REX prefix on
25251 moving the second half of a TFmode value. */
25252 if (GET_MODE (part[1][1]) == SImode)
25254 switch (GET_CODE (part[1][1]))
25256 case MEM:
25257 part[1][1] = adjust_address (part[1][1], DImode, 0);
25258 break;
25260 case REG:
25261 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25262 break;
25264 default:
25265 gcc_unreachable ();
25268 if (GET_MODE (part[1][0]) == SImode)
25269 part[1][0] = part[1][1];
25272 emit_move_insn (part[0][1], part[1][1]);
25273 emit_move_insn (part[0][0], part[1][0]);
25274 return;
25277 /* Choose the correct order so as not to overwrite the source before it is copied. */
25278 if ((REG_P (part[0][0])
25279 && REG_P (part[1][1])
25280 && (REGNO (part[0][0]) == REGNO (part[1][1])
25281 || (nparts == 3
25282 && REGNO (part[0][0]) == REGNO (part[1][2]))
25283 || (nparts == 4
25284 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25285 || (collisions > 0
25286 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25288 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25290 operands[2 + i] = part[0][j];
25291 operands[6 + i] = part[1][j];
25294 else
25296 for (i = 0; i < nparts; i++)
25298 operands[2 + i] = part[0][i];
25299 operands[6 + i] = part[1][i];
25303 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25304 if (optimize_insn_for_size_p ())
25306 for (j = 0; j < nparts - 1; j++)
25307 if (CONST_INT_P (operands[6 + j])
25308 && operands[6 + j] != const0_rtx
25309 && REG_P (operands[2 + j]))
25310 for (i = j; i < nparts - 1; i++)
25311 if (CONST_INT_P (operands[7 + i])
25312 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25313 operands[7 + i] = operands[2 + j];
25316 for (i = 0; i < nparts; i++)
25317 emit_move_insn (operands[2 + i], operands[6 + i]);
25319 return;
25322 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25323 left shift by a constant, either using a single shift or
25324 a sequence of add instructions. */
25326 static void
25327 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25329 rtx (*insn)(rtx, rtx, rtx);
25331 if (count == 1
25332 || (count * ix86_cost->add <= ix86_cost->shift_const
25333 && !optimize_insn_for_size_p ()))
25335 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25336 while (count-- > 0)
25337 emit_insn (insn (operand, operand, operand));
25339 else
25341 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25342 emit_insn (insn (operand, operand, GEN_INT (count)));
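/* For example, a half-register shifted left by 2 is emitted as two
   "addl %eax, %eax" style self-adds when that is cheaper than a single
   "shll $2" on the tuned processor (and we are not optimizing for size);
   otherwise the single immediate shift is used.  */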
25346 void
25347 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25349 rtx (*gen_ashl3)(rtx, rtx, rtx);
25350 rtx (*gen_shld)(rtx, rtx, rtx);
25351 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25353 rtx low[2], high[2];
25354 int count;
25356 if (CONST_INT_P (operands[2]))
25358 split_double_mode (mode, operands, 2, low, high);
25359 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25361 if (count >= half_width)
25363 emit_move_insn (high[0], low[1]);
25364 emit_move_insn (low[0], const0_rtx);
25366 if (count > half_width)
25367 ix86_expand_ashl_const (high[0], count - half_width, mode);
25369 else
25371 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25373 if (!rtx_equal_p (operands[0], operands[1]))
25374 emit_move_insn (operands[0], operands[1]);
25376 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25377 ix86_expand_ashl_const (low[0], count, mode);
25379 return;
25382 split_double_mode (mode, operands, 1, low, high);
25384 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25386 if (operands[1] == const1_rtx)
25388 /* Assuming we've chosen QImode-capable registers, then 1 << N
25389 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25390 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25392 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25394 ix86_expand_clear (low[0]);
25395 ix86_expand_clear (high[0]);
25396 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25398 d = gen_lowpart (QImode, low[0]);
25399 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25400 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25401 emit_insn (gen_rtx_SET (d, s));
25403 d = gen_lowpart (QImode, high[0]);
25404 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25405 s = gen_rtx_NE (QImode, flags, const0_rtx);
25406 emit_insn (gen_rtx_SET (d, s));
25409 /* Otherwise, we can get the same results by manually performing
25410 a bit extract operation on bit 5/6, and then performing the two
25411 shifts. The two methods of getting 0/1 into low/high are exactly
25412 the same size. Avoiding the shift in the bit extract case helps
25413 pentium4 a bit; no one else seems to care much either way. */
25414 else
25416 machine_mode half_mode;
25417 rtx (*gen_lshr3)(rtx, rtx, rtx);
25418 rtx (*gen_and3)(rtx, rtx, rtx);
25419 rtx (*gen_xor3)(rtx, rtx, rtx);
25420 HOST_WIDE_INT bits;
25421 rtx x;
25423 if (mode == DImode)
25425 half_mode = SImode;
25426 gen_lshr3 = gen_lshrsi3;
25427 gen_and3 = gen_andsi3;
25428 gen_xor3 = gen_xorsi3;
25429 bits = 5;
25431 else
25433 half_mode = DImode;
25434 gen_lshr3 = gen_lshrdi3;
25435 gen_and3 = gen_anddi3;
25436 gen_xor3 = gen_xordi3;
25437 bits = 6;
25440 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25441 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25442 else
25443 x = gen_lowpart (half_mode, operands[2]);
25444 emit_insn (gen_rtx_SET (high[0], x));
25446 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25447 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25448 emit_move_insn (low[0], high[0]);
25449 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25452 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25453 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25454 return;
25457 if (operands[1] == constm1_rtx)
25459 /* For -1 << N, we can avoid the shld instruction, because we
25460 know that we're shifting 0...31/63 ones into a -1. */
25461 emit_move_insn (low[0], constm1_rtx);
25462 if (optimize_insn_for_size_p ())
25463 emit_move_insn (high[0], low[0]);
25464 else
25465 emit_move_insn (high[0], constm1_rtx);
25467 else
25469 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25471 if (!rtx_equal_p (operands[0], operands[1]))
25472 emit_move_insn (operands[0], operands[1]);
25474 split_double_mode (mode, operands, 1, low, high);
25475 emit_insn (gen_shld (high[0], low[0], operands[2]));
25478 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25480 if (TARGET_CMOVE && scratch)
25482 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25483 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25485 ix86_expand_clear (scratch);
25486 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25488 else
25490 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25491 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25493 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
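/* Note that the shld/shl pair emitted above is only correct for shift
   counts below HALF_WIDTH; the adj_1 (cmove) or adj_2 (branch) pattern
   then tests bit 5 (bit 6 for TImode) of the count and, when it is set,
   moves the low half into the high half and clears the low half,
   completing the double-word shift.  */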
25497 void
25498 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25500 rtx (*gen_ashr3)(rtx, rtx, rtx)
25501 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25502 rtx (*gen_shrd)(rtx, rtx, rtx);
25503 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25505 rtx low[2], high[2];
25506 int count;
25508 if (CONST_INT_P (operands[2]))
25510 split_double_mode (mode, operands, 2, low, high);
25511 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25513 if (count == GET_MODE_BITSIZE (mode) - 1)
25515 emit_move_insn (high[0], high[1]);
25516 emit_insn (gen_ashr3 (high[0], high[0],
25517 GEN_INT (half_width - 1)));
25518 emit_move_insn (low[0], high[0]);
25521 else if (count >= half_width)
25523 emit_move_insn (low[0], high[1]);
25524 emit_move_insn (high[0], low[0]);
25525 emit_insn (gen_ashr3 (high[0], high[0],
25526 GEN_INT (half_width - 1)));
25528 if (count > half_width)
25529 emit_insn (gen_ashr3 (low[0], low[0],
25530 GEN_INT (count - half_width)));
25532 else
25534 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25536 if (!rtx_equal_p (operands[0], operands[1]))
25537 emit_move_insn (operands[0], operands[1]);
25539 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25540 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25543 else
25545 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25547 if (!rtx_equal_p (operands[0], operands[1]))
25548 emit_move_insn (operands[0], operands[1]);
25550 split_double_mode (mode, operands, 1, low, high);
25552 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25553 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25555 if (TARGET_CMOVE && scratch)
25557 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25558 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25560 emit_move_insn (scratch, high[0]);
25561 emit_insn (gen_ashr3 (scratch, scratch,
25562 GEN_INT (half_width - 1)));
25563 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25564 scratch));
25566 else
25568 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25569 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25571 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25576 void
25577 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25579 rtx (*gen_lshr3)(rtx, rtx, rtx)
25580 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25581 rtx (*gen_shrd)(rtx, rtx, rtx);
25582 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25584 rtx low[2], high[2];
25585 int count;
25587 if (CONST_INT_P (operands[2]))
25589 split_double_mode (mode, operands, 2, low, high);
25590 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25592 if (count >= half_width)
25594 emit_move_insn (low[0], high[1]);
25595 ix86_expand_clear (high[0]);
25597 if (count > half_width)
25598 emit_insn (gen_lshr3 (low[0], low[0],
25599 GEN_INT (count - half_width)));
25601 else
25603 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25605 if (!rtx_equal_p (operands[0], operands[1]))
25606 emit_move_insn (operands[0], operands[1]);
25608 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25609 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25612 else
25614 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25616 if (!rtx_equal_p (operands[0], operands[1]))
25617 emit_move_insn (operands[0], operands[1]);
25619 split_double_mode (mode, operands, 1, low, high);
25621 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25622 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25624 if (TARGET_CMOVE && scratch)
25626 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25627 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25629 ix86_expand_clear (scratch);
25630 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25631 scratch));
25633 else
25635 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25636 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25638 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25643 /* Predict just emitted jump instruction to be taken with probability PROB. */
25644 static void
25645 predict_jump (int prob)
25647 rtx_insn *insn = get_last_insn ();
25648 gcc_assert (JUMP_P (insn));
25649 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
25652 /* Helper function for the string operations below. Test whether
25653 VARIABLE & VALUE is zero and, if so, jump to the returned label. */
25654 static rtx_code_label *
25655 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25657 rtx_code_label *label = gen_label_rtx ();
25658 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25659 if (GET_MODE (variable) == DImode)
25660 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25661 else
25662 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25663 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25664 1, label);
25665 if (epilogue)
25666 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25667 else
25668 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25669 return label;
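/* A typical use, sketched from the epilogues below:
	label = ix86_expand_aligntest (count, 4, true);
	... emit a 4-byte move ...
	emit_label (label);
   which emits a test of bit 2 of COUNT and a conditional jump over the
   move when that bit is clear.  */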
25672 /* Decrease COUNTREG by VALUE. */
25673 static void
25674 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25676 rtx (*gen_add)(rtx, rtx, rtx)
25677 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25679 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25682 /* Zero extend possibly SImode EXP to Pmode register. */
25684 ix86_zero_extend_to_Pmode (rtx exp)
25686 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25689 /* Divide COUNTREG by SCALE. */
25690 static rtx
25691 scale_counter (rtx countreg, int scale)
25693 rtx sc;
25695 if (scale == 1)
25696 return countreg;
25697 if (CONST_INT_P (countreg))
25698 return GEN_INT (INTVAL (countreg) / scale);
25699 gcc_assert (REG_P (countreg));
25701 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25702 GEN_INT (exact_log2 (scale)),
25703 NULL, 1, OPTAB_DIRECT);
25704 return sc;
25707 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25708 DImode for constant loop counts. */
25710 static machine_mode
25711 counter_mode (rtx count_exp)
25713 if (GET_MODE (count_exp) != VOIDmode)
25714 return GET_MODE (count_exp);
25715 if (!CONST_INT_P (count_exp))
25716 return Pmode;
25717 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25718 return DImode;
25719 return SImode;
25722 /* Copy the address to a Pmode register. This is used for x32 to
25723 truncate DImode TLS address to a SImode register. */
25725 static rtx
25726 ix86_copy_addr_to_reg (rtx addr)
25728 rtx reg;
25729 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25731 reg = copy_addr_to_reg (addr);
25732 REG_POINTER (reg) = 1;
25733 return reg;
25735 else
25737 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25738 reg = copy_to_mode_reg (DImode, addr);
25739 REG_POINTER (reg) = 1;
25740 return gen_rtx_SUBREG (SImode, reg, 0);
25744 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
25745 by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
25746 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
25747 equivalent loop to set the memory with VALUE (supposed to be in MODE).
25749 The size is rounded down to a whole multiple of the chunk size moved at once.
25750 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
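/* In pseudocode, the emitted copy loop is roughly:
	size = count & -(GET_MODE_SIZE (mode) * unroll);
	iter = 0;
	do
	  {
	    copy UNROLL chunks of MODE from srcptr + iter to destptr + iter;
	    iter += GET_MODE_SIZE (mode) * unroll;
	  }
	while (iter < size);
	destptr += iter; srcptr += iter;
   with an extra size == 0 early exit when the chunk is a single byte;
   the setmem variant stores VALUE instead of loading from srcptr.  */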
25753 static void
25754 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25755 rtx destptr, rtx srcptr, rtx value,
25756 rtx count, machine_mode mode, int unroll,
25757 int expected_size, bool issetmem)
25759 rtx_code_label *out_label, *top_label;
25760 rtx iter, tmp;
25761 machine_mode iter_mode = counter_mode (count);
25762 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25763 rtx piece_size = GEN_INT (piece_size_n);
25764 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25765 rtx size;
25766 int i;
25768 top_label = gen_label_rtx ();
25769 out_label = gen_label_rtx ();
25770 iter = gen_reg_rtx (iter_mode);
25772 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25773 NULL, 1, OPTAB_DIRECT);
25774 /* Those two should combine. */
25775 if (piece_size == const1_rtx)
25777 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25778 true, out_label);
25779 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25781 emit_move_insn (iter, const0_rtx);
25783 emit_label (top_label);
25785 tmp = convert_modes (Pmode, iter_mode, iter, true);
25787 /* This assert could be relaxed - in that case we'll need to compute
25788 a power of two contained in PIECE_SIZE_N and pass it to
25789 offset_address. */
25790 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25791 destmem = offset_address (destmem, tmp, piece_size_n);
25792 destmem = adjust_address (destmem, mode, 0);
25794 if (!issetmem)
25796 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25797 srcmem = adjust_address (srcmem, mode, 0);
25799 /* When unrolling for chips that reorder memory reads and writes,
25800 we can save registers by using a single temporary.
25801 Also, using 4 temporaries is overkill in 32-bit mode. */
25802 if (!TARGET_64BIT && 0)
25804 for (i = 0; i < unroll; i++)
25806 if (i)
25808 destmem =
25809 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25810 srcmem =
25811 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25813 emit_move_insn (destmem, srcmem);
25816 else
25818 rtx tmpreg[4];
25819 gcc_assert (unroll <= 4);
25820 for (i = 0; i < unroll; i++)
25822 tmpreg[i] = gen_reg_rtx (mode);
25823 if (i)
25825 srcmem =
25826 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25828 emit_move_insn (tmpreg[i], srcmem);
25830 for (i = 0; i < unroll; i++)
25832 if (i)
25834 destmem =
25835 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25837 emit_move_insn (destmem, tmpreg[i]);
25841 else
25842 for (i = 0; i < unroll; i++)
25844 if (i)
25845 destmem =
25846 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25847 emit_move_insn (destmem, value);
25850 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
25851 true, OPTAB_LIB_WIDEN);
25852 if (tmp != iter)
25853 emit_move_insn (iter, tmp);
25855 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
25856 true, top_label);
25857 if (expected_size != -1)
25859 expected_size /= GET_MODE_SIZE (mode) * unroll;
25860 if (expected_size == 0)
25861 predict_jump (0);
25862 else if (expected_size > REG_BR_PROB_BASE)
25863 predict_jump (REG_BR_PROB_BASE - 1);
25864 else
25865 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
25867 else
25868 predict_jump (REG_BR_PROB_BASE * 80 / 100);
25869 iter = ix86_zero_extend_to_Pmode (iter);
25870 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
25871 true, OPTAB_LIB_WIDEN);
25872 if (tmp != destptr)
25873 emit_move_insn (destptr, tmp);
25874 if (!issetmem)
25876 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
25877 true, OPTAB_LIB_WIDEN);
25878 if (tmp != srcptr)
25879 emit_move_insn (srcptr, tmp);
25881 emit_label (out_label);
25884 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
25885 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
25886 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
25887 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
25888 ORIG_VALUE is the original value passed to memset to fill the memory with.
25889 Other arguments have the same meaning as for the previous function. */
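/* For instance, a memcpy whose count is known to be a multiple of 4 and
   MODE == SImode is emitted roughly as
	countreg = count >> 2
	rep movsl
   (%esi/%rsi -> %edi/%rdi, %ecx/%rcx iterations), with DESTEXP and SRCEXP
   describing the final pointer values destptr + (countreg << 2) and
   srcptr + (countreg << 2) for the RTL pattern.  */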
25891 static void
25892 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
25893 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
25894 rtx count,
25895 machine_mode mode, bool issetmem)
25897 rtx destexp;
25898 rtx srcexp;
25899 rtx countreg;
25900 HOST_WIDE_INT rounded_count;
25902 /* If possible, it is shorter to use rep movs.
25903 TODO: Maybe it is better to move this logic to decide_alg. */
25904 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
25905 && (!issetmem || orig_value == const0_rtx))
25906 mode = SImode;
25908 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
25909 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
25911 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
25912 GET_MODE_SIZE (mode)));
25913 if (mode != QImode)
25915 destexp = gen_rtx_ASHIFT (Pmode, countreg,
25916 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25917 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
25919 else
25920 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
25921 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
25923 rounded_count
25924 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25925 destmem = shallow_copy_rtx (destmem);
25926 set_mem_size (destmem, rounded_count);
25928 else if (MEM_SIZE_KNOWN_P (destmem))
25929 clear_mem_size (destmem);
25931 if (issetmem)
25933 value = force_reg (mode, gen_lowpart (mode, value));
25934 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
25936 else
25938 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
25939 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
25940 if (mode != QImode)
25942 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
25943 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25944 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
25946 else
25947 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
25948 if (CONST_INT_P (count))
25950 rounded_count
25951 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25952 srcmem = shallow_copy_rtx (srcmem);
25953 set_mem_size (srcmem, rounded_count);
25955 else
25957 if (MEM_SIZE_KNOWN_P (srcmem))
25958 clear_mem_size (srcmem);
25960 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
25961 destexp, srcexp));
25965 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
25966 DESTMEM.
25967 SRCMEM is passed by pointer to be updated on return.
25968 The return value is the updated DESTMEM. */
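/* For instance, with SIZE_TO_MOVE == 16 on x86_64 with SSE enabled the
   code below first picks TImode and then switches to the matching
   V2DImode vector mode, so the 16 bytes are copied by one vector load and
   one vector store through a temporary; without a vector move pattern it
   falls back to word_mode pieces.  */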
25969 static rtx
25970 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
25971 HOST_WIDE_INT size_to_move)
25973 rtx dst = destmem, src = *srcmem, adjust, tempreg;
25974 enum insn_code code;
25975 machine_mode move_mode;
25976 int piece_size, i;
25978 /* Find the widest mode in which we could perform moves.
25979 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
25980 it until a move of that size is supported. */
25981 piece_size = 1 << floor_log2 (size_to_move);
25982 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
25983 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
25985 gcc_assert (piece_size > 1);
25986 piece_size >>= 1;
25989 /* Find the corresponding vector mode with the same size as MOVE_MODE.
25990 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
25991 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
25993 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
25994 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
25995 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
25997 move_mode = word_mode;
25998 piece_size = GET_MODE_SIZE (move_mode);
25999 code = optab_handler (mov_optab, move_mode);
26002 gcc_assert (code != CODE_FOR_nothing);
26004 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26005 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26007 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26008 gcc_assert (size_to_move % piece_size == 0);
26009 adjust = GEN_INT (piece_size);
26010 for (i = 0; i < size_to_move; i += piece_size)
26012 /* We move from memory to memory, so we'll need to do it via
26013 a temporary register. */
26014 tempreg = gen_reg_rtx (move_mode);
26015 emit_insn (GEN_FCN (code) (tempreg, src));
26016 emit_insn (GEN_FCN (code) (dst, tempreg));
26018 emit_move_insn (destptr,
26019 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26020 emit_move_insn (srcptr,
26021 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26023 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26024 piece_size);
26025 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26026 piece_size);
26029 /* Update DST and SRC rtx. */
26030 *srcmem = src;
26031 return dst;
26034 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26035 static void
26036 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26037 rtx destptr, rtx srcptr, rtx count, int max_size)
26039 rtx src, dest;
26040 if (CONST_INT_P (count))
26042 HOST_WIDE_INT countval = INTVAL (count);
26043 HOST_WIDE_INT epilogue_size = countval % max_size;
26044 int i;
26046 /* For now MAX_SIZE should be a power of 2. This assert could be
26047 relaxed, but it would require a somewhat more complicated epilogue
26048 expansion. */
26049 gcc_assert ((max_size & (max_size - 1)) == 0);
26050 for (i = max_size; i >= 1; i >>= 1)
26052 if (epilogue_size & i)
26053 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26055 return;
26057 if (max_size > 8)
26059 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26060 count, 1, OPTAB_DIRECT);
26061 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26062 count, QImode, 1, 4, false);
26063 return;
26066 /* When there are stringops, we can cheaply increase dest and src pointers.
26067 Otherwise we save code size by maintaining offset (zero is readily
26068 available from preceding rep operation) and using x86 addressing modes.
26070 if (TARGET_SINGLE_STRINGOP)
26072 if (max_size > 4)
26074 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26075 src = change_address (srcmem, SImode, srcptr);
26076 dest = change_address (destmem, SImode, destptr);
26077 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26078 emit_label (label);
26079 LABEL_NUSES (label) = 1;
26081 if (max_size > 2)
26083 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26084 src = change_address (srcmem, HImode, srcptr);
26085 dest = change_address (destmem, HImode, destptr);
26086 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26087 emit_label (label);
26088 LABEL_NUSES (label) = 1;
26090 if (max_size > 1)
26092 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26093 src = change_address (srcmem, QImode, srcptr);
26094 dest = change_address (destmem, QImode, destptr);
26095 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26096 emit_label (label);
26097 LABEL_NUSES (label) = 1;
26100 else
26102 rtx offset = force_reg (Pmode, const0_rtx);
26103 rtx tmp;
26105 if (max_size > 4)
26107 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26108 src = change_address (srcmem, SImode, srcptr);
26109 dest = change_address (destmem, SImode, destptr);
26110 emit_move_insn (dest, src);
26111 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26112 true, OPTAB_LIB_WIDEN);
26113 if (tmp != offset)
26114 emit_move_insn (offset, tmp);
26115 emit_label (label);
26116 LABEL_NUSES (label) = 1;
26118 if (max_size > 2)
26120 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26121 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26122 src = change_address (srcmem, HImode, tmp);
26123 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26124 dest = change_address (destmem, HImode, tmp);
26125 emit_move_insn (dest, src);
26126 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26127 true, OPTAB_LIB_WIDEN);
26128 if (tmp != offset)
26129 emit_move_insn (offset, tmp);
26130 emit_label (label);
26131 LABEL_NUSES (label) = 1;
26133 if (max_size > 1)
26135 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26136 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26137 src = change_address (srcmem, QImode, tmp);
26138 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26139 dest = change_address (destmem, QImode, tmp);
26140 emit_move_insn (dest, src);
26141 emit_label (label);
26142 LABEL_NUSES (label) = 1;
26147 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26148 with value PROMOTED_VAL.
26150 The return value is the updated DESTMEM. */
26151 static rtx
26152 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26153 HOST_WIDE_INT size_to_move)
26155 rtx dst = destmem, adjust;
26156 enum insn_code code;
26157 machine_mode move_mode;
26158 int piece_size, i;
26160 /* Choose the mode in which to perform the moves: start from the mode
26161 of PROMOTED_VAL and narrow it when SIZE_TO_MOVE is smaller than
26162 that mode's size. */
26163 move_mode = GET_MODE (promoted_val);
26164 if (move_mode == VOIDmode)
26165 move_mode = QImode;
26166 if (size_to_move < GET_MODE_SIZE (move_mode))
26168 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26169 move_mode = int_mode_for_size (move_bits, 0).require ();
26170 promoted_val = gen_lowpart (move_mode, promoted_val);
26172 piece_size = GET_MODE_SIZE (move_mode);
26173 code = optab_handler (mov_optab, move_mode);
26174 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26176 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26178 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26179 gcc_assert (size_to_move % piece_size == 0);
26180 adjust = GEN_INT (piece_size);
26181 for (i = 0; i < size_to_move; i += piece_size)
26183 if (piece_size <= GET_MODE_SIZE (word_mode))
26185 emit_insn (gen_strset (destptr, dst, promoted_val));
26186 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26187 piece_size);
26188 continue;
26191 emit_insn (GEN_FCN (code) (dst, promoted_val));
26193 emit_move_insn (destptr,
26194 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26196 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26197 piece_size);
26200 /* Update DST rtx. */
26201 return dst;
26203 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26204 static void
26205 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26206 rtx count, int max_size)
26208 count =
26209 expand_simple_binop (counter_mode (count), AND, count,
26210 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26211 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26212 gen_lowpart (QImode, value), count, QImode,
26213 1, max_size / 2, true);
26216 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26217 static void
26218 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26219 rtx count, int max_size)
26221 rtx dest;
26223 if (CONST_INT_P (count))
26225 HOST_WIDE_INT countval = INTVAL (count);
26226 HOST_WIDE_INT epilogue_size = countval % max_size;
26227 int i;
26229 /* For now MAX_SIZE should be a power of 2. This assert could be
26230 relaxed, but it would require a somewhat more complicated epilogue
26231 expansion. */
26232 gcc_assert ((max_size & (max_size - 1)) == 0);
26233 for (i = max_size; i >= 1; i >>= 1)
26235 if (epilogue_size & i)
26237 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26238 destmem = emit_memset (destmem, destptr, vec_value, i);
26239 else
26240 destmem = emit_memset (destmem, destptr, value, i);
26243 return;
26245 if (max_size > 32)
26247 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26248 return;
26250 if (max_size > 16)
26252 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26253 if (TARGET_64BIT)
26255 dest = change_address (destmem, DImode, destptr);
26256 emit_insn (gen_strset (destptr, dest, value));
26257 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26258 emit_insn (gen_strset (destptr, dest, value));
26260 else
26262 dest = change_address (destmem, SImode, destptr);
26263 emit_insn (gen_strset (destptr, dest, value));
26264 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26265 emit_insn (gen_strset (destptr, dest, value));
26266 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26267 emit_insn (gen_strset (destptr, dest, value));
26268 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26269 emit_insn (gen_strset (destptr, dest, value));
26271 emit_label (label);
26272 LABEL_NUSES (label) = 1;
26274 if (max_size > 8)
26276 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26277 if (TARGET_64BIT)
26279 dest = change_address (destmem, DImode, destptr);
26280 emit_insn (gen_strset (destptr, dest, value));
26282 else
26284 dest = change_address (destmem, SImode, destptr);
26285 emit_insn (gen_strset (destptr, dest, value));
26286 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26287 emit_insn (gen_strset (destptr, dest, value));
26289 emit_label (label);
26290 LABEL_NUSES (label) = 1;
26292 if (max_size > 4)
26294 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26295 dest = change_address (destmem, SImode, destptr);
26296 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26297 emit_label (label);
26298 LABEL_NUSES (label) = 1;
26300 if (max_size > 2)
26302 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26303 dest = change_address (destmem, HImode, destptr);
26304 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26305 emit_label (label);
26306 LABEL_NUSES (label) = 1;
26308 if (max_size > 1)
26310 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26311 dest = change_address (destmem, QImode, destptr);
26312 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26313 emit_label (label);
26314 LABEL_NUSES (label) = 1;
26318 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
26319 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26320 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26321 ignored.
26322 Return value is updated DESTMEM. */
26323 static rtx
26324 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26325 rtx destptr, rtx srcptr, rtx value,
26326 rtx vec_value, rtx count, int align,
26327 int desired_alignment, bool issetmem)
26329 int i;
26330 for (i = 1; i < desired_alignment; i <<= 1)
26332 if (align <= i)
26334 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26335 if (issetmem)
26337 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26338 destmem = emit_memset (destmem, destptr, vec_value, i);
26339 else
26340 destmem = emit_memset (destmem, destptr, value, i);
26342 else
26343 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26344 ix86_adjust_counter (count, i);
26345 emit_label (label);
26346 LABEL_NUSES (label) = 1;
26347 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26350 return destmem;
26353 /* Test whether COUNT & SIZE is nonzero and, if so, expand a movmem
26354 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26355 and jump to DONE_LABEL. */
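/* In pseudocode, for the copy case with a given SIZE:
	if (count & size)
	  {
	    copy SIZE bytes from the start of the block;
	    copy SIZE bytes ending at destptr + count (may overlap the first
	    copy);
	    goto done_label;
	  }
   which handles every length in SIZE..2*SIZE-1 and is the building block
   of the small-block dispatch below.  */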
26356 static void
26357 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26358 rtx destptr, rtx srcptr,
26359 rtx value, rtx vec_value,
26360 rtx count, int size,
26361 rtx done_label, bool issetmem)
26363 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26364 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26365 rtx modesize;
26366 int n;
26368 /* If we do not have vector value to copy, we must reduce size. */
26369 if (issetmem)
26371 if (!vec_value)
26373 if (GET_MODE (value) == VOIDmode && size > 8)
26374 mode = Pmode;
26375 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26376 mode = GET_MODE (value);
26378 else
26379 mode = GET_MODE (vec_value), value = vec_value;
26381 else
26383 /* Choose appropriate vector mode. */
26384 if (size >= 32)
26385 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26386 else if (size >= 16)
26387 mode = TARGET_SSE ? V16QImode : DImode;
26388 srcmem = change_address (srcmem, mode, srcptr);
26390 destmem = change_address (destmem, mode, destptr);
26391 modesize = GEN_INT (GET_MODE_SIZE (mode));
26392 gcc_assert (GET_MODE_SIZE (mode) <= size);
26393 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26395 if (issetmem)
26396 emit_move_insn (destmem, gen_lowpart (mode, value));
26397 else
26399 emit_move_insn (destmem, srcmem);
26400 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26402 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26405 destmem = offset_address (destmem, count, 1);
26406 destmem = offset_address (destmem, GEN_INT (-2 * size),
26407 GET_MODE_SIZE (mode));
26408 if (!issetmem)
26410 srcmem = offset_address (srcmem, count, 1);
26411 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26412 GET_MODE_SIZE (mode));
26414 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26416 if (issetmem)
26417 emit_move_insn (destmem, gen_lowpart (mode, value));
26418 else
26420 emit_move_insn (destmem, srcmem);
26421 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26423 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26425 emit_jump_insn (gen_jump (done_label));
26426 emit_barrier ();
26428 emit_label (label);
26429 LABEL_NUSES (label) = 1;
26432 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
26433 and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
26434 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
26435 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26436 DONE_LABEL is a label after the whole copying sequence. The label is created
26437 on demand if *DONE_LABEL is NULL.
26438 MIN_SIZE is the minimal size of the block copied. This value gets adjusted
26439 for the new bounds after the initial copies.
26441 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26442 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26443 we will dispatch to a library call for large blocks.
26445 In pseudocode we do:
26447 if (COUNT < SIZE)
26449 Assume that SIZE is 4. Bigger sizes are handled analogously
26450 if (COUNT & 4)
26452 copy 4 bytes from SRCPTR to DESTPTR
26453 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26454 goto done_label
26456 if (!COUNT)
26457 goto done_label;
26458 copy 1 byte from SRCPTR to DESTPTR
26459 if (COUNT & 2)
26461 copy 2 bytes from SRCPTR to DESTPTR
26462 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26465 else
26467 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26468 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26470 OLD_DESPTR = DESTPTR;
26471 Align DESTPTR up to DESIRED_ALIGN
26472 SRCPTR += DESTPTR - OLD_DESTPTR
26473 COUNT -= DEST_PTR - OLD_DESTPTR
26474 if (DYNAMIC_CHECK)
26475 Round COUNT down to multiple of SIZE
26476 << optional caller supplied zero size guard is here >>
26477 << optional caller supplied dynamic check is here >>
26478 << caller supplied main copy loop is here >>
26480 done_label:
26482 static void
26483 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26484 rtx *destptr, rtx *srcptr,
26485 machine_mode mode,
26486 rtx value, rtx vec_value,
26487 rtx *count,
26488 rtx_code_label **done_label,
26489 int size,
26490 int desired_align,
26491 int align,
26492 unsigned HOST_WIDE_INT *min_size,
26493 bool dynamic_check,
26494 bool issetmem)
26496 rtx_code_label *loop_label = NULL, *label;
26497 int n;
26498 rtx modesize;
26499 int prolog_size = 0;
26500 rtx mode_value;
26502 /* Choose the proper value to copy. */
26503 if (issetmem && VECTOR_MODE_P (mode))
26504 mode_value = vec_value;
26505 else
26506 mode_value = value;
26507 gcc_assert (GET_MODE_SIZE (mode) <= size);
26509 /* See if block is big or small, handle small blocks. */
26510 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26512 int size2 = size;
26513 loop_label = gen_label_rtx ();
26515 if (!*done_label)
26516 *done_label = gen_label_rtx ();
26518 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26519 1, loop_label);
26520 size2 >>= 1;
26522 /* Handle sizes > 3. */
26523 for (;size2 > 2; size2 >>= 1)
26524 expand_small_movmem_or_setmem (destmem, srcmem,
26525 *destptr, *srcptr,
26526 value, vec_value,
26527 *count,
26528 size2, *done_label, issetmem);
26529 /* Nothing to copy? Jump to DONE_LABEL if so */
26530 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26531 1, *done_label);
26533 /* Do a byte copy. */
26534 destmem = change_address (destmem, QImode, *destptr);
26535 if (issetmem)
26536 emit_move_insn (destmem, gen_lowpart (QImode, value));
26537 else
26539 srcmem = change_address (srcmem, QImode, *srcptr);
26540 emit_move_insn (destmem, srcmem);
26543 /* Handle sizes 2 and 3. */
26544 label = ix86_expand_aligntest (*count, 2, false);
26545 destmem = change_address (destmem, HImode, *destptr);
26546 destmem = offset_address (destmem, *count, 1);
26547 destmem = offset_address (destmem, GEN_INT (-2), 2);
26548 if (issetmem)
26549 emit_move_insn (destmem, gen_lowpart (HImode, value));
26550 else
26552 srcmem = change_address (srcmem, HImode, *srcptr);
26553 srcmem = offset_address (srcmem, *count, 1);
26554 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26555 emit_move_insn (destmem, srcmem);
26558 emit_label (label);
26559 LABEL_NUSES (label) = 1;
26560 emit_jump_insn (gen_jump (*done_label));
26561 emit_barrier ();
26563 else
26564 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26565 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26567 /* Start memcpy for COUNT >= SIZE. */
26568 if (loop_label)
26570 emit_label (loop_label);
26571 LABEL_NUSES (loop_label) = 1;
26574 /* Copy first desired_align bytes. */
26575 if (!issetmem)
26576 srcmem = change_address (srcmem, mode, *srcptr);
26577 destmem = change_address (destmem, mode, *destptr);
26578 modesize = GEN_INT (GET_MODE_SIZE (mode));
26579 for (n = 0; prolog_size < desired_align - align; n++)
26581 if (issetmem)
26582 emit_move_insn (destmem, mode_value);
26583 else
26585 emit_move_insn (destmem, srcmem);
26586 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26588 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26589 prolog_size += GET_MODE_SIZE (mode);
26593 /* Copy last SIZE bytes. */
26594 destmem = offset_address (destmem, *count, 1);
26595 destmem = offset_address (destmem,
26596 GEN_INT (-size - prolog_size),
26598 if (issetmem)
26599 emit_move_insn (destmem, mode_value);
26600 else
26602 srcmem = offset_address (srcmem, *count, 1);
26603 srcmem = offset_address (srcmem,
26604 GEN_INT (-size - prolog_size),
26606 emit_move_insn (destmem, srcmem);
26608 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26610 destmem = offset_address (destmem, modesize, 1);
26611 if (issetmem)
26612 emit_move_insn (destmem, mode_value);
26613 else
26615 srcmem = offset_address (srcmem, modesize, 1);
26616 emit_move_insn (destmem, srcmem);
26620 /* Align destination. */
26621 if (desired_align > 1 && desired_align > align)
26623 rtx saveddest = *destptr;
26625 gcc_assert (desired_align <= size);
26626 /* Align destptr up, place it to new register. */
26627 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26628 GEN_INT (prolog_size),
26629 NULL_RTX, 1, OPTAB_DIRECT);
26630 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26631 REG_POINTER (*destptr) = 1;
26632 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26633 GEN_INT (-desired_align),
26634 *destptr, 1, OPTAB_DIRECT);
26635 /* See how many bytes we skipped. */
26636 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26637 *destptr,
26638 saveddest, 1, OPTAB_DIRECT);
26639 /* Adjust srcptr and count. */
26640 if (!issetmem)
26641 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26642 saveddest, *srcptr, 1, OPTAB_DIRECT);
26643 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26644 saveddest, *count, 1, OPTAB_DIRECT);
26645 /* We copied at most size + prolog_size. */
26646 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26647 *min_size
26648 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26649 else
26650 *min_size = 0;
26652 /* Our loops always round down the block size, but for dispatch to
26653 library we need precise value. */
26654 if (dynamic_check)
26655 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26656 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26658 else
26660 gcc_assert (prolog_size == 0);
26661 /* Decrease count, so we won't end up copying last word twice. */
26662 if (!CONST_INT_P (*count))
26663 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26664 constm1_rtx, *count, 1, OPTAB_DIRECT);
26665 else
26666 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26667 (unsigned HOST_WIDE_INT)size));
26668 if (*min_size)
26669 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26674 /* This function is like the previous one, except here we know how many bytes
26675 need to be copied. That allows us to update alignment not only of DST, which
26676 is returned, but also of SRC, which is passed as a pointer for that
26677 reason. */
26678 static rtx
26679 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26680 rtx srcreg, rtx value, rtx vec_value,
26681 int desired_align, int align_bytes,
26682 bool issetmem)
26684 rtx src = NULL;
26685 rtx orig_dst = dst;
26686 rtx orig_src = NULL;
26687 int piece_size = 1;
26688 int copied_bytes = 0;
26690 if (!issetmem)
26692 gcc_assert (srcp != NULL);
26693 src = *srcp;
26694 orig_src = src;
26697 for (piece_size = 1;
26698 piece_size <= desired_align && copied_bytes < align_bytes;
26699 piece_size <<= 1)
26701 if (align_bytes & piece_size)
26703 if (issetmem)
26705 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26706 dst = emit_memset (dst, destreg, vec_value, piece_size);
26707 else
26708 dst = emit_memset (dst, destreg, value, piece_size);
26710 else
26711 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26712 copied_bytes += piece_size;
26715 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26716 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26717 if (MEM_SIZE_KNOWN_P (orig_dst))
26718 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26720 if (!issetmem)
26722 int src_align_bytes = get_mem_align_offset (src, desired_align
26723 * BITS_PER_UNIT);
26724 if (src_align_bytes >= 0)
26725 src_align_bytes = desired_align - src_align_bytes;
26726 if (src_align_bytes >= 0)
26728 unsigned int src_align;
26729 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26731 if ((src_align_bytes & (src_align - 1))
26732 == (align_bytes & (src_align - 1)))
26733 break;
26735 if (src_align > (unsigned int) desired_align)
26736 src_align = desired_align;
26737 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26738 set_mem_align (src, src_align * BITS_PER_UNIT);
26740 if (MEM_SIZE_KNOWN_P (orig_src))
26741 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26742 *srcp = src;
26745 return dst;
26748 /* Return true if ALG can be used in current context.
26749 Assume we expand memset if MEMSET is true. */
26750 static bool
26751 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26753 if (alg == no_stringop)
26754 return false;
26755 if (alg == vector_loop)
26756 return TARGET_SSE || TARGET_AVX;
26757 /* Algorithms using the rep prefix want at least edi and ecx;
26758 additionally, memset wants eax and memcpy wants esi. Don't
26759 consider such algorithms if the user has appropriated those
26760 registers for their own purposes, or if we have a non-default
26761 address space, since some string insns cannot override the segment. */
26762 if (alg == rep_prefix_1_byte
26763 || alg == rep_prefix_4_byte
26764 || alg == rep_prefix_8_byte)
26766 if (have_as)
26767 return false;
26768 if (fixed_regs[CX_REG]
26769 || fixed_regs[DI_REG]
26770 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26771 return false;
26773 return true;
26776 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
26777 static enum stringop_alg
26778 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26779 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26780 bool memset, bool zero_memset, bool have_as,
26781 int *dynamic_check, bool *noalign, bool recur)
26783 const struct stringop_algs *algs;
26784 bool optimize_for_speed;
26785 int max = 0;
26786 const struct processor_costs *cost;
26787 int i;
26788 bool any_alg_usable_p = false;
26790 *noalign = false;
26791 *dynamic_check = -1;
26793 /* Even if the string operation call is cold, we still might spend a lot
26794 of time processing large blocks. */
26795 if (optimize_function_for_size_p (cfun)
26796 || (optimize_insn_for_size_p ()
26797 && (max_size < 256
26798 || (expected_size != -1 && expected_size < 256))))
26799 optimize_for_speed = false;
26800 else
26801 optimize_for_speed = true;
26803 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26804 if (memset)
26805 algs = &cost->memset[TARGET_64BIT != 0];
26806 else
26807 algs = &cost->memcpy[TARGET_64BIT != 0];
26809 /* See maximal size for user defined algorithm. */
26810 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26812 enum stringop_alg candidate = algs->size[i].alg;
26813 bool usable = alg_usable_p (candidate, memset, have_as);
26814 any_alg_usable_p |= usable;
26816 if (candidate != libcall && candidate && usable)
26817 max = algs->size[i].max;
26820 /* If the expected size is not known but the max size is small enough
26821 that the inline version is a win, set the expected size into
26822 the range. */
26823 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
26824 && expected_size == -1)
26825 expected_size = min_size / 2 + max_size / 2;
26827 /* If user specified the algorithm, honor it if possible. */
26828 if (ix86_stringop_alg != no_stringop
26829 && alg_usable_p (ix86_stringop_alg, memset, have_as))
26830 return ix86_stringop_alg;
26831 /* rep; movq or rep; movl is the smallest variant. */
26832 else if (!optimize_for_speed)
26834 *noalign = true;
26835 if (!count || (count & 3) || (memset && !zero_memset))
26836 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
26837 ? rep_prefix_1_byte : loop_1_byte;
26838 else
26839 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
26840 ? rep_prefix_4_byte : loop;
26842 /* Very tiny blocks are best handled via the loop; REP is expensive to
26843 set up. */
26844 else if (expected_size != -1 && expected_size < 4)
26845 return loop_1_byte;
26846 else if (expected_size != -1)
26848 enum stringop_alg alg = libcall;
26849 bool alg_noalign = false;
26850 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26852 /* We get here if the algorithms that were not libcall-based
26853 were rep-prefix based and we are unable to use rep prefixes
26854 based on global register usage. Break out of the loop and
26855 use the heuristic below. */
26856 if (algs->size[i].max == 0)
26857 break;
26858 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
26860 enum stringop_alg candidate = algs->size[i].alg;
26862 if (candidate != libcall
26863 && alg_usable_p (candidate, memset, have_as))
26865 alg = candidate;
26866 alg_noalign = algs->size[i].noalign;
26868 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
26869 last non-libcall inline algorithm. */
26870 if (TARGET_INLINE_ALL_STRINGOPS)
26872 /* When the current size is best copied by a libcall,
26873 but we are still forced to inline, run the heuristic below
26874 that will pick code for medium-sized blocks.
26875 if (alg != libcall)
26877 *noalign = alg_noalign;
26878 return alg;
26880 else if (!any_alg_usable_p)
26881 break;
26883 else if (alg_usable_p (candidate, memset, have_as))
26885 *noalign = algs->size[i].noalign;
26886 return candidate;
26891 /* When asked to inline the call anyway, try to pick a meaningful choice.
26892 We look for the maximal size of block that is faster to copy by hand and
26893 take blocks of at most that size, guessing that the average size will
26894 be roughly half of the block.
26896 If this turns out to be bad, we might simply specify the preferred
26897 choice in ix86_costs. */
26898 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26899 && (algs->unknown_size == libcall
26900 || !alg_usable_p (algs->unknown_size, memset, have_as)))
26902 enum stringop_alg alg;
26903 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
26905 /* If there aren't any usable algorithms or if recursing already,
26906 then recursing on smaller sizes or same size isn't going to
26907 find anything. Just return the simple byte-at-a-time copy loop. */
26908 if (!any_alg_usable_p || recur)
26910 /* Pick something reasonable. */
26911 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
26912 *dynamic_check = 128;
26913 return loop_1_byte;
26915 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
26916 zero_memset, have_as, dynamic_check, noalign, true);
26917 gcc_assert (*dynamic_check == -1);
26918 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26919 *dynamic_check = max;
26920 else
26921 gcc_assert (alg != libcall);
26922 return alg;
26924 return (alg_usable_p (algs->unknown_size, memset, have_as)
26925 ? algs->unknown_size : libcall);
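/* As an illustration, a cost-table entry of the form
	{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
		   {-1, libcall, false}}}
   makes the code above pick `loop' for expected sizes up to 32 bytes,
   `rep_prefix_4_byte' up to 8192 bytes, and a library call beyond that,
   assuming the rep-based algorithm is usable at all.  */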
26928 /* Decide on alignment. We know that the operand is already aligned to ALIGN
26929 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
26930 static int
26931 decide_alignment (int align,
26932 enum stringop_alg alg,
26933 int expected_size,
26934 machine_mode move_mode)
26936 int desired_align = 0;
26938 gcc_assert (alg != no_stringop);
26940 if (alg == libcall)
26941 return 0;
26942 if (move_mode == VOIDmode)
26943 return 0;
26945 desired_align = GET_MODE_SIZE (move_mode);
26946 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
26947 copying a whole cacheline at once. */
26948 if (TARGET_PENTIUMPRO
26949 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
26950 desired_align = 8;
26952 if (optimize_size)
26953 desired_align = 1;
26954 if (desired_align < align)
26955 desired_align = align;
26956 if (expected_size != -1 && expected_size < 4)
26957 desired_align = align;
26959 return desired_align;
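/* For example, a vector_loop moving V16QImode chunks requests 16-byte
   destination alignment here, whereas with -Os (or for very small
   expected sizes) the request is dropped back to the alignment we
   already have.  */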
26963 /* Helper function for memset. For the QImode value 0xXY produce
26964 0xXYXYXYXY of the width specified by MODE. This is essentially
26965 a multiplication by 0x01010101 (0x0101010101010101 for DImode), but we
26966 can do slightly better than synth_mult by unwinding the sequence by
26967 hand on CPUs with a slow multiply. */
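/* As a worked example, promoting the QImode constant 0x41 gives
   0x41414141 for SImode and 0x4141414141414141 for DImode.  For a
   non-constant value the same effect comes either from a multiply by the
   similarly promoted constant 1 or from the shift/or sequence below,
   whichever the cost tables say is cheaper.  */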
26968 static rtx
26969 promote_duplicated_reg (machine_mode mode, rtx val)
26971 machine_mode valmode = GET_MODE (val);
26972 rtx tmp;
26973 int nops = mode == DImode ? 3 : 2;
26975 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
26976 if (val == const0_rtx)
26977 return copy_to_mode_reg (mode, CONST0_RTX (mode));
26978 if (CONST_INT_P (val))
26980 HOST_WIDE_INT v = INTVAL (val) & 255;
26982 v |= v << 8;
26983 v |= v << 16;
26984 if (mode == DImode)
26985 v |= (v << 16) << 16;
26986 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
26989 if (valmode == VOIDmode)
26990 valmode = QImode;
26991 if (valmode != QImode)
26992 val = gen_lowpart (QImode, val);
26993 if (mode == QImode)
26994 return val;
26995 if (!TARGET_PARTIAL_REG_STALL)
26996 nops--;
26997 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
26998 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
26999 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27000 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27002 rtx reg = convert_modes (mode, QImode, val, true);
27003 tmp = promote_duplicated_reg (mode, const1_rtx);
27004 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27005 OPTAB_DIRECT);
27007 else
27009 rtx reg = convert_modes (mode, QImode, val, true);
27011 if (!TARGET_PARTIAL_REG_STALL)
27012 if (mode == SImode)
27013 emit_insn (gen_insvsi_1 (reg, reg));
27014 else
27015 emit_insn (gen_insvdi_1 (reg, reg));
27016 else
27018 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27019 NULL, 1, OPTAB_DIRECT);
27020 reg =
27021 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27023 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27024 NULL, 1, OPTAB_DIRECT);
27025 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27026 if (mode == SImode)
27027 return reg;
27028 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27029 NULL, 1, OPTAB_DIRECT);
27030 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27031 return reg;
27035 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
27036 will be needed by the main loop copying SIZE_NEEDED chunks and by the
27037 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
27038 static rtx
27039 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27040 int align)
27042 rtx promoted_val;
27044 if (TARGET_64BIT
27045 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27046 promoted_val = promote_duplicated_reg (DImode, val);
27047 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27048 promoted_val = promote_duplicated_reg (SImode, val);
27049 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27050 promoted_val = promote_duplicated_reg (HImode, val);
27051 else
27052 promoted_val = val;
27054 return promoted_val;
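/* E.g. a rep_prefix_4_byte memset has SIZE_NEEDED == 4, so VAL is
   duplicated into an SImode register here, while a 64-bit unrolled loop
   with SIZE_NEEDED == 32 gets a DImode duplicate.  */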
27057 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27058 operations when profitable. The code depends upon architecture, block size
27059 and alignment, but always has one of the following overall structures:
27061 Aligned move sequence:
27063 1) Prologue guard: Conditional that jumps up to epilogues for small
27064 blocks that can be handled by the epilogue alone. This is faster
27065 but also needed for correctness, since the prologue assumes the block
27066 is larger than the desired alignment.
27068 Optional dynamic check for size and libcall for large
27069 blocks is emitted here too, with -minline-stringops-dynamically.
27071 2) Prologue: copy first few bytes in order to get destination
27072 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27073 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27074 copied. We emit either a jump tree on power of two sized
27075 blocks, or a byte loop.
27077 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27078 with specified algorithm.
27080 4) Epilogue: code copying tail of the block that is too small to be
27081 handled by main body (or up to size guarded by prologue guard).
27083 Misaligned move sequence
27085 1) Misaligned move prologue/epilogue containing:
27086 a) Prologue handling small memory blocks and jumping to done_label
27087 (skipped if blocks are known to be large enough)
27088 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
27089 is needed, done by a single possibly misaligned move
27090 (skipped if alignment is not needed)
27091 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27093 2) Zero size guard dispatching to done_label, if needed
27095 3) Dispatch to library call, if needed
27097 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27098 with the specified algorithm. */
27099 bool
27100 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27101 rtx align_exp, rtx expected_align_exp,
27102 rtx expected_size_exp, rtx min_size_exp,
27103 rtx max_size_exp, rtx probable_max_size_exp,
27104 bool issetmem)
27106 rtx destreg;
27107 rtx srcreg = NULL;
27108 rtx_code_label *label = NULL;
27109 rtx tmp;
27110 rtx_code_label *jump_around_label = NULL;
27111 HOST_WIDE_INT align = 1;
27112 unsigned HOST_WIDE_INT count = 0;
27113 HOST_WIDE_INT expected_size = -1;
27114 int size_needed = 0, epilogue_size_needed;
27115 int desired_align = 0, align_bytes = 0;
27116 enum stringop_alg alg;
27117 rtx promoted_val = NULL;
27118 rtx vec_promoted_val = NULL;
27119 bool force_loopy_epilogue = false;
27120 int dynamic_check;
27121 bool need_zero_guard = false;
27122 bool noalign;
27123 machine_mode move_mode = VOIDmode;
27124 machine_mode wider_mode;
27125 int unroll_factor = 1;
27126 /* TODO: Once value ranges are available, fill in proper data. */
27127 unsigned HOST_WIDE_INT min_size = 0;
27128 unsigned HOST_WIDE_INT max_size = -1;
27129 unsigned HOST_WIDE_INT probable_max_size = -1;
27130 bool misaligned_prologue_used = false;
27131 bool have_as;
27133 if (CONST_INT_P (align_exp))
27134 align = INTVAL (align_exp);
27135 /* i386 can do misaligned access at a reasonably increased cost. */
27136 if (CONST_INT_P (expected_align_exp)
27137 && INTVAL (expected_align_exp) > align)
27138 align = INTVAL (expected_align_exp);
27139 /* ALIGN is the minimum of destination and source alignment, but we care here
27140 just about destination alignment. */
27141 else if (!issetmem
27142 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27143 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27145 if (CONST_INT_P (count_exp))
27147 min_size = max_size = probable_max_size = count = expected_size
27148 = INTVAL (count_exp);
27149 /* When COUNT is 0, there is nothing to do. */
27150 if (!count)
27151 return true;
27153 else
27155 if (min_size_exp)
27156 min_size = INTVAL (min_size_exp);
27157 if (max_size_exp)
27158 max_size = INTVAL (max_size_exp);
27159 if (probable_max_size_exp)
27160 probable_max_size = INTVAL (probable_max_size_exp);
27161 if (CONST_INT_P (expected_size_exp))
27162 expected_size = INTVAL (expected_size_exp);
27165 /* Make sure we don't need to care about overflow later on. */
27166 if (count > (HOST_WIDE_INT_1U << 30))
27167 return false;
27169 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27170 if (!issetmem)
27171 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27173 /* Step 0: Decide on preferred algorithm, desired alignment and
27174 size of chunks to be copied by main loop. */
27175 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27176 issetmem,
27177 issetmem && val_exp == const0_rtx, have_as,
27178 &dynamic_check, &noalign, false);
27179 if (alg == libcall)
27180 return false;
27181 gcc_assert (alg != no_stringop);
27183 /* For now the vector version of memset is generated only for memory zeroing,
27184 as creating the promoted vector value is very cheap in this case. */
27185 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27186 alg = unrolled_loop;
27188 if (!count)
27189 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27190 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27191 if (!issetmem)
27192 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27194 unroll_factor = 1;
27195 move_mode = word_mode;
27196 switch (alg)
27198 case libcall:
27199 case no_stringop:
27200 case last_alg:
27201 gcc_unreachable ();
27202 case loop_1_byte:
27203 need_zero_guard = true;
27204 move_mode = QImode;
27205 break;
27206 case loop:
27207 need_zero_guard = true;
27208 break;
27209 case unrolled_loop:
27210 need_zero_guard = true;
27211 unroll_factor = (TARGET_64BIT ? 4 : 2);
27212 break;
27213 case vector_loop:
27214 need_zero_guard = true;
27215 unroll_factor = 4;
27216 /* Find the widest supported mode. */
27217 move_mode = word_mode;
27218 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27219 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27220 move_mode = wider_mode;
27222 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27223 move_mode = TImode;
27225 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27226 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27227 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27229 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27230 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27231 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27232 move_mode = word_mode;
27234 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27235 break;
27236 case rep_prefix_8_byte:
27237 move_mode = DImode;
27238 break;
27239 case rep_prefix_4_byte:
27240 move_mode = SImode;
27241 break;
27242 case rep_prefix_1_byte:
27243 move_mode = QImode;
27244 break;
27246 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27247 epilogue_size_needed = size_needed;
27249 /* If we are going to call any library calls conditionally, make sure any
27250 pending stack adjustments happen before the first conditional branch;
27251 otherwise they will be emitted only before the library call and won't
27252 happen on the other branches. */
27253 if (dynamic_check != -1)
27254 do_pending_stack_adjust ();
27256 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27257 if (!TARGET_ALIGN_STRINGOPS || noalign)
27258 align = desired_align;
27260 /* Step 1: Prologue guard. */
27262 /* Alignment code needs count to be in register. */
27263 if (CONST_INT_P (count_exp) && desired_align > align)
27265 if (INTVAL (count_exp) > desired_align
27266 && INTVAL (count_exp) > size_needed)
27268 align_bytes
27269 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27270 if (align_bytes <= 0)
27271 align_bytes = 0;
27272 else
27273 align_bytes = desired_align - align_bytes;
27275 if (align_bytes == 0)
27276 count_exp = force_reg (counter_mode (count_exp), count_exp);
27278 gcc_assert (desired_align >= 1 && align >= 1);
27280 /* Misaligned move sequences handle both prologue and epilogue at once.
27281 Default code generation results in smaller code for large alignments
27282 and also avoids redundant work when sizes are known precisely. */
27283 misaligned_prologue_used
27284 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27285 && MAX (desired_align, epilogue_size_needed) <= 32
27286 && desired_align <= epilogue_size_needed
27287 && ((desired_align > align && !align_bytes)
27288 || (!count && epilogue_size_needed > 1)));
27290 /* Do the cheap promotion to allow better CSE across the
27291 main loop and epilogue (i.e. one load of the big constant in
27292 front of all the code).
27293 For now the misaligned move sequences do not have a fast path
27294 without broadcasting. */
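/* For instance, a memset value of 0xab is broadcast across a word-sized
   register (0xabababab on 32-bit targets, 0xabababababababab on 64-bit),
   so the main loop can store whole words instead of single bytes.  */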
27295 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27297 if (alg == vector_loop)
27299 gcc_assert (val_exp == const0_rtx);
27300 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27301 promoted_val = promote_duplicated_reg_to_size (val_exp,
27302 GET_MODE_SIZE (word_mode),
27303 desired_align, align);
27305 else
27307 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27308 desired_align, align);
27311 /* Misaligned move sequences handle both prologues and epilogues at once.
27312 Default code generation results in smaller code for large alignments and
27313 also avoids redundant work when sizes are known precisely. */
27314 if (misaligned_prologue_used)
27316 /* The misaligned move prologue handles small blocks by itself. */
27317 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27318 (dst, src, &destreg, &srcreg,
27319 move_mode, promoted_val, vec_promoted_val,
27320 &count_exp,
27321 &jump_around_label,
27322 desired_align < align
27323 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27324 desired_align, align, &min_size, dynamic_check, issetmem);
27325 if (!issetmem)
27326 src = change_address (src, BLKmode, srcreg);
27327 dst = change_address (dst, BLKmode, destreg);
27328 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27329 epilogue_size_needed = 0;
27330 if (need_zero_guard
27331 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27333 /* It is possible that we copied enough so the main loop will not
27334 execute. */
27335 gcc_assert (size_needed > 1);
27336 if (jump_around_label == NULL_RTX)
27337 jump_around_label = gen_label_rtx ();
27338 emit_cmp_and_jump_insns (count_exp,
27339 GEN_INT (size_needed),
27340 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27341 if (expected_size == -1
27342 || expected_size < (desired_align - align) / 2 + size_needed)
27343 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27344 else
27345 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27348 /* Ensure that the alignment prologue won't copy past the end of the block. */
27349 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27351 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27352 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27353 Make sure it is a power of 2. */
27354 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
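/* E.g. with size_needed == 16 and desired_align - align == 12, the MAX
   above gives 15, which the rounding just performed turns into 16
   (floor_log2 (15) == 3, so 1 << 4 == 16).  */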
27356 /* To improve performance of small blocks, we jump around the VAL
27357 promoting code. This means that if the promoted VAL is not constant,
27358 we might not use it in the epilogue and have to use the byte
27359 loop variant. */
27360 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27361 force_loopy_epilogue = true;
27362 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27363 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27365 /* If the main algorithm works on QImode, no epilogue is needed.
27366 For small sizes, just don't align anything. */
27367 if (size_needed == 1)
27368 desired_align = align;
27369 else
27370 goto epilogue;
27372 else if (!count
27373 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27375 label = gen_label_rtx ();
27376 emit_cmp_and_jump_insns (count_exp,
27377 GEN_INT (epilogue_size_needed),
27378 LTU, 0, counter_mode (count_exp), 1, label);
27379 if (expected_size == -1 || expected_size < epilogue_size_needed)
27380 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27381 else
27382 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27386 /* Emit code to decide at runtime whether a library call or inline code should
27387 be used. */
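/* When DYNAMIC_CHECK is not -1 it is the size threshold chosen by decide_alg:
   blocks of DYNAMIC_CHECK bytes or more are handed to the library call, while
   smaller blocks fall through to the inline expansion below.  */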
27388 if (dynamic_check != -1)
27390 if (!issetmem && CONST_INT_P (count_exp))
27392 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27394 emit_block_copy_via_libcall (dst, src, count_exp);
27395 count_exp = const0_rtx;
27396 goto epilogue;
27399 else
27401 rtx_code_label *hot_label = gen_label_rtx ();
27402 if (jump_around_label == NULL_RTX)
27403 jump_around_label = gen_label_rtx ();
27404 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27405 LEU, 0, counter_mode (count_exp),
27406 1, hot_label);
27407 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27408 if (issetmem)
27409 set_storage_via_libcall (dst, count_exp, val_exp);
27410 else
27411 emit_block_copy_via_libcall (dst, src, count_exp);
27412 emit_jump (jump_around_label);
27413 emit_label (hot_label);
27417 /* Step 2: Alignment prologue. */
27418 /* Do the expensive promotion once we branched off the small blocks. */
27419 if (issetmem && !promoted_val)
27420 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27421 desired_align, align);
27423 if (desired_align > align && !misaligned_prologue_used)
27425 if (align_bytes == 0)
27427 /* Except for the first move in the prologue, we no longer know
27428 the constant offset in the aliasing info. It does not seem worth
27429 the pain to maintain it for the first move, so throw away
27430 the info early. */
27431 dst = change_address (dst, BLKmode, destreg);
27432 if (!issetmem)
27433 src = change_address (src, BLKmode, srcreg);
27434 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27435 promoted_val, vec_promoted_val,
27436 count_exp, align, desired_align,
27437 issetmem);
27438 /* At most desired_align - align bytes are copied. */
27439 if (min_size < (unsigned)(desired_align - align))
27440 min_size = 0;
27441 else
27442 min_size -= desired_align - align;
27444 else
27446 /* If we know how many bytes need to be stored before dst is
27447 sufficiently aligned, maintain aliasing info accurately. */
27448 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27449 srcreg,
27450 promoted_val,
27451 vec_promoted_val,
27452 desired_align,
27453 align_bytes,
27454 issetmem);
27456 count_exp = plus_constant (counter_mode (count_exp),
27457 count_exp, -align_bytes);
27458 count -= align_bytes;
27459 min_size -= align_bytes;
27460 max_size -= align_bytes;
27462 if (need_zero_guard
27463 && min_size < (unsigned HOST_WIDE_INT) size_needed
27464 && (count < (unsigned HOST_WIDE_INT) size_needed
27465 || (align_bytes == 0
27466 && count < ((unsigned HOST_WIDE_INT) size_needed
27467 + desired_align - align))))
27469 /* It is possible that we copied enough so the main loop will not
27470 execute. */
27471 gcc_assert (size_needed > 1);
27472 if (label == NULL_RTX)
27473 label = gen_label_rtx ();
27474 emit_cmp_and_jump_insns (count_exp,
27475 GEN_INT (size_needed),
27476 LTU, 0, counter_mode (count_exp), 1, label);
27477 if (expected_size == -1
27478 || expected_size < (desired_align - align) / 2 + size_needed)
27479 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27480 else
27481 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27484 if (label && size_needed == 1)
27486 emit_label (label);
27487 LABEL_NUSES (label) = 1;
27488 label = NULL;
27489 epilogue_size_needed = 1;
27490 if (issetmem)
27491 promoted_val = val_exp;
27493 else if (label == NULL_RTX && !misaligned_prologue_used)
27494 epilogue_size_needed = size_needed;
27496 /* Step 3: Main loop. */
27498 switch (alg)
27500 case libcall:
27501 case no_stringop:
27502 case last_alg:
27503 gcc_unreachable ();
27504 case loop_1_byte:
27505 case loop:
27506 case unrolled_loop:
27507 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27508 count_exp, move_mode, unroll_factor,
27509 expected_size, issetmem);
27510 break;
27511 case vector_loop:
27512 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27513 vec_promoted_val, count_exp, move_mode,
27514 unroll_factor, expected_size, issetmem);
27515 break;
27516 case rep_prefix_8_byte:
27517 case rep_prefix_4_byte:
27518 case rep_prefix_1_byte:
27519 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27520 val_exp, count_exp, move_mode, issetmem);
27521 break;
27523 /* Properly adjust the offset of the src and dest memory for aliasing. */
27524 if (CONST_INT_P (count_exp))
27526 if (!issetmem)
27527 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27528 (count / size_needed) * size_needed);
27529 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27530 (count / size_needed) * size_needed);
27532 else
27534 if (!issetmem)
27535 src = change_address (src, BLKmode, srcreg);
27536 dst = change_address (dst, BLKmode, destreg);
27539 /* Step 4: Epilogue to copy the remaining bytes. */
27540 epilogue:
27541 if (label)
27543 /* When the main loop is done, COUNT_EXP might hold the original count,
27544 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27545 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27546 bytes. Compensate if needed. */
27548 if (size_needed < epilogue_size_needed)
27550 tmp =
27551 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27552 GEN_INT (size_needed - 1), count_exp, 1,
27553 OPTAB_DIRECT);
27554 if (tmp != count_exp)
27555 emit_move_insn (count_exp, tmp);
27557 emit_label (label);
27558 LABEL_NUSES (label) = 1;
27561 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27563 if (force_loopy_epilogue)
27564 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27565 epilogue_size_needed);
27566 else
27568 if (issetmem)
27569 expand_setmem_epilogue (dst, destreg, promoted_val,
27570 vec_promoted_val, count_exp,
27571 epilogue_size_needed);
27572 else
27573 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27574 epilogue_size_needed);
27577 if (jump_around_label)
27578 emit_label (jump_around_label);
27579 return true;
27583 /* Expand the appropriate insns for doing strlen if not just doing
27584 repnz; scasb
27586 out = result, initialized with the start address
27587 align_rtx = alignment of the address.
27588 scratch = scratch register, initialized with the start address when
27589 not aligned, otherwise undefined
27591 This is just the body. It needs the initializations mentioned above and
27592 some address computation at the end. These things are done in i386.md. */
27594 static void
27595 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27597 int align;
27598 rtx tmp;
27599 rtx_code_label *align_2_label = NULL;
27600 rtx_code_label *align_3_label = NULL;
27601 rtx_code_label *align_4_label = gen_label_rtx ();
27602 rtx_code_label *end_0_label = gen_label_rtx ();
27603 rtx mem;
27604 rtx tmpreg = gen_reg_rtx (SImode);
27605 rtx scratch = gen_reg_rtx (SImode);
27606 rtx cmp;
27608 align = 0;
27609 if (CONST_INT_P (align_rtx))
27610 align = INTVAL (align_rtx);
27612 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27614 /* Is there a known alignment and is it less than 4? */
27615 if (align < 4)
27617 rtx scratch1 = gen_reg_rtx (Pmode);
27618 emit_move_insn (scratch1, out);
27619 /* Is there a known alignment and is it not 2? */
27620 if (align != 2)
27622 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27623 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27625 /* Leave just the two low bits (the address modulo 4). */
27626 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27627 NULL_RTX, 0, OPTAB_WIDEN);
27629 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27630 Pmode, 1, align_4_label);
27631 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27632 Pmode, 1, align_2_label);
27633 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27634 Pmode, 1, align_3_label);
27636 else
27638 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27639 check whether it is aligned to a 4-byte boundary. */
27641 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27642 NULL_RTX, 0, OPTAB_WIDEN);
27644 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27645 Pmode, 1, align_4_label);
27648 mem = change_address (src, QImode, out);
27650 /* Now compare the bytes. */
27652 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
27653 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27654 QImode, 1, end_0_label);
27656 /* Increment the address. */
27657 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27659 /* Not needed with an alignment of 2 */
27660 if (align != 2)
27662 emit_label (align_2_label);
27664 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27665 end_0_label);
27667 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27669 emit_label (align_3_label);
27672 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27673 end_0_label);
27675 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27678 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27679 align this loop: it only makes programs larger and does not help
27680 speed them up. */
27681 emit_label (align_4_label);
27683 mem = change_address (src, SImode, out);
27684 emit_move_insn (scratch, mem);
27685 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27687 /* This formula yields a nonzero result iff one of the bytes is zero.
27688 This saves three branches inside the loop and many cycles. */
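/* Concretely, the insns below compute
     (x - 0x01010101) & ~x & 0x80808080
   which has the 0x80 bit set in the lane of the least significant zero
   byte of X (lanes above a zero byte may also be flagged because of
   borrow propagation, which is harmless since we only look for the
   first zero).  E.g. x == 0x61006263 yields 0x00800000.  */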
27690 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27691 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27692 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27693 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27694 gen_int_mode (0x80808080, SImode)));
27695 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27696 align_4_label);
27698 if (TARGET_CMOVE)
27700 rtx reg = gen_reg_rtx (SImode);
27701 rtx reg2 = gen_reg_rtx (Pmode);
27702 emit_move_insn (reg, tmpreg);
27703 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27705 /* If zero is not in the first two bytes, move two bytes forward. */
27706 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27707 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27708 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27709 emit_insn (gen_rtx_SET (tmpreg,
27710 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27711 reg,
27712 tmpreg)));
27713 /* Emit lea manually to avoid clobbering of flags. */
27714 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27716 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27717 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27718 emit_insn (gen_rtx_SET (out,
27719 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27720 reg2,
27721 out)));
27723 else
27725 rtx_code_label *end_2_label = gen_label_rtx ();
27726 /* Is zero in the first two bytes? */
27728 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27729 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27730 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27731 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27732 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27733 pc_rtx);
27734 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27735 JUMP_LABEL (tmp) = end_2_label;
27737 /* Not in the first two. Move two bytes forward. */
27738 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27739 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27741 emit_label (end_2_label);
27745 /* Avoid a branch in fixing the byte. */
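/* Whichever path above was taken, the zero byte is now at OUT - 4 or
   OUT - 3, and the low byte of TMPREG is 0x80 in the former case and
   0x00 in the latter.  Adding that byte to itself sets the carry flag
   exactly when it is 0x80, so the sbb below subtracts either 4 or 3
   and leaves OUT pointing at the zero byte.  */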
27746 tmpreg = gen_lowpart (QImode, tmpreg);
27747 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27748 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27749 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27750 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27752 emit_label (end_0_label);
27755 /* Expand strlen. */
27757 bool
27758 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27760 rtx addr, scratch1, scratch2, scratch3, scratch4;
27762 /* The generic case of the strlen expander is long. Avoid expanding it
27763 unless TARGET_INLINE_ALL_STRINGOPS. */
27765 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27766 && !TARGET_INLINE_ALL_STRINGOPS
27767 && !optimize_insn_for_size_p ()
27768 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27769 return false;
27771 addr = force_reg (Pmode, XEXP (src, 0));
27772 scratch1 = gen_reg_rtx (Pmode);
27774 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27775 && !optimize_insn_for_size_p ())
27777 /* Well, it seems that some optimizer does not combine a call like
27778 foo(strlen(bar), strlen(bar));
27779 when the move and the subtraction are done here. It does calculate
27780 the length just once when these instructions are done inside of
27781 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
27782 often used and I use one fewer register for the lifetime of
27783 output_strlen_unroll() this is better. */
27785 emit_move_insn (out, addr);
27787 ix86_expand_strlensi_unroll_1 (out, src, align);
27789 /* strlensi_unroll_1 returns the address of the zero at the end of
27790 the string, like memchr(), so compute the length by subtracting
27791 the start address. */
27792 emit_insn (ix86_gen_sub3 (out, out, addr));
27794 else
27796 rtx unspec;
27798 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27799 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27800 return false;
27801 /* Can't use this for non-default address spaces. */
27802 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27803 return false;
27805 scratch2 = gen_reg_rtx (Pmode);
27806 scratch3 = gen_reg_rtx (Pmode);
27807 scratch4 = force_reg (Pmode, constm1_rtx);
27809 emit_move_insn (scratch3, addr);
27810 eoschar = force_reg (QImode, eoschar);
27812 src = replace_equiv_address_nv (src, scratch3);
27814 /* If .md starts supporting :P, this can be done in .md. */
27815 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27816 scratch4), UNSPEC_SCAS);
27817 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27818 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27819 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
27821 return true;
27824 /* For a given symbol (function), construct code to compute the address of its
27825 PLT entry in the large x86-64 PIC model. */
27826 static rtx
27827 construct_plt_address (rtx symbol)
27829 rtx tmp, unspec;
27831 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
27832 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
27833 gcc_assert (Pmode == DImode);
27835 tmp = gen_reg_rtx (Pmode);
27836 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
27838 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
27839 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
27840 return tmp;
27844 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
27845 rtx callarg2,
27846 rtx pop, bool sibcall)
27848 rtx vec[3];
27849 rtx use = NULL, call;
27850 unsigned int vec_len = 0;
27851 tree fndecl;
27853 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27855 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
27856 if (fndecl
27857 && (lookup_attribute ("interrupt",
27858 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
27859 error ("interrupt service routine can't be called directly");
27861 else
27862 fndecl = NULL_TREE;
27864 if (pop == const0_rtx)
27865 pop = NULL;
27866 gcc_assert (!TARGET_64BIT || !pop);
27868 if (TARGET_MACHO && !TARGET_64BIT)
27870 #if TARGET_MACHO
27871 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27872 fnaddr = machopic_indirect_call_target (fnaddr);
27873 #endif
27875 else
27877 /* Static functions and indirect calls don't need the PIC register. Also,
27878 check whether the PLT was explicitly avoided via -fno-plt or the "noplt"
27879 attribute, making it an indirect call. */
27880 rtx addr = XEXP (fnaddr, 0);
27881 if (flag_pic
27882 && GET_CODE (addr) == SYMBOL_REF
27883 && !SYMBOL_REF_LOCAL_P (addr))
27885 if (flag_plt
27886 && (SYMBOL_REF_DECL (addr) == NULL_TREE
27887 || !lookup_attribute ("noplt",
27888 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
27890 if (!TARGET_64BIT
27891 || (ix86_cmodel == CM_LARGE_PIC
27892 && DEFAULT_ABI != MS_ABI))
27894 use_reg (&use, gen_rtx_REG (Pmode,
27895 REAL_PIC_OFFSET_TABLE_REGNUM));
27896 if (ix86_use_pseudo_pic_reg ())
27897 emit_move_insn (gen_rtx_REG (Pmode,
27898 REAL_PIC_OFFSET_TABLE_REGNUM),
27899 pic_offset_table_rtx);
27902 else if (!TARGET_PECOFF && !TARGET_MACHO)
27904 if (TARGET_64BIT)
27906 fnaddr = gen_rtx_UNSPEC (Pmode,
27907 gen_rtvec (1, addr),
27908 UNSPEC_GOTPCREL);
27909 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27911 else
27913 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
27914 UNSPEC_GOT);
27915 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27916 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
27917 fnaddr);
27919 fnaddr = gen_const_mem (Pmode, fnaddr);
27920 /* Pmode may not be the same as word_mode for x32, which
27921 doesn't support indirect branch via 32-bit memory slot.
27922 Since x32 GOT slot is 64 bit with zero upper 32 bits,
27923 indirect branch via x32 GOT slot is OK. */
27924 if (GET_MODE (fnaddr) != word_mode)
27925 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
27926 fnaddr = gen_rtx_MEM (QImode, fnaddr);
27931 /* Skip setting up RAX register for -mskip-rax-setup when there are no
27932 parameters passed in vector registers. */
27933 if (TARGET_64BIT
27934 && (INTVAL (callarg2) > 0
27935 || (INTVAL (callarg2) == 0
27936 && (TARGET_SSE || !flag_skip_rax_setup))))
27938 rtx al = gen_rtx_REG (QImode, AX_REG);
27939 emit_move_insn (al, callarg2);
27940 use_reg (&use, al);
27943 if (ix86_cmodel == CM_LARGE_PIC
27944 && !TARGET_PECOFF
27945 && MEM_P (fnaddr)
27946 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
27947 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
27948 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
27949 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
27950 branch via x32 GOT slot is OK. */
27951 else if (!(TARGET_X32
27952 && MEM_P (fnaddr)
27953 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
27954 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
27955 && (sibcall
27956 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
27957 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
27959 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
27960 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
27963 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
27965 if (retval)
27967 /* We should add bounds as a destination register in case
27968 a pointer with bounds may be returned. */
27969 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
27971 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
27972 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
27973 if (GET_CODE (retval) == PARALLEL)
27975 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
27976 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
27977 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
27978 retval = chkp_join_splitted_slot (retval, par);
27980 else
27982 retval = gen_rtx_PARALLEL (VOIDmode,
27983 gen_rtvec (3, retval, b0, b1));
27984 chkp_put_regs_to_expr_list (retval);
27988 call = gen_rtx_SET (retval, call);
27990 vec[vec_len++] = call;
27992 if (pop)
27994 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
27995 pop = gen_rtx_SET (stack_pointer_rtx, pop);
27996 vec[vec_len++] = pop;
27999 if (cfun->machine->no_caller_saved_registers
28000 && (!fndecl
28001 || (!TREE_THIS_VOLATILE (fndecl)
28002 && !lookup_attribute ("no_caller_saved_registers",
28003 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28005 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28006 bool is_64bit_ms_abi = (TARGET_64BIT
28007 && ix86_function_abi (fndecl) == MS_ABI);
28008 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28010 /* If there are no caller-saved registers, add all registers
28011 that are clobbered by the call which returns. */
28012 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28013 if (!fixed_regs[i]
28014 && (ix86_call_used_regs[i] == 1
28015 || (ix86_call_used_regs[i] & c_mask))
28016 && !STACK_REGNO_P (i)
28017 && !MMX_REGNO_P (i))
28018 clobber_reg (&use,
28019 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28021 else if (TARGET_64BIT_MS_ABI
28022 && (!callarg2 || INTVAL (callarg2) != -2))
28024 unsigned i;
28026 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28028 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28029 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28031 clobber_reg (&use, gen_rtx_REG (mode, regno));
28034 /* Set here, but it may get cleared later. */
28035 if (TARGET_CALL_MS2SYSV_XLOGUES)
28037 if (!TARGET_SSE)
28040 /* Don't break hot-patched functions. */
28041 else if (ix86_function_ms_hook_prologue (current_function_decl))
28044 /* TODO: Cases not yet examined. */
28045 else if (flag_split_stack)
28046 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28048 else
28050 gcc_assert (!reload_completed);
28051 cfun->machine->call_ms2sysv = true;
28056 if (vec_len > 1)
28057 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28058 call = emit_call_insn (call);
28059 if (use)
28060 CALL_INSN_FUNCTION_USAGE (call) = use;
28062 return call;
28065 /* Return true if the function being called was marked with attribute
28066 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28067 to handle the non-PIC case in the backend because there is no easy
28068 interface for the front-end to force non-PLT calls to use the GOT.
28069 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28070 to call the function marked "noplt" indirectly. */
28072 static bool
28073 ix86_nopic_noplt_attribute_p (rtx call_op)
28075 if (flag_pic || ix86_cmodel == CM_LARGE
28076 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28077 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28078 || SYMBOL_REF_LOCAL_P (call_op))
28079 return false;
28081 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28083 if (!flag_plt
28084 || (symbol_decl != NULL_TREE
28085 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28086 return true;
28088 return false;
28091 /* Output the assembly for a call instruction. */
28093 const char *
28094 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28096 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28097 bool seh_nop_p = false;
28098 const char *xasm;
28100 if (SIBLING_CALL_P (insn))
28102 if (direct_p)
28104 if (ix86_nopic_noplt_attribute_p (call_op))
28106 if (TARGET_64BIT)
28107 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28108 else
28109 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28111 else
28112 xasm = "%!jmp\t%P0";
28114 /* SEH epilogue detection requires the indirect branch case
28115 to include REX.W. */
28116 else if (TARGET_SEH)
28117 xasm = "%!rex.W jmp\t%A0";
28118 else
28119 xasm = "%!jmp\t%A0";
28121 output_asm_insn (xasm, &call_op);
28122 return "";
28125 /* SEH unwinding can require an extra nop to be emitted in several
28126 circumstances. Determine if we have one of those. */
28127 if (TARGET_SEH)
28129 rtx_insn *i;
28131 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28133 /* If we get to another real insn, we don't need the nop. */
28134 if (INSN_P (i))
28135 break;
28137 /* If we get to the epilogue note, prevent a catch region from
28138 being adjacent to the standard epilogue sequence. If non-
28139 call-exceptions, we'll have done this during epilogue emission. */
28140 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28141 && !flag_non_call_exceptions
28142 && !can_throw_internal (insn))
28144 seh_nop_p = true;
28145 break;
28149 /* If we didn't find a real insn following the call, prevent the
28150 unwinder from looking into the next function. */
28151 if (i == NULL)
28152 seh_nop_p = true;
28155 if (direct_p)
28157 if (ix86_nopic_noplt_attribute_p (call_op))
28159 if (TARGET_64BIT)
28160 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28161 else
28162 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28164 else
28165 xasm = "%!call\t%P0";
28167 else
28168 xasm = "%!call\t%A0";
28170 output_asm_insn (xasm, &call_op);
28172 if (seh_nop_p)
28173 return "nop";
28175 return "";
28178 /* Clear stack slot assignments remembered from previous functions.
28179 This is called from INIT_EXPANDERS once before RTL is emitted for each
28180 function. */
28182 static struct machine_function *
28183 ix86_init_machine_status (void)
28185 struct machine_function *f;
28187 f = ggc_cleared_alloc<machine_function> ();
28188 f->call_abi = ix86_abi;
28190 return f;
28193 /* Return a MEM corresponding to a stack slot with mode MODE.
28194 Allocate a new slot if necessary.
28196 The RTL for a function can have several slots available: N is
28197 which slot to use. */
28200 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28202 struct stack_local_entry *s;
28204 gcc_assert (n < MAX_386_STACK_LOCALS);
28206 for (s = ix86_stack_locals; s; s = s->next)
28207 if (s->mode == mode && s->n == n)
28208 return validize_mem (copy_rtx (s->rtl));
28210 s = ggc_alloc<stack_local_entry> ();
28211 s->n = n;
28212 s->mode = mode;
28213 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28215 s->next = ix86_stack_locals;
28216 ix86_stack_locals = s;
28217 return validize_mem (copy_rtx (s->rtl));
28220 static void
28221 ix86_instantiate_decls (void)
28223 struct stack_local_entry *s;
28225 for (s = ix86_stack_locals; s; s = s->next)
28226 if (s->rtl != NULL_RTX)
28227 instantiate_decl_rtl (s->rtl);
28230 /* Return the number used for encoding REG, in the range 0..7. */
28232 static int
28233 reg_encoded_number (rtx reg)
28235 unsigned regno = REGNO (reg);
28236 switch (regno)
28238 case AX_REG:
28239 return 0;
28240 case CX_REG:
28241 return 1;
28242 case DX_REG:
28243 return 2;
28244 case BX_REG:
28245 return 3;
28246 case SP_REG:
28247 return 4;
28248 case BP_REG:
28249 return 5;
28250 case SI_REG:
28251 return 6;
28252 case DI_REG:
28253 return 7;
28254 default:
28255 break;
28257 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28258 return regno - FIRST_STACK_REG;
28259 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28260 return regno - FIRST_SSE_REG;
28261 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28262 return regno - FIRST_MMX_REG;
28263 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28264 return regno - FIRST_REX_SSE_REG;
28265 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28266 return regno - FIRST_REX_INT_REG;
28267 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28268 return regno - FIRST_MASK_REG;
28269 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28270 return regno - FIRST_BND_REG;
28271 return -1;
28274 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28275 in its encoding if it could be relevant for ROP mitigation, otherwise
28276 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28277 used for calculating it into them. */
28279 static int
28280 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28281 int *popno0 = 0, int *popno1 = 0)
28283 if (asm_noperands (PATTERN (insn)) >= 0)
28284 return -1;
28285 int has_modrm = get_attr_modrm (insn);
28286 if (!has_modrm)
28287 return -1;
28288 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28289 rtx op0, op1;
28290 switch (cls)
28292 case MODRM_CLASS_OP02:
28293 gcc_assert (noperands >= 3);
28294 if (popno0)
28296 *popno0 = 0;
28297 *popno1 = 2;
28299 op0 = operands[0];
28300 op1 = operands[2];
28301 break;
28302 case MODRM_CLASS_OP01:
28303 gcc_assert (noperands >= 2);
28304 if (popno0)
28306 *popno0 = 0;
28307 *popno1 = 1;
28309 op0 = operands[0];
28310 op1 = operands[1];
28311 break;
28312 default:
28313 return -1;
28315 if (REG_P (op0) && REG_P (op1))
28317 int enc0 = reg_encoded_number (op0);
28318 int enc1 = reg_encoded_number (op1);
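/* Register-to-register forms use mod == 3 (the 0xc0 in the top two bits),
   with OP1's number in the reg field (bits 5..3) and OP0's in the r/m
   field (bits 2..0).  */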
28319 return 0xc0 + (enc1 << 3) + enc0;
28321 return -1;
28324 /* Check whether x86 address PARTS is a pc-relative address. */
28326 bool
28327 ix86_rip_relative_addr_p (struct ix86_address *parts)
28329 rtx base, index, disp;
28331 base = parts->base;
28332 index = parts->index;
28333 disp = parts->disp;
28335 if (disp && !base && !index)
28337 if (TARGET_64BIT)
28339 rtx symbol = disp;
28341 if (GET_CODE (disp) == CONST)
28342 symbol = XEXP (disp, 0);
28343 if (GET_CODE (symbol) == PLUS
28344 && CONST_INT_P (XEXP (symbol, 1)))
28345 symbol = XEXP (symbol, 0);
28347 if (GET_CODE (symbol) == LABEL_REF
28348 || (GET_CODE (symbol) == SYMBOL_REF
28349 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28350 || (GET_CODE (symbol) == UNSPEC
28351 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28352 || XINT (symbol, 1) == UNSPEC_PCREL
28353 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28354 return true;
28357 return false;
28360 /* Calculate the length of the memory address in the instruction encoding.
28361 This includes the addr32 prefix but not the one-byte modrm, opcode,
28362 or other prefixes. We never generate the addr32 prefix for LEA insns. */
28365 memory_address_length (rtx addr, bool lea)
28367 struct ix86_address parts;
28368 rtx base, index, disp;
28369 int len;
28370 int ok;
28372 if (GET_CODE (addr) == PRE_DEC
28373 || GET_CODE (addr) == POST_INC
28374 || GET_CODE (addr) == PRE_MODIFY
28375 || GET_CODE (addr) == POST_MODIFY)
28376 return 0;
28378 ok = ix86_decompose_address (addr, &parts);
28379 gcc_assert (ok);
28381 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28383 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
28384 if (TARGET_64BIT && !lea
28385 && (SImode_address_operand (addr, VOIDmode)
28386 || (parts.base && GET_MODE (parts.base) == SImode)
28387 || (parts.index && GET_MODE (parts.index) == SImode)))
28388 len++;
28390 base = parts.base;
28391 index = parts.index;
28392 disp = parts.disp;
28394 if (base && SUBREG_P (base))
28395 base = SUBREG_REG (base);
28396 if (index && SUBREG_P (index))
28397 index = SUBREG_REG (index);
28399 gcc_assert (base == NULL_RTX || REG_P (base));
28400 gcc_assert (index == NULL_RTX || REG_P (index));
28402 /* Rule of thumb:
28403 - esp as the base always wants an index,
28404 - ebp as the base always wants a displacement,
28405 - r12 as the base always wants an index,
28406 - r13 as the base always wants a displacement. */
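/* E.g. a plain (%esp) operand cannot be encoded directly because r/m == 4
   selects a SIB byte, and a plain (%ebp) operand has to be encoded as
   0(%ebp) with a one-byte displacement, since mod == 0, r/m == 5 means
   disp32 (or disp32(%rip) in 64-bit mode); either way one extra byte.  */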
28408 /* Register Indirect. */
28409 if (base && !index && !disp)
28411 /* esp (for its index) and ebp (for its displacement) need
28412 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28413 code. */
28414 if (base == arg_pointer_rtx
28415 || base == frame_pointer_rtx
28416 || REGNO (base) == SP_REG
28417 || REGNO (base) == BP_REG
28418 || REGNO (base) == R12_REG
28419 || REGNO (base) == R13_REG)
28420 len++;
28423 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28424 is not disp32, but disp32(%rip), so for a plain disp32 a
28425 SIB byte is needed, unless print_operand_address
28426 optimizes it into disp32(%rip) or (%rip) is implied
28427 by an UNSPEC. */
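/* E.g. an absolute "movl foo, %eax" in 64-bit code needs mod == 0,
   r/m == 4 plus a SIB byte encoding "no base, no index, disp32", while
   the %rip-relative form needs only the 4-byte displacement.  */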
28428 else if (disp && !base && !index)
28430 len += 4;
28431 if (!ix86_rip_relative_addr_p (&parts))
28432 len++;
28434 else
28436 /* Find the length of the displacement constant. */
28437 if (disp)
28439 if (base && satisfies_constraint_K (disp))
28440 len += 1;
28441 else
28442 len += 4;
28444 /* ebp always wants a displacement. Similarly r13. */
28445 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28446 len++;
28448 /* An index requires the two-byte modrm form.... */
28449 if (index
28450 /* ...like esp (or r12), which always wants an index. */
28451 || base == arg_pointer_rtx
28452 || base == frame_pointer_rtx
28453 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28454 len++;
28457 return len;
28460 /* Compute the default value for the "length_immediate" attribute. When SHORTFORM
28461 is set, expect the insn to have an 8-bit immediate alternative. */
28463 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28465 int len = 0;
28466 int i;
28467 extract_insn_cached (insn);
28468 for (i = recog_data.n_operands - 1; i >= 0; --i)
28469 if (CONSTANT_P (recog_data.operand[i]))
28471 enum attr_mode mode = get_attr_mode (insn);
28473 gcc_assert (!len);
28474 if (shortform && CONST_INT_P (recog_data.operand[i]))
28476 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28477 switch (mode)
28479 case MODE_QI:
28480 len = 1;
28481 continue;
28482 case MODE_HI:
28483 ival = trunc_int_for_mode (ival, HImode);
28484 break;
28485 case MODE_SI:
28486 ival = trunc_int_for_mode (ival, SImode);
28487 break;
28488 default:
28489 break;
28491 if (IN_RANGE (ival, -128, 127))
28493 len = 1;
28494 continue;
28497 switch (mode)
28499 case MODE_QI:
28500 len = 1;
28501 break;
28502 case MODE_HI:
28503 len = 2;
28504 break;
28505 case MODE_SI:
28506 len = 4;
28507 break;
28508 /* Immediates for DImode instructions are encoded
28509 as 32-bit sign-extended values. */
28510 case MODE_DI:
28511 len = 4;
28512 break;
28513 default:
28514 fatal_insn ("unknown insn mode", insn);
28517 return len;
28520 /* Compute the default value for the "length_address" attribute. */
28522 ix86_attr_length_address_default (rtx_insn *insn)
28524 int i;
28526 if (get_attr_type (insn) == TYPE_LEA)
28528 rtx set = PATTERN (insn), addr;
28530 if (GET_CODE (set) == PARALLEL)
28531 set = XVECEXP (set, 0, 0);
28533 gcc_assert (GET_CODE (set) == SET);
28535 addr = SET_SRC (set);
28537 return memory_address_length (addr, true);
28540 extract_insn_cached (insn);
28541 for (i = recog_data.n_operands - 1; i >= 0; --i)
28543 rtx op = recog_data.operand[i];
28544 if (MEM_P (op))
28546 constrain_operands_cached (insn, reload_completed);
28547 if (which_alternative != -1)
28549 const char *constraints = recog_data.constraints[i];
28550 int alt = which_alternative;
28552 while (*constraints == '=' || *constraints == '+')
28553 constraints++;
28554 while (alt-- > 0)
28555 while (*constraints++ != ',')
28557 /* Skip ignored operands. */
28558 if (*constraints == 'X')
28559 continue;
28562 int len = memory_address_length (XEXP (op, 0), false);
28564 /* Account for segment prefix for non-default addr spaces. */
28565 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28566 len++;
28568 return len;
28571 return 0;
28574 /* Compute the default value for the "length_vex" attribute. It includes
28575 the 2- or 3-byte VEX prefix and 1 opcode byte. */
28578 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28579 bool has_vex_w)
28581 int i;
28583 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX W bit
28584 requires the 3-byte VEX prefix. */
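/* The 2-byte (0xc5) VEX prefix can only select the 0f opcode map and has
   no W, X or B bits; anything needing those must use the 3-byte (0xc4)
   form.  */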
28585 if (!has_0f_opcode || has_vex_w)
28586 return 3 + 1;
28588 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
28589 if (!TARGET_64BIT)
28590 return 2 + 1;
28592 extract_insn_cached (insn);
28594 for (i = recog_data.n_operands - 1; i >= 0; --i)
28595 if (REG_P (recog_data.operand[i]))
28597 /* REX.W bit uses 3 byte VEX prefix. */
28598 if (GET_MODE (recog_data.operand[i]) == DImode
28599 && GENERAL_REG_P (recog_data.operand[i]))
28600 return 3 + 1;
28602 else
28604 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28605 if (MEM_P (recog_data.operand[i])
28606 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28607 return 3 + 1;
28610 return 2 + 1;
28614 static bool
28615 ix86_class_likely_spilled_p (reg_class_t);
28617 /* Return true if the lhs of INSN is a HW function argument register, and set
28618 IS_SPILLED to true if it is a likely-spilled HW register. */
28619 static bool
28620 insn_is_function_arg (rtx insn, bool* is_spilled)
28622 rtx dst;
28624 if (!NONDEBUG_INSN_P (insn))
28625 return false;
28626 /* Call instructions are not movable; ignore them. */
28627 if (CALL_P (insn))
28628 return false;
28629 insn = PATTERN (insn);
28630 if (GET_CODE (insn) == PARALLEL)
28631 insn = XVECEXP (insn, 0, 0);
28632 if (GET_CODE (insn) != SET)
28633 return false;
28634 dst = SET_DEST (insn);
28635 if (REG_P (dst) && HARD_REGISTER_P (dst)
28636 && ix86_function_arg_regno_p (REGNO (dst)))
28638 /* Is it a likely-spilled HW register? */
28639 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
28640 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
28641 *is_spilled = true;
28642 return true;
28644 return false;
28647 /* Add output dependencies for a chain of adjacent function arguments, but only
28648 if there is a move to a likely-spilled HW register. Return the first argument
28649 if at least one dependence was added, or NULL otherwise. */
28650 static rtx_insn *
28651 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
28653 rtx_insn *insn;
28654 rtx_insn *last = call;
28655 rtx_insn *first_arg = NULL;
28656 bool is_spilled = false;
28658 head = PREV_INSN (head);
28660 /* Find the argument-passing instruction nearest to the call. */
28661 while (true)
28663 last = PREV_INSN (last);
28664 if (last == head)
28665 return NULL;
28666 if (!NONDEBUG_INSN_P (last))
28667 continue;
28668 if (insn_is_function_arg (last, &is_spilled))
28669 break;
28670 return NULL;
28673 first_arg = last;
28674 while (true)
28676 insn = PREV_INSN (last);
28677 if (!INSN_P (insn))
28678 break;
28679 if (insn == head)
28680 break;
28681 if (!NONDEBUG_INSN_P (insn))
28683 last = insn;
28684 continue;
28686 if (insn_is_function_arg (insn, &is_spilled))
28688 /* Add an output dependence between two function arguments if the chain
28689 of output arguments contains likely-spilled HW registers. */
28690 if (is_spilled)
28691 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28692 first_arg = last = insn;
28694 else
28695 break;
28697 if (!is_spilled)
28698 return NULL;
28699 return first_arg;
28702 /* Add an output or anti dependency from INSN to FIRST_ARG to restrict its
28703 code motion. */
28704 static void
28705 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
28707 rtx set;
28708 rtx tmp;
28710 /* Add anti dependencies for bounds stores. */
28711 if (INSN_P (insn)
28712 && GET_CODE (PATTERN (insn)) == PARALLEL
28713 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
28714 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
28716 add_dependence (first_arg, insn, REG_DEP_ANTI);
28717 return;
28720 set = single_set (insn);
28721 if (!set)
28722 return;
28723 tmp = SET_DEST (set);
28724 if (REG_P (tmp))
28726 /* Add output dependency to the first function argument. */
28727 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28728 return;
28730 /* Add anti dependency. */
28731 add_dependence (first_arg, insn, REG_DEP_ANTI);
28734 /* Avoid cross-block motion of a function argument by adding a dependency
28735 from the first non-jump instruction in BB. */
28736 static void
28737 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
28739 rtx_insn *insn = BB_END (bb);
28741 while (insn)
28743 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
28745 rtx set = single_set (insn);
28746 if (set)
28748 avoid_func_arg_motion (arg, insn);
28749 return;
28752 if (insn == BB_HEAD (bb))
28753 return;
28754 insn = PREV_INSN (insn);
28758 /* Hook for the pre-reload schedule - avoid motion of function arguments
28759 passed in likely-spilled HW registers. */
28760 static void
28761 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
28763 rtx_insn *insn;
28764 rtx_insn *first_arg = NULL;
28765 if (reload_completed)
28766 return;
28767 while (head != tail && DEBUG_INSN_P (head))
28768 head = NEXT_INSN (head);
28769 for (insn = tail; insn != head; insn = PREV_INSN (insn))
28770 if (INSN_P (insn) && CALL_P (insn))
28772 first_arg = add_parameter_dependencies (insn, head);
28773 if (first_arg)
28775 /* Add a dependee for the first argument to predecessors, but only if the
28776 region contains more than one block. */
28777 basic_block bb = BLOCK_FOR_INSN (insn);
28778 int rgn = CONTAINING_RGN (bb->index);
28779 int nr_blks = RGN_NR_BLOCKS (rgn);
28780 /* Skip trivial regions and region head blocks that can have
28781 predecessors outside of the region. */
28782 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
28784 edge e;
28785 edge_iterator ei;
28787 /* Regions are SCCs with the exception of selective
28788 scheduling with pipelining of outer blocks enabled.
28789 So also check that immediate predecessors of a non-head
28790 block are in the same region. */
28791 FOR_EACH_EDGE (e, ei, bb->preds)
28793 /* Avoid creating loop-carried dependencies by using the
28794 topological ordering in the region. */
28795 if (rgn == CONTAINING_RGN (e->src->index)
28796 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
28797 add_dependee_for_func_arg (first_arg, e->src);
28800 insn = first_arg;
28801 if (insn == head)
28802 break;
28805 else if (first_arg)
28806 avoid_func_arg_motion (first_arg, insn);
28809 /* Hook for the pre-reload schedule - set the priority of moves from likely-spilled
28810 HW registers to the maximum, to schedule them as soon as possible. These are
28811 moves from function argument registers at the top of the function entry
28812 and moves from function return value registers after a call. */
28813 static int
28814 ix86_adjust_priority (rtx_insn *insn, int priority)
28816 rtx set;
28818 if (reload_completed)
28819 return priority;
28821 if (!NONDEBUG_INSN_P (insn))
28822 return priority;
28824 set = single_set (insn);
28825 if (set)
28827 rtx tmp = SET_SRC (set);
28828 if (REG_P (tmp)
28829 && HARD_REGISTER_P (tmp)
28830 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
28831 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
28832 return current_sched_info->sched_max_insns_priority;
28835 return priority;
28838 /* Prepare for scheduling pass. */
28839 static void
28840 ix86_sched_init_global (FILE *, int, int)
28842 /* Install scheduling hooks for current CPU. Some of these hooks are used
28843 in time-critical parts of the scheduler, so we only set them up when
28844 they are actually used. */
28845 switch (ix86_tune)
28847 case PROCESSOR_CORE2:
28848 case PROCESSOR_NEHALEM:
28849 case PROCESSOR_SANDYBRIDGE:
28850 case PROCESSOR_HASWELL:
28851 case PROCESSOR_GENERIC:
28852 /* Do not perform multipass scheduling for pre-reload schedule
28853 to save compile time. */
28854 if (reload_completed)
28856 ix86_core2i7_init_hooks ();
28857 break;
28859 /* Fall through. */
28860 default:
28861 targetm.sched.dfa_post_advance_cycle = NULL;
28862 targetm.sched.first_cycle_multipass_init = NULL;
28863 targetm.sched.first_cycle_multipass_begin = NULL;
28864 targetm.sched.first_cycle_multipass_issue = NULL;
28865 targetm.sched.first_cycle_multipass_backtrack = NULL;
28866 targetm.sched.first_cycle_multipass_end = NULL;
28867 targetm.sched.first_cycle_multipass_fini = NULL;
28868 break;
28873 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
28875 static HOST_WIDE_INT
28876 ix86_static_rtx_alignment (machine_mode mode)
28878 if (mode == DFmode)
28879 return 64;
28880 if (ALIGN_MODE_128 (mode))
28881 return MAX (128, GET_MODE_ALIGNMENT (mode));
28882 return GET_MODE_ALIGNMENT (mode);
28885 /* Implement TARGET_CONSTANT_ALIGNMENT. */
28887 static HOST_WIDE_INT
28888 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
28890 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
28891 || TREE_CODE (exp) == INTEGER_CST)
28893 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
28894 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
28895 return MAX (mode_align, align);
28897 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
28898 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
28899 return BITS_PER_WORD;
28901 return align;
28904 /* Implement TARGET_EMPTY_RECORD_P. */
28906 static bool
28907 ix86_is_empty_record (const_tree type)
28909 if (!TARGET_64BIT)
28910 return false;
28911 return default_is_empty_record (type);
28914 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
28916 static void
28917 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
28919 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
28921 if (!cum->warn_empty)
28922 return;
28924 if (!TYPE_EMPTY_P (type))
28925 return;
28927 const_tree ctx = get_ultimate_context (cum->decl);
28928 if (ctx != NULL_TREE
28929 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
28930 return;
28932 /* If the actual size of the type is zero, then there is no change
28933 in how objects of this size are passed. */
28934 if (int_size_in_bytes (type) == 0)
28935 return;
28937 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
28938 "changes in -fabi-version=12 (GCC 8)", type);
28940 /* Only warn once. */
28941 cum->warn_empty = false;
28944 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
28945 the data type, and ALIGN is the alignment that the object would
28946 ordinarily have. */
28948 static int
28949 iamcu_alignment (tree type, int align)
28951 machine_mode mode;
28953 if (align < 32 || TYPE_USER_ALIGN (type))
28954 return align;
28956 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
28957 bytes. */
28958 mode = TYPE_MODE (strip_array_types (type));
28959 switch (GET_MODE_CLASS (mode))
28961 case MODE_INT:
28962 case MODE_COMPLEX_INT:
28963 case MODE_COMPLEX_FLOAT:
28964 case MODE_FLOAT:
28965 case MODE_DECIMAL_FLOAT:
28966 return 32;
28967 default:
28968 return align;
28972 /* Compute the alignment for a static variable.
28973 TYPE is the data type, and ALIGN is the alignment that
28974 the object would ordinarily have. The value of this function is used
28975 instead of that alignment to align the object. */
28978 ix86_data_alignment (tree type, int align, bool opt)
28980 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
28981 for symbols from other compilation units or symbols that don't need
28982 to bind locally. In order to preserve some ABI compatibility with
28983 those compilers, ensure we don't decrease alignment from what we
28984 used to assume. */
28986 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
28988 /* A data structure equal to or greater than the size of a cache line
28989 (64 bytes in the Pentium 4 and other recent Intel processors, including
28990 processors based on the Intel Core microarchitecture) should be aligned
28991 so that its base address is a multiple of the cache line size. */
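/* (prefetch_block is measured in bytes while the alignments here are in
   bits, hence the multiplication by 8 below.)  */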
28993 int max_align
28994 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
28996 if (max_align < BITS_PER_WORD)
28997 max_align = BITS_PER_WORD;
28999 switch (ix86_align_data_type)
29001 case ix86_align_data_type_abi: opt = false; break;
29002 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
29003 case ix86_align_data_type_cacheline: break;
29006 if (TARGET_IAMCU)
29007 align = iamcu_alignment (type, align);
29009 if (opt
29010 && AGGREGATE_TYPE_P (type)
29011 && TYPE_SIZE (type)
29012 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
29014 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
29015 && align < max_align_compat)
29016 align = max_align_compat;
29017 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
29018 && align < max_align)
29019 align = max_align;
29022 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
29023 to a 16-byte boundary. */
29024 if (TARGET_64BIT)
29026 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
29027 && TYPE_SIZE (type)
29028 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29029 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29030 && align < 128)
29031 return 128;
29034 if (!opt)
29035 return align;
29037 if (TREE_CODE (type) == ARRAY_TYPE)
29039 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29040 return 64;
29041 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29042 return 128;
29044 else if (TREE_CODE (type) == COMPLEX_TYPE)
29047 if (TYPE_MODE (type) == DCmode && align < 64)
29048 return 64;
29049 if ((TYPE_MODE (type) == XCmode
29050 || TYPE_MODE (type) == TCmode) && align < 128)
29051 return 128;
29053 else if ((TREE_CODE (type) == RECORD_TYPE
29054 || TREE_CODE (type) == UNION_TYPE
29055 || TREE_CODE (type) == QUAL_UNION_TYPE)
29056 && TYPE_FIELDS (type))
29058 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29059 return 64;
29060 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29061 return 128;
29063 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29064 || TREE_CODE (type) == INTEGER_TYPE)
29066 if (TYPE_MODE (type) == DFmode && align < 64)
29067 return 64;
29068 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29069 return 128;
29072 return align;
29075 /* Compute the alignment for a local variable or a stack slot. EXP is
29076 the data type or decl itself, MODE is the widest mode available and
29077 ALIGN is the alignment that the object would ordinarily have. The
29078 value of this macro is used instead of that alignment to align the
29079 object. */
29081 unsigned int
29082 ix86_local_alignment (tree exp, machine_mode mode,
29083 unsigned int align)
29085 tree type, decl;
29087 if (exp && DECL_P (exp))
29089 type = TREE_TYPE (exp);
29090 decl = exp;
29092 else
29094 type = exp;
29095 decl = NULL;
29098 /* Don't do dynamic stack realignment for long long objects with
29099 -mpreferred-stack-boundary=2. */
29100 if (!TARGET_64BIT
29101 && align == 64
29102 && ix86_preferred_stack_boundary < 64
29103 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
29104 && (!type || !TYPE_USER_ALIGN (type))
29105 && (!decl || !DECL_USER_ALIGN (decl)))
29106 align = 32;
29108 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
29109 register in MODE. We will return the largest alignment of XF
29110 and DF. */
29111 if (!type)
29113 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
29114 align = GET_MODE_ALIGNMENT (DFmode);
29115 return align;
29118 /* Don't increase alignment for Intel MCU psABI. */
29119 if (TARGET_IAMCU)
29120 return align;
29122 /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
29123 to a 16-byte boundary. The exact wording is:
29125 An array uses the same alignment as its elements, except that a local or
29126 global array variable of length at least 16 bytes or
29127 a C99 variable-length array variable always has alignment of at least 16 bytes.
29129 This was added to allow the use of aligned SSE instructions on arrays. The
29130 rule is meant for static storage (where the compiler cannot do the analysis
29131 by itself). We follow it for automatic variables only when convenient.
29132 We fully control everything in the function being compiled, and functions from
29133 other units cannot rely on the alignment.
29135 Exclude the va_list type. It is the common case of a local array where
29136 we cannot benefit from the alignment.
29138 TODO: Probably one should optimize for size only when the variable does not escape. */
29139 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
29140 && TARGET_SSE)
29142 if (AGGREGATE_TYPE_P (type)
29143 && (va_list_type_node == NULL_TREE
29144 || (TYPE_MAIN_VARIANT (type)
29145 != TYPE_MAIN_VARIANT (va_list_type_node)))
29146 && TYPE_SIZE (type)
29147 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29148 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29149 && align < 128)
29150 return 128;
29152 if (TREE_CODE (type) == ARRAY_TYPE)
29154 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29155 return 64;
29156 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29157 return 128;
29159 else if (TREE_CODE (type) == COMPLEX_TYPE)
29161 if (TYPE_MODE (type) == DCmode && align < 64)
29162 return 64;
29163 if ((TYPE_MODE (type) == XCmode
29164 || TYPE_MODE (type) == TCmode) && align < 128)
29165 return 128;
29167 else if ((TREE_CODE (type) == RECORD_TYPE
29168 || TREE_CODE (type) == UNION_TYPE
29169 || TREE_CODE (type) == QUAL_UNION_TYPE)
29170 && TYPE_FIELDS (type))
29172 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29173 return 64;
29174 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29175 return 128;
29177 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29178 || TREE_CODE (type) == INTEGER_TYPE)
29181 if (TYPE_MODE (type) == DFmode && align < 64)
29182 return 64;
29183 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29184 return 128;
29186 return align;
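  /* Illustrative sketch (example declaration assumed for exposition): in a
     64-bit function compiled with SSE enabled and optimized for speed, a
     local

       int buf[8];

     is 32 bytes and is therefore given 16-byte stack alignment by the
     aggregate rule above, while a local va_list is deliberately left at
     its natural alignment.  */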
29189 /* Compute the minimum required alignment for dynamic stack realignment
29190 purposes for a local variable, parameter or a stack slot. EXP is
29191 the data type or decl itself, MODE is its mode and ALIGN is the
29192 alignment that the object would ordinarily have. */
29194 unsigned int
29195 ix86_minimum_alignment (tree exp, machine_mode mode,
29196 unsigned int align)
29198 tree type, decl;
29200 if (exp && DECL_P (exp))
29202 type = TREE_TYPE (exp);
29203 decl = exp;
29205 else
29207 type = exp;
29208 decl = NULL;
29211 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
29212 return align;
29214 /* Don't do dynamic stack realignment for long long objects with
29215 -mpreferred-stack-boundary=2. */
29216 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
29217 && (!type || !TYPE_USER_ALIGN (type))
29218 && (!decl || !DECL_USER_ALIGN (decl)))
29220 gcc_checking_assert (!TARGET_STV);
29221 return 32;
29224 return align;
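  /* Illustrative sketch (assumed options, for exposition): with -m32
     -mpreferred-stack-boundary=2, a local long long would ordinarily
     request 64-bit alignment; the function above lowers the requirement
     to 32 bits so that no dynamic stack realignment is forced for it.  */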
29227 /* Find a location for the static chain incoming to a nested function.
29228 This is a register, unless all free registers are used by arguments. */
29230 static rtx
29231 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
29233 unsigned regno;
29235 /* While this function won't be called by the middle-end when a static
29236 chain isn't needed, it's also used throughout the backend so it's
29237 easiest to keep this check centralized. */
29238 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
29239 return NULL;
29241 if (TARGET_64BIT)
29243 /* We always use R10 in 64-bit mode. */
29244 regno = R10_REG;
29246 else
29248 const_tree fntype, fndecl;
29249 unsigned int ccvt;
29251 /* By default in 32-bit mode we use ECX to pass the static chain. */
29252 regno = CX_REG;
29254 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
29256 fntype = TREE_TYPE (fndecl_or_type);
29257 fndecl = fndecl_or_type;
29259 else
29261 fntype = fndecl_or_type;
29262 fndecl = NULL;
29265 ccvt = ix86_get_callcvt (fntype);
29266 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
29268 /* Fastcall functions use ecx/edx for arguments, which leaves
29269 us with EAX for the static chain.
29270 Thiscall functions use ecx for arguments, which also
29271 leaves us with EAX for the static chain. */
29272 regno = AX_REG;
29274 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
29276 	  /* Thiscall functions use ecx for arguments, which leaves
29277 	     us with EAX and EDX for the static chain.
29278 	     For ABI compatibility we use EAX.  */
29279 regno = AX_REG;
29281 else if (ix86_function_regparm (fntype, fndecl) == 3)
29283 /* For regparm 3, we have no free call-clobbered registers in
29284 which to store the static chain. In order to implement this,
29285 we have the trampoline push the static chain to the stack.
29286 However, we can't push a value below the return address when
29287 we call the nested function directly, so we have to use an
29288 alternate entry point. For this we use ESI, and have the
29289 alternate entry point push ESI, so that things appear the
29290 same once we're executing the nested function. */
29291 if (incoming_p)
29293 if (fndecl == current_function_decl
29294 && !ix86_static_chain_on_stack)
29296 gcc_assert (!reload_completed);
29297 ix86_static_chain_on_stack = true;
29299 return gen_frame_mem (SImode,
29300 plus_constant (Pmode,
29301 arg_pointer_rtx, -8));
29303 regno = SI_REG;
29307 return gen_rtx_REG (Pmode, regno);
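  /* Summary sketch of the choices made above (for exposition only; the
     code above is authoritative):

       64-bit:                 R10
       32-bit, default:        ECX
       32-bit, fastcall:       EAX
       32-bit, thiscall:       EAX
       32-bit, regparm (3):    stack slot, with ESI used by the alternate
                               entry point of the nested function.  */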
29310 /* Emit RTL insns to initialize the variable parts of a trampoline.
29311 FNDECL is the decl of the target address; M_TRAMP is a MEM for
29312 the trampoline, and CHAIN_VALUE is an RTX for the static chain
29313 to be passed to the target function. */
29315 static void
29316 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
29318 rtx mem, fnaddr;
29319 int opcode;
29320 int offset = 0;
29322 fnaddr = XEXP (DECL_RTL (fndecl), 0);
29324 if (TARGET_64BIT)
29326 int size;
29328       /* Load the function address into r11.  Try to load the address
29329 	 using the shorter movl instead of movabs.  We may want to support
29330 	 movq for kernel mode, but the kernel does not use trampolines at
29331 	 the moment.  FNADDR is a 32-bit address and may not be in
29332 	 DImode when ptr_mode == SImode.  Always use movl in this
29333 	 case.  */
29334 if (ptr_mode == SImode
29335 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
29337 fnaddr = copy_addr_to_reg (fnaddr);
29339 mem = adjust_address (m_tramp, HImode, offset);
29340 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
29342 mem = adjust_address (m_tramp, SImode, offset + 2);
29343 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
29344 offset += 6;
29346 else
29348 mem = adjust_address (m_tramp, HImode, offset);
29349 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
29351 mem = adjust_address (m_tramp, DImode, offset + 2);
29352 emit_move_insn (mem, fnaddr);
29353 offset += 10;
29356       /* Load the static chain into r10 using movabs.  Use the shorter
29357 	 movl instead of movabs when ptr_mode == SImode.  */
29358 if (ptr_mode == SImode)
29360 opcode = 0xba41;
29361 size = 6;
29363 else
29365 opcode = 0xba49;
29366 size = 10;
29369 mem = adjust_address (m_tramp, HImode, offset);
29370 emit_move_insn (mem, gen_int_mode (opcode, HImode));
29372 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
29373 emit_move_insn (mem, chain_value);
29374 offset += size;
29376 /* Jump to r11; the last (unused) byte is a nop, only there to
29377 pad the write out to a single 32-bit store. */
29378 mem = adjust_address (m_tramp, SImode, offset);
29379 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
29380 offset += 4;
29382 else
29384 rtx disp, chain;
29386 /* Depending on the static chain location, either load a register
29387 with a constant, or push the constant to the stack. All of the
29388 instructions are the same size. */
29389 chain = ix86_static_chain (fndecl, true);
29390 if (REG_P (chain))
29392 switch (REGNO (chain))
29394 case AX_REG:
29395 opcode = 0xb8; break;
29396 case CX_REG:
29397 opcode = 0xb9; break;
29398 default:
29399 gcc_unreachable ();
29402 else
29403 opcode = 0x68;
29405 mem = adjust_address (m_tramp, QImode, offset);
29406 emit_move_insn (mem, gen_int_mode (opcode, QImode));
29408 mem = adjust_address (m_tramp, SImode, offset + 1);
29409 emit_move_insn (mem, chain_value);
29410 offset += 5;
29412 mem = adjust_address (m_tramp, QImode, offset);
29413 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
29415 mem = adjust_address (m_tramp, SImode, offset + 1);
29417       /* Compute the offset from the end of the jmp to the target function.
29418 	 When the trampoline stores the static chain on the stack, we
29419 	 need to skip the first insn, which pushes the (call-saved)
29420 	 register holding the static chain; this push is 1 byte.  */
29421 offset += 5;
29422 disp = expand_binop (SImode, sub_optab, fnaddr,
29423 plus_constant (Pmode, XEXP (m_tramp, 0),
29424 offset - (MEM_P (chain) ? 1 : 0)),
29425 NULL_RTX, 1, OPTAB_DIRECT);
29426 emit_move_insn (mem, disp);
29429 gcc_assert (offset <= TRAMPOLINE_SIZE);
29431 #ifdef HAVE_ENABLE_EXECUTE_STACK
29432 #ifdef CHECK_EXECUTE_STACK_ENABLED
29433 if (CHECK_EXECUTE_STACK_ENABLED)
29434 #endif
29435 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
29436 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
29437 #endif
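  /* Byte-level sketch of the common 64-bit trampoline emitted above when
     both the function address and the static chain need movabs (a sketch
     for exposition; the code above is authoritative):

       49 bb <imm64>   movabs $fnaddr, %r11
       49 ba <imm64>   movabs $chain,  %r10
       49 ff e3        jmp    *%r11
       90              nop (pads the final write to a full 32-bit store)

     for a total of 24 bytes.  */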
29440 static bool
29441 ix86_allocate_stack_slots_for_args (void)
29443 /* Naked functions should not allocate stack slots for arguments. */
29444 return !ix86_function_naked (current_function_decl);
29447 static bool
29448 ix86_warn_func_return (tree decl)
29450 /* Naked functions are implemented entirely in assembly, including the
29451 return sequence, so suppress warnings about this. */
29452 return !ix86_function_naked (decl);
29455 /* The following file contains several enumerations and data structures
29456 built from the definitions in i386-builtin-types.def. */
29458 #include "i386-builtin-types.inc"
29460 /* Table for the ix86 builtin non-function types. */
29461 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
29463 /* Retrieve an element from the above table, building some of
29464 the types lazily. */
29466 static tree
29467 ix86_get_builtin_type (enum ix86_builtin_type tcode)
29469 unsigned int index;
29470 tree type, itype;
29472 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
29474 type = ix86_builtin_type_tab[(int) tcode];
29475 if (type != NULL)
29476 return type;
29478 gcc_assert (tcode > IX86_BT_LAST_PRIM);
29479 if (tcode <= IX86_BT_LAST_VECT)
29481 machine_mode mode;
29483 index = tcode - IX86_BT_LAST_PRIM - 1;
29484 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
29485 mode = ix86_builtin_type_vect_mode[index];
29487 type = build_vector_type_for_mode (itype, mode);
29489 else
29491 int quals;
29493 index = tcode - IX86_BT_LAST_VECT - 1;
29494 if (tcode <= IX86_BT_LAST_PTR)
29495 quals = TYPE_UNQUALIFIED;
29496 else
29497 quals = TYPE_QUAL_CONST;
29499 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
29500 if (quals != TYPE_UNQUALIFIED)
29501 itype = build_qualified_type (itype, quals);
29503 type = build_pointer_type (itype);
29506 ix86_builtin_type_tab[(int) tcode] = type;
29507 return type;
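  /* Usage sketch (assumed type code, for exposition only): the first call
     to ix86_get_builtin_type (IX86_BT_V4SI) would build the V4SImode
     vector of its scalar base type via build_vector_type_for_mode and
     cache it in ix86_builtin_type_tab; subsequent calls return the
     cached tree.  */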
29510 /* Table for the ix86 builtin function types. */
29511 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
29513 /* Retrieve an element from the above table, building some of
29514 the types lazily. */
29516 static tree
29517 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
29519 tree type;
29521 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
29523 type = ix86_builtin_func_type_tab[(int) tcode];
29524 if (type != NULL)
29525 return type;
29527 if (tcode <= IX86_BT_LAST_FUNC)
29529 unsigned start = ix86_builtin_func_start[(int) tcode];
29530 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
29531 tree rtype, atype, args = void_list_node;
29532 unsigned i;
29534 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
29535 for (i = after - 1; i > start; --i)
29537 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
29538 args = tree_cons (NULL, atype, args);
29541 type = build_function_type (rtype, args);
29543 else
29545 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
29546 enum ix86_builtin_func_type icode;
29548 icode = ix86_builtin_func_alias_base[index];
29549 type = ix86_get_builtin_func_type (icode);
29552 ix86_builtin_func_type_tab[(int) tcode] = type;
29553 return type;
29557 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
29558 bdesc_* arrays below should come first, then builtins for each bdesc_*
29559 array in ascending order, so that we can use direct array accesses. */
29560 enum ix86_builtins
29562 IX86_BUILTIN_MASKMOVQ,
29563 IX86_BUILTIN_LDMXCSR,
29564 IX86_BUILTIN_STMXCSR,
29565 IX86_BUILTIN_MASKMOVDQU,
29566 IX86_BUILTIN_PSLLDQ128,
29567 IX86_BUILTIN_CLFLUSH,
29568 IX86_BUILTIN_MONITOR,
29569 IX86_BUILTIN_MWAIT,
29570 IX86_BUILTIN_CLZERO,
29571 IX86_BUILTIN_VEC_INIT_V2SI,
29572 IX86_BUILTIN_VEC_INIT_V4HI,
29573 IX86_BUILTIN_VEC_INIT_V8QI,
29574 IX86_BUILTIN_VEC_EXT_V2DF,
29575 IX86_BUILTIN_VEC_EXT_V2DI,
29576 IX86_BUILTIN_VEC_EXT_V4SF,
29577 IX86_BUILTIN_VEC_EXT_V4SI,
29578 IX86_BUILTIN_VEC_EXT_V8HI,
29579 IX86_BUILTIN_VEC_EXT_V2SI,
29580 IX86_BUILTIN_VEC_EXT_V4HI,
29581 IX86_BUILTIN_VEC_EXT_V16QI,
29582 IX86_BUILTIN_VEC_SET_V2DI,
29583 IX86_BUILTIN_VEC_SET_V4SF,
29584 IX86_BUILTIN_VEC_SET_V4SI,
29585 IX86_BUILTIN_VEC_SET_V8HI,
29586 IX86_BUILTIN_VEC_SET_V4HI,
29587 IX86_BUILTIN_VEC_SET_V16QI,
29588 IX86_BUILTIN_GATHERSIV2DF,
29589 IX86_BUILTIN_GATHERSIV4DF,
29590 IX86_BUILTIN_GATHERDIV2DF,
29591 IX86_BUILTIN_GATHERDIV4DF,
29592 IX86_BUILTIN_GATHERSIV4SF,
29593 IX86_BUILTIN_GATHERSIV8SF,
29594 IX86_BUILTIN_GATHERDIV4SF,
29595 IX86_BUILTIN_GATHERDIV8SF,
29596 IX86_BUILTIN_GATHERSIV2DI,
29597 IX86_BUILTIN_GATHERSIV4DI,
29598 IX86_BUILTIN_GATHERDIV2DI,
29599 IX86_BUILTIN_GATHERDIV4DI,
29600 IX86_BUILTIN_GATHERSIV4SI,
29601 IX86_BUILTIN_GATHERSIV8SI,
29602 IX86_BUILTIN_GATHERDIV4SI,
29603 IX86_BUILTIN_GATHERDIV8SI,
29604 IX86_BUILTIN_VFMSUBSD3_MASK3,
29605 IX86_BUILTIN_VFMSUBSS3_MASK3,
29606 IX86_BUILTIN_GATHER3SIV8SF,
29607 IX86_BUILTIN_GATHER3SIV4SF,
29608 IX86_BUILTIN_GATHER3SIV4DF,
29609 IX86_BUILTIN_GATHER3SIV2DF,
29610 IX86_BUILTIN_GATHER3DIV8SF,
29611 IX86_BUILTIN_GATHER3DIV4SF,
29612 IX86_BUILTIN_GATHER3DIV4DF,
29613 IX86_BUILTIN_GATHER3DIV2DF,
29614 IX86_BUILTIN_GATHER3SIV8SI,
29615 IX86_BUILTIN_GATHER3SIV4SI,
29616 IX86_BUILTIN_GATHER3SIV4DI,
29617 IX86_BUILTIN_GATHER3SIV2DI,
29618 IX86_BUILTIN_GATHER3DIV8SI,
29619 IX86_BUILTIN_GATHER3DIV4SI,
29620 IX86_BUILTIN_GATHER3DIV4DI,
29621 IX86_BUILTIN_GATHER3DIV2DI,
29622 IX86_BUILTIN_SCATTERSIV8SF,
29623 IX86_BUILTIN_SCATTERSIV4SF,
29624 IX86_BUILTIN_SCATTERSIV4DF,
29625 IX86_BUILTIN_SCATTERSIV2DF,
29626 IX86_BUILTIN_SCATTERDIV8SF,
29627 IX86_BUILTIN_SCATTERDIV4SF,
29628 IX86_BUILTIN_SCATTERDIV4DF,
29629 IX86_BUILTIN_SCATTERDIV2DF,
29630 IX86_BUILTIN_SCATTERSIV8SI,
29631 IX86_BUILTIN_SCATTERSIV4SI,
29632 IX86_BUILTIN_SCATTERSIV4DI,
29633 IX86_BUILTIN_SCATTERSIV2DI,
29634 IX86_BUILTIN_SCATTERDIV8SI,
29635 IX86_BUILTIN_SCATTERDIV4SI,
29636 IX86_BUILTIN_SCATTERDIV4DI,
29637 IX86_BUILTIN_SCATTERDIV2DI,
29638 /* Alternate 4 and 8 element gather/scatter for the vectorizer
29639 where all operands are 32-byte or 64-byte wide respectively. */
29640 IX86_BUILTIN_GATHERALTSIV4DF,
29641 IX86_BUILTIN_GATHERALTDIV8SF,
29642 IX86_BUILTIN_GATHERALTSIV4DI,
29643 IX86_BUILTIN_GATHERALTDIV8SI,
29644 IX86_BUILTIN_GATHER3ALTDIV16SF,
29645 IX86_BUILTIN_GATHER3ALTDIV16SI,
29646 IX86_BUILTIN_GATHER3ALTSIV4DF,
29647 IX86_BUILTIN_GATHER3ALTDIV8SF,
29648 IX86_BUILTIN_GATHER3ALTSIV4DI,
29649 IX86_BUILTIN_GATHER3ALTDIV8SI,
29650 IX86_BUILTIN_GATHER3ALTSIV8DF,
29651 IX86_BUILTIN_GATHER3ALTSIV8DI,
29652 IX86_BUILTIN_GATHER3DIV16SF,
29653 IX86_BUILTIN_GATHER3DIV16SI,
29654 IX86_BUILTIN_GATHER3DIV8DF,
29655 IX86_BUILTIN_GATHER3DIV8DI,
29656 IX86_BUILTIN_GATHER3SIV16SF,
29657 IX86_BUILTIN_GATHER3SIV16SI,
29658 IX86_BUILTIN_GATHER3SIV8DF,
29659 IX86_BUILTIN_GATHER3SIV8DI,
29660 IX86_BUILTIN_SCATTERALTSIV8DF,
29661 IX86_BUILTIN_SCATTERALTDIV16SF,
29662 IX86_BUILTIN_SCATTERALTSIV8DI,
29663 IX86_BUILTIN_SCATTERALTDIV16SI,
29664 IX86_BUILTIN_SCATTERDIV16SF,
29665 IX86_BUILTIN_SCATTERDIV16SI,
29666 IX86_BUILTIN_SCATTERDIV8DF,
29667 IX86_BUILTIN_SCATTERDIV8DI,
29668 IX86_BUILTIN_SCATTERSIV16SF,
29669 IX86_BUILTIN_SCATTERSIV16SI,
29670 IX86_BUILTIN_SCATTERSIV8DF,
29671 IX86_BUILTIN_SCATTERSIV8DI,
29672 IX86_BUILTIN_GATHERPFQPD,
29673 IX86_BUILTIN_GATHERPFDPS,
29674 IX86_BUILTIN_GATHERPFDPD,
29675 IX86_BUILTIN_GATHERPFQPS,
29676 IX86_BUILTIN_SCATTERPFDPD,
29677 IX86_BUILTIN_SCATTERPFDPS,
29678 IX86_BUILTIN_SCATTERPFQPD,
29679 IX86_BUILTIN_SCATTERPFQPS,
29680 IX86_BUILTIN_CLWB,
29681 IX86_BUILTIN_CLFLUSHOPT,
29682 IX86_BUILTIN_INFQ,
29683 IX86_BUILTIN_HUGE_VALQ,
29684 IX86_BUILTIN_NANQ,
29685 IX86_BUILTIN_NANSQ,
29686 IX86_BUILTIN_XABORT,
29687 IX86_BUILTIN_ADDCARRYX32,
29688 IX86_BUILTIN_ADDCARRYX64,
29689 IX86_BUILTIN_SBB32,
29690 IX86_BUILTIN_SBB64,
29691 IX86_BUILTIN_RDRAND16_STEP,
29692 IX86_BUILTIN_RDRAND32_STEP,
29693 IX86_BUILTIN_RDRAND64_STEP,
29694 IX86_BUILTIN_RDSEED16_STEP,
29695 IX86_BUILTIN_RDSEED32_STEP,
29696 IX86_BUILTIN_RDSEED64_STEP,
29697 IX86_BUILTIN_MONITORX,
29698 IX86_BUILTIN_MWAITX,
29699 IX86_BUILTIN_CFSTRING,
29700 IX86_BUILTIN_CPU_INIT,
29701 IX86_BUILTIN_CPU_IS,
29702 IX86_BUILTIN_CPU_SUPPORTS,
29703 IX86_BUILTIN_READ_FLAGS,
29704 IX86_BUILTIN_WRITE_FLAGS,
29706 /* All the remaining builtins are tracked in bdesc_* arrays in
29707 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
29708 this point. */
29709 #define BDESC(mask, icode, name, code, comparison, flag) \
29710 code,
29711 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
29712 code, \
29713 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
29714 #define BDESC_END(kind, next_kind)
29716 #include "i386-builtin.def"
29718 #undef BDESC
29719 #undef BDESC_FIRST
29720 #undef BDESC_END
29722 IX86_BUILTIN_MAX,
29724 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
29726 /* Now just the aliases for bdesc_* start/end. */
29727 #define BDESC(mask, icode, name, code, comparison, flag)
29728 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
29729 #define BDESC_END(kind, next_kind) \
29730 IX86_BUILTIN__BDESC_##kind##_LAST \
29731 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
29733 #include "i386-builtin.def"
29735 #undef BDESC
29736 #undef BDESC_FIRST
29737 #undef BDESC_END
29739 /* Just to make sure there is no comma after the last enumerator. */
29740 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
29743 /* Table for the ix86 builtin decls. */
29744 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
29746 /* Table of all of the builtin functions that are possible with different ISAs
29747    but are waiting to be built until a function is declared to use that
29748    ISA.  */
29749 struct builtin_isa {
29750 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
29751 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
29752 const char *name; /* function name */
29753 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
29754 unsigned char const_p:1; /* true if the declaration is constant */
29755 unsigned char pure_p:1; /* true if the declaration has pure attribute */
29756 bool leaf_p; /* true if the declaration has leaf attribute */
29757 bool nothrow_p; /* true if the declaration has nothrow attribute */
29758 bool set_and_not_built_p;
29761 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
29763 /* Bits that can still enable any inclusion of a builtin. */
29764 static HOST_WIDE_INT deferred_isa_values = 0;
29765 static HOST_WIDE_INT deferred_isa_values2 = 0;
29767 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the MASK
29768    of isa_flags to use in the ix86_builtins_isa array.  Store the
29769    function decl in the ix86_builtins array.  Return the function decl or
29770    NULL_TREE if the builtin was not added.
29772    If the front end has a special hook for builtin functions, delay adding
29773    builtin functions that aren't in the current ISA until the ISA is changed
29774    with function specific optimization.  Doing so can save about 300K for the
29775    default compiler.  When the builtin is expanded, check at that time whether
29776    it is valid.
29778    If the front end doesn't have a special hook, record all builtins, even
29779    those not in the current ISA, in case the user uses
29780    function specific options for a different ISA, so that we don't get scope
29781    errors if a builtin is added in the middle of a function scope.  */
29783 static inline tree
29784 def_builtin (HOST_WIDE_INT mask, const char *name,
29785 enum ix86_builtin_func_type tcode,
29786 enum ix86_builtins code)
29788 tree decl = NULL_TREE;
29790 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
29792 ix86_builtins_isa[(int) code].isa = mask;
29794       /* OPTION_MASK_ISA_AVX512VL has a special meaning.  Unlike the generic
29795 	 case, where any set bit means the builtin is enabled, this bit must be
29796 	 *and-ed* with another one.  E.g.: OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL
29797 	 means that *both* cpuid bits must be set for the builtin to be available.
29798 	 Handle this here.  */
29799 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
29800 mask &= ~OPTION_MASK_ISA_AVX512VL;
29802 mask &= ~OPTION_MASK_ISA_64BIT;
29803 if (mask == 0
29804 || (mask & ix86_isa_flags) != 0
29805 || (lang_hooks.builtin_function
29806 == lang_hooks.builtin_function_ext_scope))
29809 tree type = ix86_get_builtin_func_type (tcode);
29810 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29811 NULL, NULL_TREE);
29812 ix86_builtins[(int) code] = decl;
29813 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29815 else
29817 	  /* Only a MASK for which set_and_not_built_p == true can potentially
29818 	     enable inclusion of a builtin later.  */
29819 deferred_isa_values |= mask;
29820 ix86_builtins[(int) code] = NULL_TREE;
29821 ix86_builtins_isa[(int) code].tcode = tcode;
29822 ix86_builtins_isa[(int) code].name = name;
29823 ix86_builtins_isa[(int) code].leaf_p = false;
29824 ix86_builtins_isa[(int) code].nothrow_p = false;
29825 ix86_builtins_isa[(int) code].const_p = false;
29826 ix86_builtins_isa[(int) code].pure_p = false;
29827 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29831 return decl;
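  /* Usage sketch (hypothetical builtin name and code, for exposition
     only): for a C compilation without -mavx2, a call such as

       def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_foo",
		    V4SI_FTYPE_V4SI, IX86_BUILTIN_FOO);

     only records the builtin in ix86_builtins_isa and returns NULL_TREE;
     the declaration is created later by ix86_add_new_builtins once a
     function enables AVX2, e.g. via attribute ((target ("avx2"))).  */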
29834 /* Like def_builtin, but also marks the function decl "const". */
29836 static inline tree
29837 def_builtin_const (HOST_WIDE_INT mask, const char *name,
29838 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29840 tree decl = def_builtin (mask, name, tcode, code);
29841 if (decl)
29842 TREE_READONLY (decl) = 1;
29843 else
29844 ix86_builtins_isa[(int) code].const_p = true;
29846 return decl;
29849 /* Like def_builtin, but also marks the function decl "pure". */
29851 static inline tree
29852 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
29853 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29855 tree decl = def_builtin (mask, name, tcode, code);
29856 if (decl)
29857 DECL_PURE_P (decl) = 1;
29858 else
29859 ix86_builtins_isa[(int) code].pure_p = true;
29861 return decl;
29864 /* Like def_builtin, but for additional isa2 flags. */
29866 static inline tree
29867 def_builtin2 (HOST_WIDE_INT mask, const char *name,
29868 enum ix86_builtin_func_type tcode,
29869 enum ix86_builtins code)
29871 tree decl = NULL_TREE;
29873 ix86_builtins_isa[(int) code].isa2 = mask;
29875 if (mask == 0
29876 || (mask & ix86_isa_flags2) != 0
29877 || (lang_hooks.builtin_function
29878 == lang_hooks.builtin_function_ext_scope))
29881 tree type = ix86_get_builtin_func_type (tcode);
29882 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29883 NULL, NULL_TREE);
29884 ix86_builtins[(int) code] = decl;
29885 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29887 else
29889       /* Only a MASK for which set_and_not_built_p == true can potentially
29890 	 enable inclusion of a builtin later.  */
29891 deferred_isa_values2 |= mask;
29892 ix86_builtins[(int) code] = NULL_TREE;
29893 ix86_builtins_isa[(int) code].tcode = tcode;
29894 ix86_builtins_isa[(int) code].name = name;
29895 ix86_builtins_isa[(int) code].leaf_p = false;
29896 ix86_builtins_isa[(int) code].nothrow_p = false;
29897 ix86_builtins_isa[(int) code].const_p = false;
29898 ix86_builtins_isa[(int) code].pure_p = false;
29899 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29902 return decl;
29905 /* Like def_builtin, but also marks the function decl "const". */
29907 static inline tree
29908 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
29909 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29911 tree decl = def_builtin2 (mask, name, tcode, code);
29912 if (decl)
29913 TREE_READONLY (decl) = 1;
29914 else
29915 ix86_builtins_isa[(int) code].const_p = true;
29917 return decl;
29920 /* Like def_builtin, but also marks the function decl "pure". */
29922 static inline tree
29923 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
29924 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29926 tree decl = def_builtin2 (mask, name, tcode, code);
29927 if (decl)
29928 DECL_PURE_P (decl) = 1;
29929 else
29930 ix86_builtins_isa[(int) code].pure_p = true;
29932 return decl;
29935 /* Add any new builtin functions for a given ISA that may not have been
29936    declared yet.  This saves a bit of space compared to adding all of the
29937    declarations to the tree up front, whether or not they are used.  */
29939 static void
29940 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
29942 if ((isa & deferred_isa_values) == 0
29943 && (isa2 & deferred_isa_values2) == 0)
29944 return;
29946 /* Bits in ISA value can be removed from potential isa values. */
29947 deferred_isa_values &= ~isa;
29948 deferred_isa_values2 &= ~isa2;
29950 int i;
29951 tree saved_current_target_pragma = current_target_pragma;
29952 current_target_pragma = NULL_TREE;
29954 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
29956 if (((ix86_builtins_isa[i].isa & isa) != 0
29957 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
29958 && ix86_builtins_isa[i].set_and_not_built_p)
29960 tree decl, type;
29962 /* Don't define the builtin again. */
29963 ix86_builtins_isa[i].set_and_not_built_p = false;
29965 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
29966 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
29967 type, i, BUILT_IN_MD, NULL,
29968 NULL_TREE);
29970 ix86_builtins[i] = decl;
29971 if (ix86_builtins_isa[i].const_p)
29972 TREE_READONLY (decl) = 1;
29973 if (ix86_builtins_isa[i].pure_p)
29974 DECL_PURE_P (decl) = 1;
29975 if (ix86_builtins_isa[i].leaf_p)
29976 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
29977 NULL_TREE);
29978 if (ix86_builtins_isa[i].nothrow_p)
29979 TREE_NOTHROW (decl) = 1;
29983 current_target_pragma = saved_current_target_pragma;
29986 /* Bits for builtin_description.flag. */
29988 /* Set when we don't support the comparison natively, and should
29989 swap_comparison in order to support it. */
29990 #define BUILTIN_DESC_SWAP_OPERANDS 1
29992 struct builtin_description
29994 const HOST_WIDE_INT mask;
29995 const enum insn_code icode;
29996 const char *const name;
29997 const enum ix86_builtins code;
29998 const enum rtx_code comparison;
29999 const int flag;
30002 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30003 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30004 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30005 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30006 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30007 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30008 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30009 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30010 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30011 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30012 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30013 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30014 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30015 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30016 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30017 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30018 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30019 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30020 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30021 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30022 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30023 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30024 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30025 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30026 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30027 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30028 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30029 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30030 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30031 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30032 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30033 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30034 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30035 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30036 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30037 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30038 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
30039 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
30040 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
30041 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
30042 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
30043 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
30044 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
30045 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
30046 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
30047 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
30048 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
30049 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
30050 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
30051 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
30052 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
30053 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
30055 #define BDESC(mask, icode, name, code, comparison, flag) \
30056 { mask, icode, name, code, comparison, flag },
30057 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30058 static const struct builtin_description bdesc_##kind[] = \
30060 BDESC (mask, icode, name, code, comparison, flag)
30061 #define BDESC_END(kind, next_kind) \
30064 #include "i386-builtin.def"
30066 #undef BDESC
30067 #undef BDESC_FIRST
30068 #undef BDESC_END
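/* Expansion sketch (hypothetical entry, for exposition only): a line in
   i386-builtin.def such as

     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_foo",
	    IX86_BUILTIN_FOO, UNKNOWN, (int) V4SI_FTYPE_V4SI)

   expands in the enum ix86_builtins above to the enumerator
   IX86_BUILTIN_FOO, and expands here to the table entry

     { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_foo",
       IX86_BUILTIN_FOO, UNKNOWN, (int) V4SI_FTYPE_V4SI },

   in the bdesc_* array generated for its section of the .def file.  */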
30070 /* TM vector builtins. */
30072 /* Reuse the existing x86-specific `struct builtin_description' because
30073    we're lazy.  Add casts to make them fit.  */
30074 static const struct builtin_description bdesc_tm[] =
30076 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30077 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30078 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30079 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30080 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30081 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30082 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30084 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30085 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30086 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30087 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30088 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30089 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30090 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30092 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30093 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30094 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30095 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30096 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30097 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30098 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30100 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
30101 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
30102 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
30105 /* Initialize the transactional memory vector load/store builtins. */
30107 static void
30108 ix86_init_tm_builtins (void)
30110 enum ix86_builtin_func_type ftype;
30111 const struct builtin_description *d;
30112 size_t i;
30113 tree decl;
30114 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
30115 tree attrs_log, attrs_type_log;
30117 if (!flag_tm)
30118 return;
30120 /* If there are no builtins defined, we must be compiling in a
30121 language without trans-mem support. */
30122 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
30123 return;
30125 /* Use whatever attributes a normal TM load has. */
30126 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
30127 attrs_load = DECL_ATTRIBUTES (decl);
30128 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30129 /* Use whatever attributes a normal TM store has. */
30130 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
30131 attrs_store = DECL_ATTRIBUTES (decl);
30132 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30133 /* Use whatever attributes a normal TM log has. */
30134 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
30135 attrs_log = DECL_ATTRIBUTES (decl);
30136 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30138 for (i = 0, d = bdesc_tm;
30139 i < ARRAY_SIZE (bdesc_tm);
30140 i++, d++)
30142 if ((d->mask & ix86_isa_flags) != 0
30143 || (lang_hooks.builtin_function
30144 == lang_hooks.builtin_function_ext_scope))
30146 tree type, attrs, attrs_type;
30147 enum built_in_function code = (enum built_in_function) d->code;
30149 ftype = (enum ix86_builtin_func_type) d->flag;
30150 type = ix86_get_builtin_func_type (ftype);
30152 if (BUILTIN_TM_LOAD_P (code))
30154 attrs = attrs_load;
30155 attrs_type = attrs_type_load;
30157 else if (BUILTIN_TM_STORE_P (code))
30159 attrs = attrs_store;
30160 attrs_type = attrs_type_store;
30162 else
30164 attrs = attrs_log;
30165 attrs_type = attrs_type_log;
30167 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
30168 /* The builtin without the prefix for
30169 calling it directly. */
30170 d->name + strlen ("__builtin_"),
30171 attrs);
30172 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
30173 set the TYPE_ATTRIBUTES. */
30174 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
30176 set_builtin_decl (code, decl, false);
30181 /* Macros for verification of enum ix86_builtins order. */
30182 #define BDESC_VERIFY(x, y, z) \
30183 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
30184 #define BDESC_VERIFYS(x, y, z) \
30185 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
30187 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30188 IX86_BUILTIN__BDESC_COMI_LAST, 1);
30189 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30190 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
30191 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30192 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
30193 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
30194 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
30195 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30196 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
30197 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
30198 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
30199 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
30200 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
30201 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
30202 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST, 1);
30203 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30204 IX86_BUILTIN__BDESC_MPX_LAST, 1);
30205 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30206 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
30207 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
30208 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
30209 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30210 IX86_BUILTIN__BDESC_CET_LAST, 1);
30211 BDESC_VERIFYS (IX86_BUILTIN_MAX,
30212 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
30214 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
30215    in the current target ISA, so that the user can compile particular modules
30216    with target specific options that differ from the command line
30217    options.  */
30218 static void
30219 ix86_init_mmx_sse_builtins (void)
30221 const struct builtin_description * d;
30222 enum ix86_builtin_func_type ftype;
30223 size_t i;
30225 /* Add all special builtins with variable number of operands. */
30226 for (i = 0, d = bdesc_special_args;
30227 i < ARRAY_SIZE (bdesc_special_args);
30228 i++, d++)
30230 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
30231 if (d->name == 0)
30232 continue;
30234 ftype = (enum ix86_builtin_func_type) d->flag;
30235 def_builtin (d->mask, d->name, ftype, d->code);
30237 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
30238 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30239 ARRAY_SIZE (bdesc_special_args) - 1);
30241 /* Add all builtins with variable number of operands. */
30242 for (i = 0, d = bdesc_args;
30243 i < ARRAY_SIZE (bdesc_args);
30244 i++, d++)
30246 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
30247 if (d->name == 0)
30248 continue;
30250 ftype = (enum ix86_builtin_func_type) d->flag;
30251 def_builtin_const (d->mask, d->name, ftype, d->code);
30253 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
30254 IX86_BUILTIN__BDESC_ARGS_FIRST,
30255 ARRAY_SIZE (bdesc_args) - 1);
30257 /* Add all builtins with variable number of operands. */
30258 for (i = 0, d = bdesc_args2;
30259 i < ARRAY_SIZE (bdesc_args2);
30260 i++, d++)
30262 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
30263 if (d->name == 0)
30264 continue;
30266 ftype = (enum ix86_builtin_func_type) d->flag;
30267 def_builtin_const2 (d->mask, d->name, ftype, d->code);
30269 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
30270 IX86_BUILTIN__BDESC_ARGS2_FIRST,
30271 ARRAY_SIZE (bdesc_args2) - 1);
30273 for (i = 0, d = bdesc_special_args2;
30274 i < ARRAY_SIZE (bdesc_special_args2);
30275 i++, d++)
30277 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST, i);
30278 if (d->name == 0)
30279 continue;
30281 ftype = (enum ix86_builtin_func_type) d->flag;
30282 def_builtin2 (d->mask, d->name, ftype, d->code);
30284 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST,
30285 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
30286 ARRAY_SIZE (bdesc_special_args2) - 1);
30288 /* Add all builtins with rounding. */
30289 for (i = 0, d = bdesc_round_args;
30290 i < ARRAY_SIZE (bdesc_round_args);
30291 i++, d++)
30293 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
30294 if (d->name == 0)
30295 continue;
30297 ftype = (enum ix86_builtin_func_type) d->flag;
30298 def_builtin_const (d->mask, d->name, ftype, d->code);
30300 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
30301 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30302 ARRAY_SIZE (bdesc_round_args) - 1);
30304 /* pcmpestr[im] insns. */
30305 for (i = 0, d = bdesc_pcmpestr;
30306 i < ARRAY_SIZE (bdesc_pcmpestr);
30307 i++, d++)
30309 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
30310 if (d->code == IX86_BUILTIN_PCMPESTRM128)
30311 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
30312 else
30313 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
30314 def_builtin_const (d->mask, d->name, ftype, d->code);
30316 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
30317 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30318 ARRAY_SIZE (bdesc_pcmpestr) - 1);
30320 /* pcmpistr[im] insns. */
30321 for (i = 0, d = bdesc_pcmpistr;
30322 i < ARRAY_SIZE (bdesc_pcmpistr);
30323 i++, d++)
30325 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
30326 if (d->code == IX86_BUILTIN_PCMPISTRM128)
30327 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
30328 else
30329 ftype = INT_FTYPE_V16QI_V16QI_INT;
30330 def_builtin_const (d->mask, d->name, ftype, d->code);
30332 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
30333 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30334 ARRAY_SIZE (bdesc_pcmpistr) - 1);
30336 /* comi/ucomi insns. */
30337 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30339 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
30340 if (d->mask == OPTION_MASK_ISA_SSE2)
30341 ftype = INT_FTYPE_V2DF_V2DF;
30342 else
30343 ftype = INT_FTYPE_V4SF_V4SF;
30344 def_builtin_const (d->mask, d->name, ftype, d->code);
30346 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
30347 IX86_BUILTIN__BDESC_COMI_FIRST,
30348 ARRAY_SIZE (bdesc_comi) - 1);
30350 /* SSE */
30351 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
30352 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
30353 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
30354 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
30356 /* SSE or 3DNow!A */
30357 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30358 /* As it uses V4HImode, we have to require -mmmx too. */
30359 | OPTION_MASK_ISA_MMX,
30360 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
30361 IX86_BUILTIN_MASKMOVQ);
30363 /* SSE2 */
30364 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
30365 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
30367 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
30368 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
30369 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
30370 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
30372 /* SSE3. */
30373 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
30374 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
30375 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
30376 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
30378 /* AES */
30379 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
30380 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
30381 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
30382 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
30383 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
30384 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
30385 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
30386 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
30387 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
30388 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
30389 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
30390 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
30392 /* PCLMUL */
30393 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
30394 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
30396 /* RDRND */
30397 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
30398 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
30399 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
30400 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
30401 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
30402 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
30403 IX86_BUILTIN_RDRAND64_STEP);
30405 /* AVX2 */
30406 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
30407 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
30408 IX86_BUILTIN_GATHERSIV2DF);
30410 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
30411 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
30412 IX86_BUILTIN_GATHERSIV4DF);
30414 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
30415 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
30416 IX86_BUILTIN_GATHERDIV2DF);
30418 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
30419 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
30420 IX86_BUILTIN_GATHERDIV4DF);
30422 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
30423 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
30424 IX86_BUILTIN_GATHERSIV4SF);
30426 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
30427 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
30428 IX86_BUILTIN_GATHERSIV8SF);
30430 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
30431 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
30432 IX86_BUILTIN_GATHERDIV4SF);
30434 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
30435 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
30436 IX86_BUILTIN_GATHERDIV8SF);
30438 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
30439 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
30440 IX86_BUILTIN_GATHERSIV2DI);
30442 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
30443 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
30444 IX86_BUILTIN_GATHERSIV4DI);
30446 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
30447 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
30448 IX86_BUILTIN_GATHERDIV2DI);
30450 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
30451 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
30452 IX86_BUILTIN_GATHERDIV4DI);
30454 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
30455 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
30456 IX86_BUILTIN_GATHERSIV4SI);
30458 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
30459 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
30460 IX86_BUILTIN_GATHERSIV8SI);
30462 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
30463 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
30464 IX86_BUILTIN_GATHERDIV4SI);
30466 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
30467 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
30468 IX86_BUILTIN_GATHERDIV8SI);
30470 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
30471 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
30472 IX86_BUILTIN_GATHERALTSIV4DF);
30474 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
30475 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
30476 IX86_BUILTIN_GATHERALTDIV8SF);
30478 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
30479 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
30480 IX86_BUILTIN_GATHERALTSIV4DI);
30482 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
30483 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
30484 IX86_BUILTIN_GATHERALTDIV8SI);
30486 /* AVX512F */
30487 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
30488 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
30489 IX86_BUILTIN_GATHER3SIV16SF);
30491 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
30492 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
30493 IX86_BUILTIN_GATHER3SIV8DF);
30495 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
30496 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
30497 IX86_BUILTIN_GATHER3DIV16SF);
30499 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
30500 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
30501 IX86_BUILTIN_GATHER3DIV8DF);
30503 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
30504 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
30505 IX86_BUILTIN_GATHER3SIV16SI);
30507 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
30508 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
30509 IX86_BUILTIN_GATHER3SIV8DI);
30511 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
30512 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
30513 IX86_BUILTIN_GATHER3DIV16SI);
30515 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
30516 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
30517 IX86_BUILTIN_GATHER3DIV8DI);
30519 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
30520 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
30521 IX86_BUILTIN_GATHER3ALTSIV8DF);
30523 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
30524 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
30525 IX86_BUILTIN_GATHER3ALTDIV16SF);
30527 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
30528 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
30529 IX86_BUILTIN_GATHER3ALTSIV8DI);
30531 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
30532 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
30533 IX86_BUILTIN_GATHER3ALTDIV16SI);
30535 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
30536 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
30537 IX86_BUILTIN_SCATTERSIV16SF);
30539 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
30540 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
30541 IX86_BUILTIN_SCATTERSIV8DF);
30543 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
30544 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
30545 IX86_BUILTIN_SCATTERDIV16SF);
30547 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
30548 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
30549 IX86_BUILTIN_SCATTERDIV8DF);
30551 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
30552 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
30553 IX86_BUILTIN_SCATTERSIV16SI);
30555 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
30556 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
30557 IX86_BUILTIN_SCATTERSIV8DI);
30559 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
30560 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
30561 IX86_BUILTIN_SCATTERDIV16SI);
30563 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
30564 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
30565 IX86_BUILTIN_SCATTERDIV8DI);
30567 /* AVX512VL */
30568 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
30569 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
30570 IX86_BUILTIN_GATHER3SIV2DF);
30572 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
30573 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
30574 IX86_BUILTIN_GATHER3SIV4DF);
30576 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
30577 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
30578 IX86_BUILTIN_GATHER3DIV2DF);
30580 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
30581 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
30582 IX86_BUILTIN_GATHER3DIV4DF);
30584 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
30585 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
30586 IX86_BUILTIN_GATHER3SIV4SF);
30588 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
30589 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
30590 IX86_BUILTIN_GATHER3SIV8SF);
30592 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
30593 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
30594 IX86_BUILTIN_GATHER3DIV4SF);
30596 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
30597 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
30598 IX86_BUILTIN_GATHER3DIV8SF);
30600 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
30601 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
30602 IX86_BUILTIN_GATHER3SIV2DI);
30604 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
30605 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
30606 IX86_BUILTIN_GATHER3SIV4DI);
30608 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
30609 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
30610 IX86_BUILTIN_GATHER3DIV2DI);
30612 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
30613 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
30614 IX86_BUILTIN_GATHER3DIV4DI);
30616 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
30617 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
30618 IX86_BUILTIN_GATHER3SIV4SI);
30620 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
30621 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
30622 IX86_BUILTIN_GATHER3SIV8SI);
30624 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
30625 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
30626 IX86_BUILTIN_GATHER3DIV4SI);
30628 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
30629 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
30630 IX86_BUILTIN_GATHER3DIV8SI);
30632 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
30633 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
30634 IX86_BUILTIN_GATHER3ALTSIV4DF);
30636 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
30637 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
30638 IX86_BUILTIN_GATHER3ALTDIV8SF);
30640 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
30641 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
30642 IX86_BUILTIN_GATHER3ALTSIV4DI);
30644 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
30645 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
30646 IX86_BUILTIN_GATHER3ALTDIV8SI);
30648 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
30649 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
30650 IX86_BUILTIN_SCATTERSIV8SF);
30652 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
30653 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
30654 IX86_BUILTIN_SCATTERSIV4SF);
30656 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
30657 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
30658 IX86_BUILTIN_SCATTERSIV4DF);
30660 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
30661 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
30662 IX86_BUILTIN_SCATTERSIV2DF);
30664 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
30665 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
30666 IX86_BUILTIN_SCATTERDIV8SF);
30668 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
30669 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
30670 IX86_BUILTIN_SCATTERDIV4SF);
30672 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
30673 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
30674 IX86_BUILTIN_SCATTERDIV4DF);
30676 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
30677 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
30678 IX86_BUILTIN_SCATTERDIV2DF);
30680 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
30681 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
30682 IX86_BUILTIN_SCATTERSIV8SI);
30684 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
30685 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
30686 IX86_BUILTIN_SCATTERSIV4SI);
30688 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
30689 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
30690 IX86_BUILTIN_SCATTERSIV4DI);
30692 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
30693 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
30694 IX86_BUILTIN_SCATTERSIV2DI);
30696 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
30697 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
30698 IX86_BUILTIN_SCATTERDIV8SI);
30700 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
30701 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
30702 IX86_BUILTIN_SCATTERDIV4SI);
30704 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
30705 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
30706 IX86_BUILTIN_SCATTERDIV4DI);
30708 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
30709 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
30710 IX86_BUILTIN_SCATTERDIV2DI);
30711 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
30712 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
30713 IX86_BUILTIN_SCATTERALTSIV8DF);
30715 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
30716 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
30717 IX86_BUILTIN_SCATTERALTDIV16SF);
30719 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
30720 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
30721 IX86_BUILTIN_SCATTERALTSIV8DI);
30723 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
30724 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
30725 IX86_BUILTIN_SCATTERALTDIV16SI);
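/* For illustration only: a masked gather such as
   __builtin_ia32_gather3siv4df (SRC, BASE, IDX, MASK, SCALE) behaves
   roughly like the scalar loop

	for (i = 0; i < 4; i++)
	  dst[i] = (MASK >> i) & 1
		   ? *(const double *) ((const char *) BASE
					+ (intptr_t) IDX[i] * SCALE)
		   : SRC[i];

   and the scatter builtins store to the computed addresses instead of
   loading from them.  The operand order follows the function types used
   above.  */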
30727 /* AVX512PF */
30728 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
30729 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30730 IX86_BUILTIN_GATHERPFDPD);
30731 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
30732 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30733 IX86_BUILTIN_GATHERPFDPS);
30734 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
30735 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30736 IX86_BUILTIN_GATHERPFQPD);
30737 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
30738 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30739 IX86_BUILTIN_GATHERPFQPS);
30740 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
30741 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30742 IX86_BUILTIN_SCATTERPFDPD);
30743 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
30744 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30745 IX86_BUILTIN_SCATTERPFDPS);
30746 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
30747 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30748 IX86_BUILTIN_SCATTERPFQPD);
30749 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
30750 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30751 IX86_BUILTIN_SCATTERPFQPS);
30753 /* SHA */
30754 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
30755 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
30756 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
30757 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
30758 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
30759 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
30760 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
30761 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
30762 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
30763 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
30764 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
30765 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
30766 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
30767 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
30769 /* RTM. */
30770 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
30771 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
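/* __builtin_ia32_xabort underlies the _xabort intrinsic; together with
   _xbegin/_xend from <immintrin.h> (compiled with -mrtm) a transactional
   region looks roughly like this sketch:

	#include <immintrin.h>

	int try_transactional_increment (int *p)
	{
	  unsigned status = _xbegin ();
	  if (status == _XBEGIN_STARTED)
	    {
	      *p += 1;
	      _xend ();
	      return 1;
	    }
	  return 0;
	}

   Inside the region, _xabort (IMM8) forces an abort and makes IMM8 visible
   in the status returned by _xbegin on the fallback path.  */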
30773 /* MMX access to the vec_init patterns. */
30774 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
30775 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
30777 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
30778 V4HI_FTYPE_HI_HI_HI_HI,
30779 IX86_BUILTIN_VEC_INIT_V4HI);
30781 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
30782 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
30783 IX86_BUILTIN_VEC_INIT_V8QI);
30785 /* Access to the vec_extract patterns. */
30786 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
30787 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
30788 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
30789 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
30790 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
30791 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
30792 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
30793 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
30794 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
30795 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
30797 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30798 /* As it uses V4HImode, we have to require -mmmx too. */
30799 | OPTION_MASK_ISA_MMX,
30800 "__builtin_ia32_vec_ext_v4hi",
30801 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
30803 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
30804 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
30806 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
30807 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
30809 /* Access to the vec_set patterns. */
30810 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
30811 "__builtin_ia32_vec_set_v2di",
30812 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
30814 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
30815 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
30817 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
30818 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
30820 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
30821 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
30823 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30824 /* As it uses V4HImode, we have to require -mmmx too. */
30825 | OPTION_MASK_ISA_MMX,
30826 "__builtin_ia32_vec_set_v4hi",
30827 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
30829 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
30830 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
30832 /* RDSEED */
30833 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
30834 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
30835 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
30836 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
30837 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
30838 "__builtin_ia32_rdseed_di_step",
30839 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
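/* These step builtins underlie the _rdseed16_step/_rdseed32_step/
   _rdseed64_step intrinsics from <immintrin.h> (with -mrdseed).  A sketch
   of typical use; the intrinsics return 0 when no entropy is available
   yet, so callers retry:

	#include <immintrin.h>

	unsigned int get_seed (void)
	{
	  unsigned int seed;
	  while (!_rdseed32_step (&seed))
	    ;
	  return seed;
	}
 */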
30841 /* ADCX */
30842 def_builtin (0, "__builtin_ia32_addcarryx_u32",
30843 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
30844 def_builtin (OPTION_MASK_ISA_64BIT,
30845 "__builtin_ia32_addcarryx_u64",
30846 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30847 IX86_BUILTIN_ADDCARRYX64);
30849 /* SBB */
30850 def_builtin (0, "__builtin_ia32_sbb_u32",
30851 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
30852 def_builtin (OPTION_MASK_ISA_64BIT,
30853 "__builtin_ia32_sbb_u64",
30854 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30855 IX86_BUILTIN_SBB64);
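/* The add-with-carry and subtract-with-borrow builtins back the
   _addcarryx_u32/_addcarryx_u64 and _subborrow_u32/_subborrow_u64
   intrinsics and are meant for multi-word arithmetic.  A sketch of a
   128-bit addition split into two 64-bit limbs (x86-64, assuming
   <x86intrin.h> and -madx for the ADX-specific form):

	#include <x86intrin.h>

	void add128 (const unsigned long long a[2],
		     const unsigned long long b[2],
		     unsigned long long r[2])
	{
	  unsigned char c = _addcarryx_u64 (0, a[0], b[0], &r[0]);
	  _addcarryx_u64 (c, a[1], b[1], &r[1]);
	}

   The SBB builtins chain borrows the same way for subtraction.  */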
30857 /* Read/write FLAGS. */
30858 def_builtin (0, "__builtin_ia32_readeflags_u32",
30859 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30860 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
30861 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30862 def_builtin (0, "__builtin_ia32_writeeflags_u32",
30863 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
30864 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
30865 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
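/* These builtins back the __readeflags/__writeeflags intrinsics from
   <x86intrin.h>.  A minimal sketch (64-bit variants shown); reading
   captures the current EFLAGS/RFLAGS value and writing restores it:

	#include <x86intrin.h>

	void save_and_restore_flags (void)
	{
	  unsigned long long flags = __readeflags ();
	  __writeeflags (flags);
	}
 */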
30867 /* CLFLUSHOPT. */
30868 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
30869 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
30871 /* CLWB. */
30872 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
30873 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
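/* CLFLUSHOPT and CLWB back the _mm_clflushopt and _mm_clwb intrinsics.
   A sketch of forcing a freshly written line out of the cache (as done for
   persistent memory), assuming -mclwb:

	#include <immintrin.h>

	void persist_store (int *p, int v)
	{
	  *p = v;
	  _mm_clwb (p);
	  _mm_sfence ();
	}

   _mm_clwb writes the line back but may keep it cached, _mm_clflushopt
   additionally invalidates it, and the fence orders the write-back against
   later stores.  */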
30875 /* MONITORX and MWAITX. */
30876 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
30877 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
30878 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
30879 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
30881 /* CLZERO. */
30882 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
30883 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
30885 /* Add FMA4 multi-arg argument instructions */
30886 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30888 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
30889 if (d->name == 0)
30890 continue;
30892 ftype = (enum ix86_builtin_func_type) d->flag;
30893 def_builtin_const (d->mask, d->name, ftype, d->code);
30895 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
30896 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30897 ARRAY_SIZE (bdesc_multi_arg) - 1);
30899 /* Add CET intrinsics. */
30900 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
30902 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
30903 if (d->name == 0)
30904 continue;
30906 ftype = (enum ix86_builtin_func_type) d->flag;
30907 def_builtin2 (d->mask, d->name, ftype, d->code);
30909 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
30910 IX86_BUILTIN__BDESC_CET_FIRST,
30911 ARRAY_SIZE (bdesc_cet) - 1);
30913 for (i = 0, d = bdesc_cet_rdssp;
30914 i < ARRAY_SIZE (bdesc_cet_rdssp);
30915 i++, d++)
30917 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
30918 if (d->name == 0)
30919 continue;
30921 ftype = (enum ix86_builtin_func_type) d->flag;
30922 def_builtin2 (d->mask, d->name, ftype, d->code);
30924 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
30925 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30926 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
30929 static void
30930 ix86_init_mpx_builtins ()
30932 const struct builtin_description * d;
30933 enum ix86_builtin_func_type ftype;
30934 tree decl;
30935 size_t i;
30937 for (i = 0, d = bdesc_mpx;
30938 i < ARRAY_SIZE (bdesc_mpx);
30939 i++, d++)
30941 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
30942 if (d->name == 0)
30943 continue;
30945 ftype = (enum ix86_builtin_func_type) d->flag;
30946 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
30948 /* Without leaf and nothrow flags on MPX builtins,
30949 abnormal edges may follow their calls when setjmp
30950 is present in the function. Since we may have a lot
30951 of MPX builtin calls, this causes lots of useless
30952 edges and enormous PHI nodes. To avoid this we mark
30953 MPX builtins as leaf and nothrow. */
30954 if (decl)
30956 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30957 NULL_TREE);
30958 TREE_NOTHROW (decl) = 1;
30960 else
30962 ix86_builtins_isa[(int)d->code].leaf_p = true;
30963 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30966 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
30967 IX86_BUILTIN__BDESC_MPX_FIRST,
30968 ARRAY_SIZE (bdesc_mpx) - 1);
30970 for (i = 0, d = bdesc_mpx_const;
30971 i < ARRAY_SIZE (bdesc_mpx_const);
30972 i++, d++)
30974 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
30975 if (d->name == 0)
30976 continue;
30978 ftype = (enum ix86_builtin_func_type) d->flag;
30979 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
30981 if (decl)
30983 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30984 NULL_TREE);
30985 TREE_NOTHROW (decl) = 1;
30987 else
30989 ix86_builtins_isa[(int)d->code].leaf_p = true;
30990 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30993 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
30994 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30995 ARRAY_SIZE (bdesc_mpx_const) - 1);
30997 #undef BDESC_VERIFY
30998 #undef BDESC_VERIFYS
31000 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31001 to return a pointer to VERSION_DECL if the outcome of the expression
31002 formed by PREDICATE_CHAIN is true. This function will be called during
31003 version dispatch to decide which function version to execute. It returns
31004 the basic block at the end, to which more conditions can be added. */
31006 static basic_block
31007 add_condition_to_bb (tree function_decl, tree version_decl,
31008 tree predicate_chain, basic_block new_bb)
31010 gimple *return_stmt;
31011 tree convert_expr, result_var;
31012 gimple *convert_stmt;
31013 gimple *call_cond_stmt;
31014 gimple *if_else_stmt;
31016 basic_block bb1, bb2, bb3;
31017 edge e12, e23;
31019 tree cond_var, and_expr_var = NULL_TREE;
31020 gimple_seq gseq;
31022 tree predicate_decl, predicate_arg;
31024 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31026 gcc_assert (new_bb != NULL);
31027 gseq = bb_seq (new_bb);
31030 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31031 build_fold_addr_expr (version_decl));
31032 result_var = create_tmp_var (ptr_type_node);
31033 convert_stmt = gimple_build_assign (result_var, convert_expr);
31034 return_stmt = gimple_build_return (result_var);
31036 if (predicate_chain == NULL_TREE)
31038 gimple_seq_add_stmt (&gseq, convert_stmt);
31039 gimple_seq_add_stmt (&gseq, return_stmt);
31040 set_bb_seq (new_bb, gseq);
31041 gimple_set_bb (convert_stmt, new_bb);
31042 gimple_set_bb (return_stmt, new_bb);
31043 pop_cfun ();
31044 return new_bb;
31047 while (predicate_chain != NULL)
31049 cond_var = create_tmp_var (integer_type_node);
31050 predicate_decl = TREE_PURPOSE (predicate_chain);
31051 predicate_arg = TREE_VALUE (predicate_chain);
31052 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31053 gimple_call_set_lhs (call_cond_stmt, cond_var);
31055 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31056 gimple_set_bb (call_cond_stmt, new_bb);
31057 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31059 predicate_chain = TREE_CHAIN (predicate_chain);
31061 if (and_expr_var == NULL)
31062 and_expr_var = cond_var;
31063 else
31065 gimple *assign_stmt;
31066 /* Use MIN_EXPR to check whether any integer is zero:
31067 and_expr_var = min_expr <cond_var, and_expr_var>.  */
31068 assign_stmt = gimple_build_assign (and_expr_var,
31069 build2 (MIN_EXPR, integer_type_node,
31070 cond_var, and_expr_var));
31072 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31073 gimple_set_bb (assign_stmt, new_bb);
31074 gimple_seq_add_stmt (&gseq, assign_stmt);
31078 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31079 integer_zero_node,
31080 NULL_TREE, NULL_TREE);
31081 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31082 gimple_set_bb (if_else_stmt, new_bb);
31083 gimple_seq_add_stmt (&gseq, if_else_stmt);
31085 gimple_seq_add_stmt (&gseq, convert_stmt);
31086 gimple_seq_add_stmt (&gseq, return_stmt);
31087 set_bb_seq (new_bb, gseq);
31089 bb1 = new_bb;
31090 e12 = split_block (bb1, if_else_stmt);
31091 bb2 = e12->dest;
31092 e12->flags &= ~EDGE_FALLTHRU;
31093 e12->flags |= EDGE_TRUE_VALUE;
31095 e23 = split_block (bb2, return_stmt);
31097 gimple_set_bb (convert_stmt, bb2);
31098 gimple_set_bb (return_stmt, bb2);
31100 bb3 = e23->dest;
31101 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31103 remove_edge (e23);
31104 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
31106 pop_cfun ();
31108 return bb3;
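/* The condition block built above corresponds roughly to this fragment of
   the generated resolver, one fragment per function version:

	cond_1 = __builtin_cpu_is ("ARCH");
	cond_2 = __builtin_cpu_supports ("ISA");
	and_tmp = MIN (cond_1, cond_2);
	if (and_tmp > 0)
	  return (void *) &VERSION_DECL;

   On the false edge control continues in the returned basic block, where
   the next version's condition is appended.  */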
31111 /* This parses the attribute arguments to target in DECL and determines
31112 the right builtin to use to match the platform specification.
31113 It returns the priority value for this version decl. If PREDICATE_LIST
31114 is not NULL, it stores the list of cpu features that need to be checked
31115 before dispatching this function. */
31117 static unsigned int
31118 get_builtin_code_for_version (tree decl, tree *predicate_list)
31120 tree attrs;
31121 struct cl_target_option cur_target;
31122 tree target_node;
31123 struct cl_target_option *new_target;
31124 const char *arg_str = NULL;
31125 const char *attrs_str = NULL;
31126 char *tok_str = NULL;
31127 char *token;
31129 /* Priority of i386 features, greater value is higher priority. This is
31130 used to decide the order in which function dispatch must happen. For
31131 instance, a version specialized for SSE4.2 should be checked for dispatch
31132 before a version for SSE3, as SSE4.2 implies SSE3. */
31133 enum feature_priority
31135 P_ZERO = 0,
31136 P_MMX,
31137 P_SSE,
31138 P_SSE2,
31139 P_SSE3,
31140 P_SSSE3,
31141 P_PROC_SSSE3,
31142 P_SSE4_A,
31143 P_PROC_SSE4_A,
31144 P_SSE4_1,
31145 P_SSE4_2,
31146 P_PROC_SSE4_2,
31147 P_POPCNT,
31148 P_AES,
31149 P_PCLMUL,
31150 P_AVX,
31151 P_PROC_AVX,
31152 P_BMI,
31153 P_PROC_BMI,
31154 P_FMA4,
31155 P_XOP,
31156 P_PROC_XOP,
31157 P_FMA,
31158 P_PROC_FMA,
31159 P_BMI2,
31160 P_AVX2,
31161 P_PROC_AVX2,
31162 P_AVX512F,
31163 P_PROC_AVX512F
31166 enum feature_priority priority = P_ZERO;
31168 /* These are the target attribute strings for which a dispatcher is
31169 available, from fold_builtin_cpu. */
31171 static struct _feature_list
31173 const char *const name;
31174 const enum feature_priority priority;
31176 const feature_list[] =
31178 {"mmx", P_MMX},
31179 {"sse", P_SSE},
31180 {"sse2", P_SSE2},
31181 {"sse3", P_SSE3},
31182 {"sse4a", P_SSE4_A},
31183 {"ssse3", P_SSSE3},
31184 {"sse4.1", P_SSE4_1},
31185 {"sse4.2", P_SSE4_2},
31186 {"popcnt", P_POPCNT},
31187 {"aes", P_AES},
31188 {"pclmul", P_PCLMUL},
31189 {"avx", P_AVX},
31190 {"bmi", P_BMI},
31191 {"fma4", P_FMA4},
31192 {"xop", P_XOP},
31193 {"fma", P_FMA},
31194 {"bmi2", P_BMI2},
31195 {"avx2", P_AVX2},
31196 {"avx512f", P_AVX512F}
31200 static unsigned int NUM_FEATURES
31201 = sizeof (feature_list) / sizeof (struct _feature_list);
31203 unsigned int i;
31205 tree predicate_chain = NULL_TREE;
31206 tree predicate_decl, predicate_arg;
31208 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31209 gcc_assert (attrs != NULL);
31211 attrs = TREE_VALUE (TREE_VALUE (attrs));
31213 gcc_assert (TREE_CODE (attrs) == STRING_CST);
31214 attrs_str = TREE_STRING_POINTER (attrs);
31216 /* Return priority zero for default function. */
31217 if (strcmp (attrs_str, "default") == 0)
31218 return 0;
31220 /* Handle arch= if specified. For priority, set it to be 1 more than
31221 the best instruction set the processor can handle. For instance, if
31222 there is a version for atom and a version for ssse3 (the highest ISA
31223 priority for atom), the atom version must be checked for dispatch
31224 before the ssse3 version. */
31225 if (strstr (attrs_str, "arch=") != NULL)
31227 cl_target_option_save (&cur_target, &global_options);
31228 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
31229 &global_options_set);
31231 gcc_assert (target_node);
31232 new_target = TREE_TARGET_OPTION (target_node);
31233 gcc_assert (new_target);
31235 if (new_target->arch_specified && new_target->arch > 0)
31237 switch (new_target->arch)
31239 case PROCESSOR_CORE2:
31240 arg_str = "core2";
31241 priority = P_PROC_SSSE3;
31242 break;
31243 case PROCESSOR_NEHALEM:
31244 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
31246 arg_str = "westmere";
31247 priority = P_AES;
31249 else
31251 /* We translate "arch=corei7" and "arch=nehalem" to
31252 "corei7" so that it will be mapped to M_INTEL_COREI7
31253 as cpu type to cover all M_INTEL_COREI7_XXXs. */
31254 arg_str = "corei7";
31255 priority = P_PROC_SSE4_2;
31257 break;
31258 case PROCESSOR_SANDYBRIDGE:
31259 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
31260 arg_str = "ivybridge";
31261 else
31262 arg_str = "sandybridge";
31263 priority = P_PROC_AVX;
31264 break;
31265 case PROCESSOR_HASWELL:
31266 case PROCESSOR_SKYLAKE_AVX512:
31267 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VBMI)
31268 arg_str = "cannonlake";
31269 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31270 arg_str = "skylake-avx512";
31271 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
31272 arg_str = "skylake";
31273 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
31274 arg_str = "broadwell";
31275 else
31276 arg_str = "haswell";
31277 priority = P_PROC_AVX2;
31278 break;
31279 case PROCESSOR_BONNELL:
31280 arg_str = "bonnell";
31281 priority = P_PROC_SSSE3;
31282 break;
31283 case PROCESSOR_KNL:
31284 arg_str = "knl";
31285 priority = P_PROC_AVX512F;
31286 break;
31287 case PROCESSOR_KNM:
31288 arg_str = "knm";
31289 priority = P_PROC_AVX512F;
31290 break;
31291 case PROCESSOR_SILVERMONT:
31292 arg_str = "silvermont";
31293 priority = P_PROC_SSE4_2;
31294 break;
31295 case PROCESSOR_AMDFAM10:
31296 arg_str = "amdfam10h";
31297 priority = P_PROC_SSE4_A;
31298 break;
31299 case PROCESSOR_BTVER1:
31300 arg_str = "btver1";
31301 priority = P_PROC_SSE4_A;
31302 break;
31303 case PROCESSOR_BTVER2:
31304 arg_str = "btver2";
31305 priority = P_PROC_BMI;
31306 break;
31307 case PROCESSOR_BDVER1:
31308 arg_str = "bdver1";
31309 priority = P_PROC_XOP;
31310 break;
31311 case PROCESSOR_BDVER2:
31312 arg_str = "bdver2";
31313 priority = P_PROC_FMA;
31314 break;
31315 case PROCESSOR_BDVER3:
31316 arg_str = "bdver3";
31317 priority = P_PROC_FMA;
31318 break;
31319 case PROCESSOR_BDVER4:
31320 arg_str = "bdver4";
31321 priority = P_PROC_AVX2;
31322 break;
31323 case PROCESSOR_ZNVER1:
31324 arg_str = "znver1";
31325 priority = P_PROC_AVX2;
31326 break;
31330 cl_target_option_restore (&global_options, &cur_target);
31332 if (predicate_list && arg_str == NULL)
31334 error_at (DECL_SOURCE_LOCATION (decl),
31335 "No dispatcher found for the versioning attributes");
31336 return 0;
31339 if (predicate_list)
31341 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
31342 /* For a C string literal the length includes the trailing NULL. */
31343 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
31344 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31345 predicate_chain);
31349 /* Process feature name. */
31350 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
31351 strcpy (tok_str, attrs_str);
31352 token = strtok (tok_str, ",");
31353 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
31355 while (token != NULL)
31357 /* Do not process "arch=" */
31358 if (strncmp (token, "arch=", 5) == 0)
31360 token = strtok (NULL, ",");
31361 continue;
31363 for (i = 0; i < NUM_FEATURES; ++i)
31365 if (strcmp (token, feature_list[i].name) == 0)
31367 if (predicate_list)
31369 predicate_arg = build_string_literal (
31370 strlen (feature_list[i].name) + 1,
31371 feature_list[i].name);
31372 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31373 predicate_chain);
31375 /* Find the maximum priority feature. */
31376 if (feature_list[i].priority > priority)
31377 priority = feature_list[i].priority;
31379 break;
31382 if (predicate_list && i == NUM_FEATURES)
31384 error_at (DECL_SOURCE_LOCATION (decl),
31385 "No dispatcher found for %s", token);
31386 return 0;
31388 token = strtok (NULL, ",");
31390 free (tok_str);
31392 if (predicate_list && predicate_chain == NULL_TREE)
31394 error_at (DECL_SOURCE_LOCATION (decl),
31395 "No dispatcher found for the versioning attributes : %s",
31396 attrs_str);
31397 return 0;
31399 else if (predicate_list)
31401 predicate_chain = nreverse (predicate_chain);
31402 *predicate_list = predicate_chain;
31405 return priority;
31408 /* This compares the priority of target features in function DECL1
31409 and DECL2. It returns positive value if DECL1 is higher priority,
31410 negative value if DECL2 is higher priority and 0 if they are the
31411 same. */
31413 static int
31414 ix86_compare_version_priority (tree decl1, tree decl2)
31416 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
31417 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
31419 return (int)priority1 - (int)priority2;
31422 /* V1 and V2 point to function versions with different priorities
31423 based on the target ISA. This function compares their priorities. */
31425 static int
31426 feature_compare (const void *v1, const void *v2)
31428 typedef struct _function_version_info
31430 tree version_decl;
31431 tree predicate_chain;
31432 unsigned int dispatch_priority;
31433 } function_version_info;
31435 const function_version_info c1 = *(const function_version_info *)v1;
31436 const function_version_info c2 = *(const function_version_info *)v2;
31437 return (c2.dispatch_priority - c1.dispatch_priority);
31440 /* This function generates the dispatch function for
31441 multi-versioned functions. DISPATCH_DECL is the function which will
31442 contain the dispatch logic. FNDECLS are the function choices for
31443 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
31444 in DISPATCH_DECL in which the dispatch code is generated. */
31446 static int
31447 dispatch_function_versions (tree dispatch_decl,
31448 void *fndecls_p,
31449 basic_block *empty_bb)
31451 tree default_decl;
31452 gimple *ifunc_cpu_init_stmt;
31453 gimple_seq gseq;
31454 int ix;
31455 tree ele;
31456 vec<tree> *fndecls;
31457 unsigned int num_versions = 0;
31458 unsigned int actual_versions = 0;
31459 unsigned int i;
31461 struct _function_version_info
31463 tree version_decl;
31464 tree predicate_chain;
31465 unsigned int dispatch_priority;
31466 }*function_version_info;
31468 gcc_assert (dispatch_decl != NULL
31469 && fndecls_p != NULL
31470 && empty_bb != NULL);
31472 /* fndecls_p is actually a vector. */
31473 fndecls = static_cast<vec<tree> *> (fndecls_p);
31475 /* At least one more version other than the default. */
31476 num_versions = fndecls->length ();
31477 gcc_assert (num_versions >= 2);
31479 function_version_info = (struct _function_version_info *)
31480 XNEWVEC (struct _function_version_info, (num_versions - 1));
31482 /* The first version in the vector is the default decl. */
31483 default_decl = (*fndecls)[0];
31485 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
31487 gseq = bb_seq (*empty_bb);
31488 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
31489 constructors, so explicitly call __builtin_cpu_init here. */
31490 ifunc_cpu_init_stmt = gimple_build_call_vec (
31491 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
31492 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
31493 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
31494 set_bb_seq (*empty_bb, gseq);
31496 pop_cfun ();
31499 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
31501 tree version_decl = ele;
31502 tree predicate_chain = NULL_TREE;
31503 unsigned int priority;
31504 /* Get attribute string, parse it and find the right predicate decl.
31505 The predicate function could be a lengthy combination of many
31506 features, like arch-type and various isa-variants. */
31507 priority = get_builtin_code_for_version (version_decl,
31508 &predicate_chain);
31510 if (predicate_chain == NULL_TREE)
31511 continue;
31513 function_version_info [actual_versions].version_decl = version_decl;
31514 function_version_info [actual_versions].predicate_chain
31515 = predicate_chain;
31516 function_version_info [actual_versions].dispatch_priority = priority;
31517 actual_versions++;
31520 /* Sort the versions according to descending order of dispatch priority. The
31521 priority is based on the ISA. This is not a perfect solution. There
31522 could still be ambiguity. If more than one function version is suitable
31523 to execute, which one should be dispatched? In future, allow the user
31524 to specify a dispatch priority next to the version. */
31525 qsort (function_version_info, actual_versions,
31526 sizeof (struct _function_version_info), feature_compare);
31528 for (i = 0; i < actual_versions; ++i)
31529 *empty_bb = add_condition_to_bb (dispatch_decl,
31530 function_version_info[i].version_decl,
31531 function_version_info[i].predicate_chain,
31532 *empty_bb);
31534 /* Dispatch the default version at the end. */
31535 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
31536 NULL, *empty_bb);
31538 free (function_version_info);
31539 return 0;
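/* At the source level this implements function multiversioning (C++), e.g.:

	__attribute__ ((target ("default"))) int foo () { return 0; }
	__attribute__ ((target ("sse4.2")))  int foo () { return 1; }
	__attribute__ ((target ("avx2")))    int foo () { return 2; }

   Calls to foo go through the dispatcher generated here: versions are
   tested in decreasing ISA priority (avx2 before sse4.2), and the default
   version is reached only if no other predicate matches.  */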
31542 /* This function changes the assembler name for functions that are
31543 versions. If DECL is a function version and has a "target"
31544 attribute, it appends the attribute string to its assembler name. */
31546 static tree
31547 ix86_mangle_function_version_assembler_name (tree decl, tree id)
31549 tree version_attr;
31550 const char *orig_name, *version_string;
31551 char *attr_str, *assembler_name;
31553 if (DECL_DECLARED_INLINE_P (decl)
31554 && lookup_attribute ("gnu_inline",
31555 DECL_ATTRIBUTES (decl)))
31556 error_at (DECL_SOURCE_LOCATION (decl),
31557 "Function versions cannot be marked as gnu_inline,"
31558 " bodies have to be generated");
31560 if (DECL_VIRTUAL_P (decl)
31561 || DECL_VINDEX (decl))
31562 sorry ("Virtual function multiversioning not supported");
31564 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31566 /* target attribute string cannot be NULL. */
31567 gcc_assert (version_attr != NULL_TREE);
31569 orig_name = IDENTIFIER_POINTER (id);
31570 version_string
31571 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
31573 if (strcmp (version_string, "default") == 0)
31574 return id;
31576 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
31577 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
31579 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
31581 /* Allow assembler name to be modified if already set. */
31582 if (DECL_ASSEMBLER_NAME_SET_P (decl))
31583 SET_DECL_RTL (decl, NULL);
31585 tree ret = get_identifier (assembler_name);
31586 XDELETEVEC (attr_str);
31587 XDELETEVEC (assembler_name);
31588 return ret;
31592 static tree
31593 ix86_mangle_decl_assembler_name (tree decl, tree id)
31595 /* For function version, add the target suffix to the assembler name. */
31596 if (TREE_CODE (decl) == FUNCTION_DECL
31597 && DECL_FUNCTION_VERSIONED (decl))
31598 id = ix86_mangle_function_version_assembler_name (decl, id);
31599 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
31600 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
31601 #endif
31603 return id;
31606 /* Make a dispatcher declaration for the multi-versioned function DECL.
31607 Calls to DECL function will be replaced with calls to the dispatcher
31608 by the front-end. Returns the decl of the dispatcher function. */
31610 static tree
31611 ix86_get_function_versions_dispatcher (void *decl)
31613 tree fn = (tree) decl;
31614 struct cgraph_node *node = NULL;
31615 struct cgraph_node *default_node = NULL;
31616 struct cgraph_function_version_info *node_v = NULL;
31617 struct cgraph_function_version_info *first_v = NULL;
31619 tree dispatch_decl = NULL;
31621 struct cgraph_function_version_info *default_version_info = NULL;
31623 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
31625 node = cgraph_node::get (fn);
31626 gcc_assert (node != NULL);
31628 node_v = node->function_version ();
31629 gcc_assert (node_v != NULL);
31631 if (node_v->dispatcher_resolver != NULL)
31632 return node_v->dispatcher_resolver;
31634 /* Find the default version and make it the first node. */
31635 first_v = node_v;
31636 /* Go to the beginning of the chain. */
31637 while (first_v->prev != NULL)
31638 first_v = first_v->prev;
31639 default_version_info = first_v;
31640 while (default_version_info != NULL)
31642 if (is_function_default_version
31643 (default_version_info->this_node->decl))
31644 break;
31645 default_version_info = default_version_info->next;
31648 /* If there is no default node, just return NULL. */
31649 if (default_version_info == NULL)
31650 return NULL;
31652 /* Make default info the first node. */
31653 if (first_v != default_version_info)
31655 default_version_info->prev->next = default_version_info->next;
31656 if (default_version_info->next)
31657 default_version_info->next->prev = default_version_info->prev;
31658 first_v->prev = default_version_info;
31659 default_version_info->next = first_v;
31660 default_version_info->prev = NULL;
31663 default_node = default_version_info->this_node;
31665 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
31666 if (targetm.has_ifunc_p ())
31668 struct cgraph_function_version_info *it_v = NULL;
31669 struct cgraph_node *dispatcher_node = NULL;
31670 struct cgraph_function_version_info *dispatcher_version_info = NULL;
31672 /* Right now, the dispatching is done via ifunc. */
31673 dispatch_decl = make_dispatcher_decl (default_node->decl);
31675 dispatcher_node = cgraph_node::get_create (dispatch_decl);
31676 gcc_assert (dispatcher_node != NULL);
31677 dispatcher_node->dispatcher_function = 1;
31678 dispatcher_version_info
31679 = dispatcher_node->insert_new_function_version ();
31680 dispatcher_version_info->next = default_version_info;
31681 dispatcher_node->definition = 1;
31683 /* Set the dispatcher for all the versions. */
31684 it_v = default_version_info;
31685 while (it_v != NULL)
31687 it_v->dispatcher_resolver = dispatch_decl;
31688 it_v = it_v->next;
31691 else
31692 #endif
31694 error_at (DECL_SOURCE_LOCATION (default_node->decl),
31695 "multiversioning needs ifunc which is not supported "
31696 "on this target");
31699 return dispatch_decl;
31702 /* Make the resolver function decl to dispatch the versions of
31703 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
31704 ifunc alias that will point to the created resolver. Create an
31705 empty basic block in the resolver and store the pointer in
31706 EMPTY_BB. Return the decl of the resolver function. */
31708 static tree
31709 make_resolver_func (const tree default_decl,
31710 const tree ifunc_alias_decl,
31711 basic_block *empty_bb)
31713 char *resolver_name;
31714 tree decl, type, decl_name, t;
31716 /* IFUNC's have to be globally visible. So, if the default_decl is
31717 not, then the name of the IFUNC should be made unique. */
31718 if (TREE_PUBLIC (default_decl) == 0)
31720 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
31721 symtab->change_decl_assembler_name (ifunc_alias_decl,
31722 get_identifier (ifunc_name));
31723 XDELETEVEC (ifunc_name);
31726 resolver_name = make_unique_name (default_decl, "resolver", false);
31728 /* The resolver function should return a (void *). */
31729 type = build_function_type_list (ptr_type_node, NULL_TREE);
31731 decl = build_fn_decl (resolver_name, type);
31732 decl_name = get_identifier (resolver_name);
31733 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
31735 DECL_NAME (decl) = decl_name;
31736 TREE_USED (decl) = 1;
31737 DECL_ARTIFICIAL (decl) = 1;
31738 DECL_IGNORED_P (decl) = 1;
31739 TREE_PUBLIC (decl) = 0;
31740 DECL_UNINLINABLE (decl) = 1;
31742 /* Resolver is not external, body is generated. */
31743 DECL_EXTERNAL (decl) = 0;
31744 DECL_EXTERNAL (ifunc_alias_decl) = 0;
31746 DECL_CONTEXT (decl) = NULL_TREE;
31747 DECL_INITIAL (decl) = make_node (BLOCK);
31748 DECL_STATIC_CONSTRUCTOR (decl) = 0;
31750 if (DECL_COMDAT_GROUP (default_decl)
31751 || TREE_PUBLIC (default_decl))
31753 /* In this case, each translation unit with a call to this
31754 versioned function will put out a resolver. Ensure it
31755 is comdat to keep just one copy. */
31756 DECL_COMDAT (decl) = 1;
31757 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
31759 /* Build result decl and add to function_decl. */
31760 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
31761 DECL_ARTIFICIAL (t) = 1;
31762 DECL_IGNORED_P (t) = 1;
31763 DECL_RESULT (decl) = t;
31765 gimplify_function_tree (decl);
31766 push_cfun (DECL_STRUCT_FUNCTION (decl));
31767 *empty_bb = init_lowered_empty_function (decl, false,
31768 profile_count::uninitialized ());
31770 cgraph_node::add_new_function (decl, true);
31771 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
31773 pop_cfun ();
31775 gcc_assert (ifunc_alias_decl != NULL);
31776 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
31777 DECL_ATTRIBUTES (ifunc_alias_decl)
31778 = make_attribute ("ifunc", resolver_name,
31779 DECL_ATTRIBUTES (ifunc_alias_decl));
31781 /* Create the alias for dispatch to resolver here. */
31782 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
31783 XDELETEVEC (resolver_name);
31784 return decl;
31787 /* Generate the dispatching code body to dispatch multi-versioned function
31788 DECL. The target hook is called to process the "target" attributes and
31789 provide the code to dispatch the right function at run-time. NODE points
31790 to the dispatcher decl whose body will be created. */
31792 static tree
31793 ix86_generate_version_dispatcher_body (void *node_p)
31795 tree resolver_decl;
31796 basic_block empty_bb;
31797 tree default_ver_decl;
31798 struct cgraph_node *versn;
31799 struct cgraph_node *node;
31801 struct cgraph_function_version_info *node_version_info = NULL;
31802 struct cgraph_function_version_info *versn_info = NULL;
31804 node = (cgraph_node *)node_p;
31806 node_version_info = node->function_version ();
31807 gcc_assert (node->dispatcher_function
31808 && node_version_info != NULL);
31810 if (node_version_info->dispatcher_resolver)
31811 return node_version_info->dispatcher_resolver;
31813 /* The first version in the chain corresponds to the default version. */
31814 default_ver_decl = node_version_info->next->this_node->decl;
31816 /* node is going to be an alias, so remove the finalized bit. */
31817 node->definition = false;
31819 resolver_decl = make_resolver_func (default_ver_decl,
31820 node->decl, &empty_bb);
31822 node_version_info->dispatcher_resolver = resolver_decl;
31824 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
31826 auto_vec<tree, 2> fn_ver_vec;
31828 for (versn_info = node_version_info->next; versn_info;
31829 versn_info = versn_info->next)
31831 versn = versn_info->this_node;
31832 /* Check for virtual functions here again, as by this time it should
31833 have been determined if this function needs a vtable index or
31834 not. This happens for methods in derived classes that override
31835 virtual methods in base classes but are not explicitly marked as
31836 virtual. */
31837 if (DECL_VINDEX (versn->decl))
31838 sorry ("Virtual function multiversioning not supported");
31840 fn_ver_vec.safe_push (versn->decl);
31843 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
31844 cgraph_edge::rebuild_edges ();
31845 pop_cfun ();
31846 return resolver_decl;
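/* The resulting symbol layout for a versioned function foo is roughly as
   follows (the exact unique names depend on visibility and mangling):

	foo			the IFUNC that callers bind to
	foo.resolver		built by make_resolver_func; returns the
				address of the version chosen at load time
	foo.<target-string>	the individual version bodies, renamed by
				ix86_mangle_function_version_assembler_name
 */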
31848 /* This builds the processor_model struct type defined in
31849 libgcc/config/i386/cpuinfo.c */
31851 static tree
31852 build_processor_model_struct (void)
31854 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
31855 "__cpu_features"};
31856 tree field = NULL_TREE, field_chain = NULL_TREE;
31857 int i;
31858 tree type = make_node (RECORD_TYPE);
31860 /* The first 3 fields are unsigned int. */
31861 for (i = 0; i < 3; ++i)
31863 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31864 get_identifier (field_name[i]), unsigned_type_node);
31865 if (field_chain != NULL_TREE)
31866 DECL_CHAIN (field) = field_chain;
31867 field_chain = field;
31870 /* The last field is an array of unsigned integers of size one. */
31871 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31872 get_identifier (field_name[3]),
31873 build_array_type (unsigned_type_node,
31874 build_index_type (size_one_node)));
31875 if (field_chain != NULL_TREE)
31876 DECL_CHAIN (field) = field_chain;
31877 field_chain = field;
31879 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
31880 return type;
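/* The record built above mirrors the definition in
   libgcc/config/i386/cpuinfo.c:

	struct __processor_model
	{
	  unsigned int __cpu_vendor;
	  unsigned int __cpu_type;
	  unsigned int __cpu_subtype;
	  unsigned int __cpu_features[1];
	} __cpu_model;
 */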
31883 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
31885 static tree
31886 make_var_decl (tree type, const char *name)
31888 tree new_decl;
31890 new_decl = build_decl (UNKNOWN_LOCATION,
31891 VAR_DECL,
31892 get_identifier(name),
31893 type);
31895 DECL_EXTERNAL (new_decl) = 1;
31896 TREE_STATIC (new_decl) = 1;
31897 TREE_PUBLIC (new_decl) = 1;
31898 DECL_INITIAL (new_decl) = 0;
31899 DECL_ARTIFICIAL (new_decl) = 0;
31900 DECL_PRESERVE_P (new_decl) = 1;
31902 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
31903 assemble_variable (new_decl, 0, 0, 0);
31905 return new_decl;
31908 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
31909 into an integer defined in libgcc/config/i386/cpuinfo.c */
31911 static tree
31912 fold_builtin_cpu (tree fndecl, tree *args)
31914 unsigned int i;
31915 enum ix86_builtins fn_code = (enum ix86_builtins)
31916 DECL_FUNCTION_CODE (fndecl);
31917 tree param_string_cst = NULL;
31919 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
31920 enum processor_features
31922 F_CMOV = 0,
31923 F_MMX,
31924 F_POPCNT,
31925 F_SSE,
31926 F_SSE2,
31927 F_SSE3,
31928 F_SSSE3,
31929 F_SSE4_1,
31930 F_SSE4_2,
31931 F_AVX,
31932 F_AVX2,
31933 F_SSE4_A,
31934 F_FMA4,
31935 F_XOP,
31936 F_FMA,
31937 F_AVX512F,
31938 F_BMI,
31939 F_BMI2,
31940 F_AES,
31941 F_PCLMUL,
31942 F_AVX512VL,
31943 F_AVX512BW,
31944 F_AVX512DQ,
31945 F_AVX512CD,
31946 F_AVX512ER,
31947 F_AVX512PF,
31948 F_AVX512VBMI,
31949 F_AVX512IFMA,
31950 F_AVX5124VNNIW,
31951 F_AVX5124FMAPS,
31952 F_AVX512VPOPCNTDQ,
31953 F_MAX
31956 /* These are the values for vendor types and cpu types and subtypes
31957 in cpuinfo.c. Cpu types and subtypes should be subtracted by
31958 the corresponding start value. */
31959 enum processor_model
31961 M_INTEL = 1,
31962 M_AMD,
31963 M_CPU_TYPE_START,
31964 M_INTEL_BONNELL,
31965 M_INTEL_CORE2,
31966 M_INTEL_COREI7,
31967 M_AMDFAM10H,
31968 M_AMDFAM15H,
31969 M_INTEL_SILVERMONT,
31970 M_INTEL_KNL,
31971 M_AMD_BTVER1,
31972 M_AMD_BTVER2,
31973 M_AMDFAM17H,
31974 M_INTEL_KNM,
31975 M_CPU_SUBTYPE_START,
31976 M_INTEL_COREI7_NEHALEM,
31977 M_INTEL_COREI7_WESTMERE,
31978 M_INTEL_COREI7_SANDYBRIDGE,
31979 M_AMDFAM10H_BARCELONA,
31980 M_AMDFAM10H_SHANGHAI,
31981 M_AMDFAM10H_ISTANBUL,
31982 M_AMDFAM15H_BDVER1,
31983 M_AMDFAM15H_BDVER2,
31984 M_AMDFAM15H_BDVER3,
31985 M_AMDFAM15H_BDVER4,
31986 M_AMDFAM17H_ZNVER1,
31987 M_INTEL_COREI7_IVYBRIDGE,
31988 M_INTEL_COREI7_HASWELL,
31989 M_INTEL_COREI7_BROADWELL,
31990 M_INTEL_COREI7_SKYLAKE,
31991 M_INTEL_COREI7_SKYLAKE_AVX512,
31992 M_INTEL_COREI7_CANNONLAKE
31995 static struct _arch_names_table
31997 const char *const name;
31998 const enum processor_model model;
32000 const arch_names_table[] =
32002 {"amd", M_AMD},
32003 {"intel", M_INTEL},
32004 {"atom", M_INTEL_BONNELL},
32005 {"slm", M_INTEL_SILVERMONT},
32006 {"core2", M_INTEL_CORE2},
32007 {"corei7", M_INTEL_COREI7},
32008 {"nehalem", M_INTEL_COREI7_NEHALEM},
32009 {"westmere", M_INTEL_COREI7_WESTMERE},
32010 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
32011 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
32012 {"haswell", M_INTEL_COREI7_HASWELL},
32013 {"broadwell", M_INTEL_COREI7_BROADWELL},
32014 {"skylake", M_INTEL_COREI7_SKYLAKE},
32015 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
32016 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
32017 {"bonnell", M_INTEL_BONNELL},
32018 {"silvermont", M_INTEL_SILVERMONT},
32019 {"knl", M_INTEL_KNL},
32020 {"knm", M_INTEL_KNM},
32021 {"amdfam10h", M_AMDFAM10H},
32022 {"barcelona", M_AMDFAM10H_BARCELONA},
32023 {"shanghai", M_AMDFAM10H_SHANGHAI},
32024 {"istanbul", M_AMDFAM10H_ISTANBUL},
32025 {"btver1", M_AMD_BTVER1},
32026 {"amdfam15h", M_AMDFAM15H},
32027 {"bdver1", M_AMDFAM15H_BDVER1},
32028 {"bdver2", M_AMDFAM15H_BDVER2},
32029 {"bdver3", M_AMDFAM15H_BDVER3},
32030 {"bdver4", M_AMDFAM15H_BDVER4},
32031 {"btver2", M_AMD_BTVER2},
32032 {"amdfam17h", M_AMDFAM17H},
32033 {"znver1", M_AMDFAM17H_ZNVER1},
32036 static struct _isa_names_table
32038 const char *const name;
32039 const enum processor_features feature;
32041 const isa_names_table[] =
32043 {"cmov", F_CMOV},
32044 {"mmx", F_MMX},
32045 {"popcnt", F_POPCNT},
32046 {"sse", F_SSE},
32047 {"sse2", F_SSE2},
32048 {"sse3", F_SSE3},
32049 {"ssse3", F_SSSE3},
32050 {"sse4a", F_SSE4_A},
32051 {"sse4.1", F_SSE4_1},
32052 {"sse4.2", F_SSE4_2},
32053 {"avx", F_AVX},
32054 {"fma4", F_FMA4},
32055 {"xop", F_XOP},
32056 {"fma", F_FMA},
32057 {"avx2", F_AVX2},
32058 {"avx512f", F_AVX512F},
32059 {"bmi", F_BMI},
32060 {"bmi2", F_BMI2},
32061 {"aes", F_AES},
32062 {"pclmul", F_PCLMUL},
32063 {"avx512vl",F_AVX512VL},
32064 {"avx512bw",F_AVX512BW},
32065 {"avx512dq",F_AVX512DQ},
32066 {"avx512cd",F_AVX512CD},
32067 {"avx512er",F_AVX512ER},
32068 {"avx512pf",F_AVX512PF},
32069 {"avx512vbmi",F_AVX512VBMI},
32070 {"avx512ifma",F_AVX512IFMA},
32071 {"avx5124vnniw",F_AVX5124VNNIW},
32072 {"avx5124fmaps",F_AVX5124FMAPS},
32073 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
32076 tree __processor_model_type = build_processor_model_struct ();
32077 tree __cpu_model_var = make_var_decl (__processor_model_type,
32078 "__cpu_model");
32081 varpool_node::add (__cpu_model_var);
32083 gcc_assert ((args != NULL) && (*args != NULL));
32085 param_string_cst = *args;
32086 while (param_string_cst
32087 && TREE_CODE (param_string_cst) != STRING_CST)
32089 /* *args must be an expr that can contain other EXPRS leading to a
32090 STRING_CST. */
32091 if (!EXPR_P (param_string_cst))
32093 error ("Parameter to builtin must be a string constant or literal");
32094 return integer_zero_node;
32096 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
32099 gcc_assert (param_string_cst);
32101 if (fn_code == IX86_BUILTIN_CPU_IS)
32103 tree ref;
32104 tree field;
32105 tree final;
32107 unsigned int field_val = 0;
32108 unsigned int NUM_ARCH_NAMES
32109 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
32111 for (i = 0; i < NUM_ARCH_NAMES; i++)
32112 if (strcmp (arch_names_table[i].name,
32113 TREE_STRING_POINTER (param_string_cst)) == 0)
32114 break;
32116 if (i == NUM_ARCH_NAMES)
32118 error ("Parameter to builtin not valid: %s",
32119 TREE_STRING_POINTER (param_string_cst));
32120 return integer_zero_node;
32123 field = TYPE_FIELDS (__processor_model_type);
32124 field_val = arch_names_table[i].model;
32126 /* CPU types are stored in the next field. */
32127 if (field_val > M_CPU_TYPE_START
32128 && field_val < M_CPU_SUBTYPE_START)
32130 field = DECL_CHAIN (field);
32131 field_val -= M_CPU_TYPE_START;
32134 /* CPU subtypes are stored in the next field. */
32135 if (field_val > M_CPU_SUBTYPE_START)
32137 field = DECL_CHAIN ( DECL_CHAIN (field));
32138 field_val -= M_CPU_SUBTYPE_START;
32141 /* Get the appropriate field in __cpu_model. */
32142 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32143 field, NULL_TREE);
32145 /* Check the value. */
32146 final = build2 (EQ_EXPR, unsigned_type_node, ref,
32147 build_int_cstu (unsigned_type_node, field_val));
32148 return build1 (CONVERT_EXPR, integer_type_node, final);
32150 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
32152 tree ref;
32153 tree array_elt;
32154 tree field;
32155 tree final;
32157 unsigned int field_val = 0;
32158 unsigned int NUM_ISA_NAMES
32159 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
32161 for (i = 0; i < NUM_ISA_NAMES; i++)
32162 if (strcmp (isa_names_table[i].name,
32163 TREE_STRING_POINTER (param_string_cst)) == 0)
32164 break;
32166 if (i == NUM_ISA_NAMES)
32168 error ("Parameter to builtin not valid: %s",
32169 TREE_STRING_POINTER (param_string_cst));
32170 return integer_zero_node;
32173 field = TYPE_FIELDS (__processor_model_type);
32174 /* Get the last field, which is __cpu_features. */
32175 while (DECL_CHAIN (field))
32176 field = DECL_CHAIN (field);
32178 /* Get the appropriate field: __cpu_model.__cpu_features */
32179 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32180 field, NULL_TREE);
32182 /* Access the 0th element of __cpu_features array. */
32183 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
32184 integer_zero_node, NULL_TREE, NULL_TREE);
32186 field_val = (1 << isa_names_table[i].feature);
32187 /* Return __cpu_model.__cpu_features[0] & field_val */
32188 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
32189 build_int_cstu (unsigned_type_node, field_val));
32190 return build1 (CONVERT_EXPR, integer_type_node, final);
32192 gcc_unreachable ();
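/* For example, __builtin_cpu_is ("intel") folds to roughly
   (int) (__cpu_model.__cpu_vendor == M_INTEL), while
   __builtin_cpu_supports ("avx2") folds to roughly
   (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2)).  */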
32195 static tree
32196 ix86_fold_builtin (tree fndecl, int n_args,
32197 tree *args, bool ignore ATTRIBUTE_UNUSED)
32199 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32201 enum ix86_builtins fn_code = (enum ix86_builtins)
32202 DECL_FUNCTION_CODE (fndecl);
32203 switch (fn_code)
32205 case IX86_BUILTIN_CPU_IS:
32206 case IX86_BUILTIN_CPU_SUPPORTS:
32207 gcc_assert (n_args == 1);
32208 return fold_builtin_cpu (fndecl, args);
32210 case IX86_BUILTIN_NANQ:
32211 case IX86_BUILTIN_NANSQ:
32213 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32214 const char *str = c_getstr (*args);
32215 int quiet = fn_code == IX86_BUILTIN_NANQ;
32216 REAL_VALUE_TYPE real;
32218 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
32219 return build_real (type, real);
32220 return NULL_TREE;
32223 case IX86_BUILTIN_INFQ:
32224 case IX86_BUILTIN_HUGE_VALQ:
32226 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32227 REAL_VALUE_TYPE inf;
32228 real_inf (&inf);
32229 return build_real (type, inf);
32232 case IX86_BUILTIN_TZCNT16:
32233 case IX86_BUILTIN_CTZS:
32234 case IX86_BUILTIN_TZCNT32:
32235 case IX86_BUILTIN_TZCNT64:
32236 gcc_assert (n_args == 1);
32237 if (TREE_CODE (args[0]) == INTEGER_CST)
32239 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32240 tree arg = args[0];
32241 if (fn_code == IX86_BUILTIN_TZCNT16
32242 || fn_code == IX86_BUILTIN_CTZS)
32243 arg = fold_convert (short_unsigned_type_node, arg);
32244 if (integer_zerop (arg))
32245 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32246 else
32247 return fold_const_call (CFN_CTZ, type, arg);
32249 break;
32251 case IX86_BUILTIN_LZCNT16:
32252 case IX86_BUILTIN_CLZS:
32253 case IX86_BUILTIN_LZCNT32:
32254 case IX86_BUILTIN_LZCNT64:
32255 gcc_assert (n_args == 1);
32256 if (TREE_CODE (args[0]) == INTEGER_CST)
32258 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32259 tree arg = args[0];
32260 if (fn_code == IX86_BUILTIN_LZCNT16
32261 || fn_code == IX86_BUILTIN_CLZS)
32262 arg = fold_convert (short_unsigned_type_node, arg);
32263 if (integer_zerop (arg))
32264 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32265 else
32266 return fold_const_call (CFN_CLZ, type, arg);
32268 break;
32270 case IX86_BUILTIN_BEXTR32:
32271 case IX86_BUILTIN_BEXTR64:
32272 case IX86_BUILTIN_BEXTRI32:
32273 case IX86_BUILTIN_BEXTRI64:
32274 gcc_assert (n_args == 2);
32275 if (tree_fits_uhwi_p (args[1]))
32277 unsigned HOST_WIDE_INT res = 0;
32278 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
32279 unsigned int start = tree_to_uhwi (args[1]);
32280 unsigned int len = (start & 0xff00) >> 8;
32281 start &= 0xff;
32282 if (start >= prec || len == 0)
32283 res = 0;
32284 else if (!tree_fits_uhwi_p (args[0]))
32285 break;
32286 else
32287 res = tree_to_uhwi (args[0]) >> start;
32288 if (len > prec)
32289 len = prec;
32290 if (len < HOST_BITS_PER_WIDE_INT)
32291 res &= (HOST_WIDE_INT_1U << len) - 1;
32292 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32294 break;
32296 case IX86_BUILTIN_BZHI32:
32297 case IX86_BUILTIN_BZHI64:
32298 gcc_assert (n_args == 2);
32299 if (tree_fits_uhwi_p (args[1]))
32301 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
32302 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
32303 return args[0];
32304 if (!tree_fits_uhwi_p (args[0]))
32305 break;
32306 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
32307 res &= ~(HOST_WIDE_INT_M1U << idx);
32308 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32310 break;
32312 case IX86_BUILTIN_PDEP32:
32313 case IX86_BUILTIN_PDEP64:
32314 gcc_assert (n_args == 2);
32315 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32317 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32318 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32319 unsigned HOST_WIDE_INT res = 0;
32320 unsigned HOST_WIDE_INT m, k = 1;
32321 for (m = 1; m; m <<= 1)
32322 if ((mask & m) != 0)
32324 if ((src & k) != 0)
32325 res |= m;
32326 k <<= 1;
32328 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32330 break;
32332 case IX86_BUILTIN_PEXT32:
32333 case IX86_BUILTIN_PEXT64:
32334 gcc_assert (n_args == 2);
32335 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32337 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32338 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32339 unsigned HOST_WIDE_INT res = 0;
32340 unsigned HOST_WIDE_INT m, k = 1;
32341 for (m = 1; m; m <<= 1)
32342 if ((mask & m) != 0)
32344 if ((src & m) != 0)
32345 res |= k;
32346 k <<= 1;
32348 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32350 break;
32352 default:
32353 break;
32357 #ifdef SUBTARGET_FOLD_BUILTIN
32358 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
32359 #endif
32361 return NULL_TREE;
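/* Examples of the constant folding above, written with the user-level
   intrinsic names for readability:

	_tzcnt_u32 (0)		-> 32 (operand precision when the input is 0)
	_bzhi_u32 (~0u, 8)	-> 0xff (bits from position 8 upward cleared)
	_bextr_u32 (x, s, l)	-> (x >> s) masked to the low L bits
	_pdep_u32 (0x5, 0x38)	-> 0x28 (low source bits deposited at mask bits)
	_pext_u32 (0x2a, 0x38)	-> 0x5 (mask-selected bits packed into low bits)
 */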
32364 /* Fold a MD builtin (use ix86_fold_builtin for folding into
32365 constant) in GIMPLE. */
32367 bool
32368 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
32370 gimple *stmt = gsi_stmt (*gsi);
32371 tree fndecl = gimple_call_fndecl (stmt);
32372 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
32373 int n_args = gimple_call_num_args (stmt);
32374 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
32375 tree decl = NULL_TREE;
32376 tree arg0, arg1;
32378 switch (fn_code)
32380 case IX86_BUILTIN_TZCNT32:
32381 decl = builtin_decl_implicit (BUILT_IN_CTZ);
32382 goto fold_tzcnt_lzcnt;
32384 case IX86_BUILTIN_TZCNT64:
32385 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
32386 goto fold_tzcnt_lzcnt;
32388 case IX86_BUILTIN_LZCNT32:
32389 decl = builtin_decl_implicit (BUILT_IN_CLZ);
32390 goto fold_tzcnt_lzcnt;
32392 case IX86_BUILTIN_LZCNT64:
32393 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
32394 goto fold_tzcnt_lzcnt;
32396 fold_tzcnt_lzcnt:
32397 gcc_assert (n_args == 1);
32398 arg0 = gimple_call_arg (stmt, 0);
32399 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
32401 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
32402 /* If arg0 is provably non-zero, optimize into generic
32403 __builtin_c[tl]z{,ll} function the middle-end handles
32404 better. */
32405 if (!expr_not_equal_to (arg0, wi::zero (prec)))
32406 return false;
32408 location_t loc = gimple_location (stmt);
32409 gimple *g = gimple_build_call (decl, 1, arg0);
32410 gimple_set_location (g, loc);
32411 tree lhs = make_ssa_name (integer_type_node);
32412 gimple_call_set_lhs (g, lhs);
32413 gsi_insert_before (gsi, g, GSI_SAME_STMT);
32414 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
32415 gimple_set_location (g, loc);
32416 gsi_replace (gsi, g, false);
32417 return true;
32419 break;
32421 case IX86_BUILTIN_BZHI32:
32422 case IX86_BUILTIN_BZHI64:
32423 gcc_assert (n_args == 2);
32424 arg1 = gimple_call_arg (stmt, 1);
32425 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
32427 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
32428 arg0 = gimple_call_arg (stmt, 0);
32429 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
32430 break;
32431 location_t loc = gimple_location (stmt);
32432 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32433 gimple_set_location (g, loc);
32434 gsi_replace (gsi, g, false);
32435 return true;
32437 break;
32439 case IX86_BUILTIN_PDEP32:
32440 case IX86_BUILTIN_PDEP64:
32441 case IX86_BUILTIN_PEXT32:
32442 case IX86_BUILTIN_PEXT64:
32443 gcc_assert (n_args == 2);
32444 arg1 = gimple_call_arg (stmt, 1);
32445 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
32447 location_t loc = gimple_location (stmt);
32448 arg0 = gimple_call_arg (stmt, 0);
32449 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32450 gimple_set_location (g, loc);
32451 gsi_replace (gsi, g, false);
32452 return true;
32454 break;
32456 default:
32457 break;
32460 return false;
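/* Examples of the GIMPLE-level folds above:

	_tzcnt_u32 (x)		-> __builtin_ctz (x) when X is provably nonzero
	_bzhi_u32 (x, 37)	-> x (an index >= the precision copies the source)
	_pdep_u32 (x, ~0u)	-> x (an all-ones mask is the identity)
	_pext_u32 (x, ~0u)	-> x
 */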
32463 /* Make builtins to detect cpu type and features supported. NAME is
32464 the builtin name, CODE is the builtin code, and FTYPE is the function
32465 type of the builtin. */
32467 static void
32468 make_cpu_type_builtin (const char* name, int code,
32469 enum ix86_builtin_func_type ftype, bool is_const)
32471 tree decl;
32472 tree type;
32474 type = ix86_get_builtin_func_type (ftype);
32475 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32476 NULL, NULL_TREE);
32477 gcc_assert (decl != NULL_TREE);
32478 ix86_builtins[(int) code] = decl;
32479 TREE_READONLY (decl) = is_const;
32482 /* Make builtins to get CPU type and features supported. The created
32483 builtins are:
32485 __builtin_cpu_init (), to detect cpu type and features,
32486 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
32487 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
32490 static void
32491 ix86_init_platform_type_builtins (void)
32493 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
32494 INT_FTYPE_VOID, false);
32495 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
32496 INT_FTYPE_PCCHAR, true);
32497 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
32498 INT_FTYPE_PCCHAR, true);
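/* A minimal usage sketch of the builtins created above (user code, not
   part of this file; use_avx2_path is a placeholder for whatever the
   caller dispatches to):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
       use_avx2_path ();
*/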
32501 /* Internal method for ix86_init_builtins. */
32503 static void
32504 ix86_init_builtins_va_builtins_abi (void)
32506 tree ms_va_ref, sysv_va_ref;
32507 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
32508 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
32509 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
32510 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
32512 if (!TARGET_64BIT)
32513 return;
32514 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
32515 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
32516 ms_va_ref = build_reference_type (ms_va_list_type_node);
32517 sysv_va_ref =
32518 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
32520 fnvoid_va_end_ms =
32521 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32522 fnvoid_va_start_ms =
32523 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32524 fnvoid_va_end_sysv =
32525 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
32526 fnvoid_va_start_sysv =
32527 build_varargs_function_type_list (void_type_node, sysv_va_ref,
32528 NULL_TREE);
32529 fnvoid_va_copy_ms =
32530 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
32531 NULL_TREE);
32532 fnvoid_va_copy_sysv =
32533 build_function_type_list (void_type_node, sysv_va_ref,
32534 sysv_va_ref, NULL_TREE);
32536 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
32537 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
32538 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
32539 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
32540 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
32541 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
32542 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
32543 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32544 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
32545 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32546 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
32547 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
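/* A minimal usage sketch of the MS-ABI variants registered above (user
   code on a 64-bit target; the SysV set is analogous):

     int __attribute__ ((ms_abi))
     sum (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;
       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
         s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }
*/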
32550 static void
32551 ix86_init_builtin_types (void)
32553 tree float80_type_node, const_string_type_node;
32555 /* The __float80 type. */
32556 float80_type_node = long_double_type_node;
32557 if (TYPE_MODE (float80_type_node) != XFmode)
32559 if (float64x_type_node != NULL_TREE
32560 && TYPE_MODE (float64x_type_node) == XFmode)
32561 float80_type_node = float64x_type_node;
32562 else
32564 /* The __float80 type. */
32565 float80_type_node = make_node (REAL_TYPE);
32567 TYPE_PRECISION (float80_type_node) = 80;
32568 layout_type (float80_type_node);
32571 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
32573 /* The __float128 type. The node has already been created as
32574 _Float128, so we only need to register the __float128 name for
32575 it. */
32576 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
32578 const_string_type_node
32579 = build_pointer_type (build_qualified_type
32580 (char_type_node, TYPE_QUAL_CONST));
32582 /* This macro is built by i386-builtin-types.awk. */
32583 DEFINE_BUILTIN_PRIMITIVE_TYPES;
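/* After this point user code can spell the types directly, e.g.
   "__float80 e;" and "__float128 q;".  On targets where long double is
   already XFmode, __float80 is simply an alias for it, as set up
   above.  */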
32586 static void
32587 ix86_init_builtins (void)
32589 tree ftype, decl;
32591 ix86_init_builtin_types ();
32593 /* Builtins to get CPU type and features. */
32594 ix86_init_platform_type_builtins ();
32596 /* TFmode support builtins. */
32597 def_builtin_const (0, "__builtin_infq",
32598 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
32599 def_builtin_const (0, "__builtin_huge_valq",
32600 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
32602 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
32603 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
32604 BUILT_IN_MD, "nanq", NULL_TREE);
32605 TREE_READONLY (decl) = 1;
32606 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
32608 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
32609 BUILT_IN_MD, "nansq", NULL_TREE);
32610 TREE_READONLY (decl) = 1;
32611 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
32613 /* We will expand them to normal calls if SSE isn't available, since
32614 they are used by libgcc. */
32615 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
32616 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
32617 BUILT_IN_MD, "__fabstf2", NULL_TREE);
32618 TREE_READONLY (decl) = 1;
32619 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
32621 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
32622 decl = add_builtin_function ("__builtin_copysignq", ftype,
32623 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
32624 "__copysigntf3", NULL_TREE);
32625 TREE_READONLY (decl) = 1;
32626 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
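/* A minimal usage sketch of the TFmode builtins defined above (user
   code; x stands for any __float128 value supplied by the caller):

     __float128 inf = __builtin_infq ();
     __float128 mag = __builtin_fabsq (x);
     __float128 neg = __builtin_copysignq (mag, (__float128) -1.0);
*/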
32628 ix86_init_tm_builtins ();
32629 ix86_init_mmx_sse_builtins ();
32630 ix86_init_mpx_builtins ();
32632 if (TARGET_LP64)
32633 ix86_init_builtins_va_builtins_abi ();
32635 #ifdef SUBTARGET_INIT_BUILTINS
32636 SUBTARGET_INIT_BUILTINS;
32637 #endif
32640 /* Return the ix86 builtin for CODE. */
32642 static tree
32643 ix86_builtin_decl (unsigned code, bool)
32645 if (code >= IX86_BUILTIN_MAX)
32646 return error_mark_node;
32648 return ix86_builtins[code];
32651 /* Errors in the source file can cause expand_expr to return const0_rtx
32652 where we expect a vector. To avoid crashing, use one of the vector
32653 clear instructions. */
32654 static rtx
32655 safe_vector_operand (rtx x, machine_mode mode)
32657 if (x == const0_rtx)
32658 x = CONST0_RTX (mode);
32659 return x;
32662 /* Fix up modeless constants to fit the required mode. */
32663 static rtx
32664 fixup_modeless_constant (rtx x, machine_mode mode)
32666 if (GET_MODE (x) == VOIDmode)
32667 x = convert_to_mode (mode, x, 1);
32668 return x;
32671 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
32673 static rtx
32674 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
32676 rtx pat;
32677 tree arg0 = CALL_EXPR_ARG (exp, 0);
32678 tree arg1 = CALL_EXPR_ARG (exp, 1);
32679 rtx op0 = expand_normal (arg0);
32680 rtx op1 = expand_normal (arg1);
32681 machine_mode tmode = insn_data[icode].operand[0].mode;
32682 machine_mode mode0 = insn_data[icode].operand[1].mode;
32683 machine_mode mode1 = insn_data[icode].operand[2].mode;
32685 if (VECTOR_MODE_P (mode0))
32686 op0 = safe_vector_operand (op0, mode0);
32687 if (VECTOR_MODE_P (mode1))
32688 op1 = safe_vector_operand (op1, mode1);
32690 if (optimize || !target
32691 || GET_MODE (target) != tmode
32692 || !insn_data[icode].operand[0].predicate (target, tmode))
32693 target = gen_reg_rtx (tmode);
32695 if (GET_MODE (op1) == SImode && mode1 == TImode)
32697 rtx x = gen_reg_rtx (V4SImode);
32698 emit_insn (gen_sse2_loadd (x, op1));
32699 op1 = gen_lowpart (TImode, x);
32702 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32703 op0 = copy_to_mode_reg (mode0, op0);
32704 if (!insn_data[icode].operand[2].predicate (op1, mode1))
32705 op1 = copy_to_mode_reg (mode1, op1);
32707 pat = GEN_FCN (icode) (target, op0, op1);
32708 if (! pat)
32709 return 0;
32711 emit_insn (pat);
32713 return target;
32716 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
32718 static rtx
32719 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
32720 enum ix86_builtin_func_type m_type,
32721 enum rtx_code sub_code)
32723 rtx pat;
32724 int i;
32725 int nargs;
32726 bool comparison_p = false;
32727 bool tf_p = false;
32728 bool last_arg_constant = false;
32729 int num_memory = 0;
32730 struct {
32731 rtx op;
32732 machine_mode mode;
32733 } args[4];
32735 machine_mode tmode = insn_data[icode].operand[0].mode;
32737 switch (m_type)
32739 case MULTI_ARG_4_DF2_DI_I:
32740 case MULTI_ARG_4_DF2_DI_I1:
32741 case MULTI_ARG_4_SF2_SI_I:
32742 case MULTI_ARG_4_SF2_SI_I1:
32743 nargs = 4;
32744 last_arg_constant = true;
32745 break;
32747 case MULTI_ARG_3_SF:
32748 case MULTI_ARG_3_DF:
32749 case MULTI_ARG_3_SF2:
32750 case MULTI_ARG_3_DF2:
32751 case MULTI_ARG_3_DI:
32752 case MULTI_ARG_3_SI:
32753 case MULTI_ARG_3_SI_DI:
32754 case MULTI_ARG_3_HI:
32755 case MULTI_ARG_3_HI_SI:
32756 case MULTI_ARG_3_QI:
32757 case MULTI_ARG_3_DI2:
32758 case MULTI_ARG_3_SI2:
32759 case MULTI_ARG_3_HI2:
32760 case MULTI_ARG_3_QI2:
32761 nargs = 3;
32762 break;
32764 case MULTI_ARG_2_SF:
32765 case MULTI_ARG_2_DF:
32766 case MULTI_ARG_2_DI:
32767 case MULTI_ARG_2_SI:
32768 case MULTI_ARG_2_HI:
32769 case MULTI_ARG_2_QI:
32770 nargs = 2;
32771 break;
32773 case MULTI_ARG_2_DI_IMM:
32774 case MULTI_ARG_2_SI_IMM:
32775 case MULTI_ARG_2_HI_IMM:
32776 case MULTI_ARG_2_QI_IMM:
32777 nargs = 2;
32778 last_arg_constant = true;
32779 break;
32781 case MULTI_ARG_1_SF:
32782 case MULTI_ARG_1_DF:
32783 case MULTI_ARG_1_SF2:
32784 case MULTI_ARG_1_DF2:
32785 case MULTI_ARG_1_DI:
32786 case MULTI_ARG_1_SI:
32787 case MULTI_ARG_1_HI:
32788 case MULTI_ARG_1_QI:
32789 case MULTI_ARG_1_SI_DI:
32790 case MULTI_ARG_1_HI_DI:
32791 case MULTI_ARG_1_HI_SI:
32792 case MULTI_ARG_1_QI_DI:
32793 case MULTI_ARG_1_QI_SI:
32794 case MULTI_ARG_1_QI_HI:
32795 nargs = 1;
32796 break;
32798 case MULTI_ARG_2_DI_CMP:
32799 case MULTI_ARG_2_SI_CMP:
32800 case MULTI_ARG_2_HI_CMP:
32801 case MULTI_ARG_2_QI_CMP:
32802 nargs = 2;
32803 comparison_p = true;
32804 break;
32806 case MULTI_ARG_2_SF_TF:
32807 case MULTI_ARG_2_DF_TF:
32808 case MULTI_ARG_2_DI_TF:
32809 case MULTI_ARG_2_SI_TF:
32810 case MULTI_ARG_2_HI_TF:
32811 case MULTI_ARG_2_QI_TF:
32812 nargs = 2;
32813 tf_p = true;
32814 break;
32816 default:
32817 gcc_unreachable ();
32820 if (optimize || !target
32821 || GET_MODE (target) != tmode
32822 || !insn_data[icode].operand[0].predicate (target, tmode))
32823 target = gen_reg_rtx (tmode);
32824 else if (memory_operand (target, tmode))
32825 num_memory++;
32827 gcc_assert (nargs <= 4);
32829 for (i = 0; i < nargs; i++)
32831 tree arg = CALL_EXPR_ARG (exp, i);
32832 rtx op = expand_normal (arg);
32833 int adjust = (comparison_p) ? 1 : 0;
32834 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
32836 if (last_arg_constant && i == nargs - 1)
32838 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
32840 enum insn_code new_icode = icode;
32841 switch (icode)
32843 case CODE_FOR_xop_vpermil2v2df3:
32844 case CODE_FOR_xop_vpermil2v4sf3:
32845 case CODE_FOR_xop_vpermil2v4df3:
32846 case CODE_FOR_xop_vpermil2v8sf3:
32847 error ("the last argument must be a 2-bit immediate");
32848 return gen_reg_rtx (tmode);
32849 case CODE_FOR_xop_rotlv2di3:
32850 new_icode = CODE_FOR_rotlv2di3;
32851 goto xop_rotl;
32852 case CODE_FOR_xop_rotlv4si3:
32853 new_icode = CODE_FOR_rotlv4si3;
32854 goto xop_rotl;
32855 case CODE_FOR_xop_rotlv8hi3:
32856 new_icode = CODE_FOR_rotlv8hi3;
32857 goto xop_rotl;
32858 case CODE_FOR_xop_rotlv16qi3:
32859 new_icode = CODE_FOR_rotlv16qi3;
32860 xop_rotl:
32861 if (CONST_INT_P (op))
32863 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
32864 op = GEN_INT (INTVAL (op) & mask);
32865 gcc_checking_assert
32866 (insn_data[icode].operand[i + 1].predicate (op, mode));
32868 else
32870 gcc_checking_assert
32871 (nargs == 2
32872 && insn_data[new_icode].operand[0].mode == tmode
32873 && insn_data[new_icode].operand[1].mode == tmode
32874 && insn_data[new_icode].operand[2].mode == mode
32875 && insn_data[new_icode].operand[0].predicate
32876 == insn_data[icode].operand[0].predicate
32877 && insn_data[new_icode].operand[1].predicate
32878 == insn_data[icode].operand[1].predicate);
32879 icode = new_icode;
32880 goto non_constant;
32882 break;
32883 default:
32884 gcc_unreachable ();
32888 else
32890 non_constant:
32891 if (VECTOR_MODE_P (mode))
32892 op = safe_vector_operand (op, mode);
32894 /* If we aren't optimizing, only allow one memory operand to be
32895 generated. */
32896 if (memory_operand (op, mode))
32897 num_memory++;
32899 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
32901 if (optimize
32902 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
32903 || num_memory > 1)
32904 op = force_reg (mode, op);
32907 args[i].op = op;
32908 args[i].mode = mode;
32911 switch (nargs)
32913 case 1:
32914 pat = GEN_FCN (icode) (target, args[0].op);
32915 break;
32917 case 2:
32918 if (tf_p)
32919 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
32920 GEN_INT ((int)sub_code));
32921 else if (! comparison_p)
32922 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32923 else
32925 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
32926 args[0].op,
32927 args[1].op);
32929 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
32931 break;
32933 case 3:
32934 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32935 break;
32937 case 4:
32938 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
32939 break;
32941 default:
32942 gcc_unreachable ();
32945 if (! pat)
32946 return 0;
32948 emit_insn (pat);
32949 return target;
32952 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
32953 insns with vec_merge. */
32955 static rtx
32956 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
32957 rtx target)
32959 rtx pat;
32960 tree arg0 = CALL_EXPR_ARG (exp, 0);
32961 rtx op1, op0 = expand_normal (arg0);
32962 machine_mode tmode = insn_data[icode].operand[0].mode;
32963 machine_mode mode0 = insn_data[icode].operand[1].mode;
32965 if (optimize || !target
32966 || GET_MODE (target) != tmode
32967 || !insn_data[icode].operand[0].predicate (target, tmode))
32968 target = gen_reg_rtx (tmode);
32970 if (VECTOR_MODE_P (mode0))
32971 op0 = safe_vector_operand (op0, mode0);
32973 if ((optimize && !register_operand (op0, mode0))
32974 || !insn_data[icode].operand[1].predicate (op0, mode0))
32975 op0 = copy_to_mode_reg (mode0, op0);
32977 op1 = op0;
32978 if (!insn_data[icode].operand[2].predicate (op1, mode0))
32979 op1 = copy_to_mode_reg (mode0, op1);
32981 pat = GEN_FCN (icode) (target, op0, op1);
32982 if (! pat)
32983 return 0;
32984 emit_insn (pat);
32985 return target;
32988 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
32990 static rtx
32991 ix86_expand_sse_compare (const struct builtin_description *d,
32992 tree exp, rtx target, bool swap)
32994 rtx pat;
32995 tree arg0 = CALL_EXPR_ARG (exp, 0);
32996 tree arg1 = CALL_EXPR_ARG (exp, 1);
32997 rtx op0 = expand_normal (arg0);
32998 rtx op1 = expand_normal (arg1);
32999 rtx op2;
33000 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33001 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33002 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33003 enum rtx_code comparison = d->comparison;
33005 if (VECTOR_MODE_P (mode0))
33006 op0 = safe_vector_operand (op0, mode0);
33007 if (VECTOR_MODE_P (mode1))
33008 op1 = safe_vector_operand (op1, mode1);
33010 /* Swap operands if we have a comparison that isn't available in
33011 hardware. */
33012 if (swap)
33013 std::swap (op0, op1);
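/* For instance, comparisons such as greater-than, which the SSE compare
   patterns cannot encode directly, are typically described with a
   swapped less-than; the exchange here makes the operand order match
   what the hardware pattern expects.  */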
33015 if (optimize || !target
33016 || GET_MODE (target) != tmode
33017 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33018 target = gen_reg_rtx (tmode);
33020 if ((optimize && !register_operand (op0, mode0))
33021 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33022 op0 = copy_to_mode_reg (mode0, op0);
33023 if ((optimize && !register_operand (op1, mode1))
33024 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33025 op1 = copy_to_mode_reg (mode1, op1);
33027 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33028 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33029 if (! pat)
33030 return 0;
33031 emit_insn (pat);
33032 return target;
33035 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
33037 static rtx
33038 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33039 rtx target)
33041 rtx pat;
33042 tree arg0 = CALL_EXPR_ARG (exp, 0);
33043 tree arg1 = CALL_EXPR_ARG (exp, 1);
33044 rtx op0 = expand_normal (arg0);
33045 rtx op1 = expand_normal (arg1);
33046 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33047 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33048 enum rtx_code comparison = d->comparison;
33050 if (VECTOR_MODE_P (mode0))
33051 op0 = safe_vector_operand (op0, mode0);
33052 if (VECTOR_MODE_P (mode1))
33053 op1 = safe_vector_operand (op1, mode1);
33055 /* Swap operands if we have a comparison that isn't available in
33056 hardware. */
33057 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33058 std::swap (op0, op1);
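/* Build the result in the low byte of a zeroed SImode register: emit
   the flag-setting comparison below, set the QImode lowpart from the
   condition, and hand back the SImode register.  */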
33060 target = gen_reg_rtx (SImode);
33061 emit_move_insn (target, const0_rtx);
33062 target = gen_rtx_SUBREG (QImode, target, 0);
33064 if ((optimize && !register_operand (op0, mode0))
33065 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33066 op0 = copy_to_mode_reg (mode0, op0);
33067 if ((optimize && !register_operand (op1, mode1))
33068 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33069 op1 = copy_to_mode_reg (mode1, op1);
33071 pat = GEN_FCN (d->icode) (op0, op1);
33072 if (! pat)
33073 return 0;
33074 emit_insn (pat);
33075 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33076 gen_rtx_fmt_ee (comparison, QImode,
33077 SET_DEST (pat),
33078 const0_rtx)));
33080 return SUBREG_REG (target);
33083 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
33085 static rtx
33086 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33087 rtx target)
33089 rtx pat;
33090 tree arg0 = CALL_EXPR_ARG (exp, 0);
33091 rtx op1, op0 = expand_normal (arg0);
33092 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33093 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33095 if (optimize || target == 0
33096 || GET_MODE (target) != tmode
33097 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33098 target = gen_reg_rtx (tmode);
33100 if (VECTOR_MODE_P (mode0))
33101 op0 = safe_vector_operand (op0, mode0);
33103 if ((optimize && !register_operand (op0, mode0))
33104 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33105 op0 = copy_to_mode_reg (mode0, op0);
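/* For the round builtins the comparison field of D carries the
   rounding-mode immediate (e.g. ROUND_FLOOR) rather than an RTL
   comparison code; pass it through as the last operand.  */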
33107 op1 = GEN_INT (d->comparison);
33109 pat = GEN_FCN (d->icode) (target, op0, op1);
33110 if (! pat)
33111 return 0;
33112 emit_insn (pat);
33113 return target;
33116 static rtx
33117 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
33118 tree exp, rtx target)
33120 rtx pat;
33121 tree arg0 = CALL_EXPR_ARG (exp, 0);
33122 tree arg1 = CALL_EXPR_ARG (exp, 1);
33123 rtx op0 = expand_normal (arg0);
33124 rtx op1 = expand_normal (arg1);
33125 rtx op2;
33126 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33127 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33128 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33130 if (optimize || target == 0
33131 || GET_MODE (target) != tmode
33132 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33133 target = gen_reg_rtx (tmode);
33135 op0 = safe_vector_operand (op0, mode0);
33136 op1 = safe_vector_operand (op1, mode1);
33138 if ((optimize && !register_operand (op0, mode0))
33139 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33140 op0 = copy_to_mode_reg (mode0, op0);
33141 if ((optimize && !register_operand (op1, mode1))
33142 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33143 op1 = copy_to_mode_reg (mode1, op1);
33145 op2 = GEN_INT (d->comparison);
33147 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33148 if (! pat)
33149 return 0;
33150 emit_insn (pat);
33151 return target;
33154 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
33156 static rtx
33157 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
33158 rtx target)
33160 rtx pat;
33161 tree arg0 = CALL_EXPR_ARG (exp, 0);
33162 tree arg1 = CALL_EXPR_ARG (exp, 1);
33163 rtx op0 = expand_normal (arg0);
33164 rtx op1 = expand_normal (arg1);
33165 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33166 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33167 enum rtx_code comparison = d->comparison;
33169 if (VECTOR_MODE_P (mode0))
33170 op0 = safe_vector_operand (op0, mode0);
33171 if (VECTOR_MODE_P (mode1))
33172 op1 = safe_vector_operand (op1, mode1);
33174 target = gen_reg_rtx (SImode);
33175 emit_move_insn (target, const0_rtx);
33176 target = gen_rtx_SUBREG (QImode, target, 0);
33178 if ((optimize && !register_operand (op0, mode0))
33179 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33180 op0 = copy_to_mode_reg (mode0, op0);
33181 if ((optimize && !register_operand (op1, mode1))
33182 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33183 op1 = copy_to_mode_reg (mode1, op1);
33185 pat = GEN_FCN (d->icode) (op0, op1);
33186 if (! pat)
33187 return 0;
33188 emit_insn (pat);
33189 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33190 gen_rtx_fmt_ee (comparison, QImode,
33191 SET_DEST (pat),
33192 const0_rtx)));
33194 return SUBREG_REG (target);
33197 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
33199 static rtx
33200 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
33201 tree exp, rtx target)
33203 rtx pat;
33204 tree arg0 = CALL_EXPR_ARG (exp, 0);
33205 tree arg1 = CALL_EXPR_ARG (exp, 1);
33206 tree arg2 = CALL_EXPR_ARG (exp, 2);
33207 tree arg3 = CALL_EXPR_ARG (exp, 3);
33208 tree arg4 = CALL_EXPR_ARG (exp, 4);
33209 rtx scratch0, scratch1;
33210 rtx op0 = expand_normal (arg0);
33211 rtx op1 = expand_normal (arg1);
33212 rtx op2 = expand_normal (arg2);
33213 rtx op3 = expand_normal (arg3);
33214 rtx op4 = expand_normal (arg4);
33215 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
33217 tmode0 = insn_data[d->icode].operand[0].mode;
33218 tmode1 = insn_data[d->icode].operand[1].mode;
33219 modev2 = insn_data[d->icode].operand[2].mode;
33220 modei3 = insn_data[d->icode].operand[3].mode;
33221 modev4 = insn_data[d->icode].operand[4].mode;
33222 modei5 = insn_data[d->icode].operand[5].mode;
33223 modeimm = insn_data[d->icode].operand[6].mode;
33225 if (VECTOR_MODE_P (modev2))
33226 op0 = safe_vector_operand (op0, modev2);
33227 if (VECTOR_MODE_P (modev4))
33228 op2 = safe_vector_operand (op2, modev4);
33230 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33231 op0 = copy_to_mode_reg (modev2, op0);
33232 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
33233 op1 = copy_to_mode_reg (modei3, op1);
33234 if ((optimize && !register_operand (op2, modev4))
33235 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
33236 op2 = copy_to_mode_reg (modev4, op2);
33237 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
33238 op3 = copy_to_mode_reg (modei5, op3);
33240 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
33242 error ("the fifth argument must be an 8-bit immediate");
33243 return const0_rtx;
33246 if (d->code == IX86_BUILTIN_PCMPESTRI128)
33248 if (optimize || !target
33249 || GET_MODE (target) != tmode0
33250 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33251 target = gen_reg_rtx (tmode0);
33253 scratch1 = gen_reg_rtx (tmode1);
33255 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
33257 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
33259 if (optimize || !target
33260 || GET_MODE (target) != tmode1
33261 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33262 target = gen_reg_rtx (tmode1);
33264 scratch0 = gen_reg_rtx (tmode0);
33266 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
33268 else
33270 gcc_assert (d->flag);
33272 scratch0 = gen_reg_rtx (tmode0);
33273 scratch1 = gen_reg_rtx (tmode1);
33275 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
33278 if (! pat)
33279 return 0;
33281 emit_insn (pat);
33283 if (d->flag)
33285 target = gen_reg_rtx (SImode);
33286 emit_move_insn (target, const0_rtx);
33287 target = gen_rtx_SUBREG (QImode, target, 0);
33289 emit_insn
33290 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33291 gen_rtx_fmt_ee (EQ, QImode,
33292 gen_rtx_REG ((machine_mode) d->flag,
33293 FLAGS_REG),
33294 const0_rtx)));
33295 return SUBREG_REG (target);
33297 else
33298 return target;
33302 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
33304 static rtx
33305 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
33306 tree exp, rtx target)
33308 rtx pat;
33309 tree arg0 = CALL_EXPR_ARG (exp, 0);
33310 tree arg1 = CALL_EXPR_ARG (exp, 1);
33311 tree arg2 = CALL_EXPR_ARG (exp, 2);
33312 rtx scratch0, scratch1;
33313 rtx op0 = expand_normal (arg0);
33314 rtx op1 = expand_normal (arg1);
33315 rtx op2 = expand_normal (arg2);
33316 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
33318 tmode0 = insn_data[d->icode].operand[0].mode;
33319 tmode1 = insn_data[d->icode].operand[1].mode;
33320 modev2 = insn_data[d->icode].operand[2].mode;
33321 modev3 = insn_data[d->icode].operand[3].mode;
33322 modeimm = insn_data[d->icode].operand[4].mode;
33324 if (VECTOR_MODE_P (modev2))
33325 op0 = safe_vector_operand (op0, modev2);
33326 if (VECTOR_MODE_P (modev3))
33327 op1 = safe_vector_operand (op1, modev3);
33329 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33330 op0 = copy_to_mode_reg (modev2, op0);
33331 if ((optimize && !register_operand (op1, modev3))
33332 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
33333 op1 = copy_to_mode_reg (modev3, op1);
33335 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
33337 error ("the third argument must be an 8-bit immediate");
33338 return const0_rtx;
33341 if (d->code == IX86_BUILTIN_PCMPISTRI128)
33343 if (optimize || !target
33344 || GET_MODE (target) != tmode0
33345 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33346 target = gen_reg_rtx (tmode0);
33348 scratch1 = gen_reg_rtx (tmode1);
33350 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
33352 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
33354 if (optimize || !target
33355 || GET_MODE (target) != tmode1
33356 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33357 target = gen_reg_rtx (tmode1);
33359 scratch0 = gen_reg_rtx (tmode0);
33361 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
33363 else
33365 gcc_assert (d->flag);
33367 scratch0 = gen_reg_rtx (tmode0);
33368 scratch1 = gen_reg_rtx (tmode1);
33370 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
33373 if (! pat)
33374 return 0;
33376 emit_insn (pat);
33378 if (d->flag)
33380 target = gen_reg_rtx (SImode);
33381 emit_move_insn (target, const0_rtx);
33382 target = gen_rtx_SUBREG (QImode, target, 0);
33384 emit_insn
33385 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33386 gen_rtx_fmt_ee (EQ, QImode,
33387 gen_rtx_REG ((machine_mode) d->flag,
33388 FLAGS_REG),
33389 const0_rtx)));
33390 return SUBREG_REG (target);
33392 else
33393 return target;
33396 /* Subroutine of ix86_expand_builtin to take care of insns with a
33397 variable number of operands. */
33399 static rtx
33400 ix86_expand_args_builtin (const struct builtin_description *d,
33401 tree exp, rtx target)
33403 rtx pat, real_target;
33404 unsigned int i, nargs;
33405 unsigned int nargs_constant = 0;
33406 unsigned int mask_pos = 0;
33407 int num_memory = 0;
33408 struct
33410 rtx op;
33411 machine_mode mode;
33412 } args[6];
33413 bool second_arg_count = false;
33414 enum insn_code icode = d->icode;
33415 const struct insn_data_d *insn_p = &insn_data[icode];
33416 machine_mode tmode = insn_p->operand[0].mode;
33417 machine_mode rmode = VOIDmode;
33418 bool swap = false;
33419 enum rtx_code comparison = d->comparison;
33421 switch ((enum ix86_builtin_func_type) d->flag)
33423 case V2DF_FTYPE_V2DF_ROUND:
33424 case V4DF_FTYPE_V4DF_ROUND:
33425 case V8DF_FTYPE_V8DF_ROUND:
33426 case V4SF_FTYPE_V4SF_ROUND:
33427 case V8SF_FTYPE_V8SF_ROUND:
33428 case V16SF_FTYPE_V16SF_ROUND:
33429 case V4SI_FTYPE_V4SF_ROUND:
33430 case V8SI_FTYPE_V8SF_ROUND:
33431 case V16SI_FTYPE_V16SF_ROUND:
33432 return ix86_expand_sse_round (d, exp, target);
33433 case V4SI_FTYPE_V2DF_V2DF_ROUND:
33434 case V8SI_FTYPE_V4DF_V4DF_ROUND:
33435 case V16SI_FTYPE_V8DF_V8DF_ROUND:
33436 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
33437 case INT_FTYPE_V8SF_V8SF_PTEST:
33438 case INT_FTYPE_V4DI_V4DI_PTEST:
33439 case INT_FTYPE_V4DF_V4DF_PTEST:
33440 case INT_FTYPE_V4SF_V4SF_PTEST:
33441 case INT_FTYPE_V2DI_V2DI_PTEST:
33442 case INT_FTYPE_V2DF_V2DF_PTEST:
33443 return ix86_expand_sse_ptest (d, exp, target);
33444 case FLOAT128_FTYPE_FLOAT128:
33445 case FLOAT_FTYPE_FLOAT:
33446 case INT_FTYPE_INT:
33447 case UINT_FTYPE_UINT:
33448 case UINT16_FTYPE_UINT16:
33449 case UINT64_FTYPE_INT:
33450 case UINT64_FTYPE_UINT64:
33451 case INT64_FTYPE_INT64:
33452 case INT64_FTYPE_V4SF:
33453 case INT64_FTYPE_V2DF:
33454 case INT_FTYPE_V16QI:
33455 case INT_FTYPE_V8QI:
33456 case INT_FTYPE_V8SF:
33457 case INT_FTYPE_V4DF:
33458 case INT_FTYPE_V4SF:
33459 case INT_FTYPE_V2DF:
33460 case INT_FTYPE_V32QI:
33461 case V16QI_FTYPE_V16QI:
33462 case V8SI_FTYPE_V8SF:
33463 case V8SI_FTYPE_V4SI:
33464 case V8HI_FTYPE_V8HI:
33465 case V8HI_FTYPE_V16QI:
33466 case V8QI_FTYPE_V8QI:
33467 case V8SF_FTYPE_V8SF:
33468 case V8SF_FTYPE_V8SI:
33469 case V8SF_FTYPE_V4SF:
33470 case V8SF_FTYPE_V8HI:
33471 case V4SI_FTYPE_V4SI:
33472 case V4SI_FTYPE_V16QI:
33473 case V4SI_FTYPE_V4SF:
33474 case V4SI_FTYPE_V8SI:
33475 case V4SI_FTYPE_V8HI:
33476 case V4SI_FTYPE_V4DF:
33477 case V4SI_FTYPE_V2DF:
33478 case V4HI_FTYPE_V4HI:
33479 case V4DF_FTYPE_V4DF:
33480 case V4DF_FTYPE_V4SI:
33481 case V4DF_FTYPE_V4SF:
33482 case V4DF_FTYPE_V2DF:
33483 case V4SF_FTYPE_V4SF:
33484 case V4SF_FTYPE_V4SI:
33485 case V4SF_FTYPE_V8SF:
33486 case V4SF_FTYPE_V4DF:
33487 case V4SF_FTYPE_V8HI:
33488 case V4SF_FTYPE_V2DF:
33489 case V2DI_FTYPE_V2DI:
33490 case V2DI_FTYPE_V16QI:
33491 case V2DI_FTYPE_V8HI:
33492 case V2DI_FTYPE_V4SI:
33493 case V2DF_FTYPE_V2DF:
33494 case V2DF_FTYPE_V4SI:
33495 case V2DF_FTYPE_V4DF:
33496 case V2DF_FTYPE_V4SF:
33497 case V2DF_FTYPE_V2SI:
33498 case V2SI_FTYPE_V2SI:
33499 case V2SI_FTYPE_V4SF:
33500 case V2SI_FTYPE_V2SF:
33501 case V2SI_FTYPE_V2DF:
33502 case V2SF_FTYPE_V2SF:
33503 case V2SF_FTYPE_V2SI:
33504 case V32QI_FTYPE_V32QI:
33505 case V32QI_FTYPE_V16QI:
33506 case V16HI_FTYPE_V16HI:
33507 case V16HI_FTYPE_V8HI:
33508 case V8SI_FTYPE_V8SI:
33509 case V16HI_FTYPE_V16QI:
33510 case V8SI_FTYPE_V16QI:
33511 case V4DI_FTYPE_V16QI:
33512 case V8SI_FTYPE_V8HI:
33513 case V4DI_FTYPE_V8HI:
33514 case V4DI_FTYPE_V4SI:
33515 case V4DI_FTYPE_V2DI:
33516 case UQI_FTYPE_UQI:
33517 case UHI_FTYPE_UHI:
33518 case USI_FTYPE_USI:
33519 case USI_FTYPE_UQI:
33520 case USI_FTYPE_UHI:
33521 case UDI_FTYPE_UDI:
33522 case UHI_FTYPE_V16QI:
33523 case USI_FTYPE_V32QI:
33524 case UDI_FTYPE_V64QI:
33525 case V16QI_FTYPE_UHI:
33526 case V32QI_FTYPE_USI:
33527 case V64QI_FTYPE_UDI:
33528 case V8HI_FTYPE_UQI:
33529 case V16HI_FTYPE_UHI:
33530 case V32HI_FTYPE_USI:
33531 case V4SI_FTYPE_UQI:
33532 case V8SI_FTYPE_UQI:
33533 case V4SI_FTYPE_UHI:
33534 case V8SI_FTYPE_UHI:
33535 case UQI_FTYPE_V8HI:
33536 case UHI_FTYPE_V16HI:
33537 case USI_FTYPE_V32HI:
33538 case UQI_FTYPE_V4SI:
33539 case UQI_FTYPE_V8SI:
33540 case UHI_FTYPE_V16SI:
33541 case UQI_FTYPE_V2DI:
33542 case UQI_FTYPE_V4DI:
33543 case UQI_FTYPE_V8DI:
33544 case V16SI_FTYPE_UHI:
33545 case V2DI_FTYPE_UQI:
33546 case V4DI_FTYPE_UQI:
33547 case V16SI_FTYPE_INT:
33548 case V16SF_FTYPE_V8SF:
33549 case V16SI_FTYPE_V8SI:
33550 case V16SF_FTYPE_V4SF:
33551 case V16SI_FTYPE_V4SI:
33552 case V16SI_FTYPE_V16SF:
33553 case V16SI_FTYPE_V16SI:
33554 case V16SF_FTYPE_V16SF:
33555 case V8DI_FTYPE_UQI:
33556 case V8DI_FTYPE_V8DI:
33557 case V8DF_FTYPE_V4DF:
33558 case V8DF_FTYPE_V2DF:
33559 case V8DF_FTYPE_V8DF:
33560 nargs = 1;
33561 break;
33562 case V4SF_FTYPE_V4SF_VEC_MERGE:
33563 case V2DF_FTYPE_V2DF_VEC_MERGE:
33564 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
33565 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
33566 case V16QI_FTYPE_V16QI_V16QI:
33567 case V16QI_FTYPE_V8HI_V8HI:
33568 case V16SF_FTYPE_V16SF_V16SF:
33569 case V8QI_FTYPE_V8QI_V8QI:
33570 case V8QI_FTYPE_V4HI_V4HI:
33571 case V8HI_FTYPE_V8HI_V8HI:
33572 case V8HI_FTYPE_V16QI_V16QI:
33573 case V8HI_FTYPE_V4SI_V4SI:
33574 case V8SF_FTYPE_V8SF_V8SF:
33575 case V8SF_FTYPE_V8SF_V8SI:
33576 case V8DF_FTYPE_V8DF_V8DF:
33577 case V4SI_FTYPE_V4SI_V4SI:
33578 case V4SI_FTYPE_V8HI_V8HI:
33579 case V4SI_FTYPE_V2DF_V2DF:
33580 case V4HI_FTYPE_V4HI_V4HI:
33581 case V4HI_FTYPE_V8QI_V8QI:
33582 case V4HI_FTYPE_V2SI_V2SI:
33583 case V4DF_FTYPE_V4DF_V4DF:
33584 case V4DF_FTYPE_V4DF_V4DI:
33585 case V4SF_FTYPE_V4SF_V4SF:
33586 case V4SF_FTYPE_V4SF_V4SI:
33587 case V4SF_FTYPE_V4SF_V2SI:
33588 case V4SF_FTYPE_V4SF_V2DF:
33589 case V4SF_FTYPE_V4SF_UINT:
33590 case V4SF_FTYPE_V4SF_DI:
33591 case V4SF_FTYPE_V4SF_SI:
33592 case V2DI_FTYPE_V2DI_V2DI:
33593 case V2DI_FTYPE_V16QI_V16QI:
33594 case V2DI_FTYPE_V4SI_V4SI:
33595 case V2DI_FTYPE_V2DI_V16QI:
33596 case V2SI_FTYPE_V2SI_V2SI:
33597 case V2SI_FTYPE_V4HI_V4HI:
33598 case V2SI_FTYPE_V2SF_V2SF:
33599 case V2DF_FTYPE_V2DF_V2DF:
33600 case V2DF_FTYPE_V2DF_V4SF:
33601 case V2DF_FTYPE_V2DF_V2DI:
33602 case V2DF_FTYPE_V2DF_DI:
33603 case V2DF_FTYPE_V2DF_SI:
33604 case V2DF_FTYPE_V2DF_UINT:
33605 case V2SF_FTYPE_V2SF_V2SF:
33606 case V1DI_FTYPE_V1DI_V1DI:
33607 case V1DI_FTYPE_V8QI_V8QI:
33608 case V1DI_FTYPE_V2SI_V2SI:
33609 case V32QI_FTYPE_V16HI_V16HI:
33610 case V16HI_FTYPE_V8SI_V8SI:
33611 case V64QI_FTYPE_V64QI_V64QI:
33612 case V32QI_FTYPE_V32QI_V32QI:
33613 case V16HI_FTYPE_V32QI_V32QI:
33614 case V16HI_FTYPE_V16HI_V16HI:
33615 case V8SI_FTYPE_V4DF_V4DF:
33616 case V8SI_FTYPE_V8SI_V8SI:
33617 case V8SI_FTYPE_V16HI_V16HI:
33618 case V4DI_FTYPE_V4DI_V4DI:
33619 case V4DI_FTYPE_V8SI_V8SI:
33620 case V8DI_FTYPE_V64QI_V64QI:
33621 if (comparison == UNKNOWN)
33622 return ix86_expand_binop_builtin (icode, exp, target);
33623 nargs = 2;
33624 break;
33625 case V4SF_FTYPE_V4SF_V4SF_SWAP:
33626 case V2DF_FTYPE_V2DF_V2DF_SWAP:
33627 gcc_assert (comparison != UNKNOWN);
33628 nargs = 2;
33629 swap = true;
33630 break;
33631 case V16HI_FTYPE_V16HI_V8HI_COUNT:
33632 case V16HI_FTYPE_V16HI_SI_COUNT:
33633 case V8SI_FTYPE_V8SI_V4SI_COUNT:
33634 case V8SI_FTYPE_V8SI_SI_COUNT:
33635 case V4DI_FTYPE_V4DI_V2DI_COUNT:
33636 case V4DI_FTYPE_V4DI_INT_COUNT:
33637 case V8HI_FTYPE_V8HI_V8HI_COUNT:
33638 case V8HI_FTYPE_V8HI_SI_COUNT:
33639 case V4SI_FTYPE_V4SI_V4SI_COUNT:
33640 case V4SI_FTYPE_V4SI_SI_COUNT:
33641 case V4HI_FTYPE_V4HI_V4HI_COUNT:
33642 case V4HI_FTYPE_V4HI_SI_COUNT:
33643 case V2DI_FTYPE_V2DI_V2DI_COUNT:
33644 case V2DI_FTYPE_V2DI_SI_COUNT:
33645 case V2SI_FTYPE_V2SI_V2SI_COUNT:
33646 case V2SI_FTYPE_V2SI_SI_COUNT:
33647 case V1DI_FTYPE_V1DI_V1DI_COUNT:
33648 case V1DI_FTYPE_V1DI_SI_COUNT:
33649 nargs = 2;
33650 second_arg_count = true;
33651 break;
33652 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
33653 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
33654 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
33655 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
33656 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
33657 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
33658 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
33659 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
33660 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
33661 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
33662 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
33663 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
33664 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
33665 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
33666 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
33667 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
33668 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
33669 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
33670 nargs = 4;
33671 second_arg_count = true;
33672 break;
33673 case UINT64_FTYPE_UINT64_UINT64:
33674 case UINT_FTYPE_UINT_UINT:
33675 case UINT_FTYPE_UINT_USHORT:
33676 case UINT_FTYPE_UINT_UCHAR:
33677 case UINT16_FTYPE_UINT16_INT:
33678 case UINT8_FTYPE_UINT8_INT:
33679 case UQI_FTYPE_UQI_UQI:
33680 case UHI_FTYPE_UHI_UHI:
33681 case USI_FTYPE_USI_USI:
33682 case UDI_FTYPE_UDI_UDI:
33683 case V16SI_FTYPE_V8DF_V8DF:
33684 nargs = 2;
33685 break;
33686 case V2DI_FTYPE_V2DI_INT_CONVERT:
33687 nargs = 2;
33688 rmode = V1TImode;
33689 nargs_constant = 1;
33690 break;
33691 case V4DI_FTYPE_V4DI_INT_CONVERT:
33692 nargs = 2;
33693 rmode = V2TImode;
33694 nargs_constant = 1;
33695 break;
33696 case V8DI_FTYPE_V8DI_INT_CONVERT:
33697 nargs = 2;
33698 rmode = V4TImode;
33699 nargs_constant = 1;
33700 break;
33701 case V8HI_FTYPE_V8HI_INT:
33702 case V8HI_FTYPE_V8SF_INT:
33703 case V16HI_FTYPE_V16SF_INT:
33704 case V8HI_FTYPE_V4SF_INT:
33705 case V8SF_FTYPE_V8SF_INT:
33706 case V4SF_FTYPE_V16SF_INT:
33707 case V16SF_FTYPE_V16SF_INT:
33708 case V4SI_FTYPE_V4SI_INT:
33709 case V4SI_FTYPE_V8SI_INT:
33710 case V4HI_FTYPE_V4HI_INT:
33711 case V4DF_FTYPE_V4DF_INT:
33712 case V4DF_FTYPE_V8DF_INT:
33713 case V4SF_FTYPE_V4SF_INT:
33714 case V4SF_FTYPE_V8SF_INT:
33715 case V2DI_FTYPE_V2DI_INT:
33716 case V2DF_FTYPE_V2DF_INT:
33717 case V2DF_FTYPE_V4DF_INT:
33718 case V16HI_FTYPE_V16HI_INT:
33719 case V8SI_FTYPE_V8SI_INT:
33720 case V16SI_FTYPE_V16SI_INT:
33721 case V4SI_FTYPE_V16SI_INT:
33722 case V4DI_FTYPE_V4DI_INT:
33723 case V2DI_FTYPE_V4DI_INT:
33724 case V4DI_FTYPE_V8DI_INT:
33725 case QI_FTYPE_V4SF_INT:
33726 case QI_FTYPE_V2DF_INT:
33727 case UQI_FTYPE_UQI_UQI_CONST:
33728 case UHI_FTYPE_UHI_UQI:
33729 case USI_FTYPE_USI_UQI:
33730 case UDI_FTYPE_UDI_UQI:
33731 nargs = 2;
33732 nargs_constant = 1;
33733 break;
33734 case V16QI_FTYPE_V16QI_V16QI_V16QI:
33735 case V8SF_FTYPE_V8SF_V8SF_V8SF:
33736 case V4DF_FTYPE_V4DF_V4DF_V4DF:
33737 case V4SF_FTYPE_V4SF_V4SF_V4SF:
33738 case V2DF_FTYPE_V2DF_V2DF_V2DF:
33739 case V32QI_FTYPE_V32QI_V32QI_V32QI:
33740 case UHI_FTYPE_V16SI_V16SI_UHI:
33741 case UQI_FTYPE_V8DI_V8DI_UQI:
33742 case V16HI_FTYPE_V16SI_V16HI_UHI:
33743 case V16QI_FTYPE_V16SI_V16QI_UHI:
33744 case V16QI_FTYPE_V8DI_V16QI_UQI:
33745 case V16SF_FTYPE_V16SF_V16SF_UHI:
33746 case V16SF_FTYPE_V4SF_V16SF_UHI:
33747 case V16SI_FTYPE_SI_V16SI_UHI:
33748 case V16SI_FTYPE_V16HI_V16SI_UHI:
33749 case V16SI_FTYPE_V16QI_V16SI_UHI:
33750 case V8SF_FTYPE_V4SF_V8SF_UQI:
33751 case V4DF_FTYPE_V2DF_V4DF_UQI:
33752 case V8SI_FTYPE_V4SI_V8SI_UQI:
33753 case V8SI_FTYPE_SI_V8SI_UQI:
33754 case V4SI_FTYPE_V4SI_V4SI_UQI:
33755 case V4SI_FTYPE_SI_V4SI_UQI:
33756 case V4DI_FTYPE_V2DI_V4DI_UQI:
33757 case V4DI_FTYPE_DI_V4DI_UQI:
33758 case V2DI_FTYPE_V2DI_V2DI_UQI:
33759 case V2DI_FTYPE_DI_V2DI_UQI:
33760 case V64QI_FTYPE_V64QI_V64QI_UDI:
33761 case V64QI_FTYPE_V16QI_V64QI_UDI:
33762 case V64QI_FTYPE_QI_V64QI_UDI:
33763 case V32QI_FTYPE_V32QI_V32QI_USI:
33764 case V32QI_FTYPE_V16QI_V32QI_USI:
33765 case V32QI_FTYPE_QI_V32QI_USI:
33766 case V16QI_FTYPE_V16QI_V16QI_UHI:
33767 case V16QI_FTYPE_QI_V16QI_UHI:
33768 case V32HI_FTYPE_V8HI_V32HI_USI:
33769 case V32HI_FTYPE_HI_V32HI_USI:
33770 case V16HI_FTYPE_V8HI_V16HI_UHI:
33771 case V16HI_FTYPE_HI_V16HI_UHI:
33772 case V8HI_FTYPE_V8HI_V8HI_UQI:
33773 case V8HI_FTYPE_HI_V8HI_UQI:
33774 case V8SF_FTYPE_V8HI_V8SF_UQI:
33775 case V4SF_FTYPE_V8HI_V4SF_UQI:
33776 case V8SI_FTYPE_V8SF_V8SI_UQI:
33777 case V4SI_FTYPE_V4SF_V4SI_UQI:
33778 case V4DI_FTYPE_V4SF_V4DI_UQI:
33779 case V2DI_FTYPE_V4SF_V2DI_UQI:
33780 case V4SF_FTYPE_V4DI_V4SF_UQI:
33781 case V4SF_FTYPE_V2DI_V4SF_UQI:
33782 case V4DF_FTYPE_V4DI_V4DF_UQI:
33783 case V2DF_FTYPE_V2DI_V2DF_UQI:
33784 case V16QI_FTYPE_V8HI_V16QI_UQI:
33785 case V16QI_FTYPE_V16HI_V16QI_UHI:
33786 case V16QI_FTYPE_V4SI_V16QI_UQI:
33787 case V16QI_FTYPE_V8SI_V16QI_UQI:
33788 case V8HI_FTYPE_V4SI_V8HI_UQI:
33789 case V8HI_FTYPE_V8SI_V8HI_UQI:
33790 case V16QI_FTYPE_V2DI_V16QI_UQI:
33791 case V16QI_FTYPE_V4DI_V16QI_UQI:
33792 case V8HI_FTYPE_V2DI_V8HI_UQI:
33793 case V8HI_FTYPE_V4DI_V8HI_UQI:
33794 case V4SI_FTYPE_V2DI_V4SI_UQI:
33795 case V4SI_FTYPE_V4DI_V4SI_UQI:
33796 case V32QI_FTYPE_V32HI_V32QI_USI:
33797 case UHI_FTYPE_V16QI_V16QI_UHI:
33798 case USI_FTYPE_V32QI_V32QI_USI:
33799 case UDI_FTYPE_V64QI_V64QI_UDI:
33800 case UQI_FTYPE_V8HI_V8HI_UQI:
33801 case UHI_FTYPE_V16HI_V16HI_UHI:
33802 case USI_FTYPE_V32HI_V32HI_USI:
33803 case UQI_FTYPE_V4SI_V4SI_UQI:
33804 case UQI_FTYPE_V8SI_V8SI_UQI:
33805 case UQI_FTYPE_V2DI_V2DI_UQI:
33806 case UQI_FTYPE_V4DI_V4DI_UQI:
33807 case V4SF_FTYPE_V2DF_V4SF_UQI:
33808 case V4SF_FTYPE_V4DF_V4SF_UQI:
33809 case V16SI_FTYPE_V16SI_V16SI_UHI:
33810 case V16SI_FTYPE_V4SI_V16SI_UHI:
33811 case V2DI_FTYPE_V4SI_V2DI_UQI:
33812 case V2DI_FTYPE_V8HI_V2DI_UQI:
33813 case V2DI_FTYPE_V16QI_V2DI_UQI:
33814 case V4DI_FTYPE_V4DI_V4DI_UQI:
33815 case V4DI_FTYPE_V4SI_V4DI_UQI:
33816 case V4DI_FTYPE_V8HI_V4DI_UQI:
33817 case V4DI_FTYPE_V16QI_V4DI_UQI:
33818 case V4DI_FTYPE_V4DF_V4DI_UQI:
33819 case V2DI_FTYPE_V2DF_V2DI_UQI:
33820 case V4SI_FTYPE_V4DF_V4SI_UQI:
33821 case V4SI_FTYPE_V2DF_V4SI_UQI:
33822 case V4SI_FTYPE_V8HI_V4SI_UQI:
33823 case V4SI_FTYPE_V16QI_V4SI_UQI:
33824 case V4DI_FTYPE_V4DI_V4DI_V4DI:
33825 case V8DF_FTYPE_V2DF_V8DF_UQI:
33826 case V8DF_FTYPE_V4DF_V8DF_UQI:
33827 case V8DF_FTYPE_V8DF_V8DF_UQI:
33828 case V8SF_FTYPE_V8SF_V8SF_UQI:
33829 case V8SF_FTYPE_V8SI_V8SF_UQI:
33830 case V4DF_FTYPE_V4DF_V4DF_UQI:
33831 case V4SF_FTYPE_V4SF_V4SF_UQI:
33832 case V2DF_FTYPE_V2DF_V2DF_UQI:
33833 case V2DF_FTYPE_V4SF_V2DF_UQI:
33834 case V2DF_FTYPE_V4SI_V2DF_UQI:
33835 case V4SF_FTYPE_V4SI_V4SF_UQI:
33836 case V4DF_FTYPE_V4SF_V4DF_UQI:
33837 case V4DF_FTYPE_V4SI_V4DF_UQI:
33838 case V8SI_FTYPE_V8SI_V8SI_UQI:
33839 case V8SI_FTYPE_V8HI_V8SI_UQI:
33840 case V8SI_FTYPE_V16QI_V8SI_UQI:
33841 case V8DF_FTYPE_V8SI_V8DF_UQI:
33842 case V8DI_FTYPE_DI_V8DI_UQI:
33843 case V16SF_FTYPE_V8SF_V16SF_UHI:
33844 case V16SI_FTYPE_V8SI_V16SI_UHI:
33845 case V16HI_FTYPE_V16HI_V16HI_UHI:
33846 case V8HI_FTYPE_V16QI_V8HI_UQI:
33847 case V16HI_FTYPE_V16QI_V16HI_UHI:
33848 case V32HI_FTYPE_V32HI_V32HI_USI:
33849 case V32HI_FTYPE_V32QI_V32HI_USI:
33850 case V8DI_FTYPE_V16QI_V8DI_UQI:
33851 case V8DI_FTYPE_V2DI_V8DI_UQI:
33852 case V8DI_FTYPE_V4DI_V8DI_UQI:
33853 case V8DI_FTYPE_V8DI_V8DI_UQI:
33854 case V8DI_FTYPE_V8HI_V8DI_UQI:
33855 case V8DI_FTYPE_V8SI_V8DI_UQI:
33856 case V8HI_FTYPE_V8DI_V8HI_UQI:
33857 case V8SI_FTYPE_V8DI_V8SI_UQI:
33858 case V4SI_FTYPE_V4SI_V4SI_V4SI:
33859 case V16SI_FTYPE_V16SI_V16SI_V16SI:
33860 case V8DI_FTYPE_V8DI_V8DI_V8DI:
33861 case V32HI_FTYPE_V32HI_V32HI_V32HI:
33862 case V2DI_FTYPE_V2DI_V2DI_V2DI:
33863 case V16HI_FTYPE_V16HI_V16HI_V16HI:
33864 case V8SI_FTYPE_V8SI_V8SI_V8SI:
33865 case V8HI_FTYPE_V8HI_V8HI_V8HI:
33866 nargs = 3;
33867 break;
33868 case V32QI_FTYPE_V32QI_V32QI_INT:
33869 case V16HI_FTYPE_V16HI_V16HI_INT:
33870 case V16QI_FTYPE_V16QI_V16QI_INT:
33871 case V4DI_FTYPE_V4DI_V4DI_INT:
33872 case V8HI_FTYPE_V8HI_V8HI_INT:
33873 case V8SI_FTYPE_V8SI_V8SI_INT:
33874 case V8SI_FTYPE_V8SI_V4SI_INT:
33875 case V8SF_FTYPE_V8SF_V8SF_INT:
33876 case V8SF_FTYPE_V8SF_V4SF_INT:
33877 case V4SI_FTYPE_V4SI_V4SI_INT:
33878 case V4DF_FTYPE_V4DF_V4DF_INT:
33879 case V16SF_FTYPE_V16SF_V16SF_INT:
33880 case V16SF_FTYPE_V16SF_V4SF_INT:
33881 case V16SI_FTYPE_V16SI_V4SI_INT:
33882 case V4DF_FTYPE_V4DF_V2DF_INT:
33883 case V4SF_FTYPE_V4SF_V4SF_INT:
33884 case V2DI_FTYPE_V2DI_V2DI_INT:
33885 case V4DI_FTYPE_V4DI_V2DI_INT:
33886 case V2DF_FTYPE_V2DF_V2DF_INT:
33887 case UQI_FTYPE_V8DI_V8UDI_INT:
33888 case UQI_FTYPE_V8DF_V8DF_INT:
33889 case UQI_FTYPE_V2DF_V2DF_INT:
33890 case UQI_FTYPE_V4SF_V4SF_INT:
33891 case UHI_FTYPE_V16SI_V16SI_INT:
33892 case UHI_FTYPE_V16SF_V16SF_INT:
33893 case V64QI_FTYPE_V64QI_V64QI_INT:
33894 case V32HI_FTYPE_V32HI_V32HI_INT:
33895 case V16SI_FTYPE_V16SI_V16SI_INT:
33896 case V8DI_FTYPE_V8DI_V8DI_INT:
33897 nargs = 3;
33898 nargs_constant = 1;
33899 break;
33900 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
33901 nargs = 3;
33902 rmode = V4DImode;
33903 nargs_constant = 1;
33904 break;
33905 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
33906 nargs = 3;
33907 rmode = V2DImode;
33908 nargs_constant = 1;
33909 break;
33910 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
33911 nargs = 3;
33912 rmode = DImode;
33913 nargs_constant = 1;
33914 break;
33915 case V2DI_FTYPE_V2DI_UINT_UINT:
33916 nargs = 3;
33917 nargs_constant = 2;
33918 break;
33919 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
33920 nargs = 3;
33921 rmode = V8DImode;
33922 nargs_constant = 1;
33923 break;
33924 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
33925 nargs = 5;
33926 rmode = V8DImode;
33927 mask_pos = 2;
33928 nargs_constant = 1;
33929 break;
33930 case QI_FTYPE_V8DF_INT_UQI:
33931 case QI_FTYPE_V4DF_INT_UQI:
33932 case QI_FTYPE_V2DF_INT_UQI:
33933 case HI_FTYPE_V16SF_INT_UHI:
33934 case QI_FTYPE_V8SF_INT_UQI:
33935 case QI_FTYPE_V4SF_INT_UQI:
33936 nargs = 3;
33937 mask_pos = 1;
33938 nargs_constant = 1;
33939 break;
33940 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
33941 nargs = 5;
33942 rmode = V4DImode;
33943 mask_pos = 2;
33944 nargs_constant = 1;
33945 break;
33946 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
33947 nargs = 5;
33948 rmode = V2DImode;
33949 mask_pos = 2;
33950 nargs_constant = 1;
33951 break;
33952 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
33953 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
33954 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
33955 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
33956 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
33957 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
33958 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
33959 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
33960 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
33961 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
33962 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
33963 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
33964 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
33965 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
33966 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
33967 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
33968 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
33969 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
33970 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
33971 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
33972 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
33973 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
33974 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
33975 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
33976 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
33977 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
33978 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
33979 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
33980 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
33981 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
33982 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
33983 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
33984 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
33985 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
33986 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
33987 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
33988 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
33989 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
33990 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
33991 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
33992 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
33993 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
33994 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
33995 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
33996 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
33997 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
33998 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
33999 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34000 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34001 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34002 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34003 nargs = 4;
34004 break;
34005 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34006 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34007 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34008 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34009 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34010 nargs = 4;
34011 nargs_constant = 1;
34012 break;
34013 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34014 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34015 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34016 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34017 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34018 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34019 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34020 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34021 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34022 case USI_FTYPE_V32QI_V32QI_INT_USI:
34023 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34024 case USI_FTYPE_V32HI_V32HI_INT_USI:
34025 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34026 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34027 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
34028 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
34029 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
34030 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
34031 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
34032 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
34033 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
34034 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
34035 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
34036 nargs = 4;
34037 mask_pos = 1;
34038 nargs_constant = 1;
34039 break;
34040 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34041 nargs = 4;
34042 nargs_constant = 2;
34043 break;
34044 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34045 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34046 nargs = 4;
34047 break;
34048 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34049 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34050 mask_pos = 1;
34051 nargs = 4;
34052 nargs_constant = 1;
34053 break;
34054 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34055 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34056 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34057 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34058 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34059 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34060 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34061 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34062 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34063 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34064 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34065 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34066 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34067 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34068 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34069 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34070 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34071 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34072 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34073 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34074 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34075 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34076 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34077 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34078 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34079 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34080 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34081 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34082 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34083 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34084 nargs = 4;
34085 mask_pos = 2;
34086 nargs_constant = 1;
34087 break;
34088 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34089 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34090 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34091 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34092 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34093 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34094 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34095 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34096 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34097 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34098 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34099 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34100 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34101 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34102 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34103 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34104 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34105 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34106 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34107 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34108 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34109 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34110 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34111 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34112 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34113 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34114 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34115 nargs = 5;
34116 mask_pos = 2;
34117 nargs_constant = 1;
34118 break;
34119 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34120 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34121 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34122 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34123 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34124 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34125 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34126 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34127 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34128 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34129 nargs = 5;
34130 mask_pos = 1;
34131 nargs_constant = 1;
34132 break;
34133 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
34134 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
34135 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
34136 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
34137 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
34138 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
34139 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
34140 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
34141 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
34142 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
34143 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
34144 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
34145 nargs = 5;
34146 mask_pos = 1;
34147 nargs_constant = 2;
34148 break;
34150 default:
34151 gcc_unreachable ();
34154 gcc_assert (nargs <= ARRAY_SIZE (args));
34156 if (comparison != UNKNOWN)
34158 gcc_assert (nargs == 2);
34159 return ix86_expand_sse_compare (d, exp, target, swap);
34162 if (rmode == VOIDmode || rmode == tmode)
34164 if (optimize
34165 || target == 0
34166 || GET_MODE (target) != tmode
34167 || !insn_p->operand[0].predicate (target, tmode))
34168 target = gen_reg_rtx (tmode);
34169 else if (memory_operand (target, tmode))
34170 num_memory++;
34171 real_target = target;
34173 else
34175 real_target = gen_reg_rtx (tmode);
34176 target = lowpart_subreg (rmode, real_target, tmode);
34179 for (i = 0; i < nargs; i++)
34181 tree arg = CALL_EXPR_ARG (exp, i);
34182 rtx op = expand_normal (arg);
34183 machine_mode mode = insn_p->operand[i + 1].mode;
34184 bool match = insn_p->operand[i + 1].predicate (op, mode);
34186 if (second_arg_count && i == 1)
34188 /* SIMD shift insns take either an 8-bit immediate or a
34189 register as the count, but the builtin functions take an
34190 int; if the count doesn't match, put it in a register.
34191 The instructions use a 64-bit count, so if op is only
34192 32-bit, zero-extend it, as negative shift counts are
34193 undefined behavior and zero-extension is more
34194 efficient. */
34195 if (!match)
34197 if (SCALAR_INT_MODE_P (GET_MODE (op)))
34198 op = convert_modes (mode, GET_MODE (op), op, 1);
34199 else
34200 op = lowpart_subreg (mode, op, GET_MODE (op));
34201 if (!insn_p->operand[i + 1].predicate (op, mode))
34202 op = copy_to_reg (op);
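/* An argument must be an immediate when, counting from the end of the
   argument list, it falls within the last NARGS_CONSTANT positions (no
   mask tail), or sits exactly MASK_POS + NARGS_CONSTANT positions from
   the end when a mask tail is present.  */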
34205 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
34206 (!mask_pos && (nargs - i) <= nargs_constant))
34208 if (!match)
34209 switch (icode)
34211 case CODE_FOR_avx_vinsertf128v4di:
34212 case CODE_FOR_avx_vextractf128v4di:
34213 error ("the last argument must be a 1-bit immediate");
34214 return const0_rtx;
34216 case CODE_FOR_avx512f_cmpv8di3_mask:
34217 case CODE_FOR_avx512f_cmpv16si3_mask:
34218 case CODE_FOR_avx512f_ucmpv8di3_mask:
34219 case CODE_FOR_avx512f_ucmpv16si3_mask:
34220 case CODE_FOR_avx512vl_cmpv4di3_mask:
34221 case CODE_FOR_avx512vl_cmpv8si3_mask:
34222 case CODE_FOR_avx512vl_ucmpv4di3_mask:
34223 case CODE_FOR_avx512vl_ucmpv8si3_mask:
34224 case CODE_FOR_avx512vl_cmpv2di3_mask:
34225 case CODE_FOR_avx512vl_cmpv4si3_mask:
34226 case CODE_FOR_avx512vl_ucmpv2di3_mask:
34227 case CODE_FOR_avx512vl_ucmpv4si3_mask:
34228 error ("the last argument must be a 3-bit immediate");
34229 return const0_rtx;
34231 case CODE_FOR_sse4_1_roundsd:
34232 case CODE_FOR_sse4_1_roundss:
34234 case CODE_FOR_sse4_1_roundpd:
34235 case CODE_FOR_sse4_1_roundps:
34236 case CODE_FOR_avx_roundpd256:
34237 case CODE_FOR_avx_roundps256:
34239 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
34240 case CODE_FOR_sse4_1_roundps_sfix:
34241 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
34242 case CODE_FOR_avx_roundps_sfix256:
34244 case CODE_FOR_sse4_1_blendps:
34245 case CODE_FOR_avx_blendpd256:
34246 case CODE_FOR_avx_vpermilv4df:
34247 case CODE_FOR_avx_vpermilv4df_mask:
34248 case CODE_FOR_avx512f_getmantv8df_mask:
34249 case CODE_FOR_avx512f_getmantv16sf_mask:
34250 case CODE_FOR_avx512vl_getmantv8sf_mask:
34251 case CODE_FOR_avx512vl_getmantv4df_mask:
34252 case CODE_FOR_avx512vl_getmantv4sf_mask:
34253 case CODE_FOR_avx512vl_getmantv2df_mask:
34254 case CODE_FOR_avx512dq_rangepv8df_mask_round:
34255 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
34256 case CODE_FOR_avx512dq_rangepv4df_mask:
34257 case CODE_FOR_avx512dq_rangepv8sf_mask:
34258 case CODE_FOR_avx512dq_rangepv2df_mask:
34259 case CODE_FOR_avx512dq_rangepv4sf_mask:
34260 case CODE_FOR_avx_shufpd256_mask:
34261 error ("the last argument must be a 4-bit immediate");
34262 return const0_rtx;
34264 case CODE_FOR_sha1rnds4:
34265 case CODE_FOR_sse4_1_blendpd:
34266 case CODE_FOR_avx_vpermilv2df:
34267 case CODE_FOR_avx_vpermilv2df_mask:
34268 case CODE_FOR_xop_vpermil2v2df3:
34269 case CODE_FOR_xop_vpermil2v4sf3:
34270 case CODE_FOR_xop_vpermil2v4df3:
34271 case CODE_FOR_xop_vpermil2v8sf3:
34272 case CODE_FOR_avx512f_vinsertf32x4_mask:
34273 case CODE_FOR_avx512f_vinserti32x4_mask:
34274 case CODE_FOR_avx512f_vextractf32x4_mask:
34275 case CODE_FOR_avx512f_vextracti32x4_mask:
34276 case CODE_FOR_sse2_shufpd:
34277 case CODE_FOR_sse2_shufpd_mask:
34278 case CODE_FOR_avx512dq_shuf_f64x2_mask:
34279 case CODE_FOR_avx512dq_shuf_i64x2_mask:
34280 case CODE_FOR_avx512vl_shuf_i32x4_mask:
34281 case CODE_FOR_avx512vl_shuf_f32x4_mask:
34282 error ("the last argument must be a 2-bit immediate");
34283 return const0_rtx;
34285 case CODE_FOR_avx_vextractf128v4df:
34286 case CODE_FOR_avx_vextractf128v8sf:
34287 case CODE_FOR_avx_vextractf128v8si:
34288 case CODE_FOR_avx_vinsertf128v4df:
34289 case CODE_FOR_avx_vinsertf128v8sf:
34290 case CODE_FOR_avx_vinsertf128v8si:
34291 case CODE_FOR_avx512f_vinsertf64x4_mask:
34292 case CODE_FOR_avx512f_vinserti64x4_mask:
34293 case CODE_FOR_avx512f_vextractf64x4_mask:
34294 case CODE_FOR_avx512f_vextracti64x4_mask:
34295 case CODE_FOR_avx512dq_vinsertf32x8_mask:
34296 case CODE_FOR_avx512dq_vinserti32x8_mask:
34297 case CODE_FOR_avx512vl_vinsertv4df:
34298 case CODE_FOR_avx512vl_vinsertv4di:
34299 case CODE_FOR_avx512vl_vinsertv8sf:
34300 case CODE_FOR_avx512vl_vinsertv8si:
34301 error ("the last argument must be a 1-bit immediate");
34302 return const0_rtx;
34304 case CODE_FOR_avx_vmcmpv2df3:
34305 case CODE_FOR_avx_vmcmpv4sf3:
34306 case CODE_FOR_avx_cmpv2df3:
34307 case CODE_FOR_avx_cmpv4sf3:
34308 case CODE_FOR_avx_cmpv4df3:
34309 case CODE_FOR_avx_cmpv8sf3:
34310 case CODE_FOR_avx512f_cmpv8df3_mask:
34311 case CODE_FOR_avx512f_cmpv16sf3_mask:
34312 case CODE_FOR_avx512f_vmcmpv2df3_mask:
34313 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
34314 error ("the last argument must be a 5-bit immediate");
34315 return const0_rtx;
34317 default:
34318 switch (nargs_constant)
34320 case 2:
34321 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
34322 (!mask_pos && (nargs - i) == nargs_constant))
34324 error ("the next to last argument must be an 8-bit immediate");
34325 break;
34327 /* FALLTHRU */
34328 case 1:
34329 error ("the last argument must be an 8-bit immediate");
34330 break;
34331 default:
34332 gcc_unreachable ();
34334 return const0_rtx;
34337 else
34339 if (VECTOR_MODE_P (mode))
34340 op = safe_vector_operand (op, mode);
34342 /* If we aren't optimizing, only allow one memory operand to
34343 be generated. */
34344 if (memory_operand (op, mode))
34345 num_memory++;
34347 op = fixup_modeless_constant (op, mode);
34349 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34351 if (optimize || !match || num_memory > 1)
34352 op = copy_to_mode_reg (mode, op);
34354 else
34356 op = copy_to_reg (op);
34357 op = lowpart_subreg (mode, op, GET_MODE (op));
34361 args[i].op = op;
34362 args[i].mode = mode;
34365 switch (nargs)
34367 case 1:
34368 pat = GEN_FCN (icode) (real_target, args[0].op);
34369 break;
34370 case 2:
34371 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
34372 break;
34373 case 3:
34374 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34375 args[2].op);
34376 break;
34377 case 4:
34378 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34379 args[2].op, args[3].op);
34380 break;
34381 case 5:
34382 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34383 args[2].op, args[3].op, args[4].op);
34384 break;
34385 case 6:
34386 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34387 args[2].op, args[3].op, args[4].op,
34388 args[5].op);
34389 break;
34390 default:
34391 gcc_unreachable ();
34394 if (! pat)
34395 return 0;
34397 emit_insn (pat);
34398 return target;
34401 /* Transform a pattern of the following layout:
34402 (set A
34403 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
34405 into:
34406 (set A B) */
34408 static rtx
34409 ix86_erase_embedded_rounding (rtx pat)
34411 if (GET_CODE (pat) == INSN)
34412 pat = PATTERN (pat);
34414 gcc_assert (GET_CODE (pat) == SET);
34415 rtx src = SET_SRC (pat);
34416 gcc_assert (XVECLEN (src, 0) == 2);
34417 rtx p0 = XVECEXP (src, 0, 0);
34418 gcc_assert (GET_CODE (src) == UNSPEC
34419 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
34420 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
34421 return res;
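/* For example (schematic RTL, not taken from a real dump): a pattern such as
     (set (reg:V2DF x)
          (unspec:V2DF [(plus:V2DF (reg:V2DF a) (reg:V2DF b))
                        (const_int R)] UNSPEC_EMBEDDED_ROUNDING))
   is rewritten by the function above into the plain
     (set (reg:V2DF x) (plus:V2DF (reg:V2DF a) (reg:V2DF b)))
   i.e. the UNSPEC wrapper and the now-redundant rounding operand are dropped.  */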
34424 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
34425 with rounding. */
34426 static rtx
34427 ix86_expand_sse_comi_round (const struct builtin_description *d,
34428 tree exp, rtx target)
34430 rtx pat, set_dst;
34431 tree arg0 = CALL_EXPR_ARG (exp, 0);
34432 tree arg1 = CALL_EXPR_ARG (exp, 1);
34433 tree arg2 = CALL_EXPR_ARG (exp, 2);
34434 tree arg3 = CALL_EXPR_ARG (exp, 3);
34435 rtx op0 = expand_normal (arg0);
34436 rtx op1 = expand_normal (arg1);
34437 rtx op2 = expand_normal (arg2);
34438 rtx op3 = expand_normal (arg3);
34439 enum insn_code icode = d->icode;
34440 const struct insn_data_d *insn_p = &insn_data[icode];
34441 machine_mode mode0 = insn_p->operand[0].mode;
34442 machine_mode mode1 = insn_p->operand[1].mode;
34443 enum rtx_code comparison = UNEQ;
34444 bool need_ucomi = false;
34446 /* See avxintrin.h for values. */
34447 enum rtx_code comi_comparisons[32] =
34449 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
34450 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
34451 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
34453 bool need_ucomi_values[32] =
34455 true, false, false, true, true, false, false, true,
34456 true, false, false, true, true, false, false, true,
34457 false, true, true, false, false, true, true, false,
34458 false, true, true, false, false, true, true, false
34461 if (!CONST_INT_P (op2))
34463 error ("the third argument must be a comparison constant");
34464 return const0_rtx;
34466 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
34468 error ("incorrect comparison mode");
34469 return const0_rtx;
34472 if (!insn_p->operand[2].predicate (op3, SImode))
34474 error ("incorrect rounding operand");
34475 return const0_rtx;
34478 comparison = comi_comparisons[INTVAL (op2)];
34479 need_ucomi = need_ucomi_values[INTVAL (op2)];
34481 if (VECTOR_MODE_P (mode0))
34482 op0 = safe_vector_operand (op0, mode0);
34483 if (VECTOR_MODE_P (mode1))
34484 op1 = safe_vector_operand (op1, mode1);
34486 target = gen_reg_rtx (SImode);
34487 emit_move_insn (target, const0_rtx);
34488 target = gen_rtx_SUBREG (QImode, target, 0);
34490 if ((optimize && !register_operand (op0, mode0))
34491 || !insn_p->operand[0].predicate (op0, mode0))
34492 op0 = copy_to_mode_reg (mode0, op0);
34493 if ((optimize && !register_operand (op1, mode1))
34494 || !insn_p->operand[1].predicate (op1, mode1))
34495 op1 = copy_to_mode_reg (mode1, op1);
34497 if (need_ucomi)
34498 icode = icode == CODE_FOR_sse_comi_round
34499 ? CODE_FOR_sse_ucomi_round
34500 : CODE_FOR_sse2_ucomi_round;
34502 pat = GEN_FCN (icode) (op0, op1, op3);
34503 if (! pat)
34504 return 0;
34506 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
34507 if (INTVAL (op3) == NO_ROUND)
34509 pat = ix86_erase_embedded_rounding (pat);
34510 if (! pat)
34511 return 0;
34513 set_dst = SET_DEST (pat);
34515 else
34517 gcc_assert (GET_CODE (pat) == SET);
34518 set_dst = SET_DEST (pat);
34521 emit_insn (pat);
34522 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34523 gen_rtx_fmt_ee (comparison, QImode,
34524 set_dst,
34525 const0_rtx)));
34527 return SUBREG_REG (target);
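/* A user-level sketch of what reaches this function (hedged; assumes the
   AVX-512F intrinsic _mm_comi_round_ss and the usual <immintrin.h>
   predicate/rounding macros):

     #include <immintrin.h>

     int ge_no_exc (__m128 a, __m128 b)
     {
       // Third argument selects the comparison (0..31, see avxintrin.h);
       // fourth argument is the SAE/rounding control checked above.
       return _mm_comi_round_ss (a, b, _CMP_GE_OQ, _MM_FROUND_NO_EXC);
     }
*/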
34530 static rtx
34531 ix86_expand_round_builtin (const struct builtin_description *d,
34532 tree exp, rtx target)
34534 rtx pat;
34535 unsigned int i, nargs;
34536 struct
34538 rtx op;
34539 machine_mode mode;
34540 } args[6];
34541 enum insn_code icode = d->icode;
34542 const struct insn_data_d *insn_p = &insn_data[icode];
34543 machine_mode tmode = insn_p->operand[0].mode;
34544 unsigned int nargs_constant = 0;
34545 unsigned int redundant_embed_rnd = 0;
34547 switch ((enum ix86_builtin_func_type) d->flag)
34549 case UINT64_FTYPE_V2DF_INT:
34550 case UINT64_FTYPE_V4SF_INT:
34551 case UINT_FTYPE_V2DF_INT:
34552 case UINT_FTYPE_V4SF_INT:
34553 case INT64_FTYPE_V2DF_INT:
34554 case INT64_FTYPE_V4SF_INT:
34555 case INT_FTYPE_V2DF_INT:
34556 case INT_FTYPE_V4SF_INT:
34557 nargs = 2;
34558 break;
34559 case V4SF_FTYPE_V4SF_UINT_INT:
34560 case V4SF_FTYPE_V4SF_UINT64_INT:
34561 case V2DF_FTYPE_V2DF_UINT64_INT:
34562 case V4SF_FTYPE_V4SF_INT_INT:
34563 case V4SF_FTYPE_V4SF_INT64_INT:
34564 case V2DF_FTYPE_V2DF_INT64_INT:
34565 case V4SF_FTYPE_V4SF_V4SF_INT:
34566 case V2DF_FTYPE_V2DF_V2DF_INT:
34567 case V4SF_FTYPE_V4SF_V2DF_INT:
34568 case V2DF_FTYPE_V2DF_V4SF_INT:
34569 nargs = 3;
34570 break;
34571 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
34572 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
34573 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
34574 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
34575 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
34576 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
34577 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
34578 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
34579 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
34580 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
34581 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
34582 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
34583 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
34584 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
34585 nargs = 4;
34586 break;
34587 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
34588 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
34589 nargs_constant = 2;
34590 nargs = 4;
34591 break;
34592 case INT_FTYPE_V4SF_V4SF_INT_INT:
34593 case INT_FTYPE_V2DF_V2DF_INT_INT:
34594 return ix86_expand_sse_comi_round (d, exp, target);
34595 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
34596 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
34597 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
34598 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
34599 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
34600 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
34601 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
34602 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
34603 nargs = 5;
34604 break;
34605 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
34606 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
34607 nargs_constant = 4;
34608 nargs = 5;
34609 break;
34610 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
34611 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
34612 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
34613 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
34614 nargs_constant = 3;
34615 nargs = 5;
34616 break;
34617 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
34618 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
34619 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
34620 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
34621 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
34622 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
34623 nargs = 6;
34624 nargs_constant = 4;
34625 break;
34626 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
34627 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
34628 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
34629 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
34630 nargs = 6;
34631 nargs_constant = 3;
34632 break;
34633 default:
34634 gcc_unreachable ();
34636 gcc_assert (nargs <= ARRAY_SIZE (args));
34638 if (optimize
34639 || target == 0
34640 || GET_MODE (target) != tmode
34641 || !insn_p->operand[0].predicate (target, tmode))
34642 target = gen_reg_rtx (tmode);
34644 for (i = 0; i < nargs; i++)
34646 tree arg = CALL_EXPR_ARG (exp, i);
34647 rtx op = expand_normal (arg);
34648 machine_mode mode = insn_p->operand[i + 1].mode;
34649 bool match = insn_p->operand[i + 1].predicate (op, mode);
34651 if (i == nargs - nargs_constant)
34653 if (!match)
34655 switch (icode)
34657 case CODE_FOR_avx512f_getmantv8df_mask_round:
34658 case CODE_FOR_avx512f_getmantv16sf_mask_round:
34659 case CODE_FOR_avx512f_vgetmantv2df_round:
34660 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
34661 case CODE_FOR_avx512f_vgetmantv4sf_round:
34662 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
34663 error ("the immediate argument must be a 4-bit immediate");
34664 return const0_rtx;
34665 case CODE_FOR_avx512f_cmpv8df3_mask_round:
34666 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
34667 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
34668 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
34669 error ("the immediate argument must be a 5-bit immediate");
34670 return const0_rtx;
34671 default:
34672 error ("the immediate argument must be an 8-bit immediate");
34673 return const0_rtx;
34677 else if (i == nargs-1)
34679 if (!insn_p->operand[nargs].predicate (op, SImode))
34681 error ("incorrect rounding operand");
34682 return const0_rtx;
34685 /* If there is no rounding, use the normal version of the pattern. */
34686 if (INTVAL (op) == NO_ROUND)
34687 redundant_embed_rnd = 1;
34689 else
34691 if (VECTOR_MODE_P (mode))
34692 op = safe_vector_operand (op, mode);
34694 op = fixup_modeless_constant (op, mode);
34696 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34698 if (optimize || !match)
34699 op = copy_to_mode_reg (mode, op);
34701 else
34703 op = copy_to_reg (op);
34704 op = lowpart_subreg (mode, op, GET_MODE (op));
34708 args[i].op = op;
34709 args[i].mode = mode;
34712 switch (nargs)
34714 case 1:
34715 pat = GEN_FCN (icode) (target, args[0].op);
34716 break;
34717 case 2:
34718 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34719 break;
34720 case 3:
34721 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34722 args[2].op);
34723 break;
34724 case 4:
34725 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34726 args[2].op, args[3].op);
34727 break;
34728 case 5:
34729 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34730 args[2].op, args[3].op, args[4].op);
34731 break;
34732 case 6:
34733 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34734 args[2].op, args[3].op, args[4].op,
34735 args[5].op);
34736 break;
34737 default:
34738 gcc_unreachable ();
34741 if (!pat)
34742 return 0;
34744 if (redundant_embed_rnd)
34745 pat = ix86_erase_embedded_rounding (pat);
34747 emit_insn (pat);
34748 return target;
34751 /* Subroutine of ix86_expand_builtin to take care of special insns
34752 with variable number of operands. */
34754 static rtx
34755 ix86_expand_special_args_builtin (const struct builtin_description *d,
34756 tree exp, rtx target)
34758 tree arg;
34759 rtx pat, op;
34760 unsigned int i, nargs, arg_adjust, memory;
34761 bool aligned_mem = false;
34762 struct
34764 rtx op;
34765 machine_mode mode;
34766 } args[3];
34767 enum insn_code icode = d->icode;
34768 bool last_arg_constant = false;
34769 const struct insn_data_d *insn_p = &insn_data[icode];
34770 machine_mode tmode = insn_p->operand[0].mode;
34771 enum { load, store } klass;
34773 switch ((enum ix86_builtin_func_type) d->flag)
34775 case VOID_FTYPE_VOID:
34776 emit_insn (GEN_FCN (icode) (target));
34777 return 0;
34778 case VOID_FTYPE_UINT64:
34779 case VOID_FTYPE_UNSIGNED:
34780 nargs = 0;
34781 klass = store;
34782 memory = 0;
34783 break;
34785 case INT_FTYPE_VOID:
34786 case USHORT_FTYPE_VOID:
34787 case UINT64_FTYPE_VOID:
34788 case UNSIGNED_FTYPE_VOID:
34789 nargs = 0;
34790 klass = load;
34791 memory = 0;
34792 break;
34793 case UINT64_FTYPE_PUNSIGNED:
34794 case V2DI_FTYPE_PV2DI:
34795 case V4DI_FTYPE_PV4DI:
34796 case V32QI_FTYPE_PCCHAR:
34797 case V16QI_FTYPE_PCCHAR:
34798 case V8SF_FTYPE_PCV4SF:
34799 case V8SF_FTYPE_PCFLOAT:
34800 case V4SF_FTYPE_PCFLOAT:
34801 case V4DF_FTYPE_PCV2DF:
34802 case V4DF_FTYPE_PCDOUBLE:
34803 case V2DF_FTYPE_PCDOUBLE:
34804 case VOID_FTYPE_PVOID:
34805 case V8DI_FTYPE_PV8DI:
34806 nargs = 1;
34807 klass = load;
34808 memory = 0;
34809 switch (icode)
34811 case CODE_FOR_sse4_1_movntdqa:
34812 case CODE_FOR_avx2_movntdqa:
34813 case CODE_FOR_avx512f_movntdqa:
34814 aligned_mem = true;
34815 break;
34816 default:
34817 break;
34819 break;
34820 case VOID_FTYPE_PV2SF_V4SF:
34821 case VOID_FTYPE_PV8DI_V8DI:
34822 case VOID_FTYPE_PV4DI_V4DI:
34823 case VOID_FTYPE_PV2DI_V2DI:
34824 case VOID_FTYPE_PCHAR_V32QI:
34825 case VOID_FTYPE_PCHAR_V16QI:
34826 case VOID_FTYPE_PFLOAT_V16SF:
34827 case VOID_FTYPE_PFLOAT_V8SF:
34828 case VOID_FTYPE_PFLOAT_V4SF:
34829 case VOID_FTYPE_PDOUBLE_V8DF:
34830 case VOID_FTYPE_PDOUBLE_V4DF:
34831 case VOID_FTYPE_PDOUBLE_V2DF:
34832 case VOID_FTYPE_PLONGLONG_LONGLONG:
34833 case VOID_FTYPE_PULONGLONG_ULONGLONG:
34834 case VOID_FTYPE_PINT_INT:
34835 nargs = 1;
34836 klass = store;
34837 /* Reserve memory operand for target. */
34838 memory = ARRAY_SIZE (args);
34839 switch (icode)
34841 /* These builtins and instructions require the memory
34842 to be properly aligned. */
34843 case CODE_FOR_avx_movntv4di:
34844 case CODE_FOR_sse2_movntv2di:
34845 case CODE_FOR_avx_movntv8sf:
34846 case CODE_FOR_sse_movntv4sf:
34847 case CODE_FOR_sse4a_vmmovntv4sf:
34848 case CODE_FOR_avx_movntv4df:
34849 case CODE_FOR_sse2_movntv2df:
34850 case CODE_FOR_sse4a_vmmovntv2df:
34851 case CODE_FOR_sse2_movntidi:
34852 case CODE_FOR_sse_movntq:
34853 case CODE_FOR_sse2_movntisi:
34854 case CODE_FOR_avx512f_movntv16sf:
34855 case CODE_FOR_avx512f_movntv8df:
34856 case CODE_FOR_avx512f_movntv8di:
34857 aligned_mem = true;
34858 break;
34859 default:
34860 break;
34862 break;
34863 case V4SF_FTYPE_V4SF_PCV2SF:
34864 case V2DF_FTYPE_V2DF_PCDOUBLE:
34865 nargs = 2;
34866 klass = load;
34867 memory = 1;
34868 break;
34869 case V8SF_FTYPE_PCV8SF_V8SI:
34870 case V4DF_FTYPE_PCV4DF_V4DI:
34871 case V4SF_FTYPE_PCV4SF_V4SI:
34872 case V2DF_FTYPE_PCV2DF_V2DI:
34873 case V8SI_FTYPE_PCV8SI_V8SI:
34874 case V4DI_FTYPE_PCV4DI_V4DI:
34875 case V4SI_FTYPE_PCV4SI_V4SI:
34876 case V2DI_FTYPE_PCV2DI_V2DI:
34877 case VOID_FTYPE_INT_INT64:
34878 nargs = 2;
34879 klass = load;
34880 memory = 0;
34881 break;
34882 case VOID_FTYPE_PV8DF_V8DF_UQI:
34883 case VOID_FTYPE_PV4DF_V4DF_UQI:
34884 case VOID_FTYPE_PV2DF_V2DF_UQI:
34885 case VOID_FTYPE_PV16SF_V16SF_UHI:
34886 case VOID_FTYPE_PV8SF_V8SF_UQI:
34887 case VOID_FTYPE_PV4SF_V4SF_UQI:
34888 case VOID_FTYPE_PV8DI_V8DI_UQI:
34889 case VOID_FTYPE_PV4DI_V4DI_UQI:
34890 case VOID_FTYPE_PV2DI_V2DI_UQI:
34891 case VOID_FTYPE_PV16SI_V16SI_UHI:
34892 case VOID_FTYPE_PV8SI_V8SI_UQI:
34893 case VOID_FTYPE_PV4SI_V4SI_UQI:
34894 case VOID_FTYPE_PV64QI_V64QI_UDI:
34895 case VOID_FTYPE_PV32HI_V32HI_USI:
34896 case VOID_FTYPE_PV32QI_V32QI_USI:
34897 case VOID_FTYPE_PV16QI_V16QI_UHI:
34898 case VOID_FTYPE_PV16HI_V16HI_UHI:
34899 case VOID_FTYPE_PV8HI_V8HI_UQI:
34900 switch (icode)
34902 /* These builtins and instructions require the memory
34903 to be properly aligned. */
34904 case CODE_FOR_avx512f_storev16sf_mask:
34905 case CODE_FOR_avx512f_storev16si_mask:
34906 case CODE_FOR_avx512f_storev8df_mask:
34907 case CODE_FOR_avx512f_storev8di_mask:
34908 case CODE_FOR_avx512vl_storev8sf_mask:
34909 case CODE_FOR_avx512vl_storev8si_mask:
34910 case CODE_FOR_avx512vl_storev4df_mask:
34911 case CODE_FOR_avx512vl_storev4di_mask:
34912 case CODE_FOR_avx512vl_storev4sf_mask:
34913 case CODE_FOR_avx512vl_storev4si_mask:
34914 case CODE_FOR_avx512vl_storev2df_mask:
34915 case CODE_FOR_avx512vl_storev2di_mask:
34916 aligned_mem = true;
34917 break;
34918 default:
34919 break;
34921 /* FALLTHRU */
34922 case VOID_FTYPE_PV8SF_V8SI_V8SF:
34923 case VOID_FTYPE_PV4DF_V4DI_V4DF:
34924 case VOID_FTYPE_PV4SF_V4SI_V4SF:
34925 case VOID_FTYPE_PV2DF_V2DI_V2DF:
34926 case VOID_FTYPE_PV8SI_V8SI_V8SI:
34927 case VOID_FTYPE_PV4DI_V4DI_V4DI:
34928 case VOID_FTYPE_PV4SI_V4SI_V4SI:
34929 case VOID_FTYPE_PV2DI_V2DI_V2DI:
34930 case VOID_FTYPE_PV8SI_V8DI_UQI:
34931 case VOID_FTYPE_PV8HI_V8DI_UQI:
34932 case VOID_FTYPE_PV16HI_V16SI_UHI:
34933 case VOID_FTYPE_PV16QI_V8DI_UQI:
34934 case VOID_FTYPE_PV16QI_V16SI_UHI:
34935 case VOID_FTYPE_PV4SI_V4DI_UQI:
34936 case VOID_FTYPE_PV4SI_V2DI_UQI:
34937 case VOID_FTYPE_PV8HI_V4DI_UQI:
34938 case VOID_FTYPE_PV8HI_V2DI_UQI:
34939 case VOID_FTYPE_PV8HI_V8SI_UQI:
34940 case VOID_FTYPE_PV8HI_V4SI_UQI:
34941 case VOID_FTYPE_PV16QI_V4DI_UQI:
34942 case VOID_FTYPE_PV16QI_V2DI_UQI:
34943 case VOID_FTYPE_PV16QI_V8SI_UQI:
34944 case VOID_FTYPE_PV16QI_V4SI_UQI:
34945 case VOID_FTYPE_PCHAR_V64QI_UDI:
34946 case VOID_FTYPE_PCHAR_V32QI_USI:
34947 case VOID_FTYPE_PCHAR_V16QI_UHI:
34948 case VOID_FTYPE_PSHORT_V32HI_USI:
34949 case VOID_FTYPE_PSHORT_V16HI_UHI:
34950 case VOID_FTYPE_PSHORT_V8HI_UQI:
34951 case VOID_FTYPE_PINT_V16SI_UHI:
34952 case VOID_FTYPE_PINT_V8SI_UQI:
34953 case VOID_FTYPE_PINT_V4SI_UQI:
34954 case VOID_FTYPE_PINT64_V8DI_UQI:
34955 case VOID_FTYPE_PINT64_V4DI_UQI:
34956 case VOID_FTYPE_PINT64_V2DI_UQI:
34957 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
34958 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
34959 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
34960 case VOID_FTYPE_PFLOAT_V16SF_UHI:
34961 case VOID_FTYPE_PFLOAT_V8SF_UQI:
34962 case VOID_FTYPE_PFLOAT_V4SF_UQI:
34963 case VOID_FTYPE_PV32QI_V32HI_USI:
34964 case VOID_FTYPE_PV16QI_V16HI_UHI:
34965 case VOID_FTYPE_PV8QI_V8HI_UQI:
34966 nargs = 2;
34967 klass = store;
34968 /* Reserve memory operand for target. */
34969 memory = ARRAY_SIZE (args);
34970 break;
34971 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
34972 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
34973 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
34974 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
34975 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
34976 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
34977 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
34978 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
34979 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
34980 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
34981 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
34982 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
34983 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
34984 case V32HI_FTYPE_PCV32HI_V32HI_USI:
34985 case V32QI_FTYPE_PCV32QI_V32QI_USI:
34986 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
34987 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
34988 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
34989 switch (icode)
34991 /* These builtins and instructions require the memory
34992 to be properly aligned. */
34993 case CODE_FOR_avx512f_loadv16sf_mask:
34994 case CODE_FOR_avx512f_loadv16si_mask:
34995 case CODE_FOR_avx512f_loadv8df_mask:
34996 case CODE_FOR_avx512f_loadv8di_mask:
34997 case CODE_FOR_avx512vl_loadv8sf_mask:
34998 case CODE_FOR_avx512vl_loadv8si_mask:
34999 case CODE_FOR_avx512vl_loadv4df_mask:
35000 case CODE_FOR_avx512vl_loadv4di_mask:
35001 case CODE_FOR_avx512vl_loadv4sf_mask:
35002 case CODE_FOR_avx512vl_loadv4si_mask:
35003 case CODE_FOR_avx512vl_loadv2df_mask:
35004 case CODE_FOR_avx512vl_loadv2di_mask:
35005 case CODE_FOR_avx512bw_loadv64qi_mask:
35006 case CODE_FOR_avx512vl_loadv32qi_mask:
35007 case CODE_FOR_avx512vl_loadv16qi_mask:
35008 case CODE_FOR_avx512bw_loadv32hi_mask:
35009 case CODE_FOR_avx512vl_loadv16hi_mask:
35010 case CODE_FOR_avx512vl_loadv8hi_mask:
35011 aligned_mem = true;
35012 break;
35013 default:
35014 break;
35016 /* FALLTHRU */
35017 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35018 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35019 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35020 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35021 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35022 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35023 case V16SI_FTYPE_PCINT_V16SI_UHI:
35024 case V8SI_FTYPE_PCINT_V8SI_UQI:
35025 case V4SI_FTYPE_PCINT_V4SI_UQI:
35026 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35027 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35028 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35029 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35030 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35031 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35032 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35033 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35034 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35035 nargs = 3;
35036 klass = load;
35037 memory = 0;
35038 break;
35039 case VOID_FTYPE_UINT_UINT_UINT:
35040 case VOID_FTYPE_UINT64_UINT_UINT:
35041 case UCHAR_FTYPE_UINT_UINT_UINT:
35042 case UCHAR_FTYPE_UINT64_UINT_UINT:
35043 nargs = 3;
35044 klass = load;
35045 memory = ARRAY_SIZE (args);
35046 last_arg_constant = true;
35047 break;
35048 default:
35049 gcc_unreachable ();
35052 gcc_assert (nargs <= ARRAY_SIZE (args));
35054 if (klass == store)
35056 arg = CALL_EXPR_ARG (exp, 0);
35057 op = expand_normal (arg);
35058 gcc_assert (target == 0);
35059 if (memory)
35061 op = ix86_zero_extend_to_Pmode (op);
35062 target = gen_rtx_MEM (tmode, op);
35063 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35064 on it. Try to improve it using get_pointer_alignment,
35065 and if the special builtin is one that requires strict
35066 mode alignment, also from its GET_MODE_ALIGNMENT.
35067 Failure to do so could lead to ix86_legitimate_combined_insn
35068 rejecting all changes to such insns. */
35069 unsigned int align = get_pointer_alignment (arg);
35070 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35071 align = GET_MODE_ALIGNMENT (tmode);
35072 if (MEM_ALIGN (target) < align)
35073 set_mem_align (target, align);
35075 else
35076 target = force_reg (tmode, op);
35077 arg_adjust = 1;
35079 else
35081 arg_adjust = 0;
35082 if (optimize
35083 || target == 0
35084 || !register_operand (target, tmode)
35085 || GET_MODE (target) != tmode)
35086 target = gen_reg_rtx (tmode);
35089 for (i = 0; i < nargs; i++)
35091 machine_mode mode = insn_p->operand[i + 1].mode;
35092 bool match;
35094 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35095 op = expand_normal (arg);
35096 match = insn_p->operand[i + 1].predicate (op, mode);
35098 if (last_arg_constant && (i + 1) == nargs)
35100 if (!match)
35102 if (icode == CODE_FOR_lwp_lwpvalsi3
35103 || icode == CODE_FOR_lwp_lwpinssi3
35104 || icode == CODE_FOR_lwp_lwpvaldi3
35105 || icode == CODE_FOR_lwp_lwpinsdi3)
35106 error ("the last argument must be a 32-bit immediate");
35107 else
35108 error ("the last argument must be an 8-bit immediate");
35109 return const0_rtx;
35112 else
35114 if (i == memory)
35116 /* This must be the memory operand. */
35117 op = ix86_zero_extend_to_Pmode (op);
35118 op = gen_rtx_MEM (mode, op);
35119 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35120 on it. Try to improve it using get_pointer_alignment,
35121 and if the special builtin is one that requires strict
35122 mode alignment, also from its GET_MODE_ALIGNMENT.
35123 Failure to do so could lead to ix86_legitimate_combined_insn
35124 rejecting all changes to such insns. */
35125 unsigned int align = get_pointer_alignment (arg);
35126 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35127 align = GET_MODE_ALIGNMENT (mode);
35128 if (MEM_ALIGN (op) < align)
35129 set_mem_align (op, align);
35131 else
35133 /* This must be a register. */
35134 if (VECTOR_MODE_P (mode))
35135 op = safe_vector_operand (op, mode);
35137 op = fixup_modeless_constant (op, mode);
35139 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35140 op = copy_to_mode_reg (mode, op);
35141 else
35143 op = copy_to_reg (op);
35144 op = lowpart_subreg (mode, op, GET_MODE (op));
35149 args[i].op = op;
35150 args[i].mode = mode;
35153 switch (nargs)
35155 case 0:
35156 pat = GEN_FCN (icode) (target);
35157 break;
35158 case 1:
35159 pat = GEN_FCN (icode) (target, args[0].op);
35160 break;
35161 case 2:
35162 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35163 break;
35164 case 3:
35165 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35166 break;
35167 default:
35168 gcc_unreachable ();
35171 if (! pat)
35172 return 0;
35173 emit_insn (pat);
35174 return klass == store ? 0 : target;
35177 /* Return the integer constant in ARG. Constrain it to be in the range
35178 of the subparts of VEC_TYPE; issue an error if not. */
35180 static int
35181 get_element_number (tree vec_type, tree arg)
35183 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
35185 if (!tree_fits_uhwi_p (arg)
35186 || (elt = tree_to_uhwi (arg), elt > max))
35188 error ("selector must be an integer constant in the range 0..%wi", max);
35189 return 0;
35192 return elt;
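/* Illustrative only (hypothetical user code): for a V4SF vector the selector
   checked above must be a literal in the range 0..3, e.g.

     typedef float __v4sf __attribute__ ((vector_size (16)));
     float third (__v4sf v) { return __builtin_ia32_vec_ext_v4sf (v, 2); }

   whereas a non-constant or out-of-range selector triggers the error above.  */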
35195 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35196 ix86_expand_vector_init. We DO have language-level syntax for this, in
35197 the form of (type){ init-list }. Except that since we can't place emms
35198 instructions from inside the compiler, we can't allow the use of MMX
35199 registers unless the user explicitly asks for it. So we do *not* define
35200 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
35201 we have builtins invoked by mmintrin.h that give us license to emit
35202 these sorts of instructions. */
35204 static rtx
35205 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
35207 machine_mode tmode = TYPE_MODE (type);
35208 machine_mode inner_mode = GET_MODE_INNER (tmode);
35209 int i, n_elt = GET_MODE_NUNITS (tmode);
35210 rtvec v = rtvec_alloc (n_elt);
35212 gcc_assert (VECTOR_MODE_P (tmode));
35213 gcc_assert (call_expr_nargs (exp) == n_elt);
35215 for (i = 0; i < n_elt; ++i)
35217 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
35218 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
35221 if (!target || !register_operand (target, tmode))
35222 target = gen_reg_rtx (tmode);
35224 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
35225 return target;
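/* Sketch of the user-level path (hedged; based on how mmintrin.h is usually
   written): an MMX set intrinsic such as _mm_set_pi32 expands to the
   corresponding vec_init builtin rather than to generic vector syntax, e.g.

     #include <mmintrin.h>
     __m64 two_ints (int a, int b) { return _mm_set_pi32 (a, b); }

   which reaches this function through __builtin_ia32_vec_init_v2si.  */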
35228 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35229 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
35230 had a language-level syntax for referencing vector elements. */
35232 static rtx
35233 ix86_expand_vec_ext_builtin (tree exp, rtx target)
35235 machine_mode tmode, mode0;
35236 tree arg0, arg1;
35237 int elt;
35238 rtx op0;
35240 arg0 = CALL_EXPR_ARG (exp, 0);
35241 arg1 = CALL_EXPR_ARG (exp, 1);
35243 op0 = expand_normal (arg0);
35244 elt = get_element_number (TREE_TYPE (arg0), arg1);
35246 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35247 mode0 = TYPE_MODE (TREE_TYPE (arg0));
35248 gcc_assert (VECTOR_MODE_P (mode0));
35250 op0 = force_reg (mode0, op0);
35252 if (optimize || !target || !register_operand (target, tmode))
35253 target = gen_reg_rtx (tmode);
35255 ix86_expand_vector_extract (true, target, op0, elt);
35257 return target;
35260 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35261 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
35262 a language-level syntax for referencing vector elements. */
35264 static rtx
35265 ix86_expand_vec_set_builtin (tree exp)
35267 machine_mode tmode, mode1;
35268 tree arg0, arg1, arg2;
35269 int elt;
35270 rtx op0, op1, target;
35272 arg0 = CALL_EXPR_ARG (exp, 0);
35273 arg1 = CALL_EXPR_ARG (exp, 1);
35274 arg2 = CALL_EXPR_ARG (exp, 2);
35276 tmode = TYPE_MODE (TREE_TYPE (arg0));
35277 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35278 gcc_assert (VECTOR_MODE_P (tmode));
35280 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
35281 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
35282 elt = get_element_number (TREE_TYPE (arg0), arg2);
35284 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
35285 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
35287 op0 = force_reg (tmode, op0);
35288 op1 = force_reg (mode1, op1);
35290 /* OP0 is the source of these builtin functions and shouldn't be
35291 modified. Create a copy, use it, and return it as the target. */
35292 target = gen_reg_rtx (tmode);
35293 emit_move_insn (target, op0);
35294 ix86_expand_vector_set (true, target, op1, elt);
35296 return target;
35299 /* Emit conditional move of SRC to DST with condition
35300 OP1 CODE OP2. */
35301 static void
35302 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
35304 rtx t;
35306 if (TARGET_CMOVE)
35308 t = ix86_expand_compare (code, op1, op2);
35309 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
35310 src, dst)));
35312 else
35314 rtx_code_label *nomove = gen_label_rtx ();
35315 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
35316 const0_rtx, GET_MODE (op1), 1, nomove);
35317 emit_move_insn (dst, src);
35318 emit_label (nomove);
35322 /* Choose the max of DST and SRC and put it in DST. */
35323 static void
35324 ix86_emit_move_max (rtx dst, rtx src)
35326 ix86_emit_cmove (dst, src, LTU, dst, src);
35329 /* Expand an expression EXP that calls a built-in function,
35330 with result going to TARGET if that's convenient
35331 (and in mode MODE if that's convenient).
35332 SUBTARGET may be used as the target for computing one of EXP's operands.
35333 IGNORE is nonzero if the value is to be ignored. */
35335 static rtx
35336 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
35337 machine_mode mode, int ignore)
35339 size_t i;
35340 enum insn_code icode, icode2;
35341 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
35342 tree arg0, arg1, arg2, arg3, arg4;
35343 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
35344 machine_mode mode0, mode1, mode2, mode3, mode4;
35345 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
35347 /* For CPU builtins that can be folded, fold first and expand the fold. */
35348 switch (fcode)
35350 case IX86_BUILTIN_CPU_INIT:
35352 /* Make it call __cpu_indicator_init in libgcc. */
35353 tree call_expr, fndecl, type;
35354 type = build_function_type_list (integer_type_node, NULL_TREE);
35355 fndecl = build_fn_decl ("__cpu_indicator_init", type);
35356 call_expr = build_call_expr (fndecl, 0);
35357 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
35359 case IX86_BUILTIN_CPU_IS:
35360 case IX86_BUILTIN_CPU_SUPPORTS:
35362 tree arg0 = CALL_EXPR_ARG (exp, 0);
35363 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
35364 gcc_assert (fold_expr != NULL_TREE);
35365 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
35369 /* Determine whether the builtin function is available under the current ISA.
35370 Originally the builtin was not created if it wasn't applicable to the
35371 current ISA based on the command line switches. With function specific
35372 options, we need to check in the context of the function making the call
35373 whether it is supported. Treat AVX512VL and MMX specially. For other flags,
35374 if isa includes more than one ISA bit, treat those are requiring any
35375 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
35376 ISAs. Likewise for MMX, require both MMX and the non-MMX ISAs.
35377 Similarly for 64BIT, but we shouldn't be building such builtins
35378 at all, -m64 is a whole TU option. */
35379 if (((ix86_builtins_isa[fcode].isa
35380 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35381 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_GFNI))
35382 && !(ix86_builtins_isa[fcode].isa
35383 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35384 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_GFNI)
35385 & ix86_isa_flags))
35386 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
35387 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
35388 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_GFNI)
35389 && !(ix86_isa_flags & OPTION_MASK_ISA_GFNI))
35390 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_MMX)
35391 && !(ix86_isa_flags & OPTION_MASK_ISA_MMX))
35392 || (ix86_builtins_isa[fcode].isa2
35393 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
35395 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
35396 ix86_builtins_isa[fcode].isa2, 0, 0,
35397 NULL, NULL, (enum fpmath_unit) 0,
35398 false);
35399 if (!opts)
35400 error ("%qE needs unknown isa option", fndecl);
35401 else
35403 gcc_assert (opts != NULL);
35404 error ("%qE needs isa option %s", fndecl, opts);
35405 free (opts);
35407 return expand_call (exp, target, ignore);
35410 switch (fcode)
35412 case IX86_BUILTIN_BNDMK:
35413 if (!target
35414 || GET_MODE (target) != BNDmode
35415 || !register_operand (target, BNDmode))
35416 target = gen_reg_rtx (BNDmode);
35418 arg0 = CALL_EXPR_ARG (exp, 0);
35419 arg1 = CALL_EXPR_ARG (exp, 1);
35421 op0 = expand_normal (arg0);
35422 op1 = expand_normal (arg1);
35424 if (!register_operand (op0, Pmode))
35425 op0 = ix86_zero_extend_to_Pmode (op0);
35426 if (!register_operand (op1, Pmode))
35427 op1 = ix86_zero_extend_to_Pmode (op1);
35429 /* Builtin arg1 is the size of the block, but instruction op1 should
35430 be (size - 1). */
35431 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
35432 NULL_RTX, 1, OPTAB_DIRECT);
35434 emit_insn (BNDmode == BND64mode
35435 ? gen_bnd64_mk (target, op0, op1)
35436 : gen_bnd32_mk (target, op0, op1));
35437 return target;
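/* Hedged user-level sketch (MPX, built with -mmpx -fcheck-pointer-bounds):
   the size adjustment above means that for

     void *q = __bnd_set_ptr_bounds (p, 16);

   the bndmk instruction is handed 15, i.e. bounds [p, p + 16 - 1].  */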
35439 case IX86_BUILTIN_BNDSTX:
35440 arg0 = CALL_EXPR_ARG (exp, 0);
35441 arg1 = CALL_EXPR_ARG (exp, 1);
35442 arg2 = CALL_EXPR_ARG (exp, 2);
35444 op0 = expand_normal (arg0);
35445 op1 = expand_normal (arg1);
35446 op2 = expand_normal (arg2);
35448 if (!register_operand (op0, Pmode))
35449 op0 = ix86_zero_extend_to_Pmode (op0);
35450 if (!register_operand (op1, BNDmode))
35451 op1 = copy_to_mode_reg (BNDmode, op1);
35452 if (!register_operand (op2, Pmode))
35453 op2 = ix86_zero_extend_to_Pmode (op2);
35455 emit_insn (BNDmode == BND64mode
35456 ? gen_bnd64_stx (op2, op0, op1)
35457 : gen_bnd32_stx (op2, op0, op1));
35458 return 0;
35460 case IX86_BUILTIN_BNDLDX:
35461 if (!target
35462 || GET_MODE (target) != BNDmode
35463 || !register_operand (target, BNDmode))
35464 target = gen_reg_rtx (BNDmode);
35466 arg0 = CALL_EXPR_ARG (exp, 0);
35467 arg1 = CALL_EXPR_ARG (exp, 1);
35469 op0 = expand_normal (arg0);
35470 op1 = expand_normal (arg1);
35472 if (!register_operand (op0, Pmode))
35473 op0 = ix86_zero_extend_to_Pmode (op0);
35474 if (!register_operand (op1, Pmode))
35475 op1 = ix86_zero_extend_to_Pmode (op1);
35477 emit_insn (BNDmode == BND64mode
35478 ? gen_bnd64_ldx (target, op0, op1)
35479 : gen_bnd32_ldx (target, op0, op1));
35480 return target;
35482 case IX86_BUILTIN_BNDCL:
35483 arg0 = CALL_EXPR_ARG (exp, 0);
35484 arg1 = CALL_EXPR_ARG (exp, 1);
35486 op0 = expand_normal (arg0);
35487 op1 = expand_normal (arg1);
35489 if (!register_operand (op0, Pmode))
35490 op0 = ix86_zero_extend_to_Pmode (op0);
35491 if (!register_operand (op1, BNDmode))
35492 op1 = copy_to_mode_reg (BNDmode, op1);
35494 emit_insn (BNDmode == BND64mode
35495 ? gen_bnd64_cl (op1, op0)
35496 : gen_bnd32_cl (op1, op0));
35497 return 0;
35499 case IX86_BUILTIN_BNDCU:
35500 arg0 = CALL_EXPR_ARG (exp, 0);
35501 arg1 = CALL_EXPR_ARG (exp, 1);
35503 op0 = expand_normal (arg0);
35504 op1 = expand_normal (arg1);
35506 if (!register_operand (op0, Pmode))
35507 op0 = ix86_zero_extend_to_Pmode (op0);
35508 if (!register_operand (op1, BNDmode))
35509 op1 = copy_to_mode_reg (BNDmode, op1);
35511 emit_insn (BNDmode == BND64mode
35512 ? gen_bnd64_cu (op1, op0)
35513 : gen_bnd32_cu (op1, op0));
35514 return 0;
35516 case IX86_BUILTIN_BNDRET:
35517 arg0 = CALL_EXPR_ARG (exp, 0);
35518 target = chkp_get_rtl_bounds (arg0);
35520 /* If no bounds were specified for the returned value,
35521 then use INIT bounds. This usually happens when
35522 some built-in function is expanded. */
35523 if (!target)
35525 rtx t1 = gen_reg_rtx (Pmode);
35526 rtx t2 = gen_reg_rtx (Pmode);
35527 target = gen_reg_rtx (BNDmode);
35528 emit_move_insn (t1, const0_rtx);
35529 emit_move_insn (t2, constm1_rtx);
35530 emit_insn (BNDmode == BND64mode
35531 ? gen_bnd64_mk (target, t1, t2)
35532 : gen_bnd32_mk (target, t1, t2));
35535 gcc_assert (target && REG_P (target));
35536 return target;
35538 case IX86_BUILTIN_BNDNARROW:
35540 rtx m1, m1h1, m1h2, lb, ub, t1;
35542 /* Return value and lb. */
35543 arg0 = CALL_EXPR_ARG (exp, 0);
35544 /* Bounds. */
35545 arg1 = CALL_EXPR_ARG (exp, 1);
35546 /* Size. */
35547 arg2 = CALL_EXPR_ARG (exp, 2);
35549 lb = expand_normal (arg0);
35550 op1 = expand_normal (arg1);
35551 op2 = expand_normal (arg2);
35553 /* Size was passed but we need to use (size - 1) as for bndmk. */
35554 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
35555 NULL_RTX, 1, OPTAB_DIRECT);
35557 /* Add LB to size and invert to get UB. */
35558 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
35559 op2, 1, OPTAB_DIRECT);
35560 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
35562 if (!register_operand (lb, Pmode))
35563 lb = ix86_zero_extend_to_Pmode (lb);
35564 if (!register_operand (ub, Pmode))
35565 ub = ix86_zero_extend_to_Pmode (ub);
35567 /* We need to move bounds to memory before any computations. */
35568 if (MEM_P (op1))
35569 m1 = op1;
35570 else
35572 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
35573 emit_move_insn (m1, op1);
35576 /* Generate mem expression to be used for access to LB and UB. */
35577 m1h1 = adjust_address (m1, Pmode, 0);
35578 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
35580 t1 = gen_reg_rtx (Pmode);
35582 /* Compute LB. */
35583 emit_move_insn (t1, m1h1);
35584 ix86_emit_move_max (t1, lb);
35585 emit_move_insn (m1h1, t1);
35587 /* Compute UB. UB is stored in 1's complement form. Therefore
35588 we also use max here. */
35589 emit_move_insn (t1, m1h2);
35590 ix86_emit_move_max (t1, ub);
35591 emit_move_insn (m1h2, t1);
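/* Worked example of why "max" is right for the 1's complement UB: if the
   real upper bounds are 0x100 and 0x80, the stored forms are ~0x100 and
   ~0x80; as unsigned values ~0x80 > ~0x100, so taking the max of the stored
   forms selects ~0x80, i.e. the smaller (tighter) real upper bound.  */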
35593 op2 = gen_reg_rtx (BNDmode);
35594 emit_move_insn (op2, m1);
35596 return chkp_join_splitted_slot (lb, op2);
35599 case IX86_BUILTIN_BNDINT:
35601 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
35603 if (!target
35604 || GET_MODE (target) != BNDmode
35605 || !register_operand (target, BNDmode))
35606 target = gen_reg_rtx (BNDmode);
35608 arg0 = CALL_EXPR_ARG (exp, 0);
35609 arg1 = CALL_EXPR_ARG (exp, 1);
35611 op0 = expand_normal (arg0);
35612 op1 = expand_normal (arg1);
35614 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
35615 rh1 = adjust_address (res, Pmode, 0);
35616 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
35618 /* Put the first bounds into temporaries. */
35619 lb1 = gen_reg_rtx (Pmode);
35620 ub1 = gen_reg_rtx (Pmode);
35621 if (MEM_P (op0))
35623 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
35624 emit_move_insn (ub1, adjust_address (op0, Pmode,
35625 GET_MODE_SIZE (Pmode)));
35627 else
35629 emit_move_insn (res, op0);
35630 emit_move_insn (lb1, rh1);
35631 emit_move_insn (ub1, rh2);
35634 /* Put the second bounds into temporaries. */
35635 lb2 = gen_reg_rtx (Pmode);
35636 ub2 = gen_reg_rtx (Pmode);
35637 if (MEM_P (op1))
35639 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
35640 emit_move_insn (ub2, adjust_address (op1, Pmode,
35641 GET_MODE_SIZE (Pmode)));
35643 else
35645 emit_move_insn (res, op1);
35646 emit_move_insn (lb2, rh1);
35647 emit_move_insn (ub2, rh2);
35650 /* Compute LB. */
35651 ix86_emit_move_max (lb1, lb2);
35652 emit_move_insn (rh1, lb1);
35654 /* Compute UB. UB is stored in 1's complement form. Therefore
35655 we also use max here. */
35656 ix86_emit_move_max (ub1, ub2);
35657 emit_move_insn (rh2, ub1);
35659 emit_move_insn (target, res);
35661 return target;
35664 case IX86_BUILTIN_SIZEOF:
35666 tree name;
35667 rtx symbol;
35669 if (!target
35670 || GET_MODE (target) != Pmode
35671 || !register_operand (target, Pmode))
35672 target = gen_reg_rtx (Pmode);
35674 arg0 = CALL_EXPR_ARG (exp, 0);
35675 gcc_assert (VAR_P (arg0));
35677 name = DECL_ASSEMBLER_NAME (arg0);
35678 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
35680 emit_insn (Pmode == SImode
35681 ? gen_move_size_reloc_si (target, symbol)
35682 : gen_move_size_reloc_di (target, symbol));
35684 return target;
35687 case IX86_BUILTIN_BNDLOWER:
35689 rtx mem, hmem;
35691 if (!target
35692 || GET_MODE (target) != Pmode
35693 || !register_operand (target, Pmode))
35694 target = gen_reg_rtx (Pmode);
35696 arg0 = CALL_EXPR_ARG (exp, 0);
35697 op0 = expand_normal (arg0);
35699 /* We need to move bounds to memory first. */
35700 if (MEM_P (op0))
35701 mem = op0;
35702 else
35704 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35705 emit_move_insn (mem, op0);
35708 /* Generate mem expression to access LB and load it. */
35709 hmem = adjust_address (mem, Pmode, 0);
35710 emit_move_insn (target, hmem);
35712 return target;
35715 case IX86_BUILTIN_BNDUPPER:
35717 rtx mem, hmem, res;
35719 if (!target
35720 || GET_MODE (target) != Pmode
35721 || !register_operand (target, Pmode))
35722 target = gen_reg_rtx (Pmode);
35724 arg0 = CALL_EXPR_ARG (exp, 0);
35725 op0 = expand_normal (arg0);
35727 /* We need to move bounds to memory first. */
35728 if (MEM_P (op0))
35729 mem = op0;
35730 else
35732 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35733 emit_move_insn (mem, op0);
35736 /* Generate mem expression to access UB. */
35737 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
35739 /* We need to invert all bits of UB. */
35740 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
35742 if (res != target)
35743 emit_move_insn (target, res);
35745 return target;
35748 case IX86_BUILTIN_MASKMOVQ:
35749 case IX86_BUILTIN_MASKMOVDQU:
35750 icode = (fcode == IX86_BUILTIN_MASKMOVQ
35751 ? CODE_FOR_mmx_maskmovq
35752 : CODE_FOR_sse2_maskmovdqu);
35753 /* Note the arg order is different from the operand order. */
35754 arg1 = CALL_EXPR_ARG (exp, 0);
35755 arg2 = CALL_EXPR_ARG (exp, 1);
35756 arg0 = CALL_EXPR_ARG (exp, 2);
35757 op0 = expand_normal (arg0);
35758 op1 = expand_normal (arg1);
35759 op2 = expand_normal (arg2);
35760 mode0 = insn_data[icode].operand[0].mode;
35761 mode1 = insn_data[icode].operand[1].mode;
35762 mode2 = insn_data[icode].operand[2].mode;
35764 op0 = ix86_zero_extend_to_Pmode (op0);
35765 op0 = gen_rtx_MEM (mode1, op0);
35767 if (!insn_data[icode].operand[0].predicate (op0, mode0))
35768 op0 = copy_to_mode_reg (mode0, op0);
35769 if (!insn_data[icode].operand[1].predicate (op1, mode1))
35770 op1 = copy_to_mode_reg (mode1, op1);
35771 if (!insn_data[icode].operand[2].predicate (op2, mode2))
35772 op2 = copy_to_mode_reg (mode2, op2);
35773 pat = GEN_FCN (icode) (op0, op1, op2);
35774 if (! pat)
35775 return 0;
35776 emit_insn (pat);
35777 return 0;
35779 case IX86_BUILTIN_LDMXCSR:
35780 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
35781 target = assign_386_stack_local (SImode, SLOT_TEMP);
35782 emit_move_insn (target, op0);
35783 emit_insn (gen_sse_ldmxcsr (target));
35784 return 0;
35786 case IX86_BUILTIN_STMXCSR:
35787 target = assign_386_stack_local (SImode, SLOT_TEMP);
35788 emit_insn (gen_sse_stmxcsr (target));
35789 return copy_to_mode_reg (SImode, target);
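/* User-level sketch (hedged; the standard <xmmintrin.h> wrappers around
   these two builtins):

     #include <xmmintrin.h>

     void set_ftz (void)
     {
       unsigned int csr = _mm_getcsr ();   // __builtin_ia32_stmxcsr
       _mm_setcsr (csr | 0x8000);          // __builtin_ia32_ldmxcsr, set the FTZ bit
     }
*/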
35791 case IX86_BUILTIN_CLFLUSH:
35792 arg0 = CALL_EXPR_ARG (exp, 0);
35793 op0 = expand_normal (arg0);
35794 icode = CODE_FOR_sse2_clflush;
35795 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35796 op0 = ix86_zero_extend_to_Pmode (op0);
35798 emit_insn (gen_sse2_clflush (op0));
35799 return 0;
35801 case IX86_BUILTIN_CLWB:
35802 arg0 = CALL_EXPR_ARG (exp, 0);
35803 op0 = expand_normal (arg0);
35804 icode = CODE_FOR_clwb;
35805 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35806 op0 = ix86_zero_extend_to_Pmode (op0);
35808 emit_insn (gen_clwb (op0));
35809 return 0;
35811 case IX86_BUILTIN_CLFLUSHOPT:
35812 arg0 = CALL_EXPR_ARG (exp, 0);
35813 op0 = expand_normal (arg0);
35814 icode = CODE_FOR_clflushopt;
35815 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35816 op0 = ix86_zero_extend_to_Pmode (op0);
35818 emit_insn (gen_clflushopt (op0));
35819 return 0;
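/* User-level sketch (hedged): the three cache-line builtins above back the
   usual intrinsics, e.g.

     #include <immintrin.h>

     void flush_line (void const *p)
     {
       _mm_clflush (p);        // IX86_BUILTIN_CLFLUSH
       // _mm_clflushopt (p);  // IX86_BUILTIN_CLFLUSHOPT, needs -mclflushopt
       // _mm_clwb (p);        // IX86_BUILTIN_CLWB, needs -mclwb
     }
*/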
35821 case IX86_BUILTIN_MONITOR:
35822 case IX86_BUILTIN_MONITORX:
35823 arg0 = CALL_EXPR_ARG (exp, 0);
35824 arg1 = CALL_EXPR_ARG (exp, 1);
35825 arg2 = CALL_EXPR_ARG (exp, 2);
35826 op0 = expand_normal (arg0);
35827 op1 = expand_normal (arg1);
35828 op2 = expand_normal (arg2);
35829 if (!REG_P (op0))
35830 op0 = ix86_zero_extend_to_Pmode (op0);
35831 if (!REG_P (op1))
35832 op1 = copy_to_mode_reg (SImode, op1);
35833 if (!REG_P (op2))
35834 op2 = copy_to_mode_reg (SImode, op2);
35836 emit_insn (fcode == IX86_BUILTIN_MONITOR
35837 ? ix86_gen_monitor (op0, op1, op2)
35838 : ix86_gen_monitorx (op0, op1, op2));
35839 return 0;
35841 case IX86_BUILTIN_MWAIT:
35842 arg0 = CALL_EXPR_ARG (exp, 0);
35843 arg1 = CALL_EXPR_ARG (exp, 1);
35844 op0 = expand_normal (arg0);
35845 op1 = expand_normal (arg1);
35846 if (!REG_P (op0))
35847 op0 = copy_to_mode_reg (SImode, op0);
35848 if (!REG_P (op1))
35849 op1 = copy_to_mode_reg (SImode, op1);
35850 emit_insn (gen_sse3_mwait (op0, op1));
35851 return 0;
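/* User-level sketch (hedged; the <pmmintrin.h> SSE3 wrappers):

     #include <pmmintrin.h>

     void wait_on (void const *p)
     {
       _mm_monitor (p, 0, 0);   // IX86_BUILTIN_MONITOR: address, extensions, hints
       _mm_mwait (0, 0);        // IX86_BUILTIN_MWAIT: extensions, hints
     }
*/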
35853 case IX86_BUILTIN_MWAITX:
35854 arg0 = CALL_EXPR_ARG (exp, 0);
35855 arg1 = CALL_EXPR_ARG (exp, 1);
35856 arg2 = CALL_EXPR_ARG (exp, 2);
35857 op0 = expand_normal (arg0);
35858 op1 = expand_normal (arg1);
35859 op2 = expand_normal (arg2);
35860 if (!REG_P (op0))
35861 op0 = copy_to_mode_reg (SImode, op0);
35862 if (!REG_P (op1))
35863 op1 = copy_to_mode_reg (SImode, op1);
35864 if (!REG_P (op2))
35865 op2 = copy_to_mode_reg (SImode, op2);
35866 emit_insn (gen_mwaitx (op0, op1, op2));
35867 return 0;
35869 case IX86_BUILTIN_CLZERO:
35870 arg0 = CALL_EXPR_ARG (exp, 0);
35871 op0 = expand_normal (arg0);
35872 if (!REG_P (op0))
35873 op0 = ix86_zero_extend_to_Pmode (op0);
35874 emit_insn (ix86_gen_clzero (op0));
35875 return 0;
35877 case IX86_BUILTIN_VEC_INIT_V2SI:
35878 case IX86_BUILTIN_VEC_INIT_V4HI:
35879 case IX86_BUILTIN_VEC_INIT_V8QI:
35880 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
35882 case IX86_BUILTIN_VEC_EXT_V2DF:
35883 case IX86_BUILTIN_VEC_EXT_V2DI:
35884 case IX86_BUILTIN_VEC_EXT_V4SF:
35885 case IX86_BUILTIN_VEC_EXT_V4SI:
35886 case IX86_BUILTIN_VEC_EXT_V8HI:
35887 case IX86_BUILTIN_VEC_EXT_V2SI:
35888 case IX86_BUILTIN_VEC_EXT_V4HI:
35889 case IX86_BUILTIN_VEC_EXT_V16QI:
35890 return ix86_expand_vec_ext_builtin (exp, target);
35892 case IX86_BUILTIN_VEC_SET_V2DI:
35893 case IX86_BUILTIN_VEC_SET_V4SF:
35894 case IX86_BUILTIN_VEC_SET_V4SI:
35895 case IX86_BUILTIN_VEC_SET_V8HI:
35896 case IX86_BUILTIN_VEC_SET_V4HI:
35897 case IX86_BUILTIN_VEC_SET_V16QI:
35898 return ix86_expand_vec_set_builtin (exp);
35900 case IX86_BUILTIN_NANQ:
35901 case IX86_BUILTIN_NANSQ:
35902 return expand_call (exp, target, ignore);
35904 case IX86_BUILTIN_RDPMC:
35905 case IX86_BUILTIN_RDTSC:
35906 case IX86_BUILTIN_RDTSCP:
35907 case IX86_BUILTIN_XGETBV:
35909 op0 = gen_reg_rtx (DImode);
35910 op1 = gen_reg_rtx (DImode);
35912 if (fcode == IX86_BUILTIN_RDPMC)
35914 arg0 = CALL_EXPR_ARG (exp, 0);
35915 op2 = expand_normal (arg0);
35916 if (!register_operand (op2, SImode))
35917 op2 = copy_to_mode_reg (SImode, op2);
35919 insn = (TARGET_64BIT
35920 ? gen_rdpmc_rex64 (op0, op1, op2)
35921 : gen_rdpmc (op0, op2));
35922 emit_insn (insn);
35924 else if (fcode == IX86_BUILTIN_XGETBV)
35926 arg0 = CALL_EXPR_ARG (exp, 0);
35927 op2 = expand_normal (arg0);
35928 if (!register_operand (op2, SImode))
35929 op2 = copy_to_mode_reg (SImode, op2);
35931 insn = (TARGET_64BIT
35932 ? gen_xgetbv_rex64 (op0, op1, op2)
35933 : gen_xgetbv (op0, op2));
35934 emit_insn (insn);
35936 else if (fcode == IX86_BUILTIN_RDTSC)
35938 insn = (TARGET_64BIT
35939 ? gen_rdtsc_rex64 (op0, op1)
35940 : gen_rdtsc (op0));
35941 emit_insn (insn);
35943 else
35945 op2 = gen_reg_rtx (SImode);
35947 insn = (TARGET_64BIT
35948 ? gen_rdtscp_rex64 (op0, op1, op2)
35949 : gen_rdtscp (op0, op2));
35950 emit_insn (insn);
35952 arg0 = CALL_EXPR_ARG (exp, 0);
35953 op4 = expand_normal (arg0);
35954 if (!address_operand (op4, VOIDmode))
35956 op4 = convert_memory_address (Pmode, op4);
35957 op4 = copy_addr_to_reg (op4);
35959 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
35962 if (target == 0)
35964 /* mode is VOIDmode if __builtin_rd* has been called
35965 without lhs. */
35966 if (mode == VOIDmode)
35967 return target;
35968 target = gen_reg_rtx (mode);
35971 if (TARGET_64BIT)
35973 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
35974 op1, 1, OPTAB_DIRECT);
35975 op0 = expand_simple_binop (DImode, IOR, op0, op1,
35976 op0, 1, OPTAB_DIRECT);
35979 emit_move_insn (target, op0);
35980 return target;
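/* User-level sketch (hedged; <x86intrin.h> wrappers for two of the cases
   handled above):

     #include <x86intrin.h>

     unsigned long long stamp (unsigned int *aux)
     {
       unsigned long long t1 = __rdtsc ();       // IX86_BUILTIN_RDTSC
       unsigned long long t2 = __rdtscp (aux);   // IX86_BUILTIN_RDTSCP; *aux gets IA32_TSC_AUX
       return t2 - t1;
     }
*/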
35982 case IX86_BUILTIN_FXSAVE:
35983 case IX86_BUILTIN_FXRSTOR:
35984 case IX86_BUILTIN_FXSAVE64:
35985 case IX86_BUILTIN_FXRSTOR64:
35986 case IX86_BUILTIN_FNSTENV:
35987 case IX86_BUILTIN_FLDENV:
35988 mode0 = BLKmode;
35989 switch (fcode)
35991 case IX86_BUILTIN_FXSAVE:
35992 icode = CODE_FOR_fxsave;
35993 break;
35994 case IX86_BUILTIN_FXRSTOR:
35995 icode = CODE_FOR_fxrstor;
35996 break;
35997 case IX86_BUILTIN_FXSAVE64:
35998 icode = CODE_FOR_fxsave64;
35999 break;
36000 case IX86_BUILTIN_FXRSTOR64:
36001 icode = CODE_FOR_fxrstor64;
36002 break;
36003 case IX86_BUILTIN_FNSTENV:
36004 icode = CODE_FOR_fnstenv;
36005 break;
36006 case IX86_BUILTIN_FLDENV:
36007 icode = CODE_FOR_fldenv;
36008 break;
36009 default:
36010 gcc_unreachable ();
36013 arg0 = CALL_EXPR_ARG (exp, 0);
36014 op0 = expand_normal (arg0);
36016 if (!address_operand (op0, VOIDmode))
36018 op0 = convert_memory_address (Pmode, op0);
36019 op0 = copy_addr_to_reg (op0);
36021 op0 = gen_rtx_MEM (mode0, op0);
36023 pat = GEN_FCN (icode) (op0);
36024 if (pat)
36025 emit_insn (pat);
36026 return 0;
36028 case IX86_BUILTIN_XSETBV:
36029 arg0 = CALL_EXPR_ARG (exp, 0);
36030 arg1 = CALL_EXPR_ARG (exp, 1);
36031 op0 = expand_normal (arg0);
36032 op1 = expand_normal (arg1);
36034 if (!REG_P (op0))
36035 op0 = copy_to_mode_reg (SImode, op0);
36037 if (TARGET_64BIT)
36039 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36040 NULL, 1, OPTAB_DIRECT);
36042 op2 = gen_lowpart (SImode, op2);
36043 op1 = gen_lowpart (SImode, op1);
36044 if (!REG_P (op1))
36045 op1 = copy_to_mode_reg (SImode, op1);
36046 if (!REG_P (op2))
36047 op2 = copy_to_mode_reg (SImode, op2);
36048 icode = CODE_FOR_xsetbv_rex64;
36049 pat = GEN_FCN (icode) (op0, op1, op2);
36051 else
36053 if (!REG_P (op1))
36054 op1 = copy_to_mode_reg (DImode, op1);
36055 icode = CODE_FOR_xsetbv;
36056 pat = GEN_FCN (icode) (op0, op1);
36058 if (pat)
36059 emit_insn (pat);
36060 return 0;
36062 case IX86_BUILTIN_XSAVE:
36063 case IX86_BUILTIN_XRSTOR:
36064 case IX86_BUILTIN_XSAVE64:
36065 case IX86_BUILTIN_XRSTOR64:
36066 case IX86_BUILTIN_XSAVEOPT:
36067 case IX86_BUILTIN_XSAVEOPT64:
36068 case IX86_BUILTIN_XSAVES:
36069 case IX86_BUILTIN_XRSTORS:
36070 case IX86_BUILTIN_XSAVES64:
36071 case IX86_BUILTIN_XRSTORS64:
36072 case IX86_BUILTIN_XSAVEC:
36073 case IX86_BUILTIN_XSAVEC64:
36074 arg0 = CALL_EXPR_ARG (exp, 0);
36075 arg1 = CALL_EXPR_ARG (exp, 1);
36076 op0 = expand_normal (arg0);
36077 op1 = expand_normal (arg1);
36079 if (!address_operand (op0, VOIDmode))
36081 op0 = convert_memory_address (Pmode, op0);
36082 op0 = copy_addr_to_reg (op0);
36084 op0 = gen_rtx_MEM (BLKmode, op0);
36086 op1 = force_reg (DImode, op1);
36088 if (TARGET_64BIT)
36090 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36091 NULL, 1, OPTAB_DIRECT);
36092 switch (fcode)
36094 case IX86_BUILTIN_XSAVE:
36095 icode = CODE_FOR_xsave_rex64;
36096 break;
36097 case IX86_BUILTIN_XRSTOR:
36098 icode = CODE_FOR_xrstor_rex64;
36099 break;
36100 case IX86_BUILTIN_XSAVE64:
36101 icode = CODE_FOR_xsave64;
36102 break;
36103 case IX86_BUILTIN_XRSTOR64:
36104 icode = CODE_FOR_xrstor64;
36105 break;
36106 case IX86_BUILTIN_XSAVEOPT:
36107 icode = CODE_FOR_xsaveopt_rex64;
36108 break;
36109 case IX86_BUILTIN_XSAVEOPT64:
36110 icode = CODE_FOR_xsaveopt64;
36111 break;
36112 case IX86_BUILTIN_XSAVES:
36113 icode = CODE_FOR_xsaves_rex64;
36114 break;
36115 case IX86_BUILTIN_XRSTORS:
36116 icode = CODE_FOR_xrstors_rex64;
36117 break;
36118 case IX86_BUILTIN_XSAVES64:
36119 icode = CODE_FOR_xsaves64;
36120 break;
36121 case IX86_BUILTIN_XRSTORS64:
36122 icode = CODE_FOR_xrstors64;
36123 break;
36124 case IX86_BUILTIN_XSAVEC:
36125 icode = CODE_FOR_xsavec_rex64;
36126 break;
36127 case IX86_BUILTIN_XSAVEC64:
36128 icode = CODE_FOR_xsavec64;
36129 break;
36130 default:
36131 gcc_unreachable ();
36134 op2 = gen_lowpart (SImode, op2);
36135 op1 = gen_lowpart (SImode, op1);
36136 pat = GEN_FCN (icode) (op0, op1, op2);
36138 else
36140 switch (fcode)
36142 case IX86_BUILTIN_XSAVE:
36143 icode = CODE_FOR_xsave;
36144 break;
36145 case IX86_BUILTIN_XRSTOR:
36146 icode = CODE_FOR_xrstor;
36147 break;
36148 case IX86_BUILTIN_XSAVEOPT:
36149 icode = CODE_FOR_xsaveopt;
36150 break;
36151 case IX86_BUILTIN_XSAVES:
36152 icode = CODE_FOR_xsaves;
36153 break;
36154 case IX86_BUILTIN_XRSTORS:
36155 icode = CODE_FOR_xrstors;
36156 break;
36157 case IX86_BUILTIN_XSAVEC:
36158 icode = CODE_FOR_xsavec;
36159 break;
36160 default:
36161 gcc_unreachable ();
36163 pat = GEN_FCN (icode) (op0, op1);
36166 if (pat)
36167 emit_insn (pat);
36168 return 0;
36170 case IX86_BUILTIN_LLWPCB:
36171 arg0 = CALL_EXPR_ARG (exp, 0);
36172 op0 = expand_normal (arg0);
36173 icode = CODE_FOR_lwp_llwpcb;
36174 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36175 op0 = ix86_zero_extend_to_Pmode (op0);
36176 emit_insn (gen_lwp_llwpcb (op0));
36177 return 0;
36179 case IX86_BUILTIN_SLWPCB:
36180 icode = CODE_FOR_lwp_slwpcb;
36181 if (!target
36182 || !insn_data[icode].operand[0].predicate (target, Pmode))
36183 target = gen_reg_rtx (Pmode);
36184 emit_insn (gen_lwp_slwpcb (target));
36185 return target;
36187 case IX86_BUILTIN_BEXTRI32:
36188 case IX86_BUILTIN_BEXTRI64:
36189 arg0 = CALL_EXPR_ARG (exp, 0);
36190 arg1 = CALL_EXPR_ARG (exp, 1);
36191 op0 = expand_normal (arg0);
36192 op1 = expand_normal (arg1);
36193 icode = (fcode == IX86_BUILTIN_BEXTRI32
36194 ? CODE_FOR_tbm_bextri_si
36195 : CODE_FOR_tbm_bextri_di);
36196 if (!CONST_INT_P (op1))
36198 error ("last argument must be an immediate");
36199 return const0_rtx;
36201 else
36203 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
36204 unsigned char lsb_index = INTVAL (op1) & 0xFF;
36205 op1 = GEN_INT (length);
36206 op2 = GEN_INT (lsb_index);
36207 pat = GEN_FCN (icode) (target, op0, op1, op2);
36208 if (pat)
36209 emit_insn (pat);
36210 return target;
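/* Illustrative note (not part of the original sources): the single
   immediate of the bextri builtins packs the starting bit position in
   bits 7:0 and the field length in bits 15:8, so extracting an 8-bit
   field that starts at bit 4 could be written as

     unsigned int field = __builtin_ia32_bextri_u32 (x, (8 << 8) | 4);

   and the code above simply splits that immediate into the separate
   length and lsb-index operands of the tbm_bextri pattern.  */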
36213 case IX86_BUILTIN_RDRAND16_STEP:
36214 icode = CODE_FOR_rdrandhi_1;
36215 mode0 = HImode;
36216 goto rdrand_step;
36218 case IX86_BUILTIN_RDRAND32_STEP:
36219 icode = CODE_FOR_rdrandsi_1;
36220 mode0 = SImode;
36221 goto rdrand_step;
36223 case IX86_BUILTIN_RDRAND64_STEP:
36224 icode = CODE_FOR_rdranddi_1;
36225 mode0 = DImode;
36227 rdrand_step:
36228 arg0 = CALL_EXPR_ARG (exp, 0);
36229 op1 = expand_normal (arg0);
36230 if (!address_operand (op1, VOIDmode))
36232 op1 = convert_memory_address (Pmode, op1);
36233 op1 = copy_addr_to_reg (op1);
36236 op0 = gen_reg_rtx (mode0);
36237 emit_insn (GEN_FCN (icode) (op0));
36239 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36241 op1 = gen_reg_rtx (SImode);
36242 emit_move_insn (op1, CONST1_RTX (SImode));
36244 /* Emit SImode conditional move. */
36245 if (mode0 == HImode)
36247 if (TARGET_ZERO_EXTEND_WITH_AND
36248 && optimize_function_for_speed_p (cfun))
36250 op2 = force_reg (SImode, const0_rtx);
36252 emit_insn (gen_movstricthi
36253 (gen_lowpart (HImode, op2), op0));
36255 else
36257 op2 = gen_reg_rtx (SImode);
36259 emit_insn (gen_zero_extendhisi2 (op2, op0));
36262 else if (mode0 == SImode)
36263 op2 = op0;
36264 else
36265 op2 = gen_rtx_SUBREG (SImode, op0, 0);
36267 if (target == 0
36268 || !register_operand (target, SImode))
36269 target = gen_reg_rtx (SImode);
36271 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
36272 const0_rtx);
36273 emit_insn (gen_rtx_SET (target,
36274 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
36275 return target;
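/* Usage sketch (illustrative only): the rdrand *_step builtins store
   the random value through their pointer argument and return nonzero
   only on success, so callers typically retry in a loop:

     unsigned int r;
     while (!__builtin_ia32_rdrand32_step (&r))
       ;

   The GEU test on the CCCmode flags register above inspects the carry
   flag rdrand sets on success, and the conditional move folds that
   test into the builtin's int return value.  */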
36277 case IX86_BUILTIN_RDSEED16_STEP:
36278 icode = CODE_FOR_rdseedhi_1;
36279 mode0 = HImode;
36280 goto rdseed_step;
36282 case IX86_BUILTIN_RDSEED32_STEP:
36283 icode = CODE_FOR_rdseedsi_1;
36284 mode0 = SImode;
36285 goto rdseed_step;
36287 case IX86_BUILTIN_RDSEED64_STEP:
36288 icode = CODE_FOR_rdseeddi_1;
36289 mode0 = DImode;
36291 rdseed_step:
36292 arg0 = CALL_EXPR_ARG (exp, 0);
36293 op1 = expand_normal (arg0);
36294 if (!address_operand (op1, VOIDmode))
36296 op1 = convert_memory_address (Pmode, op1);
36297 op1 = copy_addr_to_reg (op1);
36300 op0 = gen_reg_rtx (mode0);
36301 emit_insn (GEN_FCN (icode) (op0));
36303 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36305 op2 = gen_reg_rtx (QImode);
36307 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
36308 const0_rtx);
36309 emit_insn (gen_rtx_SET (op2, pat));
36311 if (target == 0
36312 || !register_operand (target, SImode))
36313 target = gen_reg_rtx (SImode);
36315 emit_insn (gen_zero_extendqisi2 (target, op2));
36316 return target;
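/* Descriptive note (not part of the original sources): unlike the
   rdrand expansion above, rdseed needs no conditional move -- the LTU
   test copies the carry flag into a QImode register and the zero
   extension turns it directly into the builtin's 0/1 result.  */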
36318 case IX86_BUILTIN_SBB32:
36319 icode = CODE_FOR_subborrowsi;
36320 icode2 = CODE_FOR_subborrowsi_0;
36321 mode0 = SImode;
36322 mode1 = DImode;
36323 mode2 = CCmode;
36324 goto handlecarry;
36326 case IX86_BUILTIN_SBB64:
36327 icode = CODE_FOR_subborrowdi;
36328 icode2 = CODE_FOR_subborrowdi_0;
36329 mode0 = DImode;
36330 mode1 = TImode;
36331 mode2 = CCmode;
36332 goto handlecarry;
36334 case IX86_BUILTIN_ADDCARRYX32:
36335 icode = CODE_FOR_addcarrysi;
36336 icode2 = CODE_FOR_addcarrysi_0;
36337 mode0 = SImode;
36338 mode1 = DImode;
36339 mode2 = CCCmode;
36340 goto handlecarry;
36342 case IX86_BUILTIN_ADDCARRYX64:
36343 icode = CODE_FOR_addcarrydi;
36344 icode2 = CODE_FOR_addcarrydi_0;
36345 mode0 = DImode;
36346 mode1 = TImode;
36347 mode2 = CCCmode;
36349 handlecarry:
36350 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
36351 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
36352 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
36353 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
36355 op1 = expand_normal (arg0);
36356 if (!integer_zerop (arg0))
36357 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
36359 op2 = expand_normal (arg1);
36360 if (!register_operand (op2, mode0))
36361 op2 = copy_to_mode_reg (mode0, op2);
36363 op3 = expand_normal (arg2);
36364 if (!register_operand (op3, mode0))
36365 op3 = copy_to_mode_reg (mode0, op3);
36367 op4 = expand_normal (arg3);
36368 if (!address_operand (op4, VOIDmode))
36370 op4 = convert_memory_address (Pmode, op4);
36371 op4 = copy_addr_to_reg (op4);
36374 op0 = gen_reg_rtx (mode0);
36375 if (integer_zerop (arg0))
36377 /* If arg0 is 0, optimize right away into an add or sub
36378 instruction that sets CCCmode flags. */
36379 op1 = gen_rtx_REG (mode2, FLAGS_REG);
36380 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
36382 else
36384 /* Generate CF from input operand. */
36385 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
36387 /* Generate instruction that consumes CF. */
36388 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
36389 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
36390 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
36391 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
36394 /* Return current CF value. */
36395 if (target == 0)
36396 target = gen_reg_rtx (QImode);
36398 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
36399 emit_insn (gen_rtx_SET (target, pat));
36401 /* Store the result. */
36402 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
36404 return target;
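/* Illustrative sketch (not part of the original sources): these
   builtins chain wide arithmetic through the carry flag, e.g.

     unsigned int sum;
     unsigned char c_out
       = __builtin_ia32_addcarryx_u32 (c_in, a, b, &sum);

   When the carry-in is not known to be zero, the expansion above sets
   CF by adding -1 to the QImode carry-in byte (any nonzero byte plus
   -1 carries), and the addcarry/subborrow pattern then consumes that
   CF and produces the carry-out returned through the LTU setcc.  */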
36406 case IX86_BUILTIN_READ_FLAGS:
36407 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
36409 if (optimize
36410 || target == NULL_RTX
36411 || !nonimmediate_operand (target, word_mode)
36412 || GET_MODE (target) != word_mode)
36413 target = gen_reg_rtx (word_mode);
36415 emit_insn (gen_pop (target));
36416 return target;
36418 case IX86_BUILTIN_WRITE_FLAGS:
36420 arg0 = CALL_EXPR_ARG (exp, 0);
36421 op0 = expand_normal (arg0);
36422 if (!general_no_elim_operand (op0, word_mode))
36423 op0 = copy_to_mode_reg (word_mode, op0);
36425 emit_insn (gen_push (op0));
36426 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
36427 return 0;
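/* Descriptive note (not part of the original sources): there is no
   direct move to or from the flags register, so both builtins go
   through the stack -- reading pushes FLAGS_REG and pops it into the
   result, writing pushes the argument and pops it into FLAGS_REG.  At
   the source level this is reached through e.g.
   __builtin_ia32_readeflags_u64 and __builtin_ia32_writeeflags_u64 on
   64-bit targets.  */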
36429 case IX86_BUILTIN_KTESTC8:
36430 icode = CODE_FOR_ktestqi;
36431 mode3 = CCCmode;
36432 goto kortest;
36434 case IX86_BUILTIN_KTESTZ8:
36435 icode = CODE_FOR_ktestqi;
36436 mode3 = CCZmode;
36437 goto kortest;
36439 case IX86_BUILTIN_KTESTC16:
36440 icode = CODE_FOR_ktesthi;
36441 mode3 = CCCmode;
36442 goto kortest;
36444 case IX86_BUILTIN_KTESTZ16:
36445 icode = CODE_FOR_ktesthi;
36446 mode3 = CCZmode;
36447 goto kortest;
36449 case IX86_BUILTIN_KTESTC32:
36450 icode = CODE_FOR_ktestsi;
36451 mode3 = CCCmode;
36452 goto kortest;
36454 case IX86_BUILTIN_KTESTZ32:
36455 icode = CODE_FOR_ktestsi;
36456 mode3 = CCZmode;
36457 goto kortest;
36459 case IX86_BUILTIN_KTESTC64:
36460 icode = CODE_FOR_ktestdi;
36461 mode3 = CCCmode;
36462 goto kortest;
36464 case IX86_BUILTIN_KTESTZ64:
36465 icode = CODE_FOR_ktestdi;
36466 mode3 = CCZmode;
36467 goto kortest;
36469 case IX86_BUILTIN_KORTESTC8:
36470 icode = CODE_FOR_kortestqi;
36471 mode3 = CCCmode;
36472 goto kortest;
36474 case IX86_BUILTIN_KORTESTZ8:
36475 icode = CODE_FOR_kortestqi;
36476 mode3 = CCZmode;
36477 goto kortest;
36479 case IX86_BUILTIN_KORTESTC16:
36480 icode = CODE_FOR_kortesthi;
36481 mode3 = CCCmode;
36482 goto kortest;
36484 case IX86_BUILTIN_KORTESTZ16:
36485 icode = CODE_FOR_kortesthi;
36486 mode3 = CCZmode;
36487 goto kortest;
36489 case IX86_BUILTIN_KORTESTC32:
36490 icode = CODE_FOR_kortestsi;
36491 mode3 = CCCmode;
36492 goto kortest;
36494 case IX86_BUILTIN_KORTESTZ32:
36495 icode = CODE_FOR_kortestsi;
36496 mode3 = CCZmode;
36497 goto kortest;
36499 case IX86_BUILTIN_KORTESTC64:
36500 icode = CODE_FOR_kortestdi;
36501 mode3 = CCCmode;
36502 goto kortest;
36504 case IX86_BUILTIN_KORTESTZ64:
36505 icode = CODE_FOR_kortestdi;
36506 mode3 = CCZmode;
36508 kortest:
36509 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
36510 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
36511 op0 = expand_normal (arg0);
36512 op1 = expand_normal (arg1);
36514 mode0 = insn_data[icode].operand[0].mode;
36515 mode1 = insn_data[icode].operand[1].mode;
36517 if (GET_MODE (op0) != VOIDmode)
36518 op0 = force_reg (GET_MODE (op0), op0);
36520 op0 = gen_lowpart (mode0, op0);
36522 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36523 op0 = copy_to_mode_reg (mode0, op0);
36525 if (GET_MODE (op1) != VOIDmode)
36526 op1 = force_reg (GET_MODE (op1), op1);
36528 op1 = gen_lowpart (mode1, op1);
36530 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36531 op1 = copy_to_mode_reg (mode1, op1);
36533 target = gen_reg_rtx (QImode);
36535 /* Emit kortest. */
36536 emit_insn (GEN_FCN (icode) (op0, op1));
36537 /* And use setcc to return result from flags. */
36538 ix86_expand_setcc (target, EQ,
36539 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
36540 return target;
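/* Usage sketch (illustrative only): the _C and _Z builtin variants
   emit the same ktest/kortest instruction and differ only in which
   flag the final setcc examines (CCCmode for the carry-based forms,
   CCZmode for the zero-based forms).  For example

     int all_zero = _mm512_kortestz (a, b);

   returns 1 when the OR of the two 16-bit masks is zero.  */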
36542 case IX86_BUILTIN_GATHERSIV2DF:
36543 icode = CODE_FOR_avx2_gathersiv2df;
36544 goto gather_gen;
36545 case IX86_BUILTIN_GATHERSIV4DF:
36546 icode = CODE_FOR_avx2_gathersiv4df;
36547 goto gather_gen;
36548 case IX86_BUILTIN_GATHERDIV2DF:
36549 icode = CODE_FOR_avx2_gatherdiv2df;
36550 goto gather_gen;
36551 case IX86_BUILTIN_GATHERDIV4DF:
36552 icode = CODE_FOR_avx2_gatherdiv4df;
36553 goto gather_gen;
36554 case IX86_BUILTIN_GATHERSIV4SF:
36555 icode = CODE_FOR_avx2_gathersiv4sf;
36556 goto gather_gen;
36557 case IX86_BUILTIN_GATHERSIV8SF:
36558 icode = CODE_FOR_avx2_gathersiv8sf;
36559 goto gather_gen;
36560 case IX86_BUILTIN_GATHERDIV4SF:
36561 icode = CODE_FOR_avx2_gatherdiv4sf;
36562 goto gather_gen;
36563 case IX86_BUILTIN_GATHERDIV8SF:
36564 icode = CODE_FOR_avx2_gatherdiv8sf;
36565 goto gather_gen;
36566 case IX86_BUILTIN_GATHERSIV2DI:
36567 icode = CODE_FOR_avx2_gathersiv2di;
36568 goto gather_gen;
36569 case IX86_BUILTIN_GATHERSIV4DI:
36570 icode = CODE_FOR_avx2_gathersiv4di;
36571 goto gather_gen;
36572 case IX86_BUILTIN_GATHERDIV2DI:
36573 icode = CODE_FOR_avx2_gatherdiv2di;
36574 goto gather_gen;
36575 case IX86_BUILTIN_GATHERDIV4DI:
36576 icode = CODE_FOR_avx2_gatherdiv4di;
36577 goto gather_gen;
36578 case IX86_BUILTIN_GATHERSIV4SI:
36579 icode = CODE_FOR_avx2_gathersiv4si;
36580 goto gather_gen;
36581 case IX86_BUILTIN_GATHERSIV8SI:
36582 icode = CODE_FOR_avx2_gathersiv8si;
36583 goto gather_gen;
36584 case IX86_BUILTIN_GATHERDIV4SI:
36585 icode = CODE_FOR_avx2_gatherdiv4si;
36586 goto gather_gen;
36587 case IX86_BUILTIN_GATHERDIV8SI:
36588 icode = CODE_FOR_avx2_gatherdiv8si;
36589 goto gather_gen;
36590 case IX86_BUILTIN_GATHERALTSIV4DF:
36591 icode = CODE_FOR_avx2_gathersiv4df;
36592 goto gather_gen;
36593 case IX86_BUILTIN_GATHERALTDIV8SF:
36594 icode = CODE_FOR_avx2_gatherdiv8sf;
36595 goto gather_gen;
36596 case IX86_BUILTIN_GATHERALTSIV4DI:
36597 icode = CODE_FOR_avx2_gathersiv4di;
36598 goto gather_gen;
36599 case IX86_BUILTIN_GATHERALTDIV8SI:
36600 icode = CODE_FOR_avx2_gatherdiv8si;
36601 goto gather_gen;
36602 case IX86_BUILTIN_GATHER3SIV16SF:
36603 icode = CODE_FOR_avx512f_gathersiv16sf;
36604 goto gather_gen;
36605 case IX86_BUILTIN_GATHER3SIV8DF:
36606 icode = CODE_FOR_avx512f_gathersiv8df;
36607 goto gather_gen;
36608 case IX86_BUILTIN_GATHER3DIV16SF:
36609 icode = CODE_FOR_avx512f_gatherdiv16sf;
36610 goto gather_gen;
36611 case IX86_BUILTIN_GATHER3DIV8DF:
36612 icode = CODE_FOR_avx512f_gatherdiv8df;
36613 goto gather_gen;
36614 case IX86_BUILTIN_GATHER3SIV16SI:
36615 icode = CODE_FOR_avx512f_gathersiv16si;
36616 goto gather_gen;
36617 case IX86_BUILTIN_GATHER3SIV8DI:
36618 icode = CODE_FOR_avx512f_gathersiv8di;
36619 goto gather_gen;
36620 case IX86_BUILTIN_GATHER3DIV16SI:
36621 icode = CODE_FOR_avx512f_gatherdiv16si;
36622 goto gather_gen;
36623 case IX86_BUILTIN_GATHER3DIV8DI:
36624 icode = CODE_FOR_avx512f_gatherdiv8di;
36625 goto gather_gen;
36626 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36627 icode = CODE_FOR_avx512f_gathersiv8df;
36628 goto gather_gen;
36629 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36630 icode = CODE_FOR_avx512f_gatherdiv16sf;
36631 goto gather_gen;
36632 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36633 icode = CODE_FOR_avx512f_gathersiv8di;
36634 goto gather_gen;
36635 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36636 icode = CODE_FOR_avx512f_gatherdiv16si;
36637 goto gather_gen;
36638 case IX86_BUILTIN_GATHER3SIV2DF:
36639 icode = CODE_FOR_avx512vl_gathersiv2df;
36640 goto gather_gen;
36641 case IX86_BUILTIN_GATHER3SIV4DF:
36642 icode = CODE_FOR_avx512vl_gathersiv4df;
36643 goto gather_gen;
36644 case IX86_BUILTIN_GATHER3DIV2DF:
36645 icode = CODE_FOR_avx512vl_gatherdiv2df;
36646 goto gather_gen;
36647 case IX86_BUILTIN_GATHER3DIV4DF:
36648 icode = CODE_FOR_avx512vl_gatherdiv4df;
36649 goto gather_gen;
36650 case IX86_BUILTIN_GATHER3SIV4SF:
36651 icode = CODE_FOR_avx512vl_gathersiv4sf;
36652 goto gather_gen;
36653 case IX86_BUILTIN_GATHER3SIV8SF:
36654 icode = CODE_FOR_avx512vl_gathersiv8sf;
36655 goto gather_gen;
36656 case IX86_BUILTIN_GATHER3DIV4SF:
36657 icode = CODE_FOR_avx512vl_gatherdiv4sf;
36658 goto gather_gen;
36659 case IX86_BUILTIN_GATHER3DIV8SF:
36660 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36661 goto gather_gen;
36662 case IX86_BUILTIN_GATHER3SIV2DI:
36663 icode = CODE_FOR_avx512vl_gathersiv2di;
36664 goto gather_gen;
36665 case IX86_BUILTIN_GATHER3SIV4DI:
36666 icode = CODE_FOR_avx512vl_gathersiv4di;
36667 goto gather_gen;
36668 case IX86_BUILTIN_GATHER3DIV2DI:
36669 icode = CODE_FOR_avx512vl_gatherdiv2di;
36670 goto gather_gen;
36671 case IX86_BUILTIN_GATHER3DIV4DI:
36672 icode = CODE_FOR_avx512vl_gatherdiv4di;
36673 goto gather_gen;
36674 case IX86_BUILTIN_GATHER3SIV4SI:
36675 icode = CODE_FOR_avx512vl_gathersiv4si;
36676 goto gather_gen;
36677 case IX86_BUILTIN_GATHER3SIV8SI:
36678 icode = CODE_FOR_avx512vl_gathersiv8si;
36679 goto gather_gen;
36680 case IX86_BUILTIN_GATHER3DIV4SI:
36681 icode = CODE_FOR_avx512vl_gatherdiv4si;
36682 goto gather_gen;
36683 case IX86_BUILTIN_GATHER3DIV8SI:
36684 icode = CODE_FOR_avx512vl_gatherdiv8si;
36685 goto gather_gen;
36686 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36687 icode = CODE_FOR_avx512vl_gathersiv4df;
36688 goto gather_gen;
36689 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36690 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36691 goto gather_gen;
36692 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36693 icode = CODE_FOR_avx512vl_gathersiv4di;
36694 goto gather_gen;
36695 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36696 icode = CODE_FOR_avx512vl_gatherdiv8si;
36697 goto gather_gen;
36698 case IX86_BUILTIN_SCATTERSIV16SF:
36699 icode = CODE_FOR_avx512f_scattersiv16sf;
36700 goto scatter_gen;
36701 case IX86_BUILTIN_SCATTERSIV8DF:
36702 icode = CODE_FOR_avx512f_scattersiv8df;
36703 goto scatter_gen;
36704 case IX86_BUILTIN_SCATTERDIV16SF:
36705 icode = CODE_FOR_avx512f_scatterdiv16sf;
36706 goto scatter_gen;
36707 case IX86_BUILTIN_SCATTERDIV8DF:
36708 icode = CODE_FOR_avx512f_scatterdiv8df;
36709 goto scatter_gen;
36710 case IX86_BUILTIN_SCATTERSIV16SI:
36711 icode = CODE_FOR_avx512f_scattersiv16si;
36712 goto scatter_gen;
36713 case IX86_BUILTIN_SCATTERSIV8DI:
36714 icode = CODE_FOR_avx512f_scattersiv8di;
36715 goto scatter_gen;
36716 case IX86_BUILTIN_SCATTERDIV16SI:
36717 icode = CODE_FOR_avx512f_scatterdiv16si;
36718 goto scatter_gen;
36719 case IX86_BUILTIN_SCATTERDIV8DI:
36720 icode = CODE_FOR_avx512f_scatterdiv8di;
36721 goto scatter_gen;
36722 case IX86_BUILTIN_SCATTERSIV8SF:
36723 icode = CODE_FOR_avx512vl_scattersiv8sf;
36724 goto scatter_gen;
36725 case IX86_BUILTIN_SCATTERSIV4SF:
36726 icode = CODE_FOR_avx512vl_scattersiv4sf;
36727 goto scatter_gen;
36728 case IX86_BUILTIN_SCATTERSIV4DF:
36729 icode = CODE_FOR_avx512vl_scattersiv4df;
36730 goto scatter_gen;
36731 case IX86_BUILTIN_SCATTERSIV2DF:
36732 icode = CODE_FOR_avx512vl_scattersiv2df;
36733 goto scatter_gen;
36734 case IX86_BUILTIN_SCATTERDIV8SF:
36735 icode = CODE_FOR_avx512vl_scatterdiv8sf;
36736 goto scatter_gen;
36737 case IX86_BUILTIN_SCATTERDIV4SF:
36738 icode = CODE_FOR_avx512vl_scatterdiv4sf;
36739 goto scatter_gen;
36740 case IX86_BUILTIN_SCATTERDIV4DF:
36741 icode = CODE_FOR_avx512vl_scatterdiv4df;
36742 goto scatter_gen;
36743 case IX86_BUILTIN_SCATTERDIV2DF:
36744 icode = CODE_FOR_avx512vl_scatterdiv2df;
36745 goto scatter_gen;
36746 case IX86_BUILTIN_SCATTERSIV8SI:
36747 icode = CODE_FOR_avx512vl_scattersiv8si;
36748 goto scatter_gen;
36749 case IX86_BUILTIN_SCATTERSIV4SI:
36750 icode = CODE_FOR_avx512vl_scattersiv4si;
36751 goto scatter_gen;
36752 case IX86_BUILTIN_SCATTERSIV4DI:
36753 icode = CODE_FOR_avx512vl_scattersiv4di;
36754 goto scatter_gen;
36755 case IX86_BUILTIN_SCATTERSIV2DI:
36756 icode = CODE_FOR_avx512vl_scattersiv2di;
36757 goto scatter_gen;
36758 case IX86_BUILTIN_SCATTERDIV8SI:
36759 icode = CODE_FOR_avx512vl_scatterdiv8si;
36760 goto scatter_gen;
36761 case IX86_BUILTIN_SCATTERDIV4SI:
36762 icode = CODE_FOR_avx512vl_scatterdiv4si;
36763 goto scatter_gen;
36764 case IX86_BUILTIN_SCATTERDIV4DI:
36765 icode = CODE_FOR_avx512vl_scatterdiv4di;
36766 goto scatter_gen;
36767 case IX86_BUILTIN_SCATTERDIV2DI:
36768 icode = CODE_FOR_avx512vl_scatterdiv2di;
36769 goto scatter_gen;
36770 case IX86_BUILTIN_GATHERPFDPD:
36771 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
36772 goto vec_prefetch_gen;
36773 case IX86_BUILTIN_SCATTERALTSIV8DF:
36774 icode = CODE_FOR_avx512f_scattersiv8df;
36775 goto scatter_gen;
36776 case IX86_BUILTIN_SCATTERALTDIV16SF:
36777 icode = CODE_FOR_avx512f_scatterdiv16sf;
36778 goto scatter_gen;
36779 case IX86_BUILTIN_SCATTERALTSIV8DI:
36780 icode = CODE_FOR_avx512f_scattersiv8di;
36781 goto scatter_gen;
36782 case IX86_BUILTIN_SCATTERALTDIV16SI:
36783 icode = CODE_FOR_avx512f_scatterdiv16si;
36784 goto scatter_gen;
36785 case IX86_BUILTIN_GATHERPFDPS:
36786 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
36787 goto vec_prefetch_gen;
36788 case IX86_BUILTIN_GATHERPFQPD:
36789 icode = CODE_FOR_avx512pf_gatherpfv8didf;
36790 goto vec_prefetch_gen;
36791 case IX86_BUILTIN_GATHERPFQPS:
36792 icode = CODE_FOR_avx512pf_gatherpfv8disf;
36793 goto vec_prefetch_gen;
36794 case IX86_BUILTIN_SCATTERPFDPD:
36795 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
36796 goto vec_prefetch_gen;
36797 case IX86_BUILTIN_SCATTERPFDPS:
36798 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
36799 goto vec_prefetch_gen;
36800 case IX86_BUILTIN_SCATTERPFQPD:
36801 icode = CODE_FOR_avx512pf_scatterpfv8didf;
36802 goto vec_prefetch_gen;
36803 case IX86_BUILTIN_SCATTERPFQPS:
36804 icode = CODE_FOR_avx512pf_scatterpfv8disf;
36805 goto vec_prefetch_gen;
36807 gather_gen:
36808 rtx half;
36809 rtx (*gen) (rtx, rtx);
36811 arg0 = CALL_EXPR_ARG (exp, 0);
36812 arg1 = CALL_EXPR_ARG (exp, 1);
36813 arg2 = CALL_EXPR_ARG (exp, 2);
36814 arg3 = CALL_EXPR_ARG (exp, 3);
36815 arg4 = CALL_EXPR_ARG (exp, 4);
36816 op0 = expand_normal (arg0);
36817 op1 = expand_normal (arg1);
36818 op2 = expand_normal (arg2);
36819 op3 = expand_normal (arg3);
36820 op4 = expand_normal (arg4);
36821 /* Note the arg order is different from the operand order. */
36822 mode0 = insn_data[icode].operand[1].mode;
36823 mode2 = insn_data[icode].operand[3].mode;
36824 mode3 = insn_data[icode].operand[4].mode;
36825 mode4 = insn_data[icode].operand[5].mode;
36827 if (target == NULL_RTX
36828 || GET_MODE (target) != insn_data[icode].operand[0].mode
36829 || !insn_data[icode].operand[0].predicate (target,
36830 GET_MODE (target)))
36831 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
36832 else
36833 subtarget = target;
36835 switch (fcode)
36837 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36838 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36839 half = gen_reg_rtx (V8SImode);
36840 if (!nonimmediate_operand (op2, V16SImode))
36841 op2 = copy_to_mode_reg (V16SImode, op2);
36842 emit_insn (gen_vec_extract_lo_v16si (half, op2));
36843 op2 = half;
36844 break;
36845 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36846 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36847 case IX86_BUILTIN_GATHERALTSIV4DF:
36848 case IX86_BUILTIN_GATHERALTSIV4DI:
36849 half = gen_reg_rtx (V4SImode);
36850 if (!nonimmediate_operand (op2, V8SImode))
36851 op2 = copy_to_mode_reg (V8SImode, op2);
36852 emit_insn (gen_vec_extract_lo_v8si (half, op2));
36853 op2 = half;
36854 break;
36855 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36856 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36857 half = gen_reg_rtx (mode0);
36858 if (mode0 == V8SFmode)
36859 gen = gen_vec_extract_lo_v16sf;
36860 else
36861 gen = gen_vec_extract_lo_v16si;
36862 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36863 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36864 emit_insn (gen (half, op0));
36865 op0 = half;
36866 if (GET_MODE (op3) != VOIDmode)
36868 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36869 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36870 emit_insn (gen (half, op3));
36871 op3 = half;
36873 break;
36874 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36875 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36876 case IX86_BUILTIN_GATHERALTDIV8SF:
36877 case IX86_BUILTIN_GATHERALTDIV8SI:
36878 half = gen_reg_rtx (mode0);
36879 if (mode0 == V4SFmode)
36880 gen = gen_vec_extract_lo_v8sf;
36881 else
36882 gen = gen_vec_extract_lo_v8si;
36883 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36884 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36885 emit_insn (gen (half, op0));
36886 op0 = half;
36887 if (GET_MODE (op3) != VOIDmode)
36889 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36890 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36891 emit_insn (gen (half, op3));
36892 op3 = half;
36894 break;
36895 default:
36896 break;
36899 /* Force the memory operand to use only a base register here. We
36900 don't want to do this for the memory operands of other builtin
36901 functions. */
36902 op1 = ix86_zero_extend_to_Pmode (op1);
36904 if (!insn_data[icode].operand[1].predicate (op0, mode0))
36905 op0 = copy_to_mode_reg (mode0, op0);
36906 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
36907 op1 = copy_to_mode_reg (Pmode, op1);
36908 if (!insn_data[icode].operand[3].predicate (op2, mode2))
36909 op2 = copy_to_mode_reg (mode2, op2);
36911 op3 = fixup_modeless_constant (op3, mode3);
36913 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
36915 if (!insn_data[icode].operand[4].predicate (op3, mode3))
36916 op3 = copy_to_mode_reg (mode3, op3);
36918 else
36920 op3 = copy_to_reg (op3);
36921 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
36923 if (!insn_data[icode].operand[5].predicate (op4, mode4))
36925 error ("the last argument must be scale 1, 2, 4, 8");
36926 return const0_rtx;
36929 /* Optimize. If mask is known to have all high bits set,
36930 replace op0 with pc_rtx to signal that the instruction
36931 overwrites the whole destination and doesn't use its
36932 previous contents. */
36933 if (optimize)
36935 if (TREE_CODE (arg3) == INTEGER_CST)
36937 if (integer_all_onesp (arg3))
36938 op0 = pc_rtx;
36940 else if (TREE_CODE (arg3) == VECTOR_CST)
36942 unsigned int negative = 0;
36943 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
36945 tree cst = VECTOR_CST_ELT (arg3, i);
36946 if (TREE_CODE (cst) == INTEGER_CST
36947 && tree_int_cst_sign_bit (cst))
36948 negative++;
36949 else if (TREE_CODE (cst) == REAL_CST
36950 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
36951 negative++;
36953 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
36954 op0 = pc_rtx;
36956 else if (TREE_CODE (arg3) == SSA_NAME
36957 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
36959 /* Recognize also when mask is like:
36960 __v2df src = _mm_setzero_pd ();
36961 __v2df mask = _mm_cmpeq_pd (src, src);
36963 __v8sf src = _mm256_setzero_ps ();
36964 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
36965 as that is a cheaper way to load all ones into
36966 a register than having to load a constant from
36967 memory. */
36968 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
36969 if (is_gimple_call (def_stmt))
36971 tree fndecl = gimple_call_fndecl (def_stmt);
36972 if (fndecl
36973 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
36974 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
36976 case IX86_BUILTIN_CMPPD:
36977 case IX86_BUILTIN_CMPPS:
36978 case IX86_BUILTIN_CMPPD256:
36979 case IX86_BUILTIN_CMPPS256:
36980 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
36981 break;
36982 /* FALLTHRU */
36983 case IX86_BUILTIN_CMPEQPD:
36984 case IX86_BUILTIN_CMPEQPS:
36985 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
36986 && initializer_zerop (gimple_call_arg (def_stmt,
36987 1)))
36988 op0 = pc_rtx;
36989 break;
36990 default:
36991 break;
36997 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
36998 if (! pat)
36999 return const0_rtx;
37000 emit_insn (pat);
37002 switch (fcode)
37004 case IX86_BUILTIN_GATHER3DIV16SF:
37005 if (target == NULL_RTX)
37006 target = gen_reg_rtx (V8SFmode);
37007 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37008 break;
37009 case IX86_BUILTIN_GATHER3DIV16SI:
37010 if (target == NULL_RTX)
37011 target = gen_reg_rtx (V8SImode);
37012 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37013 break;
37014 case IX86_BUILTIN_GATHER3DIV8SF:
37015 case IX86_BUILTIN_GATHERDIV8SF:
37016 if (target == NULL_RTX)
37017 target = gen_reg_rtx (V4SFmode);
37018 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37019 break;
37020 case IX86_BUILTIN_GATHER3DIV8SI:
37021 case IX86_BUILTIN_GATHERDIV8SI:
37022 if (target == NULL_RTX)
37023 target = gen_reg_rtx (V4SImode);
37024 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37025 break;
37026 default:
37027 target = subtarget;
37028 break;
37030 return target;
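/* Descriptive note (not part of the original sources): the DIV8SF,
   DIV8SI, DIV16SF and DIV16SI gathers produce only half as many
   elements as their index vector, so the pattern's full-width result
   in SUBTARGET is narrowed above by extracting its low half into
   TARGET; for every other gather SUBTARGET already is the result.  */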
37032 scatter_gen:
37033 arg0 = CALL_EXPR_ARG (exp, 0);
37034 arg1 = CALL_EXPR_ARG (exp, 1);
37035 arg2 = CALL_EXPR_ARG (exp, 2);
37036 arg3 = CALL_EXPR_ARG (exp, 3);
37037 arg4 = CALL_EXPR_ARG (exp, 4);
37038 op0 = expand_normal (arg0);
37039 op1 = expand_normal (arg1);
37040 op2 = expand_normal (arg2);
37041 op3 = expand_normal (arg3);
37042 op4 = expand_normal (arg4);
37043 mode1 = insn_data[icode].operand[1].mode;
37044 mode2 = insn_data[icode].operand[2].mode;
37045 mode3 = insn_data[icode].operand[3].mode;
37046 mode4 = insn_data[icode].operand[4].mode;
37048 /* Scatter instruction stores operand op3 to memory with
37049 indices from op2 and scale from op4 under writemask op1.
37050 If index operand op2 has more elements than source operand
37051 op3, only its low half needs to be used, and vice versa. */
37052 switch (fcode)
37054 case IX86_BUILTIN_SCATTERALTSIV8DF:
37055 case IX86_BUILTIN_SCATTERALTSIV8DI:
37056 half = gen_reg_rtx (V8SImode);
37057 if (!nonimmediate_operand (op2, V16SImode))
37058 op2 = copy_to_mode_reg (V16SImode, op2);
37059 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37060 op2 = half;
37061 break;
37062 case IX86_BUILTIN_SCATTERALTDIV16SF:
37063 case IX86_BUILTIN_SCATTERALTDIV16SI:
37064 half = gen_reg_rtx (mode3);
37065 if (mode3 == V8SFmode)
37066 gen = gen_vec_extract_lo_v16sf;
37067 else
37068 gen = gen_vec_extract_lo_v16si;
37069 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37070 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37071 emit_insn (gen (half, op3));
37072 op3 = half;
37073 break;
37074 default:
37075 break;
37078 /* Force the memory operand to use only a base register here. We
37079 don't want to do this for the memory operands of other builtin
37080 functions. */
37081 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37083 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37084 op0 = copy_to_mode_reg (Pmode, op0);
37086 op1 = fixup_modeless_constant (op1, mode1);
37088 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37090 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37091 op1 = copy_to_mode_reg (mode1, op1);
37093 else
37095 op1 = copy_to_reg (op1);
37096 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37099 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37100 op2 = copy_to_mode_reg (mode2, op2);
37102 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37103 op3 = copy_to_mode_reg (mode3, op3);
37105 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37107 error ("the last argument must be scale 1, 2, 4, 8");
37108 return const0_rtx;
37111 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37112 if (! pat)
37113 return const0_rtx;
37115 emit_insn (pat);
37116 return 0;
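/* Descriptive note (not part of the original sources): the writemask
   argument may arrive as a plain integer rather than a value of the
   pattern's mask mode, which is why fixup_modeless_constant and the
   lowpart_subreg fallback above are needed before the operand
   predicate can accept it.  */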
37118 vec_prefetch_gen:
37119 arg0 = CALL_EXPR_ARG (exp, 0);
37120 arg1 = CALL_EXPR_ARG (exp, 1);
37121 arg2 = CALL_EXPR_ARG (exp, 2);
37122 arg3 = CALL_EXPR_ARG (exp, 3);
37123 arg4 = CALL_EXPR_ARG (exp, 4);
37124 op0 = expand_normal (arg0);
37125 op1 = expand_normal (arg1);
37126 op2 = expand_normal (arg2);
37127 op3 = expand_normal (arg3);
37128 op4 = expand_normal (arg4);
37129 mode0 = insn_data[icode].operand[0].mode;
37130 mode1 = insn_data[icode].operand[1].mode;
37131 mode3 = insn_data[icode].operand[3].mode;
37132 mode4 = insn_data[icode].operand[4].mode;
37134 op0 = fixup_modeless_constant (op0, mode0);
37136 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37138 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37139 op0 = copy_to_mode_reg (mode0, op0);
37141 else
37143 op0 = copy_to_reg (op0);
37144 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37147 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37148 op1 = copy_to_mode_reg (mode1, op1);
37150 /* Force the memory operand to use only a base register here. We
37151 don't want to do this for the memory operands of other builtin
37152 functions. */
37153 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37155 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37156 op2 = copy_to_mode_reg (Pmode, op2);
37158 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37160 error ("the forth argument must be scale 1, 2, 4, 8");
37161 return const0_rtx;
37164 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37166 error ("incorrect hint operand");
37167 return const0_rtx;
37170 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37171 if (! pat)
37172 return const0_rtx;
37174 emit_insn (pat);
37176 return 0;
37178 case IX86_BUILTIN_XABORT:
37179 icode = CODE_FOR_xabort;
37180 arg0 = CALL_EXPR_ARG (exp, 0);
37181 op0 = expand_normal (arg0);
37182 mode0 = insn_data[icode].operand[0].mode;
37183 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37185 error ("the xabort's argument must be an 8-bit immediate");
37186 return const0_rtx;
37188 emit_insn (gen_xabort (op0));
37189 return 0;
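/* Usage sketch (illustrative only): the abort code must be a
   compile-time constant that fits in 8 bits, e.g.

     _xabort (0x42);

   anything else fails the operand predicate and is diagnosed above.  */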
37191 case IX86_BUILTIN_RSTORSSP:
37192 case IX86_BUILTIN_CLRSSBSY:
37193 arg0 = CALL_EXPR_ARG (exp, 0);
37194 op0 = expand_normal (arg0);
37195 icode = (fcode == IX86_BUILTIN_RSTORSSP
37196 ? CODE_FOR_rstorssp
37197 : CODE_FOR_clrssbsy);
37198 if (!address_operand (op0, VOIDmode))
37200 op1 = convert_memory_address (Pmode, op0);
37201 op0 = copy_addr_to_reg (op1);
37203 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
37204 return 0;
37206 case IX86_BUILTIN_WRSSD:
37207 case IX86_BUILTIN_WRSSQ:
37208 case IX86_BUILTIN_WRUSSD:
37209 case IX86_BUILTIN_WRUSSQ:
37210 arg0 = CALL_EXPR_ARG (exp, 0);
37211 op0 = expand_normal (arg0);
37212 arg1 = CALL_EXPR_ARG (exp, 1);
37213 op1 = expand_normal (arg1);
37214 switch (fcode)
37216 case IX86_BUILTIN_WRSSD:
37217 icode = CODE_FOR_wrsssi;
37218 mode = SImode;
37219 break;
37220 case IX86_BUILTIN_WRSSQ:
37221 icode = CODE_FOR_wrssdi;
37222 mode = DImode;
37223 break;
37224 case IX86_BUILTIN_WRUSSD:
37225 icode = CODE_FOR_wrusssi;
37226 mode = SImode;
37227 break;
37228 case IX86_BUILTIN_WRUSSQ:
37229 icode = CODE_FOR_wrussdi;
37230 mode = DImode;
37231 break;
37233 op0 = force_reg (mode, op0);
37234 if (!address_operand (op1, VOIDmode))
37236 op2 = convert_memory_address (Pmode, op1);
37237 op1 = copy_addr_to_reg (op2);
37239 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
37240 return 0;
37242 default:
37243 break;
37246 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
37247 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
37249 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
37250 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
37251 target);
37254 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
37255 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
37257 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
37258 switch (fcode)
37260 case IX86_BUILTIN_FABSQ:
37261 case IX86_BUILTIN_COPYSIGNQ:
37262 if (!TARGET_SSE)
37263 /* Emit a normal call if SSE isn't available. */
37264 return expand_call (exp, target, ignore);
37265 /* FALLTHRU */
37266 default:
37267 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
37271 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
37272 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
37274 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
37275 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
37276 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
37277 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
37278 int masked = 1;
37279 machine_mode mode, wide_mode, nar_mode;
37281 nar_mode = V4SFmode;
37282 mode = V16SFmode;
37283 wide_mode = V64SFmode;
37284 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
37285 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
37287 switch (fcode)
37289 case IX86_BUILTIN_4FMAPS:
37290 fcn = gen_avx5124fmaddps_4fmaddps;
37291 masked = 0;
37292 goto v4fma_expand;
37294 case IX86_BUILTIN_4DPWSSD:
37295 nar_mode = V4SImode;
37296 mode = V16SImode;
37297 wide_mode = V64SImode;
37298 fcn = gen_avx5124vnniw_vp4dpwssd;
37299 masked = 0;
37300 goto v4fma_expand;
37302 case IX86_BUILTIN_4DPWSSDS:
37303 nar_mode = V4SImode;
37304 mode = V16SImode;
37305 wide_mode = V64SImode;
37306 fcn = gen_avx5124vnniw_vp4dpwssds;
37307 masked = 0;
37308 goto v4fma_expand;
37310 case IX86_BUILTIN_4FNMAPS:
37311 fcn = gen_avx5124fmaddps_4fnmaddps;
37312 masked = 0;
37313 goto v4fma_expand;
37315 case IX86_BUILTIN_4FNMAPS_MASK:
37316 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
37317 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
37318 goto v4fma_expand;
37320 case IX86_BUILTIN_4DPWSSD_MASK:
37321 nar_mode = V4SImode;
37322 mode = V16SImode;
37323 wide_mode = V64SImode;
37324 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
37325 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
37326 goto v4fma_expand;
37328 case IX86_BUILTIN_4DPWSSDS_MASK:
37329 nar_mode = V4SImode;
37330 mode = V16SImode;
37331 wide_mode = V64SImode;
37332 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
37333 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
37334 goto v4fma_expand;
37336 case IX86_BUILTIN_4FMAPS_MASK:
37338 tree args[4];
37339 rtx ops[4];
37340 rtx wide_reg;
37341 rtx accum;
37342 rtx addr;
37343 rtx mem;
37345 v4fma_expand:
37346 wide_reg = gen_reg_rtx (wide_mode);
37347 for (i = 0; i < 4; i++)
37349 args[i] = CALL_EXPR_ARG (exp, i);
37350 ops[i] = expand_normal (args[i]);
37352 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
37353 ops[i]);
37356 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37357 accum = force_reg (mode, accum);
37359 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37360 addr = force_reg (Pmode, addr);
37362 mem = gen_rtx_MEM (nar_mode, addr);
37364 target = gen_reg_rtx (mode);
37366 emit_move_insn (target, accum);
37368 if (! masked)
37369 emit_insn (fcn (target, accum, wide_reg, mem));
37370 else
37372 rtx merge, mask;
37373 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37375 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37377 if (CONST_INT_P (mask))
37378 mask = fixup_modeless_constant (mask, HImode);
37380 mask = force_reg (HImode, mask);
37382 if (GET_MODE (mask) != HImode)
37383 mask = gen_rtx_SUBREG (HImode, mask, 0);
37385 /* If merge is 0 then we're about to emit z-masked variant. */
37386 if (const0_operand (merge, mode))
37387 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37388 /* If merge is the same as accum then emit merge-masked variant. */
37389 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37391 merge = force_reg (mode, merge);
37392 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37394 /* Merge with something unknown might happen if we z-mask w/ -O0. */
37395 else
37397 target = gen_reg_rtx (mode);
37398 emit_move_insn (target, merge);
37399 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37402 return target;
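/* Descriptive note (not part of the original sources): the masked
   4fmadd/4dpwssd expansions choose between three forms based on the
   merge operand -- the _maskz pattern when merge is the zero constant,
   the _mask pattern merging into the accumulator when the merge and
   accumulator arguments are the same tree, and otherwise a fresh
   register preloaded with merge so that unoptimized z-masking at -O0
   still expands correctly.  */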
37405 case IX86_BUILTIN_4FNMASS:
37406 fcn = gen_avx5124fmaddps_4fnmaddss;
37407 masked = 0;
37408 goto s4fma_expand;
37410 case IX86_BUILTIN_4FMASS:
37411 fcn = gen_avx5124fmaddps_4fmaddss;
37412 masked = 0;
37413 goto s4fma_expand;
37415 case IX86_BUILTIN_4FNMASS_MASK:
37416 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
37417 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
37418 goto s4fma_expand;
37420 case IX86_BUILTIN_4FMASS_MASK:
37422 tree args[4];
37423 rtx ops[4];
37424 rtx wide_reg;
37425 rtx accum;
37426 rtx addr;
37427 rtx mem;
37429 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
37430 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
37432 s4fma_expand:
37433 mode = V4SFmode;
37434 wide_reg = gen_reg_rtx (V64SFmode);
37435 for (i = 0; i < 4; i++)
37437 rtx tmp;
37438 args[i] = CALL_EXPR_ARG (exp, i);
37439 ops[i] = expand_normal (args[i]);
37441 tmp = gen_reg_rtx (SFmode);
37442 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
37444 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
37445 gen_rtx_SUBREG (V16SFmode, tmp, 0));
37448 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37449 accum = force_reg (V4SFmode, accum);
37451 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37452 addr = force_reg (Pmode, addr);
37454 mem = gen_rtx_MEM (V4SFmode, addr);
37456 target = gen_reg_rtx (V4SFmode);
37458 emit_move_insn (target, accum);
37460 if (! masked)
37461 emit_insn (fcn (target, accum, wide_reg, mem));
37462 else
37464 rtx merge, mask;
37465 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37467 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37469 if (CONST_INT_P (mask))
37470 mask = fixup_modeless_constant (mask, QImode);
37472 mask = force_reg (QImode, mask);
37474 if (GET_MODE (mask) != QImode)
37475 mask = gen_rtx_SUBREG (QImode, mask, 0);
37477 /* If merge is 0 then we're about to emit z-masked variant. */
37478 if (const0_operand (merge, mode))
37479 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37480 /* If merge is the same as accum then emit merge-masked
37481 variant. */
37482 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37484 merge = force_reg (mode, merge);
37485 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37487 /* Merge with something unknown might happen if we z-mask
37488 w/ -O0. */
37489 else
37491 target = gen_reg_rtx (mode);
37492 emit_move_insn (target, merge);
37493 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37496 return target;
37498 case IX86_BUILTIN_RDPID:
37499 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
37500 target);
37501 default:
37502 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
37506 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST
37507 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST)
37509 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST;
37510 return ix86_expand_special_args_builtin (bdesc_special_args2 + i, exp,
37511 target);
37514 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37515 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37517 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37518 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37521 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37522 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37524 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37525 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37528 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37529 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37531 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37532 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37535 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37536 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37538 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37539 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37542 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37543 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37545 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37546 const struct builtin_description *d = bdesc_multi_arg + i;
37547 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37548 (enum ix86_builtin_func_type)
37549 d->flag, d->comparison);
37552 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
37553 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
37555 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
37556 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
37557 target);
37560 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
37561 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
37563 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
37564 return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
37565 target);
37568 gcc_unreachable ();
37571 /* This returns the target-specific builtin with code CODE if
37572 current_function_decl has visibility on this builtin, which is checked
37573 using isa flags. Returns NULL_TREE otherwise. */
37575 static tree ix86_get_builtin (enum ix86_builtins code)
37577 struct cl_target_option *opts;
37578 tree target_tree = NULL_TREE;
37580 /* Determine the isa flags of current_function_decl. */
37582 if (current_function_decl)
37583 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37585 if (target_tree == NULL)
37586 target_tree = target_option_default_node;
37588 opts = TREE_TARGET_OPTION (target_tree);
37590 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37591 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
37592 return ix86_builtin_decl (code, true);
37593 else
37594 return NULL_TREE;
37597 /* Return the function decl for the target-specific builtin
37598 corresponding to the MPX builtin passed in FCODE. */
37599 static tree
37600 ix86_builtin_mpx_function (unsigned fcode)
37602 switch (fcode)
37604 case BUILT_IN_CHKP_BNDMK:
37605 return ix86_builtins[IX86_BUILTIN_BNDMK];
37607 case BUILT_IN_CHKP_BNDSTX:
37608 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37610 case BUILT_IN_CHKP_BNDLDX:
37611 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37613 case BUILT_IN_CHKP_BNDCL:
37614 return ix86_builtins[IX86_BUILTIN_BNDCL];
37616 case BUILT_IN_CHKP_BNDCU:
37617 return ix86_builtins[IX86_BUILTIN_BNDCU];
37619 case BUILT_IN_CHKP_BNDRET:
37620 return ix86_builtins[IX86_BUILTIN_BNDRET];
37622 case BUILT_IN_CHKP_INTERSECT:
37623 return ix86_builtins[IX86_BUILTIN_BNDINT];
37625 case BUILT_IN_CHKP_NARROW:
37626 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37628 case BUILT_IN_CHKP_SIZEOF:
37629 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37631 case BUILT_IN_CHKP_EXTRACT_LOWER:
37632 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37634 case BUILT_IN_CHKP_EXTRACT_UPPER:
37635 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37637 default:
37638 return NULL_TREE;
37641 gcc_unreachable ();
37644 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37646 Return an address to be used to load/store bounds for pointer
37647 passed in SLOT.
37649 SLOT_NO is an integer constant holding number of a target
37650 dependent special slot to be used in case SLOT is not a memory.
37652 SPECIAL_BASE is a pointer to be used as a base of fake address
37653 to access special slots in Bounds Table. SPECIAL_BASE[-1],
37654 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37656 static rtx
37657 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
37659 rtx addr = NULL;
37661 /* NULL slot means we pass bounds for pointer not passed to the
37662 function at all. Register slot means we pass pointer in a
37663 register. In both these cases bounds are passed via Bounds
37664 Table. Since we do not have actual pointer stored in memory,
37665 we have to use fake addresses to access Bounds Table. We
37666 start with (special_base - sizeof (void*)) and decrease this
37667 address by pointer size to get addresses for other slots. */
37668 if (!slot || REG_P (slot))
37670 gcc_assert (CONST_INT_P (slot_no));
37671 addr = plus_constant (Pmode, special_base,
37672 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
37674 /* If pointer is passed in a memory then its address is used to
37675 access Bounds Table. */
37676 else if (MEM_P (slot))
37678 addr = XEXP (slot, 0);
37679 if (!register_operand (addr, Pmode))
37680 addr = copy_addr_to_reg (addr);
37682 else
37683 gcc_unreachable ();
37685 return addr;
37688 /* Expand pass uses this hook to load bounds for function parameter
37689 PTR passed in SLOT in case its bounds are not passed in a register.
37691 If SLOT is a memory, then bounds are loaded as for regular pointer
37692 loaded from memory. PTR may be NULL in case SLOT is a memory.
37693 In such case value of PTR (if required) may be loaded from SLOT.
37695 If SLOT is NULL or a register then SLOT_NO is an integer constant
37696 holding number of the target dependent special slot which should be
37697 used to obtain bounds.
37699 Return loaded bounds. */
37701 static rtx
37702 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
37704 rtx reg = gen_reg_rtx (BNDmode);
37705 rtx addr;
37707 /* Get address to be used to access Bounds Table. Special slots start
37708 at the location of return address of the current function. */
37709 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
37711 /* Load pointer value from a memory if we don't have it. */
37712 if (!ptr)
37714 gcc_assert (MEM_P (slot));
37715 ptr = copy_addr_to_reg (slot);
37718 if (!register_operand (ptr, Pmode))
37719 ptr = ix86_zero_extend_to_Pmode (ptr);
37721 emit_insn (BNDmode == BND64mode
37722 ? gen_bnd64_ldx (reg, addr, ptr)
37723 : gen_bnd32_ldx (reg, addr, ptr));
37725 return reg;
37728 /* Expand pass uses this hook to store BOUNDS for call argument PTR
37729 passed in SLOT in case BOUNDS are not passed in a register.
37731 If SLOT is a memory, then BOUNDS are stored as for regular pointer
37732 stored in memory. PTR may be NULL in case SLOT is a memory.
37733 In such case value of PTR (if required) may be loaded from SLOT.
37735 If SLOT is NULL or a register then SLOT_NO is an integer constant
37736 holding number of the target dependent special slot which should be
37737 used to store BOUNDS. */
37739 static void
37740 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
37742 rtx addr;
37744 /* Get address to be used to access Bounds Table. Special slots start
37745 at the location of return address of a called function. */
37746 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
37748 /* Load pointer value from a memory if we don't have it. */
37749 if (!ptr)
37751 gcc_assert (MEM_P (slot));
37752 ptr = copy_addr_to_reg (slot);
37755 if (!register_operand (ptr, Pmode))
37756 ptr = ix86_zero_extend_to_Pmode (ptr);
37758 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
37759 if (!register_operand (bounds, BNDmode))
37760 bounds = copy_to_mode_reg (BNDmode, bounds);
37762 emit_insn (BNDmode == BND64mode
37763 ? gen_bnd64_stx (addr, ptr, bounds)
37764 : gen_bnd32_stx (addr, ptr, bounds));
37767 /* Load and return bounds returned by function in SLOT. */
37769 static rtx
37770 ix86_load_returned_bounds (rtx slot)
37772 rtx res;
37774 gcc_assert (REG_P (slot));
37775 res = gen_reg_rtx (BNDmode);
37776 emit_move_insn (res, slot);
37778 return res;
37781 /* Store BOUNDS returned by function into SLOT. */
37783 static void
37784 ix86_store_returned_bounds (rtx slot, rtx bounds)
37786 gcc_assert (REG_P (slot));
37787 emit_move_insn (slot, bounds);
37790 /* Returns a function decl for a vectorized version of the combined function
37791 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
37792 if it is not available. */
37794 static tree
37795 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
37796 tree type_in)
37798 machine_mode in_mode, out_mode;
37799 int in_n, out_n;
37801 if (TREE_CODE (type_out) != VECTOR_TYPE
37802 || TREE_CODE (type_in) != VECTOR_TYPE)
37803 return NULL_TREE;
37805 out_mode = TYPE_MODE (TREE_TYPE (type_out));
37806 out_n = TYPE_VECTOR_SUBPARTS (type_out);
37807 in_mode = TYPE_MODE (TREE_TYPE (type_in));
37808 in_n = TYPE_VECTOR_SUBPARTS (type_in);
37810 switch (fn)
37812 CASE_CFN_EXP2:
37813 if (out_mode == SFmode && in_mode == SFmode)
37815 if (out_n == 16 && in_n == 16)
37816 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
37818 break;
37820 CASE_CFN_IFLOOR:
37821 CASE_CFN_LFLOOR:
37822 CASE_CFN_LLFLOOR:
37823 /* The round insn does not trap on denormals. */
37824 if (flag_trapping_math || !TARGET_SSE4_1)
37825 break;
37827 if (out_mode == SImode && in_mode == DFmode)
37829 if (out_n == 4 && in_n == 2)
37830 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
37831 else if (out_n == 8 && in_n == 4)
37832 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
37833 else if (out_n == 16 && in_n == 8)
37834 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
37836 if (out_mode == SImode && in_mode == SFmode)
37838 if (out_n == 4 && in_n == 4)
37839 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
37840 else if (out_n == 8 && in_n == 8)
37841 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
37842 else if (out_n == 16 && in_n == 16)
37843 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
37845 break;
37847 CASE_CFN_ICEIL:
37848 CASE_CFN_LCEIL:
37849 CASE_CFN_LLCEIL:
37850 /* The round insn does not trap on denormals. */
37851 if (flag_trapping_math || !TARGET_SSE4_1)
37852 break;
37854 if (out_mode == SImode && in_mode == DFmode)
37856 if (out_n == 4 && in_n == 2)
37857 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
37858 else if (out_n == 8 && in_n == 4)
37859 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
37860 else if (out_n == 16 && in_n == 8)
37861 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
37863 if (out_mode == SImode && in_mode == SFmode)
37865 if (out_n == 4 && in_n == 4)
37866 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
37867 else if (out_n == 8 && in_n == 8)
37868 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
37869 else if (out_n == 16 && in_n == 16)
37870 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
37872 break;
37874 CASE_CFN_IRINT:
37875 CASE_CFN_LRINT:
37876 CASE_CFN_LLRINT:
37877 if (out_mode == SImode && in_mode == DFmode)
37879 if (out_n == 4 && in_n == 2)
37880 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
37881 else if (out_n == 8 && in_n == 4)
37882 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
37883 else if (out_n == 16 && in_n == 8)
37884 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
37886 if (out_mode == SImode && in_mode == SFmode)
37888 if (out_n == 4 && in_n == 4)
37889 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
37890 else if (out_n == 8 && in_n == 8)
37891 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
37892 else if (out_n == 16 && in_n == 16)
37893 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
37895 break;
37897 CASE_CFN_IROUND:
37898 CASE_CFN_LROUND:
37899 CASE_CFN_LLROUND:
37900 /* The round insn does not trap on denormals. */
37901 if (flag_trapping_math || !TARGET_SSE4_1)
37902 break;
37904 if (out_mode == SImode && in_mode == DFmode)
37906 if (out_n == 4 && in_n == 2)
37907 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
37908 else if (out_n == 8 && in_n == 4)
37909 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
37910 else if (out_n == 16 && in_n == 8)
37911 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
37913 if (out_mode == SImode && in_mode == SFmode)
37915 if (out_n == 4 && in_n == 4)
37916 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
37917 else if (out_n == 8 && in_n == 8)
37918 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
37919 else if (out_n == 16 && in_n == 16)
37920 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
37922 break;
37924 CASE_CFN_FLOOR:
37925 /* The round insn does not trap on denormals. */
37926 if (flag_trapping_math || !TARGET_SSE4_1)
37927 break;
37929 if (out_mode == DFmode && in_mode == DFmode)
37931 if (out_n == 2 && in_n == 2)
37932 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
37933 else if (out_n == 4 && in_n == 4)
37934 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
37935 else if (out_n == 8 && in_n == 8)
37936 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
37938 if (out_mode == SFmode && in_mode == SFmode)
37940 if (out_n == 4 && in_n == 4)
37941 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
37942 else if (out_n == 8 && in_n == 8)
37943 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
37944 else if (out_n == 16 && in_n == 16)
37945 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
37947 break;
37949 CASE_CFN_CEIL:
37950 /* The round insn does not trap on denormals. */
37951 if (flag_trapping_math || !TARGET_SSE4_1)
37952 break;
37954 if (out_mode == DFmode && in_mode == DFmode)
37956 if (out_n == 2 && in_n == 2)
37957 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
37958 else if (out_n == 4 && in_n == 4)
37959 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
37960 else if (out_n == 8 && in_n == 8)
37961 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
37963 if (out_mode == SFmode && in_mode == SFmode)
37965 if (out_n == 4 && in_n == 4)
37966 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
37967 else if (out_n == 8 && in_n == 8)
37968 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
37969 else if (out_n == 16 && in_n == 16)
37970 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
37972 break;
37974 CASE_CFN_TRUNC:
37975 /* The round insn does not trap on denormals. */
37976 if (flag_trapping_math || !TARGET_SSE4_1)
37977 break;
37979 if (out_mode == DFmode && in_mode == DFmode)
37981 if (out_n == 2 && in_n == 2)
37982 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
37983 else if (out_n == 4 && in_n == 4)
37984 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
37985 else if (out_n == 8 && in_n == 8)
37986 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
37988 if (out_mode == SFmode && in_mode == SFmode)
37990 if (out_n == 4 && in_n == 4)
37991 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
37992 else if (out_n == 8 && in_n == 8)
37993 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
37994 else if (out_n == 16 && in_n == 16)
37995 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
37997 break;
37999 CASE_CFN_RINT:
38000 /* The round insn does not trap on denormals. */
38001 if (flag_trapping_math || !TARGET_SSE4_1)
38002 break;
38004 if (out_mode == DFmode && in_mode == DFmode)
38006 if (out_n == 2 && in_n == 2)
38007 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38008 else if (out_n == 4 && in_n == 4)
38009 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38011 if (out_mode == SFmode && in_mode == SFmode)
38013 if (out_n == 4 && in_n == 4)
38014 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38015 else if (out_n == 8 && in_n == 8)
38016 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38018 break;
38020 CASE_CFN_FMA:
38021 if (out_mode == DFmode && in_mode == DFmode)
38023 if (out_n == 2 && in_n == 2)
38024 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38025 if (out_n == 4 && in_n == 4)
38026 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38028 if (out_mode == SFmode && in_mode == SFmode)
38030 if (out_n == 4 && in_n == 4)
38031 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38032 if (out_n == 8 && in_n == 8)
38033 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38035 break;
38037 default:
38038 break;
38041 /* Dispatch to a handler for a vectorization library. */
38042 if (ix86_veclib_handler)
38043 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38045 return NULL_TREE;
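/* Illustrative example (an assumption about typical use, not from the
   original sources): with -mavx2 -fno-trapping-math a loop such as

     for (int i = 0; i < n; i++)
       out[i] = floor (in[i]);

   makes the vectorizer query this hook with CFN_FLOOR and V4DF vector
   types, and the IX86_BUILTIN_FLOORPD256 decl returned above is what
   the vectorized loop ends up calling.  */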
38048 /* Handler for an SVML-style interface to
38049 a library with vectorized intrinsics. */
38051 static tree
38052 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38054 char name[20];
38055 tree fntype, new_fndecl, args;
38056 unsigned arity;
38057 const char *bname;
38058 machine_mode el_mode, in_mode;
38059 int n, in_n;
38061 /* The SVML is suitable for unsafe math only. */
38062 if (!flag_unsafe_math_optimizations)
38063 return NULL_TREE;
38065 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38066 n = TYPE_VECTOR_SUBPARTS (type_out);
38067 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38068 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38069 if (el_mode != in_mode
38070 || n != in_n)
38071 return NULL_TREE;
38073 switch (fn)
38075 CASE_CFN_EXP:
38076 CASE_CFN_LOG:
38077 CASE_CFN_LOG10:
38078 CASE_CFN_POW:
38079 CASE_CFN_TANH:
38080 CASE_CFN_TAN:
38081 CASE_CFN_ATAN:
38082 CASE_CFN_ATAN2:
38083 CASE_CFN_ATANH:
38084 CASE_CFN_CBRT:
38085 CASE_CFN_SINH:
38086 CASE_CFN_SIN:
38087 CASE_CFN_ASINH:
38088 CASE_CFN_ASIN:
38089 CASE_CFN_COSH:
38090 CASE_CFN_COS:
38091 CASE_CFN_ACOSH:
38092 CASE_CFN_ACOS:
38093 if ((el_mode != DFmode || n != 2)
38094 && (el_mode != SFmode || n != 4))
38095 return NULL_TREE;
38096 break;
38098 default:
38099 return NULL_TREE;
38102 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38103 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38105 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38106 strcpy (name, "vmlsLn4");
38107 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38108 strcpy (name, "vmldLn2");
38109 else if (n == 4)
38111 sprintf (name, "vmls%s", bname+10);
38112 name[strlen (name)-1] = '4';
38114 else
38115 sprintf (name, "vmld%s2", bname+10);
38117 /* Convert to uppercase. */
38118 name[4] &= ~0x20;
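/* Worked example: for a 4-wide vectorized sinf, BNAME is "__builtin_sinf",
so bname+10 is "sinf"; the sprintf and trailing-character overwrite above
yield "vmlssin4", and clearing bit 0x20 in name[4] uppercases the 's',
producing the SVML entry point "vmlsSin4" (the 2-wide double variant
becomes "vmldSin2"). */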
38120 arity = 0;
38121 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38122 arity++;
38124 if (arity == 1)
38125 fntype = build_function_type_list (type_out, type_in, NULL);
38126 else
38127 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38129 /* Build a function declaration for the vectorized function. */
38130 new_fndecl = build_decl (BUILTINS_LOCATION,
38131 FUNCTION_DECL, get_identifier (name), fntype);
38132 TREE_PUBLIC (new_fndecl) = 1;
38133 DECL_EXTERNAL (new_fndecl) = 1;
38134 DECL_IS_NOVOPS (new_fndecl) = 1;
38135 TREE_READONLY (new_fndecl) = 1;
38137 return new_fndecl;
38140 /* Handler for an ACML-style interface to
38141 a library with vectorized intrinsics. */
38143 static tree
38144 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38146 char name[20] = "__vr.._";
38147 tree fntype, new_fndecl, args;
38148 unsigned arity;
38149 const char *bname;
38150 machine_mode el_mode, in_mode;
38151 int n, in_n;
38153 /* The ACML is 64-bit only and suitable for unsafe math only, as
38154 it does not correctly support parts of IEEE arithmetic with the required
38155 precision, such as denormals. */
38156 if (!TARGET_64BIT
38157 || !flag_unsafe_math_optimizations)
38158 return NULL_TREE;
38160 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38161 n = TYPE_VECTOR_SUBPARTS (type_out);
38162 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38163 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38164 if (el_mode != in_mode
38165 || n != in_n)
38166 return NULL_TREE;
38168 switch (fn)
38170 CASE_CFN_SIN:
38171 CASE_CFN_COS:
38172 CASE_CFN_EXP:
38173 CASE_CFN_LOG:
38174 CASE_CFN_LOG2:
38175 CASE_CFN_LOG10:
38176 if (el_mode == DFmode && n == 2)
38178 name[4] = 'd';
38179 name[5] = '2';
38181 else if (el_mode == SFmode && n == 4)
38183 name[4] = 's';
38184 name[5] = '4';
38186 else
38187 return NULL_TREE;
38188 break;
38190 default:
38191 return NULL_TREE;
38194 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38195 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38196 sprintf (name + 7, "%s", bname+10);
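/* Worked example: for a 2-wide double sin the template becomes "__vrd2_"
and bname+10 appends "sin", giving the ACML entry point "__vrd2_sin";
the 4-wide float variant is "__vrs4_sinf". */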
38198 arity = 0;
38199 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38200 arity++;
38202 if (arity == 1)
38203 fntype = build_function_type_list (type_out, type_in, NULL);
38204 else
38205 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38207 /* Build a function declaration for the vectorized function. */
38208 new_fndecl = build_decl (BUILTINS_LOCATION,
38209 FUNCTION_DECL, get_identifier (name), fntype);
38210 TREE_PUBLIC (new_fndecl) = 1;
38211 DECL_EXTERNAL (new_fndecl) = 1;
38212 DECL_IS_NOVOPS (new_fndecl) = 1;
38213 TREE_READONLY (new_fndecl) = 1;
38215 return new_fndecl;
38218 /* Returns a decl of a function that implements gather load with
38219 memory type MEM_VECTYPE and index type INDEX_TYPE and scale SCALE.
38220 Return NULL_TREE if it is not available. */
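/* For example, a V8SF gather with SImode indices maps to
IX86_BUILTIN_GATHERSIV8SF (ultimately a vgatherdps pattern), or to its
AVX512VL GATHER3 counterpart; the ALT variants below cover the cases where
the index vector has a different element count than the data vector. */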
38222 static tree
38223 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38224 const_tree index_type, int scale)
38226 bool si;
38227 enum ix86_builtins code;
38229 if (! TARGET_AVX2)
38230 return NULL_TREE;
38232 if ((TREE_CODE (index_type) != INTEGER_TYPE
38233 && !POINTER_TYPE_P (index_type))
38234 || (TYPE_MODE (index_type) != SImode
38235 && TYPE_MODE (index_type) != DImode))
38236 return NULL_TREE;
38238 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38239 return NULL_TREE;
38241 /* v*gather* insn sign extends index to pointer mode. */
38242 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38243 && TYPE_UNSIGNED (index_type))
38244 return NULL_TREE;
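/* Scale must be a power of two between 1 and 8; a power of two has a single
bit set, so (scale & (scale - 1)) is zero exactly for 1, 2, 4 and 8. */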
38246 if (scale <= 0
38247 || scale > 8
38248 || (scale & (scale - 1)) != 0)
38249 return NULL_TREE;
38251 si = TYPE_MODE (index_type) == SImode;
38252 switch (TYPE_MODE (mem_vectype))
38254 case E_V2DFmode:
38255 if (TARGET_AVX512VL)
38256 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38257 else
38258 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38259 break;
38260 case E_V4DFmode:
38261 if (TARGET_AVX512VL)
38262 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38263 else
38264 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38265 break;
38266 case E_V2DImode:
38267 if (TARGET_AVX512VL)
38268 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38269 else
38270 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38271 break;
38272 case E_V4DImode:
38273 if (TARGET_AVX512VL)
38274 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38275 else
38276 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38277 break;
38278 case E_V4SFmode:
38279 if (TARGET_AVX512VL)
38280 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38281 else
38282 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38283 break;
38284 case E_V8SFmode:
38285 if (TARGET_AVX512VL)
38286 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38287 else
38288 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38289 break;
38290 case E_V4SImode:
38291 if (TARGET_AVX512VL)
38292 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38293 else
38294 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38295 break;
38296 case E_V8SImode:
38297 if (TARGET_AVX512VL)
38298 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38299 else
38300 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38301 break;
38302 case E_V8DFmode:
38303 if (TARGET_AVX512F)
38304 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38305 else
38306 return NULL_TREE;
38307 break;
38308 case E_V8DImode:
38309 if (TARGET_AVX512F)
38310 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38311 else
38312 return NULL_TREE;
38313 break;
38314 case E_V16SFmode:
38315 if (TARGET_AVX512F)
38316 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38317 else
38318 return NULL_TREE;
38319 break;
38320 case E_V16SImode:
38321 if (TARGET_AVX512F)
38322 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38323 else
38324 return NULL_TREE;
38325 break;
38326 default:
38327 return NULL_TREE;
38330 return ix86_get_builtin (code);
38333 /* Returns a decl of a function that implements scatter store with
38334 register type VECTYPE and index type INDEX_TYPE and SCALE.
38335 Return NULL_TREE if it is not available. */
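/* For example, a V16SF scatter with SImode indices maps to
IX86_BUILTIN_SCATTERSIV16SF; the ALT variants cover index vectors whose
element count differs from the data vector's. */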
38337 static tree
38338 ix86_vectorize_builtin_scatter (const_tree vectype,
38339 const_tree index_type, int scale)
38341 bool si;
38342 enum ix86_builtins code;
38344 if (!TARGET_AVX512F)
38345 return NULL_TREE;
38347 if ((TREE_CODE (index_type) != INTEGER_TYPE
38348 && !POINTER_TYPE_P (index_type))
38349 || (TYPE_MODE (index_type) != SImode
38350 && TYPE_MODE (index_type) != DImode))
38351 return NULL_TREE;
38353 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38354 return NULL_TREE;
38356 /* v*scatter* insn sign extends index to pointer mode. */
38357 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38358 && TYPE_UNSIGNED (index_type))
38359 return NULL_TREE;
38361 /* Scale can be 1, 2, 4 or 8. */
38362 if (scale <= 0
38363 || scale > 8
38364 || (scale & (scale - 1)) != 0)
38365 return NULL_TREE;
38367 si = TYPE_MODE (index_type) == SImode;
38368 switch (TYPE_MODE (vectype))
38370 case E_V8DFmode:
38371 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38372 break;
38373 case E_V8DImode:
38374 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38375 break;
38376 case E_V16SFmode:
38377 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38378 break;
38379 case E_V16SImode:
38380 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38381 break;
38382 default:
38383 return NULL_TREE;
38386 return ix86_builtins[code];
38389 /* Return true if it is safe to use the rsqrt optabs to optimize
38390 1.0/sqrt. */
38392 static bool
38393 use_rsqrt_p ()
38395 return (TARGET_SSE_MATH
38396 && flag_finite_math_only
38397 && !flag_trapping_math
38398 && flag_unsafe_math_optimizations);
38401 /* Returns a decl of a target-specific builtin that implements
38402 the reciprocal of the function, or NULL_TREE if not available. */
38404 static tree
38405 ix86_builtin_reciprocal (tree fndecl)
38407 switch (DECL_FUNCTION_CODE (fndecl))
38409 /* Vectorized version of sqrt to rsqrt conversion. */
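/* The _NR suffix denotes the variants refined with a Newton-Raphson step,
so the rsqrt approximation is accurate enough to stand in for 1.0/sqrt
under unsafe-math flags. */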
38410 case IX86_BUILTIN_SQRTPS_NR:
38411 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38413 case IX86_BUILTIN_SQRTPS_NR256:
38414 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38416 default:
38417 return NULL_TREE;
38421 /* Helper for avx_vpermilps256_operand et al. This is also used by
38422 the expansion functions to turn the parallel back into a mask.
38423 The return value is 0 for no match and the imm8+1 for a match. */
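/* For example, for V4SFmode the selection (parallel [2 3 0 1]) packs each
element index into a two-bit field, giving mask 0x4e
(2 | 3<<2 | 0<<4 | 1<<6) and a return value of 0x4f. */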
38425 int
38426 avx_vpermilp_parallel (rtx par, machine_mode mode)
38428 unsigned i, nelt = GET_MODE_NUNITS (mode);
38429 unsigned mask = 0;
38430 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38432 if (XVECLEN (par, 0) != (int) nelt)
38433 return 0;
38435 /* Validate that all of the elements are constants, and not totally
38436 out of range. Copy the data into an integral array to make the
38437 subsequent checks easier. */
38438 for (i = 0; i < nelt; ++i)
38440 rtx er = XVECEXP (par, 0, i);
38441 unsigned HOST_WIDE_INT ei;
38443 if (!CONST_INT_P (er))
38444 return 0;
38445 ei = INTVAL (er);
38446 if (ei >= nelt)
38447 return 0;
38448 ipar[i] = ei;
38451 switch (mode)
38453 case E_V8DFmode:
38454 /* In the 512-bit DFmode case, we can only move elements within
38455 a 128-bit lane. First fill the second part of the mask,
38456 then fallthru. */
38457 for (i = 4; i < 6; ++i)
38459 if (ipar[i] < 4 || ipar[i] >= 6)
38460 return 0;
38461 mask |= (ipar[i] - 4) << i;
38463 for (i = 6; i < 8; ++i)
38465 if (ipar[i] < 6)
38466 return 0;
38467 mask |= (ipar[i] - 6) << i;
38469 /* FALLTHRU */
38471 case E_V4DFmode:
38472 /* In the 256-bit DFmode case, we can only move elements within
38473 a 128-bit lane. */
38474 for (i = 0; i < 2; ++i)
38476 if (ipar[i] >= 2)
38477 return 0;
38478 mask |= ipar[i] << i;
38480 for (i = 2; i < 4; ++i)
38482 if (ipar[i] < 2)
38483 return 0;
38484 mask |= (ipar[i] - 2) << i;
38486 break;
38488 case E_V16SFmode:
38489 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
38490 must mirror the permutation in the lower 256 bits. */
38491 for (i = 0; i < 8; ++i)
38492 if (ipar[i] + 8 != ipar[i + 8])
38493 return 0;
38494 /* FALLTHRU */
38496 case E_V8SFmode:
38497 /* In the 256-bit SFmode case, we have full freedom of
38498 movement within the low 128-bit lane, but the high 128-bit
38499 lane must mirror the exact same pattern. */
38500 for (i = 0; i < 4; ++i)
38501 if (ipar[i] + 4 != ipar[i + 4])
38502 return 0;
38503 nelt = 4;
38504 /* FALLTHRU */
38506 case E_V2DFmode:
38507 case E_V4SFmode:
38508 /* In the 128-bit case, we've full freedom in the placement of
38509 the elements from the source operand. */
38510 for (i = 0; i < nelt; ++i)
38511 mask |= ipar[i] << (i * (nelt / 2));
38512 break;
38514 default:
38515 gcc_unreachable ();
38518 /* Make sure success has a non-zero value by adding one. */
38519 return mask + 1;
38522 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38523 the expansion functions to turn the parallel back into a mask.
38524 The return value is 0 for no match and the imm8+1 for a match. */
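/* For example, for V8SFmode the selection (parallel [0 1 2 3 8 9 10 11])
-- the low 128-bit lane of each source operand -- reconstructs to
mask 0x20 and therefore a return value of 0x21. */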
38526 int
38527 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38529 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38530 unsigned mask = 0;
38531 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38533 if (XVECLEN (par, 0) != (int) nelt)
38534 return 0;
38536 /* Validate that all of the elements are constants, and not totally
38537 out of range. Copy the data into an integral array to make the
38538 subsequent checks easier. */
38539 for (i = 0; i < nelt; ++i)
38541 rtx er = XVECEXP (par, 0, i);
38542 unsigned HOST_WIDE_INT ei;
38544 if (!CONST_INT_P (er))
38545 return 0;
38546 ei = INTVAL (er);
38547 if (ei >= 2 * nelt)
38548 return 0;
38549 ipar[i] = ei;
38552 /* Validate that each half of the permute selects consecutive elements; lane alignment of each half is checked below when the mask is reconstructed. */
38553 for (i = 0; i < nelt2 - 1; ++i)
38554 if (ipar[i] + 1 != ipar[i + 1])
38555 return 0;
38556 for (i = nelt2; i < nelt - 1; ++i)
38557 if (ipar[i] + 1 != ipar[i + 1])
38558 return 0;
38560 /* Reconstruct the mask. */
38561 for (i = 0; i < 2; ++i)
38563 unsigned e = ipar[i * nelt2];
38564 if (e % nelt2)
38565 return 0;
38566 e /= nelt2;
38567 mask |= e << (i * 4);
38570 /* Make sure success has a non-zero value by adding one. */
38571 return mask + 1;
38574 /* Return a register priority for hard reg REGNO. */
38575 static int
38576 ix86_register_priority (int hard_regno)
38578 /* ebp and r13 as the base always want a displacement, and r12 as the
38579 base always wants an index. So discourage their use in an
38580 address. */
38581 if (hard_regno == R12_REG || hard_regno == R13_REG)
38582 return 0;
38583 if (hard_regno == BP_REG)
38584 return 1;
38585 /* New x86-64 int registers result in bigger code size. Discourage
38586 them. */
38587 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38588 return 2;
38589 /* New x86-64 SSE registers result in bigger code size. Discourage
38590 them. */
38591 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38592 return 2;
38593 /* Usage of AX register results in smaller code. Prefer it. */
38594 if (hard_regno == AX_REG)
38595 return 4;
38596 return 3;
38599 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38601 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38602 QImode must go into class Q_REGS.
38603 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38604 movdf to do mem-to-mem moves through integer regs. */
38606 static reg_class_t
38607 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38609 machine_mode mode = GET_MODE (x);
38611 /* We're only allowed to return a subclass of CLASS. Many of the
38612 following checks fail for NO_REGS, so eliminate that early. */
38613 if (regclass == NO_REGS)
38614 return NO_REGS;
38616 /* All classes can load zeros. */
38617 if (x == CONST0_RTX (mode))
38618 return regclass;
38620 /* Force constants into memory if we are loading a (nonzero) constant into
38621 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38622 instructions to load from a constant. */
38623 if (CONSTANT_P (x)
38624 && (MAYBE_MMX_CLASS_P (regclass)
38625 || MAYBE_SSE_CLASS_P (regclass)
38626 || MAYBE_MASK_CLASS_P (regclass)))
38627 return NO_REGS;
38629 /* Floating-point constants need more complex checks. */
38630 if (CONST_DOUBLE_P (x))
38632 /* General regs can load everything. */
38633 if (INTEGER_CLASS_P (regclass))
38634 return regclass;
38636 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38637 zero above. We only want to wind up preferring 80387 registers if
38638 we plan on doing computation with them. */
38639 if (IS_STACK_MODE (mode)
38640 && standard_80387_constant_p (x) > 0)
38642 /* Limit class to FP regs. */
38643 if (FLOAT_CLASS_P (regclass))
38644 return FLOAT_REGS;
38645 else if (regclass == FP_TOP_SSE_REGS)
38646 return FP_TOP_REG;
38647 else if (regclass == FP_SECOND_SSE_REGS)
38648 return FP_SECOND_REG;
38651 return NO_REGS;
38654 /* Prefer SSE regs only, if we can use them for math. */
38655 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38656 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
38658 /* Generally when we see PLUS here, it's the function invariant
38659 (plus soft-fp const_int). Which can only be computed into general
38660 regs. */
38661 if (GET_CODE (x) == PLUS)
38662 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
38664 /* QImode constants are easy to load, but non-constant QImode data
38665 must go into Q_REGS. */
38666 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
38668 if (Q_CLASS_P (regclass))
38669 return regclass;
38670 else if (reg_class_subset_p (Q_REGS, regclass))
38671 return Q_REGS;
38672 else
38673 return NO_REGS;
38676 return regclass;
38679 /* Discourage putting floating-point values in SSE registers unless
38680 SSE math is being used, and likewise for the 387 registers. */
38681 static reg_class_t
38682 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
38684 machine_mode mode = GET_MODE (x);
38686 /* Restrict the output reload class to the register bank that we are doing
38687 math on. If we would like not to return a subset of CLASS, reject this
38688 alternative: if reload cannot do this, it will still use its choice. */
38689 mode = GET_MODE (x);
38690 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38691 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
38693 if (IS_STACK_MODE (mode))
38695 if (regclass == FP_TOP_SSE_REGS)
38696 return FP_TOP_REG;
38697 else if (regclass == FP_SECOND_SSE_REGS)
38698 return FP_SECOND_REG;
38699 else
38700 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
38703 return regclass;
38706 static reg_class_t
38707 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
38708 machine_mode mode, secondary_reload_info *sri)
38710 /* Double-word spills from general registers to non-offsettable memory
38711 references (zero-extended addresses) require special handling. */
38712 if (TARGET_64BIT
38713 && MEM_P (x)
38714 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
38715 && INTEGER_CLASS_P (rclass)
38716 && !offsettable_memref_p (x))
38718 sri->icode = (in_p
38719 ? CODE_FOR_reload_noff_load
38720 : CODE_FOR_reload_noff_store);
38721 /* Add the cost of moving address to a temporary. */
38722 sri->extra_cost = 1;
38724 return NO_REGS;
38727 /* QImode spills from non-QI registers require
38728 an intermediate register on 32-bit targets. */
38729 if (mode == QImode
38730 && ((!TARGET_64BIT && !in_p
38731 && INTEGER_CLASS_P (rclass)
38732 && MAYBE_NON_Q_CLASS_P (rclass))
38733 || (!TARGET_AVX512DQ
38734 && MAYBE_MASK_CLASS_P (rclass))))
38736 int regno = true_regnum (x);
38738 /* Return Q_REGS if the operand is in memory. */
38739 if (regno == -1)
38740 return Q_REGS;
38742 return NO_REGS;
38745 /* This condition handles the corner case where an expression involving
38746 pointers gets vectorized. We're trying to use the address of a
38747 stack slot as a vector initializer.
38749 (set (reg:V2DI 74 [ vect_cst_.2 ])
38750 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
38752 Eventually frame gets turned into sp+offset like this:
38754 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38755 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38756 (const_int 392 [0x188]))))
38758 That later gets turned into:
38760 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38761 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38762 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
38764 We'll have the following reload recorded:
38766 Reload 0: reload_in (DI) =
38767 (plus:DI (reg/f:DI 7 sp)
38768 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
38769 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38770 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
38771 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
38772 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38773 reload_reg_rtx: (reg:V2DI 22 xmm1)
38775 That isn't going to work since SSE instructions can't handle scalar
38776 additions. Returning GENERAL_REGS forces the addition into an integer
38777 register, and reload can handle the subsequent reloads without problems. */
38779 if (in_p && GET_CODE (x) == PLUS
38780 && SSE_CLASS_P (rclass)
38781 && SCALAR_INT_MODE_P (mode))
38782 return GENERAL_REGS;
38784 return NO_REGS;
38787 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
38789 static bool
38790 ix86_class_likely_spilled_p (reg_class_t rclass)
38792 switch (rclass)
38794 case AREG:
38795 case DREG:
38796 case CREG:
38797 case BREG:
38798 case AD_REGS:
38799 case SIREG:
38800 case DIREG:
38801 case SSE_FIRST_REG:
38802 case FP_TOP_REG:
38803 case FP_SECOND_REG:
38804 case BND_REGS:
38805 return true;
38807 default:
38808 break;
38811 return false;
38814 /* If we are copying between registers from different register sets
38815 (e.g. FP and integer), we may need a memory location.
38817 The function can't work reliably when one of the CLASSES is a class
38818 containing registers from multiple sets. We avoid this by never combining
38819 different sets in a single alternative in the machine description.
38820 Ensure that this constraint holds to avoid unexpected surprises.
38822 When STRICT is false, we are being called from REGISTER_MOVE_COST,
38823 so do not enforce these sanity checks.
38825 To optimize register_move_cost performance, define inline variant. */
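/* For example, on a 32-bit target a DImode copy between SSE_REGS and
GENERAL_REGS needs secondary memory: direct SSE<->integer moves are
limited to the word size, so the value has to go through a stack slot. */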
38827 static inline bool
38828 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38829 reg_class_t class2, int strict)
38831 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
38832 return false;
38834 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
38835 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
38836 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
38837 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
38838 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
38839 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
38840 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
38841 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
38843 gcc_assert (!strict || lra_in_progress);
38844 return true;
38847 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
38848 return true;
38850 /* Between mask and general, we have moves no larger than word size. */
38851 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
38852 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
38853 return true;
38855 /* ??? This is a lie. We do have moves between mmx/general, and between
38856 mmx/sse2. But by saying we need secondary memory we discourage the
38857 register allocator from using the mmx registers unless needed. */
38858 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
38859 return true;
38861 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
38863 /* SSE1 doesn't have any direct moves from other classes. */
38864 if (!TARGET_SSE2)
38865 return true;
38867 /* If the target says that inter-unit moves are more expensive
38868 than moving through memory, then don't generate them. */
38869 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
38870 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
38871 return true;
38873 /* Between SSE and general, we have moves no larger than word size. */
38874 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
38875 return true;
38878 return false;
38881 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
38883 static bool
38884 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38885 reg_class_t class2)
38887 return inline_secondary_memory_needed (mode, class1, class2, true);
38890 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
38892 get_secondary_mem widens integral modes to BITS_PER_WORD.
38893 There is no need to emit a full 64-bit move on 64-bit targets
38894 for integral modes that can be moved using a 32-bit move. */
38896 static machine_mode
38897 ix86_secondary_memory_needed_mode (machine_mode mode)
38899 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
38900 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
38901 return mode;
38904 /* Implement the TARGET_CLASS_MAX_NREGS hook.
38906 On the 80386, this is the size of MODE in words,
38907 except in the FP regs, where a single reg is always enough. */
38909 static unsigned char
38910 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
38912 if (MAYBE_INTEGER_CLASS_P (rclass))
38914 if (mode == XFmode)
38915 return (TARGET_64BIT ? 2 : 3);
38916 else if (mode == XCmode)
38917 return (TARGET_64BIT ? 4 : 6);
38918 else
38919 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
38921 else
38923 if (COMPLEX_MODE_P (mode))
38924 return 2;
38925 else
38926 return 1;
38930 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
38932 static bool
38933 ix86_can_change_mode_class (machine_mode from, machine_mode to,
38934 reg_class_t regclass)
38936 if (from == to)
38937 return true;
38939 /* x87 registers can't do subreg at all, as all values are reformatted
38940 to extended precision. */
38941 if (MAYBE_FLOAT_CLASS_P (regclass))
38942 return false;
38944 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
38946 /* Vector registers do not support QI or HImode loads. If we don't
38947 disallow a change to these modes, reload will assume it's ok to
38948 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
38949 the vec_dupv4hi pattern. */
38950 if (GET_MODE_SIZE (from) < 4)
38951 return false;
38954 return true;
38957 /* Return index of MODE in the sse load/store tables. */
38959 static inline int
38960 sse_store_index (machine_mode mode)
38962 switch (GET_MODE_SIZE (mode))
38964 case 4:
38965 return 0;
38966 case 8:
38967 return 1;
38968 case 16:
38969 return 2;
38970 case 32:
38971 return 3;
38972 case 64:
38973 return 4;
38974 default:
38975 return -1;
38979 /* Return the cost of moving data of mode M between a
38980 register and memory. A value of 2 is the default; this cost is
38981 relative to those in `REGISTER_MOVE_COST'.
38983 This function is used extensively by register_move_cost, which is used to
38984 build tables at startup; make it inline in this case.
38985 When IN is 2, return the maximum of the in and out move costs.
38987 If moving between registers and memory is more expensive than
38988 between two registers, you should define this macro to express the
38989 relative cost.
38991 Also model the increased cost of moving QImode registers in
38992 non-Q_REGS classes.
38994 static inline int
38995 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
38996 int in)
38998 int cost;
38999 if (FLOAT_CLASS_P (regclass))
39001 int index;
39002 switch (mode)
39004 case E_SFmode:
39005 index = 0;
39006 break;
39007 case E_DFmode:
39008 index = 1;
39009 break;
39010 case E_XFmode:
39011 index = 2;
39012 break;
39013 default:
39014 return 100;
39016 if (in == 2)
39017 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39018 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39020 if (SSE_CLASS_P (regclass))
39022 int index = sse_store_index (mode);
39023 if (index == -1)
39024 return 100;
39025 if (in == 2)
39026 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39027 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39029 if (MMX_CLASS_P (regclass))
39031 int index;
39032 switch (GET_MODE_SIZE (mode))
39034 case 4:
39035 index = 0;
39036 break;
39037 case 8:
39038 index = 1;
39039 break;
39040 default:
39041 return 100;
39043 if (in == 2)
39044 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39045 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39047 switch (GET_MODE_SIZE (mode))
39049 case 1:
39050 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39052 if (!in)
39053 return ix86_cost->int_store[0];
39054 if (TARGET_PARTIAL_REG_DEPENDENCY
39055 && optimize_function_for_speed_p (cfun))
39056 cost = ix86_cost->movzbl_load;
39057 else
39058 cost = ix86_cost->int_load[0];
39059 if (in == 2)
39060 return MAX (cost, ix86_cost->int_store[0]);
39061 return cost;
39063 else
39065 if (in == 2)
39066 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39067 if (in)
39068 return ix86_cost->movzbl_load;
39069 else
39070 return ix86_cost->int_store[0] + 4;
39072 break;
39073 case 2:
39074 if (in == 2)
39075 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39076 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39077 default:
39078 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
39079 if (mode == TFmode)
39080 mode = XFmode;
39081 if (in == 2)
39082 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39083 else if (in)
39084 cost = ix86_cost->int_load[2];
39085 else
39086 cost = ix86_cost->int_store[2];
39087 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
39091 static int
39092 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39093 bool in)
39095 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39099 /* Return the cost of moving data from a register in class CLASS1 to
39100 one in class CLASS2.
39102 It is not required that the cost always equal 2 when FROM is the same as TO;
39103 on some machines it is expensive to move between registers if they are not
39104 general registers. */
39106 static int
39107 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39108 reg_class_t class2_i)
39110 enum reg_class class1 = (enum reg_class) class1_i;
39111 enum reg_class class2 = (enum reg_class) class2_i;
39113 /* In case we require secondary memory, compute cost of the store followed
39114 by load. In order to avoid bad register allocation choices, we need
39115 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39117 if (inline_secondary_memory_needed (mode, class1, class2, false))
39119 int cost = 1;
39121 cost += inline_memory_move_cost (mode, class1, 2);
39122 cost += inline_memory_move_cost (mode, class2, 2);
39124 /* In the case of copying from a general purpose register we may emit
39125 multiple stores followed by a single load, causing a memory size
39126 mismatch stall. Count this as an arbitrarily high cost of 20. */
39127 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
39128 && TARGET_MEMORY_MISMATCH_STALL
39129 && targetm.class_max_nregs (class1, mode)
39130 > targetm.class_max_nregs (class2, mode))
39131 cost += 20;
39133 /* In the case of FP/MMX moves, the registers actually overlap, and we
39134 have to switch modes in order to treat them differently. */
39135 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39136 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39137 cost += 20;
39139 return cost;
39142 /* Moves between SSE/MMX and integer unit are expensive. */
39143 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39144 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39146 /* ??? By keeping the returned value relatively high, we limit the number
39147 of moves between integer and MMX/SSE registers for all targets.
39148 Additionally, the high value prevents a problem with ix86_modes_tieable_p (),
39149 where integer modes in MMX/SSE registers are not tieable
39150 because of missing QImode and HImode moves to, from or between
39151 MMX/SSE registers. */
39152 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
39153 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
39155 if (MAYBE_FLOAT_CLASS_P (class1))
39156 return ix86_cost->fp_move;
39157 if (MAYBE_SSE_CLASS_P (class1))
39159 if (GET_MODE_BITSIZE (mode) <= 128)
39160 return ix86_cost->xmm_move;
39161 if (GET_MODE_BITSIZE (mode) <= 256)
39162 return ix86_cost->ymm_move;
39163 return ix86_cost->zmm_move;
39165 if (MAYBE_MMX_CLASS_P (class1))
39166 return ix86_cost->mmx_move;
39167 return 2;
39170 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
39171 words of a value of mode MODE but can be less for certain modes in
39172 special long registers.
39174 Actually there are no two word move instructions for consecutive
39175 registers. And only registers 0-3 may have mov byte instructions
39176 applied to them. */
39178 static unsigned int
39179 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
39181 if (GENERAL_REGNO_P (regno))
39183 if (mode == XFmode)
39184 return TARGET_64BIT ? 2 : 3;
39185 if (mode == XCmode)
39186 return TARGET_64BIT ? 4 : 6;
39187 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39189 if (COMPLEX_MODE_P (mode))
39190 return 2;
39191 if (mode == V64SFmode || mode == V64SImode)
39192 return 4;
39193 return 1;
39196 /* Implement TARGET_HARD_REGNO_MODE_OK. */
39198 static bool
39199 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
39201 /* The flags registers (and only they) can hold CCmode values, and they can hold nothing else. */
39202 if (CC_REGNO_P (regno))
39203 return GET_MODE_CLASS (mode) == MODE_CC;
39204 if (GET_MODE_CLASS (mode) == MODE_CC
39205 || GET_MODE_CLASS (mode) == MODE_RANDOM
39206 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39207 return false;
39208 if (STACK_REGNO_P (regno))
39209 return VALID_FP_MODE_P (mode);
39210 if (MASK_REGNO_P (regno))
39211 return (VALID_MASK_REG_MODE (mode)
39212 || (TARGET_AVX512BW
39213 && VALID_MASK_AVX512BW_MODE (mode)));
39214 if (BND_REGNO_P (regno))
39215 return VALID_BND_REG_MODE (mode);
39216 if (SSE_REGNO_P (regno))
39218 /* We implement the move patterns for all vector modes into and
39219 out of SSE registers, even when no operation instructions
39220 are available. */
39222 /* For AVX-512 we allow, regardless of regno:
39223 - XI mode
39224 - any of 512-bit wide vector mode
39225 - any scalar mode. */
39226 if (TARGET_AVX512F
39227 && (mode == XImode
39228 || VALID_AVX512F_REG_MODE (mode)
39229 || VALID_AVX512F_SCALAR_MODE (mode)))
39230 return true;
39232 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
39233 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39234 && MOD4_SSE_REGNO_P (regno)
39235 && mode == V64SFmode)
39236 return true;
39238 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
39239 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39240 && MOD4_SSE_REGNO_P (regno)
39241 && mode == V64SImode)
39242 return true;
39244 /* TODO check for QI/HI scalars. */
39245 /* AVX512VL allows SSE registers 16+ for 128/256-bit modes. */
39246 if (TARGET_AVX512VL
39247 && (mode == OImode
39248 || mode == TImode
39249 || VALID_AVX256_REG_MODE (mode)
39250 || VALID_AVX512VL_128_REG_MODE (mode)))
39251 return true;
39253 /* xmm16-xmm31 are only available for AVX-512. */
39254 if (EXT_REX_SSE_REGNO_P (regno))
39255 return false;
39257 /* OImode and AVX modes are available only when AVX is enabled. */
39258 return ((TARGET_AVX
39259 && VALID_AVX256_REG_OR_OI_MODE (mode))
39260 || VALID_SSE_REG_MODE (mode)
39261 || VALID_SSE2_REG_MODE (mode)
39262 || VALID_MMX_REG_MODE (mode)
39263 || VALID_MMX_REG_MODE_3DNOW (mode));
39265 if (MMX_REGNO_P (regno))
39267 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39268 so if the register is available at all, then we can move data of
39269 the given mode into or out of it. */
39270 return (VALID_MMX_REG_MODE (mode)
39271 || VALID_MMX_REG_MODE_3DNOW (mode));
39274 if (mode == QImode)
39276 /* Take care for QImode values - they can be in non-QI regs,
39277 but then they do cause partial register stalls. */
39278 if (ANY_QI_REGNO_P (regno))
39279 return true;
39280 if (!TARGET_PARTIAL_REG_STALL)
39281 return true;
39282 /* LRA checks if the hard register is OK for the given mode.
39283 QImode values can live in non-QI regs, so we allow all
39284 registers here. */
39285 if (lra_in_progress)
39286 return true;
39287 return !can_create_pseudo_p ();
39289 /* We handle both integer and floats in the general purpose registers. */
39290 else if (VALID_INT_MODE_P (mode))
39291 return true;
39292 else if (VALID_FP_MODE_P (mode))
39293 return true;
39294 else if (VALID_DFP_MODE_P (mode))
39295 return true;
39296 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39297 on to use that value in smaller contexts, this can easily force a
39298 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39299 supporting DImode, allow it. */
39300 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39301 return true;
39303 return false;
39306 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
39307 saves SSE registers across calls is Win64 (thus no need to check the
39308 current ABI here), and with AVX enabled Win64 only guarantees that
39309 the low 16 bytes are saved. */
39311 static bool
39312 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
39314 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
39317 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39318 tieable integer mode. */
39320 static bool
39321 ix86_tieable_integer_mode_p (machine_mode mode)
39323 switch (mode)
39325 case E_HImode:
39326 case E_SImode:
39327 return true;
39329 case E_QImode:
39330 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39332 case E_DImode:
39333 return TARGET_64BIT;
39335 default:
39336 return false;
39340 /* Implement TARGET_MODES_TIEABLE_P.
39342 Return true if MODE1 is accessible in a register that can hold MODE2
39343 without copying. That is, all register classes that can hold MODE2
39344 can also hold MODE1. */
39346 static bool
39347 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39349 if (mode1 == mode2)
39350 return true;
39352 if (ix86_tieable_integer_mode_p (mode1)
39353 && ix86_tieable_integer_mode_p (mode2))
39354 return true;
39356 /* MODE2 being XFmode implies fp stack or general regs, which means we
39357 can tie any smaller floating point modes to it. Note that we do not
39358 tie this with TFmode. */
39359 if (mode2 == XFmode)
39360 return mode1 == SFmode || mode1 == DFmode;
39362 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39363 that we can tie it with SFmode. */
39364 if (mode2 == DFmode)
39365 return mode1 == SFmode;
39367 /* If MODE2 is only appropriate for an SSE register, then tie with
39368 any other mode acceptable to SSE registers. */
39369 if (GET_MODE_SIZE (mode2) == 32
39370 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39371 return (GET_MODE_SIZE (mode1) == 32
39372 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39373 if (GET_MODE_SIZE (mode2) == 16
39374 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39375 return (GET_MODE_SIZE (mode1) == 16
39376 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39378 /* If MODE2 is appropriate for an MMX register, then tie
39379 with any other mode acceptable to MMX registers. */
39380 if (GET_MODE_SIZE (mode2) == 8
39381 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39382 return (GET_MODE_SIZE (mode1) == 8
39383 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39385 return false;
39388 /* Return the cost of moving between two registers of mode MODE. */
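/* For example, a TImode copy on a 64-bit target falls through to the
default UNITS_PER_WORD case and is costed as two instructions, while a
V4SFmode copy with SSE enabled moves in one 16-byte unit and costs one. */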
39390 static int
39391 ix86_set_reg_reg_cost (machine_mode mode)
39393 unsigned int units = UNITS_PER_WORD;
39395 switch (GET_MODE_CLASS (mode))
39397 default:
39398 break;
39400 case MODE_CC:
39401 units = GET_MODE_SIZE (CCmode);
39402 break;
39404 case MODE_FLOAT:
39405 if ((TARGET_SSE && mode == TFmode)
39406 || (TARGET_80387 && mode == XFmode)
39407 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39408 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39409 units = GET_MODE_SIZE (mode);
39410 break;
39412 case MODE_COMPLEX_FLOAT:
39413 if ((TARGET_SSE && mode == TCmode)
39414 || (TARGET_80387 && mode == XCmode)
39415 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39416 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39417 units = GET_MODE_SIZE (mode);
39418 break;
39420 case MODE_VECTOR_INT:
39421 case MODE_VECTOR_FLOAT:
39422 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39423 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39424 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39425 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39426 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39427 units = GET_MODE_SIZE (mode);
39430 /* Return the cost of moving between two registers of mode MODE,
39431 assuming that the move will be in pieces of at most UNITS bytes. */
39432 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
39435 /* Return the cost of a vector operation in MODE, given that the scalar
39436 version has cost COST. If PARALLEL is true, assume that the CPU has
39437 more than one unit performing the operation. */
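/* For example, on a CPU tuned with TARGET_AVX128_OPTIMAL (256-bit operations
preferably split into two 128-bit halves) a 256-bit vector operation is
charged twice the supplied scalar COST, and with TARGET_SSE_SPLIT_REGS a
128-bit operation is likewise doubled. */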
39439 static int
39440 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
39442 if (!VECTOR_MODE_P (mode))
39443 return cost;
39445 if (!parallel)
39446 return cost * GET_MODE_NUNITS (mode);
39447 if (GET_MODE_BITSIZE (mode) == 128
39448 && TARGET_SSE_SPLIT_REGS)
39449 return cost * 2;
39450 if (GET_MODE_BITSIZE (mode) > 128
39451 && TARGET_AVX128_OPTIMAL)
39452 return cost * GET_MODE_BITSIZE (mode) / 128;
39453 return cost;
39456 /* Return cost of multiplication in MODE. */
39458 static int
39459 ix86_multiplication_cost (const struct processor_costs *cost,
39460 enum machine_mode mode)
39462 machine_mode inner_mode = mode;
39463 if (VECTOR_MODE_P (mode))
39464 inner_mode = GET_MODE_INNER (mode);
39466 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39467 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
39468 else if (X87_FLOAT_MODE_P (mode))
39469 return cost->fmul;
39470 else if (FLOAT_MODE_P (mode))
39471 return ix86_vec_cost (mode,
39472 inner_mode == DFmode
39473 ? cost->mulsd : cost->mulss, true);
39474 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39476 /* V*QImode is emulated with 7-13 insns. */
39477 if (mode == V16QImode || mode == V32QImode)
39479 int extra = 11;
39480 if (TARGET_XOP && mode == V16QImode)
39481 extra = 5;
39482 else if (TARGET_SSSE3)
39483 extra = 6;
39484 return ix86_vec_cost (mode,
39485 cost->mulss * 2 + cost->sse_op * extra,
39486 true);
39488 /* V*DImode is emulated with 5-8 insns. */
39489 else if (mode == V2DImode || mode == V4DImode)
39491 if (TARGET_XOP && mode == V2DImode)
39492 return ix86_vec_cost (mode,
39493 cost->mulss * 2 + cost->sse_op * 3,
39494 true);
39495 else
39496 return ix86_vec_cost (mode,
39497 cost->mulss * 3 + cost->sse_op * 5,
39498 true);
39500 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39501 insns, including two PMULUDQ. */
39502 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39503 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
39504 true);
39505 else
39506 return ix86_vec_cost (mode, cost->mulss, true);
39508 else
39509 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
39512 /* Return cost of division in MODE. */
39514 static int
39515 ix86_division_cost (const struct processor_costs *cost,
39516 enum machine_mode mode)
39518 machine_mode inner_mode = mode;
39519 if (VECTOR_MODE_P (mode))
39520 inner_mode = GET_MODE_INNER (mode);
39522 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39523 return inner_mode == DFmode ? cost->divsd : cost->divss;
39524 else if (X87_FLOAT_MODE_P (mode))
39525 return cost->fdiv;
39526 else if (FLOAT_MODE_P (mode))
39527 return ix86_vec_cost (mode,
39528 inner_mode == DFmode ? cost->divsd : cost->divss,
39529 true);
39530 else
39531 return cost->divide[MODE_INDEX (mode)];
39534 /* Return cost of shift in MODE.
39535 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
39536 AND_IN_OP1 specifies whether op1 is the result of an AND, and
39537 SHIFT_AND_TRUNCATE whether op1 is a SUBREG of an AND (shift and truncate).
39539 SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */
39541 static int
39542 ix86_shift_rotate_cost (const struct processor_costs *cost,
39543 enum machine_mode mode, bool constant_op1,
39544 HOST_WIDE_INT op1_val,
39545 bool speed,
39546 bool and_in_op1,
39547 bool shift_and_truncate,
39548 bool *skip_op0, bool *skip_op1)
39550 if (skip_op0)
39551 *skip_op0 = *skip_op1 = false;
39552 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39554 /* V*QImode is emulated with 1-11 insns. */
39555 if (mode == V16QImode || mode == V32QImode)
39557 int count = 11;
39558 if (TARGET_XOP && mode == V16QImode)
39560 /* For XOP we use vpshab, which requires a broadcast of the
39561 value to the variable shift insn. For constants this
39562 means a V16Q const in mem; even when we can perform the
39563 shift with one insn, set the cost to prefer paddb. */
39564 if (constant_op1)
39566 if (skip_op1)
39567 *skip_op1 = true;
39568 return ix86_vec_cost (mode,
39569 cost->sse_op
39570 + (speed
39571 ? COSTS_N_INSNS (1)
39572 : COSTS_N_BYTES
39573 (GET_MODE_UNIT_SIZE (mode))), true);
39575 count = 3;
39577 else if (TARGET_SSSE3)
39578 count = 7;
39579 return ix86_vec_cost (mode, cost->sse_op * count, true);
39581 else
39582 return ix86_vec_cost (mode, cost->sse_op, true);
39584 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39586 if (constant_op1)
39588 if (op1_val > 32)
39589 return cost->shift_const + COSTS_N_INSNS (2);
39590 else
39591 return cost->shift_const * 2;
39593 else
39595 if (and_in_op1)
39596 return cost->shift_var * 2;
39597 else
39598 return cost->shift_var * 6 + COSTS_N_INSNS (2);
39601 else
39603 if (constant_op1)
39604 return cost->shift_const;
39605 else if (shift_and_truncate)
39607 if (skip_op0)
39608 *skip_op0 = *skip_op1 = true;
39609 /* Return the cost after shift-and truncation. */
39610 return cost->shift_var;
39612 else
39613 return cost->shift_var;
39615 return cost->shift_const;
39618 /* Compute a (partial) cost for rtx X. Return true if the complete
39619 cost has been computed, and false if subexpressions should be
39620 scanned. In either case, *TOTAL contains the cost result. */
39622 static bool
39623 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39624 int *total, bool speed)
39626 rtx mask;
39627 enum rtx_code code = GET_CODE (x);
39628 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39629 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39630 int src_cost;
39632 switch (code)
39634 case SET:
39635 if (register_operand (SET_DEST (x), VOIDmode)
39636 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39638 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39639 return true;
39642 if (register_operand (SET_SRC (x), VOIDmode))
39643 /* Avoid potentially incorrect high cost from rtx_costs
39644 for non-tieable SUBREGs. */
39645 src_cost = 0;
39646 else
39648 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
39650 if (CONSTANT_P (SET_SRC (x)))
39651 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
39652 a small value, possibly zero for cheap constants. */
39653 src_cost += COSTS_N_INSNS (1);
39656 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
39657 return true;
39659 case CONST_INT:
39660 case CONST:
39661 case LABEL_REF:
39662 case SYMBOL_REF:
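/* On 64-bit targets, a constant that does not fit a sign-extended 32-bit
immediate needs a movabs and is costed highest; one that fits only the
sign-extended form is slightly cheaper; everything else is cheap or free,
with a small extra charge for symbolic constants under PIC. */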
39663 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39664 *total = 3;
39665 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39666 *total = 2;
39667 else if (flag_pic && SYMBOLIC_CONST (x)
39668 && !(TARGET_64BIT
39669 && (GET_CODE (x) == LABEL_REF
39670 || (GET_CODE (x) == SYMBOL_REF
39671 && SYMBOL_REF_LOCAL_P (x))))
39672 /* Use 0 cost for CONST to improve its propagation. */
39673 && (TARGET_64BIT || GET_CODE (x) != CONST))
39674 *total = 1;
39675 else
39676 *total = 0;
39677 return true;
39679 case CONST_DOUBLE:
39680 if (IS_STACK_MODE (mode))
39681 switch (standard_80387_constant_p (x))
39683 case -1:
39684 case 0:
39685 break;
39686 case 1: /* 0.0 */
39687 *total = 1;
39688 return true;
39689 default: /* Other constants */
39690 *total = 2;
39691 return true;
39693 /* FALLTHRU */
39695 case CONST_VECTOR:
39696 switch (standard_sse_constant_p (x, mode))
39698 case 0:
39699 break;
39700 case 1: /* 0: xor eliminates false dependency */
39701 *total = 0;
39702 return true;
39703 default: /* -1: cmp contains false dependency */
39704 *total = 1;
39705 return true;
39707 /* FALLTHRU */
39709 case CONST_WIDE_INT:
39710 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39711 it'll probably end up. Add a penalty for size. */
39712 *total = (COSTS_N_INSNS (1)
39713 + (!TARGET_64BIT && flag_pic)
39714 + (GET_MODE_SIZE (mode) <= 4
39715 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39716 return true;
39718 case ZERO_EXTEND:
39719 /* The zero extension is often completely free on x86_64, so make
39720 it as cheap as possible. */
39721 if (TARGET_64BIT && mode == DImode
39722 && GET_MODE (XEXP (x, 0)) == SImode)
39723 *total = 1;
39724 else if (TARGET_ZERO_EXTEND_WITH_AND)
39725 *total = cost->add;
39726 else
39727 *total = cost->movzx;
39728 return false;
39730 case SIGN_EXTEND:
39731 *total = cost->movsx;
39732 return false;
39734 case ASHIFT:
39735 if (SCALAR_INT_MODE_P (mode)
39736 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39737 && CONST_INT_P (XEXP (x, 1)))
39739 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39740 if (value == 1)
39742 *total = cost->add;
39743 return false;
39745 if ((value == 2 || value == 3)
39746 && cost->lea <= cost->shift_const)
39748 *total = cost->lea;
39749 return false;
39752 /* FALLTHRU */
39754 case ROTATE:
39755 case ASHIFTRT:
39756 case LSHIFTRT:
39757 case ROTATERT:
39758 bool skip_op0, skip_op1;
39759 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
39760 CONST_INT_P (XEXP (x, 1))
39761 ? INTVAL (XEXP (x, 1)) : -1,
39762 speed,
39763 GET_CODE (XEXP (x, 1)) == AND,
39764 SUBREG_P (XEXP (x, 1))
39765 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
39766 &skip_op0, &skip_op1);
39767 if (skip_op0 || skip_op1)
39769 if (!skip_op0)
39770 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
39771 if (!skip_op1)
39772 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
39773 return true;
39775 return false;
39777 case FMA:
39779 rtx sub;
39781 gcc_assert (FLOAT_MODE_P (mode));
39782 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39784 *total = ix86_vec_cost (mode,
39785 mode == SFmode ? cost->fmass : cost->fmasd,
39786 true);
39787 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39789 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39790 sub = XEXP (x, 0);
39791 if (GET_CODE (sub) == NEG)
39792 sub = XEXP (sub, 0);
39793 *total += rtx_cost (sub, mode, FMA, 0, speed);
39795 sub = XEXP (x, 2);
39796 if (GET_CODE (sub) == NEG)
39797 sub = XEXP (sub, 0);
39798 *total += rtx_cost (sub, mode, FMA, 2, speed);
39799 return true;
39802 case MULT:
39803 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
39805 rtx op0 = XEXP (x, 0);
39806 rtx op1 = XEXP (x, 1);
39807 int nbits;
39808 if (CONST_INT_P (XEXP (x, 1)))
39810 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39811 for (nbits = 0; value != 0; value &= value - 1)
39812 nbits++;
39814 else
39815 /* This is arbitrary. */
39816 nbits = 7;
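/* NBITS is the number of set bits in the constant multiplier; the cost
below charges mult_bit once per set bit, so e.g. a multiply by 10
(binary 1010) costs mult_init plus 2 * mult_bit. */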
39818 /* Compute costs correctly for widening multiplication. */
39819 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39820 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39821 == GET_MODE_SIZE (mode))
39823 int is_mulwiden = 0;
39824 machine_mode inner_mode = GET_MODE (op0);
39826 if (GET_CODE (op0) == GET_CODE (op1))
39827 is_mulwiden = 1, op1 = XEXP (op1, 0);
39828 else if (CONST_INT_P (op1))
39830 if (GET_CODE (op0) == SIGN_EXTEND)
39831 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39832 == INTVAL (op1);
39833 else
39834 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39837 if (is_mulwiden)
39838 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39841 *total = (cost->mult_init[MODE_INDEX (mode)]
39842 + nbits * cost->mult_bit
39843 + rtx_cost (op0, mode, outer_code, opno, speed)
39844 + rtx_cost (op1, mode, outer_code, opno, speed));
39846 return true;
39848 *total = ix86_multiplication_cost (cost, mode);
39849 return false;
39851 case DIV:
39852 case UDIV:
39853 case MOD:
39854 case UMOD:
39855 *total = ix86_division_cost (cost, mode);
39856 return false;
39858 case PLUS:
39859 if (GET_MODE_CLASS (mode) == MODE_INT
39860 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
39862 if (GET_CODE (XEXP (x, 0)) == PLUS
39863 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
39864 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
39865 && CONSTANT_P (XEXP (x, 1)))
39867 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
39868 if (val == 2 || val == 4 || val == 8)
39870 *total = cost->lea;
39871 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39872 outer_code, opno, speed);
39873 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
39874 outer_code, opno, speed);
39875 *total += rtx_cost (XEXP (x, 1), mode,
39876 outer_code, opno, speed);
39877 return true;
39880 else if (GET_CODE (XEXP (x, 0)) == MULT
39881 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
39883 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
39884 if (val == 2 || val == 4 || val == 8)
39886 *total = cost->lea;
39887 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39888 outer_code, opno, speed);
39889 *total += rtx_cost (XEXP (x, 1), mode,
39890 outer_code, opno, speed);
39891 return true;
39894 else if (GET_CODE (XEXP (x, 0)) == PLUS)
39896 /* Add with carry, ignore the cost of adding a carry flag. */
39897 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
39898 *total = cost->add;
39899 else
39901 *total = cost->lea;
39902 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39903 outer_code, opno, speed);
39906 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39907 outer_code, opno, speed);
39908 *total += rtx_cost (XEXP (x, 1), mode,
39909 outer_code, opno, speed);
39910 return true;
39913 /* FALLTHRU */
39915 case MINUS:
39916 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
39917 if (GET_MODE_CLASS (mode) == MODE_INT
39918 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
39919 && GET_CODE (XEXP (x, 0)) == MINUS
39920 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
39922 *total = cost->add;
39923 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39924 outer_code, opno, speed);
39925 *total += rtx_cost (XEXP (x, 1), mode,
39926 outer_code, opno, speed);
39927 return true;
39930 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39932 *total = cost->addss;
39933 return false;
39935 else if (X87_FLOAT_MODE_P (mode))
39937 *total = cost->fadd;
39938 return false;
39940 else if (FLOAT_MODE_P (mode))
39942 *total = ix86_vec_cost (mode, cost->addss, true);
39943 return false;
39945 /* FALLTHRU */
39947 case AND:
39948 case IOR:
39949 case XOR:
39950 if (GET_MODE_CLASS (mode) == MODE_INT
39951 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39953 *total = (cost->add * 2
39954 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
39955 << (GET_MODE (XEXP (x, 0)) != DImode))
39956 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
39957 << (GET_MODE (XEXP (x, 1)) != DImode)));
39958 return true;
39960 /* FALLTHRU */
39962 case NEG:
39963 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39965 *total = cost->sse_op;
39966 return false;
39968 else if (X87_FLOAT_MODE_P (mode))
39970 *total = cost->fchs;
39971 return false;
39973 else if (FLOAT_MODE_P (mode))
39975 *total = ix86_vec_cost (mode, cost->sse_op, true);
39976 return false;
39978 /* FALLTHRU */
39980 case NOT:
39981 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39982 *total = ix86_vec_cost (mode, cost->sse_op, true);
39983 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39984 *total = cost->add * 2;
39985 else
39986 *total = cost->add;
39987 return false;
39989 case COMPARE:
39990 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
39991 && XEXP (XEXP (x, 0), 1) == const1_rtx
39992 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
39993 && XEXP (x, 1) == const0_rtx)
39995 /* This kind of construct is implemented using test[bwl].
39996 Treat it as if we had an AND. */
39997 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
39998 *total = (cost->add
39999 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40000 opno, speed)
40001 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40002 return true;
40005 /* The embedded comparison operand is completely free. */
40006 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40007 && XEXP (x, 1) == const0_rtx)
40008 *total = 0;
40010 return false;
40012 case FLOAT_EXTEND:
40013 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40014 *total = 0;
40015 else
40016 *total = ix86_vec_cost (mode, cost->addss, true);
40017 return false;
40019 case FLOAT_TRUNCATE:
40020 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40021 *total = cost->fadd;
40022 else
40023 *total = ix86_vec_cost (mode, cost->addss, true);
40024 return false;
40026 case ABS:
40027 /* SSE requires memory load for the constant operand. It may make
40028 sense to account for this. Of course the constant operand may or
40029 may not be reused. */
40030 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40031 *total = cost->sse_op;
40032 else if (X87_FLOAT_MODE_P (mode))
40033 *total = cost->fabs;
40034 else if (FLOAT_MODE_P (mode))
40035 *total = ix86_vec_cost (mode, cost->sse_op, true);
40036 return false;
40038 case SQRT:
40039 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40040 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
40041 else if (X87_FLOAT_MODE_P (mode))
40042 *total = cost->fsqrt;
40043 else if (FLOAT_MODE_P (mode))
40044 *total = ix86_vec_cost (mode,
40045 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
40046 true);
40047 return false;
40049 case UNSPEC:
40050 if (XINT (x, 1) == UNSPEC_TP)
40051 *total = 0;
40052 return false;
40054 case VEC_SELECT:
40055 case VEC_CONCAT:
40056 case VEC_DUPLICATE:
40057 /* ??? Assume all of these vector manipulation patterns are
40058 recognizable, in which case they all have pretty much the
40059 same cost. */
40060 *total = cost->sse_op;
40061 return true;
40062 case VEC_MERGE:
40063 mask = XEXP (x, 2);
40064 /* This is a masked instruction; assume the same cost
40065 as the non-masked variant. */
40066 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40067 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40068 else
40069 *total = cost->sse_op;
40070 return true;
40072 default:
40073 return false;
40077 #if TARGET_MACHO
40079 static int current_machopic_label_num;
40081 /* Given a symbol name and its associated stub, write out the
40082 definition of the stub. */
40084 void
40085 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40087 unsigned int length;
40088 char *binder_name, *symbol_name, lazy_ptr_name[32];
40089 int label = ++current_machopic_label_num;
40091 /* For 64-bit we shouldn't get here. */
40092 gcc_assert (!TARGET_64BIT);
40094 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40095 symb = targetm.strip_name_encoding (symb);
40097 length = strlen (stub);
40098 binder_name = XALLOCAVEC (char, length + 32);
40099 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40101 length = strlen (symb);
40102 symbol_name = XALLOCAVEC (char, length + 32);
40103 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40105 sprintf (lazy_ptr_name, "L%d$lz", label);
40107 if (MACHOPIC_ATT_STUB)
40108 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40109 else if (MACHOPIC_PURE)
40110 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40111 else
40112 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40114 fprintf (file, "%s:\n", stub);
40115 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40117 if (MACHOPIC_ATT_STUB)
40119 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40121 else if (MACHOPIC_PURE)
40123 /* PIC stub. */
40124 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40125 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40126 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40127 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40128 label, lazy_ptr_name, label);
40129 fprintf (file, "\tjmp\t*%%ecx\n");
40131 else
40132 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40134 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40135 it needs no stub-binding-helper. */
40136 if (MACHOPIC_ATT_STUB)
40137 return;
40139 fprintf (file, "%s:\n", binder_name);
40141 if (MACHOPIC_PURE)
40143 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40144 fprintf (file, "\tpushl\t%%ecx\n");
40146 else
40147 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40149 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40151 /* N.B. Keep the correspondence of these
40152 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40153 old-pic/new-pic/non-pic stubs; altering this will break
40154 compatibility with existing dylibs. */
40155 if (MACHOPIC_PURE)
40157 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40158 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40160 else
40161 /* 16-byte -mdynamic-no-pic stub. */
40162 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
40164 fprintf (file, "%s:\n", lazy_ptr_name);
40165 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40166 fprintf (file, ASM_LONG "%s\n", binder_name);
40168 #endif /* TARGET_MACHO */
40170 /* Order the registers for register allocator. */
40172 void
40173 x86_order_regs_for_local_alloc (void)
40175 int pos = 0;
40176 int i;
40178 /* First allocate the local general purpose registers. */
40179 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40180 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40181 reg_alloc_order [pos++] = i;
40183 /* Global general purpose registers. */
40184 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40185 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40186 reg_alloc_order [pos++] = i;
40188 /* x87 registers come first in case we are doing FP math
40189 using them. */
40190 if (!TARGET_SSE_MATH)
40191 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40192 reg_alloc_order [pos++] = i;
40194 /* SSE registers. */
40195 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40196 reg_alloc_order [pos++] = i;
40197 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40198 reg_alloc_order [pos++] = i;
40200 /* Extended REX SSE registers. */
40201 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40202 reg_alloc_order [pos++] = i;
40205 /* Mask registers. */
40205 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40206 reg_alloc_order [pos++] = i;
40208 /* MPX bound registers. */
40209 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40210 reg_alloc_order [pos++] = i;
40212 /* x87 registers. */
40213 if (TARGET_SSE_MATH)
40214 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40215 reg_alloc_order [pos++] = i;
40217 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40218 reg_alloc_order [pos++] = i;
40220 /* Initialize the rest of the array, as we do not allocate some
40221 registers at all. */
40222 while (pos < FIRST_PSEUDO_REGISTER)
40223 reg_alloc_order [pos++] = 0;
40226 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40227 in struct attribute_spec.handler. */
40228 static tree
40229 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
40230 tree args,
40231 int,
40232 bool *no_add_attrs)
40234 if (TREE_CODE (*node) != FUNCTION_TYPE
40235 && TREE_CODE (*node) != METHOD_TYPE
40236 && TREE_CODE (*node) != FIELD_DECL
40237 && TREE_CODE (*node) != TYPE_DECL)
40239 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40240 name);
40241 *no_add_attrs = true;
40242 return NULL_TREE;
40244 if (TARGET_64BIT)
40246 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40247 name);
40248 *no_add_attrs = true;
40249 return NULL_TREE;
40251 if (is_attribute_p ("callee_pop_aggregate_return", name))
40253 tree cst;
40255 cst = TREE_VALUE (args);
40256 if (TREE_CODE (cst) != INTEGER_CST)
40258 warning (OPT_Wattributes,
40259 "%qE attribute requires an integer constant argument",
40260 name);
40261 *no_add_attrs = true;
40263 else if (compare_tree_int (cst, 0) != 0
40264 && compare_tree_int (cst, 1) != 0)
40266 warning (OPT_Wattributes,
40267 "argument to %qE attribute is neither zero, nor one",
40268 name);
40269 *no_add_attrs = true;
40272 return NULL_TREE;
40275 return NULL_TREE;
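/* Illustrative user-level sketch (not part of this file): a declaration of
   the kind the handler above accepts.  The attribute is 32-bit only and its
   argument must be the integer constant 0 or 1.  "three_ints" and
   "make_three" are hypothetical names.  */

struct three_ints { int a, b, c; };

__attribute__ ((callee_pop_aggregate_return (1)))
struct three_ints make_three (void);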
40278 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
40279 struct attribute_spec.handler. */
40280 static tree
40281 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40282 bool *no_add_attrs)
40284 if (TREE_CODE (*node) != FUNCTION_TYPE
40285 && TREE_CODE (*node) != METHOD_TYPE
40286 && TREE_CODE (*node) != FIELD_DECL
40287 && TREE_CODE (*node) != TYPE_DECL)
40289 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40290 name);
40291 *no_add_attrs = true;
40292 return NULL_TREE;
40295 /* Can combine regparm with all attributes but fastcall. */
40296 if (is_attribute_p ("ms_abi", name))
40298 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40300 error ("ms_abi and sysv_abi attributes are not compatible");
40303 return NULL_TREE;
40305 else if (is_attribute_p ("sysv_abi", name))
40307 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40309 error ("ms_abi and sysv_abi attributes are not compatible");
40312 return NULL_TREE;
40315 return NULL_TREE;
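/* Illustrative user-level sketch (not part of this file): ms_abi and
   sysv_abi apply to function types and, per the handler above, cannot be
   combined on the same type.  "win_call" is a hypothetical name.  */

void win_call (int) __attribute__ ((ms_abi));      /* accepted */
/* void bad (int) __attribute__ ((ms_abi, sysv_abi));  rejected as incompatible */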
40318 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40319 struct attribute_spec.handler. */
40320 static tree
40321 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40322 bool *no_add_attrs)
40324 tree *type = NULL;
40325 if (DECL_P (*node))
40327 if (TREE_CODE (*node) == TYPE_DECL)
40328 type = &TREE_TYPE (*node);
40330 else
40331 type = node;
40333 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40335 warning (OPT_Wattributes, "%qE attribute ignored",
40336 name);
40337 *no_add_attrs = true;
40340 else if ((is_attribute_p ("ms_struct", name)
40341 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40342 || ((is_attribute_p ("gcc_struct", name)
40343 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40345 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40346 name);
40347 *no_add_attrs = true;
40350 return NULL_TREE;
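/* Illustrative user-level sketch (not part of this file): ms_struct and
   gcc_struct attach to record or union types; the handler above warns and
   drops the attribute anywhere else, and rejects mixing the two on one
   type.  "msbits" is a hypothetical tag.  */

struct __attribute__ ((ms_struct)) msbits
{
  char c;
  int field : 4;
};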
40353 static tree
40354 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40355 bool *no_add_attrs)
40357 if (TREE_CODE (*node) != FUNCTION_DECL)
40359 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40360 name);
40361 *no_add_attrs = true;
40363 return NULL_TREE;
40366 static tree
40367 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40368 int, bool *)
40370 return NULL_TREE;
40373 static tree
40374 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40376 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
40377 but the function type contains args and return type data. */
40378 tree func_type = *node;
40379 tree return_type = TREE_TYPE (func_type);
40381 int nargs = 0;
40382 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40383 while (current_arg_type
40384 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40386 if (nargs == 0)
40388 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40389 error ("interrupt service routine should have a pointer "
40390 "as the first argument");
40392 else if (nargs == 1)
40394 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40395 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40396 error ("interrupt service routine should have unsigned %s"
40397 "int as the second argument",
40398 TARGET_64BIT
40399 ? (TARGET_X32 ? "long long " : "long ")
40400 : "");
40402 nargs++;
40403 current_arg_type = TREE_CHAIN (current_arg_type);
40405 if (!nargs || nargs > 2)
40406 error ("interrupt service routine can only have a pointer argument "
40407 "and an optional integer argument");
40408 if (! VOID_TYPE_P (return_type))
40409 error ("interrupt service routine can't have non-void return value");
40411 return NULL_TREE;
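/* Illustrative user-level sketch (not part of this file) of the signature
   the checks above accept: a pointer first argument and an optional
   unsigned word_mode integer second argument (the error code).  The
   "uword_t" typedef assumes an LP64 target; "interrupt_frame" and "isr"
   are hypothetical names.  */

struct interrupt_frame;
typedef unsigned long uword_t;

__attribute__ ((interrupt))
void isr (struct interrupt_frame *frame, uword_t error_code)
{
  (void) frame;
  (void) error_code;
}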
40414 static bool
40415 ix86_ms_bitfield_layout_p (const_tree record_type)
40417 return ((TARGET_MS_BITFIELD_LAYOUT
40418 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40419 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40422 /* Returns an expression indicating where the this parameter is
40423 located on entry to the FUNCTION. */
40425 static rtx
40426 x86_this_parameter (tree function)
40428 tree type = TREE_TYPE (function);
40429 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40430 int nregs;
40432 if (TARGET_64BIT)
40434 const int *parm_regs;
40436 if (ix86_function_type_abi (type) == MS_ABI)
40437 parm_regs = x86_64_ms_abi_int_parameter_registers;
40438 else
40439 parm_regs = x86_64_int_parameter_registers;
40440 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40443 nregs = ix86_function_regparm (type, function);
40445 if (nregs > 0 && !stdarg_p (type))
40447 int regno;
40448 unsigned int ccvt = ix86_get_callcvt (type);
40450 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40451 regno = aggr ? DX_REG : CX_REG;
40452 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40454 regno = CX_REG;
40455 if (aggr)
40456 return gen_rtx_MEM (SImode,
40457 plus_constant (Pmode, stack_pointer_rtx, 4));
40459 else
40461 regno = AX_REG;
40462 if (aggr)
40464 regno = DX_REG;
40465 if (nregs == 1)
40466 return gen_rtx_MEM (SImode,
40467 plus_constant (Pmode,
40468 stack_pointer_rtx, 4));
40471 return gen_rtx_REG (SImode, regno);
40474 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40475 aggr ? 8 : 4));
40478 /* Determine whether x86_output_mi_thunk can succeed. */
40480 static bool
40481 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40482 const_tree function)
40484 /* 64-bit can handle anything. */
40485 if (TARGET_64BIT)
40486 return true;
40488 /* For 32-bit, everything's fine if we have one free register. */
40489 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40490 return true;
40492 /* Need a free register for vcall_offset. */
40493 if (vcall_offset)
40494 return false;
40496 /* Need a free register for GOT references. */
40497 if (flag_pic && !targetm.binds_local_p (function))
40498 return false;
40500 /* Otherwise ok. */
40501 return true;
40504 /* Output the assembler code for a thunk function. THUNK_DECL is the
40505 declaration for the thunk function itself, FUNCTION is the decl for
40506 the target function. DELTA is an immediate constant offset to be
40507 added to THIS. If VCALL_OFFSET is nonzero, the word at
40508 *(*this + vcall_offset) should be added to THIS. */
40510 static void
40511 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40512 HOST_WIDE_INT vcall_offset, tree function)
40514 rtx this_param = x86_this_parameter (function);
40515 rtx this_reg, tmp, fnaddr;
40516 unsigned int tmp_regno;
40517 rtx_insn *insn;
40519 if (TARGET_64BIT)
40520 tmp_regno = R10_REG;
40521 else
40523 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40524 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40525 tmp_regno = AX_REG;
40526 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40527 tmp_regno = DX_REG;
40528 else
40529 tmp_regno = CX_REG;
40532 emit_note (NOTE_INSN_PROLOGUE_END);
40534 /* If CET is enabled, insert an ENDBR instruction. */
40535 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
40536 emit_insn (gen_nop_endbr ());
40538 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40539 pull it in now and let DELTA benefit. */
40540 if (REG_P (this_param))
40541 this_reg = this_param;
40542 else if (vcall_offset)
40544 /* Put the this parameter into %eax. */
40545 this_reg = gen_rtx_REG (Pmode, AX_REG);
40546 emit_move_insn (this_reg, this_param);
40548 else
40549 this_reg = NULL_RTX;
40551 /* Adjust the this parameter by a fixed constant. */
40552 if (delta)
40554 rtx delta_rtx = GEN_INT (delta);
40555 rtx delta_dst = this_reg ? this_reg : this_param;
40557 if (TARGET_64BIT)
40559 if (!x86_64_general_operand (delta_rtx, Pmode))
40561 tmp = gen_rtx_REG (Pmode, tmp_regno);
40562 emit_move_insn (tmp, delta_rtx);
40563 delta_rtx = tmp;
40567 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40570 /* Adjust the this parameter by a value stored in the vtable. */
40571 if (vcall_offset)
40573 rtx vcall_addr, vcall_mem, this_mem;
40575 tmp = gen_rtx_REG (Pmode, tmp_regno);
40577 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40578 if (Pmode != ptr_mode)
40579 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40580 emit_move_insn (tmp, this_mem);
40582 /* Adjust the this parameter. */
40583 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40584 if (TARGET_64BIT
40585 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40587 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40588 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40589 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40592 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40593 if (Pmode != ptr_mode)
40594 emit_insn (gen_addsi_1_zext (this_reg,
40595 gen_rtx_REG (ptr_mode,
40596 REGNO (this_reg)),
40597 vcall_mem));
40598 else
40599 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40602 /* If necessary, drop THIS back to its stack slot. */
40603 if (this_reg && this_reg != this_param)
40604 emit_move_insn (this_param, this_reg);
40606 fnaddr = XEXP (DECL_RTL (function), 0);
40607 if (TARGET_64BIT)
40609 if (!flag_pic || targetm.binds_local_p (function)
40610 || TARGET_PECOFF)
40612 else
40614 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40615 tmp = gen_rtx_CONST (Pmode, tmp);
40616 fnaddr = gen_const_mem (Pmode, tmp);
40619 else
40621 if (!flag_pic || targetm.binds_local_p (function))
40623 #if TARGET_MACHO
40624 else if (TARGET_MACHO)
40626 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40627 fnaddr = XEXP (fnaddr, 0);
40629 #endif /* TARGET_MACHO */
40630 else
40632 tmp = gen_rtx_REG (Pmode, CX_REG);
40633 output_set_got (tmp, NULL_RTX);
40635 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40636 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40637 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40638 fnaddr = gen_const_mem (Pmode, fnaddr);
40642 /* Our sibling call patterns do not allow memories, because we have no
40643 predicate that can distinguish between frame and non-frame memory.
40644 For our purposes here, we can get away with (ab)using a jump pattern,
40645 because we're going to do no optimization. */
40646 if (MEM_P (fnaddr))
40648 if (sibcall_insn_operand (fnaddr, word_mode))
40650 fnaddr = XEXP (DECL_RTL (function), 0);
40651 tmp = gen_rtx_MEM (QImode, fnaddr);
40652 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40653 tmp = emit_call_insn (tmp);
40654 SIBLING_CALL_P (tmp) = 1;
40656 else
40657 emit_jump_insn (gen_indirect_jump (fnaddr));
40659 else
40661 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40663 // CM_LARGE_PIC always uses pseudo PIC register which is
40664 // uninitialized. Since FUNCTION is local and calling it
40665 // doesn't go through PLT, we use scratch register %r11 as
40666 // PIC register and initialize it here.
40667 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40668 ix86_init_large_pic_reg (tmp_regno);
40669 fnaddr = legitimize_pic_address (fnaddr,
40670 gen_rtx_REG (Pmode, tmp_regno));
40673 if (!sibcall_insn_operand (fnaddr, word_mode))
40675 tmp = gen_rtx_REG (word_mode, tmp_regno);
40676 if (GET_MODE (fnaddr) != word_mode)
40677 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40678 emit_move_insn (tmp, fnaddr);
40679 fnaddr = tmp;
40682 tmp = gen_rtx_MEM (QImode, fnaddr);
40683 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40684 tmp = emit_call_insn (tmp);
40685 SIBLING_CALL_P (tmp) = 1;
40687 emit_barrier ();
40689 /* Emit just enough of rest_of_compilation to get the insns emitted.
40690 Note that use_thunk calls assemble_start_function et al. */
40691 insn = get_insns ();
40692 shorten_branches (insn);
40693 final_start_function (insn, file, 1);
40694 final (insn, file, 1);
40695 final_end_function ();
40698 static void
40699 x86_file_start (void)
40701 default_file_start ();
40702 if (TARGET_16BIT)
40703 fputs ("\t.code16gcc\n", asm_out_file);
40704 #if TARGET_MACHO
40705 darwin_file_start ();
40706 #endif
40707 if (X86_FILE_START_VERSION_DIRECTIVE)
40708 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40709 if (X86_FILE_START_FLTUSED)
40710 fputs ("\t.global\t__fltused\n", asm_out_file);
40711 if (ix86_asm_dialect == ASM_INTEL)
40712 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
40716 x86_field_alignment (tree type, int computed)
40718 machine_mode mode;
40720 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40721 return computed;
40722 if (TARGET_IAMCU)
40723 return iamcu_alignment (type, computed);
40724 mode = TYPE_MODE (strip_array_types (type));
40725 if (mode == DFmode || mode == DCmode
40726 || GET_MODE_CLASS (mode) == MODE_INT
40727 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40728 return MIN (32, computed);
40729 return computed;
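/* Illustrative sketch (not part of this file): on 32-bit x86 without
   -malign-double (and outside IAMCU), the function above caps the field
   alignment of double and 64-bit integer members at 32 bits, so the double
   below sits at offset 4 rather than 8.  "packed_dbl" is a hypothetical
   tag.  */

struct packed_dbl
{
  char c;
  double d;   /* offsetof (struct packed_dbl, d) == 4 on ia32 */
};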
40732 /* Print call to TARGET to FILE. */
40734 static void
40735 x86_print_call_or_nop (FILE *file, const char *target)
40737 if (flag_nop_mcount)
40738 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
40739 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
40740 else
40741 fprintf (file, "1:\tcall\t%s\n", target);
40744 /* Output assembler code to FILE to increment profiler label # LABELNO
40745 for profiling a function entry. */
40746 void
40747 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40749 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40750 : MCOUNT_NAME);
40751 if (TARGET_64BIT)
40753 #ifndef NO_PROFILE_COUNTERS
40754 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40755 #endif
40757 if (!TARGET_PECOFF && flag_pic)
40758 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40759 else
40760 x86_print_call_or_nop (file, mcount_name);
40762 else if (flag_pic)
40764 #ifndef NO_PROFILE_COUNTERS
40765 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40766 LPREFIX, labelno);
40767 #endif
40768 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40770 else
40772 #ifndef NO_PROFILE_COUNTERS
40773 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40774 LPREFIX, labelno);
40775 #endif
40776 x86_print_call_or_nop (file, mcount_name);
40779 if (flag_record_mcount)
40781 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40782 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40783 fprintf (file, "\t.previous\n");
40787 /* We don't have exact information about the insn sizes, but we may assume
40788 quite safely that we are informed about all 1 byte insns and memory
40789 address sizes. This is enough to eliminate unnecessary padding in
40790 99% of cases. */
40793 ix86_min_insn_size (rtx_insn *insn)
40795 int l = 0, len;
40797 if (!INSN_P (insn) || !active_insn_p (insn))
40798 return 0;
40800 /* Discard alignments we've emitted and jump instructions. */
40801 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40802 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40803 return 0;
40805 /* Important case - calls are always 5 bytes.
40806 It is common to have many calls in a row. */
40807 if (CALL_P (insn)
40808 && symbolic_reference_mentioned_p (PATTERN (insn))
40809 && !SIBLING_CALL_P (insn))
40810 return 5;
40811 len = get_attr_length (insn);
40812 if (len <= 1)
40813 return 1;
40815 /* For normal instructions we rely on get_attr_length being exact,
40816 with a few exceptions. */
40817 if (!JUMP_P (insn))
40819 enum attr_type type = get_attr_type (insn);
40821 switch (type)
40823 case TYPE_MULTI:
40824 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40825 || asm_noperands (PATTERN (insn)) >= 0)
40826 return 0;
40827 break;
40828 case TYPE_OTHER:
40829 case TYPE_FCMP:
40830 break;
40831 default:
40832 /* Otherwise trust get_attr_length. */
40833 return len;
40836 l = get_attr_length_address (insn);
40837 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40838 l = 4;
40840 if (l)
40841 return 1+l;
40842 else
40843 return 2;
40846 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40848 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
40849 16-byte window. */
40851 static void
40852 ix86_avoid_jump_mispredicts (void)
40854 rtx_insn *insn, *start = get_insns ();
40855 int nbytes = 0, njumps = 0;
40856 bool isjump = false;
40858 /* Look for all minimal intervals of instructions containing 4 jumps.
40859 The intervals are bounded by START and INSN. NBYTES is the total
40860 size of instructions in the interval including INSN and not including
40861 START. When the NBYTES is smaller than 16 bytes, it is possible
40862 that the end of START and INSN ends up in the same 16byte page.
40864 The smallest offset in the page INSN can start is the case where START
40865 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
40866 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
40868 Don't consider an asm goto as a jump; while it can contain a jump, it
40869 doesn't have to, since control transfer to its label(s) can be performed
40870 through other means, and we also estimate the minimum length of all asm stmts as 0. */
40871 for (insn = start; insn; insn = NEXT_INSN (insn))
40873 int min_size;
40875 if (LABEL_P (insn))
40877 int align = label_to_alignment (insn);
40878 int max_skip = label_to_max_skip (insn);
40880 if (max_skip > 15)
40881 max_skip = 15;
40882 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
40883 already in the current 16 byte page, because otherwise
40884 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
40885 bytes to reach 16 byte boundary. */
40886 if (align <= 0
40887 || (align <= 3 && max_skip != (1 << align) - 1))
40888 max_skip = 0;
40889 if (dump_file)
40890 fprintf (dump_file, "Label %i with max_skip %i\n",
40891 INSN_UID (insn), max_skip);
40892 if (max_skip)
40894 while (nbytes + max_skip >= 16)
40896 start = NEXT_INSN (start);
40897 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40898 || CALL_P (start))
40899 njumps--, isjump = true;
40900 else
40901 isjump = false;
40902 nbytes -= ix86_min_insn_size (start);
40905 continue;
40908 min_size = ix86_min_insn_size (insn);
40909 nbytes += min_size;
40910 if (dump_file)
40911 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
40912 INSN_UID (insn), min_size);
40913 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
40914 || CALL_P (insn))
40915 njumps++;
40916 else
40917 continue;
40919 while (njumps > 3)
40921 start = NEXT_INSN (start);
40922 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40923 || CALL_P (start))
40924 njumps--, isjump = true;
40925 else
40926 isjump = false;
40927 nbytes -= ix86_min_insn_size (start);
40929 gcc_assert (njumps >= 0);
40930 if (dump_file)
40931 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
40932 INSN_UID (start), INSN_UID (insn), nbytes);
40934 if (njumps == 3 && isjump && nbytes < 16)
40936 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
40938 if (dump_file)
40939 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
40940 INSN_UID (insn), padsize);
40941 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
40945 #endif
40947 /* AMD Athlon works faster
40948 when RET is not the destination of a conditional jump or directly preceded
40949 by another jump instruction. We avoid the penalty by inserting a NOP just
40950 before the RET instructions in such cases. */
40951 static void
40952 ix86_pad_returns (void)
40954 edge e;
40955 edge_iterator ei;
40957 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
40959 basic_block bb = e->src;
40960 rtx_insn *ret = BB_END (bb);
40961 rtx_insn *prev;
40962 bool replace = false;
40964 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
40965 || optimize_bb_for_size_p (bb))
40966 continue;
40967 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
40968 if (active_insn_p (prev) || LABEL_P (prev))
40969 break;
40970 if (prev && LABEL_P (prev))
40972 edge e;
40973 edge_iterator ei;
40975 FOR_EACH_EDGE (e, ei, bb->preds)
40976 if (EDGE_FREQUENCY (e) && e->src->index >= 0
40977 && !(e->flags & EDGE_FALLTHRU))
40979 replace = true;
40980 break;
40983 if (!replace)
40985 prev = prev_active_insn (ret);
40986 if (prev
40987 && ((JUMP_P (prev) && any_condjump_p (prev))
40988 || CALL_P (prev)))
40989 replace = true;
40990 /* Empty functions get a branch mispredict even when
40991 the jump destination is not visible to us. */
40992 if (!prev && !optimize_function_for_size_p (cfun))
40993 replace = true;
40995 if (replace)
40997 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
40998 delete_insn (ret);
41003 /* Count the minimum number of instructions in BB. Return 4 if the
41004 number of instructions >= 4. */
41006 static int
41007 ix86_count_insn_bb (basic_block bb)
41009 rtx_insn *insn;
41010 int insn_count = 0;
41012 /* Count number of instructions in this block. Return 4 if the number
41013 of instructions >= 4. */
41014 FOR_BB_INSNS (bb, insn)
41016 /* This only happens in exit blocks. */
41017 if (JUMP_P (insn)
41018 && ANY_RETURN_P (PATTERN (insn)))
41019 break;
41021 if (NONDEBUG_INSN_P (insn)
41022 && GET_CODE (PATTERN (insn)) != USE
41023 && GET_CODE (PATTERN (insn)) != CLOBBER)
41025 insn_count++;
41026 if (insn_count >= 4)
41027 return insn_count;
41031 return insn_count;
41035 /* Count the minimum number of instructions in code path in BB.
41036 Return 4 if the number of instructions >= 4. */
41038 static int
41039 ix86_count_insn (basic_block bb)
41041 edge e;
41042 edge_iterator ei;
41043 int min_prev_count;
41045 /* Only bother counting instructions along paths with no
41046 more than 2 basic blocks between entry and exit. Given
41047 that BB has an edge to exit, determine if a predecessor
41048 of BB has an edge from entry. If so, compute the number
41049 of instructions in the predecessor block. If there
41050 happen to be multiple such blocks, compute the minimum. */
41051 min_prev_count = 4;
41052 FOR_EACH_EDGE (e, ei, bb->preds)
41054 edge prev_e;
41055 edge_iterator prev_ei;
41057 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41059 min_prev_count = 0;
41060 break;
41062 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41064 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41066 int count = ix86_count_insn_bb (e->src);
41067 if (count < min_prev_count)
41068 min_prev_count = count;
41069 break;
41074 if (min_prev_count < 4)
41075 min_prev_count += ix86_count_insn_bb (bb);
41077 return min_prev_count;
41080 /* Pad short function to 4 instructions. */
41082 static void
41083 ix86_pad_short_function (void)
41085 edge e;
41086 edge_iterator ei;
41088 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41090 rtx_insn *ret = BB_END (e->src);
41091 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41093 int insn_count = ix86_count_insn (e->src);
41095 /* Pad short function. */
41096 if (insn_count < 4)
41098 rtx_insn *insn = ret;
41100 /* Find epilogue. */
41101 while (insn
41102 && (!NOTE_P (insn)
41103 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41104 insn = PREV_INSN (insn);
41106 if (!insn)
41107 insn = ret;
41109 /* Two NOPs count as one instruction. */
41110 insn_count = 2 * (4 - insn_count);
41111 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41117 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41118 the epilogue, the Windows system unwinder will apply epilogue logic and
41119 produce incorrect offsets. This can be avoided by adding a nop between
41120 the last insn that can throw and the first insn of the epilogue. */
41122 static void
41123 ix86_seh_fixup_eh_fallthru (void)
41125 edge e;
41126 edge_iterator ei;
41128 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41130 rtx_insn *insn, *next;
41132 /* Find the beginning of the epilogue. */
41133 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41134 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41135 break;
41136 if (insn == NULL)
41137 continue;
41139 /* We only care about preceding insns that can throw. */
41140 insn = prev_active_insn (insn);
41141 if (insn == NULL || !can_throw_internal (insn))
41142 continue;
41144 /* Do not separate calls from their debug information. */
41145 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41146 if (NOTE_P (next)
41147 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41148 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41149 insn = next;
41150 else
41151 break;
41153 emit_insn_after (gen_nops (const1_rtx), insn);
41157 /* Given a register number BASE, the lowest of a group of registers, update
41158 regsets IN and OUT with the registers that should be avoided in input
41159 and output operands respectively when trying to avoid generating a modr/m
41160 byte for -fmitigate-rop. */
41162 static void
41163 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41165 SET_HARD_REG_BIT (out, base);
41166 SET_HARD_REG_BIT (out, base + 1);
41167 SET_HARD_REG_BIT (in, base + 2);
41168 SET_HARD_REG_BIT (in, base + 3);
41171 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
41172 that certain encodings of modr/m bytes do not occur. */
41173 static void
41174 ix86_mitigate_rop (void)
41176 HARD_REG_SET input_risky;
41177 HARD_REG_SET output_risky;
41178 HARD_REG_SET inout_risky;
41180 CLEAR_HARD_REG_SET (output_risky);
41181 CLEAR_HARD_REG_SET (input_risky);
41182 SET_HARD_REG_BIT (output_risky, AX_REG);
41183 SET_HARD_REG_BIT (output_risky, CX_REG);
41184 SET_HARD_REG_BIT (input_risky, BX_REG);
41185 SET_HARD_REG_BIT (input_risky, DX_REG);
41186 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41187 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41188 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41189 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41190 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41191 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41192 COPY_HARD_REG_SET (inout_risky, input_risky);
41193 IOR_HARD_REG_SET (inout_risky, output_risky);
41195 df_note_add_problem ();
41196 /* Fix up what stack-regs did. */
41197 df_insn_rescan_all ();
41198 df_analyze ();
41200 regrename_init (true);
41201 regrename_analyze (NULL);
41203 auto_vec<du_head_p> cands;
41205 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41207 if (!NONDEBUG_INSN_P (insn))
41208 continue;
41210 if (GET_CODE (PATTERN (insn)) == USE
41211 || GET_CODE (PATTERN (insn)) == CLOBBER)
41212 continue;
41214 extract_insn (insn);
41216 int opno0, opno1;
41217 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41218 recog_data.n_operands, &opno0,
41219 &opno1);
41221 if (!ix86_rop_should_change_byte_p (modrm))
41222 continue;
41224 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41226 /* This happens when regrename has to fail a block. */
41227 if (!info->op_info)
41228 continue;
41230 if (info->op_info[opno0].n_chains != 0)
41232 gcc_assert (info->op_info[opno0].n_chains == 1);
41233 du_head_p op0c;
41234 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41235 if (op0c->target_data_1 + op0c->target_data_2 == 0
41236 && !op0c->cannot_rename)
41237 cands.safe_push (op0c);
41239 op0c->target_data_1++;
41241 if (info->op_info[opno1].n_chains != 0)
41243 gcc_assert (info->op_info[opno1].n_chains == 1);
41244 du_head_p op1c;
41245 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41246 if (op1c->target_data_1 + op1c->target_data_2 == 0
41247 && !op1c->cannot_rename)
41248 cands.safe_push (op1c);
41250 op1c->target_data_2++;
41254 int i;
41255 du_head_p head;
41256 FOR_EACH_VEC_ELT (cands, i, head)
41258 int old_reg, best_reg;
41259 HARD_REG_SET unavailable;
41261 CLEAR_HARD_REG_SET (unavailable);
41262 if (head->target_data_1)
41263 IOR_HARD_REG_SET (unavailable, output_risky);
41264 if (head->target_data_2)
41265 IOR_HARD_REG_SET (unavailable, input_risky);
41267 int n_uses;
41268 reg_class superclass = regrename_find_superclass (head, &n_uses,
41269 &unavailable);
41270 old_reg = head->regno;
41271 best_reg = find_rename_reg (head, superclass, &unavailable,
41272 old_reg, false);
41273 bool ok = regrename_do_replace (head, best_reg);
41274 gcc_assert (ok);
41275 if (dump_file)
41276 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41277 reg_names[best_reg], reg_class_names[superclass]);
41281 regrename_finish ();
41283 df_analyze ();
41285 basic_block bb;
41286 regset_head live;
41288 INIT_REG_SET (&live);
41290 FOR_EACH_BB_FN (bb, cfun)
41292 rtx_insn *insn;
41294 COPY_REG_SET (&live, DF_LR_OUT (bb));
41295 df_simulate_initialize_backwards (bb, &live);
41297 FOR_BB_INSNS_REVERSE (bb, insn)
41299 if (!NONDEBUG_INSN_P (insn))
41300 continue;
41302 df_simulate_one_insn_backwards (bb, insn, &live);
41304 if (GET_CODE (PATTERN (insn)) == USE
41305 || GET_CODE (PATTERN (insn)) == CLOBBER)
41306 continue;
41308 extract_insn (insn);
41309 constrain_operands_cached (insn, reload_completed);
41310 int opno0, opno1;
41311 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41312 recog_data.n_operands, &opno0,
41313 &opno1);
41314 if (modrm < 0
41315 || !ix86_rop_should_change_byte_p (modrm)
41316 || opno0 == opno1)
41317 continue;
41319 rtx oldreg = recog_data.operand[opno1];
41320 preprocess_constraints (insn);
41321 const operand_alternative *alt = which_op_alt ();
41323 int i;
41324 for (i = 0; i < recog_data.n_operands; i++)
41325 if (i != opno1
41326 && alt[i].earlyclobber
41327 && reg_overlap_mentioned_p (recog_data.operand[i],
41328 oldreg))
41329 break;
41331 if (i < recog_data.n_operands)
41332 continue;
41334 if (dump_file)
41335 fprintf (dump_file,
41336 "attempting to fix modrm byte in insn %d:"
41337 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41338 reg_class_names[alt[opno1].cl]);
41340 HARD_REG_SET unavailable;
41341 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41342 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41343 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41344 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41345 IOR_HARD_REG_SET (unavailable, output_risky);
41346 IOR_COMPL_HARD_REG_SET (unavailable,
41347 reg_class_contents[alt[opno1].cl]);
41349 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41350 if (!TEST_HARD_REG_BIT (unavailable, i))
41351 break;
41352 if (i == FIRST_PSEUDO_REGISTER)
41354 if (dump_file)
41355 fprintf (dump_file, ", none available\n");
41356 continue;
41358 if (dump_file)
41359 fprintf (dump_file, " -> %d\n", i);
41360 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41361 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41362 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41367 /* Implement machine specific optimizations. We implement padding of returns
41368 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
41369 static void
41370 ix86_reorg (void)
41372 /* We are freeing block_for_insn in the toplev to keep compatibility
41373 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41374 compute_bb_for_insn ();
41376 if (flag_mitigate_rop)
41377 ix86_mitigate_rop ();
41379 if (TARGET_SEH && current_function_has_exception_handlers ())
41380 ix86_seh_fixup_eh_fallthru ();
41382 if (optimize && optimize_function_for_speed_p (cfun))
41384 if (TARGET_PAD_SHORT_FUNCTION)
41385 ix86_pad_short_function ();
41386 else if (TARGET_PAD_RETURNS)
41387 ix86_pad_returns ();
41388 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41389 if (TARGET_FOUR_JUMP_LIMIT)
41390 ix86_avoid_jump_mispredicts ();
41391 #endif
41395 /* Return nonzero when a QImode register that must be represented via a REX
41396 prefix is used. */
41397 bool
41398 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41400 int i;
41401 extract_insn_cached (insn);
41402 for (i = 0; i < recog_data.n_operands; i++)
41403 if (GENERAL_REG_P (recog_data.operand[i])
41404 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41405 return true;
41406 return false;
41409 /* Return true when INSN mentions a register that must be encoded using a
41410 REX prefix. */
41411 bool
41412 x86_extended_reg_mentioned_p (rtx insn)
41414 subrtx_iterator::array_type array;
41415 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41417 const_rtx x = *iter;
41418 if (REG_P (x)
41419 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41420 return true;
41422 return false;
41425 /* If profitable, negate (without causing overflow) integer constant
41426 of mode MODE at location LOC. Return true in this case. */
41427 bool
41428 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41430 HOST_WIDE_INT val;
41432 if (!CONST_INT_P (*loc))
41433 return false;
41435 switch (mode)
41437 case E_DImode:
41438 /* DImode x86_64 constants must fit in 32 bits. */
41439 gcc_assert (x86_64_immediate_operand (*loc, mode));
41441 mode = SImode;
41442 break;
41444 case E_SImode:
41445 case E_HImode:
41446 case E_QImode:
41447 break;
41449 default:
41450 gcc_unreachable ();
41453 /* Avoid overflows. */
41454 if (mode_signbit_p (mode, *loc))
41455 return false;
41457 val = INTVAL (*loc);
41459 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
41460 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
41461 if ((val < 0 && val != -128)
41462 || val == 128)
41464 *loc = GEN_INT (-val);
41465 return true;
41468 return false;
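/* Minimal value-level sketch (not part of this file) of the test above:
   negate VAL when it is negative (except -128) or exactly 128, so e.g.
   "addl $-4,%eax" becomes "subl $4,%eax" while -128, which still fits in a
   sign-extended 8-bit immediate, is left alone.  "maybe_negate_sketch" is
   a hypothetical name.  */

static int
maybe_negate_sketch (long val, long *negated)
{
  if ((val < 0 && val != -128) || val == 128)
    {
      *negated = -val;
      return 1;
    }
  return 0;
}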
41471 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41472 optabs would emit if we didn't have TFmode patterns. */
41474 void
41475 x86_emit_floatuns (rtx operands[2])
41477 rtx_code_label *neglab, *donelab;
41478 rtx i0, i1, f0, in, out;
41479 machine_mode mode, inmode;
41481 inmode = GET_MODE (operands[1]);
41482 gcc_assert (inmode == SImode || inmode == DImode);
41484 out = operands[0];
41485 in = force_reg (inmode, operands[1]);
41486 mode = GET_MODE (out);
41487 neglab = gen_label_rtx ();
41488 donelab = gen_label_rtx ();
41489 f0 = gen_reg_rtx (mode);
41491 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41493 expand_float (out, in, 0);
41495 emit_jump_insn (gen_jump (donelab));
41496 emit_barrier ();
41498 emit_label (neglab);
41500 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41501 1, OPTAB_DIRECT);
41502 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41503 1, OPTAB_DIRECT);
41504 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41506 expand_float (f0, i0, 0);
41508 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41510 emit_label (donelab);
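/* Minimal C sketch (not part of this file) of the conversion the expander
   above open-codes: nonnegative inputs convert directly; for inputs with
   the sign bit set, halve the value with the low bit folded back in,
   convert, then double.  "u64_to_double_sketch" is a hypothetical name and
   a 64-bit unsigned input is assumed.  */

static double
u64_to_double_sketch (unsigned long long x)
{
  if ((long long) x >= 0)
    return (double) (long long) x;

  double f = (double) (long long) ((x >> 1) | (x & 1));
  return f + f;
}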
41513 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41514 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41515 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41516 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41518 /* Get a vector mode of the same size as the original but with elements
41519 twice as wide. This is only guaranteed to apply to integral vectors. */
41521 static inline machine_mode
41522 get_mode_wider_vector (machine_mode o)
41524 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41525 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
41526 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41527 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41528 return n;
41531 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41532 fill target with val via vec_duplicate. */
41534 static bool
41535 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41537 bool ok;
41538 rtx_insn *insn;
41539 rtx dup;
41541 /* First attempt to recognize VAL as-is. */
41542 dup = gen_vec_duplicate (mode, val);
41543 insn = emit_insn (gen_rtx_SET (target, dup));
41544 if (recog_memoized (insn) < 0)
41546 rtx_insn *seq;
41547 machine_mode innermode = GET_MODE_INNER (mode);
41548 rtx reg;
41550 /* If that fails, force VAL into a register. */
41552 start_sequence ();
41553 reg = force_reg (innermode, val);
41554 if (GET_MODE (reg) != innermode)
41555 reg = gen_lowpart (innermode, reg);
41556 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
41557 seq = get_insns ();
41558 end_sequence ();
41559 if (seq)
41560 emit_insn_before (seq, insn);
41562 ok = recog_memoized (insn) >= 0;
41563 gcc_assert (ok);
41565 return true;
41568 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41569 with all elements equal to VAR. Return true if successful. */
41571 static bool
41572 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41573 rtx target, rtx val)
41575 bool ok;
41577 switch (mode)
41579 case E_V2SImode:
41580 case E_V2SFmode:
41581 if (!mmx_ok)
41582 return false;
41583 /* FALLTHRU */
41585 case E_V4DFmode:
41586 case E_V4DImode:
41587 case E_V8SFmode:
41588 case E_V8SImode:
41589 case E_V2DFmode:
41590 case E_V2DImode:
41591 case E_V4SFmode:
41592 case E_V4SImode:
41593 case E_V16SImode:
41594 case E_V8DImode:
41595 case E_V16SFmode:
41596 case E_V8DFmode:
41597 return ix86_vector_duplicate_value (mode, target, val);
41599 case E_V4HImode:
41600 if (!mmx_ok)
41601 return false;
41602 if (TARGET_SSE || TARGET_3DNOW_A)
41604 rtx x;
41606 val = gen_lowpart (SImode, val);
41607 x = gen_rtx_TRUNCATE (HImode, val);
41608 x = gen_rtx_VEC_DUPLICATE (mode, x);
41609 emit_insn (gen_rtx_SET (target, x));
41610 return true;
41612 goto widen;
41614 case E_V8QImode:
41615 if (!mmx_ok)
41616 return false;
41617 goto widen;
41619 case E_V8HImode:
41620 if (TARGET_AVX2)
41621 return ix86_vector_duplicate_value (mode, target, val);
41623 if (TARGET_SSE2)
41625 struct expand_vec_perm_d dperm;
41626 rtx tmp1, tmp2;
41628 permute:
41629 memset (&dperm, 0, sizeof (dperm));
41630 dperm.target = target;
41631 dperm.vmode = mode;
41632 dperm.nelt = GET_MODE_NUNITS (mode);
41633 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41634 dperm.one_operand_p = true;
41636 /* Extend to SImode using a paradoxical SUBREG. */
41637 tmp1 = gen_reg_rtx (SImode);
41638 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41640 /* Insert the SImode value as the low element of a V4SImode vector. */
41641 tmp2 = gen_reg_rtx (V4SImode);
41642 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41643 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41645 ok = (expand_vec_perm_1 (&dperm)
41646 || expand_vec_perm_broadcast_1 (&dperm));
41647 gcc_assert (ok);
41648 return ok;
41650 goto widen;
41652 case E_V16QImode:
41653 if (TARGET_AVX2)
41654 return ix86_vector_duplicate_value (mode, target, val);
41656 if (TARGET_SSE2)
41657 goto permute;
41658 goto widen;
41660 widen:
41661 /* Replicate the value once into the next wider mode and recurse. */
41663 machine_mode smode, wsmode, wvmode;
41664 rtx x;
41666 smode = GET_MODE_INNER (mode);
41667 wvmode = get_mode_wider_vector (mode);
41668 wsmode = GET_MODE_INNER (wvmode);
41670 val = convert_modes (wsmode, smode, val, true);
41671 x = expand_simple_binop (wsmode, ASHIFT, val,
41672 GEN_INT (GET_MODE_BITSIZE (smode)),
41673 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41674 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41676 x = gen_reg_rtx (wvmode);
41677 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41678 gcc_assert (ok);
41679 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41680 return ok;
41683 case E_V16HImode:
41684 case E_V32QImode:
41685 if (TARGET_AVX2)
41686 return ix86_vector_duplicate_value (mode, target, val);
41687 else
41689 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41690 rtx x = gen_reg_rtx (hvmode);
41692 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41693 gcc_assert (ok);
41695 x = gen_rtx_VEC_CONCAT (mode, x, x);
41696 emit_insn (gen_rtx_SET (target, x));
41698 return true;
41700 case E_V64QImode:
41701 case E_V32HImode:
41702 if (TARGET_AVX512BW)
41703 return ix86_vector_duplicate_value (mode, target, val);
41704 else
41706 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41707 rtx x = gen_reg_rtx (hvmode);
41709 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41710 gcc_assert (ok);
41712 x = gen_rtx_VEC_CONCAT (mode, x, x);
41713 emit_insn (gen_rtx_SET (target, x));
41715 return true;
41717 default:
41718 return false;
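/* Minimal scalar sketch (not part of this file) of the "widen" strategy
   above: replicate an 8-bit element into a 16-bit element by shifting and
   OR-ing, then let the recursion broadcast the wider value.
   "widen_qi_sketch" is a hypothetical name.  */

static unsigned short
widen_qi_sketch (unsigned char v)
{
  return (unsigned short) (((unsigned short) v << 8) | v);
}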
41722 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41723 whose ONE_VAR element is VAR, and other elements are zero. Return true
41724 if successful. */
41726 static bool
41727 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41728 rtx target, rtx var, int one_var)
41730 machine_mode vsimode;
41731 rtx new_target;
41732 rtx x, tmp;
41733 bool use_vector_set = false;
41735 switch (mode)
41737 case E_V2DImode:
41738 /* For SSE4.1, we normally use vector set. But if the second
41739 element is zero and inter-unit moves are OK, we use movq
41740 instead. */
41741 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41742 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41743 && one_var == 0));
41744 break;
41745 case E_V16QImode:
41746 case E_V4SImode:
41747 case E_V4SFmode:
41748 use_vector_set = TARGET_SSE4_1;
41749 break;
41750 case E_V8HImode:
41751 use_vector_set = TARGET_SSE2;
41752 break;
41753 case E_V4HImode:
41754 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41755 break;
41756 case E_V32QImode:
41757 case E_V16HImode:
41758 case E_V8SImode:
41759 case E_V8SFmode:
41760 case E_V4DFmode:
41761 use_vector_set = TARGET_AVX;
41762 break;
41763 case E_V4DImode:
41764 /* Use ix86_expand_vector_set in 64bit mode only. */
41765 use_vector_set = TARGET_AVX && TARGET_64BIT;
41766 break;
41767 default:
41768 break;
41771 if (use_vector_set)
41773 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41774 var = force_reg (GET_MODE_INNER (mode), var);
41775 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41776 return true;
41779 switch (mode)
41781 case E_V2SFmode:
41782 case E_V2SImode:
41783 if (!mmx_ok)
41784 return false;
41785 /* FALLTHRU */
41787 case E_V2DFmode:
41788 case E_V2DImode:
41789 if (one_var != 0)
41790 return false;
41791 var = force_reg (GET_MODE_INNER (mode), var);
41792 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41793 emit_insn (gen_rtx_SET (target, x));
41794 return true;
41796 case E_V4SFmode:
41797 case E_V4SImode:
41798 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41799 new_target = gen_reg_rtx (mode);
41800 else
41801 new_target = target;
41802 var = force_reg (GET_MODE_INNER (mode), var);
41803 x = gen_rtx_VEC_DUPLICATE (mode, var);
41804 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41805 emit_insn (gen_rtx_SET (new_target, x));
41806 if (one_var != 0)
41808 /* We need to shuffle the value to the correct position, so
41809 create a new pseudo to store the intermediate result. */
41811 /* With SSE2, we can use the integer shuffle insns. */
41812 if (mode != V4SFmode && TARGET_SSE2)
41814 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41815 const1_rtx,
41816 GEN_INT (one_var == 1 ? 0 : 1),
41817 GEN_INT (one_var == 2 ? 0 : 1),
41818 GEN_INT (one_var == 3 ? 0 : 1)));
41819 if (target != new_target)
41820 emit_move_insn (target, new_target);
41821 return true;
41824 /* Otherwise convert the intermediate result to V4SFmode and
41825 use the SSE1 shuffle instructions. */
41826 if (mode != V4SFmode)
41828 tmp = gen_reg_rtx (V4SFmode);
41829 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41831 else
41832 tmp = new_target;
41834 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41835 const1_rtx,
41836 GEN_INT (one_var == 1 ? 0 : 1),
41837 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41838 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41840 if (mode != V4SFmode)
41841 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41842 else if (tmp != target)
41843 emit_move_insn (target, tmp);
41845 else if (target != new_target)
41846 emit_move_insn (target, new_target);
41847 return true;
41849 case E_V8HImode:
41850 case E_V16QImode:
41851 vsimode = V4SImode;
41852 goto widen;
41853 case E_V4HImode:
41854 case E_V8QImode:
41855 if (!mmx_ok)
41856 return false;
41857 vsimode = V2SImode;
41858 goto widen;
41859 widen:
41860 if (one_var != 0)
41861 return false;
41863 /* Zero extend the variable element to SImode and recurse. */
41864 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41866 x = gen_reg_rtx (vsimode);
41867 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41868 var, one_var))
41869 gcc_unreachable ();
41871 emit_move_insn (target, gen_lowpart (mode, x));
41872 return true;
41874 default:
41875 return false;
41879 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41880 consisting of the values in VALS. It is known that all elements
41881 except ONE_VAR are constants. Return true if successful. */
41883 static bool
41884 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
41885 rtx target, rtx vals, int one_var)
41887 rtx var = XVECEXP (vals, 0, one_var);
41888 machine_mode wmode;
41889 rtx const_vec, x;
41891 const_vec = copy_rtx (vals);
41892 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
41893 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
41895 switch (mode)
41897 case E_V2DFmode:
41898 case E_V2DImode:
41899 case E_V2SFmode:
41900 case E_V2SImode:
41901 /* For the two element vectors, it's just as easy to use
41902 the general case. */
41903 return false;
41905 case E_V4DImode:
41906 /* Use ix86_expand_vector_set in 64bit mode only. */
41907 if (!TARGET_64BIT)
41908 return false;
41909 /* FALLTHRU */
41910 case E_V4DFmode:
41911 case E_V8SFmode:
41912 case E_V8SImode:
41913 case E_V16HImode:
41914 case E_V32QImode:
41915 case E_V4SFmode:
41916 case E_V4SImode:
41917 case E_V8HImode:
41918 case E_V4HImode:
41919 break;
41921 case E_V16QImode:
41922 if (TARGET_SSE4_1)
41923 break;
41924 wmode = V8HImode;
41925 goto widen;
41926 case E_V8QImode:
41927 wmode = V4HImode;
41928 goto widen;
41929 widen:
41930 /* There's no way to set one QImode entry easily. Combine
41931 the variable value with its adjacent constant value, and
41932 promote to an HImode set. */
41933 x = XVECEXP (vals, 0, one_var ^ 1);
41934 if (one_var & 1)
41936 var = convert_modes (HImode, QImode, var, true);
41937 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
41938 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41939 x = GEN_INT (INTVAL (x) & 0xff);
41941 else
41943 var = convert_modes (HImode, QImode, var, true);
41944 x = gen_int_mode (INTVAL (x) << 8, HImode);
41946 if (x != const0_rtx)
41947 var = expand_simple_binop (HImode, IOR, var, x, var,
41948 1, OPTAB_LIB_WIDEN);
41950 x = gen_reg_rtx (wmode);
41951 emit_move_insn (x, gen_lowpart (wmode, const_vec));
41952 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
41954 emit_move_insn (target, gen_lowpart (mode, x));
41955 return true;
41957 default:
41958 return false;
41961 emit_move_insn (target, const_vec);
41962 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41963 return true;
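/* Minimal scalar sketch (not part of this file) of the QImode "widen" path
   above: fold the variable byte and its adjacent constant byte into one
   16-bit element, which is then inserted at position one_var >> 1.
   "combine_qi_sketch" is a hypothetical name; the byte placement mirrors
   the shift/IOR sequence above.  */

static unsigned short
combine_qi_sketch (unsigned char var, unsigned char adjacent_const, int one_var)
{
  if (one_var & 1)
    return (unsigned short) (((unsigned short) var << 8) | adjacent_const);
  else
    return (unsigned short) (((unsigned short) adjacent_const << 8) | var);
}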
41966 /* A subroutine of ix86_expand_vector_init_general. Use vector
41967 concatenate to handle the most general case: all values variable,
41968 and none identical. */
41970 static void
41971 ix86_expand_vector_init_concat (machine_mode mode,
41972 rtx target, rtx *ops, int n)
41974 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
41975 rtx first[16], second[8], third[4];
41976 rtvec v;
41977 int i, j;
41979 switch (n)
41981 case 2:
41982 switch (mode)
41984 case E_V16SImode:
41985 cmode = V8SImode;
41986 break;
41987 case E_V16SFmode:
41988 cmode = V8SFmode;
41989 break;
41990 case E_V8DImode:
41991 cmode = V4DImode;
41992 break;
41993 case E_V8DFmode:
41994 cmode = V4DFmode;
41995 break;
41996 case E_V8SImode:
41997 cmode = V4SImode;
41998 break;
41999 case E_V8SFmode:
42000 cmode = V4SFmode;
42001 break;
42002 case E_V4DImode:
42003 cmode = V2DImode;
42004 break;
42005 case E_V4DFmode:
42006 cmode = V2DFmode;
42007 break;
42008 case E_V4SImode:
42009 cmode = V2SImode;
42010 break;
42011 case E_V4SFmode:
42012 cmode = V2SFmode;
42013 break;
42014 case E_V2DImode:
42015 cmode = DImode;
42016 break;
42017 case E_V2SImode:
42018 cmode = SImode;
42019 break;
42020 case E_V2DFmode:
42021 cmode = DFmode;
42022 break;
42023 case E_V2SFmode:
42024 cmode = SFmode;
42025 break;
42026 default:
42027 gcc_unreachable ();
42030 if (!register_operand (ops[1], cmode))
42031 ops[1] = force_reg (cmode, ops[1]);
42032 if (!register_operand (ops[0], cmode))
42033 ops[0] = force_reg (cmode, ops[0]);
42034 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42035 ops[1])));
42036 break;
42038 case 4:
42039 switch (mode)
42041 case E_V4DImode:
42042 cmode = V2DImode;
42043 break;
42044 case E_V4DFmode:
42045 cmode = V2DFmode;
42046 break;
42047 case E_V4SImode:
42048 cmode = V2SImode;
42049 break;
42050 case E_V4SFmode:
42051 cmode = V2SFmode;
42052 break;
42053 default:
42054 gcc_unreachable ();
42056 goto half;
42058 case 8:
42059 switch (mode)
42061 case E_V8DImode:
42062 cmode = V2DImode;
42063 hmode = V4DImode;
42064 break;
42065 case E_V8DFmode:
42066 cmode = V2DFmode;
42067 hmode = V4DFmode;
42068 break;
42069 case E_V8SImode:
42070 cmode = V2SImode;
42071 hmode = V4SImode;
42072 break;
42073 case E_V8SFmode:
42074 cmode = V2SFmode;
42075 hmode = V4SFmode;
42076 break;
42077 default:
42078 gcc_unreachable ();
42080 goto half;
42082 case 16:
42083 switch (mode)
42085 case E_V16SImode:
42086 cmode = V2SImode;
42087 hmode = V4SImode;
42088 gmode = V8SImode;
42089 break;
42090 case E_V16SFmode:
42091 cmode = V2SFmode;
42092 hmode = V4SFmode;
42093 gmode = V8SFmode;
42094 break;
42095 default:
42096 gcc_unreachable ();
42098 goto half;
42100 half:
42101 /* FIXME: We process inputs backward to help RA. PR 36222. */
42102 i = n - 1;
42103 j = (n >> 1) - 1;
42104 for (; i > 0; i -= 2, j--)
42106 first[j] = gen_reg_rtx (cmode);
42107 v = gen_rtvec (2, ops[i - 1], ops[i]);
42108 ix86_expand_vector_init (false, first[j],
42109 gen_rtx_PARALLEL (cmode, v));
42112 n >>= 1;
42113 if (n > 4)
42115 gcc_assert (hmode != VOIDmode);
42116 gcc_assert (gmode != VOIDmode);
42117 for (i = j = 0; i < n; i += 2, j++)
42119 second[j] = gen_reg_rtx (hmode);
42120 ix86_expand_vector_init_concat (hmode, second [j],
42121 &first [i], 2);
42123 n >>= 1;
42124 for (i = j = 0; i < n; i += 2, j++)
42126 third[j] = gen_reg_rtx (gmode);
42127 ix86_expand_vector_init_concat (gmode, third[j],
42128 &second[i], 2);
42130 n >>= 1;
42131 ix86_expand_vector_init_concat (mode, target, third, n);
42133 else if (n > 2)
42135 gcc_assert (hmode != VOIDmode);
42136 for (i = j = 0; i < n; i += 2, j++)
42138 second[j] = gen_reg_rtx (hmode);
42139 ix86_expand_vector_init_concat (hmode, second [j],
42140 &first [i], 2);
42142 n >>= 1;
42143 ix86_expand_vector_init_concat (mode, target, second, n);
42145 else
42146 ix86_expand_vector_init_concat (mode, target, first, n);
42147 break;
42149 default:
42150 gcc_unreachable ();
42154 /* A subroutine of ix86_expand_vector_init_general. Use vector
42155 interleave to handle the most general case: all values variable,
42156 and none identical. */
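/* In outline: each pair of input elements is packed into the low two
   positions of a fresh vector (element 2*i by a move through SImode,
   element 2*i + 1 by a vec_set into position 1), and the per-pair
   vectors are then merged by repeated interleave-low operations of
   increasing element width until the full vector is assembled.  */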
42158 static void
42159 ix86_expand_vector_init_interleave (machine_mode mode,
42160 rtx target, rtx *ops, int n)
42162 machine_mode first_imode, second_imode, third_imode, inner_mode;
42163 int i, j;
42164 rtx op0, op1;
42165 rtx (*gen_load_even) (rtx, rtx, rtx);
42166 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42167 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42169 switch (mode)
42171 case E_V8HImode:
42172 gen_load_even = gen_vec_setv8hi;
42173 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42174 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42175 inner_mode = HImode;
42176 first_imode = V4SImode;
42177 second_imode = V2DImode;
42178 third_imode = VOIDmode;
42179 break;
42180 case E_V16QImode:
42181 gen_load_even = gen_vec_setv16qi;
42182 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42183 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42184 inner_mode = QImode;
42185 first_imode = V8HImode;
42186 second_imode = V4SImode;
42187 third_imode = V2DImode;
42188 break;
42189 default:
42190 gcc_unreachable ();
42193 for (i = 0; i < n; i++)
42195 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42196 op0 = gen_reg_rtx (SImode);
42197 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42199 /* Insert the SImode value as low element of V4SImode vector. */
42200 op1 = gen_reg_rtx (V4SImode);
42201 op0 = gen_rtx_VEC_MERGE (V4SImode,
42202 gen_rtx_VEC_DUPLICATE (V4SImode,
42203 op0),
42204 CONST0_RTX (V4SImode),
42205 const1_rtx);
42206 emit_insn (gen_rtx_SET (op1, op0));
42208 /* Cast the V4SImode vector back to a vector in the original mode. */
42209 op0 = gen_reg_rtx (mode);
42210 emit_move_insn (op0, gen_lowpart (mode, op1));
42212 /* Load even elements into the second position. */
42213 emit_insn (gen_load_even (op0,
42214 force_reg (inner_mode,
42215 ops [i + i + 1]),
42216 const1_rtx));
42218 /* Cast vector to FIRST_IMODE vector. */
42219 ops[i] = gen_reg_rtx (first_imode);
42220 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42223 /* Interleave low FIRST_IMODE vectors. */
42224 for (i = j = 0; i < n; i += 2, j++)
42226 op0 = gen_reg_rtx (first_imode);
42227 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42229 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42230 ops[j] = gen_reg_rtx (second_imode);
42231 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42234 /* Interleave low SECOND_IMODE vectors. */
42235 switch (second_imode)
42237 case E_V4SImode:
42238 for (i = j = 0; i < n / 2; i += 2, j++)
42240 op0 = gen_reg_rtx (second_imode);
42241 emit_insn (gen_interleave_second_low (op0, ops[i],
42242 ops[i + 1]));
42244 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42245 vector. */
42246 ops[j] = gen_reg_rtx (third_imode);
42247 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42249 second_imode = V2DImode;
42250 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42251 /* FALLTHRU */
42253 case E_V2DImode:
42254 op0 = gen_reg_rtx (second_imode);
42255 emit_insn (gen_interleave_second_low (op0, ops[0],
42256 ops[1]));
42258 /* Cast the SECOND_IMODE vector back to a vector in the original
42259 mode. */
42260 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42261 break;
42263 default:
42264 gcc_unreachable ();
42268 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42269 all values variable, and none identical. */
42271 static void
42272 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42273 rtx target, rtx vals)
42275 rtx ops[64], op0, op1, op2, op3, op4, op5;
42276 machine_mode half_mode = VOIDmode;
42277 machine_mode quarter_mode = VOIDmode;
42278 int n, i;
42280 switch (mode)
42282 case E_V2SFmode:
42283 case E_V2SImode:
42284 if (!mmx_ok && !TARGET_SSE)
42285 break;
42286 /* FALLTHRU */
42288 case E_V16SImode:
42289 case E_V16SFmode:
42290 case E_V8DFmode:
42291 case E_V8DImode:
42292 case E_V8SFmode:
42293 case E_V8SImode:
42294 case E_V4DFmode:
42295 case E_V4DImode:
42296 case E_V4SFmode:
42297 case E_V4SImode:
42298 case E_V2DFmode:
42299 case E_V2DImode:
42300 n = GET_MODE_NUNITS (mode);
42301 for (i = 0; i < n; i++)
42302 ops[i] = XVECEXP (vals, 0, i);
42303 ix86_expand_vector_init_concat (mode, target, ops, n);
42304 return;
42306 case E_V2TImode:
42307 for (i = 0; i < 2; i++)
42308 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42309 op0 = gen_reg_rtx (V4DImode);
42310 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
42311 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42312 return;
42314 case E_V4TImode:
42315 for (i = 0; i < 4; i++)
42316 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42317 ops[4] = gen_reg_rtx (V4DImode);
42318 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
42319 ops[5] = gen_reg_rtx (V4DImode);
42320 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
42321 op0 = gen_reg_rtx (V8DImode);
42322 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
42323 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42324 return;
42326 case E_V32QImode:
42327 half_mode = V16QImode;
42328 goto half;
42330 case E_V16HImode:
42331 half_mode = V8HImode;
42332 goto half;
42334 half:
42335 n = GET_MODE_NUNITS (mode);
42336 for (i = 0; i < n; i++)
42337 ops[i] = XVECEXP (vals, 0, i);
42338 op0 = gen_reg_rtx (half_mode);
42339 op1 = gen_reg_rtx (half_mode);
42340 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42341 n >> 2);
42342 ix86_expand_vector_init_interleave (half_mode, op1,
42343 &ops [n >> 1], n >> 2);
42344 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42345 return;
42347 case E_V64QImode:
42348 quarter_mode = V16QImode;
42349 half_mode = V32QImode;
42350 goto quarter;
42352 case E_V32HImode:
42353 quarter_mode = V8HImode;
42354 half_mode = V16HImode;
42355 goto quarter;
42357 quarter:
42358 n = GET_MODE_NUNITS (mode);
42359 for (i = 0; i < n; i++)
42360 ops[i] = XVECEXP (vals, 0, i);
42361 op0 = gen_reg_rtx (quarter_mode);
42362 op1 = gen_reg_rtx (quarter_mode);
42363 op2 = gen_reg_rtx (quarter_mode);
42364 op3 = gen_reg_rtx (quarter_mode);
42365 op4 = gen_reg_rtx (half_mode);
42366 op5 = gen_reg_rtx (half_mode);
42367 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42368 n >> 3);
42369 ix86_expand_vector_init_interleave (quarter_mode, op1,
42370 &ops [n >> 2], n >> 3);
42371 ix86_expand_vector_init_interleave (quarter_mode, op2,
42372 &ops [n >> 1], n >> 3);
42373 ix86_expand_vector_init_interleave (quarter_mode, op3,
42374 &ops [(n >> 1) | (n >> 2)], n >> 3);
42375 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42376 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42377 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42378 return;
42380 case E_V16QImode:
42381 if (!TARGET_SSE4_1)
42382 break;
42383 /* FALLTHRU */
42385 case E_V8HImode:
42386 if (!TARGET_SSE2)
42387 break;
42389 /* Don't use ix86_expand_vector_init_interleave if we can't
42390 move from GPR to SSE register directly. */
42391 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42392 break;
42394 n = GET_MODE_NUNITS (mode);
42395 for (i = 0; i < n; i++)
42396 ops[i] = XVECEXP (vals, 0, i);
42397 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42398 return;
42400 case E_V4HImode:
42401 case E_V8QImode:
42402 break;
42404 default:
42405 gcc_unreachable ();
42409 int i, j, n_elts, n_words, n_elt_per_word;
42410 machine_mode inner_mode;
42411 rtx words[4], shift;
42413 inner_mode = GET_MODE_INNER (mode);
42414 n_elts = GET_MODE_NUNITS (mode);
42415 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42416 n_elt_per_word = n_elts / n_words;
42417 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42419 for (i = 0; i < n_words; ++i)
42421 rtx word = NULL_RTX;
42423 for (j = 0; j < n_elt_per_word; ++j)
42425 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42426 elt = convert_modes (word_mode, inner_mode, elt, true);
42428 if (j == 0)
42429 word = elt;
42430 else
42432 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42433 word, 1, OPTAB_LIB_WIDEN);
42434 word = expand_simple_binop (word_mode, IOR, word, elt,
42435 word, 1, OPTAB_LIB_WIDEN);
42439 words[i] = word;
42442 if (n_words == 1)
42443 emit_move_insn (target, gen_lowpart (mode, words[0]));
42444 else if (n_words == 2)
42446 rtx tmp = gen_reg_rtx (mode);
42447 emit_clobber (tmp);
42448 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42449 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42450 emit_move_insn (target, tmp);
42452 else if (n_words == 4)
42454 rtx tmp = gen_reg_rtx (V4SImode);
42455 gcc_assert (word_mode == SImode);
42456 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42457 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42458 emit_move_insn (target, gen_lowpart (mode, tmp));
42460 else
42461 gcc_unreachable ();
42465 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42466 instructions unless MMX_OK is true. */
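/* The strategy, in decreasing order of preference: an all-constant
   initializer is loaded from the constant pool; identical elements are
   broadcast; a single variable element is inserted into an otherwise
   constant vector; everything else goes through the general expander.  */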
42468 void
42469 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42471 machine_mode mode = GET_MODE (target);
42472 machine_mode inner_mode = GET_MODE_INNER (mode);
42473 int n_elts = GET_MODE_NUNITS (mode);
42474 int n_var = 0, one_var = -1;
42475 bool all_same = true, all_const_zero = true;
42476 int i;
42477 rtx x;
42479 /* Handle first initialization from vector elts. */
42480 if (n_elts != XVECLEN (vals, 0))
42482 rtx subtarget = target;
42483 x = XVECEXP (vals, 0, 0);
42484 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
42485 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
42487 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
42488 if (inner_mode == QImode || inner_mode == HImode)
42490 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
42491 mode = mode_for_vector (SImode, n_bits / 4).require ();
42492 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
42493 ops[0] = gen_lowpart (inner_mode, ops[0]);
42494 ops[1] = gen_lowpart (inner_mode, ops[1]);
42495 subtarget = gen_reg_rtx (mode);
42497 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
42498 if (subtarget != target)
42499 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
42500 return;
42502 gcc_unreachable ();
42505 for (i = 0; i < n_elts; ++i)
42507 x = XVECEXP (vals, 0, i);
42508 if (!(CONST_SCALAR_INT_P (x)
42509 || CONST_DOUBLE_P (x)
42510 || CONST_FIXED_P (x)))
42511 n_var++, one_var = i;
42512 else if (x != CONST0_RTX (inner_mode))
42513 all_const_zero = false;
42514 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42515 all_same = false;
42518 /* Constants are best loaded from the constant pool. */
42519 if (n_var == 0)
42521 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42522 return;
42525 /* If all values are identical, broadcast the value. */
42526 if (all_same
42527 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42528 XVECEXP (vals, 0, 0)))
42529 return;
42531 /* Values where only one field is non-constant are best loaded from
42532 the pool and overwritten via move later. */
42533 if (n_var == 1)
42535 if (all_const_zero
42536 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42537 XVECEXP (vals, 0, one_var),
42538 one_var))
42539 return;
42541 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42542 return;
42545 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
42548 void
42549 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42551 machine_mode mode = GET_MODE (target);
42552 machine_mode inner_mode = GET_MODE_INNER (mode);
42553 machine_mode half_mode;
42554 bool use_vec_merge = false;
42555 rtx tmp;
42556 static rtx (*gen_extract[6][2]) (rtx, rtx)
42558 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42559 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42560 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42561 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42562 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42563 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42565 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42567 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42568 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42569 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42570 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42571 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42572 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42574 int i, j, n;
42575 machine_mode mmode = VOIDmode;
42576 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42578 switch (mode)
42580 case E_V2SFmode:
42581 case E_V2SImode:
42582 if (mmx_ok)
42584 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42585 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42586 if (elt == 0)
42587 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42588 else
42589 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42590 emit_insn (gen_rtx_SET (target, tmp));
42591 return;
42593 break;
42595 case E_V2DImode:
42596 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42597 if (use_vec_merge)
42598 break;
42600 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42601 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42602 if (elt == 0)
42603 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42604 else
42605 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42606 emit_insn (gen_rtx_SET (target, tmp));
42607 return;
42609 case E_V2DFmode:
42611 rtx op0, op1;
42613 /* For the two element vectors, we implement a VEC_CONCAT with
42614 the extraction of the other element. */
42616 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42617 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42619 if (elt == 0)
42620 op0 = val, op1 = tmp;
42621 else
42622 op0 = tmp, op1 = val;
42624 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42625 emit_insn (gen_rtx_SET (target, tmp));
42627 return;
42629 case E_V4SFmode:
42630 use_vec_merge = TARGET_SSE4_1;
42631 if (use_vec_merge)
42632 break;
42634 switch (elt)
42636 case 0:
42637 use_vec_merge = true;
42638 break;
42640 case 1:
42641 /* tmp = target = A B C D */
42642 tmp = copy_to_reg (target);
42643 /* target = A A B B */
42644 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42645 /* target = X A B B */
42646 ix86_expand_vector_set (false, target, val, 0);
42647 /* target = A X C D */
42648 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42649 const1_rtx, const0_rtx,
42650 GEN_INT (2+4), GEN_INT (3+4)));
42651 return;
42653 case 2:
42654 /* tmp = target = A B C D */
42655 tmp = copy_to_reg (target);
42656 /* tmp = X B C D */
42657 ix86_expand_vector_set (false, tmp, val, 0);
42658 /* target = A B X D */
42659 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42660 const0_rtx, const1_rtx,
42661 GEN_INT (0+4), GEN_INT (3+4)));
42662 return;
42664 case 3:
42665 /* tmp = target = A B C D */
42666 tmp = copy_to_reg (target);
42667 /* tmp = X B C D */
42668 ix86_expand_vector_set (false, tmp, val, 0);
42669 /* target = A B C X */
42670 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42671 const0_rtx, const1_rtx,
42672 GEN_INT (2+4), GEN_INT (0+4)));
42673 return;
42675 default:
42676 gcc_unreachable ();
42678 break;
42680 case E_V4SImode:
42681 use_vec_merge = TARGET_SSE4_1;
42682 if (use_vec_merge)
42683 break;
42685 /* Element 0 handled by vec_merge below. */
42686 if (elt == 0)
42688 use_vec_merge = true;
42689 break;
42692 if (TARGET_SSE2)
42694 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42695 store into element 0, then shuffle them back. */
42697 rtx order[4];
42699 order[0] = GEN_INT (elt);
42700 order[1] = const1_rtx;
42701 order[2] = const2_rtx;
42702 order[3] = GEN_INT (3);
42703 order[elt] = const0_rtx;
42705 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42706 order[1], order[2], order[3]));
42708 ix86_expand_vector_set (false, target, val, 0);
42710 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42711 order[1], order[2], order[3]));
42713 else
42715 /* For SSE1, we have to reuse the V4SF code. */
42716 rtx t = gen_reg_rtx (V4SFmode);
42717 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42718 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42719 emit_move_insn (target, gen_lowpart (mode, t));
42721 return;
42723 case E_V8HImode:
42724 use_vec_merge = TARGET_SSE2;
42725 break;
42726 case E_V4HImode:
42727 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42728 break;
42730 case E_V16QImode:
42731 use_vec_merge = TARGET_SSE4_1;
42732 break;
42734 case E_V8QImode:
42735 break;
42737 case E_V32QImode:
42738 half_mode = V16QImode;
42739 j = 0;
42740 n = 16;
42741 goto half;
42743 case E_V16HImode:
42744 half_mode = V8HImode;
42745 j = 1;
42746 n = 8;
42747 goto half;
42749 case E_V8SImode:
42750 half_mode = V4SImode;
42751 j = 2;
42752 n = 4;
42753 goto half;
42755 case E_V4DImode:
42756 half_mode = V2DImode;
42757 j = 3;
42758 n = 2;
42759 goto half;
42761 case E_V8SFmode:
42762 half_mode = V4SFmode;
42763 j = 4;
42764 n = 4;
42765 goto half;
42767 case E_V4DFmode:
42768 half_mode = V2DFmode;
42769 j = 5;
42770 n = 2;
42771 goto half;
42773 half:
42774 /* Compute offset. */
42775 i = elt / n;
42776 elt %= n;
42778 gcc_assert (i <= 1);
42780 /* Extract the half. */
42781 tmp = gen_reg_rtx (half_mode);
42782 emit_insn (gen_extract[j][i] (tmp, target));
42784 /* Put val in tmp at elt. */
42785 ix86_expand_vector_set (false, tmp, val, elt);
42787 /* Put it back. */
42788 emit_insn (gen_insert[j][i] (target, target, tmp));
42789 return;
42791 case E_V8DFmode:
42792 if (TARGET_AVX512F)
42794 mmode = QImode;
42795 gen_blendm = gen_avx512f_blendmv8df;
42797 break;
42799 case E_V8DImode:
42800 if (TARGET_AVX512F)
42802 mmode = QImode;
42803 gen_blendm = gen_avx512f_blendmv8di;
42805 break;
42807 case E_V16SFmode:
42808 if (TARGET_AVX512F)
42810 mmode = HImode;
42811 gen_blendm = gen_avx512f_blendmv16sf;
42813 break;
42815 case E_V16SImode:
42816 if (TARGET_AVX512F)
42818 mmode = HImode;
42819 gen_blendm = gen_avx512f_blendmv16si;
42821 break;
42823 case E_V32HImode:
42824 if (TARGET_AVX512F && TARGET_AVX512BW)
42826 mmode = SImode;
42827 gen_blendm = gen_avx512bw_blendmv32hi;
42829 break;
42831 case E_V64QImode:
42832 if (TARGET_AVX512F && TARGET_AVX512BW)
42834 mmode = DImode;
42835 gen_blendm = gen_avx512bw_blendmv64qi;
42837 break;
42839 default:
42840 break;
42843 if (mmode != VOIDmode)
42845 tmp = gen_reg_rtx (mode);
42846 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42847 /* The avx512*_blendm<mode> expanders have a different operand order
42848 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42849 elements where the mask is set and the second input operand otherwise;
42850 in {sse,avx}*_*blend* the first input operand is used for elements
42851 where the mask is clear and the second input operand otherwise. */
42852 emit_insn (gen_blendm (target, target, tmp,
42853 force_reg (mmode,
42854 gen_int_mode (1 << elt, mmode))));
42856 else if (use_vec_merge)
42858 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42859 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42860 emit_insn (gen_rtx_SET (target, tmp));
42862 else
42864 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42866 emit_move_insn (mem, target);
42868 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42869 emit_move_insn (tmp, val);
42871 emit_move_insn (target, mem);
42875 void
42876 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42878 machine_mode mode = GET_MODE (vec);
42879 machine_mode inner_mode = GET_MODE_INNER (mode);
42880 bool use_vec_extr = false;
42881 rtx tmp;
42883 switch (mode)
42885 case E_V2SImode:
42886 case E_V2SFmode:
42887 if (!mmx_ok)
42888 break;
42889 /* FALLTHRU */
42891 case E_V2DFmode:
42892 case E_V2DImode:
42893 case E_V2TImode:
42894 case E_V4TImode:
42895 use_vec_extr = true;
42896 break;
42898 case E_V4SFmode:
42899 use_vec_extr = TARGET_SSE4_1;
42900 if (use_vec_extr)
42901 break;
42903 switch (elt)
42905 case 0:
42906 tmp = vec;
42907 break;
42909 case 1:
42910 case 3:
42911 tmp = gen_reg_rtx (mode);
42912 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42913 GEN_INT (elt), GEN_INT (elt),
42914 GEN_INT (elt+4), GEN_INT (elt+4)));
42915 break;
42917 case 2:
42918 tmp = gen_reg_rtx (mode);
42919 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42920 break;
42922 default:
42923 gcc_unreachable ();
42925 vec = tmp;
42926 use_vec_extr = true;
42927 elt = 0;
42928 break;
42930 case E_V4SImode:
42931 use_vec_extr = TARGET_SSE4_1;
42932 if (use_vec_extr)
42933 break;
42935 if (TARGET_SSE2)
42937 switch (elt)
42939 case 0:
42940 tmp = vec;
42941 break;
42943 case 1:
42944 case 3:
42945 tmp = gen_reg_rtx (mode);
42946 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
42947 GEN_INT (elt), GEN_INT (elt),
42948 GEN_INT (elt), GEN_INT (elt)));
42949 break;
42951 case 2:
42952 tmp = gen_reg_rtx (mode);
42953 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
42954 break;
42956 default:
42957 gcc_unreachable ();
42959 vec = tmp;
42960 use_vec_extr = true;
42961 elt = 0;
42963 else
42965 /* For SSE1, we have to reuse the V4SF code. */
42966 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
42967 gen_lowpart (V4SFmode, vec), elt);
42968 return;
42970 break;
42972 case E_V8HImode:
42973 use_vec_extr = TARGET_SSE2;
42974 break;
42975 case E_V4HImode:
42976 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42977 break;
42979 case E_V16QImode:
42980 use_vec_extr = TARGET_SSE4_1;
42981 break;
42983 case E_V8SFmode:
42984 if (TARGET_AVX)
42986 tmp = gen_reg_rtx (V4SFmode);
42987 if (elt < 4)
42988 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
42989 else
42990 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
42991 ix86_expand_vector_extract (false, target, tmp, elt & 3);
42992 return;
42994 break;
42996 case E_V4DFmode:
42997 if (TARGET_AVX)
42999 tmp = gen_reg_rtx (V2DFmode);
43000 if (elt < 2)
43001 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43002 else
43003 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43004 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43005 return;
43007 break;
43009 case E_V32QImode:
43010 if (TARGET_AVX)
43012 tmp = gen_reg_rtx (V16QImode);
43013 if (elt < 16)
43014 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43015 else
43016 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43017 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43018 return;
43020 break;
43022 case E_V16HImode:
43023 if (TARGET_AVX)
43025 tmp = gen_reg_rtx (V8HImode);
43026 if (elt < 8)
43027 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43028 else
43029 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43030 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43031 return;
43033 break;
43035 case E_V8SImode:
43036 if (TARGET_AVX)
43038 tmp = gen_reg_rtx (V4SImode);
43039 if (elt < 4)
43040 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43041 else
43042 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43043 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43044 return;
43046 break;
43048 case E_V4DImode:
43049 if (TARGET_AVX)
43051 tmp = gen_reg_rtx (V2DImode);
43052 if (elt < 2)
43053 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43054 else
43055 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43056 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43057 return;
43059 break;
43061 case E_V32HImode:
43062 if (TARGET_AVX512BW)
43064 tmp = gen_reg_rtx (V16HImode);
43065 if (elt < 16)
43066 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43067 else
43068 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43069 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43070 return;
43072 break;
43074 case E_V64QImode:
43075 if (TARGET_AVX512BW)
43077 tmp = gen_reg_rtx (V32QImode);
43078 if (elt < 32)
43079 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43080 else
43081 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43082 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43083 return;
43085 break;
43087 case E_V16SFmode:
43088 tmp = gen_reg_rtx (V8SFmode);
43089 if (elt < 8)
43090 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43091 else
43092 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43093 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43094 return;
43096 case E_V8DFmode:
43097 tmp = gen_reg_rtx (V4DFmode);
43098 if (elt < 4)
43099 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43100 else
43101 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43102 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43103 return;
43105 case E_V16SImode:
43106 tmp = gen_reg_rtx (V8SImode);
43107 if (elt < 8)
43108 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43109 else
43110 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43111 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43112 return;
43114 case E_V8DImode:
43115 tmp = gen_reg_rtx (V4DImode);
43116 if (elt < 4)
43117 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43118 else
43119 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43120 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43121 return;
43123 case E_V8QImode:
43124 /* ??? Could extract the appropriate HImode element and shift. */
43125 default:
43126 break;
43129 if (use_vec_extr)
43131 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43132 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43134 /* Let the rtl optimizers know about the zero extension performed. */
43135 if (inner_mode == QImode || inner_mode == HImode)
43137 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43138 target = gen_lowpart (SImode, target);
43141 emit_insn (gen_rtx_SET (target, tmp));
43143 else
43145 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43147 emit_move_insn (mem, vec);
43149 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43150 emit_move_insn (target, tmp);
43154 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43155 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43156 The upper bits of DEST are undefined, though they shouldn't cause
43157 exceptions (some bits from src or all zeros are ok). */
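/* Depending on the mode this is implemented with a whole-vector logical
   shift, a movhlps/shufps style shuffle, or a 128-bit-lane permute; see
   the individual cases below.  */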
43159 static void
43160 emit_reduc_half (rtx dest, rtx src, int i)
43162 rtx tem, d = dest;
43163 switch (GET_MODE (src))
43165 case E_V4SFmode:
43166 if (i == 128)
43167 tem = gen_sse_movhlps (dest, src, src);
43168 else
43169 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43170 GEN_INT (1 + 4), GEN_INT (1 + 4));
43171 break;
43172 case E_V2DFmode:
43173 tem = gen_vec_interleave_highv2df (dest, src, src);
43174 break;
43175 case E_V16QImode:
43176 case E_V8HImode:
43177 case E_V4SImode:
43178 case E_V2DImode:
43179 d = gen_reg_rtx (V1TImode);
43180 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43181 GEN_INT (i / 2));
43182 break;
43183 case E_V8SFmode:
43184 if (i == 256)
43185 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43186 else
43187 tem = gen_avx_shufps256 (dest, src, src,
43188 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43189 break;
43190 case E_V4DFmode:
43191 if (i == 256)
43192 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43193 else
43194 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43195 break;
43196 case E_V32QImode:
43197 case E_V16HImode:
43198 case E_V8SImode:
43199 case E_V4DImode:
43200 if (i == 256)
43202 if (GET_MODE (dest) != V4DImode)
43203 d = gen_reg_rtx (V4DImode);
43204 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43205 gen_lowpart (V4DImode, src),
43206 const1_rtx);
43208 else
43210 d = gen_reg_rtx (V2TImode);
43211 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43212 GEN_INT (i / 2));
43214 break;
43215 case E_V64QImode:
43216 case E_V32HImode:
43217 case E_V16SImode:
43218 case E_V16SFmode:
43219 case E_V8DImode:
43220 case E_V8DFmode:
43221 if (i > 128)
43222 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43223 gen_lowpart (V16SImode, src),
43224 gen_lowpart (V16SImode, src),
43225 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43226 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43227 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43228 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43229 GEN_INT (0xC), GEN_INT (0xD),
43230 GEN_INT (0xE), GEN_INT (0xF),
43231 GEN_INT (0x10), GEN_INT (0x11),
43232 GEN_INT (0x12), GEN_INT (0x13),
43233 GEN_INT (0x14), GEN_INT (0x15),
43234 GEN_INT (0x16), GEN_INT (0x17));
43235 else
43236 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43237 gen_lowpart (V16SImode, src),
43238 GEN_INT (i == 128 ? 0x2 : 0x1),
43239 GEN_INT (0x3),
43240 GEN_INT (0x3),
43241 GEN_INT (0x3),
43242 GEN_INT (i == 128 ? 0x6 : 0x5),
43243 GEN_INT (0x7),
43244 GEN_INT (0x7),
43245 GEN_INT (0x7),
43246 GEN_INT (i == 128 ? 0xA : 0x9),
43247 GEN_INT (0xB),
43248 GEN_INT (0xB),
43249 GEN_INT (0xB),
43250 GEN_INT (i == 128 ? 0xE : 0xD),
43251 GEN_INT (0xF),
43252 GEN_INT (0xF),
43253 GEN_INT (0xF));
43254 break;
43255 default:
43256 gcc_unreachable ();
43258 emit_insn (tem);
43259 if (d != dest)
43260 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43263 /* Expand a vector reduction. FN is the binary pattern to reduce;
43264 DEST is the destination; IN is the input vector. */
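/* For example, a V4SImode reduction takes two rounds: the first combines
   the upper 64 bits with the lower 64 bits (leaving FN (v2, v0) and
   FN (v3, v1) in the two low elements), the second combines those two
   elements.  The reduced value ends up in element 0 of DEST.  */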
43266 void
43267 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43269 rtx half, dst, vec = in;
43270 machine_mode mode = GET_MODE (in);
43271 int i;
43273 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
43274 if (TARGET_SSE4_1
43275 && mode == V8HImode
43276 && fn == gen_uminv8hi3)
43278 emit_insn (gen_sse4_1_phminposuw (dest, in));
43279 return;
43282 for (i = GET_MODE_BITSIZE (mode);
43283 i > GET_MODE_UNIT_BITSIZE (mode);
43284 i >>= 1)
43286 half = gen_reg_rtx (mode);
43287 emit_reduc_half (half, vec, i);
43288 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43289 dst = dest;
43290 else
43291 dst = gen_reg_rtx (mode);
43292 emit_insn (fn (dst, half, vec));
43293 vec = dst;
43297 /* Target hook for scalar_mode_supported_p. */
43298 static bool
43299 ix86_scalar_mode_supported_p (scalar_mode mode)
43301 if (DECIMAL_FLOAT_MODE_P (mode))
43302 return default_decimal_float_supported_p ();
43303 else if (mode == TFmode)
43304 return true;
43305 else
43306 return default_scalar_mode_supported_p (mode);
43309 /* Implements target hook vector_mode_supported_p. */
43310 static bool
43311 ix86_vector_mode_supported_p (machine_mode mode)
43313 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43314 return true;
43315 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43316 return true;
43317 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43318 return true;
43319 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43320 return true;
43321 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43322 return true;
43323 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43324 return true;
43325 return false;
43328 /* Target hook for c_mode_for_suffix. */
43329 static machine_mode
43330 ix86_c_mode_for_suffix (char suffix)
43332 if (suffix == 'q')
43333 return TFmode;
43334 if (suffix == 'w')
43335 return XFmode;
43337 return VOIDmode;
43340 /* Worker function for TARGET_MD_ASM_ADJUST.
43342 We implement asm flag outputs, and maintain source compatibility
43343 with the old cc0-based compiler. */
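/* A flag output in user code uses a constraint of the form "=@cc<cond>",
   where <cond> is one of the condition letters handled below, optionally
   prefixed with 'n' to negate it.  An illustrative use (variable names
   are arbitrary):

     bool carry;
     asm ("bt %1, %2" : "=@ccc" (carry) : "r" (bit), "r" (word));

   The flags register becomes the real asm output, and the requested
   condition is materialized into the C object after the asm.  */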
43345 static rtx_insn *
43346 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43347 vec<const char *> &constraints,
43348 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43350 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43351 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43353 bool saw_asm_flag = false;
43355 start_sequence ();
43356 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43358 const char *con = constraints[i];
43359 if (strncmp (con, "=@cc", 4) != 0)
43360 continue;
43361 con += 4;
43362 if (strchr (con, ',') != NULL)
43364 error ("alternatives not allowed in asm flag output");
43365 continue;
43368 bool invert = false;
43369 if (con[0] == 'n')
43370 invert = true, con++;
43372 machine_mode mode = CCmode;
43373 rtx_code code = UNKNOWN;
43375 switch (con[0])
43377 case 'a':
43378 if (con[1] == 0)
43379 mode = CCAmode, code = EQ;
43380 else if (con[1] == 'e' && con[2] == 0)
43381 mode = CCCmode, code = NE;
43382 break;
43383 case 'b':
43384 if (con[1] == 0)
43385 mode = CCCmode, code = EQ;
43386 else if (con[1] == 'e' && con[2] == 0)
43387 mode = CCAmode, code = NE;
43388 break;
43389 case 'c':
43390 if (con[1] == 0)
43391 mode = CCCmode, code = EQ;
43392 break;
43393 case 'e':
43394 if (con[1] == 0)
43395 mode = CCZmode, code = EQ;
43396 break;
43397 case 'g':
43398 if (con[1] == 0)
43399 mode = CCGCmode, code = GT;
43400 else if (con[1] == 'e' && con[2] == 0)
43401 mode = CCGCmode, code = GE;
43402 break;
43403 case 'l':
43404 if (con[1] == 0)
43405 mode = CCGCmode, code = LT;
43406 else if (con[1] == 'e' && con[2] == 0)
43407 mode = CCGCmode, code = LE;
43408 break;
43409 case 'o':
43410 if (con[1] == 0)
43411 mode = CCOmode, code = EQ;
43412 break;
43413 case 'p':
43414 if (con[1] == 0)
43415 mode = CCPmode, code = EQ;
43416 break;
43417 case 's':
43418 if (con[1] == 0)
43419 mode = CCSmode, code = EQ;
43420 break;
43421 case 'z':
43422 if (con[1] == 0)
43423 mode = CCZmode, code = EQ;
43424 break;
43426 if (code == UNKNOWN)
43428 error ("unknown asm flag output %qs", constraints[i]);
43429 continue;
43431 if (invert)
43432 code = reverse_condition (code);
43434 rtx dest = outputs[i];
43435 if (!saw_asm_flag)
43437 /* This is the first asm flag output. Here we put the flags
43438 register in as the real output and adjust the condition to
43439 allow it. */
43440 constraints[i] = "=Bf";
43441 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43442 saw_asm_flag = true;
43444 else
43446 /* We don't need the flags register as output twice. */
43447 constraints[i] = "=X";
43448 outputs[i] = gen_rtx_SCRATCH (SImode);
43451 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43452 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43454 machine_mode dest_mode = GET_MODE (dest);
43455 if (!SCALAR_INT_MODE_P (dest_mode))
43457 error ("invalid type for asm flag output");
43458 continue;
43461 if (dest_mode == DImode && !TARGET_64BIT)
43462 dest_mode = SImode;
43464 if (dest_mode != QImode)
43466 rtx destqi = gen_reg_rtx (QImode);
43467 emit_insn (gen_rtx_SET (destqi, x));
43469 if (TARGET_ZERO_EXTEND_WITH_AND
43470 && optimize_function_for_speed_p (cfun))
43472 x = force_reg (dest_mode, const0_rtx);
43474 emit_insn (gen_movstrictqi
43475 (gen_lowpart (QImode, x), destqi));
43477 else
43478 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43481 if (dest_mode != GET_MODE (dest))
43483 rtx tmp = gen_reg_rtx (SImode);
43485 emit_insn (gen_rtx_SET (tmp, x));
43486 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43488 else
43489 emit_insn (gen_rtx_SET (dest, x));
43491 rtx_insn *seq = get_insns ();
43492 end_sequence ();
43494 if (saw_asm_flag)
43495 return seq;
43496 else
43498 /* If we had no asm flag outputs, clobber the flags. */
43499 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43500 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43501 return NULL;
43505 /* Implements the target hook targetm.asm.encode_section_info. */
43507 static void ATTRIBUTE_UNUSED
43508 ix86_encode_section_info (tree decl, rtx rtl, int first)
43510 default_encode_section_info (decl, rtl, first);
43512 if (ix86_in_large_data_p (decl))
43513 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43516 /* Worker function for REVERSE_CONDITION. */
43518 enum rtx_code
43519 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43521 return (mode == CCFPmode
43522 ? reverse_condition_maybe_unordered (code)
43523 : reverse_condition (code));
43526 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43527 to OPERANDS[0]. */
43529 const char *
43530 output_387_reg_move (rtx_insn *insn, rtx *operands)
43532 if (REG_P (operands[0]))
43534 if (REG_P (operands[1])
43535 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43537 if (REGNO (operands[0]) == FIRST_STACK_REG)
43538 return output_387_ffreep (operands, 0);
43539 return "fstp\t%y0";
43541 if (STACK_TOP_P (operands[0]))
43542 return "fld%Z1\t%y1";
43543 return "fst\t%y0";
43545 else if (MEM_P (operands[0]))
43547 gcc_assert (REG_P (operands[1]));
43548 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43549 return "fstp%Z0\t%y0";
43550 else
43552 /* There is no non-popping store to memory for XFmode.
43553 So if we need one, follow the store with a load. */
43554 if (GET_MODE (operands[0]) == XFmode)
43555 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43556 else
43557 return "fst%Z0\t%y0";
43560 else
43561 gcc_unreachable();
43564 /* Output code to perform a conditional jump to LABEL if the C2 flag in
43565 the FP status register is set. */
43567 void
43568 ix86_emit_fp_unordered_jump (rtx label)
43570 rtx reg = gen_reg_rtx (HImode);
43571 rtx temp;
43573 emit_insn (gen_x86_fnstsw_1 (reg));
43575 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43577 emit_insn (gen_x86_sahf_1 (reg));
43579 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43580 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43582 else
43584 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
43586 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43587 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43590 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43591 gen_rtx_LABEL_REF (VOIDmode, label),
43592 pc_rtx);
43593 temp = gen_rtx_SET (pc_rtx, temp);
43595 emit_jump_insn (temp);
43596 predict_jump (REG_BR_PROB_BASE * 10 / 100);
43599 /* Output code to perform a log1p XFmode calculation. */
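/* In outline: fyl2xp1 computes ST(1) * log2 (ST(0) + 1.0) but is only
   specified for |ST(0)| < 1 - sqrt(2)/2 ~= 0.29289 (the constant tested
   below), so for small |op1| we use fldln2; fyl2xp1 directly, and
   otherwise fall back to fldln2; fld1; fadd; fyl2x.  Either way the
   result is ln(2) * log2 (1 + op1) = log1p (op1).  */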
43601 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43603 rtx_code_label *label1 = gen_label_rtx ();
43604 rtx_code_label *label2 = gen_label_rtx ();
43606 rtx tmp = gen_reg_rtx (XFmode);
43607 rtx tmp2 = gen_reg_rtx (XFmode);
43608 rtx test;
43610 emit_insn (gen_absxf2 (tmp, op1));
43611 test = gen_rtx_GE (VOIDmode, tmp,
43612 const_double_from_real_value (
43613 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43614 XFmode));
43615 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43617 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43618 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43619 emit_jump (label2);
43621 emit_label (label1);
43622 emit_move_insn (tmp, CONST1_RTX (XFmode));
43623 emit_insn (gen_addxf3 (tmp, op1, tmp));
43624 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43625 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43627 emit_label (label2);
43630 /* Emit code for round calculation. */
43631 void ix86_emit_i387_round (rtx op0, rtx op1)
43633 machine_mode inmode = GET_MODE (op1);
43634 machine_mode outmode = GET_MODE (op0);
43635 rtx e1, e2, res, tmp, tmp1, half;
43636 rtx scratch = gen_reg_rtx (HImode);
43637 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43638 rtx_code_label *jump_label = gen_label_rtx ();
43639 rtx insn;
43640 rtx (*gen_abs) (rtx, rtx);
43641 rtx (*gen_neg) (rtx, rtx);
43643 switch (inmode)
43645 case E_SFmode:
43646 gen_abs = gen_abssf2;
43647 break;
43648 case E_DFmode:
43649 gen_abs = gen_absdf2;
43650 break;
43651 case E_XFmode:
43652 gen_abs = gen_absxf2;
43653 break;
43654 default:
43655 gcc_unreachable ();
43658 switch (outmode)
43660 case E_SFmode:
43661 gen_neg = gen_negsf2;
43662 break;
43663 case E_DFmode:
43664 gen_neg = gen_negdf2;
43665 break;
43666 case E_XFmode:
43667 gen_neg = gen_negxf2;
43668 break;
43669 case E_HImode:
43670 gen_neg = gen_neghi2;
43671 break;
43672 case E_SImode:
43673 gen_neg = gen_negsi2;
43674 break;
43675 case E_DImode:
43676 gen_neg = gen_negdi2;
43677 break;
43678 default:
43679 gcc_unreachable ();
43682 e1 = gen_reg_rtx (inmode);
43683 e2 = gen_reg_rtx (inmode);
43684 res = gen_reg_rtx (outmode);
43686 half = const_double_from_real_value (dconsthalf, inmode);
43688 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
43690 /* scratch = fxam(op1) */
43691 emit_insn (gen_rtx_SET (scratch,
43692 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43693 UNSPEC_FXAM)));
43694 /* e1 = fabs(op1) */
43695 emit_insn (gen_abs (e1, op1));
43697 /* e2 = e1 + 0.5 */
43698 half = force_reg (inmode, half);
43699 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43701 /* res = floor(e2) */
43702 if (inmode != XFmode)
43704 tmp1 = gen_reg_rtx (XFmode);
43706 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43708 else
43709 tmp1 = e2;
43711 switch (outmode)
43713 case E_SFmode:
43714 case E_DFmode:
43716 rtx tmp0 = gen_reg_rtx (XFmode);
43718 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43720 emit_insn (gen_rtx_SET (res,
43721 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43722 UNSPEC_TRUNC_NOOP)));
43724 break;
43725 case E_XFmode:
43726 emit_insn (gen_frndintxf2_floor (res, tmp1));
43727 break;
43728 case E_HImode:
43729 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43730 break;
43731 case E_SImode:
43732 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43733 break;
43734 case E_DImode:
43735 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43736 break;
43737 default:
43738 gcc_unreachable ();
43741 /* flags = signbit(a) */
43742 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
43744 /* if (flags) then res = -res */
43745 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43746 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43747 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43748 pc_rtx);
43749 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43750 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43751 JUMP_LABEL (insn) = jump_label;
43753 emit_insn (gen_neg (res, res));
43755 emit_label (jump_label);
43756 LABEL_NUSES (jump_label) = 1;
43758 emit_move_insn (op0, res);
43761 /* Output code to perform a Newton-Raphson approximation of a single precision
43762 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43764 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43766 rtx x0, x1, e0, e1;
43768 x0 = gen_reg_rtx (mode);
43769 e0 = gen_reg_rtx (mode);
43770 e1 = gen_reg_rtx (mode);
43771 x1 = gen_reg_rtx (mode);
43773 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
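/* This is one Newton-Raphson step for f (x) = 1/x - b,
     x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0) * x0,
   applied to the hardware estimate x0 = rcp (b) (roughly 12 bits of
   precision for rcpps, 14 bits for rcp14); one step approximately
   doubles the number of correct bits.  */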
43775 b = force_reg (mode, b);
43777 /* x0 = rcp(b) estimate */
43778 if (mode == V16SFmode || mode == V8DFmode)
43780 if (TARGET_AVX512ER)
43782 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43783 UNSPEC_RCP28)));
43784 /* res = a * x0 */
43785 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43786 return;
43788 else
43789 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43790 UNSPEC_RCP14)));
43792 else
43793 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43794 UNSPEC_RCP)));
43796 /* e0 = x0 * b */
43797 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43799 /* e0 = x0 * e0 */
43800 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43802 /* e1 = x0 + x0 */
43803 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43805 /* x1 = e1 - e0 */
43806 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43808 /* res = a * x1 */
43809 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
43812 /* Output code to perform a Newton-Raphson approximation of a
43813 single precision floating point [reciprocal] square root. */
43815 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43817 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43818 REAL_VALUE_TYPE r;
43819 int unspec;
43821 x0 = gen_reg_rtx (mode);
43822 e0 = gen_reg_rtx (mode);
43823 e1 = gen_reg_rtx (mode);
43824 e2 = gen_reg_rtx (mode);
43825 e3 = gen_reg_rtx (mode);
43827 if (TARGET_AVX512ER && mode == V16SFmode)
43829 if (recip)
43830 /* res = rsqrt28(a) estimate */
43831 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43832 UNSPEC_RSQRT28)));
43833 else
43835 /* x0 = rsqrt28(a) estimate */
43836 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43837 UNSPEC_RSQRT28)));
43838 /* res = rcp28(x0) estimate */
43839 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43840 UNSPEC_RCP28)));
43842 return;
43845 real_from_integer (&r, VOIDmode, -3, SIGNED);
43846 mthree = const_double_from_real_value (r, SFmode);
43848 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43849 mhalf = const_double_from_real_value (r, SFmode);
43850 unspec = UNSPEC_RSQRT;
43852 if (VECTOR_MODE_P (mode))
43854 mthree = ix86_build_const_vector (mode, true, mthree);
43855 mhalf = ix86_build_const_vector (mode, true, mhalf);
43856 /* There is no 512-bit rsqrt. There is however rsqrt14. */
43857 if (GET_MODE_SIZE (mode) == 64)
43858 unspec = UNSPEC_RSQRT14;
43861 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43862 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
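/* Both are one Newton-Raphson step x1 = x0 * (3 - a * x0 * x0) / 2 for
   f (x) = 1/(x*x) - a, applied to the hardware estimate x0 = rsqrtss (a);
   the sqrt variant folds the final multiply by a into e0 = a * x0.  */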
43864 a = force_reg (mode, a);
43866 /* x0 = rsqrt(a) estimate */
43867 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43868 unspec)));
43870 /* If a == 0.0, zero the rsqrt (0.0) = inf estimate to prevent a NaN in sqrt (0.0) = 0.0 * inf. */
43871 if (!recip)
43873 rtx zero = force_reg (mode, CONST0_RTX(mode));
43874 rtx mask;
43876 /* Handle masked compare. */
43877 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43879 mask = gen_reg_rtx (HImode);
43880 /* Imm value 0x4 corresponds to not-equal comparison. */
43881 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43882 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43884 else
43886 mask = gen_reg_rtx (mode);
43887 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43888 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43892 /* e0 = x0 * a */
43893 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43894 /* e1 = e0 * x0 */
43895 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43897 /* e2 = e1 - 3. */
43898 mthree = force_reg (mode, mthree);
43899 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43901 mhalf = force_reg (mode, mhalf);
43902 if (recip)
43903 /* e3 = -.5 * x0 */
43904 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43905 else
43906 /* e3 = -.5 * e0 */
43907 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43908 /* ret = e2 * e3 */
43909 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
43912 #ifdef TARGET_SOLARIS
43913 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43915 static void
43916 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43917 tree decl)
43919 /* With Binutils 2.15, the "@unwind" marker must be specified on
43920 every occurrence of the ".eh_frame" section, not just the first
43921 one. */
43922 if (TARGET_64BIT
43923 && strcmp (name, ".eh_frame") == 0)
43925 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
43926 flags & SECTION_WRITE ? "aw" : "a");
43927 return;
43930 #ifndef USE_GAS
43931 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
43933 solaris_elf_asm_comdat_section (name, flags, decl);
43934 return;
43936 #endif
43938 default_elf_asm_named_section (name, flags, decl);
43940 #endif /* TARGET_SOLARIS */
43942 /* Return the mangling of TYPE if it is an extended fundamental type. */
43944 static const char *
43945 ix86_mangle_type (const_tree type)
43947 type = TYPE_MAIN_VARIANT (type);
43949 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
43950 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
43951 return NULL;
43953 switch (TYPE_MODE (type))
43955 case E_TFmode:
43956 /* __float128 is "g". */
43957 return "g";
43958 case E_XFmode:
43959 /* "long double" or __float80 is "e". */
43960 return "e";
43961 default:
43962 return NULL;
43966 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
43968 static tree
43969 ix86_stack_protect_guard (void)
43971 if (TARGET_SSP_TLS_GUARD)
43973 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
43974 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
43975 tree type = build_qualified_type (type_node, qual);
43976 tree t;
43978 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
43980 t = ix86_tls_stack_chk_guard_decl;
43982 if (t == NULL)
43984 rtx x;
43986 t = build_decl
43987 (UNKNOWN_LOCATION, VAR_DECL,
43988 get_identifier (ix86_stack_protector_guard_symbol_str),
43989 type);
43990 TREE_STATIC (t) = 1;
43991 TREE_PUBLIC (t) = 1;
43992 DECL_EXTERNAL (t) = 1;
43993 TREE_USED (t) = 1;
43994 TREE_THIS_VOLATILE (t) = 1;
43995 DECL_ARTIFICIAL (t) = 1;
43996 DECL_IGNORED_P (t) = 1;
43998 /* Do not share RTL as the declaration is visible outside of
43999 current function. */
44000 x = DECL_RTL (t);
44001 RTX_FLAG (x, used) = 1;
44003 ix86_tls_stack_chk_guard_decl = t;
44006 else
44008 tree asptrtype = build_pointer_type (type);
44010 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
44011 t = build2 (MEM_REF, asptrtype, t,
44012 build_int_cst (asptrtype, 0));
44015 return t;
44018 return default_stack_protect_guard ();
44021 /* For 32-bit code we can save PIC register setup by using
44022 __stack_chk_fail_local hidden function instead of calling
44023 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
44024 register, so it is better to call __stack_chk_fail directly. */
44026 static tree ATTRIBUTE_UNUSED
44027 ix86_stack_protect_fail (void)
44029 return TARGET_64BIT
44030 ? default_external_stack_protect_fail ()
44031 : default_hidden_stack_protect_fail ();
44034 /* Select a format to encode pointers in exception handling data. CODE
44035 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44036 true if the symbol may be affected by dynamic relocations.
44038 ??? All x86 object file formats are capable of representing this.
44039 After all, the relocation needed is the same as for the call insn.
44040 Whether or not a particular assembler allows us to enter such, I
44041 guess we'll have to see. */
44043 asm_preferred_eh_data_format (int code, int global)
44045 if (flag_pic)
44047 int type = DW_EH_PE_sdata8;
44048 if (!TARGET_64BIT
44049 || ix86_cmodel == CM_SMALL_PIC
44050 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44051 type = DW_EH_PE_sdata4;
44052 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44054 if (ix86_cmodel == CM_SMALL
44055 || (ix86_cmodel == CM_MEDIUM && code))
44056 return DW_EH_PE_udata4;
44057 return DW_EH_PE_absptr;
44060 /* Expand copysign from SIGN to the positive value ABS_VALUE
44061 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44062 the sign-bit. */
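/* In other words RESULT = ABS_VALUE | (SIGN & sign-bit), where ABS_VALUE
   is assumed to already have its sign bit clear.  */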
44063 static void
44064 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44066 machine_mode mode = GET_MODE (sign);
44067 rtx sgn = gen_reg_rtx (mode);
44068 if (mask == NULL_RTX)
44070 machine_mode vmode;
44072 if (mode == SFmode)
44073 vmode = V4SFmode;
44074 else if (mode == DFmode)
44075 vmode = V2DFmode;
44076 else
44077 vmode = mode;
44079 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44080 if (!VECTOR_MODE_P (mode))
44082 /* We need to generate a scalar mode mask in this case. */
44083 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44084 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44085 mask = gen_reg_rtx (mode);
44086 emit_insn (gen_rtx_SET (mask, tmp));
44089 else
44090 mask = gen_rtx_NOT (mode, mask);
44091 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44092 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44095 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44096 mask for masking out the sign-bit is stored in *SMASK, if that is
44097 non-null. */
44098 static rtx
44099 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44101 machine_mode vmode, mode = GET_MODE (op0);
44102 rtx xa, mask;
44104 xa = gen_reg_rtx (mode);
44105 if (mode == SFmode)
44106 vmode = V4SFmode;
44107 else if (mode == DFmode)
44108 vmode = V2DFmode;
44109 else
44110 vmode = mode;
44111 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44112 if (!VECTOR_MODE_P (mode))
44114 /* We need to generate a scalar mode mask in this case. */
44115 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44116 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44117 mask = gen_reg_rtx (mode);
44118 emit_insn (gen_rtx_SET (mask, tmp));
44120 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44122 if (smask)
44123 *smask = mask;
44125 return xa;
44128 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44129 swapping the operands if SWAP_OPERANDS is true. The expanded
44130 code is a forward jump to a newly created label in case the
44131 comparison is true. The generated label rtx is returned. */
44132 static rtx_code_label *
44133 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44134 bool swap_operands)
44136 bool unordered_compare = ix86_unordered_fp_compare (code);
44137 rtx_code_label *label;
44138 rtx tmp, reg;
44140 if (swap_operands)
44141 std::swap (op0, op1);
44143 label = gen_label_rtx ();
44144 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
44145 if (unordered_compare)
44146 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
44147 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
44148 emit_insn (gen_rtx_SET (reg, tmp));
44149 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
44150 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44151 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44152 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44153 JUMP_LABEL (tmp) = label;
44155 return label;
44158 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44159 using comparison code CODE. Operands are swapped for the comparison if
44160 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44161 static rtx
44162 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44163 bool swap_operands)
44165 rtx (*insn)(rtx, rtx, rtx, rtx);
44166 machine_mode mode = GET_MODE (op0);
44167 rtx mask = gen_reg_rtx (mode);
44169 if (swap_operands)
44170 std::swap (op0, op1);
44172 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44174 emit_insn (insn (mask, op0, op1,
44175 gen_rtx_fmt_ee (code, mode, op0, op1)));
44176 return mask;
44179 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44180 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
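/* For a value X with |X| < 2**52 (2**23 for SFmode), computing
   X + TWO52 - TWO52 (with TWO52 carrying the sign of X, or X nonnegative)
   yields X rounded to an integer in the current rounding mode, because the
   intermediate sum has no bits below the units place; the rint/floor/ceil
   expanders below are built on this.  */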
44181 static rtx
44182 ix86_gen_TWO52 (machine_mode mode)
44184 REAL_VALUE_TYPE TWO52r;
44185 rtx TWO52;
44187 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44188 TWO52 = const_double_from_real_value (TWO52r, mode);
44189 TWO52 = force_reg (mode, TWO52);
44191 return TWO52;
44194 /* Expand SSE sequence for computing lround from OP1 storing
44195 into OP0. */
44196 void
44197 ix86_expand_lround (rtx op0, rtx op1)
44199 /* C code for the stuff we're doing below:
44200 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44201 return (long)tmp;
44203 machine_mode mode = GET_MODE (op1);
44204 const struct real_format *fmt;
44205 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44206 rtx adj;
44208 /* load nextafter (0.5, 0.0) */
44209 fmt = REAL_MODE_FORMAT (mode);
44210 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44211 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44213 /* adj = copysign (0.5, op1) */
44214 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44215 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44217 /* adj = op1 + adj */
44218 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44220 /* op0 = (imode)adj */
44221 expand_fix (op0, adj, 0);
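/* Illustrative note, not part of GCC: pred_half is nextafter (0.5, 0.0),
   i.e. exactly 0.5 - 2**(-p-1), rather than 0.5 itself.  With a plain 0.5
   the largest double below one half would be rounded the wrong way:

     double x   = 0x1.fffffffffffffp-2;  // nextafter (0.5, 0.0)
     double adj = 0x1.fffffffffffffp-2;  // pred_half, the same value
     long wrong = (long) (x + 0.5);      // 1: the sum rounds up to 1.0
     long right = (long) (x + adj);      // 0: sum is 1 - 0x1p-53, truncates to 0

   so biasing by the predecessor of 0.5 never pushes the truncation across
   the next integer spuriously.  */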
44224 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
44225 into OPERAND0.  */
44226 void
44227 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44229 /* C code for the stuff we're doing below (for do_floor):
44230 xi = (long)op1;
44231 xi -= (double)xi > op1 ? 1 : 0;
44232 return xi;
44234 machine_mode fmode = GET_MODE (op1);
44235 machine_mode imode = GET_MODE (op0);
44236 rtx ireg, freg, tmp;
44237 rtx_code_label *label;
44239 /* reg = (long)op1 */
44240 ireg = gen_reg_rtx (imode);
44241 expand_fix (ireg, op1, 0);
44243 /* freg = (double)reg */
44244 freg = gen_reg_rtx (fmode);
44245 expand_float (freg, ireg, 0);
44247 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44248 label = ix86_expand_sse_compare_and_jump (UNLE,
44249 freg, op1, !do_floor);
44250 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44251 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44252 emit_move_insn (ireg, tmp);
44254 emit_label (label);
44255 LABEL_NUSES (label) = 1;
44257 emit_move_insn (op0, ireg);
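/* Illustrative note, not part of GCC: the emitted sequence computes, in
   effect (assuming the value fits in a long; names are made up):

     long lfloor_sketch (double x)
     { long i = (long) x; if ((double) i > x) i -= 1; return i; }
     long lceil_sketch (double x)
     { long i = (long) x; if ((double) i < x) i += 1; return i; }

   The conversion truncates toward zero, so floor only needs a downward
   fixup for negative inputs and ceil an upward fixup for positive ones;
   swapping the comparison operands (the !do_floor argument above) is what
   turns the same compare-and-jump helper into the test needed for ceil.  */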
44260 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
44261 void
44262 ix86_expand_rint (rtx operand0, rtx operand1)
44264 /* C code for the stuff we're doing below:
44265 xa = fabs (operand1);
44266 if (!isless (xa, 2**52))
44267 return operand1;
44268 two52 = 2**52;
44269 if (flag_rounding_math)
44271 two52 = copysign (two52, operand1);
44272 xa = operand1;
44274 xa = xa + two52 - two52;
44275 return copysign (xa, operand1);
44277 machine_mode mode = GET_MODE (operand0);
44278 rtx res, xa, TWO52, two52, mask;
44279 rtx_code_label *label;
44281 res = gen_reg_rtx (mode);
44282 emit_move_insn (res, operand1);
44284 /* xa = abs (operand1) */
44285 xa = ix86_expand_sse_fabs (res, &mask);
44287 /* if (!isless (xa, TWO52)) goto label; */
44288 TWO52 = ix86_gen_TWO52 (mode);
44289 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44291 two52 = TWO52;
44292 if (flag_rounding_math)
44294 two52 = gen_reg_rtx (mode);
44295 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
44296 xa = res;
44299 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
44300 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
44302 ix86_sse_copysign_to_positive (res, xa, res, mask);
44304 emit_label (label);
44305 LABEL_NUSES (label) = 1;
44307 emit_move_insn (operand0, res);
44310 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44311 into OPERAND0. */
44312 void
44313 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44315 /* C code for the stuff we expand below.
44316 double xa = fabs (x), x2;
44317 if (!isless (xa, TWO52))
44318 return x;
44319 xa = xa + TWO52 - TWO52;
44320 x2 = copysign (xa, x);
44321 Compensate. Floor:
44322 if (x2 > x)
44323 x2 -= 1;
44324 Compensate. Ceil:
44325 if (x2 < x)
44326 x2 -= -1;
44327 return x2;
44329 machine_mode mode = GET_MODE (operand0);
44330 rtx xa, TWO52, tmp, one, res, mask;
44331 rtx_code_label *label;
44333 TWO52 = ix86_gen_TWO52 (mode);
44335 /* Temporary for holding the result, initialized to the input
44336 operand to ease control flow. */
44337 res = gen_reg_rtx (mode);
44338 emit_move_insn (res, operand1);
44340 /* xa = abs (operand1) */
44341 xa = ix86_expand_sse_fabs (res, &mask);
44343 /* if (!isless (xa, TWO52)) goto label; */
44344 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44346 /* xa = xa + TWO52 - TWO52; */
44347 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44348 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44350 /* xa = copysign (xa, operand1) */
44351 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44353 /* generate 1.0 or -1.0 */
44354 one = force_reg (mode,
44355 const_double_from_real_value (do_floor
44356 ? dconst1 : dconstm1, mode));
44358 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44359 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44360 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44361 /* We always need to subtract here to preserve signed zero. */
44362 tmp = expand_simple_binop (mode, MINUS,
44363 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44364 emit_move_insn (res, tmp);
44366 emit_label (label);
44367 LABEL_NUSES (label) = 1;
44369 emit_move_insn (operand0, res);
44372 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44373 into OPERAND0. */
44374 void
44375 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44377 /* C code for the stuff we expand below.
44378 double xa = fabs (x), x2;
44379 if (!isless (xa, TWO52))
44380 return x;
44381 x2 = (double)(long)x;
44382 Compensate. Floor:
44383 if (x2 > x)
44384 x2 -= 1;
44385 Compensate. Ceil:
44386 if (x2 < x)
44387 x2 += 1;
44388 if (HONOR_SIGNED_ZEROS (mode))
44389 return copysign (x2, x);
44390 return x2;
44392 machine_mode mode = GET_MODE (operand0);
44393 rtx xa, xi, TWO52, tmp, one, res, mask;
44394 rtx_code_label *label;
44396 TWO52 = ix86_gen_TWO52 (mode);
44398 /* Temporary for holding the result, initialized to the input
44399 operand to ease control flow. */
44400 res = gen_reg_rtx (mode);
44401 emit_move_insn (res, operand1);
44403 /* xa = abs (operand1) */
44404 xa = ix86_expand_sse_fabs (res, &mask);
44406 /* if (!isless (xa, TWO52)) goto label; */
44407 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44409 /* xa = (double)(long)x */
44410 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44411 expand_fix (xi, res, 0);
44412 expand_float (xa, xi, 0);
44414 /* generate 1.0 */
44415 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44417 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44418 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44419 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44420 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44421 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44422 emit_move_insn (res, tmp);
44424 if (HONOR_SIGNED_ZEROS (mode))
44425 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44427 emit_label (label);
44428 LABEL_NUSES (label) = 1;
44430 emit_move_insn (operand0, res);
44433 /* Expand SSE sequence for computing round from OPERAND1 storing
44434 into OPERAND0.  A sequence that works without relying on DImode truncation
44435 via cvttsd2siq, which is only available on 64-bit targets.  */
44436 void
44437 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44439 /* C code for the stuff we expand below.
44440 double xa = fabs (x), xa2, x2;
44441 if (!isless (xa, TWO52))
44442 return x;
44443 Using the absolute value and copying back sign makes
44444 -0.0 -> -0.0 correct.
44445 xa2 = xa + TWO52 - TWO52;
44446 Compensate.
44447 dxa = xa2 - xa;
44448 if (dxa <= -0.5)
44449 xa2 += 1;
44450 else if (dxa > 0.5)
44451 xa2 -= 1;
44452 x2 = copysign (xa2, x);
44453 return x2;
44455 machine_mode mode = GET_MODE (operand0);
44456 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44457 rtx_code_label *label;
44459 TWO52 = ix86_gen_TWO52 (mode);
44461 /* Temporary for holding the result, initialized to the input
44462 operand to ease control flow. */
44463 res = gen_reg_rtx (mode);
44464 emit_move_insn (res, operand1);
44466 /* xa = abs (operand1) */
44467 xa = ix86_expand_sse_fabs (res, &mask);
44469 /* if (!isless (xa, TWO52)) goto label; */
44470 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44472 /* xa2 = xa + TWO52 - TWO52; */
44473 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44474 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44476 /* dxa = xa2 - xa; */
44477 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44479 /* generate 0.5, 1.0 and -0.5 */
44480 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44481 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44482 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44483 0, OPTAB_DIRECT);
44485 /* Compensate. */
44486 tmp = gen_reg_rtx (mode);
44487 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44488 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44489 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44490 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44491 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44492 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44493 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44494 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44496 /* res = copysign (xa2, operand1) */
44497 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44499 emit_label (label);
44500 LABEL_NUSES (label) = 1;
44502 emit_move_insn (operand0, res);
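/* Illustrative note, not part of GCC: written as straight C (and ignoring
   the guarded >= 2**52 range), the sequence above computes

     double xa  = fabs (x);
     double xa2 = (xa + 0x1p52) - 0x1p52;  // nearest-even integer
     double dxa = xa2 - xa;                // always in [-0.5, 0.5]
     if (dxa <= -0.5) xa2 += 1.0;          // halfway cases that rounded down
     if (dxa > 0.5)   xa2 -= 1.0;          // opposite guard, mirrors the comment
     return copysign (xa2, x);             // round halfway cases away from zero

   which gives round() semantics without ever needing a 64-bit integer
   conversion, hence its use on 32-bit targets.  */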
44505 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44506 into OPERAND0. */
44507 void
44508 ix86_expand_trunc (rtx operand0, rtx operand1)
44510 /* C code for SSE variant we expand below.
44511 double xa = fabs (x), x2;
44512 if (!isless (xa, TWO52))
44513 return x;
44514 x2 = (double)(long)x;
44515 if (HONOR_SIGNED_ZEROS (mode))
44516 return copysign (x2, x);
44517 return x2;
44519 machine_mode mode = GET_MODE (operand0);
44520 rtx xa, xi, TWO52, res, mask;
44521 rtx_code_label *label;
44523 TWO52 = ix86_gen_TWO52 (mode);
44525 /* Temporary for holding the result, initialized to the input
44526 operand to ease control flow. */
44527 res = gen_reg_rtx (mode);
44528 emit_move_insn (res, operand1);
44530 /* xa = abs (operand1) */
44531 xa = ix86_expand_sse_fabs (res, &mask);
44533 /* if (!isless (xa, TWO52)) goto label; */
44534 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44536 /* x = (double)(long)x */
44537 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44538 expand_fix (xi, res, 0);
44539 expand_float (res, xi, 0);
44541 if (HONOR_SIGNED_ZEROS (mode))
44542 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44544 emit_label (label);
44545 LABEL_NUSES (label) = 1;
44547 emit_move_insn (operand0, res);
44550 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44551 into OPERAND0. */
44552 void
44553 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44555 machine_mode mode = GET_MODE (operand0);
44556 rtx xa, mask, TWO52, one, res, smask, tmp;
44557 rtx_code_label *label;
44559 /* C code for SSE variant we expand below.
44560 double xa = fabs (x), x2;
44561 if (!isless (xa, TWO52))
44562 return x;
44563 xa2 = xa + TWO52 - TWO52;
44564 Compensate:
44565 if (xa2 > xa)
44566 xa2 -= 1.0;
44567 x2 = copysign (xa2, x);
44568 return x2;
44571 TWO52 = ix86_gen_TWO52 (mode);
44573 /* Temporary for holding the result, initialized to the input
44574 operand to ease control flow. */
44575 res = gen_reg_rtx (mode);
44576 emit_move_insn (res, operand1);
44578 /* xa = abs (operand1) */
44579 xa = ix86_expand_sse_fabs (res, &smask);
44581 /* if (!isless (xa, TWO52)) goto label; */
44582 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44584 /* res = xa + TWO52 - TWO52; */
44585 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44586 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44587 emit_move_insn (res, tmp);
44589 /* generate 1.0 */
44590 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44592 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44593 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44594 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44595 tmp = expand_simple_binop (mode, MINUS,
44596 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44597 emit_move_insn (res, tmp);
44599 /* res = copysign (res, operand1) */
44600 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44602 emit_label (label);
44603 LABEL_NUSES (label) = 1;
44605 emit_move_insn (operand0, res);
44608 /* Expand SSE sequence for computing round from OPERAND1 storing
44609 into OPERAND0. */
44610 void
44611 ix86_expand_round (rtx operand0, rtx operand1)
44613 /* C code for the stuff we're doing below:
44614 double xa = fabs (x);
44615 if (!isless (xa, TWO52))
44616 return x;
44617 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44618 return copysign (xa, x);
44620 machine_mode mode = GET_MODE (operand0);
44621 rtx res, TWO52, xa, xi, half, mask;
44622 rtx_code_label *label;
44623 const struct real_format *fmt;
44624 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44626 /* Temporary for holding the result, initialized to the input
44627 operand to ease control flow. */
44628 res = gen_reg_rtx (mode);
44629 emit_move_insn (res, operand1);
44631 TWO52 = ix86_gen_TWO52 (mode);
44632 xa = ix86_expand_sse_fabs (res, &mask);
44633 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44635 /* load nextafter (0.5, 0.0) */
44636 fmt = REAL_MODE_FORMAT (mode);
44637 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44638 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44640 /* xa = xa + 0.5 */
44641 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44642 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44644 /* xa = (double)(int64_t)xa */
44645 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44646 expand_fix (xi, xa, 0);
44647 expand_float (xa, xi, 0);
44649 /* res = copysign (xa, operand1) */
44650 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44652 emit_label (label);
44653 LABEL_NUSES (label) = 1;
44655 emit_move_insn (operand0, res);
44658 /* Expand SSE sequence for computing round
44659 from OP1 storing into OP0 using sse4 round insn. */
44660 void
44661 ix86_expand_round_sse4 (rtx op0, rtx op1)
44663 machine_mode mode = GET_MODE (op0);
44664 rtx e1, e2, res, half;
44665 const struct real_format *fmt;
44666 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44667 rtx (*gen_copysign) (rtx, rtx, rtx);
44668 rtx (*gen_round) (rtx, rtx, rtx);
44670 switch (mode)
44672 case E_SFmode:
44673 gen_copysign = gen_copysignsf3;
44674 gen_round = gen_sse4_1_roundsf2;
44675 break;
44676 case E_DFmode:
44677 gen_copysign = gen_copysigndf3;
44678 gen_round = gen_sse4_1_rounddf2;
44679 break;
44680 default:
44681 gcc_unreachable ();
44684 /* round (a) = trunc (a + copysign (0.5, a)) */
44686 /* load nextafter (0.5, 0.0) */
44687 fmt = REAL_MODE_FORMAT (mode);
44688 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44689 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44690 half = const_double_from_real_value (pred_half, mode);
44692 /* e1 = copysign (0.5, op1) */
44693 e1 = gen_reg_rtx (mode);
44694 emit_insn (gen_copysign (e1, half, op1));
44696 /* e2 = op1 + e1 */
44697 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44699 /* res = trunc (e2) */
44700 res = gen_reg_rtx (mode);
44701 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44703 emit_move_insn (op0, res);
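/* Illustrative note, not part of GCC: roughly the same computation written
   with C intrinsics (the function name is made up; smmintrin.h provides
   _mm_round_sd on SSE4.1 targets):

     #include <math.h>
     #include <smmintrin.h>

     static double round_like_sse4 (double x)
     {
       double adj = copysign (0x1.fffffffffffffp-2, x);  // copysign (pred_half, x)
       __m128d t = _mm_set_sd (x + adj);
       t = _mm_round_sd (t, t, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
       return _mm_cvtsd_f64 (t);
     }

   The ROUND_TRUNC immediate passed to gen_round corresponds to the
   truncating rounding mode used here.  */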
44707 /* Table of valid machine attributes. */
44708 static const struct attribute_spec ix86_attribute_table[] =
44710 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
44711 affects_type_identity, exclusions } */
44712 /* Stdcall attribute says callee is responsible for popping arguments
44713 if they are not variable. */
44714 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44715 true, NULL },
44716 /* Fastcall attribute says callee is responsible for popping arguments
44717 if they are not variable. */
44718 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44719 true, NULL },
44720 /* Thiscall attribute says callee is responsible for popping arguments
44721 if they are not variable. */
44722 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44723 true, NULL },
44724 /* Cdecl attribute says the callee is a normal C declaration */
44725 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44726 true, NULL },
44727 /* Regparm attribute specifies how many integer arguments are to be
44728 passed in registers. */
44729 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
44730 true, NULL },
44731 /* Sseregparm attribute says we are using x86_64 calling conventions
44732 for FP arguments. */
44733 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44734 true, NULL },
44735 /* The transactional memory builtins are implicitly regparm or fastcall
44736 depending on the ABI. Override the generic do-nothing attribute that
44737 these builtins were declared with. */
44738 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
44739 true, NULL },
44740 /* force_align_arg_pointer says this function realigns the stack at entry. */
44741 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44742 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false,
44743 NULL },
44744 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44745 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false, NULL },
44746 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false, NULL },
44747 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
44748 false, NULL },
44749 #endif
44750 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44751 false, NULL },
44752 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44753 false, NULL },
44754 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44755 SUBTARGET_ATTRIBUTE_TABLE,
44756 #endif
44757 /* ms_abi and sysv_abi calling convention function attributes. */
44758 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true, NULL },
44759 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true,
44760 NULL },
44761 { "ms_abi va_list", 0, 0, false, false, false, NULL, false, NULL },
44762 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false, NULL },
44763 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
44764 false, NULL },
44765 { "callee_pop_aggregate_return", 1, 1, false, true, true,
44766 ix86_handle_callee_pop_aggregate_return, true, NULL },
44767 { "interrupt", 0, 0, false, true, true,
44768 ix86_handle_interrupt_attribute, false, NULL },
44769 { "no_caller_saved_registers", 0, 0, false, true, true,
44770 ix86_handle_no_caller_saved_registers_attribute, false, NULL },
44771 { "naked", 0, 0, true, false, false,
44772 ix86_handle_fndecl_attribute, false, NULL },
44774 /* End element. */
44775 { NULL, 0, 0, false, false, false, NULL, false, NULL }
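/* Usage sketch, not part of GCC: a few of the attributes registered above
   as they appear in user source (the declarations and names are made up;
   see the GCC manual's x86 function attribute documentation for the exact
   semantics):

     int  __attribute__ ((regparm (3))) add3 (int a, int b, int c);
     void __attribute__ ((fastcall)) callback (int a, int b);
     long __attribute__ ((ms_abi)) win64_call (long x);
     struct intr_frame;
     void __attribute__ ((interrupt)) isr (struct intr_frame *frame);

   The handler column (ix86_handle_cconv_attribute and friends) is where
   argument counts are checked and unsupported target combinations are
   rejected.  */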
44778 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44779 static int
44780 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44781 tree vectype, int)
44783 bool fp = false;
44784 machine_mode mode = TImode;
44785 int index;
44786 if (vectype != NULL)
44788 fp = FLOAT_TYPE_P (vectype);
44789 mode = TYPE_MODE (vectype);
44792 switch (type_of_cost)
44794 case scalar_stmt:
44795 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
44797 case scalar_load:
44798 /* load/store costs are relative to register move which is 2. Recompute
44799 it to COSTS_N_INSNS so everything has the same base.  */
44800 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
44801 : ix86_cost->int_load [2]) / 2;
44803 case scalar_store:
44804 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
44805 : ix86_cost->int_store [2]) / 2;
44807 case vector_stmt:
44808 return ix86_vec_cost (mode,
44809 fp ? ix86_cost->addss : ix86_cost->sse_op,
44810 true);
44812 case vector_load:
44813 index = sse_store_index (mode);
44814 /* See PR82713 - we may end up being called on non-vector type. */
44815 if (index < 0)
44816 index = 2;
44817 return ix86_vec_cost (mode,
44818 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
44819 true);
44821 case vector_store:
44822 index = sse_store_index (mode);
44823 /* See PR82713 - we may end up being called on non-vector type. */
44824 if (index < 0)
44825 index = 2;
44826 return ix86_vec_cost (mode,
44827 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
44828 true);
44830 case vec_to_scalar:
44831 case scalar_to_vec:
44832 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
44834 /* We should have separate costs for unaligned loads and gather/scatter.
44835 Do that incrementally. */
44836 case unaligned_load:
44837 index = sse_store_index (mode);
44838 /* See PR82713 - we may end up being called on non-vector type. */
44839 if (index < 0)
44840 index = 2;
44841 return ix86_vec_cost (mode,
44842 COSTS_N_INSNS
44843 (ix86_cost->sse_unaligned_load[index]) / 2,
44844 true);
44846 case unaligned_store:
44847 index = sse_store_index (mode);
44848 /* See PR82713 - we may end up being called on non-vector type. */
44849 if (index < 0)
44850 index = 2;
44851 return ix86_vec_cost (mode,
44852 COSTS_N_INSNS
44853 (ix86_cost->sse_unaligned_store[index]) / 2,
44854 true);
44856 case vector_gather_load:
44857 return ix86_vec_cost (mode,
44858 COSTS_N_INSNS
44859 (ix86_cost->gather_static
44860 + ix86_cost->gather_per_elt
44861 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44862 true);
44864 case vector_scatter_store:
44865 return ix86_vec_cost (mode,
44866 COSTS_N_INSNS
44867 (ix86_cost->scatter_static
44868 + ix86_cost->scatter_per_elt
44869 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44870 true);
44872 case cond_branch_taken:
44873 return ix86_cost->cond_taken_branch_cost;
44875 case cond_branch_not_taken:
44876 return ix86_cost->cond_not_taken_branch_cost;
44878 case vec_perm:
44879 case vec_promote_demote:
44880 return ix86_vec_cost (mode,
44881 ix86_cost->sse_op, true);
44883 case vec_construct:
44884 return ix86_vec_cost (mode, ix86_cost->sse_op, false);
44886 default:
44887 gcc_unreachable ();
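/* Worked example with made-up numbers, not part of GCC: the load/store
   entries in the cost tables are relative to a register move of cost 2,
   so they are rescaled above.  If a tuning had sse_load[0] == 6, then

     scalar_load cost = COSTS_N_INSNS (6) / 2 = (6 * 4) / 2 = 12

   which is three times COSTS_N_INSNS (1), keeping memory costs on the
   same scale as the instruction costs returned for scalar_stmt.  */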
44891 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44892 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44893 insn every time. */
44895 static GTY(()) rtx_insn *vselect_insn;
44897 /* Initialize vselect_insn. */
44899 static void
44900 init_vselect_insn (void)
44902 unsigned i;
44903 rtx x;
44905 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44906 for (i = 0; i < MAX_VECT_LEN; ++i)
44907 XVECEXP (x, 0, i) = const0_rtx;
44908 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44909 const0_rtx), x);
44910 x = gen_rtx_SET (const0_rtx, x);
44911 start_sequence ();
44912 vselect_insn = emit_insn (x);
44913 end_sequence ();
44916 /* Construct (set target (vec_select op0 (parallel perm))) and
44917 return true if that's a valid instruction in the active ISA. */
44919 static bool
44920 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44921 unsigned nelt, bool testing_p)
44923 unsigned int i;
44924 rtx x, save_vconcat;
44925 int icode;
44927 if (vselect_insn == NULL_RTX)
44928 init_vselect_insn ();
44930 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
44931 PUT_NUM_ELEM (XVEC (x, 0), nelt);
44932 for (i = 0; i < nelt; ++i)
44933 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
44934 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44935 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
44936 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
44937 SET_DEST (PATTERN (vselect_insn)) = target;
44938 icode = recog_memoized (vselect_insn);
44940 if (icode >= 0 && !testing_p)
44941 emit_insn (copy_rtx (PATTERN (vselect_insn)));
44943 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
44944 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
44945 INSN_CODE (vselect_insn) = -1;
44947 return icode >= 0;
44950 /* Similar, but generate a vec_concat from op0 and op1 as well. */
44952 static bool
44953 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
44954 const unsigned char *perm, unsigned nelt,
44955 bool testing_p)
44957 machine_mode v2mode;
44958 rtx x;
44959 bool ok;
44961 if (vselect_insn == NULL_RTX)
44962 init_vselect_insn ();
44964 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
44965 return false;
44966 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44967 PUT_MODE (x, v2mode);
44968 XEXP (x, 0) = op0;
44969 XEXP (x, 1) = op1;
44970 ok = expand_vselect (target, x, perm, nelt, testing_p);
44971 XEXP (x, 0) = const0_rtx;
44972 XEXP (x, 1) = const0_rtx;
44973 return ok;
44976 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
44977 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
44979 static bool
44980 expand_vec_perm_blend (struct expand_vec_perm_d *d)
44982 machine_mode mmode, vmode = d->vmode;
44983 unsigned i, mask, nelt = d->nelt;
44984 rtx target, op0, op1, maskop, x;
44985 rtx rperm[32], vperm;
44987 if (d->one_operand_p)
44988 return false;
44989 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
44990 && (TARGET_AVX512BW
44991 || GET_MODE_UNIT_SIZE (vmode) >= 4))
44993 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
44995 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
44997 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
44999 else
45000 return false;
45002 /* This is a blend, not a permute. Elements must stay in their
45003 respective lanes. */
45004 for (i = 0; i < nelt; ++i)
45006 unsigned e = d->perm[i];
45007 if (!(e == i || e == i + nelt))
45008 return false;
45011 if (d->testing_p)
45012 return true;
45014 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45015 decision should be extracted elsewhere, so that we only try that
45016 sequence once all budget==3 options have been tried. */
45017 target = d->target;
45018 op0 = d->op0;
45019 op1 = d->op1;
45020 mask = 0;
45022 switch (vmode)
45024 case E_V8DFmode:
45025 case E_V16SFmode:
45026 case E_V4DFmode:
45027 case E_V8SFmode:
45028 case E_V2DFmode:
45029 case E_V4SFmode:
45030 case E_V8HImode:
45031 case E_V8SImode:
45032 case E_V32HImode:
45033 case E_V64QImode:
45034 case E_V16SImode:
45035 case E_V8DImode:
45036 for (i = 0; i < nelt; ++i)
45037 mask |= (d->perm[i] >= nelt) << i;
45038 break;
45040 case E_V2DImode:
45041 for (i = 0; i < 2; ++i)
45042 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45043 vmode = V8HImode;
45044 goto do_subreg;
45046 case E_V4SImode:
45047 for (i = 0; i < 4; ++i)
45048 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45049 vmode = V8HImode;
45050 goto do_subreg;
45052 case E_V16QImode:
45053 /* See if bytes move in pairs so we can use pblendw with
45054 an immediate argument, rather than pblendvb with a vector
45055 argument. */
45056 for (i = 0; i < 16; i += 2)
45057 if (d->perm[i] + 1 != d->perm[i + 1])
45059 use_pblendvb:
45060 for (i = 0; i < nelt; ++i)
45061 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45063 finish_pblendvb:
45064 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45065 vperm = force_reg (vmode, vperm);
45067 if (GET_MODE_SIZE (vmode) == 16)
45068 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45069 else
45070 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45071 if (target != d->target)
45072 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45073 return true;
45076 for (i = 0; i < 8; ++i)
45077 mask |= (d->perm[i * 2] >= 16) << i;
45078 vmode = V8HImode;
45079 /* FALLTHRU */
45081 do_subreg:
45082 target = gen_reg_rtx (vmode);
45083 op0 = gen_lowpart (vmode, op0);
45084 op1 = gen_lowpart (vmode, op1);
45085 break;
45087 case E_V32QImode:
45088 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45089 for (i = 0; i < 32; i += 2)
45090 if (d->perm[i] + 1 != d->perm[i + 1])
45091 goto use_pblendvb;
45092 /* See if bytes move in quadruplets. If yes, vpblendd
45093 with immediate can be used. */
45094 for (i = 0; i < 32; i += 4)
45095 if (d->perm[i] + 2 != d->perm[i + 2])
45096 break;
45097 if (i < 32)
45099 /* See if bytes move the same in both lanes. If yes,
45100 vpblendw with immediate can be used. */
45101 for (i = 0; i < 16; i += 2)
45102 if (d->perm[i] + 16 != d->perm[i + 16])
45103 goto use_pblendvb;
45105 /* Use vpblendw. */
45106 for (i = 0; i < 16; ++i)
45107 mask |= (d->perm[i * 2] >= 32) << i;
45108 vmode = V16HImode;
45109 goto do_subreg;
45112 /* Use vpblendd. */
45113 for (i = 0; i < 8; ++i)
45114 mask |= (d->perm[i * 4] >= 32) << i;
45115 vmode = V8SImode;
45116 goto do_subreg;
45118 case E_V16HImode:
45119 /* See if words move in pairs. If yes, vpblendd can be used. */
45120 for (i = 0; i < 16; i += 2)
45121 if (d->perm[i] + 1 != d->perm[i + 1])
45122 break;
45123 if (i < 16)
45125 /* See if words move the same in both lanes. If not,
45126 vpblendvb must be used. */
45127 for (i = 0; i < 8; i++)
45128 if (d->perm[i] + 8 != d->perm[i + 8])
45130 /* Use vpblendvb. */
45131 for (i = 0; i < 32; ++i)
45132 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45134 vmode = V32QImode;
45135 nelt = 32;
45136 target = gen_reg_rtx (vmode);
45137 op0 = gen_lowpart (vmode, op0);
45138 op1 = gen_lowpart (vmode, op1);
45139 goto finish_pblendvb;
45142 /* Use vpblendw. */
45143 for (i = 0; i < 16; ++i)
45144 mask |= (d->perm[i] >= 16) << i;
45145 break;
45148 /* Use vpblendd. */
45149 for (i = 0; i < 8; ++i)
45150 mask |= (d->perm[i * 2] >= 16) << i;
45151 vmode = V8SImode;
45152 goto do_subreg;
45154 case E_V4DImode:
45155 /* Use vpblendd. */
45156 for (i = 0; i < 4; ++i)
45157 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45158 vmode = V8SImode;
45159 goto do_subreg;
45161 default:
45162 gcc_unreachable ();
45165 switch (vmode)
45167 case E_V8DFmode:
45168 case E_V8DImode:
45169 mmode = QImode;
45170 break;
45171 case E_V16SFmode:
45172 case E_V16SImode:
45173 mmode = HImode;
45174 break;
45175 case E_V32HImode:
45176 mmode = SImode;
45177 break;
45178 case E_V64QImode:
45179 mmode = DImode;
45180 break;
45181 default:
45182 mmode = VOIDmode;
45185 if (mmode != VOIDmode)
45186 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45187 else
45188 maskop = GEN_INT (mask);
45190 /* This matches five different patterns with the different modes. */
45191 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45192 x = gen_rtx_SET (target, x);
45193 emit_insn (x);
45194 if (target != d->target)
45195 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45197 return true;
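/* Worked example, not part of GCC: how the immediate MASK is built.  For a
   V8HImode blend selecting { 0, 9, 2, 11, 4, 13, 6, 15 }, element i comes
   from op1 exactly when d->perm[i] >= 8, so

     mask |= (d->perm[i] >= nelt) << i   ==>   mask == 0xaa (0b10101010)

   which is the 8-bit immediate the word blend takes.  V4SImode and
   V2DImode permutations are funneled through the same V8HImode path by
   widening each selector to 2 or 4 word lanes (the "? 3 : 0" and
   "? 15 : 0" cases above).  */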
45200 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45201 in terms of the variable form of vpermilps.
45203 Note that we will have already failed the immediate input vpermilps,
45204 which requires that the high and low part shuffle be identical; the
45205 variable form doesn't require that. */
45207 static bool
45208 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45210 rtx rperm[8], vperm;
45211 unsigned i;
45213 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45214 return false;
45216 /* We can only permute within the 128-bit lane. */
45217 for (i = 0; i < 8; ++i)
45219 unsigned e = d->perm[i];
45220 if (i < 4 ? e >= 4 : e < 4)
45221 return false;
45224 if (d->testing_p)
45225 return true;
45227 for (i = 0; i < 8; ++i)
45229 unsigned e = d->perm[i];
45231 /* Within each 128-bit lane, the elements of op0 are numbered
45232 from 0 and the elements of op1 are numbered from 4. */
45233 if (e >= 8 + 4)
45234 e -= 8;
45235 else if (e >= 4)
45236 e -= 4;
45238 rperm[i] = GEN_INT (e);
45241 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45242 vperm = force_reg (V8SImode, vperm);
45243 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45245 return true;
45248 /* Return true if permutation D can be performed as VMODE permutation
45249 instead. */
45251 static bool
45252 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45254 unsigned int i, j, chunk;
45256 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45257 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45258 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45259 return false;
45261 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45262 return true;
45264 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45265 for (i = 0; i < d->nelt; i += chunk)
45266 if (d->perm[i] & (chunk - 1))
45267 return false;
45268 else
45269 for (j = 1; j < chunk; ++j)
45270 if (d->perm[i] + j != d->perm[i + j])
45271 return false;
45273 return true;
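/* Worked example, not part of GCC: with d->vmode == V16QImode and
   vmode == V8HImode the chunk size is 2, so a permutation qualifies only
   if the bytes move in aligned pairs:

     { 2, 3, 0, 1, 6, 7, 4, 5, ... }  passes: each even slot holds an even
                                      index and its partner follows it
     { 1, 0, 3, 2, ... }              fails the (perm[i] & (chunk - 1)) test

   The first permutation can therefore be retried as a word permutation on
   the same register contents.  */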
45276 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45277 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45279 static bool
45280 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45282 unsigned i, nelt, eltsz, mask;
45283 unsigned char perm[64];
45284 machine_mode vmode = V16QImode;
45285 rtx rperm[64], vperm, target, op0, op1;
45287 nelt = d->nelt;
45289 if (!d->one_operand_p)
45291 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45293 if (TARGET_AVX2
45294 && valid_perm_using_mode_p (V2TImode, d))
45296 if (d->testing_p)
45297 return true;
45299 /* Use vperm2i128 insn. The pattern uses
45300 V4DImode instead of V2TImode. */
45301 target = d->target;
45302 if (d->vmode != V4DImode)
45303 target = gen_reg_rtx (V4DImode);
45304 op0 = gen_lowpart (V4DImode, d->op0);
45305 op1 = gen_lowpart (V4DImode, d->op1);
45306 rperm[0]
45307 = GEN_INT ((d->perm[0] / (nelt / 2))
45308 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45309 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45310 if (target != d->target)
45311 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45312 return true;
45314 return false;
45317 else
45319 if (GET_MODE_SIZE (d->vmode) == 16)
45321 if (!TARGET_SSSE3)
45322 return false;
45324 else if (GET_MODE_SIZE (d->vmode) == 32)
45326 if (!TARGET_AVX2)
45327 return false;
45329 /* V4DImode should already be handled through
45330 expand_vselect by the vpermq instruction.  */
45331 gcc_assert (d->vmode != V4DImode);
45333 vmode = V32QImode;
45334 if (d->vmode == V8SImode
45335 || d->vmode == V16HImode
45336 || d->vmode == V32QImode)
45338 /* First see if vpermq can be used for
45339 V8SImode/V16HImode/V32QImode. */
45340 if (valid_perm_using_mode_p (V4DImode, d))
45342 for (i = 0; i < 4; i++)
45343 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45344 if (d->testing_p)
45345 return true;
45346 target = gen_reg_rtx (V4DImode);
45347 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45348 perm, 4, false))
45350 emit_move_insn (d->target,
45351 gen_lowpart (d->vmode, target));
45352 return true;
45354 return false;
45357 /* Next see if vpermd can be used. */
45358 if (valid_perm_using_mode_p (V8SImode, d))
45359 vmode = V8SImode;
45361 /* Or if vpermps can be used. */
45362 else if (d->vmode == V8SFmode)
45363 vmode = V8SImode;
45365 if (vmode == V32QImode)
45367 /* vpshufb only works intra lane; it is not
45368 possible to shuffle bytes between the lanes.  */
45369 for (i = 0; i < nelt; ++i)
45370 if ((d->perm[i] ^ i) & (nelt / 2))
45371 return false;
45374 else if (GET_MODE_SIZE (d->vmode) == 64)
45376 if (!TARGET_AVX512BW)
45377 return false;
45379 /* If vpermq didn't work, vpshufb won't work either. */
45380 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45381 return false;
45383 vmode = V64QImode;
45384 if (d->vmode == V16SImode
45385 || d->vmode == V32HImode
45386 || d->vmode == V64QImode)
45388 /* First see if vpermq can be used for
45389 V16SImode/V32HImode/V64QImode. */
45390 if (valid_perm_using_mode_p (V8DImode, d))
45392 for (i = 0; i < 8; i++)
45393 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45394 if (d->testing_p)
45395 return true;
45396 target = gen_reg_rtx (V8DImode);
45397 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45398 perm, 8, false))
45400 emit_move_insn (d->target,
45401 gen_lowpart (d->vmode, target));
45402 return true;
45404 return false;
45407 /* Next see if vpermd can be used. */
45408 if (valid_perm_using_mode_p (V16SImode, d))
45409 vmode = V16SImode;
45411 /* Or if vpermps can be used. */
45412 else if (d->vmode == V16SFmode)
45413 vmode = V16SImode;
45414 if (vmode == V64QImode)
45416 /* vpshufb only works intra lane; it is not
45417 possible to shuffle bytes between the lanes.  */
45418 for (i = 0; i < nelt; ++i)
45419 if ((d->perm[i] ^ i) & (nelt / 4))
45420 return false;
45423 else
45424 return false;
45427 if (d->testing_p)
45428 return true;
45430 if (vmode == V8SImode)
45431 for (i = 0; i < 8; ++i)
45432 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45433 else if (vmode == V16SImode)
45434 for (i = 0; i < 16; ++i)
45435 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45436 else
45438 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45439 if (!d->one_operand_p)
45440 mask = 2 * nelt - 1;
45441 else if (vmode == V16QImode)
45442 mask = nelt - 1;
45443 else if (vmode == V64QImode)
45444 mask = nelt / 4 - 1;
45445 else
45446 mask = nelt / 2 - 1;
45448 for (i = 0; i < nelt; ++i)
45450 unsigned j, e = d->perm[i] & mask;
45451 for (j = 0; j < eltsz; ++j)
45452 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45456 vperm = gen_rtx_CONST_VECTOR (vmode,
45457 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45458 vperm = force_reg (vmode, vperm);
45460 target = d->target;
45461 if (d->vmode != vmode)
45462 target = gen_reg_rtx (vmode);
45463 op0 = gen_lowpart (vmode, d->op0);
45464 if (d->one_operand_p)
45466 if (vmode == V16QImode)
45467 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45468 else if (vmode == V32QImode)
45469 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45470 else if (vmode == V64QImode)
45471 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45472 else if (vmode == V8SFmode)
45473 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45474 else if (vmode == V8SImode)
45475 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45476 else if (vmode == V16SFmode)
45477 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45478 else if (vmode == V16SImode)
45479 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45480 else
45481 gcc_unreachable ();
45483 else
45485 op1 = gen_lowpart (vmode, d->op1);
45486 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45488 if (target != d->target)
45489 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45491 return true;
45494 /* For V*[QHS]Imode permutations, check whether the same permutation
45495 can be performed in a 2x, 4x or 8x wider inner mode.  */
45497 static bool
45498 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45499 struct expand_vec_perm_d *nd)
45501 int i;
45502 machine_mode mode = VOIDmode;
45504 switch (d->vmode)
45506 case E_V16QImode: mode = V8HImode; break;
45507 case E_V32QImode: mode = V16HImode; break;
45508 case E_V64QImode: mode = V32HImode; break;
45509 case E_V8HImode: mode = V4SImode; break;
45510 case E_V16HImode: mode = V8SImode; break;
45511 case E_V32HImode: mode = V16SImode; break;
45512 case E_V4SImode: mode = V2DImode; break;
45513 case E_V8SImode: mode = V4DImode; break;
45514 case E_V16SImode: mode = V8DImode; break;
45515 default: return false;
45517 for (i = 0; i < d->nelt; i += 2)
45518 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45519 return false;
45520 nd->vmode = mode;
45521 nd->nelt = d->nelt / 2;
45522 for (i = 0; i < nd->nelt; i++)
45523 nd->perm[i] = d->perm[2 * i] / 2;
45524 if (GET_MODE_INNER (mode) != DImode)
45525 canonicalize_vector_int_perm (nd, nd);
45526 if (nd != d)
45528 nd->one_operand_p = d->one_operand_p;
45529 nd->testing_p = d->testing_p;
45530 if (d->op0 == d->op1)
45531 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45532 else
45534 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45535 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45537 if (d->testing_p)
45538 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45539 else
45540 nd->target = gen_reg_rtx (nd->vmode);
45542 return true;
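/* Worked example, not part of GCC: the canonicalization recurses until the
   pairing test fails.  A V16QImode permutation such as

     { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 }

   first becomes the V8HImode permutation { 2, 3, 0, 1, 6, 7, 4, 5 } and
   then the V4SImode permutation { 1, 0, 3, 2 }, at which point the odd
   first index stops the recursion; expand_vec_perm_1 can then emit a
   single pshufd for it.  */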
45545 /* Try to expand one-operand permutation with constant mask. */
45547 static bool
45548 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45550 machine_mode mode = GET_MODE (d->op0);
45551 machine_mode maskmode = mode;
45552 rtx (*gen) (rtx, rtx, rtx) = NULL;
45553 rtx target, op0, mask;
45554 rtx vec[64];
45556 if (!rtx_equal_p (d->op0, d->op1))
45557 return false;
45559 if (!TARGET_AVX512F)
45560 return false;
45562 switch (mode)
45564 case E_V16SImode:
45565 gen = gen_avx512f_permvarv16si;
45566 break;
45567 case E_V16SFmode:
45568 gen = gen_avx512f_permvarv16sf;
45569 maskmode = V16SImode;
45570 break;
45571 case E_V8DImode:
45572 gen = gen_avx512f_permvarv8di;
45573 break;
45574 case E_V8DFmode:
45575 gen = gen_avx512f_permvarv8df;
45576 maskmode = V8DImode;
45577 break;
45578 default:
45579 return false;
45582 target = d->target;
45583 op0 = d->op0;
45584 for (int i = 0; i < d->nelt; ++i)
45585 vec[i] = GEN_INT (d->perm[i]);
45586 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45587 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45588 return true;
45591 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45592 in a single instruction. */
45594 static bool
45595 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45597 unsigned i, nelt = d->nelt;
45598 struct expand_vec_perm_d nd;
45600 /* Check plain VEC_SELECT first, because AVX has instructions that could
45601 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45602 input where SEL+CONCAT may not. */
45603 if (d->one_operand_p)
45605 int mask = nelt - 1;
45606 bool identity_perm = true;
45607 bool broadcast_perm = true;
45609 for (i = 0; i < nelt; i++)
45611 nd.perm[i] = d->perm[i] & mask;
45612 if (nd.perm[i] != i)
45613 identity_perm = false;
45614 if (nd.perm[i])
45615 broadcast_perm = false;
45618 if (identity_perm)
45620 if (!d->testing_p)
45621 emit_move_insn (d->target, d->op0);
45622 return true;
45624 else if (broadcast_perm && TARGET_AVX2)
45626 /* Use vpbroadcast{b,w,d}. */
45627 rtx (*gen) (rtx, rtx) = NULL;
45628 switch (d->vmode)
45630 case E_V64QImode:
45631 if (TARGET_AVX512BW)
45632 gen = gen_avx512bw_vec_dupv64qi_1;
45633 break;
45634 case E_V32QImode:
45635 gen = gen_avx2_pbroadcastv32qi_1;
45636 break;
45637 case E_V32HImode:
45638 if (TARGET_AVX512BW)
45639 gen = gen_avx512bw_vec_dupv32hi_1;
45640 break;
45641 case E_V16HImode:
45642 gen = gen_avx2_pbroadcastv16hi_1;
45643 break;
45644 case E_V16SImode:
45645 if (TARGET_AVX512F)
45646 gen = gen_avx512f_vec_dupv16si_1;
45647 break;
45648 case E_V8SImode:
45649 gen = gen_avx2_pbroadcastv8si_1;
45650 break;
45651 case E_V16QImode:
45652 gen = gen_avx2_pbroadcastv16qi;
45653 break;
45654 case E_V8HImode:
45655 gen = gen_avx2_pbroadcastv8hi;
45656 break;
45657 case E_V16SFmode:
45658 if (TARGET_AVX512F)
45659 gen = gen_avx512f_vec_dupv16sf_1;
45660 break;
45661 case E_V8SFmode:
45662 gen = gen_avx2_vec_dupv8sf_1;
45663 break;
45664 case E_V8DFmode:
45665 if (TARGET_AVX512F)
45666 gen = gen_avx512f_vec_dupv8df_1;
45667 break;
45668 case E_V8DImode:
45669 if (TARGET_AVX512F)
45670 gen = gen_avx512f_vec_dupv8di_1;
45671 break;
45672 /* For other modes prefer other shuffles this function creates. */
45673 default: break;
45675 if (gen != NULL)
45677 if (!d->testing_p)
45678 emit_insn (gen (d->target, d->op0));
45679 return true;
45683 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45684 return true;
45686 /* There are plenty of patterns in sse.md that are written for
45687 SEL+CONCAT and are not replicated for a single op. Perhaps
45688 that should be changed, to avoid the nastiness here. */
45690 /* Recognize interleave style patterns, which means incrementing
45691 every other permutation operand. */
45692 for (i = 0; i < nelt; i += 2)
45694 nd.perm[i] = d->perm[i] & mask;
45695 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45697 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45698 d->testing_p))
45699 return true;
45701 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45702 if (nelt >= 4)
45704 for (i = 0; i < nelt; i += 4)
45706 nd.perm[i + 0] = d->perm[i + 0] & mask;
45707 nd.perm[i + 1] = d->perm[i + 1] & mask;
45708 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45709 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45712 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45713 d->testing_p))
45714 return true;
45718 /* Finally, try the fully general two operand permute. */
45719 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45720 d->testing_p))
45721 return true;
45723 /* Recognize interleave style patterns with reversed operands. */
45724 if (!d->one_operand_p)
45726 for (i = 0; i < nelt; ++i)
45728 unsigned e = d->perm[i];
45729 if (e >= nelt)
45730 e -= nelt;
45731 else
45732 e += nelt;
45733 nd.perm[i] = e;
45736 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45737 d->testing_p))
45738 return true;
45741 /* Try the SSE4.1 blend variable merge instructions. */
45742 if (expand_vec_perm_blend (d))
45743 return true;
45745 /* Try one of the AVX vpermil variable permutations. */
45746 if (expand_vec_perm_vpermil (d))
45747 return true;
45749 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45750 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45751 if (expand_vec_perm_pshufb (d))
45752 return true;
45754 /* Try the AVX2 vpalignr instruction. */
45755 if (expand_vec_perm_palignr (d, true))
45756 return true;
45758 /* Try the AVX512F vperm{s,d} instructions. */
45759 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45760 return true;
45762 /* Try the AVX512F vpermt2/vpermi2 instructions. */
45763 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45764 return true;
45766 /* See if we can get the same permutation in different vector integer
45767 mode. */
45768 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45770 if (!d->testing_p)
45771 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45772 return true;
45774 return false;
45777 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45778 in terms of a pair of pshuflw + pshufhw instructions. */
45780 static bool
45781 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45783 unsigned char perm2[MAX_VECT_LEN];
45784 unsigned i;
45785 bool ok;
45787 if (d->vmode != V8HImode || !d->one_operand_p)
45788 return false;
45790 /* The two permutations only operate in 64-bit lanes. */
45791 for (i = 0; i < 4; ++i)
45792 if (d->perm[i] >= 4)
45793 return false;
45794 for (i = 4; i < 8; ++i)
45795 if (d->perm[i] < 4)
45796 return false;
45798 if (d->testing_p)
45799 return true;
45801 /* Emit the pshuflw. */
45802 memcpy (perm2, d->perm, 4);
45803 for (i = 4; i < 8; ++i)
45804 perm2[i] = i;
45805 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45806 gcc_assert (ok);
45808 /* Emit the pshufhw. */
45809 memcpy (perm2 + 4, d->perm + 4, 4);
45810 for (i = 0; i < 4; ++i)
45811 perm2[i] = i;
45812 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45813 gcc_assert (ok);
45815 return true;
45818 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45819 the permutation using the SSSE3 palignr instruction. This succeeds
45820 when all of the elements in PERM fit within one vector and we merely
45821 need to shift them down so that a single vector permutation has a
45822 chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if
45823 the vpalignr instruction itself can perform the requested permutation. */
45825 static bool
45826 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45828 unsigned i, nelt = d->nelt;
45829 unsigned min, max, minswap, maxswap;
45830 bool in_order, ok, swap = false;
45831 rtx shift, target;
45832 struct expand_vec_perm_d dcopy;
45834 /* Even with AVX, palignr only operates on 128-bit vectors;
45835 with AVX2, palignr operates within each of the two 128-bit lanes.  */
45836 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45837 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45838 return false;
45840 min = 2 * nelt;
45841 max = 0;
45842 minswap = 2 * nelt;
45843 maxswap = 0;
45844 for (i = 0; i < nelt; ++i)
45846 unsigned e = d->perm[i];
45847 unsigned eswap = d->perm[i] ^ nelt;
45848 if (GET_MODE_SIZE (d->vmode) == 32)
45850 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45851 eswap = e ^ (nelt / 2);
45853 if (e < min)
45854 min = e;
45855 if (e > max)
45856 max = e;
45857 if (eswap < minswap)
45858 minswap = eswap;
45859 if (eswap > maxswap)
45860 maxswap = eswap;
45862 if (min == 0
45863 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45865 if (d->one_operand_p
45866 || minswap == 0
45867 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45868 ? nelt / 2 : nelt))
45869 return false;
45870 swap = true;
45871 min = minswap;
45872 max = maxswap;
45875 /* Given that we have SSSE3, we know we'll be able to implement the
45876 single operand permutation after the palignr with pshufb for
45877 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45878 first. */
45879 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45880 return true;
45882 dcopy = *d;
45883 if (swap)
45885 dcopy.op0 = d->op1;
45886 dcopy.op1 = d->op0;
45887 for (i = 0; i < nelt; ++i)
45888 dcopy.perm[i] ^= nelt;
45891 in_order = true;
45892 for (i = 0; i < nelt; ++i)
45894 unsigned e = dcopy.perm[i];
45895 if (GET_MODE_SIZE (d->vmode) == 32
45896 && e >= nelt
45897 && (e & (nelt / 2 - 1)) < min)
45898 e = e - min - (nelt / 2);
45899 else
45900 e = e - min;
45901 if (e != i)
45902 in_order = false;
45903 dcopy.perm[i] = e;
45905 dcopy.one_operand_p = true;
45907 if (single_insn_only_p && !in_order)
45908 return false;
45910 /* For AVX2, test whether we can permute the result in one instruction. */
45911 if (d->testing_p)
45913 if (in_order)
45914 return true;
45915 dcopy.op1 = dcopy.op0;
45916 return expand_vec_perm_1 (&dcopy);
45919 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45920 if (GET_MODE_SIZE (d->vmode) == 16)
45922 target = gen_reg_rtx (TImode);
45923 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45924 gen_lowpart (TImode, dcopy.op0), shift));
45926 else
45928 target = gen_reg_rtx (V2TImode);
45929 emit_insn (gen_avx2_palignrv2ti (target,
45930 gen_lowpart (V2TImode, dcopy.op1),
45931 gen_lowpart (V2TImode, dcopy.op0),
45932 shift));
45935 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
45937 /* Test for the degenerate case where the alignment by itself
45938 produces the desired permutation. */
45939 if (in_order)
45941 emit_move_insn (d->target, dcopy.op0);
45942 return true;
45945 ok = expand_vec_perm_1 (&dcopy);
45946 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
45948 return ok;
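/* Worked example, not part of GCC: for a two-operand V16QImode permutation
   such as { 5, 6, ..., 20 } every selected index lies inside one 16-byte
   window of the 32-byte concatenation of the inputs, so min == 5 and
   max - min == 15 < nelt.  The palignr shifts that window down, the
   remaining permutation becomes { 0, 1, ..., 15 }, in_order is true, and
   the trailing expand_vec_perm_1 call is not needed.  When the window only
   exists with the operands exchanged, the minswap/maxswap path swaps op0
   and op1 first.  */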
45951 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45952 the permutation using the SSE4_1 pblendv instruction. Potentially
45953 reduces the permutation from 2 pshufb plus an or to 1 pshufb plus a pblendv.  */
45955 static bool
45956 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
45958 unsigned i, which, nelt = d->nelt;
45959 struct expand_vec_perm_d dcopy, dcopy1;
45960 machine_mode vmode = d->vmode;
45961 bool ok;
45963 /* Use the same checks as in expand_vec_perm_blend. */
45964 if (d->one_operand_p)
45965 return false;
45966 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45968 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45970 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45972 else
45973 return false;
45975 /* Figure out where permutation elements stay not in their
45976 respective lanes. */
45977 for (i = 0, which = 0; i < nelt; ++i)
45979 unsigned e = d->perm[i];
45980 if (e != i)
45981 which |= (e < nelt ? 1 : 2);
45983 /* We can pblend the part where elements stay not in their
45984 respective lanes only when these elements are all in one
45985 half of a permutation.
45986 {0 1 8 3 4 5 9 7} is ok, as 8 and 9 are not at their respective
45987 lanes but both are >= 8;
45988 {0 1 8 3 4 5 2 7} is not ok, as 2 and 8 are not at their
45989 respective lanes and 8 >= 8, but 2 is not.  */
45990 if (which != 1 && which != 2)
45991 return false;
45992 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
45993 return true;
45995 /* First we apply one operand permutation to the part where
45996 elements stay not in their respective lanes. */
45997 dcopy = *d;
45998 if (which == 2)
45999 dcopy.op0 = dcopy.op1 = d->op1;
46000 else
46001 dcopy.op0 = dcopy.op1 = d->op0;
46002 if (!d->testing_p)
46003 dcopy.target = gen_reg_rtx (vmode);
46004 dcopy.one_operand_p = true;
46006 for (i = 0; i < nelt; ++i)
46007 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46009 ok = expand_vec_perm_1 (&dcopy);
46010 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46011 return false;
46012 else
46013 gcc_assert (ok);
46014 if (d->testing_p)
46015 return true;
46017 /* Next we put permuted elements into their positions. */
46018 dcopy1 = *d;
46019 if (which == 2)
46020 dcopy1.op1 = dcopy.target;
46021 else
46022 dcopy1.op0 = dcopy.target;
46024 for (i = 0; i < nelt; ++i)
46025 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46027 ok = expand_vec_perm_blend (&dcopy1);
46028 gcc_assert (ok);
46030 return true;
46033 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46035 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46036 a two vector permutation into a single vector permutation by using
46037 an interleave operation to merge the vectors. */
46039 static bool
46040 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46042 struct expand_vec_perm_d dremap, dfinal;
46043 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46044 unsigned HOST_WIDE_INT contents;
46045 unsigned char remap[2 * MAX_VECT_LEN];
46046 rtx_insn *seq;
46047 bool ok, same_halves = false;
46049 if (GET_MODE_SIZE (d->vmode) == 16)
46051 if (d->one_operand_p)
46052 return false;
46054 else if (GET_MODE_SIZE (d->vmode) == 32)
46056 if (!TARGET_AVX)
46057 return false;
46058 /* For 32-byte modes allow even d->one_operand_p.
46059 The lack of cross-lane shuffling in some instructions
46060 might prevent a single insn shuffle. */
46061 dfinal = *d;
46062 dfinal.testing_p = true;
46063 /* If expand_vec_perm_interleave3 can expand this into
46064 a 3 insn sequence, give up and let it be expanded as
46065 3 insn sequence. While that is one insn longer,
46066 it doesn't need a memory operand and in the common
46067 case that both interleave low and high permutations
46068 with the same operands are adjacent needs 4 insns
46069 for both after CSE. */
46070 if (expand_vec_perm_interleave3 (&dfinal))
46071 return false;
46073 else
46074 return false;
46076 /* Examine from whence the elements come. */
46077 contents = 0;
46078 for (i = 0; i < nelt; ++i)
46079 contents |= HOST_WIDE_INT_1U << d->perm[i];
46081 memset (remap, 0xff, sizeof (remap));
46082 dremap = *d;
46084 if (GET_MODE_SIZE (d->vmode) == 16)
46086 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46088 /* Split the two input vectors into 4 halves. */
46089 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46090 h2 = h1 << nelt2;
46091 h3 = h2 << nelt2;
46092 h4 = h3 << nelt2;
46094 /* If all the elements come from the low halves, use interleave low; similarly
46095 for interleave high.  If the elements come from mis-matched halves, we
46096 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
46097 if ((contents & (h1 | h3)) == contents)
46099 /* punpckl* */
46100 for (i = 0; i < nelt2; ++i)
46102 remap[i] = i * 2;
46103 remap[i + nelt] = i * 2 + 1;
46104 dremap.perm[i * 2] = i;
46105 dremap.perm[i * 2 + 1] = i + nelt;
46107 if (!TARGET_SSE2 && d->vmode == V4SImode)
46108 dremap.vmode = V4SFmode;
46110 else if ((contents & (h2 | h4)) == contents)
46112 /* punpckh* */
46113 for (i = 0; i < nelt2; ++i)
46115 remap[i + nelt2] = i * 2;
46116 remap[i + nelt + nelt2] = i * 2 + 1;
46117 dremap.perm[i * 2] = i + nelt2;
46118 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46120 if (!TARGET_SSE2 && d->vmode == V4SImode)
46121 dremap.vmode = V4SFmode;
46123 else if ((contents & (h1 | h4)) == contents)
46125 /* shufps */
46126 for (i = 0; i < nelt2; ++i)
46128 remap[i] = i;
46129 remap[i + nelt + nelt2] = i + nelt2;
46130 dremap.perm[i] = i;
46131 dremap.perm[i + nelt2] = i + nelt + nelt2;
46133 if (nelt != 4)
46135 /* shufpd */
46136 dremap.vmode = V2DImode;
46137 dremap.nelt = 2;
46138 dremap.perm[0] = 0;
46139 dremap.perm[1] = 3;
46142 else if ((contents & (h2 | h3)) == contents)
46144 /* shufps */
46145 for (i = 0; i < nelt2; ++i)
46147 remap[i + nelt2] = i;
46148 remap[i + nelt] = i + nelt2;
46149 dremap.perm[i] = i + nelt2;
46150 dremap.perm[i + nelt2] = i + nelt;
46152 if (nelt != 4)
46154 /* shufpd */
46155 dremap.vmode = V2DImode;
46156 dremap.nelt = 2;
46157 dremap.perm[0] = 1;
46158 dremap.perm[1] = 2;
46161 else
46162 return false;
46164 else
46166 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46167 unsigned HOST_WIDE_INT q[8];
46168 unsigned int nonzero_halves[4];
46170 /* Split the two input vectors into 8 quarters. */
46171 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46172 for (i = 1; i < 8; ++i)
46173 q[i] = q[0] << (nelt4 * i);
46174 for (i = 0; i < 4; ++i)
46175 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46177 nonzero_halves[nzcnt] = i;
46178 ++nzcnt;
46181 if (nzcnt == 1)
46183 gcc_assert (d->one_operand_p);
46184 nonzero_halves[1] = nonzero_halves[0];
46185 same_halves = true;
46187 else if (d->one_operand_p)
46189 gcc_assert (nonzero_halves[0] == 0);
46190 gcc_assert (nonzero_halves[1] == 1);
46193 if (nzcnt <= 2)
46195 if (d->perm[0] / nelt2 == nonzero_halves[1])
46197 /* Attempt to increase the likelihood that dfinal
46198 shuffle will be intra-lane. */
46199 std::swap (nonzero_halves[0], nonzero_halves[1]);
46202 /* vperm2f128 or vperm2i128. */
46203 for (i = 0; i < nelt2; ++i)
46205 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46206 remap[i + nonzero_halves[0] * nelt2] = i;
46207 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46208 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46211 if (d->vmode != V8SFmode
46212 && d->vmode != V4DFmode
46213 && d->vmode != V8SImode)
46215 dremap.vmode = V8SImode;
46216 dremap.nelt = 8;
46217 for (i = 0; i < 4; ++i)
46219 dremap.perm[i] = i + nonzero_halves[0] * 4;
46220 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46224 else if (d->one_operand_p)
46225 return false;
46226 else if (TARGET_AVX2
46227 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46229 /* vpunpckl* */
46230 for (i = 0; i < nelt4; ++i)
46232 remap[i] = i * 2;
46233 remap[i + nelt] = i * 2 + 1;
46234 remap[i + nelt2] = i * 2 + nelt2;
46235 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46236 dremap.perm[i * 2] = i;
46237 dremap.perm[i * 2 + 1] = i + nelt;
46238 dremap.perm[i * 2 + nelt2] = i + nelt2;
46239 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46242 else if (TARGET_AVX2
46243 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46245 /* vpunpckh* */
46246 for (i = 0; i < nelt4; ++i)
46248 remap[i + nelt4] = i * 2;
46249 remap[i + nelt + nelt4] = i * 2 + 1;
46250 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46251 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46252 dremap.perm[i * 2] = i + nelt4;
46253 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46254 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46255 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46258 else
46259 return false;
46262 /* Use the remapping array set up above to move the elements from their
46263 swizzled locations into their final destinations. */
46264 dfinal = *d;
46265 for (i = 0; i < nelt; ++i)
46267 unsigned e = remap[d->perm[i]];
46268 gcc_assert (e < nelt);
46269 /* If same_halves is true, both halves of the remapped vector are the
46270 same. Avoid cross-lane accesses if possible. */
46271 if (same_halves && i >= nelt2)
46273 gcc_assert (e < nelt2);
46274 dfinal.perm[i] = e + nelt2;
46276 else
46277 dfinal.perm[i] = e;
46279 if (!d->testing_p)
46281 dremap.target = gen_reg_rtx (dremap.vmode);
46282 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46284 dfinal.op1 = dfinal.op0;
46285 dfinal.one_operand_p = true;
46287 /* Test if the final remap can be done with a single insn. For V4SFmode or
46288 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46289 start_sequence ();
46290 ok = expand_vec_perm_1 (&dfinal);
46291 seq = get_insns ();
46292 end_sequence ();
46294 if (!ok)
46295 return false;
46297 if (d->testing_p)
46298 return true;
46300 if (dremap.vmode != dfinal.vmode)
46302 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46303 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46306 ok = expand_vec_perm_1 (&dremap);
46307 gcc_assert (ok);
46309 emit_insn (seq);
46310 return true;
46313 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46314 a single vector cross-lane permutation into vpermq followed
46315 by any of the single insn permutations. */
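/* For instance, if the low half of a V16HImode selector only touches the
   64-bit quarters 0 and 3 of the input while its high half only touches
   quarters 1 and 2, a single vpermq with quarter order { 0, 3, 1, 2 }
   moves everything into the right 128-bit lane and an in-lane shuffle can
   then finish the permutation.  */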
46317 static bool
46318 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46320 struct expand_vec_perm_d dremap, dfinal;
46321 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46322 unsigned contents[2];
46323 bool ok;
46325 if (!(TARGET_AVX2
46326 && (d->vmode == V32QImode || d->vmode == V16HImode)
46327 && d->one_operand_p))
46328 return false;
46330 contents[0] = 0;
46331 contents[1] = 0;
46332 for (i = 0; i < nelt2; ++i)
46334 contents[0] |= 1u << (d->perm[i] / nelt4);
46335 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
46338 for (i = 0; i < 2; ++i)
46340 unsigned int cnt = 0;
46341 for (j = 0; j < 4; ++j)
46342 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46343 return false;
46346 if (d->testing_p)
46347 return true;
46349 dremap = *d;
46350 dremap.vmode = V4DImode;
46351 dremap.nelt = 4;
46352 dremap.target = gen_reg_rtx (V4DImode);
46353 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46354 dremap.op1 = dremap.op0;
46355 dremap.one_operand_p = true;
46356 for (i = 0; i < 2; ++i)
46358 unsigned int cnt = 0;
46359 for (j = 0; j < 4; ++j)
46360 if ((contents[i] & (1u << j)) != 0)
46361 dremap.perm[2 * i + cnt++] = j;
46362 for (; cnt < 2; ++cnt)
46363 dremap.perm[2 * i + cnt] = 0;
46366 dfinal = *d;
46367 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46368 dfinal.op1 = dfinal.op0;
46369 dfinal.one_operand_p = true;
46370 for (i = 0, j = 0; i < nelt; ++i)
46372 if (i == nelt2)
46373 j = 2;
46374 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46375 if ((d->perm[i] / nelt4) == dremap.perm[j])
46377 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46378 dfinal.perm[i] |= nelt4;
46379 else
46380 gcc_unreachable ();
46383 ok = expand_vec_perm_1 (&dremap);
46384 gcc_assert (ok);
46386 ok = expand_vec_perm_1 (&dfinal);
46387 gcc_assert (ok);
46389 return true;
46392 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
46393 a vector permutation using two instructions, vperm2f128 resp.
46394 vperm2i128 followed by any single in-lane permutation. */
46396 static bool
46397 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46399 struct expand_vec_perm_d dfirst, dsecond;
46400 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46401 bool ok;
46403 if (!TARGET_AVX
46404 || GET_MODE_SIZE (d->vmode) != 32
46405 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46406 return false;
46408 dsecond = *d;
46409 dsecond.one_operand_p = false;
46410 dsecond.testing_p = true;
46412 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46413 immediate. For perm < 16 the second permutation uses
46414 d->op0 as first operand, for perm >= 16 it uses d->op1
46415 as first operand. The second operand is the result of
46416 vperm2[fi]128. */
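/* For example, perm == 3 selects the high lane of d->op1 for the low
   result lane and the low lane of d->op0 for the high result lane
   ({ 6 7 0 1 } in the V4DFmode notation used below); the corresponding
   vperm2[fi]128 immediate is ((3 << 2) | 3) & 0x33 == 0x03.  */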
46417 for (perm = 0; perm < 32; perm++)
46419 /* Ignore permutations which do not move anything cross-lane. */
46420 if (perm < 16)
46422 /* The second shuffle for e.g. V4DFmode has
46423 0123 and ABCD operands.
46424 Ignore AB23, as 23 is already in the second lane
46425 of the first operand. */
46426 if ((perm & 0xc) == (1 << 2)) continue;
46427 /* And 01CD, as 01 is in the first lane of the first
46428 operand. */
46429 if ((perm & 3) == 0) continue;
46430 /* And 4567, as then the vperm2[fi]128 doesn't change
46431 anything on the original 4567 second operand. */
46432 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46434 else
46436 /* The second shuffle for e.g. V4DFmode has
46437 4567 and ABCD operands.
46438 Ignore AB67, as 67 is already in the second lane
46439 of the first operand. */
46440 if ((perm & 0xc) == (3 << 2)) continue;
46441 /* And 45CD, as 45 is in the first lane of the first
46442 operand. */
46443 if ((perm & 3) == 2) continue;
46444 /* And 0123, as then the vperm2[fi]128 doesn't change
46445 anything on the original 0123 first operand. */
46446 if ((perm & 0xf) == (1 << 2)) continue;
46449 for (i = 0; i < nelt; i++)
46451 j = d->perm[i] / nelt2;
46452 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46453 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46454 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46455 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46456 else
46457 break;
46460 if (i == nelt)
46462 start_sequence ();
46463 ok = expand_vec_perm_1 (&dsecond);
46464 end_sequence ();
46466 else
46467 ok = false;
46469 if (ok)
46471 if (d->testing_p)
46472 return true;
46474 /* Found a usable second shuffle. dfirst will be
46475 vperm2f128 on d->op0 and d->op1. */
46476 dsecond.testing_p = false;
46477 dfirst = *d;
46478 dfirst.target = gen_reg_rtx (d->vmode);
46479 for (i = 0; i < nelt; i++)
46480 dfirst.perm[i] = (i & (nelt2 - 1))
46481 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46483 canonicalize_perm (&dfirst);
46484 ok = expand_vec_perm_1 (&dfirst);
46485 gcc_assert (ok);
46487 /* And dsecond is some single insn shuffle, taking
46488 d->op0 and result of vperm2f128 (if perm < 16) or
46489 d->op1 and result of vperm2f128 (otherwise). */
46490 if (perm >= 16)
46491 dsecond.op0 = dsecond.op1;
46492 dsecond.op1 = dfirst.target;
46494 ok = expand_vec_perm_1 (&dsecond);
46495 gcc_assert (ok);
46497 return true;
46500 /* For one operand, the only useful vperm2f128 permutation is 0x01
46501 aka lanes swap. */
46502 if (d->one_operand_p)
46503 return false;
46506 return false;
46509 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46510 a two vector permutation using 2 intra-lane interleave insns
46511 and cross-lane shuffle for 32-byte vectors. */
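/* Only the two interleave patterns are accepted here; e.g. for V8SImode
   these are { 0, 8, 1, 9, 2, 10, 3, 11 } (interleave low) and
   { 4, 12, 5, 13, 6, 14, 7, 15 } (interleave high).  */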
46513 static bool
46514 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46516 unsigned i, nelt;
46517 rtx (*gen) (rtx, rtx, rtx);
46519 if (d->one_operand_p)
46520 return false;
46521 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46523 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46525 else
46526 return false;
46528 nelt = d->nelt;
46529 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46530 return false;
46531 for (i = 0; i < nelt; i += 2)
46532 if (d->perm[i] != d->perm[0] + i / 2
46533 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46534 return false;
46536 if (d->testing_p)
46537 return true;
46539 switch (d->vmode)
46541 case E_V32QImode:
46542 if (d->perm[0])
46543 gen = gen_vec_interleave_highv32qi;
46544 else
46545 gen = gen_vec_interleave_lowv32qi;
46546 break;
46547 case E_V16HImode:
46548 if (d->perm[0])
46549 gen = gen_vec_interleave_highv16hi;
46550 else
46551 gen = gen_vec_interleave_lowv16hi;
46552 break;
46553 case E_V8SImode:
46554 if (d->perm[0])
46555 gen = gen_vec_interleave_highv8si;
46556 else
46557 gen = gen_vec_interleave_lowv8si;
46558 break;
46559 case E_V4DImode:
46560 if (d->perm[0])
46561 gen = gen_vec_interleave_highv4di;
46562 else
46563 gen = gen_vec_interleave_lowv4di;
46564 break;
46565 case E_V8SFmode:
46566 if (d->perm[0])
46567 gen = gen_vec_interleave_highv8sf;
46568 else
46569 gen = gen_vec_interleave_lowv8sf;
46570 break;
46571 case E_V4DFmode:
46572 if (d->perm[0])
46573 gen = gen_vec_interleave_highv4df;
46574 else
46575 gen = gen_vec_interleave_lowv4df;
46576 break;
46577 default:
46578 gcc_unreachable ();
46581 emit_insn (gen (d->target, d->op0, d->op1));
46582 return true;
46585 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
46586 a single vector permutation using a single intra-lane vector
46587 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46588 the non-swapped and swapped vectors together. */
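/* For example, the one-operand V4DFmode permutation { 2, 1, 0, 3 } is
   handled with an identity in-lane shuffle, a vperm2f128 lane swap giving
   { 2, 3, 0, 1 }, and a vblendpd with immediate 0x5 that picks the swapped
   elements for positions 0 and 2.  */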
46590 static bool
46591 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46593 struct expand_vec_perm_d dfirst, dsecond;
46594 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46595 rtx_insn *seq;
46596 bool ok;
46597 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46599 if (!TARGET_AVX
46600 || TARGET_AVX2
46601 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46602 || !d->one_operand_p)
46603 return false;
46605 dfirst = *d;
46606 for (i = 0; i < nelt; i++)
46607 dfirst.perm[i] = 0xff;
46608 for (i = 0, msk = 0; i < nelt; i++)
46610 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46611 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46612 return false;
46613 dfirst.perm[j] = d->perm[i];
46614 if (j != i)
46615 msk |= (1 << i);
46617 for (i = 0; i < nelt; i++)
46618 if (dfirst.perm[i] == 0xff)
46619 dfirst.perm[i] = i;
46621 if (!d->testing_p)
46622 dfirst.target = gen_reg_rtx (dfirst.vmode);
46624 start_sequence ();
46625 ok = expand_vec_perm_1 (&dfirst);
46626 seq = get_insns ();
46627 end_sequence ();
46629 if (!ok)
46630 return false;
46632 if (d->testing_p)
46633 return true;
46635 emit_insn (seq);
46637 dsecond = *d;
46638 dsecond.op0 = dfirst.target;
46639 dsecond.op1 = dfirst.target;
46640 dsecond.one_operand_p = true;
46641 dsecond.target = gen_reg_rtx (dsecond.vmode);
46642 for (i = 0; i < nelt; i++)
46643 dsecond.perm[i] = i ^ nelt2;
46645 ok = expand_vec_perm_1 (&dsecond);
46646 gcc_assert (ok);
46648 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46649 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46650 return true;
46653 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
46654 permutation using two vperm2f128, followed by a vshufpd insn blending
46655 the two vectors together. */
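/* For example, the selector { 6, 1, 3, 4 } is split into the two
   vperm2f128 results { 6, 7, 2, 3 } and { 0, 1, 4, 5 }; the final vshufpd
   (even positions from the first input, odd positions from the second)
   then uses { 0, 5, 3, 6 } to reassemble { 6, 1, 3, 4 }.  */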
46657 static bool
46658 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46660 struct expand_vec_perm_d dfirst, dsecond, dthird;
46661 bool ok;
46663 if (!TARGET_AVX || (d->vmode != V4DFmode))
46664 return false;
46666 if (d->testing_p)
46667 return true;
46669 dfirst = *d;
46670 dsecond = *d;
46671 dthird = *d;
46673 dfirst.perm[0] = (d->perm[0] & ~1);
46674 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46675 dfirst.perm[2] = (d->perm[2] & ~1);
46676 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46677 dsecond.perm[0] = (d->perm[1] & ~1);
46678 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46679 dsecond.perm[2] = (d->perm[3] & ~1);
46680 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46681 dthird.perm[0] = (d->perm[0] % 2);
46682 dthird.perm[1] = (d->perm[1] % 2) + 4;
46683 dthird.perm[2] = (d->perm[2] % 2) + 2;
46684 dthird.perm[3] = (d->perm[3] % 2) + 6;
46686 dfirst.target = gen_reg_rtx (dfirst.vmode);
46687 dsecond.target = gen_reg_rtx (dsecond.vmode);
46688 dthird.op0 = dfirst.target;
46689 dthird.op1 = dsecond.target;
46690 dthird.one_operand_p = false;
46692 canonicalize_perm (&dfirst);
46693 canonicalize_perm (&dsecond);
46695 ok = expand_vec_perm_1 (&dfirst)
46696 && expand_vec_perm_1 (&dsecond)
46697 && expand_vec_perm_1 (&dthird);
46699 gcc_assert (ok);
46701 return true;
46704 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46705 permutation with two pshufb insns and an ior. We should have already
46706 failed all two instruction sequences. */
46708 static bool
46709 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46711 rtx rperm[2][16], vperm, l, h, op, m128;
46712 unsigned int i, nelt, eltsz;
46714 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46715 return false;
46716 gcc_assert (!d->one_operand_p);
46718 if (d->testing_p)
46719 return true;
46721 nelt = d->nelt;
46722 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46724 /* Generate two permutation masks. If the required element is within
46725 the given vector it is shuffled into the proper lane. If the required
46726 element is in the other vector, force a zero into the lane by setting
46727 bit 7 in the permutation mask. */
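/* E.g. if element 0 of a V16QImode permutation is index 18, i.e. byte 2
   of the second operand, byte 0 of the first mask becomes -128 (forcing
   a zero) while byte 0 of the second mask becomes 2, and the final ior
   keeps the byte produced by the second pshufb.  */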
46728 m128 = GEN_INT (-128);
46729 for (i = 0; i < nelt; ++i)
46731 unsigned j, e = d->perm[i];
46732 unsigned which = (e >= nelt);
46733 if (e >= nelt)
46734 e -= nelt;
46736 for (j = 0; j < eltsz; ++j)
46738 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46739 rperm[1-which][i*eltsz + j] = m128;
46743 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46744 vperm = force_reg (V16QImode, vperm);
46746 l = gen_reg_rtx (V16QImode);
46747 op = gen_lowpart (V16QImode, d->op0);
46748 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46750 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46751 vperm = force_reg (V16QImode, vperm);
46753 h = gen_reg_rtx (V16QImode);
46754 op = gen_lowpart (V16QImode, d->op1);
46755 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46757 op = d->target;
46758 if (d->vmode != V16QImode)
46759 op = gen_reg_rtx (V16QImode);
46760 emit_insn (gen_iorv16qi3 (op, l, h));
46761 if (op != d->target)
46762 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46764 return true;
46767 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
46768 with two vpshufb insns, vpermq and vpor. We should have already failed
46769 all two or three instruction sequences. */
46771 static bool
46772 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46774 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46775 unsigned int i, nelt, eltsz;
46777 if (!TARGET_AVX2
46778 || !d->one_operand_p
46779 || (d->vmode != V32QImode && d->vmode != V16HImode))
46780 return false;
46782 if (d->testing_p)
46783 return true;
46785 nelt = d->nelt;
46786 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46788 /* Generate two permutation masks. If the required element is within
46789 the same lane, it is shuffled in. If the required element is from the
46790 other lane, force a zero by setting bit 7 in the permutation mask.
46791 The other mask has non-negative elements where the element
46792 is requested from the other lane, but also moved to the other lane,
46793 so that the result of vpshufb can have the two V2TImode halves
46794 swapped. */
46795 m128 = GEN_INT (-128);
46796 for (i = 0; i < nelt; ++i)
46798 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46799 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46801 for (j = 0; j < eltsz; ++j)
46803 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46804 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46808 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46809 vperm = force_reg (V32QImode, vperm);
46811 h = gen_reg_rtx (V32QImode);
46812 op = gen_lowpart (V32QImode, d->op0);
46813 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46815 /* Swap the 128-bit lanes of h into hp. */
46816 hp = gen_reg_rtx (V4DImode);
46817 op = gen_lowpart (V4DImode, h);
46818 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46819 const1_rtx));
46821 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46822 vperm = force_reg (V32QImode, vperm);
46824 l = gen_reg_rtx (V32QImode);
46825 op = gen_lowpart (V32QImode, d->op0);
46826 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46828 op = d->target;
46829 if (d->vmode != V32QImode)
46830 op = gen_reg_rtx (V32QImode);
46831 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46832 if (op != d->target)
46833 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46835 return true;
46838 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46839 and extract-odd permutations of two V32QImode or V16HImode operands
46840 with two vpshufb insns, vpor and vpermq. We should have already
46841 failed all two or three instruction sequences. */
46843 static bool
46844 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46846 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46847 unsigned int i, nelt, eltsz;
46849 if (!TARGET_AVX2
46850 || d->one_operand_p
46851 || (d->vmode != V32QImode && d->vmode != V16HImode))
46852 return false;
46854 for (i = 0; i < d->nelt; ++i)
46855 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46856 return false;
46858 if (d->testing_p)
46859 return true;
46861 nelt = d->nelt;
46862 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46864 /* Generate two permutation masks. In the first permutation mask
46865 the first quarter will contain indexes for the first half
46866 of the op0, the second quarter will contain bit 7 set, third quarter
46867 will contain indexes for the second half of the op0 and the
46868 last quarter bit 7 set. In the second permutation mask
46869 the first quarter will contain bit 7 set, the second quarter
46870 indexes for the first half of the op1, the third quarter bit 7 set
46871 and last quarter indexes for the second half of the op1.
46872 I.e. the first mask e.g. for V32QImode extract even will be:
46873 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46874 (all values masked with 0xf except for -128) and second mask
46875 for extract even will be
46876 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46877 m128 = GEN_INT (-128);
46878 for (i = 0; i < nelt; ++i)
46880 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46881 unsigned which = d->perm[i] >= nelt;
46882 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
46884 for (j = 0; j < eltsz; ++j)
46886 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46887 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46891 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46892 vperm = force_reg (V32QImode, vperm);
46894 l = gen_reg_rtx (V32QImode);
46895 op = gen_lowpart (V32QImode, d->op0);
46896 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46898 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46899 vperm = force_reg (V32QImode, vperm);
46901 h = gen_reg_rtx (V32QImode);
46902 op = gen_lowpart (V32QImode, d->op1);
46903 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46905 ior = gen_reg_rtx (V32QImode);
46906 emit_insn (gen_iorv32qi3 (ior, l, h));
46908 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46909 op = gen_reg_rtx (V4DImode);
46910 ior = gen_lowpart (V4DImode, ior);
46911 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46912 const1_rtx, GEN_INT (3)));
46913 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46915 return true;
46918 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46919 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46920 with two "and" and "pack" or two "shift" and "pack" insns. We should
46921 have already failed all two instruction sequences. */
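/* For instance, extracting the even bytes of two V16QImode vectors views
   each input as V8HImode, masks every word with 0x00ff so that only the
   even byte survives, and then packuswb concatenates the 16 surviving
   bytes in order; for the odd bytes a logical right shift by 8 replaces
   the mask.  */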
46923 static bool
46924 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46926 rtx op, dop0, dop1, t;
46927 unsigned i, odd, c, s, nelt = d->nelt;
46928 bool end_perm = false;
46929 machine_mode half_mode;
46930 rtx (*gen_and) (rtx, rtx, rtx);
46931 rtx (*gen_pack) (rtx, rtx, rtx);
46932 rtx (*gen_shift) (rtx, rtx, rtx);
46934 if (d->one_operand_p)
46935 return false;
46937 switch (d->vmode)
46939 case E_V8HImode:
46940 /* Required for "pack". */
46941 if (!TARGET_SSE4_1)
46942 return false;
46943 c = 0xffff;
46944 s = 16;
46945 half_mode = V4SImode;
46946 gen_and = gen_andv4si3;
46947 gen_pack = gen_sse4_1_packusdw;
46948 gen_shift = gen_lshrv4si3;
46949 break;
46950 case E_V16QImode:
46951 /* No check as all instructions are SSE2. */
46952 c = 0xff;
46953 s = 8;
46954 half_mode = V8HImode;
46955 gen_and = gen_andv8hi3;
46956 gen_pack = gen_sse2_packuswb;
46957 gen_shift = gen_lshrv8hi3;
46958 break;
46959 case E_V16HImode:
46960 if (!TARGET_AVX2)
46961 return false;
46962 c = 0xffff;
46963 s = 16;
46964 half_mode = V8SImode;
46965 gen_and = gen_andv8si3;
46966 gen_pack = gen_avx2_packusdw;
46967 gen_shift = gen_lshrv8si3;
46968 end_perm = true;
46969 break;
46970 case E_V32QImode:
46971 if (!TARGET_AVX2)
46972 return false;
46973 c = 0xff;
46974 s = 8;
46975 half_mode = V16HImode;
46976 gen_and = gen_andv16hi3;
46977 gen_pack = gen_avx2_packuswb;
46978 gen_shift = gen_lshrv16hi3;
46979 end_perm = true;
46980 break;
46981 default:
46982 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
46983 general shuffles. */
46984 return false;
46987 /* Check that permutation is even or odd. */
46988 odd = d->perm[0];
46989 if (odd > 1)
46990 return false;
46992 for (i = 1; i < nelt; ++i)
46993 if (d->perm[i] != 2 * i + odd)
46994 return false;
46996 if (d->testing_p)
46997 return true;
46999 dop0 = gen_reg_rtx (half_mode);
47000 dop1 = gen_reg_rtx (half_mode);
47001 if (odd == 0)
47003 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
47004 t = force_reg (half_mode, t);
47005 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47006 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47008 else
47010 emit_insn (gen_shift (dop0,
47011 gen_lowpart (half_mode, d->op0),
47012 GEN_INT (s)));
47013 emit_insn (gen_shift (dop1,
47014 gen_lowpart (half_mode, d->op1),
47015 GEN_INT (s)));
47017 /* In the AVX2 256-bit case we need to permute the pack result. */
47018 if (TARGET_AVX2 && end_perm)
47020 op = gen_reg_rtx (d->vmode);
47021 t = gen_reg_rtx (V4DImode);
47022 emit_insn (gen_pack (op, dop0, dop1));
47023 emit_insn (gen_avx2_permv4di_1 (t,
47024 gen_lowpart (V4DImode, op),
47025 const0_rtx,
47026 const2_rtx,
47027 const1_rtx,
47028 GEN_INT (3)));
47029 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47031 else
47032 emit_insn (gen_pack (d->target, dop0, dop1));
47034 return true;
47037 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47038 and extract-odd permutations of two V64QI operands
47039 with two "shifts", two "truncs" and one "concat" insn for "odd"
47040 and two "truncs" and one "concat" insn for "even".
47041 We should have already failed all two instruction sequences. */
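/* E.g. for the "even" case, truncating each V32HImode word to a byte
   keeps exactly the even bytes of the corresponding V64QImode operand,
   and the concat glues the two truncated halves back together; for "odd"
   the words are first shifted right by 8 so the odd bytes land in the
   truncated position.  */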
47043 static bool
47044 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47046 rtx t1, t2, t3, t4;
47047 unsigned i, odd, nelt = d->nelt;
47049 if (!TARGET_AVX512BW
47050 || d->one_operand_p
47051 || d->vmode != V64QImode)
47052 return false;
47054 /* Check that permutation is even or odd. */
47055 odd = d->perm[0];
47056 if (odd > 1)
47057 return false;
47059 for (i = 1; i < nelt; ++i)
47060 if (d->perm[i] != 2 * i + odd)
47061 return false;
47063 if (d->testing_p)
47064 return true;
47067 if (odd)
47069 t1 = gen_reg_rtx (V32HImode);
47070 t2 = gen_reg_rtx (V32HImode);
47071 emit_insn (gen_lshrv32hi3 (t1,
47072 gen_lowpart (V32HImode, d->op0),
47073 GEN_INT (8)));
47074 emit_insn (gen_lshrv32hi3 (t2,
47075 gen_lowpart (V32HImode, d->op1),
47076 GEN_INT (8)));
47078 else
47080 t1 = gen_lowpart (V32HImode, d->op0);
47081 t2 = gen_lowpart (V32HImode, d->op1);
47084 t3 = gen_reg_rtx (V32QImode);
47085 t4 = gen_reg_rtx (V32QImode);
47086 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47087 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47088 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47090 return true;
47093 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47094 and extract-odd permutations. */
47096 static bool
47097 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47099 rtx t1, t2, t3, t4, t5;
47101 switch (d->vmode)
47103 case E_V4DFmode:
47104 if (d->testing_p)
47105 break;
47106 t1 = gen_reg_rtx (V4DFmode);
47107 t2 = gen_reg_rtx (V4DFmode);
47109 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47110 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47111 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47113 /* Now an unpck[lh]pd will produce the result required. */
47114 if (odd)
47115 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47116 else
47117 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47118 emit_insn (t3);
47119 break;
47121 case E_V8SFmode:
47123 int mask = odd ? 0xdd : 0x88;
47125 if (d->testing_p)
47126 break;
47127 t1 = gen_reg_rtx (V8SFmode);
47128 t2 = gen_reg_rtx (V8SFmode);
47129 t3 = gen_reg_rtx (V8SFmode);
47131 /* Shuffle within the 128-bit lanes to produce:
47132 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47133 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47134 GEN_INT (mask)));
47136 /* Shuffle the lanes around to produce:
47137 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47138 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47139 GEN_INT (0x3)));
47141 /* Shuffle within the 128-bit lanes to produce:
47142 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47143 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47145 /* Shuffle within the 128-bit lanes to produce:
47146 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47147 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47149 /* Shuffle the lanes around to produce:
47150 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47151 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47152 GEN_INT (0x20)));
47154 break;
47156 case E_V2DFmode:
47157 case E_V4SFmode:
47158 case E_V2DImode:
47159 case E_V4SImode:
47160 /* These are always directly implementable by expand_vec_perm_1. */
47161 gcc_unreachable ();
47163 case E_V8HImode:
47164 if (TARGET_SSE4_1)
47165 return expand_vec_perm_even_odd_pack (d);
47166 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47167 return expand_vec_perm_pshufb2 (d);
47168 else
47170 if (d->testing_p)
47171 break;
47172 /* We need 2*log2(N)-1 operations to achieve odd/even
47173 with interleave. */
47174 t1 = gen_reg_rtx (V8HImode);
47175 t2 = gen_reg_rtx (V8HImode);
47176 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47177 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47178 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47179 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47180 if (odd)
47181 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47182 else
47183 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47184 emit_insn (t3);
47186 break;
47188 case E_V16QImode:
47189 return expand_vec_perm_even_odd_pack (d);
47191 case E_V16HImode:
47192 case E_V32QImode:
47193 return expand_vec_perm_even_odd_pack (d);
47195 case E_V64QImode:
47196 return expand_vec_perm_even_odd_trunc (d);
47198 case E_V4DImode:
47199 if (!TARGET_AVX2)
47201 struct expand_vec_perm_d d_copy = *d;
47202 d_copy.vmode = V4DFmode;
47203 if (d->testing_p)
47204 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47205 else
47206 d_copy.target = gen_reg_rtx (V4DFmode);
47207 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47208 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47209 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47211 if (!d->testing_p)
47212 emit_move_insn (d->target,
47213 gen_lowpart (V4DImode, d_copy.target));
47214 return true;
47216 return false;
47219 if (d->testing_p)
47220 break;
47222 t1 = gen_reg_rtx (V4DImode);
47223 t2 = gen_reg_rtx (V4DImode);
47225 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47226 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47227 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47229 /* Now a vpunpck[lh]qdq will produce the result required. */
47230 if (odd)
47231 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47232 else
47233 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47234 emit_insn (t3);
47235 break;
47237 case E_V8SImode:
47238 if (!TARGET_AVX2)
47240 struct expand_vec_perm_d d_copy = *d;
47241 d_copy.vmode = V8SFmode;
47242 if (d->testing_p)
47243 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47244 else
47245 d_copy.target = gen_reg_rtx (V8SFmode);
47246 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47247 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47248 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47250 if (!d->testing_p)
47251 emit_move_insn (d->target,
47252 gen_lowpart (V8SImode, d_copy.target));
47253 return true;
47255 return false;
47258 if (d->testing_p)
47259 break;
47261 t1 = gen_reg_rtx (V8SImode);
47262 t2 = gen_reg_rtx (V8SImode);
47263 t3 = gen_reg_rtx (V4DImode);
47264 t4 = gen_reg_rtx (V4DImode);
47265 t5 = gen_reg_rtx (V4DImode);
47267 /* Shuffle the lanes around into
47268 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47269 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47270 gen_lowpart (V4DImode, d->op1),
47271 GEN_INT (0x20)));
47272 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47273 gen_lowpart (V4DImode, d->op1),
47274 GEN_INT (0x31)));
47276 /* Swap the 2nd and 3rd position in each lane into
47277 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47278 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47279 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47280 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47281 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47283 /* Now a vpunpck[lh]qdq will produce
47284 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47285 if (odd)
47286 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47287 gen_lowpart (V4DImode, t2));
47288 else
47289 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47290 gen_lowpart (V4DImode, t2));
47291 emit_insn (t3);
47292 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47293 break;
47295 default:
47296 gcc_unreachable ();
47299 return true;
47302 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47303 extract-even and extract-odd permutations. */
47305 static bool
47306 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47308 unsigned i, odd, nelt = d->nelt;
47310 odd = d->perm[0];
47311 if (odd != 0 && odd != 1)
47312 return false;
47314 for (i = 1; i < nelt; ++i)
47315 if (d->perm[i] != 2 * i + odd)
47316 return false;
47318 return expand_vec_perm_even_odd_1 (d, odd);
47321 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
47322 permutations. We assume that expand_vec_perm_1 has already failed. */
47324 static bool
47325 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47327 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47328 machine_mode vmode = d->vmode;
47329 unsigned char perm2[4];
47330 rtx op0 = d->op0, dest;
47331 bool ok;
47333 switch (vmode)
47335 case E_V4DFmode:
47336 case E_V8SFmode:
47337 /* These are special-cased in sse.md so that we can optionally
47338 use the vbroadcast instruction. They expand to two insns
47339 if the input happens to be in a register. */
47340 gcc_unreachable ();
47342 case E_V2DFmode:
47343 case E_V2DImode:
47344 case E_V4SFmode:
47345 case E_V4SImode:
47346 /* These are always implementable using standard shuffle patterns. */
47347 gcc_unreachable ();
47349 case E_V8HImode:
47350 case E_V16QImode:
47351 /* These can be implemented via interleave. We save one insn by
47352 stopping once we have promoted to V4SImode and then use pshufd. */
47353 if (d->testing_p)
47354 return true;
47357 rtx dest;
47358 rtx (*gen) (rtx, rtx, rtx)
47359 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47360 : gen_vec_interleave_lowv8hi;
47362 if (elt >= nelt2)
47364 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47365 : gen_vec_interleave_highv8hi;
47366 elt -= nelt2;
47368 nelt2 /= 2;
47370 dest = gen_reg_rtx (vmode);
47371 emit_insn (gen (dest, op0, op0));
47372 vmode = get_mode_wider_vector (vmode);
47373 op0 = gen_lowpart (vmode, dest);
47375 while (vmode != V4SImode);
47377 memset (perm2, elt, 4);
47378 dest = gen_reg_rtx (V4SImode);
47379 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47380 gcc_assert (ok);
47381 if (!d->testing_p)
47382 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47383 return true;
47385 case E_V64QImode:
47386 case E_V32QImode:
47387 case E_V16HImode:
47388 case E_V8SImode:
47389 case E_V4DImode:
47390 /* For AVX2 broadcasts of the first element vpbroadcast* or
47391 vpermq should be used by expand_vec_perm_1. */
47392 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47393 return false;
47395 default:
47396 gcc_unreachable ();
47400 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47401 broadcast permutations. */
47403 static bool
47404 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47406 unsigned i, elt, nelt = d->nelt;
47408 if (!d->one_operand_p)
47409 return false;
47411 elt = d->perm[0];
47412 for (i = 1; i < nelt; ++i)
47413 if (d->perm[i] != elt)
47414 return false;
47416 return expand_vec_perm_broadcast_1 (d);
47419 /* Implement arbitrary permutations of two V64QImode operands
47420 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
47421 static bool
47422 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
47424 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47425 return false;
47427 if (d->testing_p)
47428 return true;
47430 struct expand_vec_perm_d ds[2];
47431 rtx rperm[128], vperm, target0, target1;
47432 unsigned int i, nelt;
47433 machine_mode vmode;
47435 nelt = d->nelt;
47436 vmode = V64QImode;
47438 for (i = 0; i < 2; i++)
47440 ds[i] = *d;
47441 ds[i].vmode = V32HImode;
47442 ds[i].nelt = 32;
47443 ds[i].target = gen_reg_rtx (V32HImode);
47444 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47445 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47448 /* Prepare permutations such that the first one takes care of
47449 putting the even bytes into the right positions or one position
47450 higher (ds[0]) and the second one takes care of
47451 putting the odd bytes into the right positions or one position below
47452 (ds[1]). */
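/* For example, if d->perm[6] is 21, i.e. the high byte of word 10 of the
   concatenated input, then ds[0] places word 10 at word position 3, the
   vpshufb mask below selects the in-lane byte (6 & 14) + 1 == 7, i.e. the
   high byte of that word, and the corresponding byte of the other mask is
   -1, which makes vpshufb write zero there, so the final ior keeps only
   the wanted byte.  */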
47454 for (i = 0; i < nelt; i++)
47456 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47457 if (i & 1)
47459 rperm[i] = constm1_rtx;
47460 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47462 else
47464 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47465 rperm[i + 64] = constm1_rtx;
47469 bool ok = expand_vec_perm_1 (&ds[0]);
47470 gcc_assert (ok);
47471 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47473 ok = expand_vec_perm_1 (&ds[1]);
47474 gcc_assert (ok);
47475 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47477 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47478 vperm = force_reg (vmode, vperm);
47479 target0 = gen_reg_rtx (V64QImode);
47480 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47482 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47483 vperm = force_reg (vmode, vperm);
47484 target1 = gen_reg_rtx (V64QImode);
47485 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47487 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47488 return true;
47491 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
47492 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47493 all the shorter instruction sequences. */
47495 static bool
47496 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47498 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47499 unsigned int i, nelt, eltsz;
47500 bool used[4];
47502 if (!TARGET_AVX2
47503 || d->one_operand_p
47504 || (d->vmode != V32QImode && d->vmode != V16HImode))
47505 return false;
47507 if (d->testing_p)
47508 return true;
47510 nelt = d->nelt;
47511 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47513 /* Generate 4 permutation masks. If the required element is within
47514 the same lane, it is shuffled in. If the required element is from the
47515 other lane, force a zero by setting bit 7 in the permutation mask.
47516 The other mask has non-negative elements where the element
47517 is requested from the other lane, but also moved to the other lane,
47518 so that the result of vpshufb can have the two V2TImode halves
47519 swapped. */
47520 m128 = GEN_INT (-128);
47521 for (i = 0; i < 32; ++i)
47523 rperm[0][i] = m128;
47524 rperm[1][i] = m128;
47525 rperm[2][i] = m128;
47526 rperm[3][i] = m128;
47528 used[0] = false;
47529 used[1] = false;
47530 used[2] = false;
47531 used[3] = false;
47532 for (i = 0; i < nelt; ++i)
47534 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47535 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47536 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47538 for (j = 0; j < eltsz; ++j)
47539 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47540 used[which] = true;
47543 for (i = 0; i < 2; ++i)
47545 if (!used[2 * i + 1])
47547 h[i] = NULL_RTX;
47548 continue;
47550 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47551 gen_rtvec_v (32, rperm[2 * i + 1]));
47552 vperm = force_reg (V32QImode, vperm);
47553 h[i] = gen_reg_rtx (V32QImode);
47554 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47555 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47558 /* Swap the 128-bit lanes of h[X]. */
47559 for (i = 0; i < 2; ++i)
47561 if (h[i] == NULL_RTX)
47562 continue;
47563 op = gen_reg_rtx (V4DImode);
47564 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47565 const2_rtx, GEN_INT (3), const0_rtx,
47566 const1_rtx));
47567 h[i] = gen_lowpart (V32QImode, op);
47570 for (i = 0; i < 2; ++i)
47572 if (!used[2 * i])
47574 l[i] = NULL_RTX;
47575 continue;
47577 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47578 vperm = force_reg (V32QImode, vperm);
47579 l[i] = gen_reg_rtx (V32QImode);
47580 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47581 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47584 for (i = 0; i < 2; ++i)
47586 if (h[i] && l[i])
47588 op = gen_reg_rtx (V32QImode);
47589 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47590 l[i] = op;
47592 else if (h[i])
47593 l[i] = h[i];
47596 gcc_assert (l[0] && l[1]);
47597 op = d->target;
47598 if (d->vmode != V32QImode)
47599 op = gen_reg_rtx (V32QImode);
47600 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47601 if (op != d->target)
47602 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47603 return true;
47606 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
47607 With all of the interface bits taken care of, perform the expansion
47608 in D and return true on success. */
47610 static bool
47611 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47613 /* Try a single instruction expansion. */
47614 if (expand_vec_perm_1 (d))
47615 return true;
47617 /* Try sequences of two instructions. */
47619 if (expand_vec_perm_pshuflw_pshufhw (d))
47620 return true;
47622 if (expand_vec_perm_palignr (d, false))
47623 return true;
47625 if (expand_vec_perm_interleave2 (d))
47626 return true;
47628 if (expand_vec_perm_broadcast (d))
47629 return true;
47631 if (expand_vec_perm_vpermq_perm_1 (d))
47632 return true;
47634 if (expand_vec_perm_vperm2f128 (d))
47635 return true;
47637 if (expand_vec_perm_pblendv (d))
47638 return true;
47640 /* Try sequences of three instructions. */
47642 if (expand_vec_perm_even_odd_pack (d))
47643 return true;
47645 if (expand_vec_perm_2vperm2f128_vshuf (d))
47646 return true;
47648 if (expand_vec_perm_pshufb2 (d))
47649 return true;
47651 if (expand_vec_perm_interleave3 (d))
47652 return true;
47654 if (expand_vec_perm_vperm2f128_vblend (d))
47655 return true;
47657 /* Try sequences of four instructions. */
47659 if (expand_vec_perm_even_odd_trunc (d))
47660 return true;
47661 if (expand_vec_perm_vpshufb2_vpermq (d))
47662 return true;
47664 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47665 return true;
47667 if (expand_vec_perm_vpermt2_vpshub2 (d))
47668 return true;
47670 /* ??? Look for narrow permutations whose element orderings would
47671 allow the promotion to a wider mode. */
47673 /* ??? Look for sequences of interleave or a wider permute that place
47674 the data into the correct lanes for a half-vector shuffle like
47675 pshuf[lh]w or vpermilps. */
47677 /* ??? Look for sequences of interleave that produce the desired results.
47678 The combinatorics of punpck[lh] get pretty ugly... */
47680 if (expand_vec_perm_even_odd (d))
47681 return true;
47683 /* Even longer sequences. */
47684 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47685 return true;
47687 /* See if we can get the same permutation in a different vector integer
47688 mode. */
47689 struct expand_vec_perm_d nd;
47690 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47692 if (!d->testing_p)
47693 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47694 return true;
47697 return false;
47700 /* If a permutation only uses one operand, make it clear. Returns true
47701 if the permutation references both operands. */
47703 static bool
47704 canonicalize_perm (struct expand_vec_perm_d *d)
47706 int i, which, nelt = d->nelt;
47708 for (i = which = 0; i < nelt; ++i)
47709 which |= (d->perm[i] < nelt ? 1 : 2);
47711 d->one_operand_p = true;
47712 switch (which)
47714 default:
47715 gcc_unreachable();
47717 case 3:
47718 if (!rtx_equal_p (d->op0, d->op1))
47720 d->one_operand_p = false;
47721 break;
47723 /* The elements of PERM do not suggest that only the first operand
47724 is used, but both operands are identical. Allow easier matching
47725 of the permutation by folding the permutation into the single
47726 input vector. */
47727 /* FALLTHRU */
47729 case 2:
47730 for (i = 0; i < nelt; ++i)
47731 d->perm[i] &= nelt - 1;
47732 d->op0 = d->op1;
47733 break;
47735 case 1:
47736 d->op1 = d->op0;
47737 break;
47740 return (which == 3);
47743 bool
47744 ix86_expand_vec_perm_const (rtx operands[4])
47746 struct expand_vec_perm_d d;
47747 unsigned char perm[MAX_VECT_LEN];
47748 int i, nelt;
47749 bool two_args;
47750 rtx sel;
47752 d.target = operands[0];
47753 d.op0 = operands[1];
47754 d.op1 = operands[2];
47755 sel = operands[3];
47757 d.vmode = GET_MODE (d.target);
47758 gcc_assert (VECTOR_MODE_P (d.vmode));
47759 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47760 d.testing_p = false;
47762 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
47763 gcc_assert (XVECLEN (sel, 0) == nelt);
47764 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47766 for (i = 0; i < nelt; ++i)
47768 rtx e = XVECEXP (sel, 0, i);
47769 int ei = INTVAL (e) & (2 * nelt - 1);
47770 d.perm[i] = ei;
47771 perm[i] = ei;
47774 two_args = canonicalize_perm (&d);
47776 if (ix86_expand_vec_perm_const_1 (&d))
47777 return true;
47779 /* If the selector says both arguments are needed, but the operands are the
47780 same, the above tried to expand with one_operand_p and flattened selector.
47781 If that didn't work, retry without one_operand_p; we succeeded with that
47782 during testing. */
47783 if (two_args && d.one_operand_p)
47785 d.one_operand_p = false;
47786 memcpy (d.perm, perm, sizeof (perm));
47787 return ix86_expand_vec_perm_const_1 (&d);
47790 return false;
47793 /* Implement targetm.vectorize.vec_perm_const_ok. */
47795 static bool
47796 ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
47798 struct expand_vec_perm_d d;
47799 unsigned int i, nelt, which;
47800 bool ret;
47802 d.vmode = vmode;
47803 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47804 d.testing_p = true;
47806 /* Given sufficient ISA support we can just return true here
47807 for selected vector modes. */
47808 switch (d.vmode)
47810 case E_V16SFmode:
47811 case E_V16SImode:
47812 case E_V8DImode:
47813 case E_V8DFmode:
47814 if (TARGET_AVX512F)
47815 /* All implementable with a single vperm[it]2 insn. */
47816 return true;
47817 break;
47818 case E_V32HImode:
47819 if (TARGET_AVX512BW)
47820 /* All implementable with a single vperm[it]2 insn. */
47821 return true;
47822 break;
47823 case E_V64QImode:
47824 if (TARGET_AVX512BW)
47825 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
47826 return true;
47827 break;
47828 case E_V8SImode:
47829 case E_V8SFmode:
47830 case E_V4DFmode:
47831 case E_V4DImode:
47832 if (TARGET_AVX512VL)
47833 /* All implementable with a single vperm[it]2 insn. */
47834 return true;
47835 break;
47836 case E_V16HImode:
47837 if (TARGET_AVX2)
47838 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47839 return true;
47840 break;
47841 case E_V32QImode:
47842 if (TARGET_AVX2)
47843 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47844 return true;
47845 break;
47846 case E_V4SImode:
47847 case E_V4SFmode:
47848 case E_V8HImode:
47849 case E_V16QImode:
47850 /* All implementable with a single vpperm insn. */
47851 if (TARGET_XOP)
47852 return true;
47853 /* All implementable with 2 pshufb + 1 ior. */
47854 if (TARGET_SSSE3)
47855 return true;
47856 break;
47857 case E_V2DImode:
47858 case E_V2DFmode:
47859 /* All implementable with shufpd or unpck[lh]pd. */
47860 return true;
47861 default:
47862 return false;
47865 /* Extract the values from the vector CST into the permutation
47866 array in D. */
47867 for (i = which = 0; i < nelt; ++i)
47869 unsigned char e = sel[i];
47870 gcc_assert (e < 2 * nelt);
47871 d.perm[i] = e;
47872 which |= (e < nelt ? 1 : 2);
47875 /* For all elements from the second vector, fold the elements to the first. */
47876 if (which == 2)
47877 for (i = 0; i < nelt; ++i)
47878 d.perm[i] -= nelt;
47880 /* Check whether the mask can be applied to the vector type. */
47881 d.one_operand_p = (which != 3);
47883 /* Implementable with shufps or pshufd. */
47884 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47885 return true;
47887 /* Otherwise we have to go through the motions and see if we can
47888 figure out how to generate the requested permutation. */
47889 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47890 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47891 if (!d.one_operand_p)
47892 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
47894 start_sequence ();
47895 ret = ix86_expand_vec_perm_const_1 (&d);
47896 end_sequence ();
47898 return ret;
47901 void
47902 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47904 struct expand_vec_perm_d d;
47905 unsigned i, nelt;
47907 d.target = targ;
47908 d.op0 = op0;
47909 d.op1 = op1;
47910 d.vmode = GET_MODE (targ);
47911 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47912 d.one_operand_p = false;
47913 d.testing_p = false;
47915 for (i = 0; i < nelt; ++i)
47916 d.perm[i] = i * 2 + odd;
47918 /* We'll either be able to implement the permutation directly... */
47919 if (expand_vec_perm_1 (&d))
47920 return;
47922 /* ... or we use the special-case patterns. */
47923 expand_vec_perm_even_odd_1 (&d, odd);
47926 static void
47927 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
47929 struct expand_vec_perm_d d;
47930 unsigned i, nelt, base;
47931 bool ok;
47933 d.target = targ;
47934 d.op0 = op0;
47935 d.op1 = op1;
47936 d.vmode = GET_MODE (targ);
47937 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47938 d.one_operand_p = false;
47939 d.testing_p = false;
47941 base = high_p ? nelt / 2 : 0;
47942 for (i = 0; i < nelt / 2; ++i)
47944 d.perm[i * 2] = i + base;
47945 d.perm[i * 2 + 1] = i + base + nelt;
47948 /* Note that for AVX this isn't one instruction. */
47949 ok = ix86_expand_vec_perm_const_1 (&d);
47950 gcc_assert (ok);
47954 /* Expand a vector operation CODE for a V*QImode in terms of the
47955 same operation on V*HImode. */
47957 void
47958 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
47960 machine_mode qimode = GET_MODE (dest);
47961 machine_mode himode;
47962 rtx (*gen_il) (rtx, rtx, rtx);
47963 rtx (*gen_ih) (rtx, rtx, rtx);
47964 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
47965 struct expand_vec_perm_d d;
47966 bool ok, full_interleave;
47967 bool uns_p = false;
47968 int i;
47970 switch (qimode)
47972 case E_V16QImode:
47973 himode = V8HImode;
47974 gen_il = gen_vec_interleave_lowv16qi;
47975 gen_ih = gen_vec_interleave_highv16qi;
47976 break;
47977 case E_V32QImode:
47978 himode = V16HImode;
47979 gen_il = gen_avx2_interleave_lowv32qi;
47980 gen_ih = gen_avx2_interleave_highv32qi;
47981 break;
47982 case E_V64QImode:
47983 himode = V32HImode;
47984 gen_il = gen_avx512bw_interleave_lowv64qi;
47985 gen_ih = gen_avx512bw_interleave_highv64qi;
47986 break;
47987 default:
47988 gcc_unreachable ();
47991 op2_l = op2_h = op2;
47992 switch (code)
47994 case MULT:
47995 /* Unpack data such that we've got a source byte in each low byte of
47996 each word. We don't care what goes into the high byte of each word.
47997 Rather than trying to get zero in there, most convenient is to let
47998 it be a copy of the low byte. */
47999 op2_l = gen_reg_rtx (qimode);
48000 op2_h = gen_reg_rtx (qimode);
48001 emit_insn (gen_il (op2_l, op2, op2));
48002 emit_insn (gen_ih (op2_h, op2, op2));
48004 op1_l = gen_reg_rtx (qimode);
48005 op1_h = gen_reg_rtx (qimode);
48006 emit_insn (gen_il (op1_l, op1, op1));
48007 emit_insn (gen_ih (op1_h, op1, op1));
48008 full_interleave = qimode == V16QImode;
48009 break;
48011 case ASHIFT:
48012 case LSHIFTRT:
48013 uns_p = true;
48014 /* FALLTHRU */
48015 case ASHIFTRT:
48016 op1_l = gen_reg_rtx (himode);
48017 op1_h = gen_reg_rtx (himode);
48018 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48019 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48020 full_interleave = true;
48021 break;
48022 default:
48023 gcc_unreachable ();
48026 /* Perform the operation. */
48027 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48028 1, OPTAB_DIRECT);
48029 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48030 1, OPTAB_DIRECT);
48031 gcc_assert (res_l && res_h);
48033 /* Merge the data back into the right place. */
48034 d.target = dest;
48035 d.op0 = gen_lowpart (qimode, res_l);
48036 d.op1 = gen_lowpart (qimode, res_h);
48037 d.vmode = qimode;
48038 d.nelt = GET_MODE_NUNITS (qimode);
48039 d.one_operand_p = false;
48040 d.testing_p = false;
48042 if (full_interleave)
48044 /* For SSE2, we used a full interleave, so the desired
48045 results are in the even elements. */
48046 for (i = 0; i < d.nelt; ++i)
48047 d.perm[i] = i * 2;
48049 else
48051 /* For AVX, the interleave used above was not cross-lane. So the
48052 extraction picks the even elements, but with the second and third quarters swapped.
48053 Happily, that is even one insn shorter than even extraction.
48054 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48055 always first from the first and then from the second source operand;
48056 the index bits above the low 4 bits remain the same.
48057 Thus, for d.nelt == 32 we want permutation
48058 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48059 and for d.nelt == 64 we want permutation
48060 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48061 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
48062 for (i = 0; i < d.nelt; ++i)
48063 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48066 ok = ix86_expand_vec_perm_const_1 (&d);
48067 gcc_assert (ok);
48069 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48070 gen_rtx_fmt_ee (code, qimode, op1, op2));
48073 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48074 if op is CONST_VECTOR with all odd elements equal to their
48075 preceding element. */
48077 static bool
48078 const_vector_equal_evenodd_p (rtx op)
48080 machine_mode mode = GET_MODE (op);
48081 int i, nunits = GET_MODE_NUNITS (mode);
48082 if (GET_CODE (op) != CONST_VECTOR
48083 || nunits != CONST_VECTOR_NUNITS (op))
48084 return false;
48085 for (i = 0; i < nunits; i += 2)
48086 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48087 return false;
48088 return true;
48091 void
48092 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48093 bool uns_p, bool odd_p)
48095 machine_mode mode = GET_MODE (op1);
48096 machine_mode wmode = GET_MODE (dest);
48097 rtx x;
48098 rtx orig_op1 = op1, orig_op2 = op2;
48100 if (!nonimmediate_operand (op1, mode))
48101 op1 = force_reg (mode, op1);
48102 if (!nonimmediate_operand (op2, mode))
48103 op2 = force_reg (mode, op2);
48105 /* We only play even/odd games with vectors of SImode. */
48106 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48108 /* If we're looking for the odd results, shift those members down to
48109 the even slots. For some cpus this is faster than a PSHUFD. */
48110 if (odd_p)
48112 /* For XOP use vpmacsdqh, but only for smult, as it is only
48113 signed. */
48114 if (TARGET_XOP && mode == V4SImode && !uns_p)
48116 x = force_reg (wmode, CONST0_RTX (wmode));
48117 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48118 return;
48121 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48122 if (!const_vector_equal_evenodd_p (orig_op1))
48123 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48124 x, NULL, 1, OPTAB_DIRECT);
48125 if (!const_vector_equal_evenodd_p (orig_op2))
48126 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48127 x, NULL, 1, OPTAB_DIRECT);
48128 op1 = gen_lowpart (mode, op1);
48129 op2 = gen_lowpart (mode, op2);
48132 if (mode == V16SImode)
48134 if (uns_p)
48135 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48136 else
48137 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48139 else if (mode == V8SImode)
48141 if (uns_p)
48142 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48143 else
48144 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48146 else if (uns_p)
48147 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48148 else if (TARGET_SSE4_1)
48149 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48150 else
48152 rtx s1, s2, t0, t1, t2;
48154 /* The easiest way to implement this without PMULDQ is to go through
48155 the motions as if we are performing a full 64-bit multiply, with
48156 the exception that we need to do less shuffling of the elements. */
48158 /* Compute the sign-extension, aka highparts, of the two operands. */
48159 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48160 op1, pc_rtx, pc_rtx);
48161 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48162 op2, pc_rtx, pc_rtx);
48164 /* Multiply LO(A) * HI(B), and vice-versa. */
48165 t1 = gen_reg_rtx (wmode);
48166 t2 = gen_reg_rtx (wmode);
48167 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48168 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48170 /* Multiply LO(A) * LO(B). */
48171 t0 = gen_reg_rtx (wmode);
48172 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48174 /* Combine and shift the highparts into place. */
48175 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48176 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48177 1, OPTAB_DIRECT);
48179 /* Combine high and low parts. */
48180 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48181 return;
48183 emit_insn (x);
48186 void
48187 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48188 bool uns_p, bool high_p)
48190 machine_mode wmode = GET_MODE (dest);
48191 machine_mode mode = GET_MODE (op1);
48192 rtx t1, t2, t3, t4, mask;
48194 switch (mode)
48196 case E_V4SImode:
48197 t1 = gen_reg_rtx (mode);
48198 t2 = gen_reg_rtx (mode);
48199 if (TARGET_XOP && !uns_p)
48201 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48202 shuffle the elements once so that all elements are in the right
48203 place for immediate use: { A C B D }. */
48204 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48205 const1_rtx, GEN_INT (3)));
48206 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48207 const1_rtx, GEN_INT (3)));
48209 else
48211 /* Put the elements into place for the multiply. */
48212 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48213 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48214 high_p = false;
48216 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48217 break;
48219 case E_V8SImode:
48220 /* Shuffle the elements between the lanes. After this we
48221 have { A B E F | C D G H } for each operand. */
48222 t1 = gen_reg_rtx (V4DImode);
48223 t2 = gen_reg_rtx (V4DImode);
48224 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48225 const0_rtx, const2_rtx,
48226 const1_rtx, GEN_INT (3)));
48227 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48228 const0_rtx, const2_rtx,
48229 const1_rtx, GEN_INT (3)));
48231 /* Shuffle the elements within the lanes. After this we
48232 have { A A B B | C C D D } or { E E F F | G G H H }. */
48233 t3 = gen_reg_rtx (V8SImode);
48234 t4 = gen_reg_rtx (V8SImode);
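/* The pshufd immediate encodes two bits per element, so the mask below
   selects { 2, 2, 3, 3 } within each lane for the high half and
   { 0, 0, 1, 1 } for the low half.  */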
48235 mask = GEN_INT (high_p
48236 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48237 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48238 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48239 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48241 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48242 break;
48244 case E_V8HImode:
48245 case E_V16HImode:
48246 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48247 uns_p, OPTAB_DIRECT);
48248 t2 = expand_binop (mode,
48249 uns_p ? umul_highpart_optab : smul_highpart_optab,
48250 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48251 gcc_assert (t1 && t2);
48253 t3 = gen_reg_rtx (mode);
48254 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48255 emit_move_insn (dest, gen_lowpart (wmode, t3));
48256 break;
48258 case E_V16QImode:
48259 case E_V32QImode:
48260 case E_V32HImode:
48261 case E_V16SImode:
48262 case E_V64QImode:
48263 t1 = gen_reg_rtx (wmode);
48264 t2 = gen_reg_rtx (wmode);
48265 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48266 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48268 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48269 break;
48271 default:
48272 gcc_unreachable ();
48276 void
48277 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48279 rtx res_1, res_2, res_3, res_4;
48281 res_1 = gen_reg_rtx (V4SImode);
48282 res_2 = gen_reg_rtx (V4SImode);
48283 res_3 = gen_reg_rtx (V2DImode);
48284 res_4 = gen_reg_rtx (V2DImode);
48285 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48286 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48288 /* Move the results in element 2 down to element 1; we don't care
48289 what goes in elements 2 and 3. Then we can merge the parts
48290 back together with an interleave.
48292 Note that two other sequences were tried:
48293 (1) Use interleaves at the start instead of psrldq, which allows
48294 us to use a single shufps to merge things back at the end.
48295 (2) Use shufps here to combine the two vectors, then pshufd to
48296 put the elements in the correct order.
48297 In both cases the cost of the reformatting stall was too high
48298 and the overall sequence slower. */
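/* At this point, viewed as V4SI:
     res_3 = { lo(a0*b0), hi(a0*b0), lo(a2*b2), hi(a2*b2) }
     res_4 = { lo(a1*b1), hi(a1*b1), lo(a3*b3), hi(a3*b3) }
   The pshufd (0,2,*,*) shuffles below keep the low halves of the
   products, and the final interleave-low restores the element order
   { a0*b0, a1*b1, a2*b2, a3*b3 }.  */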
48300 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48301 const0_rtx, const2_rtx,
48302 const0_rtx, const0_rtx));
48303 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48304 const0_rtx, const2_rtx,
48305 const0_rtx, const0_rtx));
48306 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48308 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48311 void
48312 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48314 machine_mode mode = GET_MODE (op0);
48315 rtx t1, t2, t3, t4, t5, t6;
48317 if (TARGET_AVX512DQ && mode == V8DImode)
48318 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48319 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48320 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48321 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48322 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48323 else if (TARGET_XOP && mode == V2DImode)
48325 /* op1: A,B,C,D, op2: E,F,G,H */
48326 op1 = gen_lowpart (V4SImode, op1);
48327 op2 = gen_lowpart (V4SImode, op2);
48329 t1 = gen_reg_rtx (V4SImode);
48330 t2 = gen_reg_rtx (V4SImode);
48331 t3 = gen_reg_rtx (V2DImode);
48332 t4 = gen_reg_rtx (V2DImode);
48334 /* t1: B,A,D,C */
48335 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48336 GEN_INT (1),
48337 GEN_INT (0),
48338 GEN_INT (3),
48339 GEN_INT (2)));
48341 /* t2: (B*E),(A*F),(D*G),(C*H) */
48342 emit_insn (gen_mulv4si3 (t2, t1, op2));
48344 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48345 emit_insn (gen_xop_phadddq (t3, t2));
48347 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48348 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48350 /* Multiply lower parts and add all */
48351 t5 = gen_reg_rtx (V2DImode);
48352 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48353 gen_lowpart (V4SImode, op1),
48354 gen_lowpart (V4SImode, op2)));
48355 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48358 else
48360 machine_mode nmode;
48361 rtx (*umul) (rtx, rtx, rtx);
48363 if (mode == V2DImode)
48365 umul = gen_vec_widen_umult_even_v4si;
48366 nmode = V4SImode;
48368 else if (mode == V4DImode)
48370 umul = gen_vec_widen_umult_even_v8si;
48371 nmode = V8SImode;
48373 else if (mode == V8DImode)
48375 umul = gen_vec_widen_umult_even_v16si;
48376 nmode = V16SImode;
48378 else
48379 gcc_unreachable ();
48382 /* Multiply low parts. */
48383 t1 = gen_reg_rtx (mode);
48384 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48386 /* Shift input vectors right 32 bits so we can multiply high parts. */
48387 t6 = GEN_INT (32);
48388 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48389 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48391 /* Multiply high parts by low parts. */
48392 t4 = gen_reg_rtx (mode);
48393 t5 = gen_reg_rtx (mode);
48394 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48395 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48397 /* Combine and shift the highparts back. */
48398 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48399 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48401 /* Combine high and low parts. */
48402 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48405 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48406 gen_rtx_MULT (mode, op1, op2));
48409 /* Return 1 if control transfer instruction INSN
48410 should be encoded with the bnd prefix.
48411 If INSN is NULL then return 1 when control
48412 transfer instructions should be prefixed with
48413 bnd by default for the current function. */
48415 bool
48416 ix86_bnd_prefixed_insn_p (rtx insn)
48418 /* For call insns check special flag. */
48419 if (insn && CALL_P (insn))
48421 rtx call = get_call_rtx_from (insn);
48422 if (call)
48423 return CALL_EXPR_WITH_BOUNDS_P (call);
48426 /* All other insns are prefixed only if function is instrumented. */
48427 return chkp_function_instrumented_p (current_function_decl);
48430 /* Return 1 if control transfer instruction INSN
48431 should be encoded with the notrack prefix. */
48433 static bool
48434 ix86_notrack_prefixed_insn_p (rtx insn)
48436 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
48437 return false;
48439 if (CALL_P (insn))
48441 rtx call = get_call_rtx_from (insn);
48442 gcc_assert (call != NULL_RTX);
48443 rtx addr = XEXP (call, 0);
48445 /* Do not emit 'notrack' if it's not an indirect call. */
48446 if (MEM_P (addr)
48447 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
48448 return false;
48449 else
48450 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
48453 if (JUMP_P (insn) && !flag_cet_switch)
48455 rtx target = JUMP_LABEL (insn);
48456 if (target == NULL_RTX || ANY_RETURN_P (target))
48457 return false;
48459 /* Check whether the jump is a switch table jump. */
48460 rtx_insn *label = as_a<rtx_insn *> (target);
48461 rtx_insn *table = next_insn (label);
48462 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
48463 return false;
48464 else
48465 return true;
48467 return false;
48470 /* Calculate integer abs() using only SSE2 instructions. */
48472 void
48473 ix86_expand_sse2_abs (rtx target, rtx input)
48475 machine_mode mode = GET_MODE (target);
48476 rtx tmp0, tmp1, x;
48478 switch (mode)
48480 /* For 32-bit signed integer X, the best way to calculate the absolute
48481 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
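/* For example, with W == 32 and X == -5: X >> 31 == -1,
   -1 ^ -5 == 4, and 4 - (-1) == 5; for non-negative X the shift
   yields 0 and the expression reduces to X.  */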
48482 case E_V4SImode:
48483 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48484 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48485 NULL, 0, OPTAB_DIRECT);
48486 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48487 NULL, 0, OPTAB_DIRECT);
48488 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48489 target, 0, OPTAB_DIRECT);
48490 break;
48492 /* For 16-bit signed integer X, the best way to calculate the absolute
48493 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48494 case E_V8HImode:
48495 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48497 x = expand_simple_binop (mode, SMAX, tmp0, input,
48498 target, 0, OPTAB_DIRECT);
48499 break;
48501 /* For 8-bit signed integer X, the best way to calculate the absolute
48502 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48503 as SSE2 provides the PMINUB insn. */
48504 case E_V16QImode:
48505 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48507 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48508 target, 0, OPTAB_DIRECT);
48509 break;
48511 default:
48512 gcc_unreachable ();
48515 if (x != target)
48516 emit_move_insn (target, x);
48519 /* Expand an extract from a vector register through pextr insn.
48520 Return true if successful. */
48522 bool
48523 ix86_expand_pextr (rtx *operands)
48525 rtx dst = operands[0];
48526 rtx src = operands[1];
48528 unsigned int size = INTVAL (operands[2]);
48529 unsigned int pos = INTVAL (operands[3]);
48531 if (SUBREG_P (dst))
48533 /* Reject non-lowpart subregs. */
48534 if (SUBREG_BYTE (dst) > 0)
48535 return false;
48536 dst = SUBREG_REG (dst);
48539 if (SUBREG_P (src))
48541 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48542 src = SUBREG_REG (src);
48545 switch (GET_MODE (src))
48547 case E_V16QImode:
48548 case E_V8HImode:
48549 case E_V4SImode:
48550 case E_V2DImode:
48551 case E_V1TImode:
48552 case E_TImode:
48554 machine_mode srcmode, dstmode;
48555 rtx d, pat;
48557 if (!int_mode_for_size (size, 0).exists (&dstmode))
48558 return false;
48560 switch (dstmode)
48562 case E_QImode:
48563 if (!TARGET_SSE4_1)
48564 return false;
48565 srcmode = V16QImode;
48566 break;
48568 case E_HImode:
48569 if (!TARGET_SSE2)
48570 return false;
48571 srcmode = V8HImode;
48572 break;
48574 case E_SImode:
48575 if (!TARGET_SSE4_1)
48576 return false;
48577 srcmode = V4SImode;
48578 break;
48580 case E_DImode:
48581 gcc_assert (TARGET_64BIT);
48582 if (!TARGET_SSE4_1)
48583 return false;
48584 srcmode = V2DImode;
48585 break;
48587 default:
48588 return false;
48591 /* Reject extractions from misaligned positions. */
48592 if (pos & (size-1))
48593 return false;
48595 if (GET_MODE (dst) == dstmode)
48596 d = dst;
48597 else
48598 d = gen_reg_rtx (dstmode);
48600 /* Construct insn pattern. */
48601 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48602 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48604 /* Let the rtl optimizers know about the zero extension performed. */
48605 if (dstmode == QImode || dstmode == HImode)
48607 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48608 d = gen_lowpart (SImode, d);
48611 emit_insn (gen_rtx_SET (d, pat));
48613 if (d != dst)
48614 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48615 return true;
48618 default:
48619 return false;
48623 /* Expand an insert into a vector register through pinsr insn.
48624 Return true if successful. */
48626 bool
48627 ix86_expand_pinsr (rtx *operands)
48629 rtx dst = operands[0];
48630 rtx src = operands[3];
48632 unsigned int size = INTVAL (operands[1]);
48633 unsigned int pos = INTVAL (operands[2]);
48635 if (SUBREG_P (dst))
48637 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48638 dst = SUBREG_REG (dst);
48641 switch (GET_MODE (dst))
48643 case E_V16QImode:
48644 case E_V8HImode:
48645 case E_V4SImode:
48646 case E_V2DImode:
48647 case E_V1TImode:
48648 case E_TImode:
48650 machine_mode srcmode, dstmode;
48651 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48652 rtx d;
48654 if (!int_mode_for_size (size, 0).exists (&srcmode))
48655 return false;
48657 switch (srcmode)
48659 case E_QImode:
48660 if (!TARGET_SSE4_1)
48661 return false;
48662 dstmode = V16QImode;
48663 pinsr = gen_sse4_1_pinsrb;
48664 break;
48666 case E_HImode:
48667 if (!TARGET_SSE2)
48668 return false;
48669 dstmode = V8HImode;
48670 pinsr = gen_sse2_pinsrw;
48671 break;
48673 case E_SImode:
48674 if (!TARGET_SSE4_1)
48675 return false;
48676 dstmode = V4SImode;
48677 pinsr = gen_sse4_1_pinsrd;
48678 break;
48680 case E_DImode:
48681 gcc_assert (TARGET_64BIT);
48682 if (!TARGET_SSE4_1)
48683 return false;
48684 dstmode = V2DImode;
48685 pinsr = gen_sse4_1_pinsrq;
48686 break;
48688 default:
48689 return false;
48692 /* Reject insertions to misaligned positions. */
48693 if (pos & (size-1))
48694 return false;
48696 if (SUBREG_P (src))
48698 unsigned int srcpos = SUBREG_BYTE (src);
48700 if (srcpos > 0)
48702 rtx extr_ops[4];
48704 extr_ops[0] = gen_reg_rtx (srcmode);
48705 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48706 extr_ops[2] = GEN_INT (size);
48707 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48709 if (!ix86_expand_pextr (extr_ops))
48710 return false;
48712 src = extr_ops[0];
48714 else
48715 src = gen_lowpart (srcmode, SUBREG_REG (src));
48718 if (GET_MODE (dst) == dstmode)
48719 d = dst;
48720 else
48721 d = gen_reg_rtx (dstmode);
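/* The pinsr expanders take a vec_merge-style mask operand, so the
   element to replace is passed as a one-hot mask, 1 << (pos / size).  */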
48723 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48724 gen_lowpart (srcmode, src),
48725 GEN_INT (1 << (pos / size))));
48726 if (d != dst)
48727 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48728 return true;
48731 default:
48732 return false;
48736 /* This function returns the calling-ABI-specific va_list type node.
48737 It returns the FNDECL-specific va_list type. */
48739 static tree
48740 ix86_fn_abi_va_list (tree fndecl)
48742 if (!TARGET_64BIT)
48743 return va_list_type_node;
48744 gcc_assert (fndecl != NULL_TREE);
48746 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48747 return ms_va_list_type_node;
48748 else
48749 return sysv_va_list_type_node;
48752 /* Returns the canonical va_list type specified by TYPE. If there
48753 is no valid TYPE provided, it returns NULL_TREE. */
48755 static tree
48756 ix86_canonical_va_list_type (tree type)
48758 if (TARGET_64BIT)
48760 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48761 return ms_va_list_type_node;
48763 if ((TREE_CODE (type) == ARRAY_TYPE
48764 && integer_zerop (array_type_nelts (type)))
48765 || POINTER_TYPE_P (type))
48767 tree elem_type = TREE_TYPE (type);
48768 if (TREE_CODE (elem_type) == RECORD_TYPE
48769 && lookup_attribute ("sysv_abi va_list",
48770 TYPE_ATTRIBUTES (elem_type)))
48771 return sysv_va_list_type_node;
48774 return NULL_TREE;
48777 return std_canonical_va_list_type (type);
48780 /* Iterate through the target-specific builtin types for va_list.
48781 IDX denotes the iterator, *PTREE is set to the result type of
48782 the va_list builtin, and *PNAME to its internal type.
48783 Returns zero if there is no element for this index, otherwise
48784 IDX should be increased upon the next call.
48785 Note, do not iterate a base builtin's name like __builtin_va_list.
48786 Used from c_common_nodes_and_builtins. */
48788 static int
48789 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48791 if (TARGET_64BIT)
48793 switch (idx)
48795 default:
48796 break;
48798 case 0:
48799 *ptree = ms_va_list_type_node;
48800 *pname = "__builtin_ms_va_list";
48801 return 1;
48803 case 1:
48804 *ptree = sysv_va_list_type_node;
48805 *pname = "__builtin_sysv_va_list";
48806 return 1;
48810 return 0;
48813 #undef TARGET_SCHED_DISPATCH
48814 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
48815 #undef TARGET_SCHED_DISPATCH_DO
48816 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
48817 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48818 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48819 #undef TARGET_SCHED_REORDER
48820 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
48821 #undef TARGET_SCHED_ADJUST_PRIORITY
48822 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48823 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48824 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48825 ix86_dependencies_evaluation_hook
48828 /* Implementation of the reassociation_width target hook used by
48829 the reassoc phase to identify the parallelism level in a reassociated
48830 tree. The statement's tree_code is passed in OP. The arguments'
48831 type is passed in MODE. */
48833 static int
48834 ix86_reassociation_width (unsigned int op, machine_mode mode)
48836 int width = 1;
48837 /* Vector part. */
48838 if (VECTOR_MODE_P (mode))
48840 int div = 1;
48841 if (INTEGRAL_MODE_P (mode))
48842 width = ix86_cost->reassoc_vec_int;
48843 else if (FLOAT_MODE_P (mode))
48844 width = ix86_cost->reassoc_vec_fp;
48846 if (width == 1)
48847 return 1;
48849 /* Integer vector instructions execute in the FP unit and
48850 can execute 3 additions and one multiplication per cycle. */
48851 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
48852 && op != PLUS && op != MINUS)
48853 return 1;
48855 /* Account for targets that split wide vectors into multiple parts. */
48856 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
48857 div = GET_MODE_BITSIZE (mode) / 128;
48858 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
48859 div = GET_MODE_BITSIZE (mode) / 64;
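/* Scale the width down by the number of natural-width pieces the
   vector is split into, rounding up; e.g. a cost-table width of 2
   for a 256-bit vector with TARGET_AVX128_OPTIMAL (div == 2) gives
   a width of 1.  */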
48860 width = (width + div - 1) / div;
48862 /* Scalar part. */
48863 else if (INTEGRAL_MODE_P (mode))
48864 width = ix86_cost->reassoc_int;
48865 else if (FLOAT_MODE_P (mode))
48866 width = ix86_cost->reassoc_fp;
48868 /* Avoid using too many registers in 32-bit mode. */
48869 if (!TARGET_64BIT && width > 2)
48870 width = 2;
48871 return width;
48874 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
48875 place emms and femms instructions. */
48877 static machine_mode
48878 ix86_preferred_simd_mode (scalar_mode mode)
48880 if (!TARGET_SSE)
48881 return word_mode;
48883 switch (mode)
48885 case E_QImode:
48886 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48887 return V64QImode;
48888 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48889 return V32QImode;
48890 else
48891 return V16QImode;
48893 case E_HImode:
48894 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48895 return V32HImode;
48896 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48897 return V16HImode;
48898 else
48899 return V8HImode;
48901 case E_SImode:
48902 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48903 return V16SImode;
48904 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48905 return V8SImode;
48906 else
48907 return V4SImode;
48909 case E_DImode:
48910 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48911 return V8DImode;
48912 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48913 return V4DImode;
48914 else
48915 return V2DImode;
48917 case E_SFmode:
48918 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48919 return V16SFmode;
48920 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48921 return V8SFmode;
48922 else
48923 return V4SFmode;
48925 case E_DFmode:
48926 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48927 return V8DFmode;
48928 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48929 return V4DFmode;
48930 else if (TARGET_SSE2)
48931 return V2DFmode;
48932 /* FALLTHRU */
48934 default:
48935 return word_mode;
48939 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
48940 vectors. If AVX512F is enabled then try vectorizing with 512bit,
48941 256bit and 128bit vectors. */
48943 static unsigned int
48944 ix86_autovectorize_vector_sizes (void)
48946 unsigned int bytesizes = 0;
48948 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48949 bytesizes |= (64 | 32 | 16);
48950 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48951 bytesizes |= (32 | 16);
48953 return bytesizes;
48956 /* Implementation of targetm.vectorize.get_mask_mode. */
48958 static opt_machine_mode
48959 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
48961 unsigned elem_size = vector_size / nunits;
48963 /* Scalar mask case. */
48964 if ((TARGET_AVX512F && vector_size == 64)
48965 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
48967 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
48968 return smallest_int_mode_for_size (nunits);
48971 scalar_int_mode elem_mode
48972 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
48974 gcc_assert (elem_size * nunits == vector_size);
48976 return mode_for_vector (elem_mode, nunits);
48981 /* Return the class of registers which could be used for a pseudo of MODE
48982 and of class RCLASS for spilling instead of memory. Return NO_REGS
48983 if it is not possible or not profitable. */
48985 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
48987 static reg_class_t
48988 ix86_spill_class (reg_class_t rclass, machine_mode mode)
48990 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
48991 && TARGET_SSE2
48992 && TARGET_INTER_UNIT_MOVES_TO_VEC
48993 && TARGET_INTER_UNIT_MOVES_FROM_VEC
48994 && (mode == SImode || (TARGET_64BIT && mode == DImode))
48995 && INTEGER_CLASS_P (rclass))
48996 return ALL_SSE_REGS;
48997 return NO_REGS;
49000 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
49001 but returns a lower bound. */
49003 static unsigned int
49004 ix86_max_noce_ifcvt_seq_cost (edge e)
49006 bool predictable_p = predictable_edge_p (e);
49008 enum compiler_param param
49009 = (predictable_p
49010 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
49011 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
49013 /* If we have a parameter set, use that, otherwise take a guess using
49014 BRANCH_COST. */
49015 if (global_options_set.x_param_values[param])
49016 return PARAM_VALUE (param);
49017 else
49018 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
49021 /* Return true if SEQ is a good candidate as a replacement for the
49022 if-convertible sequence described in IF_INFO. */
49024 static bool
49025 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
49027 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
49029 int cmov_cnt = 0;
49030 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
49031 Maybe we should allow even more conditional moves as long as they
49032 are used far enough not to stall the CPU, or also consider
49033 IF_INFO->TEST_BB succ edge probabilities. */
49034 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
49036 rtx set = single_set (insn);
49037 if (!set)
49038 continue;
49039 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
49040 continue;
49041 rtx src = SET_SRC (set);
49042 machine_mode mode = GET_MODE (src);
49043 if (GET_MODE_CLASS (mode) != MODE_INT
49044 && GET_MODE_CLASS (mode) != MODE_FLOAT)
49045 continue;
49046 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
49047 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
49048 continue;
49049 /* insn is CMOV or FCMOV. */
49050 if (++cmov_cnt > 1)
49051 return false;
49054 return default_noce_conversion_profitable_p (seq, if_info);
49057 /* Implement targetm.vectorize.init_cost. */
49059 static void *
49060 ix86_init_cost (struct loop *)
49062 unsigned *cost = XNEWVEC (unsigned, 3);
49063 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49064 return cost;
49067 /* Implement targetm.vectorize.add_stmt_cost. */
49069 static unsigned
49070 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49071 struct _stmt_vec_info *stmt_info, int misalign,
49072 enum vect_cost_model_location where)
49074 unsigned *cost = (unsigned *) data;
49075 unsigned retval = 0;
49077 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49078 int stmt_cost = -1;
49080 if ((kind == vector_stmt || kind == scalar_stmt)
49081 && stmt_info
49082 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
49084 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
49085 bool fp = false;
49086 machine_mode mode = TImode;
49088 if (vectype != NULL)
49090 fp = FLOAT_TYPE_P (vectype);
49091 mode = TYPE_MODE (vectype);
49093 /*machine_mode inner_mode = mode;
49094 if (VECTOR_MODE_P (mode))
49095 inner_mode = GET_MODE_INNER (mode);*/
49097 switch (subcode)
49099 case PLUS_EXPR:
49100 case POINTER_PLUS_EXPR:
49101 case MINUS_EXPR:
49102 if (kind == scalar_stmt)
49104 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49105 stmt_cost = ix86_cost->addss;
49106 else if (X87_FLOAT_MODE_P (mode))
49107 stmt_cost = ix86_cost->fadd;
49108 else
49109 stmt_cost = ix86_cost->add;
49111 else
49112 stmt_cost = ix86_vec_cost (mode,
49113 fp ? ix86_cost->addss
49114 : ix86_cost->sse_op,
49115 true);
49116 break;
49118 case MULT_EXPR:
49119 case WIDEN_MULT_EXPR:
49120 case MULT_HIGHPART_EXPR:
49121 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
49122 break;
49123 case FMA_EXPR:
49124 stmt_cost = ix86_vec_cost (mode,
49125 mode == SFmode ? ix86_cost->fmass
49126 : ix86_cost->fmasd,
49127 true);
49128 break;
49129 case NEGATE_EXPR:
49130 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49131 stmt_cost = ix86_cost->sse_op;
49132 else if (X87_FLOAT_MODE_P (mode))
49133 stmt_cost = ix86_cost->fchs;
49134 else if (VECTOR_MODE_P (mode))
49135 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49136 else
49137 stmt_cost = ix86_cost->add;
49138 break;
49139 case TRUNC_DIV_EXPR:
49140 case CEIL_DIV_EXPR:
49141 case FLOOR_DIV_EXPR:
49142 case ROUND_DIV_EXPR:
49143 case TRUNC_MOD_EXPR:
49144 case CEIL_MOD_EXPR:
49145 case FLOOR_MOD_EXPR:
49146 case RDIV_EXPR:
49147 case ROUND_MOD_EXPR:
49148 case EXACT_DIV_EXPR:
49149 stmt_cost = ix86_division_cost (ix86_cost, mode);
49150 break;
49152 case RSHIFT_EXPR:
49153 case LSHIFT_EXPR:
49154 case LROTATE_EXPR:
49155 case RROTATE_EXPR:
49157 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
49158 stmt_cost = ix86_shift_rotate_cost
49159 (ix86_cost, mode,
49160 TREE_CODE (op2) == INTEGER_CST,
49161 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
49162 true, false, false, NULL, NULL);
49164 break;
49165 case NOP_EXPR:
49166 stmt_cost = 0;
49167 break;
49169 case BIT_IOR_EXPR:
49170 case ABS_EXPR:
49171 case MIN_EXPR:
49172 case MAX_EXPR:
49173 case BIT_XOR_EXPR:
49174 case BIT_AND_EXPR:
49175 case BIT_NOT_EXPR:
49176 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49177 stmt_cost = ix86_cost->sse_op;
49178 else if (VECTOR_MODE_P (mode))
49179 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49180 else
49181 stmt_cost = ix86_cost->add;
49182 break;
49183 default:
49184 break;
49187 if (stmt_cost == -1)
49188 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49190 /* Penalize DFmode vector operations for Bonnell. */
49191 if (TARGET_BONNELL && kind == vector_stmt
49192 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49193 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49195 /* Statements in an inner loop relative to the loop being
49196 vectorized are weighted more heavily. The value here is
49197 arbitrary and could potentially be improved with analysis. */
49198 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49199 count *= 50; /* FIXME. */
49201 retval = (unsigned) (count * stmt_cost);
49203 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
49204 for Silvermont, as it has an out-of-order integer pipeline and can execute
49205 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
49206 if ((TARGET_SILVERMONT || TARGET_INTEL)
49207 && stmt_info && stmt_info->stmt)
49209 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49210 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49211 retval = (retval * 17) / 10;
49214 cost[where] += retval;
49216 return retval;
49219 /* Implement targetm.vectorize.finish_cost. */
49221 static void
49222 ix86_finish_cost (void *data, unsigned *prologue_cost,
49223 unsigned *body_cost, unsigned *epilogue_cost)
49225 unsigned *cost = (unsigned *) data;
49226 *prologue_cost = cost[vect_prologue];
49227 *body_cost = cost[vect_body];
49228 *epilogue_cost = cost[vect_epilogue];
49231 /* Implement targetm.vectorize.destroy_cost_data. */
49233 static void
49234 ix86_destroy_cost_data (void *data)
49236 free (data);
49239 /* Validate target specific memory model bits in VAL. */
49241 static unsigned HOST_WIDE_INT
49242 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49244 enum memmodel model = memmodel_from_int (val);
49245 bool strong;
49247 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49248 |MEMMODEL_MASK)
49249 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49251 warning (OPT_Winvalid_memory_model,
49252 "unknown architecture specific memory model");
49253 return MEMMODEL_SEQ_CST;
49255 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49256 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49258 warning (OPT_Winvalid_memory_model,
49259 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49260 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49262 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49264 warning (OPT_Winvalid_memory_model,
49265 "HLE_RELEASE not used with RELEASE or stronger memory model");
49266 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49268 return val;
49271 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49272 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
49273 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49274 or the number of vecsize_mangle variants that should be emitted. */
49276 static int
49277 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49278 struct cgraph_simd_clone *clonei,
49279 tree base_type, int num)
49281 int ret = 1;
49283 if (clonei->simdlen
49284 && (clonei->simdlen < 2
49285 || clonei->simdlen > 1024
49286 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
49288 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49289 "unsupported simdlen %d", clonei->simdlen);
49290 return 0;
49293 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
49294 if (TREE_CODE (ret_type) != VOID_TYPE)
49295 switch (TYPE_MODE (ret_type))
49297 case E_QImode:
49298 case E_HImode:
49299 case E_SImode:
49300 case E_DImode:
49301 case E_SFmode:
49302 case E_DFmode:
49303 /* case E_SCmode: */
49304 /* case E_DCmode: */
49305 break;
49306 default:
49307 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49308 "unsupported return type %qT for simd\n", ret_type);
49309 return 0;
49312 tree t;
49313 int i;
49315 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
49316 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
49317 switch (TYPE_MODE (TREE_TYPE (t)))
49319 case E_QImode:
49320 case E_HImode:
49321 case E_SImode:
49322 case E_DImode:
49323 case E_SFmode:
49324 case E_DFmode:
49325 /* case E_SCmode: */
49326 /* case E_DCmode: */
49327 break;
49328 default:
49329 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49330 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
49331 return 0;
49334 if (!TREE_PUBLIC (node->decl))
49336 /* If the function isn't exported, we can pick up just one ISA
49337 for the clones. */
49338 if (TARGET_AVX512F)
49339 clonei->vecsize_mangle = 'e';
49340 else if (TARGET_AVX2)
49341 clonei->vecsize_mangle = 'd';
49342 else if (TARGET_AVX)
49343 clonei->vecsize_mangle = 'c';
49344 else
49345 clonei->vecsize_mangle = 'b';
49346 ret = 1;
49348 else
49350 clonei->vecsize_mangle = "bcde"[num];
49351 ret = 4;
49353 clonei->mask_mode = VOIDmode;
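/* The mangle letters correspond to the minimum ISA selected in
   ix86_simd_clone_adjust below: 'b' is SSE2, 'c' AVX, 'd' AVX2 and
   'e' AVX512F.  */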
49354 switch (clonei->vecsize_mangle)
49356 case 'b':
49357 clonei->vecsize_int = 128;
49358 clonei->vecsize_float = 128;
49359 break;
49360 case 'c':
49361 clonei->vecsize_int = 128;
49362 clonei->vecsize_float = 256;
49363 break;
49364 case 'd':
49365 clonei->vecsize_int = 256;
49366 clonei->vecsize_float = 256;
49367 break;
49368 case 'e':
49369 clonei->vecsize_int = 512;
49370 clonei->vecsize_float = 512;
49371 if (TYPE_MODE (base_type) == QImode)
49372 clonei->mask_mode = DImode;
49373 else
49374 clonei->mask_mode = SImode;
49375 break;
49377 if (clonei->simdlen == 0)
49379 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
49380 clonei->simdlen = clonei->vecsize_int;
49381 else
49382 clonei->simdlen = clonei->vecsize_float;
49383 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
49385 else if (clonei->simdlen > 16)
49387 /* For compatibility with ICC, use the same upper bounds
49388 for simdlen. In particular, for CTYPE below, use the return type,
49389 unless the function returns void, in which case use the characteristic
49390 type. If it is possible for given SIMDLEN to pass CTYPE value
49391 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
49392 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
49393 emit corresponding clone. */
49394 tree ctype = ret_type;
49395 if (TREE_CODE (ret_type) == VOID_TYPE)
49396 ctype = base_type;
49397 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
49398 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
49399 cnt /= clonei->vecsize_int;
49400 else
49401 cnt /= clonei->vecsize_float;
49402 if (cnt > (TARGET_64BIT ? 16 : 8))
49404 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49405 "unsupported simdlen %d", clonei->simdlen);
49406 return 0;
49409 return ret;
49412 /* Add target attribute to SIMD clone NODE if needed. */
49414 static void
49415 ix86_simd_clone_adjust (struct cgraph_node *node)
49417 const char *str = NULL;
49418 gcc_assert (node->decl == cfun->decl);
49419 switch (node->simdclone->vecsize_mangle)
49421 case 'b':
49422 if (!TARGET_SSE2)
49423 str = "sse2";
49424 break;
49425 case 'c':
49426 if (!TARGET_AVX)
49427 str = "avx";
49428 break;
49429 case 'd':
49430 if (!TARGET_AVX2)
49431 str = "avx2";
49432 break;
49433 case 'e':
49434 if (!TARGET_AVX512F)
49435 str = "avx512f";
49436 break;
49437 default:
49438 gcc_unreachable ();
49440 if (str == NULL)
49441 return;
49442 push_cfun (NULL);
49443 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
49444 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
49445 gcc_assert (ok);
49446 pop_cfun ();
49447 ix86_reset_previous_fndecl ();
49448 ix86_set_current_function (node->decl);
49451 /* If SIMD clone NODE can't be used in a vectorized loop
49452 in the current function, return -1, otherwise return the badness of using it
49453 (0 if it is most desirable from the vecsize_mangle point of view, 1
49454 slightly less desirable, etc.). */
49456 static int
49457 ix86_simd_clone_usable (struct cgraph_node *node)
49459 switch (node->simdclone->vecsize_mangle)
49461 case 'b':
49462 if (!TARGET_SSE2)
49463 return -1;
49464 if (!TARGET_AVX)
49465 return 0;
49466 return TARGET_AVX2 ? 2 : 1;
49467 case 'c':
49468 if (!TARGET_AVX)
49469 return -1;
49470 return TARGET_AVX2 ? 1 : 0;
49471 case 'd':
49472 if (!TARGET_AVX2)
49473 return -1;
49474 return 0;
49475 case 'e':
49476 if (!TARGET_AVX512F)
49477 return -1;
49478 return 0;
49479 default:
49480 gcc_unreachable ();
49484 /* This function adjusts the unroll factor based on
49485 the hardware capabilities. For example, bdver3 has
49486 a loop buffer which makes unrolling of smaller
49487 loops less important. This function decides the
49488 unroll factor using the number of memory references
49489 (value 32 is used) as a heuristic. */
49491 static unsigned
49492 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
49494 basic_block *bbs;
49495 rtx_insn *insn;
49496 unsigned i;
49497 unsigned mem_count = 0;
49499 if (!TARGET_ADJUST_UNROLL)
49500 return nunroll;
49502 /* Count the number of memory references within the loop body.
49503 This value determines the unrolling factor for bdver3 and bdver4
49504 architectures. */
49505 subrtx_iterator::array_type array;
49506 bbs = get_loop_body (loop);
49507 for (i = 0; i < loop->num_nodes; i++)
49508 FOR_BB_INSNS (bbs[i], insn)
49509 if (NONDEBUG_INSN_P (insn))
49510 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
49511 if (const_rtx x = *iter)
49512 if (MEM_P (x))
49514 machine_mode mode = GET_MODE (x);
49515 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
49516 if (n_words > 4)
49517 mem_count += 2;
49518 else
49519 mem_count += 1;
49521 free (bbs);
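/* The unroll factor is chosen so that the unrolled body has roughly 32
   memory references; e.g. a loop body with 8 counted references yields
   a factor of 32 / 8 == 4.  */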
49523 if (mem_count && mem_count <= 32)
49524 return 32 / mem_count;
49526 return nunroll;
49530 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
49532 static bool
49533 ix86_float_exceptions_rounding_supported_p (void)
49535 /* For x87 floating point with standard excess precision handling,
49536 there is no adddf3 pattern (since x87 floating point only has
49537 XFmode operations) so the default hook implementation gets this
49538 wrong. */
49539 return TARGET_80387 || TARGET_SSE_MATH;
49542 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
49544 static void
49545 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
49547 if (!TARGET_80387 && !TARGET_SSE_MATH)
49548 return;
49549 tree exceptions_var = create_tmp_var_raw (integer_type_node);
49550 if (TARGET_80387)
49552 tree fenv_index_type = build_index_type (size_int (6));
49553 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
49554 tree fenv_var = create_tmp_var_raw (fenv_type);
49555 TREE_ADDRESSABLE (fenv_var) = 1;
49556 tree fenv_ptr = build_pointer_type (fenv_type);
49557 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
49558 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
49559 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
49560 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
49561 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
49562 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
49563 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
49564 tree hold_fnclex = build_call_expr (fnclex, 0);
49565 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
49566 NULL_TREE, NULL_TREE);
49567 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
49568 hold_fnclex);
49569 *clear = build_call_expr (fnclex, 0);
49570 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
49571 tree fnstsw_call = build_call_expr (fnstsw, 0);
49572 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
49573 sw_var, fnstsw_call);
49574 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
49575 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
49576 exceptions_var, exceptions_x87);
49577 *update = build2 (COMPOUND_EXPR, integer_type_node,
49578 sw_mod, update_mod);
49579 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
49580 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
49582 if (TARGET_SSE_MATH)
49584 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
49585 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
49586 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
49587 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
49588 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
49589 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
49590 mxcsr_orig_var, stmxcsr_hold_call);
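/* In MXCSR, bits 7..12 are the exception mask bits and bits 0..5 are
   the exception flag bits; OR-ing in 0x1f80 masks all exceptions and
   AND-ing with 0xffffffc0 clears any pending flags in the hold value
   loaded below.  */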
49591 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
49592 mxcsr_orig_var,
49593 build_int_cst (unsigned_type_node, 0x1f80));
49594 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
49595 build_int_cst (unsigned_type_node, 0xffffffc0));
49596 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
49597 mxcsr_mod_var, hold_mod_val);
49598 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49599 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
49600 hold_assign_orig, hold_assign_mod);
49601 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
49602 ldmxcsr_hold_call);
49603 if (*hold)
49604 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
49605 else
49606 *hold = hold_all;
49607 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49608 if (*clear)
49609 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
49610 ldmxcsr_clear_call);
49611 else
49612 *clear = ldmxcsr_clear_call;
49613 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
49614 tree exceptions_sse = fold_convert (integer_type_node,
49615 stxmcsr_update_call);
49616 if (*update)
49618 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
49619 exceptions_var, exceptions_sse);
49620 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
49621 exceptions_var, exceptions_mod);
49622 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
49623 exceptions_assign);
49625 else
49626 *update = build2 (MODIFY_EXPR, integer_type_node,
49627 exceptions_var, exceptions_sse);
49628 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
49629 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49630 ldmxcsr_update_call);
49632 tree atomic_feraiseexcept
49633 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
49634 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
49635 1, exceptions_var);
49636 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49637 atomic_feraiseexcept_call);
49640 /* Return the mode to be used for bounds, or VOIDmode
49641 if bounds are not supported. */
49643 static machine_mode
49644 ix86_mpx_bound_mode ()
49646 /* Do not support pointer checker if MPX
49647 is not enabled. */
49648 if (!TARGET_MPX)
49650 if (flag_check_pointer_bounds)
49651 warning (0, "Pointer Checker requires MPX support on this target."
49652 " Use -mmpx options to enable MPX.");
49653 return VOIDmode;
49656 return BNDmode;
49659 /* Return constant used to statically initialize constant bounds.
49661 This function is used to create special bound values. For now
49662 only INIT bounds and NONE bounds are expected. More special
49663 values may be added later. */
49665 static tree
49666 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
49668 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
49669 : build_zero_cst (pointer_sized_int_node);
49670 tree high = ub ? build_zero_cst (pointer_sized_int_node)
49671 : build_minus_one_cst (pointer_sized_int_node);
49673 /* This function is supposed to be used to create INIT and
49674 NONE bounds only. */
49675 gcc_assert ((lb == 0 && ub == -1)
49676 || (lb == -1 && ub == 0));
49678 return build_complex (NULL, low, high);
49681 /* Generate a list of statements STMTS to initialize pointer bounds
49682 variable VAR with bounds LB and UB. Return the number of generated
49683 statements. */
49685 static int
49686 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
49688 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
49689 tree lhs, modify, var_p;
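/* The upper bound is stored in one's complement form (hence the
   BIT_NOT below), consistent with ix86_make_bounds_constant above.  */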
49691 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
49692 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
49694 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
49695 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
49696 append_to_statement_list (modify, stmts);
49698 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
49699 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
49700 TYPE_SIZE_UNIT (pointer_sized_int_node)));
49701 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
49702 append_to_statement_list (modify, stmts);
49704 return 2;
49707 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
49708 /* For i386, a common symbol is local only for non-PIE binaries. For
49709 x86-64, a common symbol is local only for non-PIE binaries or if the
49710 linker supports copy relocs in PIE binaries. */
49712 static bool
49713 ix86_binds_local_p (const_tree exp)
49715 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
49716 (!flag_pic
49717 || (TARGET_64BIT
49718 && HAVE_LD_PIE_COPYRELOC != 0)));
49720 #endif
49722 /* If MEM is in the form of [base+offset], extract the two parts
49723 of the address into BASE and OFFSET, otherwise return false. */
49725 static bool
49726 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
49728 rtx addr;
49730 gcc_assert (MEM_P (mem));
49732 addr = XEXP (mem, 0);
49734 if (GET_CODE (addr) == CONST)
49735 addr = XEXP (addr, 0);
49737 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
49739 *base = addr;
49740 *offset = const0_rtx;
49741 return true;
49744 if (GET_CODE (addr) == PLUS
49745 && (REG_P (XEXP (addr, 0))
49746 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49747 && CONST_INT_P (XEXP (addr, 1)))
49749 *base = XEXP (addr, 0);
49750 *offset = XEXP (addr, 1);
49751 return true;
49754 return false;
49757 /* Given OPERANDS of consecutive load/store, check if we can merge
49758 them into move multiple. LOAD is true if they are load instructions.
49759 MODE is the mode of memory operands. */
49761 bool
49762 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
49763 machine_mode mode)
49765 HOST_WIDE_INT offval_1, offval_2, msize;
49766 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
49768 if (load)
49770 mem_1 = operands[1];
49771 mem_2 = operands[3];
49772 reg_1 = operands[0];
49773 reg_2 = operands[2];
49775 else
49777 mem_1 = operands[0];
49778 mem_2 = operands[2];
49779 reg_1 = operands[1];
49780 reg_2 = operands[3];
49783 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
49785 if (REGNO (reg_1) != REGNO (reg_2))
49786 return false;
49788 /* Check if the addresses are in the form of [base+offset]. */
49789 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
49790 return false;
49791 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
49792 return false;
49794 /* Check if the bases are the same. */
49795 if (!rtx_equal_p (base_1, base_2))
49796 return false;
49798 offval_1 = INTVAL (offset_1);
49799 offval_2 = INTVAL (offset_2);
49800 msize = GET_MODE_SIZE (mode);
49801 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
49802 if (offval_1 + msize != offval_2)
49803 return false;
49805 return true;
49808 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
49810 static bool
49811 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
49812 optimization_type opt_type)
49814 switch (op)
49816 case asin_optab:
49817 case acos_optab:
49818 case log1p_optab:
49819 case exp_optab:
49820 case exp10_optab:
49821 case exp2_optab:
49822 case expm1_optab:
49823 case ldexp_optab:
49824 case scalb_optab:
49825 case round_optab:
49826 return opt_type == OPTIMIZE_FOR_SPEED;
49828 case rint_optab:
49829 if (SSE_FLOAT_MODE_P (mode1)
49830 && TARGET_SSE_MATH
49831 && !flag_trapping_math
49832 && !TARGET_SSE4_1)
49833 return opt_type == OPTIMIZE_FOR_SPEED;
49834 return true;
49836 case floor_optab:
49837 case ceil_optab:
49838 case btrunc_optab:
49839 if (SSE_FLOAT_MODE_P (mode1)
49840 && TARGET_SSE_MATH
49841 && !flag_trapping_math
49842 && TARGET_SSE4_1)
49843 return true;
49844 return opt_type == OPTIMIZE_FOR_SPEED;
49846 case rsqrt_optab:
49847 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
49849 default:
49850 return true;
49854 /* Address space support.
49856 This is not "far pointers" in the 16-bit sense, but an easy way
49857 to use %fs and %gs segment prefixes. Therefore:
49859 (a) All address spaces have the same modes,
49860 (b) All address spaces have the same address forms,
49861 (c) While %fs and %gs are technically subsets of the generic
49862 address space, they are probably not subsets of each other.
49863 (d) Since we have no access to the segment base register values
49864 without resorting to a system call, we cannot convert a
49865 non-default address space to a default address space.
49866 Therefore we do not claim %fs or %gs are subsets of generic.
49868 Therefore we can (mostly) use the default hooks. */
49870 /* All use of segmentation is assumed to make address 0 valid. */
49872 static bool
49873 ix86_addr_space_zero_address_valid (addr_space_t as)
49875 return as != ADDR_SPACE_GENERIC;
49878 static void
49879 ix86_init_libfuncs (void)
49881 if (TARGET_64BIT)
49883 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
49884 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
49886 else
49888 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
49889 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
49892 #if TARGET_MACHO
49893 darwin_rename_builtins ();
49894 #endif
49897 /* Generate a call to __divmoddi4 or another divmod libfunc. */
49899 static void
49900 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
49901 rtx op0, rtx op1,
49902 rtx *quot_p, rtx *rem_p)
49904 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
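/* The divmod libfuncs return the quotient and store the remainder
   through a pointer passed as the last argument; REM provides the
   stack slot that receives it.  */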
49906 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
49907 mode,
49908 op0, GET_MODE (op0),
49909 op1, GET_MODE (op1),
49910 XEXP (rem, 0), Pmode);
49911 *quot_p = quot;
49912 *rem_p = rem;
49915 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
49916 FPU, assume that the fpcw is set to extended precision; when using
49917 only SSE, rounding is correct; when using both SSE and the FPU,
49918 the rounding precision is indeterminate, since either may be chosen
49919 apparently at random. */
49921 static enum flt_eval_method
49922 ix86_excess_precision (enum excess_precision_type type)
49924 switch (type)
49926 case EXCESS_PRECISION_TYPE_FAST:
49927 /* The fastest type to promote to will always be the native type,
49928 whether that occurs with implicit excess precision or
49929 otherwise. */
49930 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49931 case EXCESS_PRECISION_TYPE_STANDARD:
49932 case EXCESS_PRECISION_TYPE_IMPLICIT:
49933 /* Otherwise, the excess precision we want when we are
49934 in a standards compliant mode, and the implicit precision we
49935 provide would be identical were it not for the unpredictable
49936 cases. */
49937 if (!TARGET_80387)
49938 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49939 else if (!TARGET_MIX_SSE_I387)
49941 if (!TARGET_SSE_MATH)
49942 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
49943 else if (TARGET_SSE2)
49944 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49947 /* If we are in standards compliant mode, but we know we will
49948 calculate in unpredictable precision, return
49949 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
49950 excess precision if the target can't guarantee it will honor
49951 it. */
49952 return (type == EXCESS_PRECISION_TYPE_STANDARD
49953 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
49954 : FLT_EVAL_METHOD_UNPREDICTABLE);
49955 default:
49956 gcc_unreachable ();
49959 return FLT_EVAL_METHOD_UNPREDICTABLE;
49962 /* Target-specific selftests. */
49964 #if CHECKING_P
49966 namespace selftest {
49968 /* Verify that hard regs are dumped as expected (in compact mode). */
49970 static void
49971 ix86_test_dumping_hard_regs ()
49973 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
49974 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
49977 /* Test dumping an insn with repeated references to the same SCRATCH,
49978 to verify the rtx_reuse code. */
49980 static void
49981 ix86_test_dumping_memory_blockage ()
49983 set_new_first_and_last_insn (NULL, NULL);
49985 rtx pat = gen_memory_blockage ();
49986 rtx_reuse_manager r;
49987 r.preprocess (pat);
49989 /* Verify that the repeated references to the SCRATCH show use of
49990 reuse IDs. The first should be prefixed with a reuse ID,
49991 and the second should be dumped as a "reuse_rtx" of that ID.
49992 The expected string assumes Pmode == DImode. */
49993 if (Pmode == DImode)
49994 ASSERT_RTL_DUMP_EQ_WITH_REUSE
49995 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
49996 " (unspec:BLK [\n"
49997 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
49998 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
50001 /* Verify loading an RTL dump; specifically a dump of copying
50002 a param on x86_64 from a hard reg into the frame.
50003 This test is target-specific since the dump contains target-specific
50004 hard reg names. */
50006 static void
50007 ix86_test_loading_dump_fragment_1 ()
50009 rtl_dump_test t (SELFTEST_LOCATION,
50010 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
50012 rtx_insn *insn = get_insn_by_uid (1);
50014 /* The block structure and indentation here are purely for
50015 readability; they mirror the structure of the rtx. */
50016 tree mem_expr;
50018 rtx pat = PATTERN (insn);
50019 ASSERT_EQ (SET, GET_CODE (pat));
50021 rtx dest = SET_DEST (pat);
50022 ASSERT_EQ (MEM, GET_CODE (dest));
50023 /* Verify the "/c" was parsed. */
50024 ASSERT_TRUE (RTX_FLAG (dest, call));
50025 ASSERT_EQ (SImode, GET_MODE (dest));
50027 rtx addr = XEXP (dest, 0);
50028 ASSERT_EQ (PLUS, GET_CODE (addr));
50029 ASSERT_EQ (DImode, GET_MODE (addr));
50031 rtx lhs = XEXP (addr, 0);
50032 /* Verify that the "frame" REG was consolidated. */
50033 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
50036 rtx rhs = XEXP (addr, 1);
50037 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
50038 ASSERT_EQ (-4, INTVAL (rhs));
50041 /* Verify the "[1 i+0 S4 A32]" was parsed. */
50042 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
50043 /* "i" should have been handled by synthesizing a global int
50044 variable named "i". */
50045 mem_expr = MEM_EXPR (dest);
50046 ASSERT_NE (mem_expr, NULL);
50047 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
50048 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
50049 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
50050 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
50051 /* "+0". */
50052 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
50053 ASSERT_EQ (0, MEM_OFFSET (dest));
50054 /* "S4". */
50055 ASSERT_EQ (4, MEM_SIZE (dest));
50056 /* "A32". */
50057 ASSERT_EQ (32, MEM_ALIGN (dest));
50060 rtx src = SET_SRC (pat);
50061 ASSERT_EQ (REG, GET_CODE (src));
50062 ASSERT_EQ (SImode, GET_MODE (src));
50063 ASSERT_EQ (5, REGNO (src));
50064 tree reg_expr = REG_EXPR (src);
50065 /* "i" here should point to the same var as for the MEM_EXPR. */
50066 ASSERT_EQ (reg_expr, mem_expr);
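/* For illustration, a sketch (reconstructed from the checks above, not
   quoted from the dump file) of the insn pattern this fragment is
   expected to contain; hard register 5 is %di on x86_64:

       (set (mem/c:SI (plus:DI (reg/f:DI frame)
                               (const_int -4)) [1 i+0 S4 A32])
            (reg:SI di [ i ]))
*/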
50071 /* Verify that the RTL loader copes with a call_insn dump.
50072 This test is target-specific since the dump contains a target-specific
50073 hard reg name. */
50075 static void
50076 ix86_test_loading_call_insn ()
50078 /* The test dump includes register "xmm0", which requires TARGET_SSE
50079 to exist. */
50080 if (!TARGET_SSE)
50081 return;
50083 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
50085 rtx_insn *insn = get_insns ();
50086 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
50088 /* "/j". */
50089 ASSERT_TRUE (RTX_FLAG (insn, jump));
50091 rtx pat = PATTERN (insn);
50092 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
50094 /* Verify REG_NOTES. */
50096 /* "(expr_list:REG_CALL_DECL". */
50097 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
50098 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
50099 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
50101 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
50102 rtx_expr_list *note1 = note0->next ();
50103 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
50105 ASSERT_EQ (NULL, note1->next ());
50108 /* Verify CALL_INSN_FUNCTION_USAGE. */
50110 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
50111 rtx_expr_list *usage
50112 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
50113 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
50114 ASSERT_EQ (DFmode, GET_MODE (usage));
50115 ASSERT_EQ (USE, GET_CODE (usage->element ()));
50116 ASSERT_EQ (NULL, usage->next ());
50120 /* Verify that the RTL loader copes with a dump from print_rtx_function.
50121 This test is target-specific since the dump contains target-specific
50122 hard reg names. */
50124 static void
50125 ix86_test_loading_full_dump ()
50127 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
50129 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50131 rtx_insn *insn_1 = get_insn_by_uid (1);
50132 ASSERT_EQ (NOTE, GET_CODE (insn_1));
50134 rtx_insn *insn_7 = get_insn_by_uid (7);
50135 ASSERT_EQ (INSN, GET_CODE (insn_7));
50136 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
50138 rtx_insn *insn_15 = get_insn_by_uid (15);
50139 ASSERT_EQ (INSN, GET_CODE (insn_15));
50140 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
50142 /* Verify crtl->return_rtx. */
50143 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
50144 ASSERT_EQ (0, REGNO (crtl->return_rtx));
50145 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
50148 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
50149 In particular, verify that it correctly loads the 2nd operand.
50150 This test is target-specific since these are machine-specific
50151 operands (and enums). */
50153 static void
50154 ix86_test_loading_unspec ()
50156 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
50158 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50160 ASSERT_TRUE (cfun);
50162 /* Test of an UNSPEC. */
50163 rtx_insn *insn = get_insns ();
50164 ASSERT_EQ (INSN, GET_CODE (insn));
50165 rtx set = single_set (insn);
50166 ASSERT_NE (NULL, set);
50167 rtx dst = SET_DEST (set);
50168 ASSERT_EQ (MEM, GET_CODE (dst));
50169 rtx src = SET_SRC (set);
50170 ASSERT_EQ (UNSPEC, GET_CODE (src));
50171 ASSERT_EQ (BLKmode, GET_MODE (src));
50172 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
50174 rtx v0 = XVECEXP (src, 0, 0);
50176 /* Verify that the two uses of the first SCRATCH have pointer
50177 equality. */
50178 rtx scratch_a = XEXP (dst, 0);
50179 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
50181 rtx scratch_b = XEXP (v0, 0);
50182 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
50184 ASSERT_EQ (scratch_a, scratch_b);
50186 /* Verify that the two mems are thus treated as equal. */
50187 ASSERT_TRUE (rtx_equal_p (dst, v0));
50189 /* Verify that the insn is recognized. */
50190 ASSERT_NE (-1, recog_memoized (insn));
50192 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
50193 insn = NEXT_INSN (insn);
50194 ASSERT_EQ (INSN, GET_CODE (insn));
50196 set = single_set (insn);
50197 ASSERT_NE (NULL, set);
50199 src = SET_SRC (set);
50200 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
50201 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
50204 /* Run all target-specific selftests. */
50206 static void
50207 ix86_run_selftests (void)
50209 ix86_test_dumping_hard_regs ();
50210 ix86_test_dumping_memory_blockage ();
50212 /* Various tests of loading RTL dumps, here because they contain
50213 ix86-isms (e.g. names of hard regs). */
50214 ix86_test_loading_dump_fragment_1 ();
50215 ix86_test_loading_call_insn ();
50216 ix86_test_loading_full_dump ();
50217 ix86_test_loading_unspec ();
50220 } // namespace selftest
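/* For illustration (a sketch of how these tests are typically driven, with
   SRCDIR standing in for the GCC source tree): in a build configured with
   checking enabled, ix86_run_selftests is reached through the
   TARGET_RUN_TARGET_SELFTESTS hook installed near the end of this file and
   runs together with the generic selftests, roughly as in

       make selftest
       cc1 ... -fself-test=SRCDIR/gcc/testsuite/selftests

   where the -fself-test argument names the directory in which locate_file
   looks for dumps such as "x86_64/times-two.rtl" used above.  */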
50222 #endif /* CHECKING_P */
50224 /* Initialize the GCC target structure. */
50225 #undef TARGET_RETURN_IN_MEMORY
50226 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
50228 #undef TARGET_LEGITIMIZE_ADDRESS
50229 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
50231 #undef TARGET_ATTRIBUTE_TABLE
50232 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
50233 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
50234 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
50235 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50236 # undef TARGET_MERGE_DECL_ATTRIBUTES
50237 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
50238 #endif
50240 #undef TARGET_COMP_TYPE_ATTRIBUTES
50241 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
50243 #undef TARGET_INIT_BUILTINS
50244 #define TARGET_INIT_BUILTINS ix86_init_builtins
50245 #undef TARGET_BUILTIN_DECL
50246 #define TARGET_BUILTIN_DECL ix86_builtin_decl
50247 #undef TARGET_EXPAND_BUILTIN
50248 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
50250 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
50251 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
50252 ix86_builtin_vectorized_function
50254 #undef TARGET_VECTORIZE_BUILTIN_GATHER
50255 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
50257 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
50258 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
50260 #undef TARGET_BUILTIN_RECIPROCAL
50261 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
50263 #undef TARGET_ASM_FUNCTION_EPILOGUE
50264 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
50266 #undef TARGET_ENCODE_SECTION_INFO
50267 #ifndef SUBTARGET_ENCODE_SECTION_INFO
50268 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
50269 #else
50270 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
50271 #endif
50273 #undef TARGET_ASM_OPEN_PAREN
50274 #define TARGET_ASM_OPEN_PAREN ""
50275 #undef TARGET_ASM_CLOSE_PAREN
50276 #define TARGET_ASM_CLOSE_PAREN ""
50278 #undef TARGET_ASM_BYTE_OP
50279 #define TARGET_ASM_BYTE_OP ASM_BYTE
50281 #undef TARGET_ASM_ALIGNED_HI_OP
50282 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
50283 #undef TARGET_ASM_ALIGNED_SI_OP
50284 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
50285 #ifdef ASM_QUAD
50286 #undef TARGET_ASM_ALIGNED_DI_OP
50287 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
50288 #endif
50290 #undef TARGET_PROFILE_BEFORE_PROLOGUE
50291 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
50293 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
50294 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
50296 #undef TARGET_ASM_UNALIGNED_HI_OP
50297 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
50298 #undef TARGET_ASM_UNALIGNED_SI_OP
50299 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
50300 #undef TARGET_ASM_UNALIGNED_DI_OP
50301 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
50303 #undef TARGET_PRINT_OPERAND
50304 #define TARGET_PRINT_OPERAND ix86_print_operand
50305 #undef TARGET_PRINT_OPERAND_ADDRESS
50306 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
50307 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
50308 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
50309 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
50310 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
50312 #undef TARGET_SCHED_INIT_GLOBAL
50313 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
50314 #undef TARGET_SCHED_ADJUST_COST
50315 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
50316 #undef TARGET_SCHED_ISSUE_RATE
50317 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
50318 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
50319 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
50320 ia32_multipass_dfa_lookahead
50321 #undef TARGET_SCHED_MACRO_FUSION_P
50322 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
50323 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
50324 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
50326 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
50327 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
50329 #undef TARGET_MEMMODEL_CHECK
50330 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
50332 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
50333 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
50335 #ifdef HAVE_AS_TLS
50336 #undef TARGET_HAVE_TLS
50337 #define TARGET_HAVE_TLS true
50338 #endif
50339 #undef TARGET_CANNOT_FORCE_CONST_MEM
50340 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
50341 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
50342 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
50344 #undef TARGET_DELEGITIMIZE_ADDRESS
50345 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
50347 #undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
50348 #define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
50350 #undef TARGET_MS_BITFIELD_LAYOUT_P
50351 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
50353 #if TARGET_MACHO
50354 #undef TARGET_BINDS_LOCAL_P
50355 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
50356 #else
50357 #undef TARGET_BINDS_LOCAL_P
50358 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
50359 #endif
50360 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50361 #undef TARGET_BINDS_LOCAL_P
50362 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
50363 #endif
50365 #undef TARGET_ASM_OUTPUT_MI_THUNK
50366 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
50367 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
50368 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
50370 #undef TARGET_ASM_FILE_START
50371 #define TARGET_ASM_FILE_START x86_file_start
50373 #undef TARGET_OPTION_OVERRIDE
50374 #define TARGET_OPTION_OVERRIDE ix86_option_override
50376 #undef TARGET_REGISTER_MOVE_COST
50377 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
50378 #undef TARGET_MEMORY_MOVE_COST
50379 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
50380 #undef TARGET_RTX_COSTS
50381 #define TARGET_RTX_COSTS ix86_rtx_costs
50382 #undef TARGET_ADDRESS_COST
50383 #define TARGET_ADDRESS_COST ix86_address_cost
50385 #undef TARGET_FLAGS_REGNUM
50386 #define TARGET_FLAGS_REGNUM FLAGS_REG
50387 #undef TARGET_FIXED_CONDITION_CODE_REGS
50388 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
50389 #undef TARGET_CC_MODES_COMPATIBLE
50390 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
50392 #undef TARGET_MACHINE_DEPENDENT_REORG
50393 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
50395 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
50396 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
50398 #undef TARGET_BUILD_BUILTIN_VA_LIST
50399 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
50401 #undef TARGET_FOLD_BUILTIN
50402 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
50404 #undef TARGET_GIMPLE_FOLD_BUILTIN
50405 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
50407 #undef TARGET_COMPARE_VERSION_PRIORITY
50408 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
50410 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
50411 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
50412 ix86_generate_version_dispatcher_body
50414 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
50415 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
50416 ix86_get_function_versions_dispatcher
50418 #undef TARGET_ENUM_VA_LIST_P
50419 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
50421 #undef TARGET_FN_ABI_VA_LIST
50422 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
50424 #undef TARGET_CANONICAL_VA_LIST_TYPE
50425 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
50427 #undef TARGET_EXPAND_BUILTIN_VA_START
50428 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
50430 #undef TARGET_MD_ASM_ADJUST
50431 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
50433 #undef TARGET_C_EXCESS_PRECISION
50434 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
50435 #undef TARGET_PROMOTE_PROTOTYPES
50436 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
50437 #undef TARGET_SETUP_INCOMING_VARARGS
50438 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
50439 #undef TARGET_MUST_PASS_IN_STACK
50440 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
50441 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
50442 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
50443 #undef TARGET_FUNCTION_ARG_ADVANCE
50444 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
50445 #undef TARGET_FUNCTION_ARG
50446 #define TARGET_FUNCTION_ARG ix86_function_arg
50447 #undef TARGET_INIT_PIC_REG
50448 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
50449 #undef TARGET_USE_PSEUDO_PIC_REG
50450 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
50451 #undef TARGET_FUNCTION_ARG_BOUNDARY
50452 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
50453 #undef TARGET_PASS_BY_REFERENCE
50454 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
50455 #undef TARGET_INTERNAL_ARG_POINTER
50456 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
50457 #undef TARGET_UPDATE_STACK_BOUNDARY
50458 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
50459 #undef TARGET_GET_DRAP_RTX
50460 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
50461 #undef TARGET_STRICT_ARGUMENT_NAMING
50462 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
50463 #undef TARGET_STATIC_CHAIN
50464 #define TARGET_STATIC_CHAIN ix86_static_chain
50465 #undef TARGET_TRAMPOLINE_INIT
50466 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
50467 #undef TARGET_RETURN_POPS_ARGS
50468 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
50470 #undef TARGET_WARN_FUNC_RETURN
50471 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
50473 #undef TARGET_LEGITIMATE_COMBINED_INSN
50474 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
50476 #undef TARGET_ASAN_SHADOW_OFFSET
50477 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
50479 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
50480 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
50482 #undef TARGET_SCALAR_MODE_SUPPORTED_P
50483 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
50485 #undef TARGET_VECTOR_MODE_SUPPORTED_P
50486 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
50488 #undef TARGET_C_MODE_FOR_SUFFIX
50489 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
50491 #ifdef HAVE_AS_TLS
50492 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
50493 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
50494 #endif
50496 #ifdef SUBTARGET_INSERT_ATTRIBUTES
50497 #undef TARGET_INSERT_ATTRIBUTES
50498 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
50499 #endif
50501 #undef TARGET_MANGLE_TYPE
50502 #define TARGET_MANGLE_TYPE ix86_mangle_type
50504 #undef TARGET_STACK_PROTECT_GUARD
50505 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
50507 #if !TARGET_MACHO
50508 #undef TARGET_STACK_PROTECT_FAIL
50509 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
50510 #endif
50512 #undef TARGET_FUNCTION_VALUE
50513 #define TARGET_FUNCTION_VALUE ix86_function_value
50515 #undef TARGET_FUNCTION_VALUE_REGNO_P
50516 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
50518 #undef TARGET_PROMOTE_FUNCTION_MODE
50519 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50521 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50522 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50524 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50525 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50527 #undef TARGET_INSTANTIATE_DECLS
50528 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50530 #undef TARGET_SECONDARY_RELOAD
50531 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50532 #undef TARGET_SECONDARY_MEMORY_NEEDED
50533 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
50534 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
50535 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
50537 #undef TARGET_CLASS_MAX_NREGS
50538 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50540 #undef TARGET_PREFERRED_RELOAD_CLASS
50541 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50542 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50543 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50544 #undef TARGET_CLASS_LIKELY_SPILLED_P
50545 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
50547 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50548 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50549 ix86_builtin_vectorization_cost
50550 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
50551 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
50552 ix86_vectorize_vec_perm_const_ok
50553 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50554 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50555 ix86_preferred_simd_mode
50556 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50557 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50558 ix86_autovectorize_vector_sizes
50559 #undef TARGET_VECTORIZE_GET_MASK_MODE
50560 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50561 #undef TARGET_VECTORIZE_INIT_COST
50562 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50563 #undef TARGET_VECTORIZE_ADD_STMT_COST
50564 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50565 #undef TARGET_VECTORIZE_FINISH_COST
50566 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50567 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50568 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50570 #undef TARGET_SET_CURRENT_FUNCTION
50571 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
50573 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
50574 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
50576 #undef TARGET_OPTION_SAVE
50577 #define TARGET_OPTION_SAVE ix86_function_specific_save
50579 #undef TARGET_OPTION_RESTORE
50580 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
50582 #undef TARGET_OPTION_POST_STREAM_IN
50583 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
50585 #undef TARGET_OPTION_PRINT
50586 #define TARGET_OPTION_PRINT ix86_function_specific_print
50588 #undef TARGET_OPTION_FUNCTION_VERSIONS
50589 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
50591 #undef TARGET_CAN_INLINE_P
50592 #define TARGET_CAN_INLINE_P ix86_can_inline_p
50594 #undef TARGET_LEGITIMATE_ADDRESS_P
50595 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
50597 #undef TARGET_REGISTER_PRIORITY
50598 #define TARGET_REGISTER_PRIORITY ix86_register_priority
50600 #undef TARGET_REGISTER_USAGE_LEVELING_P
50601 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
50603 #undef TARGET_LEGITIMATE_CONSTANT_P
50604 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
50606 #undef TARGET_COMPUTE_FRAME_LAYOUT
50607 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
50609 #undef TARGET_FRAME_POINTER_REQUIRED
50610 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
50612 #undef TARGET_CAN_ELIMINATE
50613 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
50615 #undef TARGET_EXTRA_LIVE_ON_ENTRY
50616 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
50618 #undef TARGET_ASM_CODE_END
50619 #define TARGET_ASM_CODE_END ix86_code_end
50621 #undef TARGET_CONDITIONAL_REGISTER_USAGE
50622 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
50624 #undef TARGET_CANONICALIZE_COMPARISON
50625 #define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison
50627 #undef TARGET_LOOP_UNROLL_ADJUST
50628 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
50630 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50631 #undef TARGET_SPILL_CLASS
50632 #define TARGET_SPILL_CLASS ix86_spill_class
50634 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
50635 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
50636 ix86_simd_clone_compute_vecsize_and_simdlen
50638 #undef TARGET_SIMD_CLONE_ADJUST
50639 #define TARGET_SIMD_CLONE_ADJUST \
50640 ix86_simd_clone_adjust
50642 #undef TARGET_SIMD_CLONE_USABLE
50643 #define TARGET_SIMD_CLONE_USABLE \
50644 ix86_simd_clone_usable
50646 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
50647 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
50648 ix86_float_exceptions_rounding_supported_p
50650 #undef TARGET_MODE_EMIT
50651 #define TARGET_MODE_EMIT ix86_emit_mode_set
50653 #undef TARGET_MODE_NEEDED
50654 #define TARGET_MODE_NEEDED ix86_mode_needed
50656 #undef TARGET_MODE_AFTER
50657 #define TARGET_MODE_AFTER ix86_mode_after
50659 #undef TARGET_MODE_ENTRY
50660 #define TARGET_MODE_ENTRY ix86_mode_entry
50662 #undef TARGET_MODE_EXIT
50663 #define TARGET_MODE_EXIT ix86_mode_exit
50665 #undef TARGET_MODE_PRIORITY
50666 #define TARGET_MODE_PRIORITY ix86_mode_priority
50668 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
50669 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
50671 #undef TARGET_LOAD_BOUNDS_FOR_ARG
50672 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
50674 #undef TARGET_STORE_BOUNDS_FOR_ARG
50675 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
50677 #undef TARGET_LOAD_RETURNED_BOUNDS
50678 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
50680 #undef TARGET_STORE_RETURNED_BOUNDS
50681 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
50683 #undef TARGET_CHKP_BOUND_MODE
50684 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
50686 #undef TARGET_BUILTIN_CHKP_FUNCTION
50687 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
50689 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
50690 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
50692 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
50693 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
50695 #undef TARGET_CHKP_INITIALIZE_BOUNDS
50696 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
50698 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
50699 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
50701 #undef TARGET_OFFLOAD_OPTIONS
50702 #define TARGET_OFFLOAD_OPTIONS \
50703 ix86_offload_options
50705 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
50706 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
50708 #undef TARGET_OPTAB_SUPPORTED_P
50709 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
50711 #undef TARGET_HARD_REGNO_SCRATCH_OK
50712 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
50714 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
50715 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
50717 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
50718 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
50720 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
50721 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
50723 #undef TARGET_INIT_LIBFUNCS
50724 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
50726 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
50727 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
50729 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
50730 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
50732 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
50733 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
50735 #undef TARGET_HARD_REGNO_NREGS
50736 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
50737 #undef TARGET_HARD_REGNO_MODE_OK
50738 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
50740 #undef TARGET_MODES_TIEABLE_P
50741 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
50743 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
50744 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
50745 ix86_hard_regno_call_part_clobbered
50747 #undef TARGET_CAN_CHANGE_MODE_CLASS
50748 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
50750 #undef TARGET_STATIC_RTX_ALIGNMENT
50751 #define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
50752 #undef TARGET_CONSTANT_ALIGNMENT
50753 #define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment
50755 #undef TARGET_EMPTY_RECORD_P
50756 #define TARGET_EMPTY_RECORD_P ix86_is_empty_record
50758 #undef TARGET_WARN_PARAMETER_PASSING_ABI
50759 #define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi
50761 #if CHECKING_P
50762 #undef TARGET_RUN_TARGET_SELFTESTS
50763 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
50764 #endif /* #if CHECKING_P */
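/* For illustration (a sketch of the mechanism, not additional
   configuration): TARGET_INITIALIZER is a macro provided by target-def.h
   that expands, at its single point of use just below, into an aggregate
   initializer built from the TARGET_* macros currently in effect.  Each
   #undef/#define pair above therefore swaps one default hook for the i386
   implementation before that expansion; for example

       #undef TARGET_RETURN_IN_MEMORY
       #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory

   is what makes targetm.calls.return_in_memory point at
   ix86_return_in_memory in the targetm structure defined below.  */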
50766 struct gcc_target targetm = TARGET_INITIALIZER;
50768 #include "gt-i386.h"