PR target/83203
[official-gcc.git] / gcc / config / i386 / i386.c
blob d625670c35ca849b0877fe14ce34e04a564e7920
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2018 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "memmodel.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "cgraph.h"
41 #include "diagnostic.h"
42 #include "cfgbuild.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "attribs.h"
46 #include "calls.h"
47 #include "stor-layout.h"
48 #include "varasm.h"
49 #include "output.h"
50 #include "insn-attr.h"
51 #include "flags.h"
52 #include "except.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "cfgrtl.h"
56 #include "common/common-target.h"
57 #include "langhooks.h"
58 #include "reload.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "tm-constrs.h"
62 #include "params.h"
63 #include "cselib.h"
64 #include "sched-int.h"
65 #include "opts.h"
66 #include "tree-pass.h"
67 #include "context.h"
68 #include "pass_manager.h"
69 #include "target-globals.h"
70 #include "gimple-iterator.h"
71 #include "tree-vectorizer.h"
72 #include "shrink-wrap.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tree-iterator.h"
76 #include "tree-chkp.h"
77 #include "rtl-chkp.h"
78 #include "dbgcnt.h"
79 #include "case-cfn-macros.h"
80 #include "regrename.h"
81 #include "dojump.h"
82 #include "fold-const-call.h"
83 #include "tree-vrp.h"
84 #include "tree-ssanames.h"
85 #include "selftest.h"
86 #include "selftest-rtl.h"
87 #include "print-rtl.h"
88 #include "intl.h"
89 #include "ifcvt.h"
90 #include "symbol-summary.h"
91 #include "ipa-prop.h"
92 #include "ipa-fnsummary.h"
94 /* This file should be included last. */
95 #include "target-def.h"
97 #include "x86-tune-costs.h"
99 static rtx legitimize_dllimport_symbol (rtx, bool);
100 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
101 static rtx legitimize_pe_coff_symbol (rtx, bool);
102 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
103 static bool ix86_save_reg (unsigned int, bool, bool);
104 static bool ix86_function_naked (const_tree);
105 static bool ix86_notrack_prefixed_insn_p (rtx);
106 static void ix86_emit_restore_reg_using_pop (rtx);
109 #ifndef CHECK_STACK_LIMIT
110 #define CHECK_STACK_LIMIT (-1)
111 #endif
113 /* Return index of given mode in mult and division cost tables. */
114 #define MODE_INDEX(mode) \
115 ((mode) == QImode ? 0 \
116 : (mode) == HImode ? 1 \
117 : (mode) == SImode ? 2 \
118 : (mode) == DImode ? 3 \
119 : 4)
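/* Illustrative use of the index (a sketch, not a quote from this file): it
   selects the per-mode entry in the cost tables, e.g.
     ix86_cost->mult_init[MODE_INDEX (SImode)]   -- SImode multiply setup cost
     ix86_cost->divide[MODE_INDEX (DImode)]      -- DImode divide cost
   where mult_init and divide are assumed to be the per-mode arrays in
   struct processor_costs.  */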
122 /* Set by -mtune. */
123 const struct processor_costs *ix86_tune_cost = NULL;
125 /* Set by -mtune or -Os. */
126 const struct processor_costs *ix86_cost = NULL;
128 /* Processor feature/optimization bitmasks. */
129 #define m_386 (1U<<PROCESSOR_I386)
130 #define m_486 (1U<<PROCESSOR_I486)
131 #define m_PENT (1U<<PROCESSOR_PENTIUM)
132 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
133 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
134 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
135 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
136 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
137 #define m_CORE2 (1U<<PROCESSOR_CORE2)
138 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
139 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
140 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
141 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
142 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
143 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
144 #define m_KNL (1U<<PROCESSOR_KNL)
145 #define m_KNM (1U<<PROCESSOR_KNM)
146 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
147 #define m_CANNONLAKE (1U<<PROCESSOR_CANNONLAKE)
148 #define m_INTEL (1U<<PROCESSOR_INTEL)
150 #define m_GEODE (1U<<PROCESSOR_GEODE)
151 #define m_K6 (1U<<PROCESSOR_K6)
152 #define m_K6_GEODE (m_K6 | m_GEODE)
153 #define m_K8 (1U<<PROCESSOR_K8)
154 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
155 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
156 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
157 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
158 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
159 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
160 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
161 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
162 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
163 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
164 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
165 #define m_BTVER (m_BTVER1 | m_BTVER2)
166 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
167 | m_ZNVER1)
169 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
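/* Illustration only: each DEF_TUNE entry in x86-tune.def selects the
   processors a tuning applies to by OR-ing these masks, along the lines of
     DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
	       m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
   The exact selectors live in x86-tune.def; the entry above is a
   representative sketch, not a quote.  */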
171 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
172 #undef DEF_TUNE
173 #define DEF_TUNE(tune, name, selector) name,
174 #include "x86-tune.def"
175 #undef DEF_TUNE
178 /* Feature tests against the various tunings. */
179 unsigned char ix86_tune_features[X86_TUNE_LAST];
181 /* Feature tests against the various tunings used to create ix86_tune_features
182 based on the processor mask. */
183 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
184 #undef DEF_TUNE
185 #define DEF_TUNE(tune, name, selector) selector,
186 #include "x86-tune.def"
187 #undef DEF_TUNE
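/* Sketch of how the table above is consumed (the real code lives in the
   option-override path later in this file; this is an illustration, not a
   quote):
     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);  */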
190 /* Feature tests against the various architecture variations. */
191 unsigned char ix86_arch_features[X86_ARCH_LAST];
193 /* Feature tests against the various architecture variations, used to create
194 ix86_arch_features based on the processor mask. */
195 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
196 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
197 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
199 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
200 ~m_386,
202 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
203 ~(m_386 | m_486),
205 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
206 ~m_386,
208 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
209 ~m_386,
 212 /* In case the average insn count for a single function invocation is
213 lower than this constant, emit fast (but longer) prologue and
214 epilogue code. */
215 #define FAST_PROLOGUE_INSN_COUNT 20
217 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
218 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
219 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
220 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
222 /* Array of the smallest class containing reg number REGNO, indexed by
223 REGNO. Used by REGNO_REG_CLASS in i386.h. */
225 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
227 /* ax, dx, cx, bx */
228 AREG, DREG, CREG, BREG,
229 /* si, di, bp, sp */
230 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
231 /* FP registers */
232 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
233 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
234 /* arg pointer */
235 NON_Q_REGS,
236 /* flags, fpsr, fpcr, frame */
237 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
238 /* SSE registers */
239 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
240 SSE_REGS, SSE_REGS,
241 /* MMX registers */
242 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
243 MMX_REGS, MMX_REGS,
244 /* REX registers */
245 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
246 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
247 /* SSE REX registers */
248 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
249 SSE_REGS, SSE_REGS,
250 /* AVX-512 SSE registers */
251 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
252 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
253 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
254 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
255 /* Mask registers. */
256 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
257 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
258 /* MPX bound registers */
259 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
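/* Sketch of how this table is consumed: i386.h is assumed to define
     #define REGNO_REG_CLASS(REGNO) (regclass_map[(REGNO)])
   so, for example, REGNO_REG_CLASS (AX_REG) yields AREG and the entry for an
   xmm register yields one of the SSE classes.  */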
262 /* The "default" register map used in 32bit mode. */
264 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
266 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
267 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
268 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
269 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
270 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
271 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
272 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
273 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
274 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
275 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
276 101, 102, 103, 104, /* bound registers */
279 /* The "default" register map used in 64bit mode. */
281 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
283 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
284 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
285 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
286 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
287 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
288 8,9,10,11,12,13,14,15, /* extended integer registers */
289 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
290 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
291 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
292 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
293 126, 127, 128, 129, /* bound registers */
296 /* Define the register numbers to be used in Dwarf debugging information.
297 The SVR4 reference port C compiler uses the following register numbers
298 in its Dwarf output code:
299 0 for %eax (gcc regno = 0)
300 1 for %ecx (gcc regno = 2)
301 2 for %edx (gcc regno = 1)
302 3 for %ebx (gcc regno = 3)
303 4 for %esp (gcc regno = 7)
304 5 for %ebp (gcc regno = 6)
305 6 for %esi (gcc regno = 4)
306 7 for %edi (gcc regno = 5)
307 The following three DWARF register numbers are never generated by
308 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
 309 believed these numbers to have these meanings.
310 8 for %eip (no gcc equivalent)
311 9 for %eflags (gcc regno = 17)
312 10 for %trapno (no gcc equivalent)
313 It is not at all clear how we should number the FP stack registers
314 for the x86 architecture. If the version of SDB on x86/svr4 were
315 a bit less brain dead with respect to floating-point then we would
316 have a precedent to follow with respect to DWARF register numbers
317 for x86 FP registers, but the SDB on x86/svr4 was so completely
318 broken with respect to FP registers that it is hardly worth thinking
319 of it as something to strive for compatibility with.
320 The version of x86/svr4 SDB I had does (partially)
321 seem to believe that DWARF register number 11 is associated with
322 the x86 register %st(0), but that's about all. Higher DWARF
323 register numbers don't seem to be associated with anything in
324 particular, and even for DWARF regno 11, SDB only seemed to under-
325 stand that it should say that a variable lives in %st(0) (when
326 asked via an `=' command) if we said it was in DWARF regno 11,
327 but SDB still printed garbage when asked for the value of the
328 variable in question (via a `/' command).
329 (Also note that the labels SDB printed for various FP stack regs
330 when doing an `x' command were all wrong.)
331 Note that these problems generally don't affect the native SVR4
332 C compiler because it doesn't allow the use of -O with -g and
333 because when it is *not* optimizing, it allocates a memory
334 location for each floating-point variable, and the memory
335 location is what gets described in the DWARF AT_location
336 attribute for the variable in question.
337 Regardless of the severe mental illness of the x86/svr4 SDB, we
338 do something sensible here and we use the following DWARF
339 register numbers. Note that these are all stack-top-relative
340 numbers.
341 11 for %st(0) (gcc regno = 8)
342 12 for %st(1) (gcc regno = 9)
343 13 for %st(2) (gcc regno = 10)
344 14 for %st(3) (gcc regno = 11)
345 15 for %st(4) (gcc regno = 12)
346 16 for %st(5) (gcc regno = 13)
347 17 for %st(6) (gcc regno = 14)
348 18 for %st(7) (gcc regno = 15)
350 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
352 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
353 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
354 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
355 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
356 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
357 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
358 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
359 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
360 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
361 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
362 101, 102, 103, 104, /* bound registers */
365 /* Define parameter passing and return registers. */
367 static int const x86_64_int_parameter_registers[6] =
369 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
372 static int const x86_64_ms_abi_int_parameter_registers[4] =
374 CX_REG, DX_REG, R8_REG, R9_REG
377 static int const x86_64_int_return_registers[4] =
379 AX_REG, DX_REG, DI_REG, SI_REG
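/* Informal example: under the SysV ABI the first six integer arguments are
   passed in %rdi, %rsi, %rdx, %rcx, %r8, %r9 (the order of the first array
   above), while the MS ABI passes the first four in %rcx, %rdx, %r8, %r9.
   Integer values are returned in %rax, with %rdx holding the second half of
   a two-register return.  */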
382 /* Additional registers that are clobbered by SYSV calls. */
384 #define NUM_X86_64_MS_CLOBBERED_REGS 12
385 static int const x86_64_ms_sysv_extra_clobbered_registers
386 [NUM_X86_64_MS_CLOBBERED_REGS] =
388 SI_REG, DI_REG,
389 XMM6_REG, XMM7_REG,
390 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
391 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
394 enum xlogue_stub {
395 XLOGUE_STUB_SAVE,
396 XLOGUE_STUB_RESTORE,
397 XLOGUE_STUB_RESTORE_TAIL,
398 XLOGUE_STUB_SAVE_HFP,
399 XLOGUE_STUB_RESTORE_HFP,
400 XLOGUE_STUB_RESTORE_HFP_TAIL,
402 XLOGUE_STUB_COUNT
405 enum xlogue_stub_sets {
406 XLOGUE_SET_ALIGNED,
407 XLOGUE_SET_ALIGNED_PLUS_8,
408 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
409 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
411 XLOGUE_SET_COUNT
414 /* Register save/restore layout used by out-of-line stubs. */
415 class xlogue_layout {
416 public:
417 struct reginfo
419 unsigned regno;
420 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
421 rsi) to where each register is stored. */
424 unsigned get_nregs () const {return m_nregs;}
425 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
427 const reginfo &get_reginfo (unsigned reg) const
429 gcc_assert (reg < m_nregs);
430 return m_regs[reg];
433 static const char *get_stub_name (enum xlogue_stub stub,
434 unsigned n_extra_args);
436 /* Returns an rtx for the stub's symbol based upon
437 1.) the specified stub (save, restore or restore_ret) and
438 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
 439 3.) whether or not stack alignment is being performed. */
440 static rtx get_stub_rtx (enum xlogue_stub stub);
442 /* Returns the amount of stack space (including padding) that the stub
443 needs to store registers based upon data in the machine_function. */
444 HOST_WIDE_INT get_stack_space_used () const
446 const struct machine_function *m = cfun->machine;
447 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
449 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
450 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
453 /* Returns the offset for the base pointer used by the stub. */
454 HOST_WIDE_INT get_stub_ptr_offset () const
456 return STUB_INDEX_OFFSET + m_stack_align_off_in;
459 static const struct xlogue_layout &get_instance ();
460 static unsigned count_stub_managed_regs ();
461 static bool is_stub_managed_reg (unsigned regno, unsigned count);
463 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
464 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
465 static const unsigned MAX_REGS = 18;
466 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
467 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
468 static const unsigned STUB_NAME_MAX_LEN = 20;
469 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
470 static const unsigned REG_ORDER[MAX_REGS];
471 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
473 private:
474 xlogue_layout ();
475 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
476 xlogue_layout (const xlogue_layout &);
478 /* True if hard frame pointer is used. */
479 bool m_hfp;
 481 /* Max number of registers this layout manages. */
482 unsigned m_nregs;
484 /* Incoming offset from 16-byte alignment. */
485 HOST_WIDE_INT m_stack_align_off_in;
487 /* Register order and offsets. */
488 struct reginfo m_regs[MAX_REGS];
490 /* Lazy-inited cache of symbol names for stubs. */
491 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
492 [STUB_NAME_MAX_LEN];
494 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
497 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
498 "savms64",
499 "resms64",
500 "resms64x",
501 "savms64f",
502 "resms64f",
503 "resms64fx"
506 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
507 /* The below offset values are where each register is stored for the layout
508 relative to incoming stack pointer. The value of each m_regs[].offset will
509 be relative to the incoming base pointer (rax or rsi) used by the stub.
511 s_instances: 0 1 2 3
512 Offset: realigned or aligned + 8
513 Register aligned aligned + 8 aligned w/HFP w/HFP */
514 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
515 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
516 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
517 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
518 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
519 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
520 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
521 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
522 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
523 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
524 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
525 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
526 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
527 BP_REG, /* 0xc0 0xc8 N/A N/A */
528 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
529 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
530 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
531 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
534 /* Instantiate static const values. */
535 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
536 const unsigned xlogue_layout::MIN_REGS;
537 const unsigned xlogue_layout::MAX_REGS;
538 const unsigned xlogue_layout::MAX_EXTRA_REGS;
539 const unsigned xlogue_layout::VARIANT_COUNT;
540 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
542 /* Initialize xlogue_layout::s_stub_names to zero. */
543 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
544 [STUB_NAME_MAX_LEN];
546 /* Instantiates all xlogue_layout instances. */
547 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
548 xlogue_layout (0, false),
549 xlogue_layout (8, false),
550 xlogue_layout (0, true),
551 xlogue_layout (8, true)
554 /* Return an appropriate const instance of xlogue_layout based upon values
555 in cfun->machine and crtl. */
556 const struct xlogue_layout &
557 xlogue_layout::get_instance ()
559 enum xlogue_stub_sets stub_set;
560 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
562 if (stack_realign_fp)
563 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
564 else if (frame_pointer_needed)
565 stub_set = aligned_plus_8
566 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
567 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
568 else
569 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
571 return s_instances[stub_set];
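/* Worked example of the selection above: with a frame pointer in use, no
   stack realignment and call_ms2sysv_pad_in set, we pick
   XLOGUE_SET_HFP_ALIGNED_PLUS_8, i.e. s_instances[3], which was constructed
   as xlogue_layout (8, true).  */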
574 /* Determine how many clobbered registers can be saved by the stub.
575 Returns the count of registers the stub will save and restore. */
576 unsigned
577 xlogue_layout::count_stub_managed_regs ()
579 bool hfp = frame_pointer_needed || stack_realign_fp;
580 unsigned i, count;
581 unsigned regno;
583 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
585 regno = REG_ORDER[i];
586 if (regno == BP_REG && hfp)
587 continue;
588 if (!ix86_save_reg (regno, false, false))
589 break;
590 ++count;
592 return count;
595 /* Determine if register REGNO is a stub managed register given the
596 total COUNT of stub managed registers. */
597 bool
598 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
600 bool hfp = frame_pointer_needed || stack_realign_fp;
601 unsigned i;
603 for (i = 0; i < count; ++i)
605 gcc_assert (i < MAX_REGS);
606 if (REG_ORDER[i] == BP_REG && hfp)
607 ++count;
608 else if (REG_ORDER[i] == regno)
609 return true;
611 return false;
614 /* Constructor for xlogue_layout. */
615 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
616 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
617 m_stack_align_off_in (stack_align_off_in)
619 HOST_WIDE_INT offset = stack_align_off_in;
620 unsigned i, j;
622 for (i = j = 0; i < MAX_REGS; ++i)
624 unsigned regno = REG_ORDER[i];
626 if (regno == BP_REG && hfp)
627 continue;
628 if (SSE_REGNO_P (regno))
630 offset += 16;
631 /* Verify that SSE regs are always aligned. */
632 gcc_assert (!((stack_align_off_in + offset) & 15));
634 else
635 offset += 8;
637 m_regs[j].regno = regno;
638 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
640 gcc_assert (j == m_nregs);
643 const char *
644 xlogue_layout::get_stub_name (enum xlogue_stub stub,
645 unsigned n_extra_regs)
647 const int have_avx = TARGET_AVX;
648 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
650 /* Lazy init */
651 if (!*name)
653 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
654 (have_avx ? "avx" : "sse"),
655 STUB_BASE_NAMES[stub],
656 MIN_REGS + n_extra_regs);
657 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
660 return name;
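/* Example of a resulting name: with AVX enabled, stub XLOGUE_STUB_SAVE and
   n_extra_regs == 0, the format above produces "__avx_savms64_12" (MIN_REGS
   is 12); without AVX the prefix becomes "sse".  */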
663 /* Return rtx of a symbol ref for the entry point (based upon
664 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
666 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
668 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
669 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
670 gcc_assert (stub < XLOGUE_STUB_COUNT);
671 gcc_assert (crtl->stack_realign_finalized);
673 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
676 /* Define the structure for the machine field in struct function. */
678 struct GTY(()) stack_local_entry {
679 unsigned short mode;
680 unsigned short n;
681 rtx rtl;
682 struct stack_local_entry *next;
 685 /* Which cpu we are scheduling for. */
686 enum attr_cpu ix86_schedule;
 688 /* Which cpu we are optimizing for. */
689 enum processor_type ix86_tune;
691 /* Which instruction set architecture to use. */
692 enum processor_type ix86_arch;
694 /* True if processor has SSE prefetch instruction. */
695 unsigned char x86_prefetch_sse;
697 /* -mstackrealign option */
698 static const char ix86_force_align_arg_pointer_string[]
699 = "force_align_arg_pointer";
701 static rtx (*ix86_gen_leave) (void);
702 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
703 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
704 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
705 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
706 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
707 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
708 static rtx (*ix86_gen_clzero) (rtx);
709 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
711 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
713 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
714 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
716 /* Preferred alignment for stack boundary in bits. */
717 unsigned int ix86_preferred_stack_boundary;
719 /* Alignment for incoming stack boundary in bits specified at
720 command line. */
721 static unsigned int ix86_user_incoming_stack_boundary;
723 /* Default alignment for incoming stack boundary in bits. */
724 static unsigned int ix86_default_incoming_stack_boundary;
726 /* Alignment for incoming stack boundary in bits. */
727 unsigned int ix86_incoming_stack_boundary;
729 /* Calling abi specific va_list type nodes. */
730 static GTY(()) tree sysv_va_list_type_node;
731 static GTY(()) tree ms_va_list_type_node;
733 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
734 char internal_label_prefix[16];
735 int internal_label_prefix_len;
737 /* Fence to use after loop using movnt. */
738 tree x86_mfence;
 740 /* Register class used for passing a given 64bit part of the argument.
 741 These represent classes as documented by the PS ABI, with the exception
 742 of the SSESF and SSEDF classes, which are basically the SSE class; gcc
 743 just uses SF or DFmode moves instead of DImode to avoid reformatting
 745 penalties.  Similarly we play games with INTEGERSI_CLASS to use cheaper
 746 SImode moves whenever possible (the upper half does contain padding). */
747 enum x86_64_reg_class
749 X86_64_NO_CLASS,
750 X86_64_INTEGER_CLASS,
751 X86_64_INTEGERSI_CLASS,
752 X86_64_SSE_CLASS,
753 X86_64_SSESF_CLASS,
754 X86_64_SSEDF_CLASS,
755 X86_64_SSEUP_CLASS,
756 X86_64_X87_CLASS,
757 X86_64_X87UP_CLASS,
758 X86_64_COMPLEX_X87_CLASS,
759 X86_64_MEMORY_CLASS
762 #define MAX_CLASSES 8
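/* Informal classification example (derived from the psABI rules, not quoted
   from this file): a 16-byte struct { double d; int i; } passed by value is
   split into two eightbytes -- the first classifies as X86_64_SSEDF_CLASS
   (the double) and the second as X86_64_INTEGERSI_CLASS (the int plus
   padding), so it travels in one SSE register and one general register.  */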
764 /* Table of constants used by fldpi, fldln2, etc.... */
765 static REAL_VALUE_TYPE ext_80387_constants_table [5];
766 static bool ext_80387_constants_init;
769 static struct machine_function * ix86_init_machine_status (void);
770 static rtx ix86_function_value (const_tree, const_tree, bool);
771 static bool ix86_function_value_regno_p (const unsigned int);
772 static unsigned int ix86_function_arg_boundary (machine_mode,
773 const_tree);
774 static rtx ix86_static_chain (const_tree, bool);
775 static int ix86_function_regparm (const_tree, const_tree);
776 static void ix86_compute_frame_layout (void);
777 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
778 rtx, rtx, int);
779 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
780 static tree ix86_canonical_va_list_type (tree);
781 static void predict_jump (int);
782 static unsigned int split_stack_prologue_scratch_regno (void);
783 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
785 enum ix86_function_specific_strings
787 IX86_FUNCTION_SPECIFIC_ARCH,
788 IX86_FUNCTION_SPECIFIC_TUNE,
789 IX86_FUNCTION_SPECIFIC_MAX
792 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
793 const char *, const char *, enum fpmath_unit,
794 bool);
795 static void ix86_function_specific_save (struct cl_target_option *,
796 struct gcc_options *opts);
797 static void ix86_function_specific_restore (struct gcc_options *opts,
798 struct cl_target_option *);
799 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
800 static void ix86_function_specific_print (FILE *, int,
801 struct cl_target_option *);
802 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
803 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
804 struct gcc_options *,
805 struct gcc_options *,
806 struct gcc_options *);
807 static bool ix86_can_inline_p (tree, tree);
808 static void ix86_set_current_function (tree);
809 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
811 static enum calling_abi ix86_function_abi (const_tree);
814 #ifndef SUBTARGET32_DEFAULT_CPU
815 #define SUBTARGET32_DEFAULT_CPU "i386"
816 #endif
818 /* Whether -mtune= or -march= were specified */
819 static int ix86_tune_defaulted;
820 static int ix86_arch_specified;
822 /* Vectorization library interface and handlers. */
823 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
825 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
826 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
828 /* Processor target table, indexed by processor number */
829 struct ptt
831 const char *const name; /* processor name */
832 const struct processor_costs *cost; /* Processor costs */
833 const int align_loop; /* Default alignments. */
834 const int align_loop_max_skip;
835 const int align_jump;
836 const int align_jump_max_skip;
837 const int align_func;
840 /* This table must be in sync with enum processor_type in i386.h. */
841 static const struct ptt processor_target_table[PROCESSOR_max] =
843 {"generic", &generic_cost, 16, 10, 16, 10, 16},
844 {"i386", &i386_cost, 4, 3, 4, 3, 4},
845 {"i486", &i486_cost, 16, 15, 16, 15, 16},
846 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
847 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
848 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
849 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
850 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
851 {"core2", &core_cost, 16, 10, 16, 10, 16},
852 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
853 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
854 {"haswell", &core_cost, 16, 10, 16, 10, 16},
855 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
856 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
857 {"knl", &slm_cost, 16, 15, 16, 7, 16},
858 {"knm", &slm_cost, 16, 15, 16, 7, 16},
859 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
860 {"cannonlake", &core_cost, 16, 10, 16, 10, 16},
861 {"intel", &intel_cost, 16, 15, 16, 7, 16},
862 {"geode", &geode_cost, 0, 0, 0, 0, 0},
863 {"k6", &k6_cost, 32, 7, 32, 7, 32},
864 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
865 {"k8", &k8_cost, 16, 7, 16, 7, 16},
866 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
867 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
868 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
869 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
870 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
871 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
872 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
873 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
876 static unsigned int
877 rest_of_handle_insert_vzeroupper (void)
879 int i;
881 /* vzeroupper instructions are inserted immediately after reload to
882 account for possible spills from 256bit or 512bit registers. The pass
 883 reuses the mode switching infrastructure by re-running the mode
 884 insertion pass, so disable entities that have already been processed. */
885 for (i = 0; i < MAX_386_ENTITIES; i++)
886 ix86_optimize_mode_switching[i] = 0;
888 ix86_optimize_mode_switching[AVX_U128] = 1;
890 /* Call optimize_mode_switching. */
891 g->get_passes ()->execute_pass_mode_switching ();
892 return 0;
895 /* Return 1 if INSN uses or defines a hard register.
896 Hard register uses in a memory address are ignored.
897 Clobbers and flags definitions are ignored. */
899 static bool
900 has_non_address_hard_reg (rtx_insn *insn)
902 df_ref ref;
903 FOR_EACH_INSN_DEF (ref, insn)
904 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
905 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
906 && DF_REF_REGNO (ref) != FLAGS_REG)
907 return true;
909 FOR_EACH_INSN_USE (ref, insn)
910 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
911 return true;
913 return false;
 916 /* Check if comparison INSN may be transformed
 917 into a vector comparison.  Currently we only transform
 918 zero checks which look like:
920 (set (reg:CCZ 17 flags)
921 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
922 (subreg:SI (reg:DI x) 0))
923 (const_int 0 [0]))) */
925 static bool
926 convertible_comparison_p (rtx_insn *insn)
928 if (!TARGET_SSE4_1)
929 return false;
931 rtx def_set = single_set (insn);
933 gcc_assert (def_set);
935 rtx src = SET_SRC (def_set);
936 rtx dst = SET_DEST (def_set);
938 gcc_assert (GET_CODE (src) == COMPARE);
940 if (GET_CODE (dst) != REG
941 || REGNO (dst) != FLAGS_REG
942 || GET_MODE (dst) != CCZmode)
943 return false;
945 rtx op1 = XEXP (src, 0);
946 rtx op2 = XEXP (src, 1);
948 if (op2 != CONST0_RTX (GET_MODE (op2)))
949 return false;
951 if (GET_CODE (op1) != IOR)
952 return false;
954 op2 = XEXP (op1, 1);
955 op1 = XEXP (op1, 0);
957 if (!SUBREG_P (op1)
958 || !SUBREG_P (op2)
959 || GET_MODE (op1) != SImode
960 || GET_MODE (op2) != SImode
961 || ((SUBREG_BYTE (op1) != 0
962 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
963 && (SUBREG_BYTE (op2) != 0
964 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
965 return false;
967 op1 = SUBREG_REG (op1);
968 op2 = SUBREG_REG (op2);
970 if (op1 != op2
971 || !REG_P (op1)
972 || GET_MODE (op1) != DImode)
973 return false;
975 return true;
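/* Informal example of source code that produces the pattern above on ia32:
   testing a 64-bit value for zero, e.g.
     long long x;
     if (x == 0) ...
   is expanded as an OR of the two 32-bit halves feeding a CCZmode compare
   against zero, which is what we look for here.  */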
978 /* The DImode version of scalar_to_vector_candidate_p. */
980 static bool
981 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
983 rtx def_set = single_set (insn);
985 if (!def_set)
986 return false;
988 if (has_non_address_hard_reg (insn))
989 return false;
991 rtx src = SET_SRC (def_set);
992 rtx dst = SET_DEST (def_set);
994 if (GET_CODE (src) == COMPARE)
995 return convertible_comparison_p (insn);
997 /* We are interested in DImode promotion only. */
998 if ((GET_MODE (src) != DImode
999 && !CONST_INT_P (src))
1000 || GET_MODE (dst) != DImode)
1001 return false;
1003 if (!REG_P (dst) && !MEM_P (dst))
1004 return false;
1006 switch (GET_CODE (src))
1008 case ASHIFTRT:
1009 if (!TARGET_AVX512VL)
1010 return false;
1011 /* FALLTHRU */
1013 case ASHIFT:
1014 case LSHIFTRT:
1015 if (!REG_P (XEXP (src, 1))
1016 && (!SUBREG_P (XEXP (src, 1))
1017 || SUBREG_BYTE (XEXP (src, 1)) != 0
1018 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1019 && (!CONST_INT_P (XEXP (src, 1))
1020 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1021 return false;
1023 if (GET_MODE (XEXP (src, 1)) != QImode
1024 && !CONST_INT_P (XEXP (src, 1)))
1025 return false;
1026 break;
1028 case PLUS:
1029 case MINUS:
1030 case IOR:
1031 case XOR:
1032 case AND:
1033 if (!REG_P (XEXP (src, 1))
1034 && !MEM_P (XEXP (src, 1))
1035 && !CONST_INT_P (XEXP (src, 1)))
1036 return false;
1038 if (GET_MODE (XEXP (src, 1)) != DImode
1039 && !CONST_INT_P (XEXP (src, 1)))
1040 return false;
1041 break;
1043 case NEG:
1044 case NOT:
1045 break;
1047 case REG:
1048 return true;
1050 case MEM:
1051 case CONST_INT:
1052 return REG_P (dst);
1054 default:
1055 return false;
1058 if (!REG_P (XEXP (src, 0))
1059 && !MEM_P (XEXP (src, 0))
1060 && !CONST_INT_P (XEXP (src, 0))
1061 /* Check for andnot case. */
1062 && (GET_CODE (src) != AND
1063 || GET_CODE (XEXP (src, 0)) != NOT
1064 || !REG_P (XEXP (XEXP (src, 0), 0))))
1065 return false;
1067 if (GET_MODE (XEXP (src, 0)) != DImode
1068 && !CONST_INT_P (XEXP (src, 0)))
1069 return false;
1071 return true;
1074 /* The TImode version of scalar_to_vector_candidate_p. */
1076 static bool
1077 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1079 rtx def_set = single_set (insn);
1081 if (!def_set)
1082 return false;
1084 if (has_non_address_hard_reg (insn))
1085 return false;
1087 rtx src = SET_SRC (def_set);
1088 rtx dst = SET_DEST (def_set);
1090 /* Only TImode load and store are allowed. */
1091 if (GET_MODE (dst) != TImode)
1092 return false;
1094 if (MEM_P (dst))
 1096 /* Check for a store.  The memory must be aligned, or an unaligned
 1097 store must be optimal.  Only support stores from a register, a standard
 1098 SSE constant or a CONST_WIDE_INT generated from a piecewise store.
1100 ??? Verify performance impact before enabling CONST_INT for
1101 __int128 store. */
1102 if (misaligned_operand (dst, TImode)
1103 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1104 return false;
1106 switch (GET_CODE (src))
1108 default:
1109 return false;
1111 case REG:
1112 case CONST_WIDE_INT:
1113 return true;
1115 case CONST_INT:
1116 return standard_sse_constant_p (src, TImode);
1119 else if (MEM_P (src))
 1121 /* Check for a load.  The memory must be aligned, or an unaligned
 1122 load must be optimal. */
1123 return (REG_P (dst)
1124 && (!misaligned_operand (src, TImode)
1125 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1128 return false;
 1131 /* Return 1 if INSN may be converted into a vector
 1132 instruction. */
1134 static bool
1135 scalar_to_vector_candidate_p (rtx_insn *insn)
1137 if (TARGET_64BIT)
1138 return timode_scalar_to_vector_candidate_p (insn);
1139 else
1140 return dimode_scalar_to_vector_candidate_p (insn);
1143 /* The DImode version of remove_non_convertible_regs. */
1145 static void
1146 dimode_remove_non_convertible_regs (bitmap candidates)
1148 bitmap_iterator bi;
1149 unsigned id;
1150 bitmap regs = BITMAP_ALLOC (NULL);
1152 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1154 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1155 rtx reg = SET_DEST (def_set);
1157 if (!REG_P (reg)
1158 || bitmap_bit_p (regs, REGNO (reg))
1159 || HARD_REGISTER_P (reg))
1160 continue;
1162 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1163 def;
1164 def = DF_REF_NEXT_REG (def))
1166 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1168 if (dump_file)
1169 fprintf (dump_file,
1170 "r%d has non convertible definition in insn %d\n",
1171 REGNO (reg), DF_REF_INSN_UID (def));
1173 bitmap_set_bit (regs, REGNO (reg));
1174 break;
1179 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1181 for (df_ref def = DF_REG_DEF_CHAIN (id);
1182 def;
1183 def = DF_REF_NEXT_REG (def))
1184 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1186 if (dump_file)
1187 fprintf (dump_file, "Removing insn %d from candidates list\n",
1188 DF_REF_INSN_UID (def));
1190 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1194 BITMAP_FREE (regs);
1197 /* For a register REGNO, scan instructions for its defs and uses.
1198 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1200 static void
1201 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1202 unsigned int regno)
1204 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1205 def;
1206 def = DF_REF_NEXT_REG (def))
1208 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1210 if (dump_file)
1211 fprintf (dump_file,
1212 "r%d has non convertible def in insn %d\n",
1213 regno, DF_REF_INSN_UID (def));
1215 bitmap_set_bit (regs, regno);
1216 break;
1220 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1221 ref;
1222 ref = DF_REF_NEXT_REG (ref))
1224 /* Debug instructions are skipped. */
1225 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1226 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1228 if (dump_file)
1229 fprintf (dump_file,
1230 "r%d has non convertible use in insn %d\n",
1231 regno, DF_REF_INSN_UID (ref));
1233 bitmap_set_bit (regs, regno);
1234 break;
1239 /* The TImode version of remove_non_convertible_regs. */
1241 static void
1242 timode_remove_non_convertible_regs (bitmap candidates)
1244 bitmap_iterator bi;
1245 unsigned id;
1246 bitmap regs = BITMAP_ALLOC (NULL);
1248 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1250 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1251 rtx dest = SET_DEST (def_set);
1252 rtx src = SET_SRC (def_set);
1254 if ((!REG_P (dest)
1255 || bitmap_bit_p (regs, REGNO (dest))
1256 || HARD_REGISTER_P (dest))
1257 && (!REG_P (src)
1258 || bitmap_bit_p (regs, REGNO (src))
1259 || HARD_REGISTER_P (src)))
1260 continue;
1262 if (REG_P (dest))
1263 timode_check_non_convertible_regs (candidates, regs,
1264 REGNO (dest));
1266 if (REG_P (src))
1267 timode_check_non_convertible_regs (candidates, regs,
1268 REGNO (src));
1271 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1273 for (df_ref def = DF_REG_DEF_CHAIN (id);
1274 def;
1275 def = DF_REF_NEXT_REG (def))
1276 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1278 if (dump_file)
1279 fprintf (dump_file, "Removing insn %d from candidates list\n",
1280 DF_REF_INSN_UID (def));
1282 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1285 for (df_ref ref = DF_REG_USE_CHAIN (id);
1286 ref;
1287 ref = DF_REF_NEXT_REG (ref))
1288 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1290 if (dump_file)
1291 fprintf (dump_file, "Removing insn %d from candidates list\n",
1292 DF_REF_INSN_UID (ref));
1294 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1298 BITMAP_FREE (regs);
 1301 /* For a given bitmap of insn UIDs, scan all instructions and
 1302 remove an insn from CANDIDATES in case it has both convertible
 1303 and non-convertible definitions.
1305 All insns in a bitmap are conversion candidates according to
1306 scalar_to_vector_candidate_p. Currently it implies all insns
1307 are single_set. */
1309 static void
1310 remove_non_convertible_regs (bitmap candidates)
1312 if (TARGET_64BIT)
1313 timode_remove_non_convertible_regs (candidates);
1314 else
1315 dimode_remove_non_convertible_regs (candidates);
1318 class scalar_chain
1320 public:
1321 scalar_chain ();
1322 virtual ~scalar_chain ();
1324 static unsigned max_id;
1326 /* ID of a chain. */
1327 unsigned int chain_id;
1328 /* A queue of instructions to be included into a chain. */
1329 bitmap queue;
1330 /* Instructions included into a chain. */
1331 bitmap insns;
1332 /* All registers defined by a chain. */
1333 bitmap defs;
 1334 /* Registers used in both vector and scalar modes. */
1335 bitmap defs_conv;
1337 void build (bitmap candidates, unsigned insn_uid);
1338 virtual int compute_convert_gain () = 0;
1339 int convert ();
1341 protected:
1342 void add_to_queue (unsigned insn_uid);
1343 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1345 private:
1346 void add_insn (bitmap candidates, unsigned insn_uid);
1347 void analyze_register_chain (bitmap candidates, df_ref ref);
1348 virtual void mark_dual_mode_def (df_ref def) = 0;
1349 virtual void convert_insn (rtx_insn *insn) = 0;
1350 virtual void convert_registers () = 0;
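/* Sketch of how a chain is driven (the real driver is the STV pass further
   down in this file; treat this as an illustration, not a quote):
     scalar_chain *chain = TARGET_64BIT
			   ? (scalar_chain *) new timode_scalar_chain ()
			   : (scalar_chain *) new dimode_scalar_chain ();
     chain->build (candidates, insn_uid);
     if (chain->compute_convert_gain () > 0)
       chain->convert ();
     delete chain;  */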
1353 class dimode_scalar_chain : public scalar_chain
1355 public:
1356 int compute_convert_gain ();
1357 private:
1358 void mark_dual_mode_def (df_ref def);
1359 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1360 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1361 void convert_insn (rtx_insn *insn);
1362 void convert_op (rtx *op, rtx_insn *insn);
1363 void convert_reg (unsigned regno);
1364 void make_vector_copies (unsigned regno);
1365 void convert_registers ();
1366 int vector_const_cost (rtx exp);
1369 class timode_scalar_chain : public scalar_chain
1371 public:
 1372 /* Converting from TImode to V1TImode is always faster. */
1373 int compute_convert_gain () { return 1; }
1375 private:
1376 void mark_dual_mode_def (df_ref def);
1377 void fix_debug_reg_uses (rtx reg);
1378 void convert_insn (rtx_insn *insn);
 1379 /* We don't convert registers to a different size. */
1380 void convert_registers () {}
1383 unsigned scalar_chain::max_id = 0;
1385 /* Initialize new chain. */
1387 scalar_chain::scalar_chain ()
1389 chain_id = ++max_id;
1391 if (dump_file)
1392 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1394 bitmap_obstack_initialize (NULL);
1395 insns = BITMAP_ALLOC (NULL);
1396 defs = BITMAP_ALLOC (NULL);
1397 defs_conv = BITMAP_ALLOC (NULL);
1398 queue = NULL;
1401 /* Free chain's data. */
1403 scalar_chain::~scalar_chain ()
1405 BITMAP_FREE (insns);
1406 BITMAP_FREE (defs);
1407 BITMAP_FREE (defs_conv);
1408 bitmap_obstack_release (NULL);
 1411 /* Add an instruction into the chain's queue. */
1413 void
1414 scalar_chain::add_to_queue (unsigned insn_uid)
1416 if (bitmap_bit_p (insns, insn_uid)
1417 || bitmap_bit_p (queue, insn_uid))
1418 return;
1420 if (dump_file)
1421 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1422 insn_uid, chain_id);
1423 bitmap_set_bit (queue, insn_uid);
1426 /* For DImode conversion, mark register defined by DEF as requiring
1427 conversion. */
1429 void
1430 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1432 gcc_assert (DF_REF_REG_DEF_P (def));
1434 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1435 return;
1437 if (dump_file)
1438 fprintf (dump_file,
1439 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1440 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1442 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1445 /* For TImode conversion, it is unused. */
1447 void
1448 timode_scalar_chain::mark_dual_mode_def (df_ref)
1450 gcc_unreachable ();
1453 /* Check REF's chain to add new insns into a queue
1454 and find registers requiring conversion. */
1456 void
1457 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1459 df_link *chain;
1461 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1462 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1463 add_to_queue (DF_REF_INSN_UID (ref));
1465 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1467 unsigned uid = DF_REF_INSN_UID (chain->ref);
1469 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1470 continue;
1472 if (!DF_REF_REG_MEM_P (chain->ref))
1474 if (bitmap_bit_p (insns, uid))
1475 continue;
1477 if (bitmap_bit_p (candidates, uid))
1479 add_to_queue (uid);
1480 continue;
1484 if (DF_REF_REG_DEF_P (chain->ref))
1486 if (dump_file)
1487 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1488 DF_REF_REGNO (chain->ref), uid);
1489 mark_dual_mode_def (chain->ref);
1491 else
1493 if (dump_file)
1494 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1495 DF_REF_REGNO (chain->ref), uid);
1496 mark_dual_mode_def (ref);
1501 /* Add instruction into a chain. */
1503 void
1504 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1506 if (bitmap_bit_p (insns, insn_uid))
1507 return;
1509 if (dump_file)
1510 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1512 bitmap_set_bit (insns, insn_uid);
1514 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1515 rtx def_set = single_set (insn);
1516 if (def_set && REG_P (SET_DEST (def_set))
1517 && !HARD_REGISTER_P (SET_DEST (def_set)))
1518 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1520 df_ref ref;
1521 df_ref def;
1522 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1523 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1524 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1525 def;
1526 def = DF_REF_NEXT_REG (def))
1527 analyze_register_chain (candidates, def);
1528 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1529 if (!DF_REF_REG_MEM_P (ref))
1530 analyze_register_chain (candidates, ref);
1533 /* Build new chain starting from insn INSN_UID recursively
1534 adding all dependent uses and definitions. */
1536 void
1537 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1539 queue = BITMAP_ALLOC (NULL);
1540 bitmap_set_bit (queue, insn_uid);
1542 if (dump_file)
1543 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1545 while (!bitmap_empty_p (queue))
1547 insn_uid = bitmap_first_set_bit (queue);
1548 bitmap_clear_bit (queue, insn_uid);
1549 bitmap_clear_bit (candidates, insn_uid);
1550 add_insn (candidates, insn_uid);
1553 if (dump_file)
1555 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1556 fprintf (dump_file, " insns: ");
1557 dump_bitmap (dump_file, insns);
1558 if (!bitmap_empty_p (defs_conv))
1560 bitmap_iterator bi;
1561 unsigned id;
1562 const char *comma = "";
1563 fprintf (dump_file, " defs to convert: ");
1564 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1566 fprintf (dump_file, "%sr%d", comma, id);
1567 comma = ", ";
1569 fprintf (dump_file, "\n");
1573 BITMAP_FREE (queue);
 1576 /* Return the cost of building a vector constant
1577 instead of using a scalar one. */
1580 dimode_scalar_chain::vector_const_cost (rtx exp)
1582 gcc_assert (CONST_INT_P (exp));
1584 if (standard_sse_constant_p (exp, V2DImode))
1585 return COSTS_N_INSNS (1);
1586 return ix86_cost->sse_load[1];
1589 /* Compute a gain for chain conversion. */
1592 dimode_scalar_chain::compute_convert_gain ()
1594 bitmap_iterator bi;
1595 unsigned insn_uid;
1596 int gain = 0;
1597 int cost = 0;
1599 if (dump_file)
1600 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1602 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1604 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1605 rtx def_set = single_set (insn);
1606 rtx src = SET_SRC (def_set);
1607 rtx dst = SET_DEST (def_set);
1609 if (REG_P (src) && REG_P (dst))
1610 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1611 else if (REG_P (src) && MEM_P (dst))
1612 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1613 else if (MEM_P (src) && REG_P (dst))
1614 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1615 else if (GET_CODE (src) == ASHIFT
1616 || GET_CODE (src) == ASHIFTRT
1617 || GET_CODE (src) == LSHIFTRT)
1619 if (CONST_INT_P (XEXP (src, 0)))
1620 gain -= vector_const_cost (XEXP (src, 0));
1621 if (CONST_INT_P (XEXP (src, 1)))
1623 gain += ix86_cost->shift_const;
1624 if (INTVAL (XEXP (src, 1)) >= 32)
1625 gain -= COSTS_N_INSNS (1);
1627 else
1628 /* Additional gain for omitting two CMOVs. */
1629 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1631 else if (GET_CODE (src) == PLUS
1632 || GET_CODE (src) == MINUS
1633 || GET_CODE (src) == IOR
1634 || GET_CODE (src) == XOR
1635 || GET_CODE (src) == AND)
1637 gain += ix86_cost->add;
1638 /* Additional gain for andnot for targets without BMI. */
1639 if (GET_CODE (XEXP (src, 0)) == NOT
1640 && !TARGET_BMI)
1641 gain += 2 * ix86_cost->add;
1643 if (CONST_INT_P (XEXP (src, 0)))
1644 gain -= vector_const_cost (XEXP (src, 0));
1645 if (CONST_INT_P (XEXP (src, 1)))
1646 gain -= vector_const_cost (XEXP (src, 1));
1648 else if (GET_CODE (src) == NEG
1649 || GET_CODE (src) == NOT)
1650 gain += ix86_cost->add - COSTS_N_INSNS (1);
1651 else if (GET_CODE (src) == COMPARE)
1653 /* Assume comparison cost is the same. */
1655 else if (CONST_INT_P (src))
1657 if (REG_P (dst))
1658 gain += COSTS_N_INSNS (2);
1659 else if (MEM_P (dst))
1660 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1661 gain -= vector_const_cost (src);
1663 else
1664 gcc_unreachable ();
1667 if (dump_file)
1668 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1670 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1671 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1673 if (dump_file)
1674 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1676 gain -= cost;
1678 if (dump_file)
1679 fprintf (dump_file, " Total gain: %d\n", gain);
1681 return gain;
1684 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1687 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1689 if (x == reg)
1690 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1692 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1693 int i, j;
1694 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1696 if (fmt[i] == 'e')
1697 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1698 else if (fmt[i] == 'E')
1699 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1700 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1701 reg, new_reg);
1704 return x;
1707 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1709 void
1710 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1711 rtx reg, rtx new_reg)
1713 replace_with_subreg (single_set (insn), reg, new_reg);
1716 /* Insert generated conversion instruction sequence INSNS
1717 after instruction AFTER. New BB may be required in case
1718 instruction has EH region attached. */
1720 void
1721 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1723 if (!control_flow_insn_p (after))
1725 emit_insn_after (insns, after);
1726 return;
1729 basic_block bb = BLOCK_FOR_INSN (after);
1730 edge e = find_fallthru_edge (bb->succs);
1731 gcc_assert (e);
1733 basic_block new_bb = split_edge (e);
1734 emit_insn_after (insns, BB_HEAD (new_bb));
1737 /* Make vector copies for all register REGNO definitions
1738 and replace its uses in a chain. */
1740 void
1741 dimode_scalar_chain::make_vector_copies (unsigned regno)
1743 rtx reg = regno_reg_rtx[regno];
1744 rtx vreg = gen_reg_rtx (DImode);
1745 bool count_reg = false;
1746 df_ref ref;
1748 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1749 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1751 df_ref use;
1753 /* Detect the count register of a shift instruction. */
1754 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1755 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1757 rtx_insn *insn = DF_REF_INSN (use);
1758 rtx def_set = single_set (insn);
1760 gcc_assert (def_set);
1762 rtx src = SET_SRC (def_set);
1764 if ((GET_CODE (src) == ASHIFT
1765 || GET_CODE (src) == ASHIFTRT
1766 || GET_CODE (src) == LSHIFTRT)
1767 && !CONST_INT_P (XEXP (src, 1))
1768 && reg_or_subregno (XEXP (src, 1)) == regno)
1769 count_reg = true;
1772 start_sequence ();
1773 if (count_reg)
1775 rtx qreg = gen_lowpart (QImode, reg);
1776 rtx tmp = gen_reg_rtx (SImode);
1778 if (TARGET_ZERO_EXTEND_WITH_AND
1779 && optimize_function_for_speed_p (cfun))
1781 emit_move_insn (tmp, const0_rtx);
1782 emit_insn (gen_movstrictqi
1783 (gen_lowpart (QImode, tmp), qreg));
1785 else
1786 emit_insn (gen_rtx_SET
1787 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1789 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1791 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1792 emit_move_insn (slot, tmp);
1793 tmp = copy_rtx (slot);
1796 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1798 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1800 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1801 emit_move_insn (adjust_address (tmp, SImode, 0),
1802 gen_rtx_SUBREG (SImode, reg, 0));
1803 emit_move_insn (adjust_address (tmp, SImode, 4),
1804 gen_rtx_SUBREG (SImode, reg, 4));
1805 emit_move_insn (vreg, tmp);
1807 else if (TARGET_SSE4_1)
1809 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1810 CONST0_RTX (V4SImode),
1811 gen_rtx_SUBREG (SImode, reg, 0)));
1812 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1813 gen_rtx_SUBREG (V4SImode, vreg, 0),
1814 gen_rtx_SUBREG (SImode, reg, 4),
1815 GEN_INT (2)));
1817 else
1819 rtx tmp = gen_reg_rtx (DImode);
1820 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1821 CONST0_RTX (V4SImode),
1822 gen_rtx_SUBREG (SImode, reg, 0)));
1823 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1824 CONST0_RTX (V4SImode),
1825 gen_rtx_SUBREG (SImode, reg, 4)));
1826 emit_insn (gen_vec_interleave_lowv4si
1827 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1828 gen_rtx_SUBREG (V4SImode, vreg, 0),
1829 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1831 rtx_insn *seq = get_insns ();
1832 end_sequence ();
1833 rtx_insn *insn = DF_REF_INSN (ref);
1834 emit_conversion_insns (seq, insn);
1836 if (dump_file)
1837 fprintf (dump_file,
1838 " Copied r%d to a vector register r%d for insn %d\n",
1839 regno, REGNO (vreg), INSN_UID (insn));
1842 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1843 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1845 rtx_insn *insn = DF_REF_INSN (ref);
1846 if (count_reg)
1848 rtx def_set = single_set (insn);
1849 gcc_assert (def_set);
1851 rtx src = SET_SRC (def_set);
1853 if ((GET_CODE (src) == ASHIFT
1854 || GET_CODE (src) == ASHIFTRT
1855 || GET_CODE (src) == LSHIFTRT)
1856 && !CONST_INT_P (XEXP (src, 1))
1857 && reg_or_subregno (XEXP (src, 1)) == regno)
1858 XEXP (src, 1) = vreg;
1860 else
1861 replace_with_subreg_in_insn (insn, reg, vreg);
1863 if (dump_file)
1864 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1865 regno, REGNO (vreg), INSN_UID (insn));
1869 /* Convert all definitions of register REGNO
1870 and fix its uses. Scalar copies may be created
 1871 in case the register is used in a non-convertible insn. */
1873 void
1874 dimode_scalar_chain::convert_reg (unsigned regno)
1876 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1877 rtx reg = regno_reg_rtx[regno];
1878 rtx scopy = NULL_RTX;
1879 df_ref ref;
1880 bitmap conv;
1882 conv = BITMAP_ALLOC (NULL);
1883 bitmap_copy (conv, insns);
1885 if (scalar_copy)
1886 scopy = gen_reg_rtx (DImode);
1888 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1890 rtx_insn *insn = DF_REF_INSN (ref);
1891 rtx def_set = single_set (insn);
1892 rtx src = SET_SRC (def_set);
1893 rtx reg = DF_REF_REG (ref);
1895 if (!MEM_P (src))
1897 replace_with_subreg_in_insn (insn, reg, reg);
1898 bitmap_clear_bit (conv, INSN_UID (insn));
1901 if (scalar_copy)
1903 start_sequence ();
1904 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1906 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1907 emit_move_insn (tmp, reg);
1908 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1909 adjust_address (tmp, SImode, 0));
1910 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1911 adjust_address (tmp, SImode, 4));
1913 else if (TARGET_SSE4_1)
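/* With SSE4.1, extract the two 32-bit halves with vec_select
   (movd/pextrd) of elements 0 and 1.  */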
1915 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1916 emit_insn
1917 (gen_rtx_SET
1918 (gen_rtx_SUBREG (SImode, scopy, 0),
1919 gen_rtx_VEC_SELECT (SImode,
1920 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1922 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1923 emit_insn
1924 (gen_rtx_SET
1925 (gen_rtx_SUBREG (SImode, scopy, 4),
1926 gen_rtx_VEC_SELECT (SImode,
1927 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1929 else
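/* Without SSE4.1, copy the vector value, movd out the low 32 bits,
   shift the copy right by 32 bits (psrlq) and movd out the upper half.  */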
1931 rtx vcopy = gen_reg_rtx (V2DImode);
1932 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1933 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1934 gen_rtx_SUBREG (SImode, vcopy, 0));
1935 emit_move_insn (vcopy,
1936 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1937 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1938 gen_rtx_SUBREG (SImode, vcopy, 0));
1940 rtx_insn *seq = get_insns ();
1941 end_sequence ();
1942 emit_conversion_insns (seq, insn);
1944 if (dump_file)
1945 fprintf (dump_file,
1946 " Copied r%d to a scalar register r%d for insn %d\n",
1947 regno, REGNO (scopy), INSN_UID (insn));
1951 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1952 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1954 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1956 rtx_insn *insn = DF_REF_INSN (ref);
1958 rtx def_set = single_set (insn);
1959 gcc_assert (def_set);
1961 rtx src = SET_SRC (def_set);
1962 rtx dst = SET_DEST (def_set);
1964 if ((GET_CODE (src) == ASHIFT
1965 || GET_CODE (src) == ASHIFTRT
1966 || GET_CODE (src) == LSHIFTRT)
1967 && !CONST_INT_P (XEXP (src, 1))
1968 && reg_or_subregno (XEXP (src, 1)) == regno)
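/* The chain register is used as a variable shift count; only its low
   8 bits matter, so zero-extend them into a vector register, either
   with pmovzxbq or by masking with an {0xff, 0} constant.  */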
1970 rtx tmp2 = gen_reg_rtx (V2DImode);
1972 start_sequence ();
1974 if (TARGET_SSE4_1)
1975 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1976 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1977 else
1979 rtx vec_cst
1980 = gen_rtx_CONST_VECTOR (V2DImode,
1981 gen_rtvec (2, GEN_INT (0xff),
1982 const0_rtx));
1983 vec_cst
1984 = validize_mem (force_const_mem (V2DImode, vec_cst));
1986 emit_insn (gen_rtx_SET
1987 (tmp2,
1988 gen_rtx_AND (V2DImode,
1989 gen_rtx_SUBREG (V2DImode, reg, 0),
1990 vec_cst)));
1992 rtx_insn *seq = get_insns ();
1993 end_sequence ();
1995 emit_insn_before (seq, insn);
1997 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
1999 else if (!MEM_P (dst) || !REG_P (src))
2000 replace_with_subreg_in_insn (insn, reg, reg);
2002 bitmap_clear_bit (conv, INSN_UID (insn));
2005 /* Skip debug insns and uninitialized uses. */
2006 else if (DF_REF_CHAIN (ref)
2007 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2009 gcc_assert (scopy);
2010 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2011 df_insn_rescan (DF_REF_INSN (ref));
2014 BITMAP_FREE (conv);
2017 /* Convert operand OP in INSN. We should handle
2018 memory operands and uninitialized registers.
2019 All other register uses are converted during
2020 registers conversion. */
2022 void
2023 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2025 *op = copy_rtx_if_shared (*op);
2027 if (GET_CODE (*op) == NOT)
2029 convert_op (&XEXP (*op, 0), insn);
2030 PUT_MODE (*op, V2DImode);
2032 else if (MEM_P (*op))
2034 rtx tmp = gen_reg_rtx (DImode);
2036 emit_insn_before (gen_move_insn (tmp, *op), insn);
2037 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2039 if (dump_file)
2040 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2041 INSN_UID (insn), REGNO (tmp));
2043 else if (REG_P (*op))
2045 /* We may not have converted the register use in case
2046 this register has no definition. Otherwise it
2047 should be converted in convert_reg. */
2048 df_ref ref;
2049 FOR_EACH_INSN_USE (ref, insn)
2050 if (DF_REF_REGNO (ref) == REGNO (*op))
2052 gcc_assert (!DF_REF_CHAIN (ref));
2053 break;
2055 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2057 else if (CONST_INT_P (*op))
2059 rtx vec_cst;
2060 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2062 /* Prefer an all-ones vector in case of -1. */
2063 if (constm1_operand (*op, GET_MODE (*op)))
2064 vec_cst = CONSTM1_RTX (V2DImode);
2065 else
2066 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2067 gen_rtvec (2, *op, const0_rtx));
2069 if (!standard_sse_constant_p (vec_cst, V2DImode))
2071 start_sequence ();
2072 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2073 rtx_insn *seq = get_insns ();
2074 end_sequence ();
2075 emit_insn_before (seq, insn);
2078 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2079 *op = tmp;
2081 else
2083 gcc_assert (SUBREG_P (*op));
2084 gcc_assert (GET_MODE (*op) == V2DImode);
2088 /* Convert INSN to vector mode. */
2090 void
2091 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2093 rtx def_set = single_set (insn);
2094 rtx src = SET_SRC (def_set);
2095 rtx dst = SET_DEST (def_set);
2096 rtx subreg;
2098 if (MEM_P (dst) && !REG_P (src))
2100 /* There are no scalar integer instructions and therefore
2101 temporary register usage is required. */
2102 rtx tmp = gen_reg_rtx (DImode);
2103 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2104 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2107 switch (GET_CODE (src))
2109 case ASHIFT:
2110 case ASHIFTRT:
2111 case LSHIFTRT:
2112 convert_op (&XEXP (src, 0), insn);
2113 PUT_MODE (src, V2DImode);
2114 break;
2116 case PLUS:
2117 case MINUS:
2118 case IOR:
2119 case XOR:
2120 case AND:
2121 convert_op (&XEXP (src, 0), insn);
2122 convert_op (&XEXP (src, 1), insn);
2123 PUT_MODE (src, V2DImode);
2124 break;
2126 case NEG:
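/* There is no V2DImode negation instruction, so negate by subtracting
   from a zero vector.  */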
2127 src = XEXP (src, 0);
2128 convert_op (&src, insn);
2129 subreg = gen_reg_rtx (V2DImode);
2130 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2131 src = gen_rtx_MINUS (V2DImode, subreg, src);
2132 break;
2134 case NOT:
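/* There is no vector one's-complement instruction, so implement NOT
   as XOR with an all-ones vector.  */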
2135 src = XEXP (src, 0);
2136 convert_op (&src, insn);
2137 subreg = gen_reg_rtx (V2DImode);
2138 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2139 src = gen_rtx_XOR (V2DImode, src, subreg);
2140 break;
2142 case MEM:
2143 if (!REG_P (dst))
2144 convert_op (&src, insn);
2145 break;
2147 case REG:
2148 if (!MEM_P (dst))
2149 convert_op (&src, insn);
2150 break;
2152 case SUBREG:
2153 gcc_assert (GET_MODE (src) == V2DImode);
2154 break;
2156 case COMPARE:
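/* A comparison is converted into a ptest of the value against itself;
   the low quadword is first duplicated into both vector halves with
   punpcklqdq so the whole register is well defined.  */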
2157 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2159 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2160 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2162 if (REG_P (src))
2163 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2164 else
2165 subreg = copy_rtx_if_shared (src);
2166 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2167 copy_rtx_if_shared (subreg),
2168 copy_rtx_if_shared (subreg)),
2169 insn);
2170 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2171 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2172 copy_rtx_if_shared (src)),
2173 UNSPEC_PTEST);
2174 break;
2176 case CONST_INT:
2177 convert_op (&src, insn);
2178 break;
2180 default:
2181 gcc_unreachable ();
2184 SET_SRC (def_set) = src;
2185 SET_DEST (def_set) = dst;
2187 /* Drop possible dead definitions. */
2188 PATTERN (insn) = def_set;
2190 INSN_CODE (insn) = -1;
2191 recog_memoized (insn);
2192 df_insn_rescan (insn);
2195 /* Fix uses of converted REG in debug insns. */
2197 void
2198 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2200 if (!flag_var_tracking)
2201 return;
2203 df_ref ref, next;
2204 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2206 rtx_insn *insn = DF_REF_INSN (ref);
2207 /* Make sure the next ref is for a different instruction,
2208 so that we're not affected by the rescan. */
2209 next = DF_REF_NEXT_REG (ref);
2210 while (next && DF_REF_INSN (next) == insn)
2211 next = DF_REF_NEXT_REG (next);
2213 if (DEBUG_INSN_P (insn))
2215 /* It may be a debug insn with a TImode variable in
2216 a register. */
2217 bool changed = false;
2218 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2220 rtx *loc = DF_REF_LOC (ref);
2221 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2223 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2224 changed = true;
2227 if (changed)
2228 df_insn_rescan (insn);
2233 /* Convert INSN from TImode to V1TImode. */
2235 void
2236 timode_scalar_chain::convert_insn (rtx_insn *insn)
2238 rtx def_set = single_set (insn);
2239 rtx src = SET_SRC (def_set);
2240 rtx dst = SET_DEST (def_set);
2242 switch (GET_CODE (dst))
2244 case REG:
2246 rtx tmp = find_reg_equal_equiv_note (insn);
2247 if (tmp)
2248 PUT_MODE (XEXP (tmp, 0), V1TImode);
2249 PUT_MODE (dst, V1TImode);
2250 fix_debug_reg_uses (dst);
2252 break;
2253 case MEM:
2254 PUT_MODE (dst, V1TImode);
2255 break;
2257 default:
2258 gcc_unreachable ();
2261 switch (GET_CODE (src))
2263 case REG:
2264 PUT_MODE (src, V1TImode);
2265 /* Call fix_debug_reg_uses only if SRC is never defined. */
2266 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2267 fix_debug_reg_uses (src);
2268 break;
2270 case MEM:
2271 PUT_MODE (src, V1TImode);
2272 break;
2274 case CONST_WIDE_INT:
2275 if (NONDEBUG_INSN_P (insn))
2277 /* Since there are no instructions to store a 128-bit constant,
2278 temporary register usage is required. */
2279 rtx tmp = gen_reg_rtx (V1TImode);
2280 start_sequence ();
2281 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2282 src = validize_mem (force_const_mem (V1TImode, src));
2283 rtx_insn *seq = get_insns ();
2284 end_sequence ();
2285 if (seq)
2286 emit_insn_before (seq, insn);
2287 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2288 dst = tmp;
2290 break;
2292 case CONST_INT:
2293 switch (standard_sse_constant_p (src, TImode))
2295 case 1:
2296 src = CONST0_RTX (GET_MODE (dst));
2297 break;
2298 case 2:
2299 src = CONSTM1_RTX (GET_MODE (dst));
2300 break;
2301 default:
2302 gcc_unreachable ();
2304 if (NONDEBUG_INSN_P (insn))
2306 rtx tmp = gen_reg_rtx (V1TImode);
2307 /* Since there are no instructions to store a standard SSE
2308 constant, temporary register usage is required. */
2309 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2310 dst = tmp;
2312 break;
2314 default:
2315 gcc_unreachable ();
2318 SET_SRC (def_set) = src;
2319 SET_DEST (def_set) = dst;
2321 /* Drop possible dead definitions. */
2322 PATTERN (insn) = def_set;
2324 INSN_CODE (insn) = -1;
2325 recog_memoized (insn);
2326 df_insn_rescan (insn);
2329 void
2330 dimode_scalar_chain::convert_registers ()
2332 bitmap_iterator bi;
2333 unsigned id;
2335 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2336 convert_reg (id);
2338 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2339 make_vector_copies (id);
2342 /* Convert whole chain creating required register
2343 conversions and copies. */
2346 scalar_chain::convert ()
2348 bitmap_iterator bi;
2349 unsigned id;
2350 int converted_insns = 0;
2352 if (!dbg_cnt (stv_conversion))
2353 return 0;
2355 if (dump_file)
2356 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2358 convert_registers ();
2360 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2362 convert_insn (DF_INSN_UID_GET (id)->insn);
2363 converted_insns++;
2366 return converted_insns;
2369 /* Main STV pass function. Find and convert scalar
2370 instructions into vector mode when profitable. */
2372 static unsigned int
2373 convert_scalars_to_vector ()
2375 basic_block bb;
2376 bitmap candidates;
2377 int converted_insns = 0;
2379 bitmap_obstack_initialize (NULL);
2380 candidates = BITMAP_ALLOC (NULL);
2382 calculate_dominance_info (CDI_DOMINATORS);
2383 df_set_flags (DF_DEFER_INSN_RESCAN);
2384 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2385 df_md_add_problem ();
2386 df_analyze ();
2388 /* Find all instructions we want to convert into vector mode. */
2389 if (dump_file)
2390 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2392 FOR_EACH_BB_FN (bb, cfun)
2394 rtx_insn *insn;
2395 FOR_BB_INSNS (bb, insn)
2396 if (scalar_to_vector_candidate_p (insn))
2398 if (dump_file)
2399 fprintf (dump_file, " insn %d is marked as a candidate\n",
2400 INSN_UID (insn));
2402 bitmap_set_bit (candidates, INSN_UID (insn));
2406 remove_non_convertible_regs (candidates);
2408 if (bitmap_empty_p (candidates))
2409 if (dump_file)
2410 fprintf (dump_file, "There are no candidates for optimization.\n");
2412 while (!bitmap_empty_p (candidates))
2414 unsigned uid = bitmap_first_set_bit (candidates);
2415 scalar_chain *chain;
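/* On 64-bit targets DImode operations are already handled natively,
   so only TImode chains are built there; 32-bit targets build DImode
   chains.  */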
2417 if (TARGET_64BIT)
2418 chain = new timode_scalar_chain;
2419 else
2420 chain = new dimode_scalar_chain;
2422 /* Find the instruction chain we want to convert to vector mode.
2423 Check all uses and definitions to estimate all required
2424 conversions. */
2425 chain->build (candidates, uid);
2427 if (chain->compute_convert_gain () > 0)
2428 converted_insns += chain->convert ();
2429 else
2430 if (dump_file)
2431 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2432 chain->chain_id);
2434 delete chain;
2437 if (dump_file)
2438 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2440 BITMAP_FREE (candidates);
2441 bitmap_obstack_release (NULL);
2442 df_process_deferred_rescans ();
2444 /* Conversion means we may have 128-bit register spills/fills
2445 which require an aligned stack. */
2446 if (converted_insns)
2448 if (crtl->stack_alignment_needed < 128)
2449 crtl->stack_alignment_needed = 128;
2450 if (crtl->stack_alignment_estimated < 128)
2451 crtl->stack_alignment_estimated = 128;
2452 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2453 if (TARGET_64BIT)
2454 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2455 parm; parm = DECL_CHAIN (parm))
2457 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2458 continue;
2459 if (DECL_RTL_SET_P (parm)
2460 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2462 rtx r = DECL_RTL (parm);
2463 if (REG_P (r))
2464 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2466 if (DECL_INCOMING_RTL (parm)
2467 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2469 rtx r = DECL_INCOMING_RTL (parm);
2470 if (REG_P (r))
2471 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2476 return 0;
2479 namespace {
2481 const pass_data pass_data_insert_vzeroupper =
2483 RTL_PASS, /* type */
2484 "vzeroupper", /* name */
2485 OPTGROUP_NONE, /* optinfo_flags */
2486 TV_MACH_DEP, /* tv_id */
2487 0, /* properties_required */
2488 0, /* properties_provided */
2489 0, /* properties_destroyed */
2490 0, /* todo_flags_start */
2491 TODO_df_finish, /* todo_flags_finish */
2494 class pass_insert_vzeroupper : public rtl_opt_pass
2496 public:
2497 pass_insert_vzeroupper(gcc::context *ctxt)
2498 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2501 /* opt_pass methods: */
2502 virtual bool gate (function *)
2504 return TARGET_AVX
2505 && TARGET_VZEROUPPER && flag_expensive_optimizations
2506 && !optimize_size;
2509 virtual unsigned int execute (function *)
2511 return rest_of_handle_insert_vzeroupper ();
2514 }; // class pass_insert_vzeroupper
2516 const pass_data pass_data_stv =
2518 RTL_PASS, /* type */
2519 "stv", /* name */
2520 OPTGROUP_NONE, /* optinfo_flags */
2521 TV_MACH_DEP, /* tv_id */
2522 0, /* properties_required */
2523 0, /* properties_provided */
2524 0, /* properties_destroyed */
2525 0, /* todo_flags_start */
2526 TODO_df_finish, /* todo_flags_finish */
2529 class pass_stv : public rtl_opt_pass
2531 public:
2532 pass_stv (gcc::context *ctxt)
2533 : rtl_opt_pass (pass_data_stv, ctxt),
2534 timode_p (false)
2537 /* opt_pass methods: */
2538 virtual bool gate (function *)
2540 return (timode_p == !!TARGET_64BIT
2541 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2544 virtual unsigned int execute (function *)
2546 return convert_scalars_to_vector ();
2549 opt_pass *clone ()
2551 return new pass_stv (m_ctxt);
2554 void set_pass_param (unsigned int n, bool param)
2556 gcc_assert (n == 0);
2557 timode_p = param;
2560 private:
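/* True for the TImode (64-bit target) variant of the pass, false for
   the DImode (32-bit target) variant; set from the pass parameter via
   set_pass_param.  */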
2561 bool timode_p;
2562 }; // class pass_stv
2564 } // anon namespace
2566 rtl_opt_pass *
2567 make_pass_insert_vzeroupper (gcc::context *ctxt)
2569 return new pass_insert_vzeroupper (ctxt);
2572 rtl_opt_pass *
2573 make_pass_stv (gcc::context *ctxt)
2575 return new pass_stv (ctxt);
2578 /* Inserting ENDBRANCH instructions. */
2580 static unsigned int
2581 rest_of_insert_endbranch (void)
2583 timevar_push (TV_MACH_DEP);
2585 rtx cet_eb;
2586 rtx_insn *insn;
2587 basic_block bb;
2589 /* Currently emit an ENDBRANCH (EB) if the function is tracked, i.e. the
2590 'nocf_check' attribute is absent from its attributes. Later an optimization
2591 will be introduced to analyze whether the address of a static function is
2592 taken. A static function whose address is not taken will get the nocf_check
2593 attribute, which will allow the number of EBs to be reduced. */
2595 if (!lookup_attribute ("nocf_check",
2596 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2597 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2599 cet_eb = gen_nop_endbr ();
2601 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2602 insn = BB_HEAD (bb);
2603 emit_insn_before (cet_eb, insn);
2606 bb = 0;
2607 FOR_EACH_BB_FN (bb, cfun)
2609 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2610 insn = NEXT_INSN (insn))
2612 if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN)
2614 if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2615 continue;
2616 /* Generate an ENDBRANCH after a CALL that may return more than
2617 once (setjmp-like functions). */
2619 /* Skip notes and debug insns that must be next to the
2620 call insn. ??? This might skip a lot more than
2621 that... ??? Skipping barriers and emitting code
2622 after them surely looks like a mistake; we probably
2623 won't ever hit it, for we'll hit BB_END first. */
2624 rtx_insn *next_insn = insn;
2625 while ((next_insn != BB_END (bb))
2626 && (DEBUG_INSN_P (NEXT_INSN (next_insn))
2627 || NOTE_P (NEXT_INSN (next_insn))
2628 || BARRIER_P (NEXT_INSN (next_insn))))
2629 next_insn = NEXT_INSN (next_insn);
2631 cet_eb = gen_nop_endbr ();
2632 emit_insn_after_setloc (cet_eb, next_insn, INSN_LOCATION (insn));
2633 continue;
2636 if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch)
2638 rtx target = JUMP_LABEL (insn);
2639 if (target == NULL_RTX || ANY_RETURN_P (target))
2640 continue;
2642 /* Check whether the jump goes through a switch table. */
2643 rtx_insn *label = as_a<rtx_insn *> (target);
2644 rtx_insn *table = next_insn (label);
2645 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2646 continue;
2648 /* For an indirect jump, find all the places it jumps to and insert
2649 an ENDBRANCH there. This is done under a special flag
2650 (flag_cet_switch) that controls ENDBRANCH generation for switch stmts. */
2651 edge_iterator ei;
2652 edge e;
2653 basic_block dest_blk;
2655 FOR_EACH_EDGE (e, ei, bb->succs)
2657 rtx_insn *insn;
2659 dest_blk = e->dest;
2660 insn = BB_HEAD (dest_blk);
2661 gcc_assert (LABEL_P (insn));
2662 cet_eb = gen_nop_endbr ();
2663 emit_insn_after (cet_eb, insn);
2665 continue;
2668 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2669 || (NOTE_P (insn)
2670 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2671 /* TODO. Check /s bit also. */
2673 cet_eb = gen_nop_endbr ();
2674 emit_insn_after (cet_eb, insn);
2675 continue;
2680 timevar_pop (TV_MACH_DEP);
2681 return 0;
2684 namespace {
2686 const pass_data pass_data_insert_endbranch =
2688 RTL_PASS, /* type. */
2689 "cet", /* name. */
2690 OPTGROUP_NONE, /* optinfo_flags. */
2691 TV_MACH_DEP, /* tv_id. */
2692 0, /* properties_required. */
2693 0, /* properties_provided. */
2694 0, /* properties_destroyed. */
2695 0, /* todo_flags_start. */
2696 0, /* todo_flags_finish. */
2699 class pass_insert_endbranch : public rtl_opt_pass
2701 public:
2702 pass_insert_endbranch (gcc::context *ctxt)
2703 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2706 /* opt_pass methods: */
2707 virtual bool gate (function *)
2709 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2712 virtual unsigned int execute (function *)
2714 return rest_of_insert_endbranch ();
2717 }; // class pass_insert_endbranch
2719 } // anon namespace
2721 rtl_opt_pass *
2722 make_pass_insert_endbranch (gcc::context *ctxt)
2724 return new pass_insert_endbranch (ctxt);
2727 /* Return true if a red-zone is in use. */
2729 bool
2730 ix86_using_red_zone (void)
2732 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2735 /* Return a string that documents the current -m options. The caller is
2736 responsible for freeing the string. */
2738 static char *
2739 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2740 int flags, int flags2,
2741 const char *arch, const char *tune,
2742 enum fpmath_unit fpmath, bool add_nl_p)
2744 struct ix86_target_opts
2746 const char *option; /* option string */
2747 HOST_WIDE_INT mask; /* isa mask options */
2750 /* This table is ordered so that options like -msse4.2 that imply other
2751 ISAs come first. The target string will be displayed in the same order. */
2752 static struct ix86_target_opts isa2_opts[] =
2754 { "-mcx16", OPTION_MASK_ISA_CX16 },
2755 { "-mmpx", OPTION_MASK_ISA_MPX },
2756 { "-mvaes", OPTION_MASK_ISA_VAES },
2757 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2758 { "-msgx", OPTION_MASK_ISA_SGX },
2759 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2760 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2761 { "-mibt", OPTION_MASK_ISA_IBT },
2762 { "-mhle", OPTION_MASK_ISA_HLE },
2763 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2764 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2765 { "-mmwaitx", OPTION_MASK_ISA_MWAITX }
2767 static struct ix86_target_opts isa_opts[] =
2769 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2770 { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
2771 { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ },
2772 { "-mgfni", OPTION_MASK_ISA_GFNI },
2773 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2774 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2775 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2776 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2777 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2778 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2779 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2780 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2781 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2782 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2783 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2784 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2785 { "-mfma", OPTION_MASK_ISA_FMA },
2786 { "-mxop", OPTION_MASK_ISA_XOP },
2787 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2788 { "-mf16c", OPTION_MASK_ISA_F16C },
2789 { "-mavx", OPTION_MASK_ISA_AVX },
2790 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2791 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2792 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2793 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2794 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2795 { "-msse3", OPTION_MASK_ISA_SSE3 },
2796 { "-maes", OPTION_MASK_ISA_AES },
2797 { "-msha", OPTION_MASK_ISA_SHA },
2798 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2799 { "-msse2", OPTION_MASK_ISA_SSE2 },
2800 { "-msse", OPTION_MASK_ISA_SSE },
2801 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2802 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2803 { "-mmmx", OPTION_MASK_ISA_MMX },
2804 { "-mrtm", OPTION_MASK_ISA_RTM },
2805 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2806 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2807 { "-madx", OPTION_MASK_ISA_ADX },
2808 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2809 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2810 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2811 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2812 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2813 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2814 { "-mabm", OPTION_MASK_ISA_ABM },
2815 { "-mbmi", OPTION_MASK_ISA_BMI },
2816 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2817 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2818 { "-mtbm", OPTION_MASK_ISA_TBM },
2819 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2820 { "-msahf", OPTION_MASK_ISA_SAHF },
2821 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2822 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2823 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2824 { "-mpku", OPTION_MASK_ISA_PKU },
2825 { "-mlwp", OPTION_MASK_ISA_LWP },
2826 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2827 { "-mclwb", OPTION_MASK_ISA_CLWB },
2828 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2831 /* Flag options. */
2832 static struct ix86_target_opts flag_opts[] =
2834 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2835 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2836 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2837 { "-m80387", MASK_80387 },
2838 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2839 { "-malign-double", MASK_ALIGN_DOUBLE },
2840 { "-mcld", MASK_CLD },
2841 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2842 { "-mieee-fp", MASK_IEEE_FP },
2843 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2844 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2845 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2846 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2847 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2848 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2849 { "-mno-red-zone", MASK_NO_RED_ZONE },
2850 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2851 { "-mrecip", MASK_RECIP },
2852 { "-mrtd", MASK_RTD },
2853 { "-msseregparm", MASK_SSEREGPARM },
2854 { "-mstack-arg-probe", MASK_STACK_PROBE },
2855 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2856 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2857 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2858 { "-mvzeroupper", MASK_VZEROUPPER },
2859 { "-mstv", MASK_STV },
2860 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2861 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2862 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2865 /* Additional flag options. */
2866 static struct ix86_target_opts flag2_opts[] =
2868 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2871 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2872 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2874 char isa_other[40];
2875 char isa2_other[40];
2876 char flags_other[40];
2877 char flags2_other[40];
2878 unsigned num = 0;
2879 unsigned i, j;
2880 char *ret;
2881 char *ptr;
2882 size_t len;
2883 size_t line_len;
2884 size_t sep_len;
2885 const char *abi;
2887 memset (opts, '\0', sizeof (opts));
2889 /* Add -march= option. */
2890 if (arch)
2892 opts[num][0] = "-march=";
2893 opts[num++][1] = arch;
2896 /* Add -mtune= option. */
2897 if (tune)
2899 opts[num][0] = "-mtune=";
2900 opts[num++][1] = tune;
2903 /* Add -m32/-m64/-mx32. */
2904 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2906 if ((isa & OPTION_MASK_ABI_64) != 0)
2907 abi = "-m64";
2908 else
2909 abi = "-mx32";
2910 isa &= ~ (OPTION_MASK_ISA_64BIT
2911 | OPTION_MASK_ABI_64
2912 | OPTION_MASK_ABI_X32);
2914 else
2915 abi = "-m32";
2916 opts[num++][0] = abi;
2918 /* Pick out the options set in the isa2 flags. */
2919 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2921 if ((isa2 & isa2_opts[i].mask) != 0)
2923 opts[num++][0] = isa2_opts[i].option;
2924 isa2 &= ~ isa2_opts[i].mask;
2928 if (isa2 && add_nl_p)
2930 opts[num++][0] = isa2_other;
2931 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2934 /* Pick out the options set in the isa flags. */
2935 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2937 if ((isa & isa_opts[i].mask) != 0)
2939 opts[num++][0] = isa_opts[i].option;
2940 isa &= ~ isa_opts[i].mask;
2944 if (isa && add_nl_p)
2946 opts[num++][0] = isa_other;
2947 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2950 /* Add flag options. */
2951 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2953 if ((flags & flag_opts[i].mask) != 0)
2955 opts[num++][0] = flag_opts[i].option;
2956 flags &= ~ flag_opts[i].mask;
2960 if (flags && add_nl_p)
2962 opts[num++][0] = flags_other;
2963 sprintf (flags_other, "(other flags: %#x)", flags);
2966 /* Add additional flag options. */
2967 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2969 if ((flags2 & flag2_opts[i].mask) != 0)
2971 opts[num++][0] = flag2_opts[i].option;
2972 flags2 &= ~ flag2_opts[i].mask;
2976 if (flags2 && add_nl_p)
2978 opts[num++][0] = flags2_other;
2979 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2982 /* Add -fpmath= option. */
2983 if (fpmath)
2985 opts[num][0] = "-mfpmath=";
2986 switch ((int) fpmath)
2988 case FPMATH_387:
2989 opts[num++][1] = "387";
2990 break;
2992 case FPMATH_SSE:
2993 opts[num++][1] = "sse";
2994 break;
2996 case FPMATH_387 | FPMATH_SSE:
2997 opts[num++][1] = "sse+387";
2998 break;
3000 default:
3001 gcc_unreachable ();
3005 /* Any options? */
3006 if (num == 0)
3007 return NULL;
3009 gcc_assert (num < ARRAY_SIZE (opts));
3011 /* Size the string. */
3012 len = 0;
3013 sep_len = (add_nl_p) ? 3 : 1;
3014 for (i = 0; i < num; i++)
3016 len += sep_len;
3017 for (j = 0; j < 2; j++)
3018 if (opts[i][j])
3019 len += strlen (opts[i][j]);
3022 /* Build the string. */
3023 ret = ptr = (char *) xmalloc (len);
3024 line_len = 0;
3026 for (i = 0; i < num; i++)
3028 size_t len2[2];
3030 for (j = 0; j < 2; j++)
3031 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3033 if (i != 0)
3035 *ptr++ = ' ';
3036 line_len++;
3038 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3040 *ptr++ = '\\';
3041 *ptr++ = '\n';
3042 line_len = 0;
3046 for (j = 0; j < 2; j++)
3047 if (opts[i][j])
3049 memcpy (ptr, opts[i][j], len2[j]);
3050 ptr += len2[j];
3051 line_len += len2[j];
3055 *ptr = '\0';
3056 gcc_assert (ret + len >= ptr);
3058 return ret;
3061 /* Return true if profiling code should be emitted before the
3062 prologue; otherwise return false.
3063 Note: for x86 the "hotfix" case is diagnosed with sorry (). */
3064 static bool
3065 ix86_profile_before_prologue (void)
3067 return flag_fentry != 0;
3070 /* Function that is callable from the debugger to print the current
3071 options. */
3072 void ATTRIBUTE_UNUSED
3073 ix86_debug_options (void)
3075 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3076 target_flags, ix86_target_flags,
3077 ix86_arch_string, ix86_tune_string,
3078 ix86_fpmath, true);
3080 if (opts)
3082 fprintf (stderr, "%s\n\n", opts);
3083 free (opts);
3085 else
3086 fputs ("<no options>\n\n", stderr);
3088 return;
3091 /* Return true if T is one of the bytes we should avoid with
3092 -mmitigate-rop. */
3094 static bool
3095 ix86_rop_should_change_byte_p (int t)
3097 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3100 static const char *stringop_alg_names[] = {
3101 #define DEF_ENUM
3102 #define DEF_ALG(alg, name) #name,
3103 #include "stringop.def"
3104 #undef DEF_ENUM
3105 #undef DEF_ALG
3108 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3109 The string is of the following form (or a comma-separated list of such entries):
3111 strategy_alg:max_size:[align|noalign]
3113 where the full size range for the strategy is either [0, max_size] or
3114 [min_size, max_size], where min_size is one more than the max_size of the
3115 preceding range. The last size range must have max_size == -1.
3117 Examples:
3120 -mmemcpy-strategy=libcall:-1:noalign
3122 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3126 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3128 This tells the compiler to use the following strategy for memset:
3129 1) when the expected size is between [1, 16], use rep_8byte strategy;
3130 2) when the size is between [17, 2048], use vector_loop;
3131 3) when the size is > 2048, use libcall. */
3133 struct stringop_size_range
3135 int max;
3136 stringop_alg alg;
3137 bool noalign;
3140 static void
3141 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3143 const struct stringop_algs *default_algs;
3144 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3145 char *curr_range_str, *next_range_str;
3146 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3147 int i = 0, n = 0;
3149 if (is_memset)
3150 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3151 else
3152 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3154 curr_range_str = strategy_str;
3158 int maxs;
3159 char alg_name[128];
3160 char align[16];
3161 next_range_str = strchr (curr_range_str, ',');
3162 if (next_range_str)
3163 *next_range_str++ = '\0';
3165 if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
3166 align) != 3)
3168 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3169 return;
3172 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3174 error ("size ranges of option %qs should be increasing", opt);
3175 return;
3178 for (i = 0; i < last_alg; i++)
3179 if (!strcmp (alg_name, stringop_alg_names[i]))
3180 break;
3182 if (i == last_alg)
3184 error ("wrong strategy name %qs specified for option %qs",
3185 alg_name, opt);
3187 auto_vec <const char *> candidates;
3188 for (i = 0; i < last_alg; i++)
3189 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3190 candidates.safe_push (stringop_alg_names[i]);
3192 char *s;
3193 const char *hint
3194 = candidates_list_and_hint (alg_name, s, candidates);
3195 if (hint)
3196 inform (input_location,
3197 "valid arguments to %qs are: %s; did you mean %qs?",
3198 opt, s, hint);
3199 else
3200 inform (input_location, "valid arguments to %qs are: %s",
3201 opt, s);
3202 XDELETEVEC (s);
3203 return;
3206 if ((stringop_alg) i == rep_prefix_8_byte
3207 && !TARGET_64BIT)
3209 /* rep; movq isn't available in 32-bit code. */
3210 error ("strategy name %qs specified for option %qs "
3211 "not supported for 32-bit code", alg_name, opt);
3212 return;
3215 input_ranges[n].max = maxs;
3216 input_ranges[n].alg = (stringop_alg) i;
3217 if (!strcmp (align, "align"))
3218 input_ranges[n].noalign = false;
3219 else if (!strcmp (align, "noalign"))
3220 input_ranges[n].noalign = true;
3221 else
3223 error ("unknown alignment %qs specified for option %qs", align, opt);
3224 return;
3226 n++;
3227 curr_range_str = next_range_str;
3229 while (curr_range_str);
3231 if (input_ranges[n - 1].max != -1)
3233 error ("the max value for the last size range should be -1"
3234 " for option %qs", opt);
3235 return;
3238 if (n > MAX_STRINGOP_ALGS)
3240 error ("too many size ranges specified in option %qs", opt);
3241 return;
3244 /* Now override the default algs array. */
3245 for (i = 0; i < n; i++)
3247 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3248 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3249 = input_ranges[i].alg;
3250 *const_cast<int *>(&default_algs->size[i].noalign)
3251 = input_ranges[i].noalign;
3256 /* Parse the -mtune-ctrl= option. When DUMP is true,
3257 print the features that are explicitly set. */
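/* For example, -mtune-ctrl=^sse_typeless_stores clears a feature while a
   name without the '^' prefix sets it; the names are assumed to be those
   defined in x86-tune.def.  */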
3259 static void
3260 parse_mtune_ctrl_str (bool dump)
3262 if (!ix86_tune_ctrl_string)
3263 return;
3265 char *next_feature_string = NULL;
3266 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3267 char *orig = curr_feature_string;
3268 int i;
3271 bool clear = false;
3273 next_feature_string = strchr (curr_feature_string, ',');
3274 if (next_feature_string)
3275 *next_feature_string++ = '\0';
3276 if (*curr_feature_string == '^')
3278 curr_feature_string++;
3279 clear = true;
3281 for (i = 0; i < X86_TUNE_LAST; i++)
3283 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3285 ix86_tune_features[i] = !clear;
3286 if (dump)
3287 fprintf (stderr, "Explicitly %s feature %s\n",
3288 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3289 break;
3292 if (i == X86_TUNE_LAST)
3293 error ("unknown parameter to option -mtune-ctrl: %s",
3294 clear ? curr_feature_string - 1 : curr_feature_string);
3295 curr_feature_string = next_feature_string;
3297 while (curr_feature_string);
3298 free (orig);
3301 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3302 processor type. */
3304 static void
3305 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3307 unsigned int ix86_tune_mask = 1u << ix86_tune;
3308 int i;
3310 for (i = 0; i < X86_TUNE_LAST; ++i)
3312 if (ix86_tune_no_default)
3313 ix86_tune_features[i] = 0;
3314 else
3315 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3318 if (dump)
3320 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3321 for (i = 0; i < X86_TUNE_LAST; i++)
3322 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3323 ix86_tune_features[i] ? "on" : "off");
3326 parse_mtune_ctrl_str (dump);
3330 /* Default align_* from the processor table. */
3332 static void
3333 ix86_default_align (struct gcc_options *opts)
3335 if (opts->x_align_loops == 0)
3337 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3338 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3340 if (opts->x_align_jumps == 0)
3342 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3343 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3345 if (opts->x_align_functions == 0)
3347 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3351 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3353 static void
3354 ix86_override_options_after_change (void)
3356 ix86_default_align (&global_options);
3359 /* Override various settings based on options. If MAIN_ARGS_P, the
3360 options are from the command line, otherwise they are from
3361 attributes. Return true if there's an error related to the
3362 -march option. */
3364 static bool
3365 ix86_option_override_internal (bool main_args_p,
3366 struct gcc_options *opts,
3367 struct gcc_options *opts_set)
3369 int i;
3370 unsigned int ix86_arch_mask;
3371 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3373 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3374 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3375 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3376 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3377 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3378 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3379 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3380 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3381 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3382 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3383 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3384 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3385 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3386 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3387 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3388 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3389 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3390 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3391 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3392 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3393 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3394 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3395 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3396 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3397 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3398 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3399 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3400 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3401 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3402 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3403 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3404 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3405 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3406 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3407 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3408 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3409 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3410 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3411 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3412 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3413 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3414 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3415 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3416 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3417 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3418 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
3419 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
3420 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
3421 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
3422 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
3423 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
3424 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
3425 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
3426 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
3427 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
3428 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
3429 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
3430 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
3431 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
3432 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
3433 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
3434 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
3435 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
3436 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
3438 #define PTA_CORE2 \
3439 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
3440 | PTA_CX16 | PTA_FXSR)
3441 #define PTA_NEHALEM \
3442 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
3443 #define PTA_WESTMERE \
3444 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
3445 #define PTA_SANDYBRIDGE \
3446 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
3447 #define PTA_IVYBRIDGE \
3448 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
3449 #define PTA_HASWELL \
3450 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
3451 | PTA_FMA | PTA_MOVBE | PTA_HLE)
3452 #define PTA_BROADWELL \
3453 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
3454 #define PTA_SKYLAKE \
3455 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
3456 #define PTA_SKYLAKE_AVX512 \
3457 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
3458 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU | PTA_CLWB)
3459 #define PTA_CANNONLAKE \
3460 (PTA_SKYLAKE_AVX512 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA)
3461 #define PTA_KNL \
3462 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
3463 #define PTA_BONNELL \
3464 (PTA_CORE2 | PTA_MOVBE)
3465 #define PTA_SILVERMONT \
3466 (PTA_WESTMERE | PTA_MOVBE)
3467 #define PTA_KNM \
3468 (PTA_KNL | PTA_AVX5124VNNIW | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ)
3470 /* If this reaches 64, we need to widen the struct pta flags field below. */
3472 static struct pta
3474 const char *const name; /* processor name or nickname. */
3475 const enum processor_type processor;
3476 const enum attr_cpu schedule;
3477 const unsigned HOST_WIDE_INT flags;
3479 const processor_alias_table[] =
3481 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3482 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3483 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3484 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3485 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3486 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3487 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3488 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3489 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3490 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3491 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3492 PTA_MMX | PTA_SSE | PTA_FXSR},
3493 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3494 PTA_MMX | PTA_SSE | PTA_FXSR},
3495 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3496 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3497 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3498 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3499 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3500 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3501 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3502 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3503 PTA_MMX | PTA_SSE | PTA_FXSR},
3504 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3505 PTA_MMX | PTA_SSE | PTA_FXSR},
3506 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3507 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3508 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3509 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3510 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3511 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3512 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3513 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3514 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3515 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3516 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3517 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3518 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3519 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3520 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3521 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3522 PTA_SANDYBRIDGE},
3523 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3524 PTA_SANDYBRIDGE},
3525 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3526 PTA_IVYBRIDGE},
3527 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3528 PTA_IVYBRIDGE},
3529 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3530 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3531 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3532 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3533 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3534 PTA_SKYLAKE_AVX512},
3535 {"cannonlake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_CANNONLAKE},
3536 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3537 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3538 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3539 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3540 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3541 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3542 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3543 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3544 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3545 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3546 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3547 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3548 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3549 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3550 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3551 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3552 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3553 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3554 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3555 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3556 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3557 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3558 {"x86-64", PROCESSOR_K8, CPU_K8,
3559 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3560 {"eden-x2", PROCESSOR_K8, CPU_K8,
3561 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3562 {"nano", PROCESSOR_K8, CPU_K8,
3563 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3564 | PTA_SSSE3 | PTA_FXSR},
3565 {"nano-1000", PROCESSOR_K8, CPU_K8,
3566 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3567 | PTA_SSSE3 | PTA_FXSR},
3568 {"nano-2000", PROCESSOR_K8, CPU_K8,
3569 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3570 | PTA_SSSE3 | PTA_FXSR},
3571 {"nano-3000", PROCESSOR_K8, CPU_K8,
3572 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3573 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3574 {"nano-x2", PROCESSOR_K8, CPU_K8,
3575 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3576 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3577 {"eden-x4", PROCESSOR_K8, CPU_K8,
3578 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3579 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3580 {"nano-x4", PROCESSOR_K8, CPU_K8,
3581 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3582 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3583 {"k8", PROCESSOR_K8, CPU_K8,
3584 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3585 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3586 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3587 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3588 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3589 {"opteron", PROCESSOR_K8, CPU_K8,
3590 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3591 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3592 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3593 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3594 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3595 {"athlon64", PROCESSOR_K8, CPU_K8,
3596 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3597 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3598 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3599 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3600 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3601 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3602 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3603 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3604 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3605 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3606 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3607 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3608 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3609 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3610 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3611 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3612 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3613 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3614 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3615 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3616 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3617 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3618 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3619 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3620 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3621 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3622 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3623 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3624 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3625 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3626 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3627 | PTA_XSAVEOPT | PTA_FSGSBASE},
3628 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3629 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3630 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3631 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3632 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3633 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3634 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3635 | PTA_MOVBE | PTA_MWAITX},
3636 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3637 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3638 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3639 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3640 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3641 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3642 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3643 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3644 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3645 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3646 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3647 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3648 | PTA_FXSR | PTA_XSAVE},
3649 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3650 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3651 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3652 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3653 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3654 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3656 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3657 PTA_64BIT
3658 | PTA_HLE /* flags are only used for -march switch. */ },
3661 /* -mrecip options. */
3662 static struct
3664 const char *string; /* option name */
3665 unsigned int mask; /* mask bits to set */
3667 const recip_options[] =
3669 { "all", RECIP_MASK_ALL },
3670 { "none", RECIP_MASK_NONE },
3671 { "div", RECIP_MASK_DIV },
3672 { "sqrt", RECIP_MASK_SQRT },
3673 { "vec-div", RECIP_MASK_VEC_DIV },
3674 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3677 int const pta_size = ARRAY_SIZE (processor_alias_table);
3679 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3680 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3681 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3682 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3683 #ifdef TARGET_BI_ARCH
3684 else
3686 #if TARGET_BI_ARCH == 1
3687 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3688 is on and OPTION_MASK_ABI_X32 is off. We turn off
3689 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3690 -mx32. */
3691 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3692 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3693 #else
3694 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3695 on and OPTION_MASK_ABI_64 is off. We turn off
3696 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3697 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3698 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3699 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3700 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3701 #endif
3702 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3703 && TARGET_IAMCU_P (opts->x_target_flags))
3704 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3705 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3707 #endif
3709 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3711 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3712 OPTION_MASK_ABI_64 for TARGET_X32. */
3713 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3714 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3716 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3717 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3718 | OPTION_MASK_ABI_X32
3719 | OPTION_MASK_ABI_64);
3720 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3722 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3723 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3724 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3725 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3728 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3729 SUBTARGET_OVERRIDE_OPTIONS;
3730 #endif
3732 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3733 SUBSUBTARGET_OVERRIDE_OPTIONS;
3734 #endif
3736 /* -fPIC is the default for x86_64. */
3737 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3738 opts->x_flag_pic = 2;
3740 /* Need to check -mtune=generic first. */
3741 if (opts->x_ix86_tune_string)
3743 /* As special support for cross compilers we read -mtune=native
3744 as -mtune=generic. With native compilers we won't see
3745 -mtune=native, as it will have been changed by the driver. */
3746 if (!strcmp (opts->x_ix86_tune_string, "native"))
3748 opts->x_ix86_tune_string = "generic";
3750 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3751 warning (OPT_Wdeprecated,
3752 main_args_p
3753 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3754 "or %<-mtune=generic%> instead as appropriate")
3755 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3756 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3757 " instead as appropriate"));
3759 else
3761 if (opts->x_ix86_arch_string)
3762 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3763 if (!opts->x_ix86_tune_string)
3765 opts->x_ix86_tune_string
3766 = processor_target_table[TARGET_CPU_DEFAULT].name;
3767 ix86_tune_defaulted = 1;
3770 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3771 or defaulted. We need to use a sensible tune option. */
3772 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3774 opts->x_ix86_tune_string = "generic";
3778 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3779 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3781 /* rep; movq isn't available in 32-bit code. */
3782 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3783 opts->x_ix86_stringop_alg = no_stringop;
3786 if (!opts->x_ix86_arch_string)
3787 opts->x_ix86_arch_string
3788 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3789 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3790 else
3791 ix86_arch_specified = 1;
3793 if (opts_set->x_ix86_pmode)
3795 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3796 && opts->x_ix86_pmode == PMODE_SI)
3797 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3798 && opts->x_ix86_pmode == PMODE_DI))
3799 error ("address mode %qs not supported in the %s bit mode",
3800 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3801 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3803 else
3804 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3805 ? PMODE_DI : PMODE_SI;
3807 if (!opts_set->x_ix86_abi)
3808 opts->x_ix86_abi = DEFAULT_ABI;
3810 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3811 error ("-mabi=ms not supported with X32 ABI");
3812 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3814 /* For targets using the MS ABI enable ms-extensions, if not
3815 explicitly turned off. For the non-MS ABI we turn this
3816 option off. */
3817 if (!opts_set->x_flag_ms_extensions)
3818 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3820 if (opts_set->x_ix86_cmodel)
3822 switch (opts->x_ix86_cmodel)
3824 case CM_SMALL:
3825 case CM_SMALL_PIC:
3826 if (opts->x_flag_pic)
3827 opts->x_ix86_cmodel = CM_SMALL_PIC;
3828 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3829 error ("code model %qs not supported in the %s bit mode",
3830 "small", "32");
3831 break;
3833 case CM_MEDIUM:
3834 case CM_MEDIUM_PIC:
3835 if (opts->x_flag_pic)
3836 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3837 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3838 error ("code model %qs not supported in the %s bit mode",
3839 "medium", "32");
3840 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3841 error ("code model %qs not supported in x32 mode",
3842 "medium");
3843 break;
3845 case CM_LARGE:
3846 case CM_LARGE_PIC:
3847 if (opts->x_flag_pic)
3848 opts->x_ix86_cmodel = CM_LARGE_PIC;
3849 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3850 error ("code model %qs not supported in the %s bit mode",
3851 "large", "32");
3852 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3853 error ("code model %qs not supported in x32 mode",
3854 "large");
3855 break;
3857 case CM_32:
3858 if (opts->x_flag_pic)
3859 error ("code model %s does not support PIC mode", "32");
3860 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3861 error ("code model %qs not supported in the %s bit mode",
3862 "32", "64");
3863 break;
3865 case CM_KERNEL:
3866 if (opts->x_flag_pic)
3868 error ("code model %s does not support PIC mode", "kernel");
3869 opts->x_ix86_cmodel = CM_32;
3871 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3872 error ("code model %qs not supported in the %s bit mode",
3873 "kernel", "32");
3874 break;
3876 default:
3877 gcc_unreachable ();
3880 else
3882 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3883 use of rip-relative addressing. This eliminates fixups that
3884 would otherwise be needed if this object is to be placed in a
3885 DLL, and is essentially just as efficient as direct addressing. */
3886 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3887 && (TARGET_RDOS || TARGET_PECOFF))
3888 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3889 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3890 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3891 else
3892 opts->x_ix86_cmodel = CM_32;
3894 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3896 error ("-masm=intel not supported in this configuration");
3897 opts->x_ix86_asm_dialect = ASM_ATT;
3899 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3900 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3901 sorry ("%i-bit mode not compiled in",
3902 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3904 for (i = 0; i < pta_size; i++)
3905 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3907 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3909 error (main_args_p
3910 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3911 "switch")
3912 : G_("%<generic%> CPU can be used only for "
3913 "%<target(\"tune=\")%> attribute"));
3914 return false;
3916 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3918 error (main_args_p
3919 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3920 "switch")
3921 : G_("%<intel%> CPU can be used only for "
3922 "%<target(\"tune=\")%> attribute"));
3923 return false;
3926 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3927 && !(processor_alias_table[i].flags & PTA_64BIT))
3929 error ("CPU you selected does not support x86-64 "
3930 "instruction set");
3931 return false;
3934 ix86_schedule = processor_alias_table[i].schedule;
3935 ix86_arch = processor_alias_table[i].processor;
3936 /* Default cpu tuning to the architecture. */
3937 ix86_tune = ix86_arch;
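/* The PTA_* checks below copy the ISA features implied by the selected
-march= entry into the ISA flags, but never override a flag the user has
set explicitly on the command line. */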
3939 if (processor_alias_table[i].flags & PTA_MMX
3940 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3941 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3942 if (processor_alias_table[i].flags & PTA_3DNOW
3943 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3944 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3945 if (processor_alias_table[i].flags & PTA_3DNOW_A
3946 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3947 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3948 if (processor_alias_table[i].flags & PTA_SSE
3949 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3950 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3951 if (processor_alias_table[i].flags & PTA_SSE2
3952 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3953 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3954 if (processor_alias_table[i].flags & PTA_SSE3
3955 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3956 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3957 if (processor_alias_table[i].flags & PTA_SSSE3
3958 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3959 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3960 if (processor_alias_table[i].flags & PTA_SSE4_1
3961 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3962 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3963 if (processor_alias_table[i].flags & PTA_SSE4_2
3964 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3965 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3966 if (processor_alias_table[i].flags & PTA_AVX
3967 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3968 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3969 if (processor_alias_table[i].flags & PTA_AVX2
3970 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3971 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3972 if (processor_alias_table[i].flags & PTA_FMA
3973 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3974 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3975 if (processor_alias_table[i].flags & PTA_SSE4A
3976 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3977 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3978 if (processor_alias_table[i].flags & PTA_FMA4
3979 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3980 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3981 if (processor_alias_table[i].flags & PTA_XOP
3982 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3983 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3984 if (processor_alias_table[i].flags & PTA_LWP
3985 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3986 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3987 if (processor_alias_table[i].flags & PTA_ABM
3988 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3989 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3990 if (processor_alias_table[i].flags & PTA_BMI
3991 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3992 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3993 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3994 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3995 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3996 if (processor_alias_table[i].flags & PTA_TBM
3997 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3998 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3999 if (processor_alias_table[i].flags & PTA_BMI2
4000 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
4001 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
4002 if (processor_alias_table[i].flags & PTA_CX16
4003 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
4004 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
4005 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
4006 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4007 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4008 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4009 && (processor_alias_table[i].flags & PTA_NO_SAHF))
4010 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4011 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4012 if (processor_alias_table[i].flags & PTA_MOVBE
4013 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
4014 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
4015 if (processor_alias_table[i].flags & PTA_AES
4016 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4017 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
4018 if (processor_alias_table[i].flags & PTA_SHA
4019 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4020 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4021 if (processor_alias_table[i].flags & PTA_PCLMUL
4022 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4023 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4024 if (processor_alias_table[i].flags & PTA_FSGSBASE
4025 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4026 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4027 if (processor_alias_table[i].flags & PTA_RDRND
4028 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4029 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4030 if (processor_alias_table[i].flags & PTA_F16C
4031 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4032 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4033 if (processor_alias_table[i].flags & PTA_RTM
4034 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4035 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4036 if (processor_alias_table[i].flags & PTA_HLE
4037 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
4038 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
4039 if (processor_alias_table[i].flags & PTA_PRFCHW
4040 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4041 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4042 if (processor_alias_table[i].flags & PTA_RDSEED
4043 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4044 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4045 if (processor_alias_table[i].flags & PTA_ADX
4046 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4047 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4048 if (processor_alias_table[i].flags & PTA_FXSR
4049 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4050 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4051 if (processor_alias_table[i].flags & PTA_XSAVE
4052 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4053 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4054 if (processor_alias_table[i].flags & PTA_XSAVEOPT
4055 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4056 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4057 if (processor_alias_table[i].flags & PTA_AVX512F
4058 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4059 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4060 if (processor_alias_table[i].flags & PTA_AVX512ER
4061 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4062 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4063 if (processor_alias_table[i].flags & PTA_AVX512PF
4064 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4065 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4066 if (processor_alias_table[i].flags & PTA_AVX512CD
4067 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4068 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4069 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
4070 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4071 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4072 if (processor_alias_table[i].flags & PTA_CLWB
4073 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4074 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4075 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
4076 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4077 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4078 if (processor_alias_table[i].flags & PTA_CLZERO
4079 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
4080 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
4081 if (processor_alias_table[i].flags & PTA_XSAVEC
4082 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4083 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4084 if (processor_alias_table[i].flags & PTA_XSAVES
4085 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4086 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4087 if (processor_alias_table[i].flags & PTA_AVX512DQ
4088 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4089 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4090 if (processor_alias_table[i].flags & PTA_AVX512BW
4091 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4092 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4093 if (processor_alias_table[i].flags & PTA_AVX512VL
4094 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4095 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4096 if (processor_alias_table[i].flags & PTA_MPX
4097 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4098 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4099 if (processor_alias_table[i].flags & PTA_AVX512VBMI
4100 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4101 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4102 if (processor_alias_table[i].flags & PTA_AVX512IFMA
4103 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4104 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4106 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
4107 && !(opts->x_ix86_isa_flags2_explicit
4108 & OPTION_MASK_ISA_AVX5124VNNIW))
4109 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4110 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
4111 && !(opts->x_ix86_isa_flags2_explicit
4112 & OPTION_MASK_ISA_AVX5124FMAPS))
4113 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4114 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
4115 && !(opts->x_ix86_isa_flags_explicit
4116 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4117 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4118 if (processor_alias_table[i].flags & PTA_SGX
4119 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4120 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4122 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
4123 x86_prefetch_sse = true;
4124 if (processor_alias_table[i].flags & PTA_MWAITX
4125 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
4126 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
4127 if (processor_alias_table[i].flags & PTA_PKU
4128 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4129 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4131 /* Don't enable x87 instructions if only
4132 general registers are allowed. */
4133 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4134 && !(opts_set->x_target_flags & MASK_80387))
4136 if (processor_alias_table[i].flags & PTA_NO_80387)
4137 opts->x_target_flags &= ~MASK_80387;
4138 else
4139 opts->x_target_flags |= MASK_80387;
4141 break;
4144 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4145 error ("Intel MPX does not support x32");
4150 if (i == pta_size)
4152 error (main_args_p
4153 ? G_("bad value (%qs) for %<-march=%> switch")
4154 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4155 opts->x_ix86_arch_string);
4157 auto_vec <const char *> candidates;
4158 for (i = 0; i < pta_size; i++)
4159 if (strcmp (processor_alias_table[i].name, "generic")
4160 && strcmp (processor_alias_table[i].name, "intel")
4161 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4162 || (processor_alias_table[i].flags & PTA_64BIT)))
4163 candidates.safe_push (processor_alias_table[i].name);
4165 char *s;
4166 const char *hint
4167 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4168 if (hint)
4169 inform (input_location,
4170 main_args_p
4171 ? G_("valid arguments to %<-march=%> switch are: "
4172 "%s; did you mean %qs?")
4173 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4174 "%s; did you mean %qs?"), s, hint);
4175 else
4176 inform (input_location,
4177 main_args_p
4178 ? G_("valid arguments to %<-march=%> switch are: %s")
4179 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4180 "are: %s"), s);
4181 XDELETEVEC (s);
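/* Compute the per-architecture feature tests: ix86_arch_features[i] is set
when the selected architecture's bit appears in initial_ix86_arch_features[i]. */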
4184 ix86_arch_mask = 1u << ix86_arch;
4185 for (i = 0; i < X86_ARCH_LAST; ++i)
4186 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4188 for (i = 0; i < pta_size; i++)
4189 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4191 ix86_schedule = processor_alias_table[i].schedule;
4192 ix86_tune = processor_alias_table[i].processor;
4193 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4195 if (!(processor_alias_table[i].flags & PTA_64BIT))
4197 if (ix86_tune_defaulted)
4199 opts->x_ix86_tune_string = "x86-64";
4200 for (i = 0; i < pta_size; i++)
4201 if (! strcmp (opts->x_ix86_tune_string,
4202 processor_alias_table[i].name))
4203 break;
4204 ix86_schedule = processor_alias_table[i].schedule;
4205 ix86_tune = processor_alias_table[i].processor;
4207 else
4208 error ("CPU you selected does not support x86-64 "
4209 "instruction set");
4212 /* Intel CPUs have always interpreted SSE prefetch instructions as
4213 NOPs; so, we can enable SSE prefetch instructions even when
4214 -mtune (rather than -march) points us to a processor that has them.
4215 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4216 higher processors. */
4217 if (TARGET_CMOV
4218 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
4219 x86_prefetch_sse = true;
4220 break;
4223 if (ix86_tune_specified && i == pta_size)
4225 error (main_args_p
4226 ? G_("bad value (%qs) for %<-mtune=%> switch")
4227 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4228 opts->x_ix86_tune_string);
4230 auto_vec <const char *> candidates;
4231 for (i = 0; i < pta_size; i++)
4232 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4233 || (processor_alias_table[i].flags & PTA_64BIT))
4234 candidates.safe_push (processor_alias_table[i].name);
4236 char *s;
4237 const char *hint
4238 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4239 if (hint)
4240 inform (input_location,
4241 main_args_p
4242 ? G_("valid arguments to %<-mtune=%> switch are: "
4243 "%s; did you mean %qs?")
4244 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4245 "%s; did you mean %qs?"), s, hint);
4246 else
4247 inform (input_location,
4248 main_args_p
4249 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4250 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4251 "are: %s"), s);
4252 XDELETEVEC (s);
4255 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4257 #ifndef USE_IX86_FRAME_POINTER
4258 #define USE_IX86_FRAME_POINTER 0
4259 #endif
4261 #ifndef USE_X86_64_FRAME_POINTER
4262 #define USE_X86_64_FRAME_POINTER 0
4263 #endif
4265 /* Set the default values for switches whose default depends on TARGET_64BIT
4266 in case they weren't overwritten by command line options. */
4267 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4269 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4270 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4271 if (opts->x_flag_asynchronous_unwind_tables
4272 && !opts_set->x_flag_unwind_tables
4273 && TARGET_64BIT_MS_ABI)
4274 opts->x_flag_unwind_tables = 1;
4275 if (opts->x_flag_asynchronous_unwind_tables == 2)
4276 opts->x_flag_unwind_tables
4277 = opts->x_flag_asynchronous_unwind_tables = 1;
4278 if (opts->x_flag_pcc_struct_return == 2)
4279 opts->x_flag_pcc_struct_return = 0;
4281 else
4283 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4284 opts->x_flag_omit_frame_pointer
4285 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4286 if (opts->x_flag_asynchronous_unwind_tables == 2)
4287 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4288 if (opts->x_flag_pcc_struct_return == 2)
4290 /* Intel MCU psABI specifies that -freg-struct-return should
4291 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4292 we check -miamcu so that -freg-struct-return is always
4293 turned on if -miamcu is used. */
4294 if (TARGET_IAMCU_P (opts->x_target_flags))
4295 opts->x_flag_pcc_struct_return = 0;
4296 else
4297 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4301 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4302 /* TODO: ix86_cost should be chosen at instruction or function granularity
4303 so that for cold code we use size_cost even in !optimize_size compilation. */
4304 if (opts->x_optimize_size)
4305 ix86_cost = &ix86_size_cost;
4306 else
4307 ix86_cost = ix86_tune_cost;
4309 /* Arrange to set up i386_stack_locals for all functions. */
4310 init_machine_status = ix86_init_machine_status;
4312 /* Validate -mregparm= value. */
4313 if (opts_set->x_ix86_regparm)
4315 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4316 warning (0, "-mregparm is ignored in 64-bit mode");
4317 else if (TARGET_IAMCU_P (opts->x_target_flags))
4318 warning (0, "-mregparm is ignored for Intel MCU psABI");
4319 if (opts->x_ix86_regparm > REGPARM_MAX)
4321 error ("-mregparm=%d is not between 0 and %d",
4322 opts->x_ix86_regparm, REGPARM_MAX);
4323 opts->x_ix86_regparm = 0;
4326 if (TARGET_IAMCU_P (opts->x_target_flags)
4327 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4328 opts->x_ix86_regparm = REGPARM_MAX;
4330 /* Default align_* from the processor table. */
4331 ix86_default_align (opts);
4333 /* Provide default for -mbranch-cost= value. */
4334 if (!opts_set->x_ix86_branch_cost)
4335 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4337 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4339 opts->x_target_flags
4340 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4342 /* Enable by default the SSE and MMX builtins. Do allow the user to
4343 explicitly disable any of these. In particular, disabling SSE and
4344 MMX for kernel code is extremely useful. */
4345 if (!ix86_arch_specified)
4346 opts->x_ix86_isa_flags
4347 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4348 | TARGET_SUBTARGET64_ISA_DEFAULT)
4349 & ~opts->x_ix86_isa_flags_explicit);
4351 if (TARGET_RTD_P (opts->x_target_flags))
4352 warning (0,
4353 main_args_p
4354 ? G_("%<-mrtd%> is ignored in 64bit mode")
4355 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4357 else
4359 opts->x_target_flags
4360 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4362 if (!ix86_arch_specified)
4363 opts->x_ix86_isa_flags
4364 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4366 /* The i386 ABI does not specify a red zone. It still makes sense to use one
4367 when the programmer takes care to keep the stack from being destroyed. */
4368 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4369 opts->x_target_flags |= MASK_NO_RED_ZONE;
4372 /* Keep nonleaf frame pointers. */
4373 if (opts->x_flag_omit_frame_pointer)
4374 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4375 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4376 opts->x_flag_omit_frame_pointer = 1;
4378 /* If we're doing fast math, we don't care about comparison order
4379 wrt NaNs. This lets us use a shorter comparison sequence. */
4380 if (opts->x_flag_finite_math_only)
4381 opts->x_target_flags &= ~MASK_IEEE_FP;
4383 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4384 since the insns won't need emulation. */
4385 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4386 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4388 /* Likewise, if the target doesn't have a 387, or we've specified
4389 software floating point, don't use 387 inline intrinsics. */
4390 if (!TARGET_80387_P (opts->x_target_flags))
4391 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4393 /* Turn on MMX builtins for -msse. */
4394 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4395 opts->x_ix86_isa_flags
4396 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4398 /* Enable SSE prefetch. */
4399 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4400 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4401 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4402 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4403 x86_prefetch_sse = true;
4405 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4406 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4407 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4408 opts->x_ix86_isa_flags
4409 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4411 /* Enable lzcnt instruction for -mabm. */
4412 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
4413 opts->x_ix86_isa_flags
4414 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4416 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4417 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
4418 opts->x_ix86_isa_flags
4419 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4420 & ~opts->x_ix86_isa_flags_explicit);
4422 /* Validate -mpreferred-stack-boundary= value or default it to
4423 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4424 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
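/* The option argument is the log2 of the requested alignment in bytes, so
e.g. -mpreferred-stack-boundary=4 requests (1 << 4) * BITS_PER_UNIT
= 128 bits, i.e. a 16-byte boundary. */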
4425 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4427 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4428 int max = TARGET_SEH ? 4 : 12;
4430 if (opts->x_ix86_preferred_stack_boundary_arg < min
4431 || opts->x_ix86_preferred_stack_boundary_arg > max)
4433 if (min == max)
4434 error ("-mpreferred-stack-boundary is not supported "
4435 "for this target");
4436 else
4437 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4438 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4440 else
4441 ix86_preferred_stack_boundary
4442 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
4445 /* Set the default value for -mstackrealign. */
4446 if (!opts_set->x_ix86_force_align_arg_pointer)
4447 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4449 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4451 /* Validate -mincoming-stack-boundary= value or default it to
4452 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4453 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4454 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4456 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4458 if (opts->x_ix86_incoming_stack_boundary_arg < min
4459 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4460 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4461 opts->x_ix86_incoming_stack_boundary_arg, min);
4462 else
4464 ix86_user_incoming_stack_boundary
4465 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4466 ix86_incoming_stack_boundary
4467 = ix86_user_incoming_stack_boundary;
4471 #ifndef NO_PROFILE_COUNTERS
4472 if (flag_nop_mcount)
4473 error ("-mnop-mcount is not compatible with this target");
4474 #endif
4475 if (flag_nop_mcount && flag_pic)
4476 error ("-mnop-mcount is not implemented for -fPIC");
4478 /* Accept -msseregparm only if at least SSE support is enabled. */
4479 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4480 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4481 error (main_args_p
4482 ? G_("%<-msseregparm%> used without SSE enabled")
4483 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4485 if (opts_set->x_ix86_fpmath)
4487 if (opts->x_ix86_fpmath & FPMATH_SSE)
4489 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4491 if (TARGET_80387_P (opts->x_target_flags))
4493 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4494 opts->x_ix86_fpmath = FPMATH_387;
4497 else if ((opts->x_ix86_fpmath & FPMATH_387)
4498 && !TARGET_80387_P (opts->x_target_flags))
4500 warning (0, "387 instruction set disabled, using SSE arithmetics");
4501 opts->x_ix86_fpmath = FPMATH_SSE;
4505 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4506 -mfpmath=387. The latter is however the default on many targets since the
4507 extra 80-bit precision of temporaries is considered to be part of the ABI.
4508 Overwrite the default at least for -ffast-math.
4509 TODO: -mfpmath=both seems to produce similarly performing code with
4510 slightly smaller binaries. It is however not clear if register allocation
4511 is ready for this setting.
4512 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
4513 codegen. We may switch to 387 with -ffast-math for size-optimized
4514 functions. */
4515 else if (fast_math_flags_set_p (&global_options)
4516 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4517 opts->x_ix86_fpmath = FPMATH_SSE;
4518 else
4519 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4521 /* Use external vectorized library in vectorizing intrinsics. */
4522 if (opts_set->x_ix86_veclibabi_type)
4523 switch (opts->x_ix86_veclibabi_type)
4525 case ix86_veclibabi_type_svml:
4526 ix86_veclib_handler = ix86_veclibabi_svml;
4527 break;
4529 case ix86_veclibabi_type_acml:
4530 ix86_veclib_handler = ix86_veclibabi_acml;
4531 break;
4533 default:
4534 gcc_unreachable ();
4537 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4538 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4539 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4541 /* If stack probes are required, the space used for large function
4542 arguments on the stack must also be probed, so enable
4543 -maccumulate-outgoing-args so this happens in the prologue. */
4544 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4545 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4547 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4548 warning (0,
4549 main_args_p
4550 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4551 "for correctness")
4552 : G_("stack probing requires "
4553 "%<target(\"accumulate-outgoing-args\")%> for "
4554 "correctness"));
4555 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4558 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4559 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4560 if (fixed_regs[BP_REG]
4561 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4563 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4564 warning (0,
4565 main_args_p
4566 ? G_("fixed ebp register requires "
4567 "%<-maccumulate-outgoing-args%>")
4568 : G_("fixed ebp register requires "
4569 "%<target(\"accumulate-outgoing-args\")%>"));
4570 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4573 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4575 char *p;
4576 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4577 p = strchr (internal_label_prefix, 'X');
4578 internal_label_prefix_len = p - internal_label_prefix;
4579 *p = '\0';
4582 /* When a scheduling description is not available, disable the scheduler
4583 passes so they don't slow down compilation or make x87 code slower. */
4584 if (!TARGET_SCHEDULE)
4585 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
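/* Seed the prefetch and cache-size params from the selected tuning's cost
table unless the user has already set them explicitly. */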
4587 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4588 ix86_tune_cost->simultaneous_prefetches,
4589 opts->x_param_values,
4590 opts_set->x_param_values);
4591 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4592 ix86_tune_cost->prefetch_block,
4593 opts->x_param_values,
4594 opts_set->x_param_values);
4595 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4596 ix86_tune_cost->l1_cache_size,
4597 opts->x_param_values,
4598 opts_set->x_param_values);
4599 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4600 ix86_tune_cost->l2_cache_size,
4601 opts->x_param_values,
4602 opts_set->x_param_values);
4604 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4605 if (opts->x_flag_prefetch_loop_arrays < 0
4606 && HAVE_prefetch
4607 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4608 && !opts->x_optimize_size
4609 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4610 opts->x_flag_prefetch_loop_arrays = 1;
4612 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4613 can be optimized to ap = __builtin_next_arg (0). */
4614 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4615 targetm.expand_builtin_va_start = NULL;
4617 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4619 ix86_gen_leave = gen_leave_rex64;
4620 if (Pmode == DImode)
4622 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4623 ix86_gen_tls_local_dynamic_base_64
4624 = gen_tls_local_dynamic_base_64_di;
4626 else
4628 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4629 ix86_gen_tls_local_dynamic_base_64
4630 = gen_tls_local_dynamic_base_64_si;
4633 else
4634 ix86_gen_leave = gen_leave;
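/* Pick the DImode or SImode RTL generators once, based on Pmode, so the
rest of the backend can call these hooks without re-checking the mode. */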
4636 if (Pmode == DImode)
4638 ix86_gen_add3 = gen_adddi3;
4639 ix86_gen_sub3 = gen_subdi3;
4640 ix86_gen_sub3_carry = gen_subdi3_carry;
4641 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4642 ix86_gen_andsp = gen_anddi3;
4643 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4644 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4645 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4646 ix86_gen_monitor = gen_sse3_monitor_di;
4647 ix86_gen_monitorx = gen_monitorx_di;
4648 ix86_gen_clzero = gen_clzero_di;
4650 else
4652 ix86_gen_add3 = gen_addsi3;
4653 ix86_gen_sub3 = gen_subsi3;
4654 ix86_gen_sub3_carry = gen_subsi3_carry;
4655 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4656 ix86_gen_andsp = gen_andsi3;
4657 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4658 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4659 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4660 ix86_gen_monitor = gen_sse3_monitor_si;
4661 ix86_gen_monitorx = gen_monitorx_si;
4662 ix86_gen_clzero = gen_clzero_si;
4665 #ifdef USE_IX86_CLD
4666 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4667 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4668 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4669 #endif
4671 /* Set the default value for -mfentry. */
4672 if (!opts_set->x_flag_fentry)
4673 opts->x_flag_fentry = TARGET_SEH;
4674 else
4676 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4677 && opts->x_flag_fentry)
4678 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4679 "with -fpic");
4680 else if (TARGET_SEH && !opts->x_flag_fentry)
4681 sorry ("-mno-fentry isn%'t compatible with SEH");
4684 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4685 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4687 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4688 && TARGET_EMIT_VZEROUPPER)
4689 opts->x_target_flags |= MASK_VZEROUPPER;
4690 if (!(opts_set->x_target_flags & MASK_STV))
4691 opts->x_target_flags |= MASK_STV;
4692 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4693 -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
4694 stack realignment would be an extra cost the pass doesn't take into
4695 account, and the pass can't realign the stack. */
4696 if (ix86_preferred_stack_boundary < 128
4697 || ix86_incoming_stack_boundary < 128
4698 || opts->x_ix86_force_align_arg_pointer)
4699 opts->x_target_flags &= ~MASK_STV;
4700 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4701 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4702 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4703 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4704 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4705 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4707 /* Enable 128-bit AVX instruction generation
4708 for the auto-vectorizer. */
4709 if (TARGET_AVX128_OPTIMAL
4710 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4711 opts->x_prefer_vector_width_type = PVW_AVX128;
4713 /* Use 256-bit AVX instruction generation
4714 in the auto-vectorizer. */
4715 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4716 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4717 opts->x_prefer_vector_width_type = PVW_AVX256;
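/* Parse the comma-separated -mrecip= list. A leading '!' inverts an entry
and "default" stands for all reciprocal optimizations, so for example
"-mrecip=default,!<option>" enables everything except the named option
(non-"default" names are looked up in recip_options). */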
4719 if (opts->x_ix86_recip_name)
4721 char *p = ASTRDUP (opts->x_ix86_recip_name);
4722 char *q;
4723 unsigned int mask, i;
4724 bool invert;
4726 while ((q = strtok (p, ",")) != NULL)
4728 p = NULL;
4729 if (*q == '!')
4731 invert = true;
4732 q++;
4734 else
4735 invert = false;
4737 if (!strcmp (q, "default"))
4738 mask = RECIP_MASK_ALL;
4739 else
4741 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4742 if (!strcmp (q, recip_options[i].string))
4744 mask = recip_options[i].mask;
4745 break;
4748 if (i == ARRAY_SIZE (recip_options))
4750 error ("unknown option for -mrecip=%s", q);
4751 invert = false;
4752 mask = RECIP_MASK_NONE;
4756 opts->x_recip_mask_explicit |= mask;
4757 if (invert)
4758 opts->x_recip_mask &= ~mask;
4759 else
4760 opts->x_recip_mask |= mask;
4764 if (TARGET_RECIP_P (opts->x_target_flags))
4765 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4766 else if (opts_set->x_target_flags & MASK_RECIP)
4767 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4769 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4770 for 64-bit Bionic. Also default long double to 64-bit for Intel
4771 MCU psABI. */
4772 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4773 && !(opts_set->x_target_flags
4774 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4775 opts->x_target_flags |= (TARGET_64BIT
4776 ? MASK_LONG_DOUBLE_128
4777 : MASK_LONG_DOUBLE_64);
4779 /* Only one of them can be active. */
4780 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4781 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4783 /* Handle stack protector */
4784 if (!opts_set->x_ix86_stack_protector_guard)
4785 opts->x_ix86_stack_protector_guard
4786 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4788 #ifdef TARGET_THREAD_SSP_OFFSET
4789 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4790 #endif
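/* -mstack-protector-guard-offset= accepts the offset in any base understood
by strtol/strtoll with base 0 (decimal, 0x hex or leading-zero octal), and
requires it to fit in a signed 32-bit range. */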
4792 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4794 char *endp;
4795 const char *str = ix86_stack_protector_guard_offset_str;
4797 errno = 0;
4798 int64_t offset;
4800 #if defined(INT64_T_IS_LONG)
4801 offset = strtol (str, &endp, 0);
4802 #else
4803 offset = strtoll (str, &endp, 0);
4804 #endif
4806 if (!*str || *endp || errno)
4807 error ("%qs is not a valid number "
4808 "in -mstack-protector-guard-offset=", str);
4810 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4811 HOST_WIDE_INT_C (0x7fffffff)))
4812 error ("%qs is not a valid offset "
4813 "in -mstack-protector-guard-offset=", str);
4815 ix86_stack_protector_guard_offset = offset;
4818 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4820 /* The kernel uses a different segment register for performance
4821 reasons; this way a system call does not have to trash the userspace
4822 segment register, which would be expensive. */
4823 if (ix86_cmodel == CM_KERNEL)
4824 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4826 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4828 const char *str = ix86_stack_protector_guard_reg_str;
4829 addr_space_t seg = ADDR_SPACE_GENERIC;
4831 /* Discard optional register prefix. */
4832 if (str[0] == '%')
4833 str++;
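/* Only the two-character segment names "fs" and "gs" are accepted here,
optionally preceded by '%'. */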
4835 if (strlen (str) == 2 && str[1] == 's')
4837 if (str[0] == 'f')
4838 seg = ADDR_SPACE_SEG_FS;
4839 else if (str[0] == 'g')
4840 seg = ADDR_SPACE_SEG_GS;
4843 if (seg == ADDR_SPACE_GENERIC)
4844 error ("%qs is not a valid base register "
4845 "in -mstack-protector-guard-reg=",
4846 ix86_stack_protector_guard_reg_str);
4848 ix86_stack_protector_guard_reg = seg;
4851 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4852 if (opts->x_ix86_tune_memcpy_strategy)
4854 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4855 ix86_parse_stringop_strategy_string (str, false);
4856 free (str);
4859 if (opts->x_ix86_tune_memset_strategy)
4861 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4862 ix86_parse_stringop_strategy_string (str, true);
4863 free (str);
4866 /* Save the initial options in case the user uses function-specific
4867 options. */
4868 if (main_args_p)
4869 target_option_default_node = target_option_current_node
4870 = build_target_option_node (opts);
4872 /* Do not support control flow instrumentation if CET is not enabled. */
4873 if (opts->x_flag_cf_protection != CF_NONE)
4875 if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2)
4876 || TARGET_SHSTK_P (opts->x_ix86_isa_flags)))
4878 if (flag_cf_protection == CF_FULL)
4880 error ("%<-fcf-protection=full%> requires CET support "
4881 "on this target. Use -mcet or one of -mibt, "
4882 "-mshstk options to enable CET");
4884 else if (flag_cf_protection == CF_BRANCH)
4886 error ("%<-fcf-protection=branch%> requires CET support "
4887 "on this target. Use -mcet or one of -mibt, "
4888 "-mshstk options to enable CET");
4890 else if (flag_cf_protection == CF_RETURN)
4892 error ("%<-fcf-protection=return%> requires CET support "
4893 "on this target. Use -mcet or one of -mibt, "
4894 "-mshstk options to enable CET");
4896 flag_cf_protection = CF_NONE;
4897 return false;
4899 opts->x_flag_cf_protection =
4900 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4903 return true;
4906 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4908 static void
4909 ix86_option_override (void)
4911 ix86_option_override_internal (true, &global_options, &global_options_set);
4914 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4915 static char *
4916 ix86_offload_options (void)
4918 if (TARGET_LP64)
4919 return xstrdup ("-foffload-abi=lp64");
4920 return xstrdup ("-foffload-abi=ilp32");
4923 /* Update register usage after having seen the compiler flags. */
4925 static void
4926 ix86_conditional_register_usage (void)
4928 int i, c_mask;
4930 /* If there are no caller-saved registers, preserve all registers
4931 except fixed_regs and registers used for the function return value,
4932 since aggregate_value_p checks call_used_regs[regno] on the return
4933 value. */
4934 if (cfun && cfun->machine->no_caller_saved_registers)
4935 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4936 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4937 call_used_regs[i] = 0;
4939 /* For 32-bit targets, squash the REX registers. */
4940 if (! TARGET_64BIT)
4942 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4943 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4944 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4945 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4946 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4947 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4950 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4951 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
4953 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4955 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4957 /* Set/reset conditionally defined registers from
4958 CALL_USED_REGISTERS initializer. */
4959 if (call_used_regs[i] > 1)
4960 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4962 /* Build the CLOBBERED_REGS register set from the call-used
4963 registers of the GENERAL_REGS register set. */
4964 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4965 && call_used_regs[i])
4966 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4969 /* If MMX is disabled, squash the registers. */
4970 if (! TARGET_MMX)
4971 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4972 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4973 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4975 /* If SSE is disabled, squash the registers. */
4976 if (! TARGET_SSE)
4977 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4978 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4979 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4981 /* If the FPU is disabled, squash the registers. */
4982 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4983 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4984 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4985 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4987 /* If AVX512F is disabled, squash the registers. */
4988 if (! TARGET_AVX512F)
4990 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4991 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4993 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4994 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4997 /* If MPX is disabled, squash the registers. */
4998 if (! TARGET_MPX)
4999 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
5000 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5003 /* Canonicalize a comparison from one we don't have to one we do have. */
5005 static void
5006 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5007 bool op0_preserve_value)
5009 /* The order of operands in the x87 ficom compare is forced by combine in
5010 the simplify_comparison () function. The float operator is treated as RTX_OBJ
5011 with precedence over other operators and is always put in the first
5012 place. Swap the condition and operands to match the ficom instruction. */
5013 if (!op0_preserve_value
5014 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5016 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5018 /* We are called only for compares that are split to SAHF instruction.
5019 Ensure that we have setcc/jcc insn for the swapped condition. */
5020 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5022 std::swap (*op0, *op1);
5023 *code = (int) scode;
5028 /* Save the current options */
5030 static void
5031 ix86_function_specific_save (struct cl_target_option *ptr,
5032 struct gcc_options *opts)
5034 ptr->arch = ix86_arch;
5035 ptr->schedule = ix86_schedule;
5036 ptr->prefetch_sse = x86_prefetch_sse;
5037 ptr->tune = ix86_tune;
5038 ptr->branch_cost = ix86_branch_cost;
5039 ptr->tune_defaulted = ix86_tune_defaulted;
5040 ptr->arch_specified = ix86_arch_specified;
5041 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5042 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5043 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5044 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5045 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5046 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5047 ptr->x_ix86_abi = opts->x_ix86_abi;
5048 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5049 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5050 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5051 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5052 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5053 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5054 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5055 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5056 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5057 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5058 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5059 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5060 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5061 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5062 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5063 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5064 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5065 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5066 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5067 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5069 /* The fields are char but the variables are not; make sure the
5070 values fit in the fields. */
5071 gcc_assert (ptr->arch == ix86_arch);
5072 gcc_assert (ptr->schedule == ix86_schedule);
5073 gcc_assert (ptr->tune == ix86_tune);
5074 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5077 /* Restore the current options */
5079 static void
5080 ix86_function_specific_restore (struct gcc_options *opts,
5081 struct cl_target_option *ptr)
5083 enum processor_type old_tune = ix86_tune;
5084 enum processor_type old_arch = ix86_arch;
5085 unsigned int ix86_arch_mask;
5086 int i;
5088 /* We don't change -fPIC. */
5089 opts->x_flag_pic = flag_pic;
5091 ix86_arch = (enum processor_type) ptr->arch;
5092 ix86_schedule = (enum attr_cpu) ptr->schedule;
5093 ix86_tune = (enum processor_type) ptr->tune;
5094 x86_prefetch_sse = ptr->prefetch_sse;
5095 opts->x_ix86_branch_cost = ptr->branch_cost;
5096 ix86_tune_defaulted = ptr->tune_defaulted;
5097 ix86_arch_specified = ptr->arch_specified;
5098 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5099 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5100 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5101 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5102 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5103 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5104 opts->x_ix86_abi = ptr->x_ix86_abi;
5105 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5106 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5107 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5108 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5109 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5110 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5111 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5112 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5113 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5114 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5115 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5116 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5117 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5118 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5119 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5120 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5121 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5122 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5123 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5124 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5125 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5126 /* TODO: ix86_cost should be chosen at instruction or function granularity
5127 so that for cold code we use size_cost even in !optimize_size compilation. */
5128 if (opts->x_optimize_size)
5129 ix86_cost = &ix86_size_cost;
5130 else
5131 ix86_cost = ix86_tune_cost;
5133 /* Recreate the arch feature tests if the arch changed */
5134 if (old_arch != ix86_arch)
5136 ix86_arch_mask = 1u << ix86_arch;
5137 for (i = 0; i < X86_ARCH_LAST; ++i)
5138 ix86_arch_features[i]
5139 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5142 /* Recreate the tune optimization tests */
5143 if (old_tune != ix86_tune)
5144 set_ix86_tune_features (ix86_tune, false);
5147 /* Adjust target options after streaming them in. This is mainly about
5148 reconciling them with global options. */
5150 static void
5151 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5153 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5154 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5155 for PIC, or error out. */
5156 if (flag_pic)
5157 switch (ptr->x_ix86_cmodel)
5159 case CM_SMALL:
5160 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5161 break;
5163 case CM_MEDIUM:
5164 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5165 break;
5167 case CM_LARGE:
5168 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5169 break;
5171 case CM_KERNEL:
5172 error ("code model %s does not support PIC mode", "kernel");
5173 break;
5175 default:
5176 break;
5178 else
5179 switch (ptr->x_ix86_cmodel)
5181 case CM_SMALL_PIC:
5182 ptr->x_ix86_cmodel = CM_SMALL;
5183 break;
5185 case CM_MEDIUM_PIC:
5186 ptr->x_ix86_cmodel = CM_MEDIUM;
5187 break;
5189 case CM_LARGE_PIC:
5190 ptr->x_ix86_cmodel = CM_LARGE;
5191 break;
5193 default:
5194 break;
5198 /* Print the current options */
5200 static void
5201 ix86_function_specific_print (FILE *file, int indent,
5202 struct cl_target_option *ptr)
5204 char *target_string
5205 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5206 ptr->x_target_flags, ptr->x_ix86_target_flags,
5207 NULL, NULL, ptr->x_ix86_fpmath, false);
5209 gcc_assert (ptr->arch < PROCESSOR_max);
5210 fprintf (file, "%*sarch = %d (%s)\n",
5211 indent, "",
5212 ptr->arch, processor_target_table[ptr->arch].name);
5214 gcc_assert (ptr->tune < PROCESSOR_max);
5215 fprintf (file, "%*stune = %d (%s)\n",
5216 indent, "",
5217 ptr->tune, processor_target_table[ptr->tune].name);
5219 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5221 if (target_string)
5223 fprintf (file, "%*s%s\n", indent, "", target_string);
5224 free (target_string);
5229 /* Inner function to process the attribute((target(...))): take an argument and
5230 set the current options from it. If we have a list, recursively go
5231 over the list. */
5233 static bool
5234 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5235 struct gcc_options *opts,
5236 struct gcc_options *opts_set,
5237 struct gcc_options *enum_opts_set)
5239 char *next_optstr;
5240 bool ret = true;
5242 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5243 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5244 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5245 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5246 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
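/* For example, IX86_ATTR_ISA ("avx2", OPT_mavx2) expands to
{ "avx2", 4, ix86_opt_isa, OPT_mavx2, 0 }, where 4 is sizeof ("avx2") - 1. */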
5248 enum ix86_opt_type
5250 ix86_opt_unknown,
5251 ix86_opt_yes,
5252 ix86_opt_no,
5253 ix86_opt_str,
5254 ix86_opt_enum,
5255 ix86_opt_isa
5258 static const struct
5260 const char *string;
5261 size_t len;
5262 enum ix86_opt_type type;
5263 int opt;
5264 int mask;
5265 } attrs[] = {
5266 /* isa options */
5267 IX86_ATTR_ISA ("sgx", OPT_msgx),
5268 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5269 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5270 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5271 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5272 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5273 IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
5275 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5276 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5277 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5278 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5279 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5280 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5281 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5282 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5283 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5284 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5285 IX86_ATTR_ISA ("fma", OPT_mfma),
5286 IX86_ATTR_ISA ("xop", OPT_mxop),
5287 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5288 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5289 IX86_ATTR_ISA ("avx", OPT_mavx),
5290 IX86_ATTR_ISA ("sse4", OPT_msse4),
5291 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5292 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5293 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5294 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5295 IX86_ATTR_ISA ("sse3", OPT_msse3),
5296 IX86_ATTR_ISA ("aes", OPT_maes),
5297 IX86_ATTR_ISA ("sha", OPT_msha),
5298 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5299 IX86_ATTR_ISA ("sse2", OPT_msse2),
5300 IX86_ATTR_ISA ("sse", OPT_msse),
5301 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5302 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5303 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5304 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5305 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5306 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5307 IX86_ATTR_ISA ("adx", OPT_madx),
5308 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5309 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5310 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5311 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5312 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5313 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5314 IX86_ATTR_ISA ("abm", OPT_mabm),
5315 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5316 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5317 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5318 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5319 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5320 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5321 IX86_ATTR_ISA ("sahf", OPT_msahf),
5322 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5323 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5324 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5325 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5326 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5327 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5328 IX86_ATTR_ISA ("pku", OPT_mpku),
5329 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5330 IX86_ATTR_ISA ("hle", OPT_mhle),
5331 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5332 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5333 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5334 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5335 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5336 IX86_ATTR_ISA ("ibt", OPT_mibt),
5337 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5338 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5339 IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
5341 /* enum options */
5342 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5344 /* string options */
5345 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5346 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5348 /* flag options */
5349 IX86_ATTR_YES ("cld",
5350 OPT_mcld,
5351 MASK_CLD),
5353 IX86_ATTR_NO ("fancy-math-387",
5354 OPT_mfancy_math_387,
5355 MASK_NO_FANCY_MATH_387),
5357 IX86_ATTR_YES ("ieee-fp",
5358 OPT_mieee_fp,
5359 MASK_IEEE_FP),
5361 IX86_ATTR_YES ("inline-all-stringops",
5362 OPT_minline_all_stringops,
5363 MASK_INLINE_ALL_STRINGOPS),
5365 IX86_ATTR_YES ("inline-stringops-dynamically",
5366 OPT_minline_stringops_dynamically,
5367 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5369 IX86_ATTR_NO ("align-stringops",
5370 OPT_mno_align_stringops,
5371 MASK_NO_ALIGN_STRINGOPS),
5373 IX86_ATTR_YES ("recip",
5374 OPT_mrecip,
5375 MASK_RECIP),
5379 /* If this is a list, recurse to get the options. */
5380 if (TREE_CODE (args) == TREE_LIST)
5382 bool ret = true;
5384 for (; args; args = TREE_CHAIN (args))
5385 if (TREE_VALUE (args)
5386 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5387 p_strings, opts, opts_set,
5388 enum_opts_set))
5389 ret = false;
5391 return ret;
5394 else if (TREE_CODE (args) != STRING_CST)
5396 error ("attribute %<target%> argument not a string");
5397 return false;
5400 /* Handle multiple arguments separated by commas. */
5401 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5403 while (next_optstr && *next_optstr != '\0')
5405 char *p = next_optstr;
5406 char *orig_p = p;
5407 char *comma = strchr (next_optstr, ',');
5408 const char *opt_string;
5409 size_t len, opt_len;
5410 int opt;
5411 bool opt_set_p;
5412 char ch;
5413 unsigned i;
5414 enum ix86_opt_type type = ix86_opt_unknown;
5415 int mask = 0;
5417 if (comma)
5419 *comma = '\0';
5420 len = comma - next_optstr;
5421 next_optstr = comma + 1;
5423 else
5425 len = strlen (p);
5426 next_optstr = NULL;
5429 /* Recognize no-xxx. */
5430 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5432 opt_set_p = false;
5433 p += 3;
5434 len -= 3;
5436 else
5437 opt_set_p = true;
5439 /* Find the option. */
5440 ch = *p;
5441 opt = N_OPTS;
5442 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5444 type = attrs[i].type;
5445 opt_len = attrs[i].len;
5446 if (ch == attrs[i].string[0]
5447 && ((type != ix86_opt_str && type != ix86_opt_enum)
5448 ? len == opt_len
5449 : len > opt_len)
5450 && memcmp (p, attrs[i].string, opt_len) == 0)
5452 opt = attrs[i].opt;
5453 mask = attrs[i].mask;
5454 opt_string = attrs[i].string;
5455 break;
5459 /* Process the option. */
5460 if (opt == N_OPTS)
5462 error ("attribute(target(\"%s\")) is unknown", orig_p);
5463 ret = false;
5466 else if (type == ix86_opt_isa)
5468 struct cl_decoded_option decoded;
5470 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5471 ix86_handle_option (opts, opts_set,
5472 &decoded, input_location);
5475 else if (type == ix86_opt_yes || type == ix86_opt_no)
5477 if (type == ix86_opt_no)
5478 opt_set_p = !opt_set_p;
5480 if (opt_set_p)
5481 opts->x_target_flags |= mask;
5482 else
5483 opts->x_target_flags &= ~mask;
5486 else if (type == ix86_opt_str)
5488 if (p_strings[opt])
5490 error ("option(\"%s\") was already specified", opt_string);
5491 ret = false;
5493 else
5494 p_strings[opt] = xstrdup (p + opt_len);
5497 else if (type == ix86_opt_enum)
5499 bool arg_ok;
5500 int value;
5502 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5503 if (arg_ok)
5504 set_option (opts, enum_opts_set, opt, value,
5505 p + opt_len, DK_UNSPECIFIED, input_location,
5506 global_dc);
5507 else
5509 error ("attribute(target(\"%s\")) is unknown", orig_p);
5510 ret = false;
5514 else
5515 gcc_unreachable ();
5518 return ret;
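/* Usage note, for illustration only (hypothetical user code, not taken
   from this file): the parser above accepts a comma-separated list that
   may mix ISA options, the "fpmath=" enum option, the "arch="/"tune="
   string options and the flag options, with a "no-" prefix negating an
   option.  A declaration exercising each form might look like

     __attribute__((target ("avx2,no-fancy-math-387,fpmath=sse,arch=haswell")))
     extern void example_kernel (void);

   Each comma-separated piece is matched against the attrs[] table above
   by its first character, its length and a prefix comparison.  */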
5521 /* Release allocated strings. */
5522 static void
5523 release_options_strings (char **option_strings)
5525 /* Free up memory allocated to hold the strings. */
5526 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5527 free (option_strings[i]);
5530 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5532 tree
5533 ix86_valid_target_attribute_tree (tree args,
5534 struct gcc_options *opts,
5535 struct gcc_options *opts_set)
5537 const char *orig_arch_string = opts->x_ix86_arch_string;
5538 const char *orig_tune_string = opts->x_ix86_tune_string;
5539 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5540 int orig_tune_defaulted = ix86_tune_defaulted;
5541 int orig_arch_specified = ix86_arch_specified;
5542 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5543 tree t = NULL_TREE;
5544 struct cl_target_option *def
5545 = TREE_TARGET_OPTION (target_option_default_node);
5546 struct gcc_options enum_opts_set;
5548 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5550 /* Process each of the options on the chain. */
5551 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5552 opts_set, &enum_opts_set))
5553 return error_mark_node;
5555 /* If the changed options are different from the default, rerun
5556 ix86_option_override_internal, and then save the options away.
5557 The string options are attribute options, and will be undone
5558 when we copy the save structure. */
5559 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5560 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5561 || opts->x_target_flags != def->x_target_flags
5562 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5563 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5564 || enum_opts_set.x_ix86_fpmath)
5566 /* If we are using the default tune= or arch=, undo the string assigned,
5567 and use the default. */
5568 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5570 opts->x_ix86_arch_string
5571 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5573 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5574 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5575 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5576 | OPTION_MASK_ABI_64
5577 | OPTION_MASK_ABI_X32
5578 | OPTION_MASK_CODE16);
5579 opts->x_ix86_isa_flags2 = 0;
5581 else if (!orig_arch_specified)
5582 opts->x_ix86_arch_string = NULL;
5584 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5585 opts->x_ix86_tune_string
5586 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5587 else if (orig_tune_defaulted)
5588 opts->x_ix86_tune_string = NULL;
5590 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5591 if (enum_opts_set.x_ix86_fpmath)
5592 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5594 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5595 bool r = ix86_option_override_internal (false, opts, opts_set);
5596 if (!r)
5598 release_options_strings (option_strings);
5599 return error_mark_node;
5602 /* Add any builtin functions with the new isa if any. */
5603 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5605 /* Save the current options unless we are validating options for
5606 #pragma. */
5607 t = build_target_option_node (opts);
5609 opts->x_ix86_arch_string = orig_arch_string;
5610 opts->x_ix86_tune_string = orig_tune_string;
5611 opts_set->x_ix86_fpmath = orig_fpmath_set;
5613 release_options_strings (option_strings);
5616 return t;
5619 /* Hook to validate attribute((target("string"))). */
5621 static bool
5622 ix86_valid_target_attribute_p (tree fndecl,
5623 tree ARG_UNUSED (name),
5624 tree args,
5625 int ARG_UNUSED (flags))
5627 struct gcc_options func_options;
5628 tree new_target, new_optimize;
5629 bool ret = true;
5631 /* attribute((target("default"))) does nothing, beyond
5632 affecting multi-versioning. */
5633 if (TREE_VALUE (args)
5634 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5635 && TREE_CHAIN (args) == NULL_TREE
5636 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5637 return true;
5639 tree old_optimize = build_optimization_node (&global_options);
5641 /* Get the optimization options of the current function. */
5642 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5644 if (!func_optimize)
5645 func_optimize = old_optimize;
5647 /* Init func_options. */
5648 memset (&func_options, 0, sizeof (func_options));
5649 init_options_struct (&func_options, NULL);
5650 lang_hooks.init_options_struct (&func_options);
5652 cl_optimization_restore (&func_options,
5653 TREE_OPTIMIZATION (func_optimize));
5655 /* Initialize func_options to the default before its target options can
5656 be set. */
5657 cl_target_option_restore (&func_options,
5658 TREE_TARGET_OPTION (target_option_default_node));
5660 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5661 &global_options_set);
5663 new_optimize = build_optimization_node (&func_options);
5665 if (new_target == error_mark_node)
5666 ret = false;
5668 else if (fndecl && new_target)
5670 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5672 if (old_optimize != new_optimize)
5673 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5676 finalize_options_struct (&func_options);
5678 return ret;
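/* Usage note, for illustration only (hypothetical C++ code, not taken
   from this file): as the early return above suggests, target ("default")
   only matters for function multi-versioning.  A pair of versions might
   look like

     __attribute__((target ("default"))) int dispatchable (void) { return 0; }
     __attribute__((target ("avx2")))    int dispatchable (void) { return 1; }

   Only the non-default version goes through ix86_valid_target_attribute_tree
   and gets its own DECL_FUNCTION_SPECIFIC_TARGET node.  */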
5682 /* Hook to determine if one function can safely inline another. */
5684 static bool
5685 ix86_can_inline_p (tree caller, tree callee)
5687 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5688 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5689 if (!callee_tree)
5690 callee_tree = target_option_default_node;
5691 if (!caller_tree)
5692 caller_tree = target_option_default_node;
5693 if (callee_tree == caller_tree)
5694 return true;
5696 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5697 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5698 bool ret = false;
5700 /* Callee's isa options should be a subset of the caller's, i.e. a SSE4
5701 function can inline a SSE2 function but a SSE2 function can't inline
5702 a SSE4 function. */
5703 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5704 != callee_opts->x_ix86_isa_flags)
5705 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5706 != callee_opts->x_ix86_isa_flags2))
5707 ret = false;
5709 /* See if we have the same non-isa options. */
5710 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5711 ret = false;
5713 /* See if arch, tune, etc. are the same. */
5714 else if (caller_opts->arch != callee_opts->arch)
5715 ret = false;
5717 else if (caller_opts->tune != callee_opts->tune)
5718 ret = false;
5720 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5721 /* If the callee doesn't use FP expressions, differences in
5722 ix86_fpmath can be ignored. We are called from FEs
5723 for multi-versioning call optimization, so beware of
5724 ipa_fn_summaries not being available. */
5725 && (! ipa_fn_summaries
5726 || ipa_fn_summaries->get
5727 (cgraph_node::get (callee))->fp_expressions))
5728 ret = false;
5730 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5731 ret = false;
5733 else
5734 ret = true;
5736 return ret;
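/* Usage note, for illustration only (hypothetical user code, not taken
   from this file): the subset test above lets an AVX2 caller inline a
   default-ISA callee, but not the reverse.  For example

     static inline int helper (int x) { return x + 1; }

     __attribute__((target ("avx2")))
     int hot_loop (int x) { return helper (x); }

   inlining helper into hot_loop is allowed, while moving the target
   attribute onto helper would make the callee's ISA flags a superset of
   the caller's and the hook would return false.  */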
5740 /* Remember the last target of ix86_set_current_function. */
5741 static GTY(()) tree ix86_previous_fndecl;
5743 /* Set targets globals to the default (or current #pragma GCC target
5744 if active). Invalidate ix86_previous_fndecl cache. */
5746 void
5747 ix86_reset_previous_fndecl (void)
5749 tree new_tree = target_option_current_node;
5750 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5751 if (TREE_TARGET_GLOBALS (new_tree))
5752 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5753 else if (new_tree == target_option_default_node)
5754 restore_target_globals (&default_target_globals);
5755 else
5756 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5757 ix86_previous_fndecl = NULL_TREE;
5760 /* Set the func_type field from the function FNDECL. */
5762 static void
5763 ix86_set_func_type (tree fndecl)
5765 if (cfun->machine->func_type == TYPE_UNKNOWN)
5767 if (lookup_attribute ("interrupt",
5768 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5770 if (ix86_function_naked (fndecl))
5771 error_at (DECL_SOURCE_LOCATION (fndecl),
5772 "interrupt and naked attributes are not compatible");
5774 int nargs = 0;
5775 for (tree arg = DECL_ARGUMENTS (fndecl);
5776 arg;
5777 arg = TREE_CHAIN (arg))
5778 nargs++;
5779 cfun->machine->no_caller_saved_registers = true;
5780 cfun->machine->func_type
5781 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5783 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5785 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5786 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5787 sorry ("Only DWARF debug format is supported for interrupt "
5788 "service routine.");
5790 else
5792 cfun->machine->func_type = TYPE_NORMAL;
5793 if (lookup_attribute ("no_caller_saved_registers",
5794 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5795 cfun->machine->no_caller_saved_registers = true;
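/* Usage note, for illustration only (hypothetical handlers following the
   documented interrupt-attribute interface, not taken from this file):
   the nargs == 2 case above corresponds to exception handlers that also
   receive an error code.

     void __attribute__((interrupt))
     isr_handler (struct interrupt_frame *frame);                 (TYPE_INTERRUPT)

     void __attribute__((interrupt))
     fault_handler (struct interrupt_frame *frame, uword_t code); (TYPE_EXCEPTION)

   Both behave as if no_caller_saved_registers had also been given.  */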
5800 /* Establish appropriate back-end context for processing the function
5801 FNDECL. The argument might be NULL to indicate processing at top
5802 level, outside of any function scope. */
5803 static void
5804 ix86_set_current_function (tree fndecl)
5806 /* Only change the context if the function changes. This hook is called
5807 several times in the course of compiling a function, and we don't want to
5808 slow things down too much or call target_reinit when it isn't safe. */
5809 if (fndecl == ix86_previous_fndecl)
5811 /* There may be 2 function bodies for the same function FNDECL,
5812 one is extern inline and one isn't. Call ix86_set_func_type
5813 to set the func_type field. */
5814 if (fndecl != NULL_TREE)
5815 ix86_set_func_type (fndecl);
5816 return;
5819 tree old_tree;
5820 if (ix86_previous_fndecl == NULL_TREE)
5821 old_tree = target_option_current_node;
5822 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5823 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5824 else
5825 old_tree = target_option_default_node;
5827 if (fndecl == NULL_TREE)
5829 if (old_tree != target_option_current_node)
5830 ix86_reset_previous_fndecl ();
5831 return;
5834 ix86_set_func_type (fndecl);
5836 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5837 if (new_tree == NULL_TREE)
5838 new_tree = target_option_default_node;
5840 if (old_tree != new_tree)
5842 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5843 if (TREE_TARGET_GLOBALS (new_tree))
5844 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5845 else if (new_tree == target_option_default_node)
5846 restore_target_globals (&default_target_globals);
5847 else
5848 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5850 ix86_previous_fndecl = fndecl;
5852 static bool prev_no_caller_saved_registers;
5854 /* 64-bit MS and SYSV ABIs have different sets of call-used registers.
5855 Avoid expensive re-initialization of init_regs each time we switch
5856 function context. */
5857 if (TARGET_64BIT
5858 && (call_used_regs[SI_REG]
5859 == (cfun->machine->call_abi == MS_ABI)))
5860 reinit_regs ();
5861 /* Need to re-initialize init_regs if caller-saved registers are
5862 changed. */
5863 else if (prev_no_caller_saved_registers
5864 != cfun->machine->no_caller_saved_registers)
5865 reinit_regs ();
5867 if (cfun->machine->func_type != TYPE_NORMAL
5868 || cfun->machine->no_caller_saved_registers)
5870 /* Don't allow MPX, SSE, MMX or x87 instructions since they
5871 may change processor state. */
5872 const char *isa;
5873 if (TARGET_MPX)
5874 isa = "MPX";
5875 else if (TARGET_SSE)
5876 isa = "SSE";
5877 else if (TARGET_MMX)
5878 isa = "MMX/3Dnow";
5879 else if (TARGET_80387)
5880 isa = "80387";
5881 else
5882 isa = NULL;
5883 if (isa != NULL)
5885 if (cfun->machine->func_type != TYPE_NORMAL)
5886 sorry ("%s instructions aren't allowed in %s service routine",
5887 isa, (cfun->machine->func_type == TYPE_EXCEPTION
5888 ? "exception" : "interrupt"));
5889 else
5890 sorry ("%s instructions aren't allowed in function with "
5891 "no_caller_saved_registers attribute", isa);
5892 /* Don't issue the same error twice. */
5893 cfun->machine->func_type = TYPE_NORMAL;
5894 cfun->machine->no_caller_saved_registers = false;
5898 prev_no_caller_saved_registers
5899 = cfun->machine->no_caller_saved_registers;
5903 /* Return true if this goes in large data/bss. */
5905 static bool
5906 ix86_in_large_data_p (tree exp)
5908 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
5909 return false;
5911 if (exp == NULL_TREE)
5912 return false;
5914 /* Functions are never large data. */
5915 if (TREE_CODE (exp) == FUNCTION_DECL)
5916 return false;
5918 /* Automatic variables are never large data. */
5919 if (VAR_P (exp) && !is_global_var (exp))
5920 return false;
5922 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
5924 const char *section = DECL_SECTION_NAME (exp);
5925 if (strcmp (section, ".ldata") == 0
5926 || strcmp (section, ".lbss") == 0)
5927 return true;
5928 return false;
5930 else
5932 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
5934 /* If this is an incomplete type with size 0, then we can't put it
5935 in data because it might be too big when completed. Also,
5936 int_size_in_bytes returns -1 if the size can vary or is larger than
5937 an integer, in which case it is also safer to assume that it goes in
5938 large data. */
5939 if (size <= 0 || size > ix86_section_threshold)
5940 return true;
5943 return false;
5946 /* i386-specific section flag to mark large sections. */
5947 #define SECTION_LARGE SECTION_MACH_DEP
5949 /* Switch to the appropriate section for output of DECL.
5950 DECL is either a `VAR_DECL' node or a constant of some sort.
5951 RELOC indicates whether forming the initial value of DECL requires
5952 link-time relocations. */
5954 ATTRIBUTE_UNUSED static section *
5955 x86_64_elf_select_section (tree decl, int reloc,
5956 unsigned HOST_WIDE_INT align)
5958 if (ix86_in_large_data_p (decl))
5960 const char *sname = NULL;
5961 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
5962 switch (categorize_decl_for_section (decl, reloc))
5964 case SECCAT_DATA:
5965 sname = ".ldata";
5966 break;
5967 case SECCAT_DATA_REL:
5968 sname = ".ldata.rel";
5969 break;
5970 case SECCAT_DATA_REL_LOCAL:
5971 sname = ".ldata.rel.local";
5972 break;
5973 case SECCAT_DATA_REL_RO:
5974 sname = ".ldata.rel.ro";
5975 break;
5976 case SECCAT_DATA_REL_RO_LOCAL:
5977 sname = ".ldata.rel.ro.local";
5978 break;
5979 case SECCAT_BSS:
5980 sname = ".lbss";
5981 flags |= SECTION_BSS;
5982 break;
5983 case SECCAT_RODATA:
5984 case SECCAT_RODATA_MERGE_STR:
5985 case SECCAT_RODATA_MERGE_STR_INIT:
5986 case SECCAT_RODATA_MERGE_CONST:
5987 sname = ".lrodata";
5988 flags &= ~SECTION_WRITE;
5989 break;
5990 case SECCAT_SRODATA:
5991 case SECCAT_SDATA:
5992 case SECCAT_SBSS:
5993 gcc_unreachable ();
5994 case SECCAT_TEXT:
5995 case SECCAT_TDATA:
5996 case SECCAT_TBSS:
5997 /* We don't split these for the medium model. Place them into
5998 default sections and hope for the best. */
5999 break;
6001 if (sname)
6003 /* We might get called with string constants, but get_named_section
6004 doesn't like them as they are not DECLs. Also, we need to set
6005 flags in that case. */
6006 if (!DECL_P (decl))
6007 return get_section (sname, flags, NULL);
6008 return get_named_section (decl, sname, reloc);
6011 return default_elf_select_section (decl, reloc, align);
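/* Usage note, for illustration only (hypothetical translation unit, not
   taken from this file): the .ldata/.lbss selection above only triggers
   for -mcmodel=medium (or medium PIC) objects bigger than
   -mlarge-data-threshold, as tested by ix86_in_large_data_p.  Compiled
   with "gcc -mcmodel=medium -c big.c" (default threshold 65536),

     static double big_table[1 << 20] = { 1.0 };

   is routed to .ldata, while smaller objects keep using .data/.bss.  */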
6014 /* Select a set of attributes for section NAME based on the properties
6015 of DECL and whether or not RELOC indicates that DECL's initializer
6016 might contain runtime relocations. */
6018 static unsigned int ATTRIBUTE_UNUSED
6019 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6021 unsigned int flags = default_section_type_flags (decl, name, reloc);
6023 if (ix86_in_large_data_p (decl))
6024 flags |= SECTION_LARGE;
6026 if (decl == NULL_TREE
6027 && (strcmp (name, ".ldata.rel.ro") == 0
6028 || strcmp (name, ".ldata.rel.ro.local") == 0))
6029 flags |= SECTION_RELRO;
6031 if (strcmp (name, ".lbss") == 0
6032 || strncmp (name, ".lbss.", 6) == 0
6033 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
6034 flags |= SECTION_BSS;
6036 return flags;
6039 /* Build up a unique section name, expressed as a
6040 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6041 RELOC indicates whether the initial value of EXP requires
6042 link-time relocations. */
6044 static void ATTRIBUTE_UNUSED
6045 x86_64_elf_unique_section (tree decl, int reloc)
6047 if (ix86_in_large_data_p (decl))
6049 const char *prefix = NULL;
6050 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6051 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6053 switch (categorize_decl_for_section (decl, reloc))
6055 case SECCAT_DATA:
6056 case SECCAT_DATA_REL:
6057 case SECCAT_DATA_REL_LOCAL:
6058 case SECCAT_DATA_REL_RO:
6059 case SECCAT_DATA_REL_RO_LOCAL:
6060 prefix = one_only ? ".ld" : ".ldata";
6061 break;
6062 case SECCAT_BSS:
6063 prefix = one_only ? ".lb" : ".lbss";
6064 break;
6065 case SECCAT_RODATA:
6066 case SECCAT_RODATA_MERGE_STR:
6067 case SECCAT_RODATA_MERGE_STR_INIT:
6068 case SECCAT_RODATA_MERGE_CONST:
6069 prefix = one_only ? ".lr" : ".lrodata";
6070 break;
6071 case SECCAT_SRODATA:
6072 case SECCAT_SDATA:
6073 case SECCAT_SBSS:
6074 gcc_unreachable ();
6075 case SECCAT_TEXT:
6076 case SECCAT_TDATA:
6077 case SECCAT_TBSS:
6078 /* We don't split these for the medium model. Place them into
6079 default sections and hope for the best. */
6080 break;
6082 if (prefix)
6084 const char *name, *linkonce;
6085 char *string;
6087 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6088 name = targetm.strip_name_encoding (name);
6090 /* If we're using one_only, then there needs to be a .gnu.linkonce
6091 prefix to the section name. */
6092 linkonce = one_only ? ".gnu.linkonce" : "";
6094 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6096 set_decl_section_name (decl, string);
6097 return;
6100 default_unique_section (decl, reloc);
6103 #ifdef COMMON_ASM_OP
6105 #ifndef LARGECOMM_SECTION_ASM_OP
6106 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6107 #endif
6109 /* This says how to output assembler code to declare an
6110 uninitialized external linkage data object.
6112 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive for
6113 large objects. */
6114 void
6115 x86_elf_aligned_decl_common (FILE *file, tree decl,
6116 const char *name, unsigned HOST_WIDE_INT size,
6117 int align)
6119 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6120 && size > (unsigned int)ix86_section_threshold)
6122 switch_to_section (get_named_section (decl, ".lbss", 0));
6123 fputs (LARGECOMM_SECTION_ASM_OP, file);
6125 else
6126 fputs (COMMON_ASM_OP, file);
6127 assemble_name (file, name);
6128 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6129 size, align / BITS_PER_UNIT);
6131 #endif
6133 /* Utility function for targets to use in implementing
6134 ASM_OUTPUT_ALIGNED_BSS. */
6136 void
6137 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6138 unsigned HOST_WIDE_INT size, int align)
6140 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6141 && size > (unsigned int)ix86_section_threshold)
6142 switch_to_section (get_named_section (decl, ".lbss", 0));
6143 else
6144 switch_to_section (bss_section);
6145 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6146 #ifdef ASM_DECLARE_OBJECT_NAME
6147 last_assemble_variable_decl = decl;
6148 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6149 #else
6150 /* The standard thing is just to output a label for the object. */
6151 ASM_OUTPUT_LABEL (file, name);
6152 #endif /* ASM_DECLARE_OBJECT_NAME */
6153 ASM_OUTPUT_SKIP (file, size ? size : 1);
6156 /* Decide whether we must probe the stack before any space allocation
6157 on this target. It's essentially TARGET_STACK_PROBE except when
6158 -fstack-check causes the stack to be already probed differently. */
6160 bool
6161 ix86_target_stack_probe (void)
6163 /* Do not probe the stack twice if static stack checking is enabled. */
6164 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6165 return false;
6167 return TARGET_STACK_PROBE;
6170 /* Decide whether we can make a sibling call to a function. DECL is the
6171 declaration of the function being targeted by the call and EXP is the
6172 CALL_EXPR representing the call. */
6174 static bool
6175 ix86_function_ok_for_sibcall (tree decl, tree exp)
6177 tree type, decl_or_type;
6178 rtx a, b;
6179 bool bind_global = decl && !targetm.binds_local_p (decl);
6181 if (ix86_function_naked (current_function_decl))
6182 return false;
6184 /* Sibling call isn't OK if there are no caller-saved registers
6185 since all registers must be preserved before return. */
6186 if (cfun->machine->no_caller_saved_registers)
6187 return false;
6189 /* If we are generating position-independent code, we cannot sibcall
6190 optimize direct calls to global functions, as the PLT requires
6191 %ebx be live. (Darwin does not have a PLT.) */
6192 if (!TARGET_MACHO
6193 && !TARGET_64BIT
6194 && flag_pic
6195 && flag_plt
6196 && bind_global)
6197 return false;
6199 /* If we need to align the outgoing stack, then sibcalling would
6200 unalign the stack, which may break the called function. */
6201 if (ix86_minimum_incoming_stack_boundary (true)
6202 < PREFERRED_STACK_BOUNDARY)
6203 return false;
6205 if (decl)
6207 decl_or_type = decl;
6208 type = TREE_TYPE (decl);
6210 else
6212 /* We're looking at the CALL_EXPR, we need the type of the function. */
6213 type = CALL_EXPR_FN (exp); /* pointer expression */
6214 type = TREE_TYPE (type); /* pointer type */
6215 type = TREE_TYPE (type); /* function type */
6216 decl_or_type = type;
6219 /* Check that the return value locations are the same. For example,
6220 if we are returning floats on the 80387 register stack, we cannot
6221 make a sibcall from a function that doesn't return a float to a
6222 function that does or, conversely, from a function that does return
6223 a float to a function that doesn't; the necessary stack adjustment
6224 would not be executed. This is also the place we notice
6225 differences in the return value ABI. Note that it is ok for one
6226 of the functions to have void return type as long as the return
6227 value of the other is passed in a register. */
6228 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6229 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6230 cfun->decl, false);
6231 if (STACK_REG_P (a) || STACK_REG_P (b))
6233 if (!rtx_equal_p (a, b))
6234 return false;
6236 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6238 else if (!rtx_equal_p (a, b))
6239 return false;
6241 if (TARGET_64BIT)
6243 /* The SYSV ABI has more call-clobbered registers;
6244 disallow sibcalls from MS to SYSV. */
6245 if (cfun->machine->call_abi == MS_ABI
6246 && ix86_function_type_abi (type) == SYSV_ABI)
6247 return false;
6249 else
6251 /* If this call is indirect, we'll need to be able to use a
6252 call-clobbered register for the address of the target function.
6253 Make sure that all such registers are not used for passing
6254 parameters. Note that DLLIMPORT functions and calls to global
6255 functions via the GOT slot are indirect. */
6256 if (!decl
6257 || (bind_global && flag_pic && !flag_plt)
6258 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
6260 /* Check if regparm >= 3 since arg_reg_available is set to
6261 false if regparm == 0. If regparm is 1 or 2, there is
6262 always a call-clobbered register available.
6264 ??? The symbol indirect call doesn't need a call-clobbered
6265 register. But we don't know if this is a symbol indirect
6266 call or not here. */
6267 if (ix86_function_regparm (type, NULL) >= 3
6268 && !cfun->machine->arg_reg_available)
6269 return false;
6273 /* Otherwise okay. That also includes certain types of indirect calls. */
6274 return true;
6277 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6278 and "sseregparm" calling convention attributes;
6279 arguments as in struct attribute_spec.handler. */
6281 static tree
6282 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6283 bool *no_add_attrs)
6285 if (TREE_CODE (*node) != FUNCTION_TYPE
6286 && TREE_CODE (*node) != METHOD_TYPE
6287 && TREE_CODE (*node) != FIELD_DECL
6288 && TREE_CODE (*node) != TYPE_DECL)
6290 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6291 name);
6292 *no_add_attrs = true;
6293 return NULL_TREE;
6296 /* Can combine regparm with all attributes but fastcall and thiscall. */
6297 if (is_attribute_p ("regparm", name))
6299 tree cst;
6301 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6303 error ("fastcall and regparm attributes are not compatible");
6306 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6308 error ("regparam and thiscall attributes are not compatible");
6311 cst = TREE_VALUE (args);
6312 if (TREE_CODE (cst) != INTEGER_CST)
6314 warning (OPT_Wattributes,
6315 "%qE attribute requires an integer constant argument",
6316 name);
6317 *no_add_attrs = true;
6319 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6321 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6322 name, REGPARM_MAX);
6323 *no_add_attrs = true;
6326 return NULL_TREE;
6329 if (TARGET_64BIT)
6331 /* Do not warn when emulating the MS ABI. */
6332 if ((TREE_CODE (*node) != FUNCTION_TYPE
6333 && TREE_CODE (*node) != METHOD_TYPE)
6334 || ix86_function_type_abi (*node) != MS_ABI)
6335 warning (OPT_Wattributes, "%qE attribute ignored",
6336 name);
6337 *no_add_attrs = true;
6338 return NULL_TREE;
6341 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6342 if (is_attribute_p ("fastcall", name))
6344 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6346 error ("fastcall and cdecl attributes are not compatible");
6348 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6350 error ("fastcall and stdcall attributes are not compatible");
6352 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6354 error ("fastcall and regparm attributes are not compatible");
6356 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6358 error ("fastcall and thiscall attributes are not compatible");
6362 /* Can combine stdcall with fastcall (redundant), regparm and
6363 sseregparm. */
6364 else if (is_attribute_p ("stdcall", name))
6366 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6368 error ("stdcall and cdecl attributes are not compatible");
6370 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6372 error ("stdcall and fastcall attributes are not compatible");
6374 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6376 error ("stdcall and thiscall attributes are not compatible");
6380 /* Can combine cdecl with regparm and sseregparm. */
6381 else if (is_attribute_p ("cdecl", name))
6383 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6385 error ("stdcall and cdecl attributes are not compatible");
6387 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6389 error ("fastcall and cdecl attributes are not compatible");
6391 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6393 error ("cdecl and thiscall attributes are not compatible");
6396 else if (is_attribute_p ("thiscall", name))
6398 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6399 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6400 name);
6401 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6403 error ("stdcall and thiscall attributes are not compatible");
6405 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6407 error ("fastcall and thiscall attributes are not compatible");
6409 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6411 error ("cdecl and thiscall attributes are not compatible");
6415 /* Can combine sseregparm with all attributes. */
6417 return NULL_TREE;
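/* Usage note, for illustration only (hypothetical declarations, not taken
   from this file): the checks above reject contradictory calling
   conventions on one declaration.

     void __attribute__((fastcall)) f1 (int, int);            (accepted)
     void __attribute__((stdcall, regparm (2))) f2 (int);     (accepted)
     void __attribute__((fastcall, cdecl)) f3 (int);          (rejected above)

   On 64-bit targets the stdcall, fastcall, cdecl and thiscall forms are
   simply ignored with a warning unless the MS ABI is being emulated.  */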
6420 /* The transactional memory builtins are implicitly regparm or fastcall
6421 depending on the ABI. Override the generic do-nothing attribute that
6422 these builtins were declared with, and replace it with one of the two
6423 attributes that we expect elsewhere. */
6425 static tree
6426 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6427 int flags, bool *no_add_attrs)
6429 tree alt;
6431 /* In no case do we want to add the placeholder attribute. */
6432 *no_add_attrs = true;
6434 /* The 64-bit ABI is unchanged for transactional memory. */
6435 if (TARGET_64BIT)
6436 return NULL_TREE;
6438 /* ??? Is there a better way to validate 32-bit windows? We have
6439 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6440 if (CHECK_STACK_LIMIT > 0)
6441 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6442 else
6444 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6445 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6447 decl_attributes (node, alt, flags);
6449 return NULL_TREE;
6452 /* This function determines the calling convention from TYPE. */
6454 unsigned int
6455 ix86_get_callcvt (const_tree type)
6457 unsigned int ret = 0;
6458 bool is_stdarg;
6459 tree attrs;
6461 if (TARGET_64BIT)
6462 return IX86_CALLCVT_CDECL;
6464 attrs = TYPE_ATTRIBUTES (type);
6465 if (attrs != NULL_TREE)
6467 if (lookup_attribute ("cdecl", attrs))
6468 ret |= IX86_CALLCVT_CDECL;
6469 else if (lookup_attribute ("stdcall", attrs))
6470 ret |= IX86_CALLCVT_STDCALL;
6471 else if (lookup_attribute ("fastcall", attrs))
6472 ret |= IX86_CALLCVT_FASTCALL;
6473 else if (lookup_attribute ("thiscall", attrs))
6474 ret |= IX86_CALLCVT_THISCALL;
6476 /* Regparm isn't allowed for thiscall and fastcall. */
6477 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6479 if (lookup_attribute ("regparm", attrs))
6480 ret |= IX86_CALLCVT_REGPARM;
6481 if (lookup_attribute ("sseregparm", attrs))
6482 ret |= IX86_CALLCVT_SSEREGPARM;
6485 if (IX86_BASE_CALLCVT(ret) != 0)
6486 return ret;
6489 is_stdarg = stdarg_p (type);
6490 if (TARGET_RTD && !is_stdarg)
6491 return IX86_CALLCVT_STDCALL | ret;
6493 if (ret != 0
6494 || is_stdarg
6495 || TREE_CODE (type) != METHOD_TYPE
6496 || ix86_function_type_abi (type) != MS_ABI)
6497 return IX86_CALLCVT_CDECL | ret;
6499 return IX86_CALLCVT_THISCALL;
6502 /* Return 0 if the attributes for two types are incompatible, 1 if they
6503 are compatible, and 2 if they are nearly compatible (which causes a
6504 warning to be generated). */
6506 static int
6507 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6509 unsigned int ccvt1, ccvt2;
6511 if (TREE_CODE (type1) != FUNCTION_TYPE
6512 && TREE_CODE (type1) != METHOD_TYPE)
6513 return 1;
6515 ccvt1 = ix86_get_callcvt (type1);
6516 ccvt2 = ix86_get_callcvt (type2);
6517 if (ccvt1 != ccvt2)
6518 return 0;
6519 if (ix86_function_regparm (type1, NULL)
6520 != ix86_function_regparm (type2, NULL))
6521 return 0;
6523 return 1;
6526 /* Return the regparm value for a function with the indicated TYPE and DECL.
6527 DECL may be NULL when calling function indirectly
6528 or considering a libcall. */
6530 static int
6531 ix86_function_regparm (const_tree type, const_tree decl)
6533 tree attr;
6534 int regparm;
6535 unsigned int ccvt;
6537 if (TARGET_64BIT)
6538 return (ix86_function_type_abi (type) == SYSV_ABI
6539 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6540 ccvt = ix86_get_callcvt (type);
6541 regparm = ix86_regparm;
6543 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6545 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6546 if (attr)
6548 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6549 return regparm;
6552 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6553 return 2;
6554 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6555 return 1;
6557 /* Use register calling convention for local functions when possible. */
6558 if (decl
6559 && TREE_CODE (decl) == FUNCTION_DECL)
6561 cgraph_node *target = cgraph_node::get (decl);
6562 if (target)
6563 target = target->function_symbol ();
6565 /* Caller and callee must agree on the calling convention, so
6566 checking just the optimize setting here would mean that with
6567 __attribute__((optimize (...))) the caller could use the regparm convention
6568 and the callee not, or vice versa. Instead look at whether the callee
6569 is optimized or not. */
6570 if (target && opt_for_fn (target->decl, optimize)
6571 && !(profile_flag && !flag_fentry))
6573 cgraph_local_info *i = &target->local;
6574 if (i && i->local && i->can_change_signature)
6576 int local_regparm, globals = 0, regno;
6578 /* Make sure no regparm register is taken by a
6579 fixed register variable. */
6580 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6581 local_regparm++)
6582 if (fixed_regs[local_regparm])
6583 break;
6585 /* We don't want to use regparm(3) for nested functions as
6586 these use a static chain pointer in the third argument. */
6587 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6588 local_regparm = 2;
6590 /* Save a register for the split stack. */
6591 if (flag_split_stack)
6593 if (local_regparm == 3)
6594 local_regparm = 2;
6595 else if (local_regparm == 2
6596 && DECL_STATIC_CHAIN (target->decl))
6597 local_regparm = 1;
6600 /* Each fixed register usage increases register pressure,
6601 so fewer registers should be used for argument passing.
6602 This functionality can be overridden by an explicit
6603 regparm value. */
6604 for (regno = AX_REG; regno <= DI_REG; regno++)
6605 if (fixed_regs[regno])
6606 globals++;
6608 local_regparm
6609 = globals < local_regparm ? local_regparm - globals : 0;
6611 if (local_regparm > regparm)
6612 regparm = local_regparm;
6617 return regparm;
6620 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6621 DFmode (2) arguments in SSE registers for a function with the
6622 indicated TYPE and DECL. DECL may be NULL when calling function
6623 indirectly or considering a libcall. Return -1 if any FP parameter
6624 should be rejected by error. This is used in siutation we imply SSE
6625 calling convetion but the function is called from another function with
6626 SSE disabled. Otherwise return 0. */
6628 static int
6629 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6631 gcc_assert (!TARGET_64BIT);
6633 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6634 by the sseregparm attribute. */
6635 if (TARGET_SSEREGPARM
6636 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6638 if (!TARGET_SSE)
6640 if (warn)
6642 if (decl)
6643 error ("calling %qD with attribute sseregparm without "
6644 "SSE/SSE2 enabled", decl);
6645 else
6646 error ("calling %qT with attribute sseregparm without "
6647 "SSE/SSE2 enabled", type);
6649 return 0;
6652 return 2;
6655 if (!decl)
6656 return 0;
6658 cgraph_node *target = cgraph_node::get (decl);
6659 if (target)
6660 target = target->function_symbol ();
6662 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6663 (and DFmode for SSE2) arguments in SSE registers. */
6664 if (target
6665 /* TARGET_SSE_MATH */
6666 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6667 && opt_for_fn (target->decl, optimize)
6668 && !(profile_flag && !flag_fentry))
6670 cgraph_local_info *i = &target->local;
6671 if (i && i->local && i->can_change_signature)
6673 /* Refuse to produce wrong code when local function with SSE enabled
6674 is called from SSE disabled function.
6675 FIXME: We need a way to detect these cases cross-ltrans partition
6676 and avoid using SSE calling conventions on local functions called
6677 from function with SSE disabled. For now at least delay the
6678 warning until we know we are going to produce wrong code.
6679 See PR66047 */
6680 if (!TARGET_SSE && warn)
6681 return -1;
6682 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6683 ->x_ix86_isa_flags) ? 2 : 1;
6687 return 0;
6690 /* Return true if EAX is live at the start of the function. Used by
6691 ix86_expand_prologue to determine if we need special help before
6692 calling allocate_stack_worker. */
6694 static bool
6695 ix86_eax_live_at_start_p (void)
6697 /* Cheat. Don't bother working forward from ix86_function_regparm
6698 to the function type to whether an actual argument is located in
6699 eax. Instead just look at cfg info, which is still close enough
6700 to correct at this point. This gives false positives for broken
6701 functions that might use uninitialized data that happens to be
6702 allocated in eax, but who cares? */
6703 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6706 static bool
6707 ix86_keep_aggregate_return_pointer (tree fntype)
6709 tree attr;
6711 if (!TARGET_64BIT)
6713 attr = lookup_attribute ("callee_pop_aggregate_return",
6714 TYPE_ATTRIBUTES (fntype));
6715 if (attr)
6716 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6718 /* For 32-bit MS-ABI the default is to keep aggregate
6719 return pointer. */
6720 if (ix86_function_type_abi (fntype) == MS_ABI)
6721 return true;
6723 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6726 /* Value is the number of bytes of arguments automatically
6727 popped when returning from a subroutine call.
6728 FUNDECL is the declaration node of the function (as a tree),
6729 FUNTYPE is the data type of the function (as a tree),
6730 or for a library call it is an identifier node for the subroutine name.
6731 SIZE is the number of bytes of arguments passed on the stack.
6733 On the 80386, the RTD insn may be used to pop them if the number
6734 of args is fixed, but if the number is variable then the caller
6735 must pop them all. RTD can't be used for library calls now
6736 because the library is compiled with the Unix compiler.
6737 Use of RTD is a selectable option, since it is incompatible with
6738 standard Unix calling sequences. If the option is not selected,
6739 the caller must always pop the args.
6741 The attribute stdcall is equivalent to RTD on a per module basis. */
6743 static poly_int64
6744 ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
6746 unsigned int ccvt;
6748 /* None of the 64-bit ABIs pop arguments. */
6749 if (TARGET_64BIT)
6750 return 0;
6752 ccvt = ix86_get_callcvt (funtype);
6754 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6755 | IX86_CALLCVT_THISCALL)) != 0
6756 && ! stdarg_p (funtype))
6757 return size;
6759 /* Lose any fake structure return argument if it is passed on the stack. */
6760 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6761 && !ix86_keep_aggregate_return_pointer (funtype))
6763 int nregs = ix86_function_regparm (funtype, fundecl);
6764 if (nregs == 0)
6765 return GET_MODE_SIZE (Pmode);
6768 return 0;
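/* Usage note, for illustration only (hypothetical declarations, not taken
   from this file): on 32-bit targets a stdcall function pops its own
   stack arguments, so the hook above returns SIZE for it.

     void __attribute__((stdcall)) f (int a, int b);   (callee pops 8 bytes)
     void g (int a, int b);                            (caller pops, cdecl)

   With -mrtd the stdcall behaviour becomes the default for non-variadic
   functions, as the TARGET_RTD test in ix86_get_callcvt shows.  */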
6771 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6773 static bool
6774 ix86_legitimate_combined_insn (rtx_insn *insn)
6776 int i;
6778 /* Check operand constraints in case hard registers were propagated
6779 into insn pattern. This check prevents combine pass from
6780 generating insn patterns with invalid hard register operands.
6781 These invalid insns can eventually confuse reload to error out
6782 with a spill failure. See also PRs 46829 and 46843. */
6784 gcc_assert (INSN_CODE (insn) >= 0);
6786 extract_insn (insn);
6787 preprocess_constraints (insn);
6789 int n_operands = recog_data.n_operands;
6790 int n_alternatives = recog_data.n_alternatives;
6791 for (i = 0; i < n_operands; i++)
6793 rtx op = recog_data.operand[i];
6794 machine_mode mode = GET_MODE (op);
6795 const operand_alternative *op_alt;
6796 int offset = 0;
6797 bool win;
6798 int j;
6800 /* A unary operator may be accepted by the predicate, but it
6801 is irrelevant for matching constraints. */
6802 if (UNARY_P (op))
6803 op = XEXP (op, 0);
6805 if (SUBREG_P (op))
6807 if (REG_P (SUBREG_REG (op))
6808 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6809 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6810 GET_MODE (SUBREG_REG (op)),
6811 SUBREG_BYTE (op),
6812 GET_MODE (op));
6813 op = SUBREG_REG (op);
6816 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6817 continue;
6819 op_alt = recog_op_alt;
6821 /* Operand has no constraints, anything is OK. */
6822 win = !n_alternatives;
6824 alternative_mask preferred = get_preferred_alternatives (insn);
6825 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6827 if (!TEST_BIT (preferred, j))
6828 continue;
6829 if (op_alt[i].anything_ok
6830 || (op_alt[i].matches != -1
6831 && operands_match_p
6832 (recog_data.operand[i],
6833 recog_data.operand[op_alt[i].matches]))
6834 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6836 win = true;
6837 break;
6841 if (!win)
6842 return false;
6845 return true;
6848 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
6850 static unsigned HOST_WIDE_INT
6851 ix86_asan_shadow_offset (void)
6853 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6854 : HOST_WIDE_INT_C (0x7fff8000))
6855 : (HOST_WIDE_INT_1 << 29);
6858 /* Argument support functions. */
6860 /* Return true when register may be used to pass function parameters. */
6861 bool
6862 ix86_function_arg_regno_p (int regno)
6864 int i;
6865 enum calling_abi call_abi;
6866 const int *parm_regs;
6868 if (TARGET_MPX && BND_REGNO_P (regno))
6869 return true;
6871 if (!TARGET_64BIT)
6873 if (TARGET_MACHO)
6874 return (regno < REGPARM_MAX
6875 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
6876 else
6877 return (regno < REGPARM_MAX
6878 || (TARGET_MMX && MMX_REGNO_P (regno)
6879 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
6880 || (TARGET_SSE && SSE_REGNO_P (regno)
6881 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
6884 if (TARGET_SSE && SSE_REGNO_P (regno)
6885 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
6886 return true;
6888 /* TODO: The function should depend on current function ABI but
6889 builtins.c would need updating then. Therefore we use the
6890 default ABI. */
6891 call_abi = ix86_cfun_abi ();
6893 /* RAX is used as hidden argument to va_arg functions. */
6894 if (call_abi == SYSV_ABI && regno == AX_REG)
6895 return true;
6897 if (call_abi == MS_ABI)
6898 parm_regs = x86_64_ms_abi_int_parameter_registers;
6899 else
6900 parm_regs = x86_64_int_parameter_registers;
6902 for (i = 0; i < (call_abi == MS_ABI
6903 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
6904 if (regno == parm_regs[i])
6905 return true;
6906 return false;
6909 /* Return true if we do not know how to pass TYPE solely in registers. */
6911 static bool
6912 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
6914 if (must_pass_in_stack_var_size_or_pad (mode, type))
6915 return true;
6917 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
6918 The layout_type routine is crafty and tries to trick us into passing
6919 currently unsupported vector types on the stack by using TImode. */
6920 return (!TARGET_64BIT && mode == TImode
6921 && type && TREE_CODE (type) != VECTOR_TYPE);
6924 /* Return the size, in bytes, of the area reserved for arguments passed
6925 in registers for the function represented by FNDECL, depending on the
6926 ABI format used. */
6927 int
6928 ix86_reg_parm_stack_space (const_tree fndecl)
6930 enum calling_abi call_abi = SYSV_ABI;
6931 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
6932 call_abi = ix86_function_abi (fndecl);
6933 else
6934 call_abi = ix86_function_type_abi (fndecl);
6935 if (TARGET_64BIT && call_abi == MS_ABI)
6936 return 32;
6937 return 0;
6940 /* We add this as a workaround in order to use libc_has_function
6941 hook in i386.md. */
6942 bool
6943 ix86_libc_has_function (enum function_class fn_class)
6945 return targetm.libc_has_function (fn_class);
6948 /* Returns value SYSV_ABI, MS_ABI dependent on fntype,
6949 specifying the call abi used. */
6950 enum calling_abi
6951 ix86_function_type_abi (const_tree fntype)
6953 enum calling_abi abi = ix86_abi;
6955 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
6956 return abi;
6958 if (abi == SYSV_ABI
6959 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
6961 static int warned;
6962 if (TARGET_X32 && !warned)
6964 error ("X32 does not support ms_abi attribute");
6965 warned = 1;
6968 abi = MS_ABI;
6970 else if (abi == MS_ABI
6971 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
6972 abi = SYSV_ABI;
6974 return abi;
6977 static enum calling_abi
6978 ix86_function_abi (const_tree fndecl)
6980 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
6983 /* Returns value SYSV_ABI, MS_ABI dependent on cfun,
6984 specifying the call abi used. */
6985 enum calling_abi
6986 ix86_cfun_abi (void)
6988 return cfun ? cfun->machine->call_abi : ix86_abi;
6991 static bool
6992 ix86_function_ms_hook_prologue (const_tree fn)
6994 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
6996 if (decl_function_context (fn) != NULL_TREE)
6997 error_at (DECL_SOURCE_LOCATION (fn),
6998 "ms_hook_prologue is not compatible with nested function");
6999 else
7000 return true;
7002 return false;
7005 static bool
7006 ix86_function_naked (const_tree fn)
7008 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7009 return true;
7011 return false;
7014 /* Write the extra assembler code needed to declare a function properly. */
7016 void
7017 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7018 tree decl)
7020 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7022 if (is_ms_hook)
7024 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7025 unsigned int filler_cc = 0xcccccccc;
7027 for (i = 0; i < filler_count; i += 4)
7028 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7031 #ifdef SUBTARGET_ASM_UNWIND_INIT
7032 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7033 #endif
7035 ASM_OUTPUT_LABEL (asm_out_file, fname);
7037 /* Output magic byte marker, if hot-patch attribute is set. */
7038 if (is_ms_hook)
7040 if (TARGET_64BIT)
7042 /* leaq [%rsp + 0], %rsp */
7043 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7044 asm_out_file);
7046 else
7048 /* movl.s %edi, %edi
7049 push %ebp
7050 movl.s %esp, %ebp */
7051 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
7056 /* Implementation of the call ABI switching target hook. For FNDECL,
7057 the specific call register sets are selected. See also
7058 ix86_conditional_register_usage for more details. */
7059 void
7060 ix86_call_abi_override (const_tree fndecl)
7062 cfun->machine->call_abi = ix86_function_abi (fndecl);
7065 /* Return 1 if pseudo register should be created and used to hold
7066 GOT address for PIC code. */
7067 bool
7068 ix86_use_pseudo_pic_reg (void)
7070 if ((TARGET_64BIT
7071 && (ix86_cmodel == CM_SMALL_PIC
7072 || TARGET_PECOFF))
7073 || !flag_pic)
7074 return false;
7075 return true;
7078 /* Initialize large model PIC register. */
7080 static void
7081 ix86_init_large_pic_reg (unsigned int tmp_regno)
7083 rtx_code_label *label;
7084 rtx tmp_reg;
7086 gcc_assert (Pmode == DImode);
7087 label = gen_label_rtx ();
7088 emit_label (label);
7089 LABEL_PRESERVE_P (label) = 1;
7090 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7091 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7092 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7093 label));
7094 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7095 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7096 pic_offset_table_rtx, tmp_reg));
7097 const char *name = LABEL_NAME (label);
7098 PUT_CODE (label, NOTE);
7099 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7100 NOTE_DELETED_LABEL_NAME (label) = name;
7103 /* Create and initialize PIC register if required. */
7104 static void
7105 ix86_init_pic_reg (void)
7107 edge entry_edge;
7108 rtx_insn *seq;
7110 if (!ix86_use_pseudo_pic_reg ())
7111 return;
7113 start_sequence ();
7115 if (TARGET_64BIT)
7117 if (ix86_cmodel == CM_LARGE_PIC)
7118 ix86_init_large_pic_reg (R11_REG);
7119 else
7120 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7122 else
7124 /* If there is a future mcount call in the function, it is more profitable
7125 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7126 rtx reg = crtl->profile
7127 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7128 : pic_offset_table_rtx;
7129 rtx_insn *insn = emit_insn (gen_set_got (reg));
7130 RTX_FRAME_RELATED_P (insn) = 1;
7131 if (crtl->profile)
7132 emit_move_insn (pic_offset_table_rtx, reg);
7133 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7136 seq = get_insns ();
7137 end_sequence ();
7139 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7140 insert_insn_on_edge (seq, entry_edge);
7141 commit_one_edge_insertion (entry_edge);
7144 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7145 for a call to a function whose data type is FNTYPE.
7146 For a library call, FNTYPE is 0. */
7148 void
7149 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7150 tree fntype, /* tree ptr for function decl */
7151 rtx libname, /* SYMBOL_REF of library name or 0 */
7152 tree fndecl,
7153 int caller)
7155 struct cgraph_local_info *i = NULL;
7156 struct cgraph_node *target = NULL;
7158 memset (cum, 0, sizeof (*cum));
7160 if (fndecl)
7162 target = cgraph_node::get (fndecl);
7163 if (target)
7165 target = target->function_symbol ();
7166 i = cgraph_node::local_info (target->decl);
7167 cum->call_abi = ix86_function_abi (target->decl);
7169 else
7170 cum->call_abi = ix86_function_abi (fndecl);
7172 else
7173 cum->call_abi = ix86_function_type_abi (fntype);
7175 cum->caller = caller;
7177 /* Set up the number of registers to use for passing arguments. */
7178 cum->nregs = ix86_regparm;
7179 if (TARGET_64BIT)
7181 cum->nregs = (cum->call_abi == SYSV_ABI
7182 ? X86_64_REGPARM_MAX
7183 : X86_64_MS_REGPARM_MAX);
7185 if (TARGET_SSE)
7187 cum->sse_nregs = SSE_REGPARM_MAX;
7188 if (TARGET_64BIT)
7190 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7191 ? X86_64_SSE_REGPARM_MAX
7192 : X86_64_MS_SSE_REGPARM_MAX);
7195 if (TARGET_MMX)
7196 cum->mmx_nregs = MMX_REGPARM_MAX;
7197 cum->warn_avx512f = true;
7198 cum->warn_avx = true;
7199 cum->warn_sse = true;
7200 cum->warn_mmx = true;
7202 /* Because the type might mismatch between caller and callee, we need to
7203 use actual type of function for local calls.
7204 FIXME: cgraph_analyze can be told to actually record if function uses
7205 va_start, so for local functions maybe_vaarg can be made more aggressive,
7206 helping K&R code.
7207 FIXME: once the type system is fixed, we won't need this code anymore. */
7208 if (i && i->local && i->can_change_signature)
7209 fntype = TREE_TYPE (target->decl);
7210 cum->stdarg = stdarg_p (fntype);
7211 cum->maybe_vaarg = (fntype
7212 ? (!prototype_p (fntype) || stdarg_p (fntype))
7213 : !libname);
7215 cum->bnd_regno = FIRST_BND_REG;
7216 cum->bnds_in_bt = 0;
7217 cum->force_bnd_pass = 0;
7218 cum->decl = fndecl;
7220 cum->warn_empty = !warn_abi || cum->stdarg;
7221 if (!cum->warn_empty && fntype)
7223 function_args_iterator iter;
7224 tree argtype;
7225 bool seen_empty_type = false;
7226 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7228 if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7229 break;
7230 if (TYPE_EMPTY_P (argtype))
7231 seen_empty_type = true;
7232 else if (seen_empty_type)
7234 cum->warn_empty = true;
7235 break;
7240 if (!TARGET_64BIT)
7242 /* If there are variable arguments, then we won't pass anything
7243 in registers in 32-bit mode. */
7244 if (stdarg_p (fntype))
7246 cum->nregs = 0;
7247 /* Since in 32-bit mode variable arguments are always passed on
7248 the stack, there is a scratch register available for an indirect
7249 sibcall. */
7250 cfun->machine->arg_reg_available = true;
7251 cum->sse_nregs = 0;
7252 cum->mmx_nregs = 0;
7253 cum->warn_avx512f = false;
7254 cum->warn_avx = false;
7255 cum->warn_sse = false;
7256 cum->warn_mmx = false;
7257 return;
7260 /* Use ecx and edx registers if function has fastcall attribute,
7261 else look for regparm information. */
7262 if (fntype)
7264 unsigned int ccvt = ix86_get_callcvt (fntype);
7265 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7267 cum->nregs = 1;
7268 cum->fastcall = 1; /* Same first register as in fastcall. */
7270 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7272 cum->nregs = 2;
7273 cum->fastcall = 1;
7275 else
7276 cum->nregs = ix86_function_regparm (fntype, fndecl);
7279 /* Set up the number of SSE registers used for passing SFmode
7280 and DFmode arguments. Warn for mismatching ABI. */
7281 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7284 cfun->machine->arg_reg_available = (cum->nregs > 0);
7287 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7288 But in the case of vector types, it is some vector mode.
7290 When we have only some of our vector ISA extensions enabled, then there
7291 are some modes for which vector_mode_supported_p is false. For these
7292 modes, the generic vector support in gcc will choose some non-vector mode
7293 in order to implement the type. By computing the natural mode, we'll
7294 select the proper ABI location for the operand and not depend on whatever
7295 the middle-end decides to do with these vector types.
7297 The middle-end can't deal with vector types > 16 bytes. In this
7298 case, we return the original mode and warn about the ABI change if
7299 CUM isn't NULL.
7301 If IN_RETURN is true, warn about the ABI change if the vector mode
7302 isn't available for the function return value. */
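/* For instance, given a hypothetical declaration

     typedef int v8si __attribute__ ((vector_size (32)));

   the natural mode is V8SImode.  If AVX is not enabled, the code below
   emits a -Wpsabi warning that the ABI changes and falls back to
   TYPE_MODE, since a 32-byte vector cannot live in a YMM register.  */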
7304 static machine_mode
7305 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7306 bool in_return)
7308 machine_mode mode = TYPE_MODE (type);
7310 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7312 HOST_WIDE_INT size = int_size_in_bytes (type);
7313 if ((size == 8 || size == 16 || size == 32 || size == 64)
7314 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7315 && TYPE_VECTOR_SUBPARTS (type) > 1)
7317 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7319 /* There are no XFmode vector modes. */
7320 if (innermode == XFmode)
7321 return mode;
7323 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7324 mode = MIN_MODE_VECTOR_FLOAT;
7325 else
7326 mode = MIN_MODE_VECTOR_INT;
7328 /* Get the mode which has this inner mode and number of units. */
7329 FOR_EACH_MODE_FROM (mode, mode)
7330 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7331 && GET_MODE_INNER (mode) == innermode)
7333 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7335 static bool warnedavx512f;
7336 static bool warnedavx512f_ret;
7338 if (cum && cum->warn_avx512f && !warnedavx512f)
7340 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7341 "without AVX512F enabled changes the ABI"))
7342 warnedavx512f = true;
7344 else if (in_return && !warnedavx512f_ret)
7346 if (warning (OPT_Wpsabi, "AVX512F vector return "
7347 "without AVX512F enabled changes the ABI"))
7348 warnedavx512f_ret = true;
7351 return TYPE_MODE (type);
7353 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7355 static bool warnedavx;
7356 static bool warnedavx_ret;
7358 if (cum && cum->warn_avx && !warnedavx)
7360 if (warning (OPT_Wpsabi, "AVX vector argument "
7361 "without AVX enabled changes the ABI"))
7362 warnedavx = true;
7364 else if (in_return && !warnedavx_ret)
7366 if (warning (OPT_Wpsabi, "AVX vector return "
7367 "without AVX enabled changes the ABI"))
7368 warnedavx_ret = true;
7371 return TYPE_MODE (type);
7373 else if (((size == 8 && TARGET_64BIT) || size == 16)
7374 && !TARGET_SSE
7375 && !TARGET_IAMCU)
7377 static bool warnedsse;
7378 static bool warnedsse_ret;
7380 if (cum && cum->warn_sse && !warnedsse)
7382 if (warning (OPT_Wpsabi, "SSE vector argument "
7383 "without SSE enabled changes the ABI"))
7384 warnedsse = true;
7386 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7388 if (warning (OPT_Wpsabi, "SSE vector return "
7389 "without SSE enabled changes the ABI"))
7390 warnedsse_ret = true;
7393 else if ((size == 8 && !TARGET_64BIT)
7394 && (!cfun
7395 || cfun->machine->func_type == TYPE_NORMAL)
7396 && !TARGET_MMX
7397 && !TARGET_IAMCU)
7399 static bool warnedmmx;
7400 static bool warnedmmx_ret;
7402 if (cum && cum->warn_mmx && !warnedmmx)
7404 if (warning (OPT_Wpsabi, "MMX vector argument "
7405 "without MMX enabled changes the ABI"))
7406 warnedmmx = true;
7408 else if (in_return && !warnedmmx_ret)
7410 if (warning (OPT_Wpsabi, "MMX vector return "
7411 "without MMX enabled changes the ABI"))
7412 warnedmmx_ret = true;
7415 return mode;
7418 gcc_unreachable ();
7422 return mode;
7425 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7426 this may not agree with the mode that the type system has chosen for the
7427 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7428 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7430 static rtx
7431 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7432 unsigned int regno)
7434 rtx tmp;
7436 if (orig_mode != BLKmode)
7437 tmp = gen_rtx_REG (orig_mode, regno);
7438 else
7440 tmp = gen_rtx_REG (mode, regno);
7441 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7442 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7445 return tmp;
7448 /* x86-64 register passing implementation. See the x86-64 ABI for details. The goal
7449 of this code is to classify each 8-byte chunk of an incoming argument by register
7450 class and assign registers accordingly. */
7452 /* Return the union class of CLASS1 and CLASS2.
7453 See the x86-64 PS ABI for details. */
7455 static enum x86_64_reg_class
7456 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7458 /* Rule #1: If both classes are equal, this is the resulting class. */
7459 if (class1 == class2)
7460 return class1;
7462 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7463 the other class. */
7464 if (class1 == X86_64_NO_CLASS)
7465 return class2;
7466 if (class2 == X86_64_NO_CLASS)
7467 return class1;
7469 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7470 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7471 return X86_64_MEMORY_CLASS;
7473 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7474 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7475 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7476 return X86_64_INTEGERSI_CLASS;
7477 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7478 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7479 return X86_64_INTEGER_CLASS;
7481 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7482 MEMORY is used. */
7483 if (class1 == X86_64_X87_CLASS
7484 || class1 == X86_64_X87UP_CLASS
7485 || class1 == X86_64_COMPLEX_X87_CLASS
7486 || class2 == X86_64_X87_CLASS
7487 || class2 == X86_64_X87UP_CLASS
7488 || class2 == X86_64_COMPLEX_X87_CLASS)
7489 return X86_64_MEMORY_CLASS;
7491 /* Rule #6: Otherwise class SSE is used. */
7492 return X86_64_SSE_CLASS;
7495 /* Classify the argument of type TYPE and mode MODE.
7496 CLASSES will be filled by the register class used to pass each word
7497 of the operand. The number of words is returned. In case the parameter
7498 should be passed in memory, 0 is returned. As a special case for zero
7499 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7501 BIT_OFFSET is used internally for handling records and specifies the
7502 offset in bits of the classified entity, modulo 512, to avoid overflow cases.
7504 See the x86-64 PS ABI for details. */
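/* Worked example (illustrative, not normative; the psABI is the
   reference): for a hypothetical

     struct s { double d; int i; };   -- 16 bytes, two eightbytes

   the first eightbyte (the double) gets an SSE class and the second
   (the int plus padding) an integer class, so classify_argument
   returns 2 and the struct is passed in one SSE register and one
   general-purpose register.  */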
7507 static int
7508 classify_argument (machine_mode mode, const_tree type,
7509 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7511 HOST_WIDE_INT bytes =
7512 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7513 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7515 /* Variable sized entities are always passed/returned in memory. */
7516 if (bytes < 0)
7517 return 0;
7519 if (mode != VOIDmode
7520 && targetm.calls.must_pass_in_stack (mode, type))
7521 return 0;
7523 if (type && AGGREGATE_TYPE_P (type))
7525 int i;
7526 tree field;
7527 enum x86_64_reg_class subclasses[MAX_CLASSES];
7529 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7530 if (bytes > 64)
7531 return 0;
7533 for (i = 0; i < words; i++)
7534 classes[i] = X86_64_NO_CLASS;
7536 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
7537 signal the memory class, so handle them as a special case. */
7538 if (!words)
7540 classes[0] = X86_64_NO_CLASS;
7541 return 1;
7544 /* Classify each field of record and merge classes. */
7545 switch (TREE_CODE (type))
7547 case RECORD_TYPE:
7548 /* And now merge the fields of structure. */
7549 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7551 if (TREE_CODE (field) == FIELD_DECL)
7553 int num;
7555 if (TREE_TYPE (field) == error_mark_node)
7556 continue;
7558 /* Bitfields are always classified as integer. Handle them
7559 early, since later code would consider them to be
7560 misaligned integers. */
7561 if (DECL_BIT_FIELD (field))
7563 for (i = (int_bit_position (field)
7564 + (bit_offset % 64)) / 8 / 8;
7565 i < ((int_bit_position (field) + (bit_offset % 64))
7566 + tree_to_shwi (DECL_SIZE (field))
7567 + 63) / 8 / 8; i++)
7568 classes[i] =
7569 merge_classes (X86_64_INTEGER_CLASS,
7570 classes[i]);
7572 else
7574 int pos;
7576 type = TREE_TYPE (field);
7578 /* A flexible array member is ignored. */
7579 if (TYPE_MODE (type) == BLKmode
7580 && TREE_CODE (type) == ARRAY_TYPE
7581 && TYPE_SIZE (type) == NULL_TREE
7582 && TYPE_DOMAIN (type) != NULL_TREE
7583 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7584 == NULL_TREE))
7586 static bool warned;
7588 if (!warned && warn_psabi)
7590 warned = true;
7591 inform (input_location,
7592 "the ABI of passing struct with"
7593 " a flexible array member has"
7594 " changed in GCC 4.4");
7596 continue;
7598 num = classify_argument (TYPE_MODE (type), type,
7599 subclasses,
7600 (int_bit_position (field)
7601 + bit_offset) % 512);
7602 if (!num)
7603 return 0;
7604 pos = (int_bit_position (field)
7605 + (bit_offset % 64)) / 8 / 8;
7606 for (i = 0; i < num && (i + pos) < words; i++)
7607 classes[i + pos] =
7608 merge_classes (subclasses[i], classes[i + pos]);
7612 break;
7614 case ARRAY_TYPE:
7615 /* Arrays are handled as small records. */
7617 int num;
7618 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7619 TREE_TYPE (type), subclasses, bit_offset);
7620 if (!num)
7621 return 0;
7623 /* The partial classes are now full classes. */
7624 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7625 subclasses[0] = X86_64_SSE_CLASS;
7626 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7627 && !((bit_offset % 64) == 0 && bytes == 4))
7628 subclasses[0] = X86_64_INTEGER_CLASS;
7630 for (i = 0; i < words; i++)
7631 classes[i] = subclasses[i % num];
7633 break;
7635 case UNION_TYPE:
7636 case QUAL_UNION_TYPE:
7637 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
7639 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7641 if (TREE_CODE (field) == FIELD_DECL)
7643 int num;
7645 if (TREE_TYPE (field) == error_mark_node)
7646 continue;
7648 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7649 TREE_TYPE (field), subclasses,
7650 bit_offset);
7651 if (!num)
7652 return 0;
7653 for (i = 0; i < num && i < words; i++)
7654 classes[i] = merge_classes (subclasses[i], classes[i]);
7657 break;
7659 default:
7660 gcc_unreachable ();
7663 if (words > 2)
7665 /* When the size is > 16 bytes, if the first class isn't
7666 X86_64_SSE_CLASS or any of the others isn't
7667 X86_64_SSEUP_CLASS, everything should be passed in
7668 memory. */
7669 if (classes[0] != X86_64_SSE_CLASS)
7670 return 0;
7672 for (i = 1; i < words; i++)
7673 if (classes[i] != X86_64_SSEUP_CLASS)
7674 return 0;
7677 /* Final merger cleanup. */
7678 for (i = 0; i < words; i++)
7680 /* If one class is MEMORY, everything should be passed in
7681 memory. */
7682 if (classes[i] == X86_64_MEMORY_CLASS)
7683 return 0;
7685 /* X86_64_SSEUP_CLASS should always be preceded by
7686 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7687 if (classes[i] == X86_64_SSEUP_CLASS
7688 && classes[i - 1] != X86_64_SSE_CLASS
7689 && classes[i - 1] != X86_64_SSEUP_CLASS)
7691 /* The first one should never be X86_64_SSEUP_CLASS. */
7692 gcc_assert (i != 0);
7693 classes[i] = X86_64_SSE_CLASS;
7696 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7697 everything should be passed in memory. */
7698 if (classes[i] == X86_64_X87UP_CLASS
7699 && (classes[i - 1] != X86_64_X87_CLASS))
7701 static bool warned;
7703 /* The first one should never be X86_64_X87UP_CLASS. */
7704 gcc_assert (i != 0);
7705 if (!warned && warn_psabi)
7707 warned = true;
7708 inform (input_location,
7709 "the ABI of passing union with long double"
7710 " has changed in GCC 4.4");
7712 return 0;
7715 return words;
7718 /* Compute the alignment needed. We align all types to their natural boundaries,
7719 with the exception of XFmode, which is aligned to 64 bits. */
7720 if (mode != VOIDmode && mode != BLKmode)
7722 int mode_alignment = GET_MODE_BITSIZE (mode);
7724 if (mode == XFmode)
7725 mode_alignment = 128;
7726 else if (mode == XCmode)
7727 mode_alignment = 256;
7728 if (COMPLEX_MODE_P (mode))
7729 mode_alignment /= 2;
7730 /* Misaligned fields are always returned in memory. */
7731 if (bit_offset % mode_alignment)
7732 return 0;
7735 /* For V1xx modes, just use the base mode. */
7736 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7737 && GET_MODE_UNIT_SIZE (mode) == bytes)
7738 mode = GET_MODE_INNER (mode);
7740 /* Classification of atomic types. */
7741 switch (mode)
7743 case E_SDmode:
7744 case E_DDmode:
7745 classes[0] = X86_64_SSE_CLASS;
7746 return 1;
7747 case E_TDmode:
7748 classes[0] = X86_64_SSE_CLASS;
7749 classes[1] = X86_64_SSEUP_CLASS;
7750 return 2;
7751 case E_DImode:
7752 case E_SImode:
7753 case E_HImode:
7754 case E_QImode:
7755 case E_CSImode:
7756 case E_CHImode:
7757 case E_CQImode:
7759 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7761 /* Analyze last 128 bits only. */
7762 size = (size - 1) & 0x7f;
7764 if (size < 32)
7766 classes[0] = X86_64_INTEGERSI_CLASS;
7767 return 1;
7769 else if (size < 64)
7771 classes[0] = X86_64_INTEGER_CLASS;
7772 return 1;
7774 else if (size < 64+32)
7776 classes[0] = X86_64_INTEGER_CLASS;
7777 classes[1] = X86_64_INTEGERSI_CLASS;
7778 return 2;
7780 else if (size < 64+64)
7782 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7783 return 2;
7785 else
7786 gcc_unreachable ();
7788 case E_CDImode:
7789 case E_TImode:
7790 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7791 return 2;
7792 case E_COImode:
7793 case E_OImode:
7794 /* OImode shouldn't be used directly. */
7795 gcc_unreachable ();
7796 case E_CTImode:
7797 return 0;
7798 case E_SFmode:
7799 if (!(bit_offset % 64))
7800 classes[0] = X86_64_SSESF_CLASS;
7801 else
7802 classes[0] = X86_64_SSE_CLASS;
7803 return 1;
7804 case E_DFmode:
7805 classes[0] = X86_64_SSEDF_CLASS;
7806 return 1;
7807 case E_XFmode:
7808 classes[0] = X86_64_X87_CLASS;
7809 classes[1] = X86_64_X87UP_CLASS;
7810 return 2;
7811 case E_TFmode:
7812 classes[0] = X86_64_SSE_CLASS;
7813 classes[1] = X86_64_SSEUP_CLASS;
7814 return 2;
7815 case E_SCmode:
7816 classes[0] = X86_64_SSE_CLASS;
7817 if (!(bit_offset % 64))
7818 return 1;
7819 else
7821 static bool warned;
7823 if (!warned && warn_psabi)
7825 warned = true;
7826 inform (input_location,
7827 "the ABI of passing structure with complex float"
7828 " member has changed in GCC 4.4");
7830 classes[1] = X86_64_SSESF_CLASS;
7831 return 2;
7833 case E_DCmode:
7834 classes[0] = X86_64_SSEDF_CLASS;
7835 classes[1] = X86_64_SSEDF_CLASS;
7836 return 2;
7837 case E_XCmode:
7838 classes[0] = X86_64_COMPLEX_X87_CLASS;
7839 return 1;
7840 case E_TCmode:
7841 /* This mode is larger than 16 bytes. */
7842 return 0;
7843 case E_V8SFmode:
7844 case E_V8SImode:
7845 case E_V32QImode:
7846 case E_V16HImode:
7847 case E_V4DFmode:
7848 case E_V4DImode:
7849 classes[0] = X86_64_SSE_CLASS;
7850 classes[1] = X86_64_SSEUP_CLASS;
7851 classes[2] = X86_64_SSEUP_CLASS;
7852 classes[3] = X86_64_SSEUP_CLASS;
7853 return 4;
7854 case E_V8DFmode:
7855 case E_V16SFmode:
7856 case E_V8DImode:
7857 case E_V16SImode:
7858 case E_V32HImode:
7859 case E_V64QImode:
7860 classes[0] = X86_64_SSE_CLASS;
7861 classes[1] = X86_64_SSEUP_CLASS;
7862 classes[2] = X86_64_SSEUP_CLASS;
7863 classes[3] = X86_64_SSEUP_CLASS;
7864 classes[4] = X86_64_SSEUP_CLASS;
7865 classes[5] = X86_64_SSEUP_CLASS;
7866 classes[6] = X86_64_SSEUP_CLASS;
7867 classes[7] = X86_64_SSEUP_CLASS;
7868 return 8;
7869 case E_V4SFmode:
7870 case E_V4SImode:
7871 case E_V16QImode:
7872 case E_V8HImode:
7873 case E_V2DFmode:
7874 case E_V2DImode:
7875 classes[0] = X86_64_SSE_CLASS;
7876 classes[1] = X86_64_SSEUP_CLASS;
7877 return 2;
7878 case E_V1TImode:
7879 case E_V1DImode:
7880 case E_V2SFmode:
7881 case E_V2SImode:
7882 case E_V4HImode:
7883 case E_V8QImode:
7884 classes[0] = X86_64_SSE_CLASS;
7885 return 1;
7886 case E_BLKmode:
7887 case E_VOIDmode:
7888 return 0;
7889 default:
7890 gcc_assert (VECTOR_MODE_P (mode));
7892 if (bytes > 16)
7893 return 0;
7895 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
7897 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
7898 classes[0] = X86_64_INTEGERSI_CLASS;
7899 else
7900 classes[0] = X86_64_INTEGER_CLASS;
7901 classes[1] = X86_64_INTEGER_CLASS;
7902 return 1 + (bytes > 8);
7906 /* Examine the argument and set the number of registers required in each
7907 class. Return true iff the parameter should be passed in memory. */
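/* For the 16-byte double-plus-int struct used as an example above (one
   SSE eightbyte followed by one integer eightbyte), examine_argument
   sets *int_nregs = 1 and *sse_nregs = 1 and returns false, so the
   argument goes in registers as long as enough of both kinds remain.
   (Illustrative only; the classes come from classify_argument.)  */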
7909 static bool
7910 examine_argument (machine_mode mode, const_tree type, int in_return,
7911 int *int_nregs, int *sse_nregs)
7913 enum x86_64_reg_class regclass[MAX_CLASSES];
7914 int n = classify_argument (mode, type, regclass, 0);
7916 *int_nregs = 0;
7917 *sse_nregs = 0;
7919 if (!n)
7920 return true;
7921 for (n--; n >= 0; n--)
7922 switch (regclass[n])
7924 case X86_64_INTEGER_CLASS:
7925 case X86_64_INTEGERSI_CLASS:
7926 (*int_nregs)++;
7927 break;
7928 case X86_64_SSE_CLASS:
7929 case X86_64_SSESF_CLASS:
7930 case X86_64_SSEDF_CLASS:
7931 (*sse_nregs)++;
7932 break;
7933 case X86_64_NO_CLASS:
7934 case X86_64_SSEUP_CLASS:
7935 break;
7936 case X86_64_X87_CLASS:
7937 case X86_64_X87UP_CLASS:
7938 case X86_64_COMPLEX_X87_CLASS:
7939 if (!in_return)
7940 return true;
7941 break;
7942 case X86_64_MEMORY_CLASS:
7943 gcc_unreachable ();
7946 return false;
7949 /* Construct a container for the argument as used by the GCC calling
7950 interface. See FUNCTION_ARG for the detailed description. */
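/* Rough sketch (illustrative only) of the result for a _Complex double
   argument, whose eightbytes classify as SSEDF, SSEDF:

     (parallel:DC [(expr_list (reg:DF xmm0) (const_int 0))
                   (expr_list (reg:DF xmm1) (const_int 8))])

   i.e. the real part in the first free SSE register at offset 0 and the
   imaginary part in the next one at offset 8.  The exact registers
   depend on SSE_REGNO and the remaining register counts.  */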
7952 static rtx
7953 construct_container (machine_mode mode, machine_mode orig_mode,
7954 const_tree type, int in_return, int nintregs, int nsseregs,
7955 const int *intreg, int sse_regno)
7957 /* The following variables hold the static issued_error state. */
7958 static bool issued_sse_arg_error;
7959 static bool issued_sse_ret_error;
7960 static bool issued_x87_ret_error;
7962 machine_mode tmpmode;
7963 int bytes =
7964 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7965 enum x86_64_reg_class regclass[MAX_CLASSES];
7966 int n;
7967 int i;
7968 int nexps = 0;
7969 int needed_sseregs, needed_intregs;
7970 rtx exp[MAX_CLASSES];
7971 rtx ret;
7973 n = classify_argument (mode, type, regclass, 0);
7974 if (!n)
7975 return NULL;
7976 if (examine_argument (mode, type, in_return, &needed_intregs,
7977 &needed_sseregs))
7978 return NULL;
7979 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
7980 return NULL;
7982 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
7983 some less clueful developer tries to use floating-point anyway. */
7984 if (needed_sseregs && !TARGET_SSE)
7986 if (in_return)
7988 if (!issued_sse_ret_error)
7990 error ("SSE register return with SSE disabled");
7991 issued_sse_ret_error = true;
7994 else if (!issued_sse_arg_error)
7996 error ("SSE register argument with SSE disabled");
7997 issued_sse_arg_error = true;
7999 return NULL;
8002 /* Likewise, error if the ABI requires us to return values in the
8003 x87 registers and the user specified -mno-80387. */
8004 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
8005 for (i = 0; i < n; i++)
8006 if (regclass[i] == X86_64_X87_CLASS
8007 || regclass[i] == X86_64_X87UP_CLASS
8008 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8010 if (!issued_x87_ret_error)
8012 error ("x87 register return with x87 disabled");
8013 issued_x87_ret_error = true;
8015 return NULL;
8018 /* First construct the simple cases. Avoid SCmode, since we want to use
8019 a single register to pass this type. */
8020 if (n == 1 && mode != SCmode)
8021 switch (regclass[0])
8023 case X86_64_INTEGER_CLASS:
8024 case X86_64_INTEGERSI_CLASS:
8025 return gen_rtx_REG (mode, intreg[0]);
8026 case X86_64_SSE_CLASS:
8027 case X86_64_SSESF_CLASS:
8028 case X86_64_SSEDF_CLASS:
8029 if (mode != BLKmode)
8030 return gen_reg_or_parallel (mode, orig_mode,
8031 SSE_REGNO (sse_regno));
8032 break;
8033 case X86_64_X87_CLASS:
8034 case X86_64_COMPLEX_X87_CLASS:
8035 return gen_rtx_REG (mode, FIRST_STACK_REG);
8036 case X86_64_NO_CLASS:
8037 /* Zero sized array, struct or class. */
8038 return NULL;
8039 default:
8040 gcc_unreachable ();
8042 if (n == 2
8043 && regclass[0] == X86_64_SSE_CLASS
8044 && regclass[1] == X86_64_SSEUP_CLASS
8045 && mode != BLKmode)
8046 return gen_reg_or_parallel (mode, orig_mode,
8047 SSE_REGNO (sse_regno));
8048 if (n == 4
8049 && regclass[0] == X86_64_SSE_CLASS
8050 && regclass[1] == X86_64_SSEUP_CLASS
8051 && regclass[2] == X86_64_SSEUP_CLASS
8052 && regclass[3] == X86_64_SSEUP_CLASS
8053 && mode != BLKmode)
8054 return gen_reg_or_parallel (mode, orig_mode,
8055 SSE_REGNO (sse_regno));
8056 if (n == 8
8057 && regclass[0] == X86_64_SSE_CLASS
8058 && regclass[1] == X86_64_SSEUP_CLASS
8059 && regclass[2] == X86_64_SSEUP_CLASS
8060 && regclass[3] == X86_64_SSEUP_CLASS
8061 && regclass[4] == X86_64_SSEUP_CLASS
8062 && regclass[5] == X86_64_SSEUP_CLASS
8063 && regclass[6] == X86_64_SSEUP_CLASS
8064 && regclass[7] == X86_64_SSEUP_CLASS
8065 && mode != BLKmode)
8066 return gen_reg_or_parallel (mode, orig_mode,
8067 SSE_REGNO (sse_regno));
8068 if (n == 2
8069 && regclass[0] == X86_64_X87_CLASS
8070 && regclass[1] == X86_64_X87UP_CLASS)
8071 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8073 if (n == 2
8074 && regclass[0] == X86_64_INTEGER_CLASS
8075 && regclass[1] == X86_64_INTEGER_CLASS
8076 && (mode == CDImode || mode == TImode)
8077 && intreg[0] + 1 == intreg[1])
8078 return gen_rtx_REG (mode, intreg[0]);
8080 /* Otherwise figure out the entries of the PARALLEL. */
8081 for (i = 0; i < n; i++)
8083 int pos;
8085 switch (regclass[i])
8087 case X86_64_NO_CLASS:
8088 break;
8089 case X86_64_INTEGER_CLASS:
8090 case X86_64_INTEGERSI_CLASS:
8091 /* Merge TImodes on aligned occasions here too. */
8092 if (i * 8 + 8 > bytes)
8094 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8095 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8096 /* We've requested a size (such as 24 bits) for
8097 which there is no integer mode. Use DImode. */
8098 tmpmode = DImode;
8100 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8101 tmpmode = SImode;
8102 else
8103 tmpmode = DImode;
8104 exp [nexps++]
8105 = gen_rtx_EXPR_LIST (VOIDmode,
8106 gen_rtx_REG (tmpmode, *intreg),
8107 GEN_INT (i*8));
8108 intreg++;
8109 break;
8110 case X86_64_SSESF_CLASS:
8111 exp [nexps++]
8112 = gen_rtx_EXPR_LIST (VOIDmode,
8113 gen_rtx_REG (SFmode,
8114 SSE_REGNO (sse_regno)),
8115 GEN_INT (i*8));
8116 sse_regno++;
8117 break;
8118 case X86_64_SSEDF_CLASS:
8119 exp [nexps++]
8120 = gen_rtx_EXPR_LIST (VOIDmode,
8121 gen_rtx_REG (DFmode,
8122 SSE_REGNO (sse_regno)),
8123 GEN_INT (i*8));
8124 sse_regno++;
8125 break;
8126 case X86_64_SSE_CLASS:
8127 pos = i;
8128 switch (n)
8130 case 1:
8131 tmpmode = DImode;
8132 break;
8133 case 2:
8134 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8136 tmpmode = TImode;
8137 i++;
8139 else
8140 tmpmode = DImode;
8141 break;
8142 case 4:
8143 gcc_assert (i == 0
8144 && regclass[1] == X86_64_SSEUP_CLASS
8145 && regclass[2] == X86_64_SSEUP_CLASS
8146 && regclass[3] == X86_64_SSEUP_CLASS);
8147 tmpmode = OImode;
8148 i += 3;
8149 break;
8150 case 8:
8151 gcc_assert (i == 0
8152 && regclass[1] == X86_64_SSEUP_CLASS
8153 && regclass[2] == X86_64_SSEUP_CLASS
8154 && regclass[3] == X86_64_SSEUP_CLASS
8155 && regclass[4] == X86_64_SSEUP_CLASS
8156 && regclass[5] == X86_64_SSEUP_CLASS
8157 && regclass[6] == X86_64_SSEUP_CLASS
8158 && regclass[7] == X86_64_SSEUP_CLASS);
8159 tmpmode = XImode;
8160 i += 7;
8161 break;
8162 default:
8163 gcc_unreachable ();
8165 exp [nexps++]
8166 = gen_rtx_EXPR_LIST (VOIDmode,
8167 gen_rtx_REG (tmpmode,
8168 SSE_REGNO (sse_regno)),
8169 GEN_INT (pos*8));
8170 sse_regno++;
8171 break;
8172 default:
8173 gcc_unreachable ();
8177 /* Empty aligned struct, union or class. */
8178 if (nexps == 0)
8179 return NULL;
8181 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8182 for (i = 0; i < nexps; i++)
8183 XVECEXP (ret, 0, i) = exp [i];
8184 return ret;
8187 /* Update the data in CUM to advance over an argument of mode MODE
8188 and data type TYPE. (TYPE is null for libcalls where that information
8189 may not be available.)
8191 Return the number of integer registers advanced over. */
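/* Example (illustrative): with -mregparm=3 on ia32, an int argument
   takes the pass_in_reg path below; cum->nregs drops from 3 to 2,
   cum->regno advances by one, and one integer register is reported as
   consumed.  */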
8193 static int
8194 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8195 const_tree type, HOST_WIDE_INT bytes,
8196 HOST_WIDE_INT words)
8198 int res = 0;
8199 bool error_p = false;
8201 if (TARGET_IAMCU)
8203 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8204 bytes in registers. */
8205 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8206 goto pass_in_reg;
8207 return res;
8210 switch (mode)
8212 default:
8213 break;
8215 case E_BLKmode:
8216 if (bytes < 0)
8217 break;
8218 /* FALLTHRU */
8220 case E_DImode:
8221 case E_SImode:
8222 case E_HImode:
8223 case E_QImode:
8224 pass_in_reg:
8225 cum->words += words;
8226 cum->nregs -= words;
8227 cum->regno += words;
8228 if (cum->nregs >= 0)
8229 res = words;
8230 if (cum->nregs <= 0)
8232 cum->nregs = 0;
8233 cfun->machine->arg_reg_available = false;
8234 cum->regno = 0;
8236 break;
8238 case E_OImode:
8239 /* OImode shouldn't be used directly. */
8240 gcc_unreachable ();
8242 case E_DFmode:
8243 if (cum->float_in_sse == -1)
8244 error_p = true;
8245 if (cum->float_in_sse < 2)
8246 break;
8247 /* FALLTHRU */
8248 case E_SFmode:
8249 if (cum->float_in_sse == -1)
8250 error_p = true;
8251 if (cum->float_in_sse < 1)
8252 break;
8253 /* FALLTHRU */
8255 case E_V8SFmode:
8256 case E_V8SImode:
8257 case E_V64QImode:
8258 case E_V32HImode:
8259 case E_V16SImode:
8260 case E_V8DImode:
8261 case E_V16SFmode:
8262 case E_V8DFmode:
8263 case E_V32QImode:
8264 case E_V16HImode:
8265 case E_V4DFmode:
8266 case E_V4DImode:
8267 case E_TImode:
8268 case E_V16QImode:
8269 case E_V8HImode:
8270 case E_V4SImode:
8271 case E_V2DImode:
8272 case E_V4SFmode:
8273 case E_V2DFmode:
8274 if (!type || !AGGREGATE_TYPE_P (type))
8276 cum->sse_words += words;
8277 cum->sse_nregs -= 1;
8278 cum->sse_regno += 1;
8279 if (cum->sse_nregs <= 0)
8281 cum->sse_nregs = 0;
8282 cum->sse_regno = 0;
8285 break;
8287 case E_V8QImode:
8288 case E_V4HImode:
8289 case E_V2SImode:
8290 case E_V2SFmode:
8291 case E_V1TImode:
8292 case E_V1DImode:
8293 if (!type || !AGGREGATE_TYPE_P (type))
8295 cum->mmx_words += words;
8296 cum->mmx_nregs -= 1;
8297 cum->mmx_regno += 1;
8298 if (cum->mmx_nregs <= 0)
8300 cum->mmx_nregs = 0;
8301 cum->mmx_regno = 0;
8304 break;
8306 if (error_p)
8308 cum->float_in_sse = 0;
8309 error ("calling %qD with SSE calling convention without "
8310 "SSE/SSE2 enabled", cum->decl);
8311 sorry ("this is a GCC bug that can be worked around by adding "
8312 "attribute used to function called");
8315 return res;
8318 static int
8319 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8320 const_tree type, HOST_WIDE_INT words, bool named)
8322 int int_nregs, sse_nregs;
8324 /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack. */
8325 if (!named && (VALID_AVX512F_REG_MODE (mode)
8326 || VALID_AVX256_REG_MODE (mode)))
8327 return 0;
8329 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8330 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8332 cum->nregs -= int_nregs;
8333 cum->sse_nregs -= sse_nregs;
8334 cum->regno += int_nregs;
8335 cum->sse_regno += sse_nregs;
8336 return int_nregs;
8338 else
8340 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8341 cum->words = ROUND_UP (cum->words, align);
8342 cum->words += words;
8343 return 0;
8347 static int
8348 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8349 HOST_WIDE_INT words)
8351 /* Otherwise, this should be passed indirectly. */
8352 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8354 cum->words += words;
8355 if (cum->nregs > 0)
8357 cum->nregs -= 1;
8358 cum->regno += 1;
8359 return 1;
8361 return 0;
8364 /* Update the data in CUM to advance over an argument of mode MODE and
8365 data type TYPE. (TYPE is null for libcalls where that information
8366 may not be available.) */
8368 static void
8369 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8370 const_tree type, bool named)
8372 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8373 HOST_WIDE_INT bytes, words;
8374 int nregs;
8376 /* The argument of an interrupt handler is a special case and is
8377 handled in ix86_function_arg. */
8378 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8379 return;
8381 if (mode == BLKmode)
8382 bytes = int_size_in_bytes (type);
8383 else
8384 bytes = GET_MODE_SIZE (mode);
8385 words = CEIL (bytes, UNITS_PER_WORD);
8387 if (type)
8388 mode = type_natural_mode (type, NULL, false);
8390 if ((type && POINTER_BOUNDS_TYPE_P (type))
8391 || POINTER_BOUNDS_MODE_P (mode))
8393 /* If we pass bounds in BT then just update the remaining bounds count. */
8394 if (cum->bnds_in_bt)
8396 cum->bnds_in_bt--;
8397 return;
8400 /* Update the remaining number of bounds to force. */
8401 if (cum->force_bnd_pass)
8402 cum->force_bnd_pass--;
8404 cum->bnd_regno++;
8406 return;
8409 /* The first arg not going to Bounds Tables resets this counter. */
8410 cum->bnds_in_bt = 0;
8411 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
8412 the passed and received types do not match. If bounds do not follow an
8413 unnamed arg, still pretend the required number of bounds were passed. */
8414 if (cum->force_bnd_pass)
8416 cum->bnd_regno += cum->force_bnd_pass;
8417 cum->force_bnd_pass = 0;
8420 if (TARGET_64BIT)
8422 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8424 if (call_abi == MS_ABI)
8425 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8426 else
8427 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8429 else
8430 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8432 /* For stdarg we expect bounds to be passed for each value passed
8433 in register. */
8434 if (cum->stdarg)
8435 cum->force_bnd_pass = nregs;
8436 /* For pointers passed in memory we expect bounds passed in Bounds
8437 Table. */
8438 if (!nregs)
8440 /* Track if there are outgoing arguments on stack. */
8441 if (cum->caller)
8442 cfun->machine->outgoing_args_on_stack = true;
8444 cum->bnds_in_bt = chkp_type_bounds_count (type);
8448 /* Define where to put the arguments to a function.
8449 Value is zero to push the argument on the stack,
8450 or a hard register in which to store the argument.
8452 MODE is the argument's machine mode.
8453 TYPE is the data type of the argument (as a tree).
8454 This is null for libcalls where that information may
8455 not be available.
8456 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8457 the preceding args and about the function being called.
8458 NAMED is nonzero if this argument is a named parameter
8459 (otherwise it is an extra parameter matching an ellipsis). */
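/* Example (illustrative): for a hypothetical

     int __attribute__ ((fastcall)) f (int a, int b, int c);

   the 32-bit code below puts A in %ecx and B in %edx (cum->fastcall
   with two register slots), while C, the third argument, goes on the
   stack.  Aggregates and DImode values never take this path.  */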
8461 static rtx
8462 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8463 machine_mode orig_mode, const_tree type,
8464 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8466 bool error_p = false;
8468 /* Avoid the AL settings for the Unix64 ABI. */
8469 if (mode == VOIDmode)
8470 return constm1_rtx;
8472 if (TARGET_IAMCU)
8474 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8475 bytes in registers. */
8476 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8477 goto pass_in_reg;
8478 return NULL_RTX;
8481 switch (mode)
8483 default:
8484 break;
8486 case E_BLKmode:
8487 if (bytes < 0)
8488 break;
8489 /* FALLTHRU */
8490 case E_DImode:
8491 case E_SImode:
8492 case E_HImode:
8493 case E_QImode:
8494 pass_in_reg:
8495 if (words <= cum->nregs)
8497 int regno = cum->regno;
8499 /* Fastcall allocates the first two DWORD (SImode) or
8500 smaller arguments to ECX and EDX if the argument isn't
8501 an aggregate type. */
8502 if (cum->fastcall)
8504 if (mode == BLKmode
8505 || mode == DImode
8506 || (type && AGGREGATE_TYPE_P (type)))
8507 break;
8509 /* ECX, not EAX, is the first allocated register. */
8510 if (regno == AX_REG)
8511 regno = CX_REG;
8513 return gen_rtx_REG (mode, regno);
8515 break;
8517 case E_DFmode:
8518 if (cum->float_in_sse == -1)
8519 error_p = true;
8520 if (cum->float_in_sse < 2)
8521 break;
8522 /* FALLTHRU */
8523 case E_SFmode:
8524 if (cum->float_in_sse == -1)
8525 error_p = true;
8526 if (cum->float_in_sse < 1)
8527 break;
8528 /* FALLTHRU */
8529 case E_TImode:
8530 /* In 32-bit mode, we pass TImode in xmm registers. */
8531 case E_V16QImode:
8532 case E_V8HImode:
8533 case E_V4SImode:
8534 case E_V2DImode:
8535 case E_V4SFmode:
8536 case E_V2DFmode:
8537 if (!type || !AGGREGATE_TYPE_P (type))
8539 if (cum->sse_nregs)
8540 return gen_reg_or_parallel (mode, orig_mode,
8541 cum->sse_regno + FIRST_SSE_REG);
8543 break;
8545 case E_OImode:
8546 case E_XImode:
8547 /* OImode and XImode shouldn't be used directly. */
8548 gcc_unreachable ();
8550 case E_V64QImode:
8551 case E_V32HImode:
8552 case E_V16SImode:
8553 case E_V8DImode:
8554 case E_V16SFmode:
8555 case E_V8DFmode:
8556 case E_V8SFmode:
8557 case E_V8SImode:
8558 case E_V32QImode:
8559 case E_V16HImode:
8560 case E_V4DFmode:
8561 case E_V4DImode:
8562 if (!type || !AGGREGATE_TYPE_P (type))
8564 if (cum->sse_nregs)
8565 return gen_reg_or_parallel (mode, orig_mode,
8566 cum->sse_regno + FIRST_SSE_REG);
8568 break;
8570 case E_V8QImode:
8571 case E_V4HImode:
8572 case E_V2SImode:
8573 case E_V2SFmode:
8574 case E_V1TImode:
8575 case E_V1DImode:
8576 if (!type || !AGGREGATE_TYPE_P (type))
8578 if (cum->mmx_nregs)
8579 return gen_reg_or_parallel (mode, orig_mode,
8580 cum->mmx_regno + FIRST_MMX_REG);
8582 break;
8584 if (error_p)
8586 cum->float_in_sse = 0;
8587 error ("calling %qD with SSE calling convention without "
8588 "SSE/SSE2 enabled", cum->decl);
8589 sorry ("this is a GCC bug that can be worked around by adding "
8590 "attribute used to function called");
8593 return NULL_RTX;
8596 static rtx
8597 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8598 machine_mode orig_mode, const_tree type, bool named)
8600 /* Handle a hidden AL argument containing number of registers
8601 for varargs x86-64 functions. */
8602 if (mode == VOIDmode)
8603 return GEN_INT (cum->maybe_vaarg
8604 ? (cum->sse_nregs < 0
8605 ? X86_64_SSE_REGPARM_MAX
8606 : cum->sse_regno)
8607 : -1);
8609 switch (mode)
8611 default:
8612 break;
8614 case E_V8SFmode:
8615 case E_V8SImode:
8616 case E_V32QImode:
8617 case E_V16HImode:
8618 case E_V4DFmode:
8619 case E_V4DImode:
8620 case E_V16SFmode:
8621 case E_V16SImode:
8622 case E_V64QImode:
8623 case E_V32HImode:
8624 case E_V8DFmode:
8625 case E_V8DImode:
8626 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
8627 if (!named)
8628 return NULL;
8629 break;
8632 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8633 cum->sse_nregs,
8634 &x86_64_int_parameter_registers [cum->regno],
8635 cum->sse_regno);
8638 static rtx
8639 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8640 machine_mode orig_mode, bool named,
8641 HOST_WIDE_INT bytes)
8643 unsigned int regno;
8645 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
8646 We use the value -2 to specify that the current function call is MS_ABI. */
8647 if (mode == VOIDmode)
8648 return GEN_INT (-2);
8650 /* If we've run out of registers, it goes on the stack. */
8651 if (cum->nregs == 0)
8652 return NULL_RTX;
8654 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8656 /* Only floating point modes are passed in anything but integer regs. */
8657 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8659 if (named)
8660 regno = cum->regno + FIRST_SSE_REG;
8661 else
8663 rtx t1, t2;
8665 /* Unnamed floating-point parameters are passed in both the
8666 SSE and integer registers. */
8667 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8668 t2 = gen_rtx_REG (mode, regno);
8669 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8670 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8671 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8674 /* Handle aggregate types passed in registers. */
8675 if (orig_mode == BLKmode)
8677 if (bytes > 0 && bytes <= 8)
8678 mode = (bytes > 4 ? DImode : SImode);
8679 if (mode == BLKmode)
8680 mode = DImode;
8683 return gen_reg_or_parallel (mode, orig_mode, regno);
8686 /* Return where to put the arguments to a function.
8687 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8689 MODE is the argument's machine mode. TYPE is the data type of the
8690 argument. It is null for libcalls where that information may not be
8691 available. CUM gives information about the preceding args and about
8692 the function being called. NAMED is nonzero if this argument is a
8693 named parameter (otherwise it is an extra parameter matching an
8694 ellipsis). */
8696 static rtx
8697 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8698 const_tree type, bool named)
8700 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8701 machine_mode mode = omode;
8702 HOST_WIDE_INT bytes, words;
8703 rtx arg;
8705 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8707 gcc_assert (type != NULL_TREE);
8708 if (POINTER_TYPE_P (type))
8710 /* This is the pointer argument. */
8711 gcc_assert (TYPE_MODE (type) == Pmode);
8712 /* It is at -WORD(AP) in the current frame in interrupt and
8713 exception handlers. */
8714 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8716 else
8718 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8719 && TREE_CODE (type) == INTEGER_TYPE
8720 && TYPE_MODE (type) == word_mode);
8721 /* The error code is the word-mode integer argument at
8722 -2 * WORD(AP) in the current frame of the exception
8723 handler. */
8724 arg = gen_rtx_MEM (word_mode,
8725 plus_constant (Pmode,
8726 arg_pointer_rtx,
8727 -2 * UNITS_PER_WORD));
8729 return arg;
8732 /* All pointer bounds arguments are handled separately here. */
8733 if ((type && POINTER_BOUNDS_TYPE_P (type))
8734 || POINTER_BOUNDS_MODE_P (mode))
8736 /* Return NULL if bounds are forced to go in Bounds Table. */
8737 if (cum->bnds_in_bt)
8738 arg = NULL;
8739 /* Return the next available bound reg if any. */
8740 else if (cum->bnd_regno <= LAST_BND_REG)
8741 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8742 /* Return the next special slot number otherwise. */
8743 else
8744 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8746 return arg;
8749 if (mode == BLKmode)
8750 bytes = int_size_in_bytes (type);
8751 else
8752 bytes = GET_MODE_SIZE (mode);
8753 words = CEIL (bytes, UNITS_PER_WORD);
8755 /* To simplify the code below, represent vector types with a vector mode
8756 even if MMX/SSE are not active. */
8757 if (type && TREE_CODE (type) == VECTOR_TYPE)
8758 mode = type_natural_mode (type, cum, false);
8760 if (TARGET_64BIT)
8762 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8764 if (call_abi == MS_ABI)
8765 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8766 else
8767 arg = function_arg_64 (cum, mode, omode, type, named);
8769 else
8770 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8772 /* Track if there are outgoing arguments on stack. */
8773 if (arg == NULL_RTX && cum->caller)
8774 cfun->machine->outgoing_args_on_stack = true;
8776 return arg;
8779 /* A C expression that indicates when an argument must be passed by
8780 reference. If nonzero for an argument, a copy of that argument is
8781 made in memory and a pointer to the argument is passed instead of
8782 the argument itself. The pointer is passed in whatever way is
8783 appropriate for passing a pointer to that type. */
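/* Example (illustrative): under the Windows x64 convention handled
   below, a 12-byte struct is passed by reference because its size is
   not 1, 2, 4 or 8 bytes, whereas an 8-byte struct is passed by value
   in a register; __m128 arguments are likewise passed by reference.  */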
8785 static bool
8786 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8787 const_tree type, bool)
8789 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8791 /* Bounds are never passed by reference. */
8792 if ((type && POINTER_BOUNDS_TYPE_P (type))
8793 || POINTER_BOUNDS_MODE_P (mode))
8794 return false;
8796 if (TARGET_64BIT)
8798 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8800 /* See Windows x64 Software Convention. */
8801 if (call_abi == MS_ABI)
8803 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8805 if (type)
8807 /* Arrays are passed by reference. */
8808 if (TREE_CODE (type) == ARRAY_TYPE)
8809 return true;
8811 if (RECORD_OR_UNION_TYPE_P (type))
8813 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8814 are passed by reference. */
8815 msize = int_size_in_bytes (type);
8819 /* __m128 is passed by reference. */
8820 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8822 else if (type && int_size_in_bytes (type) == -1)
8823 return true;
8826 return false;
8829 /* Return true when TYPE should be 128bit aligned for 32bit argument
8830 passing ABI. XXX: This function is obsolete and is only used for
8831 checking psABI compatibility with previous versions of GCC. */
8833 static bool
8834 ix86_compat_aligned_value_p (const_tree type)
8836 machine_mode mode = TYPE_MODE (type);
8837 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8838 || mode == TDmode
8839 || mode == TFmode
8840 || mode == TCmode)
8841 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8842 return true;
8843 if (TYPE_ALIGN (type) < 128)
8844 return false;
8846 if (AGGREGATE_TYPE_P (type))
8848 /* Walk the aggregates recursively. */
8849 switch (TREE_CODE (type))
8851 case RECORD_TYPE:
8852 case UNION_TYPE:
8853 case QUAL_UNION_TYPE:
8855 tree field;
8857 /* Walk all the structure fields. */
8858 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8860 if (TREE_CODE (field) == FIELD_DECL
8861 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8862 return true;
8864 break;
8867 case ARRAY_TYPE:
8868 /* Just in case some languages pass arrays by value. */
8869 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8870 return true;
8871 break;
8873 default:
8874 gcc_unreachable ();
8877 return false;
8880 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8881 XXX: This function is obsolete and is only used for checking psABI
8882 compatibility with previous versions of GCC. */
8884 static unsigned int
8885 ix86_compat_function_arg_boundary (machine_mode mode,
8886 const_tree type, unsigned int align)
8888 /* In 32bit, only _Decimal128 and __float128 are aligned to their
8889 natural boundaries. */
8890 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
8892 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
8893 make an exception for SSE modes since these require 128bit
8894 alignment.
8896 The handling here differs from field_alignment. ICC aligns MMX
8897 arguments to 4 byte boundaries, while structure fields are aligned
8898 to 8 byte boundaries. */
8899 if (!type)
8901 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
8902 align = PARM_BOUNDARY;
8904 else
8906 if (!ix86_compat_aligned_value_p (type))
8907 align = PARM_BOUNDARY;
8910 if (align > BIGGEST_ALIGNMENT)
8911 align = BIGGEST_ALIGNMENT;
8912 return align;
8915 /* Return true when TYPE should be 128bit aligned for 32bit argument
8916 passing ABI. */
8918 static bool
8919 ix86_contains_aligned_value_p (const_tree type)
8921 machine_mode mode = TYPE_MODE (type);
8923 if (mode == XFmode || mode == XCmode)
8924 return false;
8926 if (TYPE_ALIGN (type) < 128)
8927 return false;
8929 if (AGGREGATE_TYPE_P (type))
8931 /* Walk the aggregates recursively. */
8932 switch (TREE_CODE (type))
8934 case RECORD_TYPE:
8935 case UNION_TYPE:
8936 case QUAL_UNION_TYPE:
8938 tree field;
8940 /* Walk all the structure fields. */
8941 for (field = TYPE_FIELDS (type);
8942 field;
8943 field = DECL_CHAIN (field))
8945 if (TREE_CODE (field) == FIELD_DECL
8946 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
8947 return true;
8949 break;
8952 case ARRAY_TYPE:
8953 /* Just in case some languages pass arrays by value. */
8954 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
8955 return true;
8956 break;
8958 default:
8959 gcc_unreachable ();
8962 else
8963 return TYPE_ALIGN (type) >= 128;
8965 return false;
8968 /* Gives the alignment boundary, in bits, of an argument with the
8969 specified mode and type. */
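/* Example (illustrative): on ia32 a plain int argument gets
   PARM_BOUNDARY (32 bits), while an __m128 argument is aligned to
   128 bits; the -Wpsabi note below flags arguments whose alignment
   differs from the pre-GCC 4.6 behaviour.  */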
8971 static unsigned int
8972 ix86_function_arg_boundary (machine_mode mode, const_tree type)
8974 unsigned int align;
8975 if (type)
8977 /* Since the main variant type is used for the call, convert the
8978 passed type to its main variant. */
8979 type = TYPE_MAIN_VARIANT (type);
8980 align = TYPE_ALIGN (type);
8981 if (TYPE_EMPTY_P (type))
8982 return PARM_BOUNDARY;
8984 else
8985 align = GET_MODE_ALIGNMENT (mode);
8986 if (align < PARM_BOUNDARY)
8987 align = PARM_BOUNDARY;
8988 else
8990 static bool warned;
8991 unsigned int saved_align = align;
8993 if (!TARGET_64BIT)
8995 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
8996 if (!type)
8998 if (mode == XFmode || mode == XCmode)
8999 align = PARM_BOUNDARY;
9001 else if (!ix86_contains_aligned_value_p (type))
9002 align = PARM_BOUNDARY;
9004 if (align < 128)
9005 align = PARM_BOUNDARY;
9008 if (warn_psabi
9009 && !warned
9010 && align != ix86_compat_function_arg_boundary (mode, type,
9011 saved_align))
9013 warned = true;
9014 inform (input_location,
9015 "The ABI for passing parameters with %d-byte"
9016 " alignment has changed in GCC 4.6",
9017 align / BITS_PER_UNIT);
9021 return align;
9024 /* Return true if N is a possible register number of function value. */
9026 static bool
9027 ix86_function_value_regno_p (const unsigned int regno)
9029 switch (regno)
9031 case AX_REG:
9032 return true;
9033 case DX_REG:
9034 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9035 case DI_REG:
9036 case SI_REG:
9037 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9039 case BND0_REG:
9040 case BND1_REG:
9041 return chkp_function_instrumented_p (current_function_decl);
9043 /* Complex values are returned in %st(0)/%st(1) pair. */
9044 case ST0_REG:
9045 case ST1_REG:
9046 /* TODO: The function should depend on current function ABI but
9047 builtins.c would need updating then. Therefore we use the
9048 default ABI. */
9049 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9050 return false;
9051 return TARGET_FLOAT_RETURNS_IN_80387;
9053 /* Complex values are returned in %xmm0/%xmm1 pair. */
9054 case XMM0_REG:
9055 case XMM1_REG:
9056 return TARGET_SSE;
9058 case MM0_REG:
9059 if (TARGET_MACHO || TARGET_64BIT)
9060 return false;
9061 return TARGET_MMX;
9064 return false;
9067 /* Define how to find the value returned by a function.
9068 VALTYPE is the data type of the value (as a tree).
9069 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9070 otherwise, FUNC is 0. */
9072 static rtx
9073 function_value_32 (machine_mode orig_mode, machine_mode mode,
9074 const_tree fntype, const_tree fn)
9076 unsigned int regno;
9078 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9079 we normally prevent this case when mmx is not available. However
9080 some ABIs may require the result to be returned like DImode. */
9081 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9082 regno = FIRST_MMX_REG;
9084 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9085 we prevent this case when sse is not available. However some ABIs
9086 may require the result to be returned like integer TImode. */
9087 else if (mode == TImode
9088 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9089 regno = FIRST_SSE_REG;
9091 /* 32-byte vector modes in %ymm0. */
9092 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9093 regno = FIRST_SSE_REG;
9095 /* 64-byte vector modes in %zmm0. */
9096 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9097 regno = FIRST_SSE_REG;
9099 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9100 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9101 regno = FIRST_FLOAT_REG;
9102 else
9103 /* Most things go in %eax. */
9104 regno = AX_REG;
9106 /* Override FP return register with %xmm0 for local functions when
9107 SSE math is enabled or for functions with sseregparm attribute. */
9108 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9110 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9111 if (sse_level == -1)
9113 error ("calling %qD with SSE calling convention without "
9114 "SSE/SSE2 enabled", fn);
9115 sorry ("this is a GCC bug that can be worked around by adding "
9116 "attribute used to function called");
9118 else if ((sse_level >= 1 && mode == SFmode)
9119 || (sse_level == 2 && mode == DFmode))
9120 regno = FIRST_SSE_REG;
9123 /* OImode shouldn't be used directly. */
9124 gcc_assert (mode != OImode);
9126 return gen_rtx_REG (orig_mode, regno);
9129 static rtx
9130 function_value_64 (machine_mode orig_mode, machine_mode mode,
9131 const_tree valtype)
9133 rtx ret;
9135 /* Handle libcalls, which don't provide a type node. */
9136 if (valtype == NULL)
9138 unsigned int regno;
9140 switch (mode)
9142 case E_SFmode:
9143 case E_SCmode:
9144 case E_DFmode:
9145 case E_DCmode:
9146 case E_TFmode:
9147 case E_SDmode:
9148 case E_DDmode:
9149 case E_TDmode:
9150 regno = FIRST_SSE_REG;
9151 break;
9152 case E_XFmode:
9153 case E_XCmode:
9154 regno = FIRST_FLOAT_REG;
9155 break;
9156 case E_TCmode:
9157 return NULL;
9158 default:
9159 regno = AX_REG;
9162 return gen_rtx_REG (mode, regno);
9164 else if (POINTER_TYPE_P (valtype))
9166 /* Pointers are always returned in word_mode. */
9167 mode = word_mode;
9170 ret = construct_container (mode, orig_mode, valtype, 1,
9171 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9172 x86_64_int_return_registers, 0);
9174 /* For zero-sized structures, construct_container returns NULL, but we
9175 need to keep the rest of the compiler happy by returning a meaningful value. */
9176 if (!ret)
9177 ret = gen_rtx_REG (orig_mode, AX_REG);
9179 return ret;
9182 static rtx
9183 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9184 const_tree valtype)
9186 unsigned int regno = AX_REG;
9188 if (TARGET_SSE)
9190 switch (GET_MODE_SIZE (mode))
9192 case 16:
9193 if (valtype != NULL_TREE
9194 && !VECTOR_INTEGER_TYPE_P (valtype)
9196 && !INTEGRAL_TYPE_P (valtype)
9197 && !VECTOR_FLOAT_TYPE_P (valtype))
9198 break;
9199 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9200 && !COMPLEX_MODE_P (mode))
9201 regno = FIRST_SSE_REG;
9202 break;
9203 case 8:
9204 case 4:
9205 if (mode == SFmode || mode == DFmode)
9206 regno = FIRST_SSE_REG;
9207 break;
9208 default:
9209 break;
9212 return gen_rtx_REG (orig_mode, regno);
9215 static rtx
9216 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9217 machine_mode orig_mode, machine_mode mode)
9219 const_tree fn, fntype;
9221 fn = NULL_TREE;
9222 if (fntype_or_decl && DECL_P (fntype_or_decl))
9223 fn = fntype_or_decl;
9224 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9226 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9227 || POINTER_BOUNDS_MODE_P (mode))
9228 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9229 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9230 return function_value_ms_64 (orig_mode, mode, valtype);
9231 else if (TARGET_64BIT)
9232 return function_value_64 (orig_mode, mode, valtype);
9233 else
9234 return function_value_32 (orig_mode, mode, fntype, fn);
9237 static rtx
9238 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9240 machine_mode mode, orig_mode;
9242 orig_mode = TYPE_MODE (valtype);
9243 mode = type_natural_mode (valtype, NULL, true);
9244 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9247 /* Return an RTX representing a place where a function returns
9248 or receives pointer bounds, or NULL if no bounds are returned.
9250 VALTYPE is a data type of a value returned by the function.
9252 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9253 or FUNCTION_TYPE of the function.
9255 If OUTGOING is false, return a place in which the caller will
9256 see the return value. Otherwise, return a place where a
9257 function returns a value. */
9259 static rtx
9260 ix86_function_value_bounds (const_tree valtype,
9261 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9262 bool outgoing ATTRIBUTE_UNUSED)
9264 rtx res = NULL_RTX;
9266 if (BOUNDED_TYPE_P (valtype))
9267 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9268 else if (chkp_type_has_pointer (valtype))
9270 bitmap slots;
9271 rtx bounds[2];
9272 bitmap_iterator bi;
9273 unsigned i, bnd_no = 0;
9275 bitmap_obstack_initialize (NULL);
9276 slots = BITMAP_ALLOC (NULL);
9277 chkp_find_bound_slots (valtype, slots);
9279 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9281 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9282 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9283 gcc_assert (bnd_no < 2);
9284 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9287 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9289 BITMAP_FREE (slots);
9290 bitmap_obstack_release (NULL);
9292 else
9293 res = NULL_RTX;
9295 return res;
9298 /* Pointer function arguments and return values are promoted to
9299 word_mode for normal functions. */
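/* Example (illustrative): with -mx32, Pmode is SImode but word_mode is
   DImode, so a pointer argument or return value of a normal function
   is zero-extended (POINTERS_EXTEND_UNSIGNED) and passed around in a
   64-bit register.  */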
9301 static machine_mode
9302 ix86_promote_function_mode (const_tree type, machine_mode mode,
9303 int *punsignedp, const_tree fntype,
9304 int for_return)
9306 if (cfun->machine->func_type == TYPE_NORMAL
9307 && type != NULL_TREE
9308 && POINTER_TYPE_P (type))
9310 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9311 return word_mode;
9313 return default_promote_function_mode (type, mode, punsignedp, fntype,
9314 for_return);
9317 /* Return true if a structure, union or array with MODE containing FIELD
9318 should be accessed using BLKmode. */
9320 static bool
9321 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9323 /* Union with XFmode must be in BLKmode. */
9324 return (mode == XFmode
9325 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9326 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9329 static rtx
9330 ix86_libcall_value (machine_mode mode)
9332 return ix86_function_value_1 (NULL, NULL, mode, mode);
9335 /* Return true iff type is returned in memory. */
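/* Example (illustrative): on ia32 a 12-byte struct has BLKmode and is
   therefore returned in memory, while on the 64-bit SysV ABI the same
   struct occupies two eightbytes and is returned in registers whenever
   examine_argument can classify it that way.  */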
9337 static bool
9338 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9340 #ifdef SUBTARGET_RETURN_IN_MEMORY
9341 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9342 #else
9343 const machine_mode mode = type_natural_mode (type, NULL, true);
9344 HOST_WIDE_INT size;
9346 if (POINTER_BOUNDS_TYPE_P (type))
9347 return false;
9349 if (TARGET_64BIT)
9351 if (ix86_function_type_abi (fntype) == MS_ABI)
9353 size = int_size_in_bytes (type);
9355 /* __m128 is returned in xmm0. */
9356 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9357 || INTEGRAL_TYPE_P (type)
9358 || VECTOR_FLOAT_TYPE_P (type))
9359 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9360 && !COMPLEX_MODE_P (mode)
9361 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9362 return false;
9364 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
9365 return size != 1 && size != 2 && size != 4 && size != 8;
9367 else
9369 int needed_intregs, needed_sseregs;
9371 return examine_argument (mode, type, 1,
9372 &needed_intregs, &needed_sseregs);
9375 else
9377 size = int_size_in_bytes (type);
9379 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9380 bytes in registers. */
9381 if (TARGET_IAMCU)
9382 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9384 if (mode == BLKmode)
9385 return true;
9387 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9388 return false;
9390 if (VECTOR_MODE_P (mode) || mode == TImode)
9392 /* User-created vectors small enough to fit in EAX. */
9393 if (size < 8)
9394 return false;
9396 /* Unless the ABI prescribes otherwise,
9397 MMX/3dNow values are returned in MM0 if available. */
9399 if (size == 8)
9400 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9402 /* SSE values are returned in XMM0 if available. */
9403 if (size == 16)
9404 return !TARGET_SSE;
9406 /* AVX values are returned in YMM0 if available. */
9407 if (size == 32)
9408 return !TARGET_AVX;
9410 /* AVX512F values are returned in ZMM0 if available. */
9411 if (size == 64)
9412 return !TARGET_AVX512F;
9415 if (mode == XFmode)
9416 return false;
9418 if (size > 12)
9419 return true;
9421 /* OImode shouldn't be used directly. */
9422 gcc_assert (mode != OImode);
9424 return false;
9426 #endif
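For reference, the 32-bit decision above can be sketched as a standalone predicate. This is a hedged illustration, not the compiler's own code path: the TARGET_* flags become plain parameters and the TARGET_VECT8_RETURNS override for 8-byte vectors is ignored.

#include <stdbool.h>

/* Minimal sketch of the 32-bit (non-MCU) return-in-memory rules above.  */
static bool
sketch_ia32_return_in_memory (long size, bool is_vector,
                              bool have_mmx, bool have_sse,
                              bool have_avx, bool have_avx512f)
{
  if (is_vector)
    {
      if (size < 8)
        return false;          /* small user vectors fit in EAX         */
      if (size == 8)
        return !have_mmx;      /* MM0 when MMX is available             */
      if (size == 16)
        return !have_sse;      /* XMM0 when SSE is available            */
      if (size == 32)
        return !have_avx;      /* YMM0 when AVX is available            */
      if (size == 64)
        return !have_avx512f;  /* ZMM0 when AVX512F is available        */
      return true;
    }
  return size > 12;            /* XFmode scalars (<= 12 bytes) stay in ST0 */
}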
9430 /* Create the va_list data type. */
9432 static tree
9433 ix86_build_builtin_va_list_64 (void)
9435 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9437 record = lang_hooks.types.make_type (RECORD_TYPE);
9438 type_decl = build_decl (BUILTINS_LOCATION,
9439 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9441 f_gpr = build_decl (BUILTINS_LOCATION,
9442 FIELD_DECL, get_identifier ("gp_offset"),
9443 unsigned_type_node);
9444 f_fpr = build_decl (BUILTINS_LOCATION,
9445 FIELD_DECL, get_identifier ("fp_offset"),
9446 unsigned_type_node);
9447 f_ovf = build_decl (BUILTINS_LOCATION,
9448 FIELD_DECL, get_identifier ("overflow_arg_area"),
9449 ptr_type_node);
9450 f_sav = build_decl (BUILTINS_LOCATION,
9451 FIELD_DECL, get_identifier ("reg_save_area"),
9452 ptr_type_node);
9454 va_list_gpr_counter_field = f_gpr;
9455 va_list_fpr_counter_field = f_fpr;
9457 DECL_FIELD_CONTEXT (f_gpr) = record;
9458 DECL_FIELD_CONTEXT (f_fpr) = record;
9459 DECL_FIELD_CONTEXT (f_ovf) = record;
9460 DECL_FIELD_CONTEXT (f_sav) = record;
9462 TYPE_STUB_DECL (record) = type_decl;
9463 TYPE_NAME (record) = type_decl;
9464 TYPE_FIELDS (record) = f_gpr;
9465 DECL_CHAIN (f_gpr) = f_fpr;
9466 DECL_CHAIN (f_fpr) = f_ovf;
9467 DECL_CHAIN (f_ovf) = f_sav;
9469 layout_type (record);
9471 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9472 NULL_TREE, TYPE_ATTRIBUTES (record));
9474 /* The correct type is an array type of one element. */
9475 return build_array_type (record, build_index_type (size_zero_node));
9478 /* Setup the builtin va_list data type and for 64-bit the additional
9479 calling convention specific va_list data types. */
9481 static tree
9482 ix86_build_builtin_va_list (void)
9484 if (TARGET_64BIT)
9486 /* Initialize ABI specific va_list builtin types.
9488 In lto1, we can encounter two va_list types:
9489 - one as a result of the type-merge across TUs, and
9490 - the one constructed here.
9491 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9492 a type identity check in canonical_va_list_type based on
9493 TYPE_MAIN_VARIANT (which we used to have) will not work.
9494 Instead, we tag each va_list_type_node with its unique attribute, and
9495 look for the attribute in the type identity check in
9496 canonical_va_list_type.
9498 Tagging sysv_va_list_type_node directly with the attribute is
9499 problematic since it's an array of one record, which will degrade into a
9500 pointer to record when used as parameter (see build_va_arg comments for
9501 an example), dropping the attribute in the process. So we tag the
9502 record instead. */
9504 /* For SYSV_ABI we use an array of one record. */
9505 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9507 /* For MS_ABI we use plain pointer to argument area. */
9508 tree char_ptr_type = build_pointer_type (char_type_node);
9509 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9510 TYPE_ATTRIBUTES (char_ptr_type));
9511 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9513 return ((ix86_abi == MS_ABI)
9514 ? ms_va_list_type_node
9515 : sysv_va_list_type_node);
9517 else
9519 /* For i386 we use plain pointer to argument area. */
9520 return build_pointer_type (char_type_node);
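As a point of reference, the 64-bit SysV record built above corresponds to the familiar user-level va_list layout from the x86-64 psABI. The declaration below only illustrates that layout; it is not emitted by this file.

/* User-level view of the va_list record constructed above.  */
typedef struct __va_list_tag
{
  unsigned int gp_offset;      /* byte offset of the next GP register slot
                                  within reg_save_area (0, 8, ..., 48)      */
  unsigned int fp_offset;      /* byte offset of the next SSE register slot
                                  within reg_save_area (48, 64, ..., 176)   */
  void *overflow_arg_area;     /* next stack-passed (overflow) argument     */
  void *reg_save_area;         /* start of the register save area           */
} __va_list_tag;

typedef __va_list_tag __builtin_va_list[1];  /* "array type of one element" */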
9524 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9526 static void
9527 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9529 rtx save_area, mem;
9530 alias_set_type set;
9531 int i, max;
9533 /* GPR size of varargs save area. */
9534 if (cfun->va_list_gpr_size)
9535 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9536 else
9537 ix86_varargs_gpr_size = 0;
9539 /* FPR size of varargs save area. We don't need it if we don't pass
9540 anything in SSE registers. */
9541 if (TARGET_SSE && cfun->va_list_fpr_size)
9542 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9543 else
9544 ix86_varargs_fpr_size = 0;
9546 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9547 return;
9549 save_area = frame_pointer_rtx;
9550 set = get_varargs_alias_set ();
9552 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9553 if (max > X86_64_REGPARM_MAX)
9554 max = X86_64_REGPARM_MAX;
9556 for (i = cum->regno; i < max; i++)
9558 mem = gen_rtx_MEM (word_mode,
9559 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9560 MEM_NOTRAP_P (mem) = 1;
9561 set_mem_alias_set (mem, set);
9562 emit_move_insn (mem,
9563 gen_rtx_REG (word_mode,
9564 x86_64_int_parameter_registers[i]));
9567 if (ix86_varargs_fpr_size)
9569 machine_mode smode;
9570 rtx_code_label *label;
9571 rtx test;
9573 /* Now emit code to save SSE registers. The AX parameter contains number
9574 of SSE parameter registers used to call this function, though all we
9575 actually check here is the zero/non-zero status. */
9577 label = gen_label_rtx ();
9578 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9579 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9580 label));
9582 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9583 we used movdqa (i.e. TImode) instead? Perhaps even better would
9584 be if we could determine the real mode of the data, via a hook
9585 into pass_stdarg. Ignore all that for now. */
9586 smode = V4SFmode;
9587 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9588 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9590 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9591 if (max > X86_64_SSE_REGPARM_MAX)
9592 max = X86_64_SSE_REGPARM_MAX;
9594 for (i = cum->sse_regno; i < max; ++i)
9596 mem = plus_constant (Pmode, save_area,
9597 i * 16 + ix86_varargs_gpr_size);
9598 mem = gen_rtx_MEM (smode, mem);
9599 MEM_NOTRAP_P (mem) = 1;
9600 set_mem_alias_set (mem, set);
9601 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9603 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9606 emit_label (label);
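The offsets used above give the varargs save area a fixed shape: GP register slot I lives at I * UNITS_PER_WORD from the start, and SSE slot I at ix86_varargs_gpr_size + I * 16. A hedged sketch of that arithmetic, assuming the usual 64-bit values (8-byte words, 6 GP and 8 SSE argument registers):

/* Illustrative offset arithmetic for the register save area laid out
   above; plain constants stand in for the real target macros.  */
enum { WORD_BYTES = 8, GP_ARG_REGS = 6, SSE_ARG_REGS = 8 };

static long gp_slot_offset (int i)  { return i * WORD_BYTES; }           /* 0 .. 40   */
static long sse_slot_offset (int i) { return GP_ARG_REGS * WORD_BYTES
                                             + i * 16; }                 /* 48 .. 160 */
static long save_area_bytes (void)  { return GP_ARG_REGS * WORD_BYTES
                                             + SSE_ARG_REGS * 16; }      /* 176 total */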
9610 static void
9611 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9613 alias_set_type set = get_varargs_alias_set ();
9614 int i;
9616 /* Reset to zero, as a sysv va_arg may have been used
9617 before. */
9618 ix86_varargs_gpr_size = 0;
9619 ix86_varargs_fpr_size = 0;
9621 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9623 rtx reg, mem;
9625 mem = gen_rtx_MEM (Pmode,
9626 plus_constant (Pmode, virtual_incoming_args_rtx,
9627 i * UNITS_PER_WORD));
9628 MEM_NOTRAP_P (mem) = 1;
9629 set_mem_alias_set (mem, set);
9631 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9632 emit_move_insn (mem, reg);
9636 static void
9637 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9638 tree type, int *, int no_rtl)
9640 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9641 CUMULATIVE_ARGS next_cum;
9642 tree fntype;
9644 /* This argument doesn't appear to be used anymore. Which is good,
9645 because the old code here didn't suppress rtl generation. */
9646 gcc_assert (!no_rtl);
9648 if (!TARGET_64BIT)
9649 return;
9651 fntype = TREE_TYPE (current_function_decl);
9653 /* For varargs, we do not want to skip the dummy va_dcl argument.
9654 For stdargs, we do want to skip the last named argument. */
9655 next_cum = *cum;
9656 if (stdarg_p (fntype))
9657 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9658 true);
9660 if (cum->call_abi == MS_ABI)
9661 setup_incoming_varargs_ms_64 (&next_cum);
9662 else
9663 setup_incoming_varargs_64 (&next_cum);
9666 static void
9667 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9668 machine_mode mode,
9669 tree type,
9670 int *pretend_size ATTRIBUTE_UNUSED,
9671 int no_rtl)
9673 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9674 CUMULATIVE_ARGS next_cum;
9675 tree fntype;
9676 rtx save_area;
9677 int bnd_reg, i, max;
9679 gcc_assert (!no_rtl);
9681 /* Do nothing if we use plain pointer to argument area. */
9682 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9683 return;
9685 fntype = TREE_TYPE (current_function_decl);
9687 /* For varargs, we do not want to skip the dummy va_dcl argument.
9688 For stdargs, we do want to skip the last named argument. */
9689 next_cum = *cum;
9690 if (stdarg_p (fntype))
9691 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9692 true);
9693 save_area = frame_pointer_rtx;
9695 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9696 if (max > X86_64_REGPARM_MAX)
9697 max = X86_64_REGPARM_MAX;
9699 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9700 if (chkp_function_instrumented_p (current_function_decl))
9701 for (i = cum->regno; i < max; i++)
9703 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9704 rtx ptr = gen_rtx_REG (Pmode,
9705 x86_64_int_parameter_registers[i]);
9706 rtx bounds;
9708 if (bnd_reg <= LAST_BND_REG)
9709 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9710 else
9712 rtx ldx_addr =
9713 plus_constant (Pmode, arg_pointer_rtx,
9714 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9715 bounds = gen_reg_rtx (BNDmode);
9716 emit_insn (BNDmode == BND64mode
9717 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9718 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9721 emit_insn (BNDmode == BND64mode
9722 ? gen_bnd64_stx (addr, ptr, bounds)
9723 : gen_bnd32_stx (addr, ptr, bounds));
9725 bnd_reg++;
9730 /* Checks if TYPE is of kind va_list char *. */
9732 static bool
9733 is_va_list_char_pointer (tree type)
9735 tree canonic;
9737 /* For 32-bit it is always true. */
9738 if (!TARGET_64BIT)
9739 return true;
9740 canonic = ix86_canonical_va_list_type (type);
9741 return (canonic == ms_va_list_type_node
9742 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9745 /* Implement va_start. */
9747 static void
9748 ix86_va_start (tree valist, rtx nextarg)
9750 HOST_WIDE_INT words, n_gpr, n_fpr;
9751 tree f_gpr, f_fpr, f_ovf, f_sav;
9752 tree gpr, fpr, ovf, sav, t;
9753 tree type;
9754 rtx ovf_rtx;
9756 if (flag_split_stack
9757 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9759 unsigned int scratch_regno;
9761 /* When we are splitting the stack, we can't refer to the stack
9762 arguments using internal_arg_pointer, because they may be on
9763 the old stack. The split stack prologue will arrange to
9764 leave a pointer to the old stack arguments in a scratch
9765 register, which we here copy to a pseudo-register. The split
9766 stack prologue can't set the pseudo-register directly because
9767 it (the prologue) runs before any registers have been saved. */
9769 scratch_regno = split_stack_prologue_scratch_regno ();
9770 if (scratch_regno != INVALID_REGNUM)
9772 rtx reg;
9773 rtx_insn *seq;
9775 reg = gen_reg_rtx (Pmode);
9776 cfun->machine->split_stack_varargs_pointer = reg;
9778 start_sequence ();
9779 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9780 seq = get_insns ();
9781 end_sequence ();
9783 push_topmost_sequence ();
9784 emit_insn_after (seq, entry_of_function ());
9785 pop_topmost_sequence ();
9789 /* Only the 64-bit target needs something special. */
9790 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9792 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9793 std_expand_builtin_va_start (valist, nextarg);
9794 else
9796 rtx va_r, next;
9798 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9799 next = expand_binop (ptr_mode, add_optab,
9800 cfun->machine->split_stack_varargs_pointer,
9801 crtl->args.arg_offset_rtx,
9802 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9803 convert_move (va_r, next, 0);
9805 /* Store zero bounds for va_list. */
9806 if (chkp_function_instrumented_p (current_function_decl))
9807 chkp_expand_bounds_reset_for_mem (valist,
9808 make_tree (TREE_TYPE (valist),
9809 next));
9812 return;
9815 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9816 f_fpr = DECL_CHAIN (f_gpr);
9817 f_ovf = DECL_CHAIN (f_fpr);
9818 f_sav = DECL_CHAIN (f_ovf);
9820 valist = build_simple_mem_ref (valist);
9821 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9822 /* The following should be folded into the MEM_REF offset. */
9823 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9824 f_gpr, NULL_TREE);
9825 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9826 f_fpr, NULL_TREE);
9827 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9828 f_ovf, NULL_TREE);
9829 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9830 f_sav, NULL_TREE);
9832 /* Count number of gp and fp argument registers used. */
9833 words = crtl->args.info.words;
9834 n_gpr = crtl->args.info.regno;
9835 n_fpr = crtl->args.info.sse_regno;
9837 if (cfun->va_list_gpr_size)
9839 type = TREE_TYPE (gpr);
9840 t = build2 (MODIFY_EXPR, type,
9841 gpr, build_int_cst (type, n_gpr * 8));
9842 TREE_SIDE_EFFECTS (t) = 1;
9843 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9846 if (TARGET_SSE && cfun->va_list_fpr_size)
9848 type = TREE_TYPE (fpr);
9849 t = build2 (MODIFY_EXPR, type, fpr,
9850 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9851 TREE_SIDE_EFFECTS (t) = 1;
9852 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9855 /* Find the overflow area. */
9856 type = TREE_TYPE (ovf);
9857 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9858 ovf_rtx = crtl->args.internal_arg_pointer;
9859 else
9860 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9861 t = make_tree (type, ovf_rtx);
9862 if (words != 0)
9863 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9865 /* Store zero bounds for overflow area pointer. */
9866 if (chkp_function_instrumented_p (current_function_decl))
9867 chkp_expand_bounds_reset_for_mem (ovf, t);
9869 t = build2 (MODIFY_EXPR, type, ovf, t);
9870 TREE_SIDE_EFFECTS (t) = 1;
9871 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9873 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9875 /* Find the register save area.
9876 The function prologue saves it right above the stack frame. */
9877 type = TREE_TYPE (sav);
9878 t = make_tree (type, frame_pointer_rtx);
9879 if (!ix86_varargs_gpr_size)
9880 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
9882 /* Store zero bounds for save area pointer. */
9883 if (chkp_function_instrumented_p (current_function_decl))
9884 chkp_expand_bounds_reset_for_mem (sav, t);
9886 t = build2 (MODIFY_EXPR, type, sav, t);
9887 TREE_SIDE_EFFECTS (t) = 1;
9888 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
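In user-level terms, the expansion above leaves the four va_list fields initialized as in the following hedged sketch (using the __va_list_tag layout sketched earlier; n_gpr and n_fpr are the register counts consumed by named arguments, and 'words' is the number of named stack words):

/* Illustrative-only equivalent of the va_start expansion above.  */
static void
sketch_va_start (__va_list_tag *ap, char *incoming_args,
                 char *reg_save_area, int n_gpr, int n_fpr, int words)
{
  ap->gp_offset = n_gpr * 8;                /* skip the named GP registers   */
  ap->fp_offset = 6 * 8 + n_fpr * 16;       /* 48 + the named SSE registers  */
  ap->overflow_arg_area = incoming_args + words * 8;
  ap->reg_save_area = reg_save_area;
}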
9892 /* Implement va_arg. */
9894 static tree
9895 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9896 gimple_seq *post_p)
9898 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
9899 tree f_gpr, f_fpr, f_ovf, f_sav;
9900 tree gpr, fpr, ovf, sav, t;
9901 int size, rsize;
9902 tree lab_false, lab_over = NULL_TREE;
9903 tree addr, t2;
9904 rtx container;
9905 int indirect_p = 0;
9906 tree ptrtype;
9907 machine_mode nat_mode;
9908 unsigned int arg_boundary;
9910 /* Only the 64-bit target needs something special. */
9911 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9912 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
9914 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9915 f_fpr = DECL_CHAIN (f_gpr);
9916 f_ovf = DECL_CHAIN (f_fpr);
9917 f_sav = DECL_CHAIN (f_ovf);
9919 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
9920 valist, f_gpr, NULL_TREE);
9922 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
9923 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
9924 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
9926 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9927 if (indirect_p)
9928 type = build_pointer_type (type);
9929 size = arg_int_size_in_bytes (type);
9930 rsize = CEIL (size, UNITS_PER_WORD);
9932 nat_mode = type_natural_mode (type, NULL, false);
9933 switch (nat_mode)
9935 case E_V8SFmode:
9936 case E_V8SImode:
9937 case E_V32QImode:
9938 case E_V16HImode:
9939 case E_V4DFmode:
9940 case E_V4DImode:
9941 case E_V16SFmode:
9942 case E_V16SImode:
9943 case E_V64QImode:
9944 case E_V32HImode:
9945 case E_V8DFmode:
9946 case E_V8DImode:
9947 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
9948 if (!TARGET_64BIT_MS_ABI)
9950 container = NULL;
9951 break;
9953 /* FALLTHRU */
9955 default:
9956 container = construct_container (nat_mode, TYPE_MODE (type),
9957 type, 0, X86_64_REGPARM_MAX,
9958 X86_64_SSE_REGPARM_MAX, intreg,
9960 break;
9963 /* Pull the value out of the saved registers. */
9965 addr = create_tmp_var (ptr_type_node, "addr");
9967 if (container)
9969 int needed_intregs, needed_sseregs;
9970 bool need_temp;
9971 tree int_addr, sse_addr;
9973 lab_false = create_artificial_label (UNKNOWN_LOCATION);
9974 lab_over = create_artificial_label (UNKNOWN_LOCATION);
9976 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
9978 need_temp = (!REG_P (container)
9979 && ((needed_intregs && TYPE_ALIGN (type) > 64)
9980 || TYPE_ALIGN (type) > 128));
9982 /* In case we are passing a structure, verify that it forms a consecutive
9983 block in the register save area. If not, we need to do moves. */
9984 if (!need_temp && !REG_P (container))
9986 /* Verify that all registers are strictly consecutive */
9987 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
9989 int i;
9991 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9993 rtx slot = XVECEXP (container, 0, i);
9994 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
9995 || INTVAL (XEXP (slot, 1)) != i * 16)
9996 need_temp = true;
9999 else
10001 int i;
10003 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10005 rtx slot = XVECEXP (container, 0, i);
10006 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
10007 || INTVAL (XEXP (slot, 1)) != i * 8)
10008 need_temp = true;
10012 if (!need_temp)
10014 int_addr = addr;
10015 sse_addr = addr;
10017 else
10019 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10020 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10023 /* First ensure that we fit completely in registers. */
10024 if (needed_intregs)
10026 t = build_int_cst (TREE_TYPE (gpr),
10027 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10028 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10029 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10030 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10031 gimplify_and_add (t, pre_p);
10033 if (needed_sseregs)
10035 t = build_int_cst (TREE_TYPE (fpr),
10036 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10037 + X86_64_REGPARM_MAX * 8);
10038 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10039 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10040 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10041 gimplify_and_add (t, pre_p);
10044 /* Compute index to start of area used for integer regs. */
10045 if (needed_intregs)
10047 /* int_addr = gpr + sav; */
10048 t = fold_build_pointer_plus (sav, gpr);
10049 gimplify_assign (int_addr, t, pre_p);
10051 if (needed_sseregs)
10053 /* sse_addr = fpr + sav; */
10054 t = fold_build_pointer_plus (sav, fpr);
10055 gimplify_assign (sse_addr, t, pre_p);
10057 if (need_temp)
10059 int i, prev_size = 0;
10060 tree temp = create_tmp_var (type, "va_arg_tmp");
10062 /* addr = &temp; */
10063 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10064 gimplify_assign (addr, t, pre_p);
10066 for (i = 0; i < XVECLEN (container, 0); i++)
10068 rtx slot = XVECEXP (container, 0, i);
10069 rtx reg = XEXP (slot, 0);
10070 machine_mode mode = GET_MODE (reg);
10071 tree piece_type;
10072 tree addr_type;
10073 tree daddr_type;
10074 tree src_addr, src;
10075 int src_offset;
10076 tree dest_addr, dest;
10077 int cur_size = GET_MODE_SIZE (mode);
10079 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10080 prev_size = INTVAL (XEXP (slot, 1));
10081 if (prev_size + cur_size > size)
10083 cur_size = size - prev_size;
10084 unsigned int nbits = cur_size * BITS_PER_UNIT;
10085 if (!int_mode_for_size (nbits, 1).exists (&mode))
10086 mode = QImode;
10088 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10089 if (mode == GET_MODE (reg))
10090 addr_type = build_pointer_type (piece_type);
10091 else
10092 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10093 true);
10094 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10095 true);
10097 if (SSE_REGNO_P (REGNO (reg)))
10099 src_addr = sse_addr;
10100 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10102 else
10104 src_addr = int_addr;
10105 src_offset = REGNO (reg) * 8;
10107 src_addr = fold_convert (addr_type, src_addr);
10108 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10110 dest_addr = fold_convert (daddr_type, addr);
10111 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10112 if (cur_size == GET_MODE_SIZE (mode))
10114 src = build_va_arg_indirect_ref (src_addr);
10115 dest = build_va_arg_indirect_ref (dest_addr);
10117 gimplify_assign (dest, src, pre_p);
10119 else
10121 tree copy
10122 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10123 3, dest_addr, src_addr,
10124 size_int (cur_size));
10125 gimplify_and_add (copy, pre_p);
10127 prev_size += cur_size;
10131 if (needed_intregs)
10133 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10134 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10135 gimplify_assign (gpr, t, pre_p);
10138 if (needed_sseregs)
10140 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10141 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10142 gimplify_assign (unshare_expr (fpr), t, pre_p);
10145 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10147 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10150 /* ... otherwise out of the overflow area. */
10152 /* When we align parameter on stack for caller, if the parameter
10153 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
10154 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee
10155 here with caller. */
10156 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10157 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10158 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10160 /* Care for on-stack alignment if needed. */
10161 if (arg_boundary <= 64 || size == 0)
10162 t = ovf;
10163 else
10165 HOST_WIDE_INT align = arg_boundary / 8;
10166 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10167 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10168 build_int_cst (TREE_TYPE (t), -align));
10171 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10172 gimplify_assign (addr, t, pre_p);
10174 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10175 gimplify_assign (unshare_expr (ovf), t, pre_p);
10177 if (container)
10178 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10180 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10181 addr = fold_convert (ptrtype, addr);
10183 if (indirect_p)
10184 addr = build_va_arg_indirect_ref (addr);
10185 return build_va_arg_indirect_ref (addr);
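For a simple case (one 8-byte integer argument), the GIMPLE produced above behaves like the following hedged C rendering: try the register save area first, fall back to the overflow area, and advance the corresponding cursor.

/* Illustrative-only rendering of the va_arg sequence generated above
   for a plain 8-byte integer argument (needs exactly one GP slot);
   uses the __va_list_tag layout sketched earlier.  */
static long
sketch_va_arg_long (__va_list_tag *ap)
{
  long val;
  if (ap->gp_offset <= (6 - 1) * 8)           /* a GP register slot remains */
    {
      val = *(long *) ((char *) ap->reg_save_area + ap->gp_offset);
      ap->gp_offset += 8;
    }
  else                                        /* pull it from the stack     */
    {
      val = *(long *) ap->overflow_arg_area;
      ap->overflow_arg_area = (char *) ap->overflow_arg_area + 8;
    }
  return val;
}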
10188 /* Return true if OPNUM's MEM should be matched
10189 in movabs* patterns. */
10191 bool
10192 ix86_check_movabs (rtx insn, int opnum)
10194 rtx set, mem;
10196 set = PATTERN (insn);
10197 if (GET_CODE (set) == PARALLEL)
10198 set = XVECEXP (set, 0, 0);
10199 gcc_assert (GET_CODE (set) == SET);
10200 mem = XEXP (set, opnum);
10201 while (SUBREG_P (mem))
10202 mem = SUBREG_REG (mem);
10203 gcc_assert (MEM_P (mem));
10204 return volatile_ok || !MEM_VOLATILE_P (mem);
10207 /* Return false if INSN contains a MEM with a non-default address space. */
10208 bool
10209 ix86_check_no_addr_space (rtx insn)
10211 subrtx_var_iterator::array_type array;
10212 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10214 rtx x = *iter;
10215 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10216 return false;
10218 return true;
10221 /* Initialize the table of extra 80387 mathematical constants. */
10223 static void
10224 init_ext_80387_constants (void)
10226 static const char * cst[5] =
10228 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10229 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10230 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10231 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10232 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10234 int i;
10236 for (i = 0; i < 5; i++)
10238 real_from_string (&ext_80387_constants_table[i], cst[i]);
10239 /* Ensure each constant is rounded to XFmode precision. */
10240 real_convert (&ext_80387_constants_table[i],
10241 XFmode, &ext_80387_constants_table[i]);
10244 ext_80387_constants_init = 1;
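The five strings above are the mathematical constants for which the x87 has dedicated load instructions. The pairing below is only an illustrative summary (values truncated to double precision); the actual opcode selection is done by standard_80387_constant_opcode further down.

/* Illustrative pairing of the table above with the x87 instructions
   that load each value.  */
static const struct { const char *insn; double approx; } x87_load_consts[5] = {
  { "fldlg2", 0.30102999566398119521 },   /* log10(2) */
  { "fldln2", 0.69314718055994530942 },   /* ln(2)    */
  { "fldl2e", 1.44269504088896340736 },   /* log2(e)  */
  { "fldl2t", 3.32192809488736234787 },   /* log2(10) */
  { "fldpi",  3.14159265358979323846 },   /* pi       */
};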
10247 /* Return non-zero if the constant is something that
10248 can be loaded with a special instruction. */
10251 standard_80387_constant_p (rtx x)
10253 machine_mode mode = GET_MODE (x);
10255 const REAL_VALUE_TYPE *r;
10257 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10258 return -1;
10260 if (x == CONST0_RTX (mode))
10261 return 1;
10262 if (x == CONST1_RTX (mode))
10263 return 2;
10265 r = CONST_DOUBLE_REAL_VALUE (x);
10267 /* For XFmode constants, try to find a special 80387 instruction when
10268 optimizing for size or on those CPUs that benefit from them. */
10269 if (mode == XFmode
10270 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10272 int i;
10274 if (! ext_80387_constants_init)
10275 init_ext_80387_constants ();
10277 for (i = 0; i < 5; i++)
10278 if (real_identical (r, &ext_80387_constants_table[i]))
10279 return i + 3;
10282 /* Load of the constant -0.0 or -1.0 will be split as
10283 fldz;fchs or fld1;fchs sequence. */
10284 if (real_isnegzero (r))
10285 return 8;
10286 if (real_identical (r, &dconstm1))
10287 return 9;
10289 return 0;
10292 /* Return the opcode of the special instruction to be used to load
10293 the constant X. */
10295 const char *
10296 standard_80387_constant_opcode (rtx x)
10298 switch (standard_80387_constant_p (x))
10300 case 1:
10301 return "fldz";
10302 case 2:
10303 return "fld1";
10304 case 3:
10305 return "fldlg2";
10306 case 4:
10307 return "fldln2";
10308 case 5:
10309 return "fldl2e";
10310 case 6:
10311 return "fldl2t";
10312 case 7:
10313 return "fldpi";
10314 case 8:
10315 case 9:
10316 return "#";
10317 default:
10318 gcc_unreachable ();
10322 /* Return the CONST_DOUBLE representing the 80387 constant that is
10323 loaded by the specified special instruction. The argument IDX
10324 matches the return value from standard_80387_constant_p. */
10327 standard_80387_constant_rtx (int idx)
10329 int i;
10331 if (! ext_80387_constants_init)
10332 init_ext_80387_constants ();
10334 switch (idx)
10336 case 3:
10337 case 4:
10338 case 5:
10339 case 6:
10340 case 7:
10341 i = idx - 3;
10342 break;
10344 default:
10345 gcc_unreachable ();
10348 return const_double_from_real_value (ext_80387_constants_table[i],
10349 XFmode);
10352 /* Return 1 if X is all zero bits and 2 if X is all one bits
10353 in a supported SSE/AVX vector mode. */
10356 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10358 machine_mode mode;
10360 if (!TARGET_SSE)
10361 return 0;
10363 mode = GET_MODE (x);
10365 if (x == const0_rtx || const0_operand (x, mode))
10366 return 1;
10368 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10370 /* VOIDmode integer constant, get mode from the predicate. */
10371 if (mode == VOIDmode)
10372 mode = pred_mode;
10374 switch (GET_MODE_SIZE (mode))
10376 case 64:
10377 if (TARGET_AVX512F)
10378 return 2;
10379 break;
10380 case 32:
10381 if (TARGET_AVX2)
10382 return 2;
10383 break;
10384 case 16:
10385 if (TARGET_SSE2)
10386 return 2;
10387 break;
10388 case 0:
10389 /* VOIDmode */
10390 gcc_unreachable ();
10391 default:
10392 break;
10396 return 0;
10399 /* Return the opcode of the special instruction to be used to load
10400 the constant operands[1] into operands[0]. */
10402 const char *
10403 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10405 machine_mode mode;
10406 rtx x = operands[1];
10408 gcc_assert (TARGET_SSE);
10410 mode = GET_MODE (x);
10412 if (x == const0_rtx || const0_operand (x, mode))
10414 switch (get_attr_mode (insn))
10416 case MODE_TI:
10417 if (!EXT_REX_SSE_REG_P (operands[0]))
10418 return "%vpxor\t%0, %d0";
10419 /* FALLTHRU */
10420 case MODE_XI:
10421 case MODE_OI:
10422 if (EXT_REX_SSE_REG_P (operands[0]))
10423 return (TARGET_AVX512VL
10424 ? "vpxord\t%x0, %x0, %x0"
10425 : "vpxord\t%g0, %g0, %g0");
10426 return "vpxor\t%x0, %x0, %x0";
10428 case MODE_V2DF:
10429 if (!EXT_REX_SSE_REG_P (operands[0]))
10430 return "%vxorpd\t%0, %d0";
10431 /* FALLTHRU */
10432 case MODE_V8DF:
10433 case MODE_V4DF:
10434 if (!EXT_REX_SSE_REG_P (operands[0]))
10435 return "vxorpd\t%x0, %x0, %x0";
10436 else if (TARGET_AVX512DQ)
10437 return (TARGET_AVX512VL
10438 ? "vxorpd\t%x0, %x0, %x0"
10439 : "vxorpd\t%g0, %g0, %g0");
10440 else
10441 return (TARGET_AVX512VL
10442 ? "vpxorq\t%x0, %x0, %x0"
10443 : "vpxorq\t%g0, %g0, %g0");
10445 case MODE_V4SF:
10446 if (!EXT_REX_SSE_REG_P (operands[0]))
10447 return "%vxorps\t%0, %d0";
10448 /* FALLTHRU */
10449 case MODE_V16SF:
10450 case MODE_V8SF:
10451 if (!EXT_REX_SSE_REG_P (operands[0]))
10452 return "vxorps\t%x0, %x0, %x0";
10453 else if (TARGET_AVX512DQ)
10454 return (TARGET_AVX512VL
10455 ? "vxorps\t%x0, %x0, %x0"
10456 : "vxorps\t%g0, %g0, %g0");
10457 else
10458 return (TARGET_AVX512VL
10459 ? "vpxord\t%x0, %x0, %x0"
10460 : "vpxord\t%g0, %g0, %g0");
10462 default:
10463 gcc_unreachable ();
10466 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10468 enum attr_mode insn_mode = get_attr_mode (insn);
10470 switch (insn_mode)
10472 case MODE_XI:
10473 case MODE_V8DF:
10474 case MODE_V16SF:
10475 gcc_assert (TARGET_AVX512F);
10476 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10478 case MODE_OI:
10479 case MODE_V4DF:
10480 case MODE_V8SF:
10481 gcc_assert (TARGET_AVX2);
10482 /* FALLTHRU */
10483 case MODE_TI:
10484 case MODE_V2DF:
10485 case MODE_V4SF:
10486 gcc_assert (TARGET_SSE2);
10487 if (!EXT_REX_SSE_REG_P (operands[0]))
10488 return (TARGET_AVX
10489 ? "vpcmpeqd\t%0, %0, %0"
10490 : "pcmpeqd\t%0, %0");
10491 else if (TARGET_AVX512VL)
10492 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10493 else
10494 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10496 default:
10497 gcc_unreachable ();
10501 gcc_unreachable ();
10504 /* Returns true if INSN can be transformed from a memory load
10505 to a supported FP constant load. */
10507 bool
10508 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10510 rtx src = find_constant_src (insn);
10512 gcc_assert (REG_P (dst));
10514 if (src == NULL
10515 || (SSE_REGNO_P (REGNO (dst))
10516 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10517 || (STACK_REGNO_P (REGNO (dst))
10518 && standard_80387_constant_p (src) < 1))
10519 return false;
10521 return true;
10524 /* Returns true if OP contains a symbol reference */
10526 bool
10527 symbolic_reference_mentioned_p (rtx op)
10529 const char *fmt;
10530 int i;
10532 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10533 return true;
10535 fmt = GET_RTX_FORMAT (GET_CODE (op));
10536 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10538 if (fmt[i] == 'E')
10540 int j;
10542 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10543 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10544 return true;
10547 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10548 return true;
10551 return false;
10554 /* Return true if it is appropriate to emit `ret' instructions in the
10555 body of a function. Do this only if the epilogue is simple, needing a
10556 couple of insns. Prior to reloading, we can't tell how many registers
10557 must be saved, so return false then. Return false if there is no frame
10558 marker to de-allocate. */
10560 bool
10561 ix86_can_use_return_insn_p (void)
10563 if (ix86_function_naked (current_function_decl))
10564 return false;
10566 /* Don't use `ret' instruction in interrupt handler. */
10567 if (! reload_completed
10568 || frame_pointer_needed
10569 || cfun->machine->func_type != TYPE_NORMAL)
10570 return 0;
10572 /* Don't allow more than 32k pop, since that's all we can do
10573 with one instruction. */
10574 if (crtl->args.pops_args && crtl->args.size >= 32768)
10575 return 0;
10577 struct ix86_frame &frame = cfun->machine->frame;
10578 return (frame.stack_pointer_offset == UNITS_PER_WORD
10579 && (frame.nregs + frame.nsseregs) == 0);
10582 /* Value should be nonzero if functions must have frame pointers.
10583 Zero means the frame pointer need not be set up (and parms may
10584 be accessed via the stack pointer) in functions that seem suitable. */
10586 static bool
10587 ix86_frame_pointer_required (void)
10589 /* If we accessed previous frames, then the generated code expects
10590 to be able to access the saved ebp value in our frame. */
10591 if (cfun->machine->accesses_prev_frame)
10592 return true;
10594 /* Several x86 os'es need a frame pointer for other reasons,
10595 usually pertaining to setjmp. */
10596 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10597 return true;
10599 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
10600 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10601 return true;
10603 /* Win64 SEH, very large frames need a frame-pointer as maximum stack
10604 allocation is 4GB. */
10605 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10606 return true;
10608 /* SSE saves require frame-pointer when stack is misaligned. */
10609 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10610 return true;
10612 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10613 turns off the frame pointer by default. Turn it back on now if
10614 we've not got a leaf function. */
10615 if (TARGET_OMIT_LEAF_FRAME_POINTER
10616 && (!crtl->is_leaf
10617 || ix86_current_function_calls_tls_descriptor))
10618 return true;
10620 if (crtl->profile && !flag_fentry)
10621 return true;
10623 return false;
10626 /* Record that the current function accesses previous call frames. */
10628 void
10629 ix86_setup_frame_addresses (void)
10631 cfun->machine->accesses_prev_frame = 1;
10634 #ifndef USE_HIDDEN_LINKONCE
10635 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10636 # define USE_HIDDEN_LINKONCE 1
10637 # else
10638 # define USE_HIDDEN_LINKONCE 0
10639 # endif
10640 #endif
10642 static int pic_labels_used;
10644 /* Fills in the label name that should be used for a pc thunk for
10645 the given register. */
10647 static void
10648 get_pc_thunk_name (char name[32], unsigned int regno)
10650 gcc_assert (!TARGET_64BIT);
10652 if (USE_HIDDEN_LINKONCE)
10653 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
10654 else
10655 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
10659 /* This function generates code for -fpic that loads %ebx with
10660 the return address of the caller and then returns. */
10662 static void
10663 ix86_code_end (void)
10665 rtx xops[2];
10666 int regno;
10668 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
10670 char name[32];
10671 tree decl;
10673 if (!(pic_labels_used & (1 << regno)))
10674 continue;
10676 get_pc_thunk_name (name, regno);
10678 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10679 get_identifier (name),
10680 build_function_type_list (void_type_node, NULL_TREE));
10681 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10682 NULL_TREE, void_type_node);
10683 TREE_PUBLIC (decl) = 1;
10684 TREE_STATIC (decl) = 1;
10685 DECL_IGNORED_P (decl) = 1;
10687 #if TARGET_MACHO
10688 if (TARGET_MACHO)
10690 switch_to_section (darwin_sections[picbase_thunk_section]);
10691 fputs ("\t.weak_definition\t", asm_out_file);
10692 assemble_name (asm_out_file, name);
10693 fputs ("\n\t.private_extern\t", asm_out_file);
10694 assemble_name (asm_out_file, name);
10695 putc ('\n', asm_out_file);
10696 ASM_OUTPUT_LABEL (asm_out_file, name);
10697 DECL_WEAK (decl) = 1;
10699 else
10700 #endif
10701 if (USE_HIDDEN_LINKONCE)
10703 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10705 targetm.asm_out.unique_section (decl, 0);
10706 switch_to_section (get_named_section (decl, NULL, 0));
10708 targetm.asm_out.globalize_label (asm_out_file, name);
10709 fputs ("\t.hidden\t", asm_out_file);
10710 assemble_name (asm_out_file, name);
10711 putc ('\n', asm_out_file);
10712 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10714 else
10716 switch_to_section (text_section);
10717 ASM_OUTPUT_LABEL (asm_out_file, name);
10720 DECL_INITIAL (decl) = make_node (BLOCK);
10721 current_function_decl = decl;
10722 allocate_struct_function (decl, false);
10723 init_function_start (decl);
10724 /* We're about to hide the function body from callees of final_* by
10725 emitting it directly; tell them we're a thunk, if they care. */
10726 cfun->is_thunk = true;
10727 first_function_block_is_cold = false;
10728 /* Make sure unwind info is emitted for the thunk if needed. */
10729 final_start_function (emit_barrier (), asm_out_file, 1);
10731 /* Pad stack IP move with 4 instructions (two NOPs count
10732 as one instruction). */
10733 if (TARGET_PAD_SHORT_FUNCTION)
10735 int i = 8;
10737 while (i--)
10738 fputs ("\tnop\n", asm_out_file);
10741 xops[0] = gen_rtx_REG (Pmode, regno);
10742 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
10743 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
10744 output_asm_insn ("%!ret", NULL);
10745 final_end_function ();
10746 init_insn_lengths ();
10747 free_after_compilation (cfun);
10748 set_cfun (NULL);
10749 current_function_decl = NULL;
10752 if (flag_split_stack)
10753 file_end_indicate_split_stack ();
10756 /* Emit code for the SET_GOT patterns. */
10758 const char *
10759 output_set_got (rtx dest, rtx label)
10761 rtx xops[3];
10763 xops[0] = dest;
10765 if (TARGET_VXWORKS_RTP && flag_pic)
10767 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
10768 xops[2] = gen_rtx_MEM (Pmode,
10769 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
10770 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
10772 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
10773 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
10774 an unadorned address. */
10775 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
10776 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
10777 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
10778 return "";
10781 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
10783 if (flag_pic)
10785 char name[32];
10786 get_pc_thunk_name (name, REGNO (dest));
10787 pic_labels_used |= 1 << REGNO (dest);
10789 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
10790 xops[2] = gen_rtx_MEM (QImode, xops[2]);
10791 output_asm_insn ("%!call\t%X2", xops);
10793 #if TARGET_MACHO
10794 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
10795 This is what will be referenced by the Mach-O PIC subsystem. */
10796 if (machopic_should_output_picbase_label () || !label)
10797 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
10799 /* When we are restoring the pic base at the site of a nonlocal label,
10800 and we decided to emit the pic base above, we will still output a
10801 local label used for calculating the correction offset (even though
10802 the offset will be 0 in that case). */
10803 if (label)
10804 targetm.asm_out.internal_label (asm_out_file, "L",
10805 CODE_LABEL_NUMBER (label));
10806 #endif
10808 else
10810 if (TARGET_MACHO)
10811 /* We don't need a pic base, we're not producing pic. */
10812 gcc_unreachable ();
10814 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
10815 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
10816 targetm.asm_out.internal_label (asm_out_file, "L",
10817 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
10820 if (!TARGET_MACHO)
10821 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
10823 return "";
10826 /* Generate an "push" pattern for input ARG. */
10828 static rtx
10829 gen_push (rtx arg)
10831 struct machine_function *m = cfun->machine;
10833 if (m->fs.cfa_reg == stack_pointer_rtx)
10834 m->fs.cfa_offset += UNITS_PER_WORD;
10835 m->fs.sp_offset += UNITS_PER_WORD;
10837 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10838 arg = gen_rtx_REG (word_mode, REGNO (arg));
10840 return gen_rtx_SET (gen_rtx_MEM (word_mode,
10841 gen_rtx_PRE_DEC (Pmode,
10842 stack_pointer_rtx)),
10843 arg);
10846 /* Generate an "pop" pattern for input ARG. */
10848 static rtx
10849 gen_pop (rtx arg)
10851 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10852 arg = gen_rtx_REG (word_mode, REGNO (arg));
10854 return gen_rtx_SET (arg,
10855 gen_rtx_MEM (word_mode,
10856 gen_rtx_POST_INC (Pmode,
10857 stack_pointer_rtx)));
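Semantically, the two patterns above are just the word-sized pre-decrement store and post-increment load through the stack pointer; a hedged C picture (8-byte words assumed):

/* Illustrative-only semantics of gen_push / gen_pop above.  */
static void sketch_push (long **sp, long value) { *--(*sp) = value; }
static long sketch_pop  (long **sp)             { return *(*sp)++; }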
10860 /* Return >= 0 if there is an unused call-clobbered register available
10861 for the entire function. */
10863 static unsigned int
10864 ix86_select_alt_pic_regnum (void)
10866 if (ix86_use_pseudo_pic_reg ())
10867 return INVALID_REGNUM;
10869 if (crtl->is_leaf
10870 && !crtl->profile
10871 && !ix86_current_function_calls_tls_descriptor)
10873 int i, drap;
10874 /* Can't use the same register for both PIC and DRAP. */
10875 if (crtl->drap_reg)
10876 drap = REGNO (crtl->drap_reg);
10877 else
10878 drap = -1;
10879 for (i = 2; i >= 0; --i)
10880 if (i != drap && !df_regs_ever_live_p (i))
10881 return i;
10884 return INVALID_REGNUM;
10887 /* Return true if REGNO is used by the epilogue. */
10889 bool
10890 ix86_epilogue_uses (int regno)
10892 /* If there are no caller-saved registers, we preserve all registers,
10893 except for MMX and x87 registers which aren't supported when saving
10894 and restoring registers. Don't explicitly save SP register since
10895 it is always preserved. */
10896 return (epilogue_completed
10897 && cfun->machine->no_caller_saved_registers
10898 && !fixed_regs[regno]
10899 && !STACK_REGNO_P (regno)
10900 && !MMX_REGNO_P (regno));
10903 /* Return nonzero if register REGNO can be used as a scratch register
10904 in peephole2. */
10906 static bool
10907 ix86_hard_regno_scratch_ok (unsigned int regno)
10909 /* If there are no caller-saved registers, we can't use any register
10910 as a scratch register after epilogue and use REGNO as scratch
10911 register only if it has been used before to avoid saving and
10912 restoring it. */
10913 return (!cfun->machine->no_caller_saved_registers
10914 || (!epilogue_completed
10915 && df_regs_ever_live_p (regno)));
10918 /* Return true if register class CL should be an additional allocno
10919 class. */
10921 static bool
10922 ix86_additional_allocno_class_p (reg_class_t cl)
10924 return cl == MOD4_SSE_REGS;
10927 /* Return TRUE if we need to save REGNO. */
10929 static bool
10930 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
10932 /* If there are no caller-saved registers, we preserve all registers,
10933 except for MMX and x87 registers which aren't supported when saving
10934 and restoring registers. Don't explicitly save SP register since
10935 it is always preserved. */
10936 if (cfun->machine->no_caller_saved_registers)
10938 /* Don't preserve registers used for function return value. */
10939 rtx reg = crtl->return_rtx;
10940 if (reg)
10942 unsigned int i = REGNO (reg);
10943 unsigned int nregs = REG_NREGS (reg);
10944 while (nregs-- > 0)
10945 if ((i + nregs) == regno)
10946 return false;
10948 reg = crtl->return_bnd;
10949 if (reg)
10951 i = REGNO (reg);
10952 nregs = REG_NREGS (reg);
10953 while (nregs-- > 0)
10954 if ((i + nregs) == regno)
10955 return false;
10959 return (df_regs_ever_live_p (regno)
10960 && !fixed_regs[regno]
10961 && !STACK_REGNO_P (regno)
10962 && !MMX_REGNO_P (regno)
10963 && (regno != HARD_FRAME_POINTER_REGNUM
10964 || !frame_pointer_needed));
10967 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
10968 && pic_offset_table_rtx)
10970 if (ix86_use_pseudo_pic_reg ())
10972 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
10973 _mcount in prologue. */
10974 if (!TARGET_64BIT && flag_pic && crtl->profile)
10975 return true;
10977 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10978 || crtl->profile
10979 || crtl->calls_eh_return
10980 || crtl->uses_const_pool
10981 || cfun->has_nonlocal_label)
10982 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
10985 if (crtl->calls_eh_return && maybe_eh_return)
10987 unsigned i;
10988 for (i = 0; ; i++)
10990 unsigned test = EH_RETURN_DATA_REGNO (i);
10991 if (test == INVALID_REGNUM)
10992 break;
10993 if (test == regno)
10994 return true;
10998 if (ignore_outlined && cfun->machine->call_ms2sysv)
11000 unsigned count = cfun->machine->call_ms2sysv_extra_regs
11001 + xlogue_layout::MIN_REGS;
11002 if (xlogue_layout::is_stub_managed_reg (regno, count))
11003 return false;
11006 if (crtl->drap_reg
11007 && regno == REGNO (crtl->drap_reg)
11008 && !cfun->machine->no_drap_save_restore)
11009 return true;
11011 return (df_regs_ever_live_p (regno)
11012 && !call_used_regs[regno]
11013 && !fixed_regs[regno]
11014 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11017 /* Return the number of saved general purpose registers. */
11019 static int
11020 ix86_nsaved_regs (void)
11022 int nregs = 0;
11023 int regno;
11025 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11026 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11027 nregs ++;
11028 return nregs;
11031 /* Return number of saved SSE registers. */
11033 static int
11034 ix86_nsaved_sseregs (void)
11036 int nregs = 0;
11037 int regno;
11039 if (!TARGET_64BIT_MS_ABI)
11040 return 0;
11041 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11042 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11043 nregs ++;
11044 return nregs;
11047 /* Given FROM and TO register numbers, say whether this elimination is
11048 allowed. If stack alignment is needed, we can only replace argument
11049 pointer with hard frame pointer, or replace frame pointer with stack
11050 pointer. Otherwise, frame pointer elimination is automatically
11051 handled and all other eliminations are valid. */
11053 static bool
11054 ix86_can_eliminate (const int from, const int to)
11056 if (stack_realign_fp)
11057 return ((from == ARG_POINTER_REGNUM
11058 && to == HARD_FRAME_POINTER_REGNUM)
11059 || (from == FRAME_POINTER_REGNUM
11060 && to == STACK_POINTER_REGNUM));
11061 else
11062 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11065 /* Return the offset between two registers, one to be eliminated, and the other
11066 its replacement, at the start of a routine. */
11068 HOST_WIDE_INT
11069 ix86_initial_elimination_offset (int from, int to)
11071 struct ix86_frame &frame = cfun->machine->frame;
11073 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11074 return frame.hard_frame_pointer_offset;
11075 else if (from == FRAME_POINTER_REGNUM
11076 && to == HARD_FRAME_POINTER_REGNUM)
11077 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11078 else
11080 gcc_assert (to == STACK_POINTER_REGNUM);
11082 if (from == ARG_POINTER_REGNUM)
11083 return frame.stack_pointer_offset;
11085 gcc_assert (from == FRAME_POINTER_REGNUM);
11086 return frame.stack_pointer_offset - frame.frame_pointer_offset;
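The valid eliminations therefore reduce to simple differences of the precomputed frame offsets; the following is a hedged summary of that arithmetic, with the frame fields as a plain struct.

/* Illustrative summary of the elimination offsets returned above.  */
struct frame_offsets_sketch
{
  long hard_frame_pointer_offset;   /* saved hard frame pointer location */
  long frame_pointer_offset;        /* soft frame pointer                */
  long stack_pointer_offset;        /* stack pointer after the prologue  */
};

static long arg_to_hard_fp (const struct frame_offsets_sketch *f)
{ return f->hard_frame_pointer_offset; }
static long fp_to_hard_fp (const struct frame_offsets_sketch *f)
{ return f->hard_frame_pointer_offset - f->frame_pointer_offset; }
static long arg_to_sp (const struct frame_offsets_sketch *f)
{ return f->stack_pointer_offset; }
static long fp_to_sp (const struct frame_offsets_sketch *f)
{ return f->stack_pointer_offset - f->frame_pointer_offset; }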
11090 /* In a dynamically-aligned function, we can't know the offset from
11091 stack pointer to frame pointer, so we must ensure that setjmp
11092 eliminates fp against the hard fp (%ebp) rather than trying to
11093 index from %esp up to the top of the frame across a gap that is
11094 of unknown (at compile-time) size. */
11095 static rtx
11096 ix86_builtin_setjmp_frame_value (void)
11098 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11101 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11102 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11104 static bool warned_once = false;
11105 if (!warned_once)
11107 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11108 feature);
11109 warned_once = true;
11113 /* When using -fsplit-stack, the allocation routines set a field in
11114 the TCB to the bottom of the stack plus this much space, measured
11115 in bytes. */
11117 #define SPLIT_STACK_AVAILABLE 256
11119 /* Fill structure ix86_frame about frame of currently computed function. */
11121 static void
11122 ix86_compute_frame_layout (void)
11124 struct ix86_frame *frame = &cfun->machine->frame;
11125 struct machine_function *m = cfun->machine;
11126 unsigned HOST_WIDE_INT stack_alignment_needed;
11127 HOST_WIDE_INT offset;
11128 unsigned HOST_WIDE_INT preferred_alignment;
11129 HOST_WIDE_INT size = get_frame_size ();
11130 HOST_WIDE_INT to_allocate;
11132 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11133 * ms_abi functions that call a sysv function. We now need to prune away
11134 * cases where it should be disabled. */
11135 if (TARGET_64BIT && m->call_ms2sysv)
11137 gcc_assert (TARGET_64BIT_MS_ABI);
11138 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11139 gcc_assert (!TARGET_SEH);
11140 gcc_assert (TARGET_SSE);
11141 gcc_assert (!ix86_using_red_zone ());
11143 if (crtl->calls_eh_return)
11145 gcc_assert (!reload_completed);
11146 m->call_ms2sysv = false;
11147 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11150 else if (ix86_static_chain_on_stack)
11152 gcc_assert (!reload_completed);
11153 m->call_ms2sysv = false;
11154 warn_once_call_ms2sysv_xlogues ("static call chains");
11157 /* Finally, compute which registers the stub will manage. */
11158 else
11160 unsigned count = xlogue_layout::count_stub_managed_regs ();
11161 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11162 m->call_ms2sysv_pad_in = 0;
11166 frame->nregs = ix86_nsaved_regs ();
11167 frame->nsseregs = ix86_nsaved_sseregs ();
11169 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11170 except for function prologues, leaf functions, and when the default
11171 incoming stack boundary is overridden at the command line or via the
11172 force_align_arg_pointer attribute. */
11173 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11174 && (!crtl->is_leaf || cfun->calls_alloca != 0
11175 || ix86_current_function_calls_tls_descriptor
11176 || ix86_incoming_stack_boundary < 128))
11178 crtl->preferred_stack_boundary = 128;
11179 crtl->stack_alignment_needed = 128;
11182 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11183 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11185 gcc_assert (!size || stack_alignment_needed);
11186 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11187 gcc_assert (preferred_alignment <= stack_alignment_needed);
11189 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11190 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11191 if (TARGET_64BIT && m->call_ms2sysv)
11193 gcc_assert (stack_alignment_needed >= 16);
11194 gcc_assert (!frame->nsseregs);
11197 /* For SEH we have to limit the amount of code movement into the prologue.
11198 At present we do this via a BLOCKAGE, at which point there's very little
11199 scheduling that can be done, which means that there's very little point
11200 in doing anything except PUSHs. */
11201 if (TARGET_SEH)
11202 m->use_fast_prologue_epilogue = false;
11203 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11205 int count = frame->nregs;
11206 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11208 /* The fast prologue uses move instead of push to save registers. This
11209 is significantly longer, but also executes faster as modern hardware
11210 can execute the moves in parallel, but can't do that for push/pop.
11212 Be careful about choosing what prologue to emit: When function takes
11213 many instructions to execute we may use slow version as well as in
11214 case function is known to be outside hot spot (this is known with
11215 feedback only). Weight the size of function by number of registers
11216 to save as it is cheap to use one or two push instructions but very
11217 slow to use many of them. */
11218 if (count)
11219 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11220 if (node->frequency < NODE_FREQUENCY_NORMAL
11221 || (flag_branch_probabilities
11222 && node->frequency < NODE_FREQUENCY_HOT))
11223 m->use_fast_prologue_epilogue = false;
11224 else
11225 m->use_fast_prologue_epilogue
11226 = !expensive_function_p (count);
11229 frame->save_regs_using_mov
11230 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11231 /* If static stack checking is enabled and done with probes,
11232 the registers need to be saved before allocating the frame. */
11233 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11235 /* Skip return address and error code in exception handler. */
11236 offset = INCOMING_FRAME_SP_OFFSET;
11238 /* Skip pushed static chain. */
11239 if (ix86_static_chain_on_stack)
11240 offset += UNITS_PER_WORD;
11242 /* Skip saved base pointer. */
11243 if (frame_pointer_needed)
11244 offset += UNITS_PER_WORD;
11245 frame->hfp_save_offset = offset;
11247 /* The traditional frame pointer location is at the top of the frame. */
11248 frame->hard_frame_pointer_offset = offset;
11250 /* Register save area */
11251 offset += frame->nregs * UNITS_PER_WORD;
11252 frame->reg_save_offset = offset;
11254 /* On SEH target, registers are pushed just before the frame pointer
11255 location. */
11256 if (TARGET_SEH)
11257 frame->hard_frame_pointer_offset = offset;
11259 /* Calculate the size of the va-arg area (not including padding, if any). */
11260 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11262 /* Also adjust stack_realign_offset for the largest alignment of
11263 stack slot actually used. */
11264 if (stack_realign_fp
11265 || (cfun->machine->max_used_stack_alignment != 0
11266 && (offset % cfun->machine->max_used_stack_alignment) != 0))
11268 /* We may need a 16-byte aligned stack for the remainder of the
11269 register save area, but the stack frame for the local function
11270 may require a greater alignment if using AVX/2/512. In order
11271 to avoid wasting space, we first calculate the space needed for
11272 the rest of the register saves, add that to the stack pointer,
11273 and then realign the stack to the boundary of the start of the
11274 frame for the local function. */
11275 HOST_WIDE_INT space_needed = 0;
11276 HOST_WIDE_INT sse_reg_space_needed = 0;
11278 if (TARGET_64BIT)
11280 if (m->call_ms2sysv)
11282 m->call_ms2sysv_pad_in = 0;
11283 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11286 else if (frame->nsseregs)
11287 /* The only ABI that has saved SSE registers (Win64) also has a
11288 16-byte aligned default stack. However, many programs violate
11289 the ABI, and Wine64 forces stack realignment to compensate. */
11290 space_needed = frame->nsseregs * 16;
11292 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11294 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11295 we round anyway, to be pedantic. */
11296 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11298 else
11299 space_needed = frame->va_arg_size;
11301 /* Record the allocation size required prior to the realignment AND. */
11302 frame->stack_realign_allocate = space_needed;
11304 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11305 before this point are not directly comparable with values below
11306 this point. Use sp_valid_at to determine if the stack pointer is
11307 valid for a given offset, fp_valid_at for the frame pointer, or
11308 choose_baseaddr to have a base register chosen for you.
11310 Note that the result of (frame->stack_realign_offset
11311 & (stack_alignment_needed - 1)) may not equal zero. */
11312 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11313 frame->stack_realign_offset = offset - space_needed;
11314 frame->sse_reg_save_offset = frame->stack_realign_offset
11315 + sse_reg_space_needed;
11317 else
11319 frame->stack_realign_offset = offset;
11321 if (TARGET_64BIT && m->call_ms2sysv)
11323 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11324 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11327 /* Align and set SSE register save area. */
11328 else if (frame->nsseregs)
11330 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11331 required and the DRAP re-alignment boundary is at least 16 bytes,
11332 then we want the SSE register save area properly aligned. */
11333 if (ix86_incoming_stack_boundary >= 128
11334 || (stack_realign_drap && stack_alignment_needed >= 16))
11335 offset = ROUND_UP (offset, 16);
11336 offset += frame->nsseregs * 16;
11338 frame->sse_reg_save_offset = offset;
11339 offset += frame->va_arg_size;
11342 /* Align start of frame for local function. When a function call
11343 is removed, the function may become a leaf function. But if arguments may
11344 be passed on the stack, we need to align the stack when there is no
11345 tail call. */
11346 if (m->call_ms2sysv
11347 || frame->va_arg_size != 0
11348 || size != 0
11349 || !crtl->is_leaf
11350 || (!crtl->tail_call_emit
11351 && cfun->machine->outgoing_args_on_stack)
11352 || cfun->calls_alloca
11353 || ix86_current_function_calls_tls_descriptor)
11354 offset = ROUND_UP (offset, stack_alignment_needed);
11356 /* Frame pointer points here. */
11357 frame->frame_pointer_offset = offset;
11359 offset += size;
11361 /* Add outgoing arguments area. Can be skipped if we eliminated
11362 all the function calls as dead code.
11363 Skipping is however impossible when the function calls alloca. The alloca
11364 expander assumes that the last crtl->outgoing_args_size bytes
11365 of the stack frame are unused. */
11366 if (ACCUMULATE_OUTGOING_ARGS
11367 && (!crtl->is_leaf || cfun->calls_alloca
11368 || ix86_current_function_calls_tls_descriptor))
11370 offset += crtl->outgoing_args_size;
11371 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11373 else
11374 frame->outgoing_arguments_size = 0;
11376 /* Align stack boundary. Only needed if we're calling another function
11377 or using alloca. */
11378 if (!crtl->is_leaf || cfun->calls_alloca
11379 || ix86_current_function_calls_tls_descriptor)
11380 offset = ROUND_UP (offset, preferred_alignment);
11382 /* We've reached end of stack frame. */
11383 frame->stack_pointer_offset = offset;
11385 /* Size prologue needs to allocate. */
11386 to_allocate = offset - frame->sse_reg_save_offset;
11388 if ((!to_allocate && frame->nregs <= 1)
11389 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
11390 frame->save_regs_using_mov = false;
11392 if (ix86_using_red_zone ()
11393 && crtl->sp_is_unchanging
11394 && crtl->is_leaf
11395 && !ix86_pc_thunk_call_expanded
11396 && !ix86_current_function_calls_tls_descriptor)
11398 frame->red_zone_size = to_allocate;
11399 if (frame->save_regs_using_mov)
11400 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11401 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11402 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11404 else
11405 frame->red_zone_size = 0;
11406 frame->stack_pointer_offset -= frame->red_zone_size;
11408 /* The SEH frame pointer location is near the bottom of the frame.
11409 This is enforced by the fact that the difference between the
11410 stack pointer and the frame pointer is limited to 240 bytes in
11411 the unwind data structure. */
11412 if (TARGET_SEH)
11414 HOST_WIDE_INT diff;
11416 /* If we can leave the frame pointer where it is, do so. Also, returns
11417 the establisher frame for __builtin_frame_address (0). */
11418 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11419 if (diff <= SEH_MAX_FRAME_SIZE
11420 && (diff > 240 || (diff & 15) != 0)
11421 && !crtl->accesses_prior_frames)
11423 /* Ideally we'd determine what portion of the local stack frame
11424 (within the constraint of the lowest 240) is most heavily used.
11425 But without that complication, simply bias the frame pointer
11426 by 128 bytes so as to maximize the amount of the local stack
11427 frame that is addressable with 8-bit offsets. */
11428 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
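/* For illustration only -- a sketch inferred from the computation above, not
   authoritative: for a typical 64-bit function without stack realignment the
   offsets grow downward from the CFA roughly as

       return address                     (UNITS_PER_WORD)
       saved frame pointer, if needed  -> hfp_save_offset / hard_frame_pointer_offset
       integer register save area      -> reg_save_offset
       SSE register save area          -> sse_reg_save_offset
       va_arg register save area       -> + va_arg_size
       local variables (SIZE)          -> frame_pointer_offset, then + size
       outgoing argument area          -> outgoing_arguments_size
       end of frame                    -> stack_pointer_offset (less red_zone_size)  */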
11433 /* This is semi-inlined memory_address_length, but simplified
11434 since we know that we're always dealing with reg+offset, and
11435 to avoid having to create and discard all that rtl. */
11437 static inline int
11438 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11440 int len = 4;
11442 if (offset == 0)
11444 /* EBP and R13 cannot be encoded without an offset. */
11445 len = (regno == BP_REG || regno == R13_REG);
11447 else if (IN_RANGE (offset, -128, 127))
11448 len = 1;
11450 /* ESP and R12 must be encoded with a SIB byte. */
11451 if (regno == SP_REG || regno == R12_REG)
11452 len++;
11454 return len;
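/* For illustration only -- a sketch, not part of the original sources: the
   rules above yield the following extra encoding bytes for some sample
   base register / offset pairs:

     choose_baseaddr_len (BX_REG, 0)    == 0   (%ebx) needs no disp and no SIB
     choose_baseaddr_len (BP_REG, 0)    == 1   0(%ebp) still needs a disp8
     choose_baseaddr_len (SP_REG, 8)    == 2   disp8 plus the mandatory SIB byte
     choose_baseaddr_len (R12_REG, 512) == 5   disp32 plus the mandatory SIB byte  */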
11457 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11458 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11460 static bool
11461 sp_valid_at (HOST_WIDE_INT cfa_offset)
11463 const struct machine_frame_state &fs = cfun->machine->fs;
11464 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11466 /* Validate that the cfa_offset isn't in a "no-man's land". */
11467 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11468 return false;
11470 return fs.sp_valid;
11473 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11474 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11476 static inline bool
11477 fp_valid_at (HOST_WIDE_INT cfa_offset)
11479 const struct machine_frame_state &fs = cfun->machine->fs;
11480 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11482 /* Validate that the cfa_offset isn't in a "no-man's land". */
11483 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11484 return false;
11486 return fs.fp_valid;
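/* For illustration only -- a sketch assuming fs.sp_valid and fs.fp_valid are
   both set: with a realigned stack, sp_realigned_offset and
   sp_realigned_fp_last split the save area into three regions.  For example,
   with sp_realigned_offset == 16 and sp_realigned_fp_last == 48:

     cfa_offset  8 -> sp_valid_at false, fp_valid_at true   (use the frame pointer)
     cfa_offset 32 -> sp_valid_at true,  fp_valid_at true   (either register works)
     cfa_offset 64 -> sp_valid_at true,  fp_valid_at false  (use the stack pointer)  */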
11489 /* Choose a base register based upon alignment requested, speed and/or
11490 size. */
11492 static void
11493 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11494 HOST_WIDE_INT &base_offset,
11495 unsigned int align_reqested, unsigned int *align)
11497 const struct machine_function *m = cfun->machine;
11498 unsigned int hfp_align;
11499 unsigned int drap_align;
11500 unsigned int sp_align;
11501 bool hfp_ok = fp_valid_at (cfa_offset);
11502 bool drap_ok = m->fs.drap_valid;
11503 bool sp_ok = sp_valid_at (cfa_offset);
11505 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11507 /* Filter out any registers that don't meet the requested alignment
11508 criteria. */
11509 if (align_reqested)
11511 if (m->fs.realigned)
11512 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11513 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11514 notes (which we would need in order to use a realigned stack pointer),
11515 so disable on SEH targets. */
11516 else if (m->fs.sp_realigned)
11517 sp_align = crtl->stack_alignment_needed;
11519 hfp_ok = hfp_ok && hfp_align >= align_reqested;
11520 drap_ok = drap_ok && drap_align >= align_reqested;
11521 sp_ok = sp_ok && sp_align >= align_reqested;
11524 if (m->use_fast_prologue_epilogue)
11526 /* Choose the base register most likely to allow the most scheduling
11527 opportunities. Generally FP is valid throughout the function,
11528 while DRAP must be reloaded within the epilogue. But choose either
11529 over the SP due to increased encoding size. */
11531 if (hfp_ok)
11533 base_reg = hard_frame_pointer_rtx;
11534 base_offset = m->fs.fp_offset - cfa_offset;
11536 else if (drap_ok)
11538 base_reg = crtl->drap_reg;
11539 base_offset = 0 - cfa_offset;
11541 else if (sp_ok)
11543 base_reg = stack_pointer_rtx;
11544 base_offset = m->fs.sp_offset - cfa_offset;
11547 else
11549 HOST_WIDE_INT toffset;
11550 int len = 16, tlen;
11552 /* Choose the base register with the smallest address encoding.
11553 With a tie, choose FP > DRAP > SP. */
11554 if (sp_ok)
11556 base_reg = stack_pointer_rtx;
11557 base_offset = m->fs.sp_offset - cfa_offset;
11558 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
11560 if (drap_ok)
11562 toffset = 0 - cfa_offset;
11563 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
11564 if (tlen <= len)
11566 base_reg = crtl->drap_reg;
11567 base_offset = toffset;
11568 len = tlen;
11571 if (hfp_ok)
11573 toffset = m->fs.fp_offset - cfa_offset;
11574 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
11575 if (tlen <= len)
11577 base_reg = hard_frame_pointer_rtx;
11578 base_offset = toffset;
11579 len = tlen;
11584 /* Set the align return value. */
11585 if (align)
11587 if (base_reg == stack_pointer_rtx)
11588 *align = sp_align;
11589 else if (base_reg == crtl->drap_reg)
11590 *align = drap_align;
11591 else if (base_reg == hard_frame_pointer_rtx)
11592 *align = hfp_align;
11596 /* Return an RTX that points to CFA_OFFSET within the stack frame and
11597 the alignment of address. If ALIGN is non-null, it should point to
11598 an alignment value (in bits) that is preferred or zero and will
11599 receive the alignment of the base register that was selected,
11600 irrespective of whether or not CFA_OFFSET is a multiple of that
11601 alignment value. If it is possible for the base register offset to be
11602 non-immediate then SCRATCH_REGNO should specify a scratch register to
11603 use.
11605 The valid base registers are taken from CFUN->MACHINE->FS. */
11607 static rtx
11608 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
11609 unsigned int scratch_regno = INVALID_REGNUM)
11611 rtx base_reg = NULL;
11612 HOST_WIDE_INT base_offset = 0;
11614 /* If a specific alignment is requested, try to get a base register
11615 with that alignment first. */
11616 if (align && *align)
11617 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
11619 if (!base_reg)
11620 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
11622 gcc_assert (base_reg != NULL);
11624 rtx base_offset_rtx = GEN_INT (base_offset);
11626 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
11628 gcc_assert (scratch_regno != INVALID_REGNUM);
11630 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11631 emit_move_insn (scratch_reg, base_offset_rtx);
11633 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
11636 return plus_constant (Pmode, base_reg, base_offset);
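/* For illustration only -- a sketch modeled on the ms2sysv save code later in
   this file (cfa_offset stands for whatever frame offset the caller needs):
   requesting a 16-byte aligned base, with %rax available as the scratch
   register in case the offset does not fit in an immediate:

     unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
     rtx addr = choose_baseaddr (cfa_offset, &align, AX_REG);  */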
11639 /* Emit code to save registers in the prologue. */
11641 static void
11642 ix86_emit_save_regs (void)
11644 unsigned int regno;
11645 rtx_insn *insn;
11647 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
11648 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11650 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
11651 RTX_FRAME_RELATED_P (insn) = 1;
11655 /* Emit a single register save at CFA - CFA_OFFSET. */
11657 static void
11658 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
11659 HOST_WIDE_INT cfa_offset)
11661 struct machine_function *m = cfun->machine;
11662 rtx reg = gen_rtx_REG (mode, regno);
11663 rtx mem, addr, base, insn;
11664 unsigned int align = GET_MODE_ALIGNMENT (mode);
11666 addr = choose_baseaddr (cfa_offset, &align);
11667 mem = gen_frame_mem (mode, addr);
11669 /* The location alignment depends upon the base register. */
11670 align = MIN (GET_MODE_ALIGNMENT (mode), align);
11671 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
11672 set_mem_align (mem, align);
11674 insn = emit_insn (gen_rtx_SET (mem, reg));
11675 RTX_FRAME_RELATED_P (insn) = 1;
11677 base = addr;
11678 if (GET_CODE (base) == PLUS)
11679 base = XEXP (base, 0);
11680 gcc_checking_assert (REG_P (base));
11682 /* When saving registers into a re-aligned local stack frame, avoid
11683 any tricky guessing by dwarf2out. */
11684 if (m->fs.realigned)
11686 gcc_checking_assert (stack_realign_drap);
11688 if (regno == REGNO (crtl->drap_reg))
11690 /* A bit of a hack. We force the DRAP register to be saved in
11691 the re-aligned stack frame, which provides us with a copy
11692 of the CFA that will last past the prologue. Install it. */
11693 gcc_checking_assert (cfun->machine->fs.fp_valid);
11694 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11695 cfun->machine->fs.fp_offset - cfa_offset);
11696 mem = gen_rtx_MEM (mode, addr);
11697 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
11699 else
11701 /* The frame pointer is a stable reference within the
11702 aligned frame. Use it. */
11703 gcc_checking_assert (cfun->machine->fs.fp_valid);
11704 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11705 cfun->machine->fs.fp_offset - cfa_offset);
11706 mem = gen_rtx_MEM (mode, addr);
11707 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11711 else if (base == stack_pointer_rtx && m->fs.sp_realigned
11712 && cfa_offset >= m->fs.sp_realigned_offset)
11714 gcc_checking_assert (stack_realign_fp);
11715 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11718 /* The memory may not be relative to the current CFA register,
11719 which means that we may need to generate a new pattern for
11720 use by the unwind info. */
11721 else if (base != m->fs.cfa_reg)
11723 addr = plus_constant (Pmode, m->fs.cfa_reg,
11724 m->fs.cfa_offset - cfa_offset);
11725 mem = gen_rtx_MEM (mode, addr);
11726 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
11730 /* Emit code to save registers using MOV insns.
11731 First register is stored at CFA - CFA_OFFSET. */
11732 static void
11733 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
11735 unsigned int regno;
11737 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11738 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11740 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
11741 cfa_offset -= UNITS_PER_WORD;
11745 /* Emit code to save SSE registers using MOV insns.
11746 First register is stored at CFA - CFA_OFFSET. */
11747 static void
11748 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
11750 unsigned int regno;
11752 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11753 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11755 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
11756 cfa_offset -= GET_MODE_SIZE (V4SFmode);
11760 static GTY(()) rtx queued_cfa_restores;
11762 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
11763 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
11764 Don't add the note if the previously saved value will be left untouched
11765 within stack red-zone till return, as unwinders can find the same value
11766 in the register and on the stack. */
11768 static void
11769 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
11771 if (!crtl->shrink_wrapped
11772 && cfa_offset <= cfun->machine->fs.red_zone_offset)
11773 return;
11775 if (insn)
11777 add_reg_note (insn, REG_CFA_RESTORE, reg);
11778 RTX_FRAME_RELATED_P (insn) = 1;
11780 else
11781 queued_cfa_restores
11782 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
11785 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
11787 static void
11788 ix86_add_queued_cfa_restore_notes (rtx insn)
11790 rtx last;
11791 if (!queued_cfa_restores)
11792 return;
11793 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
11795 XEXP (last, 1) = REG_NOTES (insn);
11796 REG_NOTES (insn) = queued_cfa_restores;
11797 queued_cfa_restores = NULL_RTX;
11798 RTX_FRAME_RELATED_P (insn) = 1;
11801 /* Expand prologue or epilogue stack adjustment.
11802 The pattern exists to put a dependency on all ebp-based memory accesses.
11803 STYLE should be negative if instructions should be marked as frame related,
11804 zero if the %r11 register is live and cannot be freely used and positive
11805 otherwise. */
11807 static rtx
11808 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
11809 int style, bool set_cfa)
11811 struct machine_function *m = cfun->machine;
11812 rtx insn;
11813 bool add_frame_related_expr = false;
11815 if (Pmode == SImode)
11816 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
11817 else if (x86_64_immediate_operand (offset, DImode))
11818 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
11819 else
11821 rtx tmp;
11822 /* r11 is used by indirect sibcall return as well, set before the
11823 epilogue and used after the epilogue. */
11824 if (style)
11825 tmp = gen_rtx_REG (DImode, R11_REG);
11826 else
11828 gcc_assert (src != hard_frame_pointer_rtx
11829 && dest != hard_frame_pointer_rtx);
11830 tmp = hard_frame_pointer_rtx;
11832 insn = emit_insn (gen_rtx_SET (tmp, offset));
11833 if (style < 0)
11834 add_frame_related_expr = true;
11836 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
11839 insn = emit_insn (insn);
11840 if (style >= 0)
11841 ix86_add_queued_cfa_restore_notes (insn);
11843 if (set_cfa)
11845 rtx r;
11847 gcc_assert (m->fs.cfa_reg == src);
11848 m->fs.cfa_offset += INTVAL (offset);
11849 m->fs.cfa_reg = dest;
11851 r = gen_rtx_PLUS (Pmode, src, offset);
11852 r = gen_rtx_SET (dest, r);
11853 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
11854 RTX_FRAME_RELATED_P (insn) = 1;
11856 else if (style < 0)
11858 RTX_FRAME_RELATED_P (insn) = 1;
11859 if (add_frame_related_expr)
11861 rtx r = gen_rtx_PLUS (Pmode, src, offset);
11862 r = gen_rtx_SET (dest, r);
11863 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
11867 if (dest == stack_pointer_rtx)
11869 HOST_WIDE_INT ooffset = m->fs.sp_offset;
11870 bool valid = m->fs.sp_valid;
11871 bool realigned = m->fs.sp_realigned;
11873 if (src == hard_frame_pointer_rtx)
11875 valid = m->fs.fp_valid;
11876 realigned = false;
11877 ooffset = m->fs.fp_offset;
11879 else if (src == crtl->drap_reg)
11881 valid = m->fs.drap_valid;
11882 realigned = false;
11883 ooffset = 0;
11885 else
11887 /* Else there are two possibilities: SP itself, which we set
11888 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
11889 taken care of by hand along the eh_return path. */
11890 gcc_checking_assert (src == stack_pointer_rtx
11891 || offset == const0_rtx);
11894 m->fs.sp_offset = ooffset - INTVAL (offset);
11895 m->fs.sp_valid = valid;
11896 m->fs.sp_realigned = realigned;
11898 return insn;
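/* For illustration only -- a sketch mirroring the prologue calls later in
   this file (M is the cfun->machine pointer as in the surrounding code):
   allocating 16 bytes, marking the insn frame-related (STYLE == -1) and
   updating the CFA only while the stack pointer is still the CFA register:

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				GEN_INT (-16), -1,
				m->fs.cfa_reg == stack_pointer_rtx);  */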
11901 /* Find an available register to be used as the dynamic realign argument
11902 pointer register. Such a register will be written in the prologue and
11903 used at the beginning of the body, so it must not be
11904 1. a parameter-passing register.
11905 2. the GOT pointer.
11906 We reuse the static-chain register if it is available. Otherwise, we
11907 use DI for i386 and R13 for x86-64. We chose R13 since it has a
11908 longer encoding.
11910 Return: the regno of chosen register. */
11912 static unsigned int
11913 find_drap_reg (void)
11915 tree decl = cfun->decl;
11917 /* Always use callee-saved register if there are no caller-saved
11918 registers. */
11919 if (TARGET_64BIT)
11921 /* Use R13 for a nested function or a function that needs a static chain.
11922 Since a function with a tail call may use any caller-saved
11923 register in the epilogue, DRAP must not use a caller-saved
11924 register in that case. */
11925 if (DECL_STATIC_CHAIN (decl)
11926 || cfun->machine->no_caller_saved_registers
11927 || crtl->tail_call_emit)
11928 return R13_REG;
11930 return R10_REG;
11932 else
11934 /* Use DI for a nested function or a function that needs a static chain.
11935 Since a function with a tail call may use any caller-saved
11936 register in the epilogue, DRAP must not use a caller-saved
11937 register in that case. */
11938 if (DECL_STATIC_CHAIN (decl)
11939 || cfun->machine->no_caller_saved_registers
11940 || crtl->tail_call_emit)
11941 return DI_REG;
11943 /* Reuse static chain register if it isn't used for parameter
11944 passing. */
11945 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
11947 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
11948 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
11949 return CX_REG;
11951 return DI_REG;
11955 /* Handle a "force_align_arg_pointer" attribute. */
11957 static tree
11958 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
11959 tree, int, bool *no_add_attrs)
11961 if (TREE_CODE (*node) != FUNCTION_TYPE
11962 && TREE_CODE (*node) != METHOD_TYPE
11963 && TREE_CODE (*node) != FIELD_DECL
11964 && TREE_CODE (*node) != TYPE_DECL)
11966 warning (OPT_Wattributes, "%qE attribute only applies to functions",
11967 name);
11968 *no_add_attrs = true;
11971 return NULL_TREE;
11974 /* Return minimum incoming stack alignment. */
11976 static unsigned int
11977 ix86_minimum_incoming_stack_boundary (bool sibcall)
11979 unsigned int incoming_stack_boundary;
11981 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
11982 if (cfun->machine->func_type != TYPE_NORMAL)
11983 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
11984 /* Prefer the one specified at command line. */
11985 else if (ix86_user_incoming_stack_boundary)
11986 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
11987 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
11988 if -mstackrealign is used, this isn't a sibcall check, and the
11989 estimated stack alignment is 128 bits. */
11990 else if (!sibcall
11991 && ix86_force_align_arg_pointer
11992 && crtl->stack_alignment_estimated == 128)
11993 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11994 else
11995 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
11997 /* Incoming stack alignment can be changed on individual functions
11998 via force_align_arg_pointer attribute. We use the smallest
11999 incoming stack boundary. */
12000 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12001 && lookup_attribute (ix86_force_align_arg_pointer_string,
12002 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12003 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12005 /* The incoming stack frame has to be aligned at least at
12006 parm_stack_boundary. */
12007 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12008 incoming_stack_boundary = crtl->parm_stack_boundary;
12010 /* Stack at entrance of main is aligned by runtime. We use the
12011 smallest incoming stack boundary. */
12012 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12013 && DECL_NAME (current_function_decl)
12014 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12015 && DECL_FILE_SCOPE_P (current_function_decl))
12016 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12018 return incoming_stack_boundary;
12021 /* Update incoming stack boundary and estimated stack alignment. */
12023 static void
12024 ix86_update_stack_boundary (void)
12026 ix86_incoming_stack_boundary
12027 = ix86_minimum_incoming_stack_boundary (false);
12029 /* x86_64 vararg needs 16byte stack alignment for register save
12030 area. */
12031 if (TARGET_64BIT
12032 && cfun->stdarg
12033 && crtl->stack_alignment_estimated < 128)
12034 crtl->stack_alignment_estimated = 128;
12036 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12037 if (ix86_tls_descriptor_calls_expanded_in_cfun
12038 && crtl->preferred_stack_boundary < 128)
12039 crtl->preferred_stack_boundary = 128;
12042 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12043 needed or an rtx for DRAP otherwise. */
12045 static rtx
12046 ix86_get_drap_rtx (void)
12048 /* We must use DRAP if there are outgoing arguments on stack and
12049 ACCUMULATE_OUTGOING_ARGS is false. */
12050 if (ix86_force_drap
12051 || (cfun->machine->outgoing_args_on_stack
12052 && !ACCUMULATE_OUTGOING_ARGS))
12053 crtl->need_drap = true;
12055 if (stack_realign_drap)
12057 /* Assign DRAP to vDRAP and return vDRAP. */
12058 unsigned int regno = find_drap_reg ();
12059 rtx drap_vreg;
12060 rtx arg_ptr;
12061 rtx_insn *seq, *insn;
12063 arg_ptr = gen_rtx_REG (Pmode, regno);
12064 crtl->drap_reg = arg_ptr;
12066 start_sequence ();
12067 drap_vreg = copy_to_reg (arg_ptr);
12068 seq = get_insns ();
12069 end_sequence ();
12071 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12072 if (!optimize)
12074 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12075 RTX_FRAME_RELATED_P (insn) = 1;
12077 return drap_vreg;
12079 else
12080 return NULL;
12083 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12085 static rtx
12086 ix86_internal_arg_pointer (void)
12088 return virtual_incoming_args_rtx;
12091 struct scratch_reg {
12092 rtx reg;
12093 bool saved;
12096 /* Return a short-lived scratch register for use on function entry.
12097 In 32-bit mode, it is valid only after the registers are saved
12098 in the prologue. This register must be released by means of
12099 release_scratch_register_on_entry once it is dead. */
12101 static void
12102 get_scratch_register_on_entry (struct scratch_reg *sr)
12104 int regno;
12106 sr->saved = false;
12108 if (TARGET_64BIT)
12110 /* We always use R11 in 64-bit mode. */
12111 regno = R11_REG;
12113 else
12115 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12116 bool fastcall_p
12117 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12118 bool thiscall_p
12119 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12120 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12121 int regparm = ix86_function_regparm (fntype, decl);
12122 int drap_regno
12123 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12125 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12126 for the static chain register. */
12127 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12128 && drap_regno != AX_REG)
12129 regno = AX_REG;
12130 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12131 for the static chain register. */
12132 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12133 regno = AX_REG;
12134 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12135 regno = DX_REG;
12136 /* ecx is the static chain register. */
12137 else if (regparm < 3 && !fastcall_p && !thiscall_p
12138 && !static_chain_p
12139 && drap_regno != CX_REG)
12140 regno = CX_REG;
12141 else if (ix86_save_reg (BX_REG, true, false))
12142 regno = BX_REG;
12143 /* esi is the static chain register. */
12144 else if (!(regparm == 3 && static_chain_p)
12145 && ix86_save_reg (SI_REG, true, false))
12146 regno = SI_REG;
12147 else if (ix86_save_reg (DI_REG, true, false))
12148 regno = DI_REG;
12149 else
12151 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12152 sr->saved = true;
12156 sr->reg = gen_rtx_REG (Pmode, regno);
12157 if (sr->saved)
12159 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12160 RTX_FRAME_RELATED_P (insn) = 1;
12164 /* Release a scratch register obtained from the preceding function. */
12166 static void
12167 release_scratch_register_on_entry (struct scratch_reg *sr)
12169 if (sr->saved)
12171 struct machine_function *m = cfun->machine;
12172 rtx x, insn = emit_insn (gen_pop (sr->reg));
12174 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12175 RTX_FRAME_RELATED_P (insn) = 1;
12176 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12177 x = gen_rtx_SET (stack_pointer_rtx, x);
12178 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12179 m->fs.sp_offset -= UNITS_PER_WORD;
12183 /* Return the probing interval for -fstack-clash-protection or -fstack-check. */
12185 static HOST_WIDE_INT
12186 get_probe_interval (void)
12188 if (flag_stack_clash_protection)
12189 return (HOST_WIDE_INT_1U
12190 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
12191 else
12192 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
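/* For orientation only -- an editor's sketch; the values below are the usual
   defaults and may be configured differently:
   PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL and
   STACK_CHECK_PROBE_INTERVAL_EXP both default to 12, so this normally
   returns a 4096-byte (one page) probing interval in either mode.  */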
12195 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12197 This differs from the next routine in that it tries hard to prevent
12198 attacks that jump the stack guard. Thus it is never allowed to allocate
12199 more than PROBE_INTERVAL bytes of stack space without a suitable
12200 probe. */
12202 static void
12203 ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
12205 struct machine_function *m = cfun->machine;
12207 /* If this function does not statically allocate stack space, then
12208 no probes are needed. */
12209 if (!size)
12211 /* However, the allocation of space via pushes for register
12212 saves could be viewed as allocating space, but without the
12213 need to probe. */
12214 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12215 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12216 else
12217 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12218 return;
12221 /* If we are a noreturn function, then we have to consider the
12222 possibility that we're called via a jump rather than a call.
12224 Thus we don't have the implicit probe generated by saving the
12225 return address into the stack at the call. Thus, the stack
12226 pointer could be anywhere in the guard page. The safe thing
12227 to do is emit a probe now.
12229 The probe can be avoided if we have already emitted any callee
12230 register saves into the stack or have a frame pointer (which will
12231 have been saved as well). Those saves will function as implicit
12232 probes.
12234 ?!? This should be revamped to work like aarch64 and s390 where
12235 we track the offset from the most recent probe. Normally that
12236 offset would be zero. For a noreturn function we would reset
12237 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12238 we just probe when we cross PROBE_INTERVAL. */
12239 if (TREE_THIS_VOLATILE (cfun->decl)
12240 && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
12242 /* We can safely use any register here since we're just going to push
12243 its value and immediately pop it back. But we do try and avoid
12244 argument passing registers so as not to introduce dependencies in
12245 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12246 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12247 rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
12248 rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
12249 m->fs.sp_offset -= UNITS_PER_WORD;
12250 if (m->fs.cfa_reg == stack_pointer_rtx)
12252 m->fs.cfa_offset -= UNITS_PER_WORD;
12253 rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
12254 x = gen_rtx_SET (stack_pointer_rtx, x);
12255 add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
12256 RTX_FRAME_RELATED_P (insn_push) = 1;
12257 x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
12258 x = gen_rtx_SET (stack_pointer_rtx, x);
12259 add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
12260 RTX_FRAME_RELATED_P (insn_pop) = 1;
12262 emit_insn (gen_blockage ());
12265 /* If we allocate less than the size of the guard statically,
12266 then no probing is necessary, but we do need to allocate
12267 the stack. */
12268 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12270 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12271 GEN_INT (-size), -1,
12272 m->fs.cfa_reg == stack_pointer_rtx);
12273 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12274 return;
12277 /* We're allocating a large enough stack frame that we need to
12278 emit probes. Either emit them inline or in a loop depending
12279 on the size. */
12280 HOST_WIDE_INT probe_interval = get_probe_interval ();
12281 if (size <= 4 * probe_interval)
12283 HOST_WIDE_INT i;
12284 for (i = probe_interval; i <= size; i += probe_interval)
12286 /* Allocate PROBE_INTERVAL bytes. */
12287 rtx insn
12288 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12289 GEN_INT (-probe_interval), -1,
12290 m->fs.cfa_reg == stack_pointer_rtx);
12291 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12293 /* And probe at *sp. */
12294 emit_stack_probe (stack_pointer_rtx);
12295 emit_insn (gen_blockage ());
12298 /* We need to allocate space for the residual, but we do not need
12299 to probe the residual. */
12300 HOST_WIDE_INT residual = (i - probe_interval - size);
12301 if (residual)
12302 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12303 GEN_INT (residual), -1,
12304 m->fs.cfa_reg == stack_pointer_rtx);
12305 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12307 else
12309 struct scratch_reg sr;
12310 get_scratch_register_on_entry (&sr);
12312 /* Step 1: round SIZE down to a multiple of the interval. */
12313 HOST_WIDE_INT rounded_size = size & -probe_interval;
12315 /* Step 2: compute final value of the loop counter. Use lea if
12316 possible. */
12317 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12318 rtx insn;
12319 if (address_no_seg_operand (addr, Pmode))
12320 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12321 else
12323 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12324 insn = emit_insn (gen_rtx_SET (sr.reg,
12325 gen_rtx_PLUS (Pmode, sr.reg,
12326 stack_pointer_rtx)));
12328 if (m->fs.cfa_reg == stack_pointer_rtx)
12330 add_reg_note (insn, REG_CFA_DEF_CFA,
12331 plus_constant (Pmode, sr.reg,
12332 m->fs.cfa_offset + rounded_size));
12333 RTX_FRAME_RELATED_P (insn) = 1;
12336 /* Step 3: the loop. */
12337 rtx size_rtx = GEN_INT (rounded_size);
12338 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12339 size_rtx));
12340 if (m->fs.cfa_reg == stack_pointer_rtx)
12342 m->fs.cfa_offset += rounded_size;
12343 add_reg_note (insn, REG_CFA_DEF_CFA,
12344 plus_constant (Pmode, stack_pointer_rtx,
12345 m->fs.cfa_offset));
12346 RTX_FRAME_RELATED_P (insn) = 1;
12348 m->fs.sp_offset += rounded_size;
12349 emit_insn (gen_blockage ());
12351 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12352 is equal to ROUNDED_SIZE. */
12354 if (size != rounded_size)
12355 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12356 GEN_INT (rounded_size - size), -1,
12357 m->fs.cfa_reg == stack_pointer_rtx);
12358 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12360 release_scratch_register_on_entry (&sr);
12363 /* Make sure nothing is scheduled before we are done. */
12364 emit_insn (gen_blockage ());
12367 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
12369 static void
12370 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
12372 /* We skip the probe for the first interval + a small dope of 4 words and
12373 probe that many bytes past the specified size to maintain a protection
12374 area at the bottom of the stack. */
12375 const int dope = 4 * UNITS_PER_WORD;
12376 rtx size_rtx = GEN_INT (size), last;
12378 /* See if we have a constant small number of probes to generate. If so,
12379 that's the easy case. The run-time loop is made up of 9 insns in the
12380 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12381 for n # of intervals. */
12382 if (size <= 4 * get_probe_interval ())
12384 HOST_WIDE_INT i, adjust;
12385 bool first_probe = true;
12387 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12388 values of N from 1 until it exceeds SIZE. If only one probe is
12389 needed, this will not generate any code. Then adjust and probe
12390 to PROBE_INTERVAL + SIZE. */
12391 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12393 if (first_probe)
12395 adjust = 2 * get_probe_interval () + dope;
12396 first_probe = false;
12398 else
12399 adjust = get_probe_interval ();
12401 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12402 plus_constant (Pmode, stack_pointer_rtx,
12403 -adjust)));
12404 emit_stack_probe (stack_pointer_rtx);
12407 if (first_probe)
12408 adjust = size + get_probe_interval () + dope;
12409 else
12410 adjust = size + get_probe_interval () - i;
12412 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12413 plus_constant (Pmode, stack_pointer_rtx,
12414 -adjust)));
12415 emit_stack_probe (stack_pointer_rtx);
12417 /* Adjust back to account for the additional first interval. */
12418 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12419 plus_constant (Pmode, stack_pointer_rtx,
12420 (get_probe_interval ()
12421 + dope))));
12424 /* Otherwise, do the same as above, but in a loop. Note that we must be
12425 extra careful with variables wrapping around because we might be at
12426 the very top (or the very bottom) of the address space and we have
12427 to be able to handle this case properly; in particular, we use an
12428 equality test for the loop condition. */
12429 else
12431 HOST_WIDE_INT rounded_size;
12432 struct scratch_reg sr;
12434 get_scratch_register_on_entry (&sr);
12437 /* Step 1: round SIZE to the previous multiple of the interval. */
12439 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12442 /* Step 2: compute initial and final value of the loop counter. */
12444 /* SP = SP_0 + PROBE_INTERVAL. */
12445 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12446 plus_constant (Pmode, stack_pointer_rtx,
12447 - (get_probe_interval () + dope))));
12449 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12450 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12451 emit_insn (gen_rtx_SET (sr.reg,
12452 plus_constant (Pmode, stack_pointer_rtx,
12453 -rounded_size)));
12454 else
12456 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12457 emit_insn (gen_rtx_SET (sr.reg,
12458 gen_rtx_PLUS (Pmode, sr.reg,
12459 stack_pointer_rtx)));
12463 /* Step 3: the loop
12467 SP = SP + PROBE_INTERVAL
12468 probe at SP
12470 while (SP != LAST_ADDR)
12472 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12473 values of N from 1 until it is equal to ROUNDED_SIZE. */
12475 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12478 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12479 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12481 if (size != rounded_size)
12483 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12484 plus_constant (Pmode, stack_pointer_rtx,
12485 rounded_size - size)));
12486 emit_stack_probe (stack_pointer_rtx);
12489 /* Adjust back to account for the additional first interval. */
12490 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12491 plus_constant (Pmode, stack_pointer_rtx,
12492 (get_probe_interval ()
12493 + dope))));
12495 release_scratch_register_on_entry (&sr);
12498 /* Even if the stack pointer isn't the CFA register, we need to correctly
12499 describe the adjustments made to it, in particular differentiate the
12500 frame-related ones from the frame-unrelated ones. */
12501 if (size > 0)
12503 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12504 XVECEXP (expr, 0, 0)
12505 = gen_rtx_SET (stack_pointer_rtx,
12506 plus_constant (Pmode, stack_pointer_rtx, -size));
12507 XVECEXP (expr, 0, 1)
12508 = gen_rtx_SET (stack_pointer_rtx,
12509 plus_constant (Pmode, stack_pointer_rtx,
12510 get_probe_interval () + dope + size));
12511 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12512 RTX_FRAME_RELATED_P (last) = 1;
12514 cfun->machine->fs.sp_offset += size;
12517 /* Make sure nothing is scheduled before we are done. */
12518 emit_insn (gen_blockage ());
12521 /* Adjust the stack pointer up to REG while probing it. */
12523 const char *
12524 output_adjust_stack_and_probe (rtx reg)
12526 static int labelno = 0;
12527 char loop_lab[32];
12528 rtx xops[2];
12530 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12532 /* Loop. */
12533 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12535 /* SP = SP + PROBE_INTERVAL. */
12536 xops[0] = stack_pointer_rtx;
12537 xops[1] = GEN_INT (get_probe_interval ());
12538 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12540 /* Probe at SP. */
12541 xops[1] = const0_rtx;
12542 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
12544 /* Test if SP == LAST_ADDR. */
12545 xops[0] = stack_pointer_rtx;
12546 xops[1] = reg;
12547 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12549 /* Branch. */
12550 fputs ("\tjne\t", asm_out_file);
12551 assemble_name_raw (asm_out_file, loop_lab);
12552 fputc ('\n', asm_out_file);
12554 return "";
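/* For illustration only -- a sketch of typical 64-bit AT&T output from the
   routine above, assuming the default 4096-byte interval and %r11 as the
   scratch register; not part of the original sources:

   .LPSRL0:
	subq	$4096, %rsp
	orq	$0, (%rsp)
	cmpq	%r11, %rsp
	jne	.LPSRL0   */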
12557 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
12558 inclusive. These are offsets from the current stack pointer. */
12560 static void
12561 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
12563 /* See if we have a constant small number of probes to generate. If so,
12564 that's the easy case. The run-time loop is made up of 6 insns in the
12565 generic case while the compile-time loop is made up of n insns for n #
12566 of intervals. */
12567 if (size <= 6 * get_probe_interval ())
12569 HOST_WIDE_INT i;
12571 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
12572 it exceeds SIZE. If only one probe is needed, this will not
12573 generate any code. Then probe at FIRST + SIZE. */
12574 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12575 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12576 -(first + i)));
12578 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12579 -(first + size)));
12582 /* Otherwise, do the same as above, but in a loop. Note that we must be
12583 extra careful with variables wrapping around because we might be at
12584 the very top (or the very bottom) of the address space and we have
12585 to be able to handle this case properly; in particular, we use an
12586 equality test for the loop condition. */
12587 else
12589 HOST_WIDE_INT rounded_size, last;
12590 struct scratch_reg sr;
12592 get_scratch_register_on_entry (&sr);
12595 /* Step 1: round SIZE to the previous multiple of the interval. */
12597 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12600 /* Step 2: compute initial and final value of the loop counter. */
12602 /* TEST_OFFSET = FIRST. */
12603 emit_move_insn (sr.reg, GEN_INT (-first));
12605 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
12606 last = first + rounded_size;
12609 /* Step 3: the loop
12613 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
12614 probe at TEST_ADDR
12616 while (TEST_ADDR != LAST_ADDR)
12618 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
12619 until it is equal to ROUNDED_SIZE. */
12621 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
12624 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
12625 that SIZE is equal to ROUNDED_SIZE. */
12627 if (size != rounded_size)
12628 emit_stack_probe (plus_constant (Pmode,
12629 gen_rtx_PLUS (Pmode,
12630 stack_pointer_rtx,
12631 sr.reg),
12632 rounded_size - size));
12634 release_scratch_register_on_entry (&sr);
12637 /* Make sure nothing is scheduled before we are done. */
12638 emit_insn (gen_blockage ());
12641 /* Probe a range of stack addresses from REG to END, inclusive. These are
12642 offsets from the current stack pointer. */
12644 const char *
12645 output_probe_stack_range (rtx reg, rtx end)
12647 static int labelno = 0;
12648 char loop_lab[32];
12649 rtx xops[3];
12651 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12653 /* Loop. */
12654 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12656 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
12657 xops[0] = reg;
12658 xops[1] = GEN_INT (get_probe_interval ());
12659 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12661 /* Probe at TEST_ADDR. */
12662 xops[0] = stack_pointer_rtx;
12663 xops[1] = reg;
12664 xops[2] = const0_rtx;
12665 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
12667 /* Test if TEST_ADDR == LAST_ADDR. */
12668 xops[0] = reg;
12669 xops[1] = end;
12670 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12672 /* Branch. */
12673 fputs ("\tjne\t", asm_out_file);
12674 assemble_name_raw (asm_out_file, loop_lab);
12675 fputc ('\n', asm_out_file);
12677 return "";
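/* For illustration only -- a sketch of typical 64-bit AT&T output from the
   routine above, assuming a 4096-byte interval, %r11 as the scratch register
   and an immediate LAST_ADDR; not part of the original sources:

   .LPSRL1:
	subq	$4096, %r11
	orq	$0, (%rsp,%r11)
	cmpq	$-8192, %r11
	jne	.LPSRL1   */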
12680 /* Return true if stack frame is required. Update STACK_ALIGNMENT
12681 to the largest alignment, in bits, of stack slot used if stack
12682 frame is required and CHECK_STACK_SLOT is true. */
12684 static bool
12685 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
12686 bool check_stack_slot)
12688 HARD_REG_SET set_up_by_prologue, prologue_used;
12689 basic_block bb;
12691 CLEAR_HARD_REG_SET (prologue_used);
12692 CLEAR_HARD_REG_SET (set_up_by_prologue);
12693 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
12694 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
12695 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
12696 HARD_FRAME_POINTER_REGNUM);
12698 /* The preferred stack alignment is the minimum stack alignment. */
12699 if (stack_alignment > crtl->preferred_stack_boundary)
12700 stack_alignment = crtl->preferred_stack_boundary;
12702 bool require_stack_frame = false;
12704 FOR_EACH_BB_FN (bb, cfun)
12706 rtx_insn *insn;
12707 FOR_BB_INSNS (bb, insn)
12708 if (NONDEBUG_INSN_P (insn)
12709 && requires_stack_frame_p (insn, prologue_used,
12710 set_up_by_prologue))
12712 require_stack_frame = true;
12714 if (check_stack_slot)
12716 /* Find the maximum stack alignment. */
12717 subrtx_iterator::array_type array;
12718 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
12719 if (MEM_P (*iter)
12720 && (reg_mentioned_p (stack_pointer_rtx,
12721 *iter)
12722 || reg_mentioned_p (frame_pointer_rtx,
12723 *iter)))
12725 unsigned int alignment = MEM_ALIGN (*iter);
12726 if (alignment > stack_alignment)
12727 stack_alignment = alignment;
12733 return require_stack_frame;
12736 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
12737 will guide prologue/epilogue to be generated in correct form. */
12739 static void
12740 ix86_finalize_stack_frame_flags (void)
12742 /* Check if stack realignment is really needed after reload, and
12743 store the result in cfun. */
12744 unsigned int incoming_stack_boundary
12745 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
12746 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
12747 unsigned int stack_alignment
12748 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
12749 ? crtl->max_used_stack_slot_alignment
12750 : crtl->stack_alignment_needed);
12751 unsigned int stack_realign
12752 = (incoming_stack_boundary < stack_alignment);
12753 bool recompute_frame_layout_p = false;
12755 if (crtl->stack_realign_finalized)
12757 /* After stack_realign_needed is finalized, we can no longer
12758 change it. */
12759 gcc_assert (crtl->stack_realign_needed == stack_realign);
12760 return;
12763 /* If the only reason for frame_pointer_needed is that we conservatively
12764 assumed stack realignment might be needed or -fno-omit-frame-pointer
12765 is used, but in the end nothing that needed the stack alignment had
12766 been spilled and there was no stack access, clear frame_pointer_needed and say we
12767 don't need stack realignment. */
12768 if ((stack_realign || !flag_omit_frame_pointer)
12769 && frame_pointer_needed
12770 && crtl->is_leaf
12771 && crtl->sp_is_unchanging
12772 && !ix86_current_function_calls_tls_descriptor
12773 && !crtl->accesses_prior_frames
12774 && !cfun->calls_alloca
12775 && !crtl->calls_eh_return
12776 /* See ira_setup_eliminable_regset for the rationale. */
12777 && !(STACK_CHECK_MOVING_SP
12778 && flag_stack_check
12779 && flag_exceptions
12780 && cfun->can_throw_non_call_exceptions)
12781 && !ix86_frame_pointer_required ()
12782 && get_frame_size () == 0
12783 && ix86_nsaved_sseregs () == 0
12784 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
12786 if (ix86_find_max_used_stack_alignment (stack_alignment,
12787 stack_realign))
12789 /* Stack frame is required. If stack alignment needed is less
12790 than incoming stack boundary, don't realign stack. */
12791 stack_realign = incoming_stack_boundary < stack_alignment;
12792 if (!stack_realign)
12794 crtl->max_used_stack_slot_alignment
12795 = incoming_stack_boundary;
12796 crtl->stack_alignment_needed
12797 = incoming_stack_boundary;
12798 /* Also update preferred_stack_boundary for leaf
12799 functions. */
12800 crtl->preferred_stack_boundary
12801 = incoming_stack_boundary;
12804 else
12806 /* If drap has been set, but it actually isn't live at the
12807 start of the function, there is no reason to set it up. */
12808 if (crtl->drap_reg)
12810 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12811 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
12812 REGNO (crtl->drap_reg)))
12814 crtl->drap_reg = NULL_RTX;
12815 crtl->need_drap = false;
12818 else
12819 cfun->machine->no_drap_save_restore = true;
12821 frame_pointer_needed = false;
12822 stack_realign = false;
12823 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
12824 crtl->stack_alignment_needed = incoming_stack_boundary;
12825 crtl->stack_alignment_estimated = incoming_stack_boundary;
12826 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
12827 crtl->preferred_stack_boundary = incoming_stack_boundary;
12828 df_finish_pass (true);
12829 df_scan_alloc (NULL);
12830 df_scan_blocks ();
12831 df_compute_regs_ever_live (true);
12832 df_analyze ();
12834 if (flag_var_tracking)
12836 /* Since frame pointer is no longer available, replace it with
12837 stack pointer - UNITS_PER_WORD in debug insns. */
12838 df_ref ref, next;
12839 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
12840 ref; ref = next)
12842 next = DF_REF_NEXT_REG (ref);
12843 if (!DF_REF_INSN_INFO (ref))
12844 continue;
12846 /* Make sure the next ref is for a different instruction,
12847 so that we're not affected by the rescan. */
12848 rtx_insn *insn = DF_REF_INSN (ref);
12849 while (next && DF_REF_INSN (next) == insn)
12850 next = DF_REF_NEXT_REG (next);
12852 if (DEBUG_INSN_P (insn))
12854 bool changed = false;
12855 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
12857 rtx *loc = DF_REF_LOC (ref);
12858 if (*loc == hard_frame_pointer_rtx)
12860 *loc = plus_constant (Pmode,
12861 stack_pointer_rtx,
12862 -UNITS_PER_WORD);
12863 changed = true;
12866 if (changed)
12867 df_insn_rescan (insn);
12872 recompute_frame_layout_p = true;
12875 else if (crtl->max_used_stack_slot_alignment
12876 > crtl->preferred_stack_boundary)
12878 /* We don't need to realign stack. But we still need to keep
12879 stack frame properly aligned to satisfy the largest alignment
12880 of stack slots. */
12881 if (ix86_find_max_used_stack_alignment (stack_alignment, true))
12882 cfun->machine->max_used_stack_alignment
12883 = stack_alignment / BITS_PER_UNIT;
12886 if (crtl->stack_realign_needed != stack_realign)
12887 recompute_frame_layout_p = true;
12888 crtl->stack_realign_needed = stack_realign;
12889 crtl->stack_realign_finalized = true;
12890 if (recompute_frame_layout_p)
12891 ix86_compute_frame_layout ();
12894 /* Delete SET_GOT right after entry block if it is allocated to reg. */
12896 static void
12897 ix86_elim_entry_set_got (rtx reg)
12899 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12900 rtx_insn *c_insn = BB_HEAD (bb);
12901 if (!NONDEBUG_INSN_P (c_insn))
12902 c_insn = next_nonnote_nondebug_insn (c_insn);
12903 if (c_insn && NONJUMP_INSN_P (c_insn))
12905 rtx pat = PATTERN (c_insn);
12906 if (GET_CODE (pat) == PARALLEL)
12908 rtx vec = XVECEXP (pat, 0, 0);
12909 if (GET_CODE (vec) == SET
12910 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
12911 && REGNO (XEXP (vec, 0)) == REGNO (reg))
12912 delete_insn (c_insn);
12917 static rtx
12918 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
12920 rtx addr, mem;
12922 if (offset)
12923 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
12924 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
12925 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
12928 static inline rtx
12929 gen_frame_load (rtx reg, rtx frame_reg, int offset)
12931 return gen_frame_set (reg, frame_reg, offset, false);
12934 static inline rtx
12935 gen_frame_store (rtx reg, rtx frame_reg, int offset)
12937 return gen_frame_set (reg, frame_reg, offset, true);
12940 static void
12941 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
12943 struct machine_function *m = cfun->machine;
12944 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
12945 + m->call_ms2sysv_extra_regs;
12946 rtvec v = rtvec_alloc (ncregs + 1);
12947 unsigned int align, i, vi = 0;
12948 rtx_insn *insn;
12949 rtx sym, addr;
12950 rtx rax = gen_rtx_REG (word_mode, AX_REG);
12951 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12953 /* AL should only be live with sysv_abi. */
12954 gcc_assert (!ix86_eax_live_at_start_p ());
12955 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
12957 /* Set up RAX as the stub's base pointer. We use stack_realign_offset whether
12958 or not we've actually realigned the stack. */
12959 align = GET_MODE_ALIGNMENT (V4SFmode);
12960 addr = choose_baseaddr (frame.stack_realign_offset
12961 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
12962 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
12964 emit_insn (gen_rtx_SET (rax, addr));
12966 /* Get the stub symbol. */
12967 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
12968 : XLOGUE_STUB_SAVE);
12969 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
12971 for (i = 0; i < ncregs; ++i)
12973 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
12974 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
12975 r.regno);
12976 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
12979 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
12981 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
12982 RTX_FRAME_RELATED_P (insn) = true;
12985 /* Expand the prologue into a bunch of separate insns. */
12987 void
12988 ix86_expand_prologue (void)
12990 struct machine_function *m = cfun->machine;
12991 rtx insn, t;
12992 struct ix86_frame frame;
12993 HOST_WIDE_INT allocate;
12994 bool int_registers_saved;
12995 bool sse_registers_saved;
12996 bool save_stub_call_needed;
12997 rtx static_chain = NULL_RTX;
12999 if (ix86_function_naked (current_function_decl))
13000 return;
13002 ix86_finalize_stack_frame_flags ();
13004 /* DRAP should not coexist with stack_realign_fp */
13005 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13007 memset (&m->fs, 0, sizeof (m->fs));
13009 /* Initialize CFA state for before the prologue. */
13010 m->fs.cfa_reg = stack_pointer_rtx;
13011 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13013 /* Track SP offset to the CFA. We continue tracking this after we've
13014 swapped the CFA register away from SP. In the case of re-alignment
13015 this is fudged; we're interested in offsets within the local frame. */
13016 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13017 m->fs.sp_valid = true;
13018 m->fs.sp_realigned = false;
13020 frame = m->frame;
13022 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13024 /* We should have already generated an error for any use of
13025 ms_hook on a nested function. */
13026 gcc_checking_assert (!ix86_static_chain_on_stack);
13028 /* Check if profiling is active and we shall use the profiling before
13029 prologue variant. If so, sorry. */
13030 if (crtl->profile && flag_fentry != 0)
13031 sorry ("ms_hook_prologue attribute isn%'t compatible "
13032 "with -mfentry for 32-bit");
13034 /* In ix86_asm_output_function_label we emitted:
13035 8b ff movl.s %edi,%edi
13036 55 push %ebp
13037 8b ec movl.s %esp,%ebp
13039 This matches the hookable function prologue in Win32 API
13040 functions in Microsoft Windows XP Service Pack 2 and newer.
13041 Wine uses this to enable Windows apps to hook the Win32 API
13042 functions provided by Wine.
13044 What that means is that we've already set up the frame pointer. */
13046 if (frame_pointer_needed
13047 && !(crtl->drap_reg && crtl->stack_realign_needed))
13049 rtx push, mov;
13051 /* We've decided to use the frame pointer already set up.
13052 Describe this to the unwinder by pretending that both
13053 push and mov insns happen right here.
13055 Putting the unwind info here at the end of the ms_hook
13056 is done so that we can make absolutely certain we get
13057 the required byte sequence at the start of the function,
13058 rather than relying on an assembler that can produce
13059 the exact encoding required.
13061 However it does mean (in the unpatched case) that we have
13062 a 1 insn window where the asynchronous unwind info is
13063 incorrect. However, if we placed the unwind info at
13064 its correct location we would have incorrect unwind info
13065 in the patched case. Which is probably all moot since
13066 I don't expect Wine generates dwarf2 unwind info for the
13067 system libraries that use this feature. */
13069 insn = emit_insn (gen_blockage ());
13071 push = gen_push (hard_frame_pointer_rtx);
13072 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13073 stack_pointer_rtx);
13074 RTX_FRAME_RELATED_P (push) = 1;
13075 RTX_FRAME_RELATED_P (mov) = 1;
13077 RTX_FRAME_RELATED_P (insn) = 1;
13078 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13079 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13081 /* Note that gen_push incremented m->fs.cfa_offset, even
13082 though we didn't emit the push insn here. */
13083 m->fs.cfa_reg = hard_frame_pointer_rtx;
13084 m->fs.fp_offset = m->fs.cfa_offset;
13085 m->fs.fp_valid = true;
13087 else
13089 /* The frame pointer is not needed so pop %ebp again.
13090 This leaves us with a pristine state. */
13091 emit_insn (gen_pop (hard_frame_pointer_rtx));
13095 /* The first insn of a function that accepts its static chain on the
13096 stack is to push the register that would be filled in by a direct
13097 call. This insn will be skipped by the trampoline. */
13098 else if (ix86_static_chain_on_stack)
13100 static_chain = ix86_static_chain (cfun->decl, false);
13101 insn = emit_insn (gen_push (static_chain));
13102 emit_insn (gen_blockage ());
13104 /* We don't want to interpret this push insn as a register save,
13105 only as a stack adjustment. The real copy of the register as
13106 a save will be done later, if needed. */
13107 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13108 t = gen_rtx_SET (stack_pointer_rtx, t);
13109 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13110 RTX_FRAME_RELATED_P (insn) = 1;
13113 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13114 DRAP is needed and stack realignment is really needed after reload. */
13115 if (stack_realign_drap)
13117 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13119 /* Can't use DRAP in interrupt function. */
13120 if (cfun->machine->func_type != TYPE_NORMAL)
13121 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13122 "in interrupt service routine. This may be worked "
13123 "around by avoiding functions with aggregate return.");
13125 /* Only need to push parameter pointer reg if it is caller saved. */
13126 if (!call_used_regs[REGNO (crtl->drap_reg)])
13128 /* Push arg pointer reg */
13129 insn = emit_insn (gen_push (crtl->drap_reg));
13130 RTX_FRAME_RELATED_P (insn) = 1;
13133 /* Grab the argument pointer. */
13134 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13135 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13136 RTX_FRAME_RELATED_P (insn) = 1;
13137 m->fs.cfa_reg = crtl->drap_reg;
13138 m->fs.cfa_offset = 0;
13140 /* Align the stack. */
13141 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13142 stack_pointer_rtx,
13143 GEN_INT (-align_bytes)));
13144 RTX_FRAME_RELATED_P (insn) = 1;
13146 /* Replicate the return address on the stack so that the return
13147 address can be reached via the (argp - 1) slot. This is needed
13148 to implement the macro RETURN_ADDR_RTX and the intrinsic function
13149 expand_builtin_return_addr, etc. */
13150 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13151 t = gen_frame_mem (word_mode, t);
13152 insn = emit_insn (gen_push (t));
13153 RTX_FRAME_RELATED_P (insn) = 1;
13155 /* For the purposes of frame and register save area addressing,
13156 we've started over with a new frame. */
13157 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13158 m->fs.realigned = true;
13160 if (static_chain)
13162 /* Replicate the static chain on the stack so that the static chain
13163 can be reached via the (argp - 2) slot. This is needed for
13164 nested functions with stack realignment. */
13165 insn = emit_insn (gen_push (static_chain));
13166 RTX_FRAME_RELATED_P (insn) = 1;
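      /* Illustrative only (not emitted verbatim): with a caller-saved DRAP
	 register such as %ecx and a 16-byte alignment requirement, the code
	 emitted above amounts to roughly

	     lea	WORD(%sp), %drap	; grab the argument pointer
	     and	$-16, %sp		; align the stack
	     push	-WORD(%drap)		; replicate the return address

	 followed by a push of the static chain when one is live; the exact
	 registers and constants depend on the target and frame.  */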
13170 int_registers_saved = (frame.nregs == 0);
13171 sse_registers_saved = (frame.nsseregs == 0);
13172 save_stub_call_needed = (m->call_ms2sysv);
13173 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13175 if (frame_pointer_needed && !m->fs.fp_valid)
13177 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13178 slower on all targets. Also sdb didn't like it. */
13179 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13180 RTX_FRAME_RELATED_P (insn) = 1;
13182 /* Push registers now, before setting the frame pointer
13183 on SEH target. */
13184 if (!int_registers_saved
13185 && TARGET_SEH
13186 && !frame.save_regs_using_mov)
13188 ix86_emit_save_regs ();
13189 int_registers_saved = true;
13190 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13193 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13195 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13196 RTX_FRAME_RELATED_P (insn) = 1;
13198 if (m->fs.cfa_reg == stack_pointer_rtx)
13199 m->fs.cfa_reg = hard_frame_pointer_rtx;
13200 m->fs.fp_offset = m->fs.sp_offset;
13201 m->fs.fp_valid = true;
13205 if (!int_registers_saved)
13207 /* If saving registers via PUSH, do so now. */
13208 if (!frame.save_regs_using_mov)
13210 ix86_emit_save_regs ();
13211 int_registers_saved = true;
13212 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13215 /* When using the red zone we may start register saving before allocating
13216 the stack frame, saving one cycle of the prologue. However, avoid
13217 doing this if we have to probe the stack; at least on x86_64 the
13218 stack probe can turn into a call that clobbers a red zone location. */
13219 else if (ix86_using_red_zone ()
13220 && (! TARGET_STACK_PROBE
13221 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13223 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13224 int_registers_saved = true;
13228 if (stack_realign_fp)
13230 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13231 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13233 /* Record last valid frame pointer offset. */
13234 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13236 /* The computation of the size of the re-aligned stack frame means
13237 that we must allocate the size of the register save area before
13238 performing the actual alignment. Otherwise we cannot guarantee
13239 that there's enough storage above the realignment point. */
13240 allocate = frame.reg_save_offset - m->fs.sp_offset
13241 + frame.stack_realign_allocate;
13242 if (allocate)
13243 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13244 GEN_INT (-allocate), -1, false);
13246 /* Align the stack. */
13247 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13248 stack_pointer_rtx,
13249 GEN_INT (-align_bytes)));
13250 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13251 m->fs.sp_realigned_offset = m->fs.sp_offset
13252 - frame.stack_realign_allocate;
13253 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13254 Beyond this point, stack access should be done via choose_baseaddr or
13255 by using sp_valid_at and fp_valid_at to determine the correct base
13256 register. Henceforth, any CFA offset should be thought of as logical
13257 and not physical. */
13258 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13259 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13260 m->fs.sp_realigned = true;
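  /* For illustration (an assumption about a typical frame, not emitted
     verbatim): with a 64-byte alignment requirement the two insns above
     amount to

	 sub	$ALLOCATE, %sp		; room for the register save area
	 and	$-64, %sp		; realign

     where ALLOCATE covers the register save area plus
     stack_realign_allocate.  */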
13262 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13263 is needed to describe where a register is saved using a realigned
13264 stack pointer, so we need to invalidate the stack pointer for that
13265 target. */
13266 if (TARGET_SEH)
13267 m->fs.sp_valid = false;
13269 /* If SP offset is non-immediate after allocation of the stack frame,
13270 then emit SSE saves or stub call prior to allocating the rest of the
13271 stack frame. This is less efficient for the out-of-line stub because
13272 we can't combine allocations across the call barrier, but it's better
13273 than using a scratch register. */
13274 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13275 - m->fs.sp_realigned_offset),
13276 Pmode))
13278 if (!sse_registers_saved)
13280 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13281 sse_registers_saved = true;
13283 else if (save_stub_call_needed)
13285 ix86_emit_outlined_ms2sysv_save (frame);
13286 save_stub_call_needed = false;
13291 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13293 if (flag_stack_usage_info)
13295 /* We start to count from ARG_POINTER. */
13296 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13298 /* If it was realigned, take into account the fake frame. */
13299 if (stack_realign_drap)
13301 if (ix86_static_chain_on_stack)
13302 stack_size += UNITS_PER_WORD;
13304 if (!call_used_regs[REGNO (crtl->drap_reg)])
13305 stack_size += UNITS_PER_WORD;
13307 /* This over-estimates by 1 minimal-stack-alignment-unit but
13308 mitigates that by counting in the new return address slot. */
13309 current_function_dynamic_stack_size
13310 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13313 current_function_static_stack_size = stack_size;
13316 /* On SEH target with very large frame size, allocate an area to save
13317 SSE registers (as the very large allocation won't be described). */
13318 if (TARGET_SEH
13319 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13320 && !sse_registers_saved)
13322 HOST_WIDE_INT sse_size =
13323 frame.sse_reg_save_offset - frame.reg_save_offset;
13325 gcc_assert (int_registers_saved);
13327 /* No need to do stack checking as the area will be immediately
13328 written. */
13329 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13330 GEN_INT (-sse_size), -1,
13331 m->fs.cfa_reg == stack_pointer_rtx);
13332 allocate -= sse_size;
13333 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13334 sse_registers_saved = true;
13337 /* The stack has already been decremented by the instruction calling us
13338 so probe if the size is non-negative to preserve the protection area. */
13339 if (allocate >= 0
13340 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13341 || flag_stack_clash_protection))
13343 /* This assert wants to verify that integer registers were saved
13344 prior to probing. This is necessary when probing may be implemented
13345 as a function call (Windows). It is not necessary for stack clash
13346 protection probing. */
13347 if (!flag_stack_clash_protection)
13348 gcc_assert (int_registers_saved);
13350 if (flag_stack_clash_protection)
13352 ix86_adjust_stack_and_probe_stack_clash (allocate);
13353 allocate = 0;
13355 else if (STACK_CHECK_MOVING_SP)
13357 if (!(crtl->is_leaf && !cfun->calls_alloca
13358 && allocate <= get_probe_interval ()))
13360 ix86_adjust_stack_and_probe (allocate);
13361 allocate = 0;
13364 else
13366 HOST_WIDE_INT size = allocate;
13368 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13369 size = 0x80000000 - get_stack_check_protect () - 1;
13371 if (TARGET_STACK_PROBE)
13373 if (crtl->is_leaf && !cfun->calls_alloca)
13375 if (size > get_probe_interval ())
13376 ix86_emit_probe_stack_range (0, size);
13378 else
13379 ix86_emit_probe_stack_range (0,
13380 size + get_stack_check_protect ());
13382 else
13384 if (crtl->is_leaf && !cfun->calls_alloca)
13386 if (size > get_probe_interval ()
13387 && size > get_stack_check_protect ())
13388 ix86_emit_probe_stack_range (get_stack_check_protect (),
13389 size - get_stack_check_protect ());
13391 else
13392 ix86_emit_probe_stack_range (get_stack_check_protect (), size);
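      /* A sketch of what the probing helpers expand to (illustrative only):
	 ix86_adjust_stack_and_probe_stack_clash allocates and touches the
	 frame one probe interval (normally a page) at a time, roughly

	     sub	$4096, %sp
	     or		$0, (%sp)	; touch the freshly allocated page

	 repeated until the residual is smaller than one interval, while
	 ix86_emit_probe_stack_range probes an already-computed range.  */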
13397 if (allocate == 0)
13399 else if (!ix86_target_stack_probe ()
13400 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13402 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13403 GEN_INT (-allocate), -1,
13404 m->fs.cfa_reg == stack_pointer_rtx);
13406 else
13408 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13409 rtx r10 = NULL;
13410 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13411 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13412 bool eax_live = ix86_eax_live_at_start_p ();
13413 bool r10_live = false;
13415 if (TARGET_64BIT)
13416 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13418 if (eax_live)
13420 insn = emit_insn (gen_push (eax));
13421 allocate -= UNITS_PER_WORD;
13422 /* Note that SEH directives need to continue tracking the stack
13423 pointer even after the frame pointer has been set up. */
13424 if (sp_is_cfa_reg || TARGET_SEH)
13426 if (sp_is_cfa_reg)
13427 m->fs.cfa_offset += UNITS_PER_WORD;
13428 RTX_FRAME_RELATED_P (insn) = 1;
13429 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13430 gen_rtx_SET (stack_pointer_rtx,
13431 plus_constant (Pmode, stack_pointer_rtx,
13432 -UNITS_PER_WORD)));
13436 if (r10_live)
13438 r10 = gen_rtx_REG (Pmode, R10_REG);
13439 insn = emit_insn (gen_push (r10));
13440 allocate -= UNITS_PER_WORD;
13441 if (sp_is_cfa_reg || TARGET_SEH)
13443 if (sp_is_cfa_reg)
13444 m->fs.cfa_offset += UNITS_PER_WORD;
13445 RTX_FRAME_RELATED_P (insn) = 1;
13446 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13447 gen_rtx_SET (stack_pointer_rtx,
13448 plus_constant (Pmode, stack_pointer_rtx,
13449 -UNITS_PER_WORD)));
13453 emit_move_insn (eax, GEN_INT (allocate));
13454 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13456 /* Use the fact that AX still contains ALLOCATE. */
13457 adjust_stack_insn = (Pmode == DImode
13458 ? gen_pro_epilogue_adjust_stack_di_sub
13459 : gen_pro_epilogue_adjust_stack_si_sub);
13461 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13462 stack_pointer_rtx, eax));
13464 if (sp_is_cfa_reg || TARGET_SEH)
13466 if (sp_is_cfa_reg)
13467 m->fs.cfa_offset += allocate;
13468 RTX_FRAME_RELATED_P (insn) = 1;
13469 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13470 gen_rtx_SET (stack_pointer_rtx,
13471 plus_constant (Pmode, stack_pointer_rtx,
13472 -allocate)));
13474 m->fs.sp_offset += allocate;
13476 /* Use stack_pointer_rtx for relative addressing so that code
13477 works for realigned stack, too. */
13478 if (r10_live && eax_live)
13480 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13481 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13482 gen_frame_mem (word_mode, t));
13483 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13484 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13485 gen_frame_mem (word_mode, t));
13487 else if (eax_live || r10_live)
13489 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13490 emit_move_insn (gen_rtx_REG (word_mode,
13491 (eax_live ? AX_REG : R10_REG)),
13492 gen_frame_mem (word_mode, t));
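      /* Illustrative shape of this path on a 64-bit Windows target, where
	 ix86_gen_allocate_stack_worker expands to a probing call (the stub
	 name is target-dependent, e.g. ___chkstk_ms):

	     push	%rax			; only if %rax is live at entry
	     mov	$ALLOCATE, %eax
	     call	___chkstk_ms		; probe the new pages
	     sub	%rax, %rsp		; the actual allocation
	     mov	(%rsp,%rax), %rax	; reload the saved %rax, if any  */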
13495 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13497 /* If we haven't already set up the frame pointer, do so now. */
13498 if (frame_pointer_needed && !m->fs.fp_valid)
13500 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13501 GEN_INT (frame.stack_pointer_offset
13502 - frame.hard_frame_pointer_offset));
13503 insn = emit_insn (insn);
13504 RTX_FRAME_RELATED_P (insn) = 1;
13505 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13507 if (m->fs.cfa_reg == stack_pointer_rtx)
13508 m->fs.cfa_reg = hard_frame_pointer_rtx;
13509 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13510 m->fs.fp_valid = true;
13513 if (!int_registers_saved)
13514 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13515 if (!sse_registers_saved)
13516 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13517 else if (save_stub_call_needed)
13518 ix86_emit_outlined_ms2sysv_save (frame);
13520 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
13521 in PROLOGUE. */
13522 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13524 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13525 insn = emit_insn (gen_set_got (pic));
13526 RTX_FRAME_RELATED_P (insn) = 1;
13527 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13528 emit_insn (gen_prologue_use (pic));
13529 /* Delete the already emitted SET_GOT, if it exists and is allocated to
13530 REAL_PIC_OFFSET_TABLE_REGNUM. */
13531 ix86_elim_entry_set_got (pic);
13534 if (crtl->drap_reg && !crtl->stack_realign_needed)
13536 /* vDRAP is set up, but after reload it turns out stack realignment
13537 isn't necessary; here we emit the prologue to set up DRAP
13538 without the stack realignment adjustment. */
13539 t = choose_baseaddr (0, NULL);
13540 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13543 /* Prevent instructions from being scheduled into register save push
13544 sequence when access to the redzone area is done through frame pointer.
13545 The offset between the frame pointer and the stack pointer is calculated
13546 relative to the value of the stack pointer at the end of the function
13547 prologue, and moving instructions that access redzone area via frame
13548 pointer inside push sequence violates this assumption. */
13549 if (frame_pointer_needed && frame.red_zone_size)
13550 emit_insn (gen_memory_blockage ());
13552 /* SEH requires that the prologue end within 256 bytes of the start of
13553 the function. Prevent instruction schedules that would extend that.
13554 Further, prevent alloca modifications to the stack pointer from being
13555 combined with prologue modifications. */
13556 if (TARGET_SEH)
13557 emit_insn (gen_prologue_use (stack_pointer_rtx));
13560 /* Emit code to restore REG using a POP insn. */
13562 static void
13563 ix86_emit_restore_reg_using_pop (rtx reg)
13565 struct machine_function *m = cfun->machine;
13566 rtx_insn *insn = emit_insn (gen_pop (reg));
13568 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
13569 m->fs.sp_offset -= UNITS_PER_WORD;
13571 if (m->fs.cfa_reg == crtl->drap_reg
13572 && REGNO (reg) == REGNO (crtl->drap_reg))
13574 /* Previously we'd represented the CFA as an expression
13575 like *(%ebp - 8). We've just popped that value from
13576 the stack, which means we need to reset the CFA to
13577 the drap register. This will remain until we restore
13578 the stack pointer. */
13579 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13580 RTX_FRAME_RELATED_P (insn) = 1;
13582 /* This means that the DRAP register is valid for addressing too. */
13583 m->fs.drap_valid = true;
13584 return;
13587 if (m->fs.cfa_reg == stack_pointer_rtx)
13589 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
13590 x = gen_rtx_SET (stack_pointer_rtx, x);
13591 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
13592 RTX_FRAME_RELATED_P (insn) = 1;
13594 m->fs.cfa_offset -= UNITS_PER_WORD;
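      /* Worked example (illustrative): if the CFA is currently %rsp + 16 and
	 we pop a call-saved register here, %rsp advances by one word, so the
	 REG_CFA_ADJUST_CFA note re-expresses the same CFA as %rsp + 8 on a
	 64-bit target, matching the UNITS_PER_WORD drop in cfa_offset.  */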
13597 /* When the frame pointer is the CFA, and we pop it, we are
13598 swapping back to the stack pointer as the CFA. This happens
13599 for stack frames that don't allocate other data, so we assume
13600 the stack pointer is now pointing at the return address, i.e.
13601 the function entry state, which makes the offset be 1 word. */
13602 if (reg == hard_frame_pointer_rtx)
13604 m->fs.fp_valid = false;
13605 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13607 m->fs.cfa_reg = stack_pointer_rtx;
13608 m->fs.cfa_offset -= UNITS_PER_WORD;
13610 add_reg_note (insn, REG_CFA_DEF_CFA,
13611 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
13612 GEN_INT (m->fs.cfa_offset)));
13613 RTX_FRAME_RELATED_P (insn) = 1;
13618 /* Emit code to restore saved registers using POP insns. */
13620 static void
13621 ix86_emit_restore_regs_using_pop (void)
13623 unsigned int regno;
13625 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13626 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
13627 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
13630 /* Emit code and notes for the LEAVE instruction. If INSN is non-null,
13631 the emit is omitted and only the notes are attached to it. */
13633 static void
13634 ix86_emit_leave (rtx_insn *insn)
13636 struct machine_function *m = cfun->machine;
13637 if (!insn)
13638 insn = emit_insn (ix86_gen_leave ());
13640 ix86_add_queued_cfa_restore_notes (insn);
13642 gcc_assert (m->fs.fp_valid);
13643 m->fs.sp_valid = true;
13644 m->fs.sp_realigned = false;
13645 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
13646 m->fs.fp_valid = false;
13648 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13650 m->fs.cfa_reg = stack_pointer_rtx;
13651 m->fs.cfa_offset = m->fs.sp_offset;
13653 add_reg_note (insn, REG_CFA_DEF_CFA,
13654 plus_constant (Pmode, stack_pointer_rtx,
13655 m->fs.sp_offset));
13656 RTX_FRAME_RELATED_P (insn) = 1;
13658 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
13659 m->fs.fp_offset);
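/* For reference: "leave" is equivalent to "mov %fp, %sp; pop %fp", which is
   why SP becomes the valid base again, FP becomes invalid, and sp_offset
   ends up one word below the old fp_offset.  */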
13662 /* Emit code to restore saved registers using MOV insns.
13663 First register is restored from CFA - CFA_OFFSET. */
13664 static void
13665 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
13666 bool maybe_eh_return)
13668 struct machine_function *m = cfun->machine;
13669 unsigned int regno;
13671 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13672 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13674 rtx reg = gen_rtx_REG (word_mode, regno);
13675 rtx mem;
13676 rtx_insn *insn;
13678 mem = choose_baseaddr (cfa_offset, NULL);
13679 mem = gen_frame_mem (word_mode, mem);
13680 insn = emit_move_insn (reg, mem);
13682 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
13684 /* Previously we'd represented the CFA as an expression
13685 like *(%ebp - 8). We've just popped that value from
13686 the stack, which means we need to reset the CFA to
13687 the drap register. This will remain until we restore
13688 the stack pointer. */
13689 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13690 RTX_FRAME_RELATED_P (insn) = 1;
13692 /* This means that the DRAP register is valid for addressing. */
13693 m->fs.drap_valid = true;
13695 else
13696 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13698 cfa_offset -= UNITS_PER_WORD;
13702 /* Emit code to restore saved registers using MOV insns.
13703 First register is restored from CFA - CFA_OFFSET. */
13704 static void
13705 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
13706 bool maybe_eh_return)
13708 unsigned int regno;
13710 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13711 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13713 rtx reg = gen_rtx_REG (V4SFmode, regno);
13714 rtx mem;
13715 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
13717 mem = choose_baseaddr (cfa_offset, &align);
13718 mem = gen_rtx_MEM (V4SFmode, mem);
13720 /* The location alignment depends upon the base register. */
13721 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
13722 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13723 set_mem_align (mem, align);
13724 emit_insn (gen_rtx_SET (reg, mem));
13726 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13728 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13732 static void
13733 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
13734 bool use_call, int style)
13736 struct machine_function *m = cfun->machine;
13737 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13738 + m->call_ms2sysv_extra_regs;
13739 rtvec v;
13740 unsigned int elems_needed, align, i, vi = 0;
13741 rtx_insn *insn;
13742 rtx sym, tmp;
13743 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
13744 rtx r10 = NULL_RTX;
13745 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13746 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
13747 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
13748 rtx rsi_frame_load = NULL_RTX;
13749 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
13750 enum xlogue_stub stub;
13752 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
13754 /* If using a realigned stack, we should never start with padding. */
13755 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
13757 /* Setup RSI as the stub's base pointer. */
13758 align = GET_MODE_ALIGNMENT (V4SFmode);
13759 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
13760 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13762 emit_insn (gen_rtx_SET (rsi, tmp));
13764 /* Get a symbol for the stub. */
13765 if (frame_pointer_needed)
13766 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
13767 : XLOGUE_STUB_RESTORE_HFP_TAIL;
13768 else
13769 stub = use_call ? XLOGUE_STUB_RESTORE
13770 : XLOGUE_STUB_RESTORE_TAIL;
13771 sym = xlogue.get_stub_rtx (stub);
13773 elems_needed = ncregs;
13774 if (use_call)
13775 elems_needed += 1;
13776 else
13777 elems_needed += frame_pointer_needed ? 5 : 3;
13778 v = rtvec_alloc (elems_needed);
13780 /* We call the epilogue stub when we need to pop incoming args or we are
13781 doing a sibling call as the tail. Otherwise, we emit a jmp to the
13782 epilogue stub and it becomes the tail call. */
13783 if (use_call)
13784 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13785 else
13787 RTVEC_ELT (v, vi++) = ret_rtx;
13788 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13789 if (frame_pointer_needed)
13791 rtx rbp = gen_rtx_REG (DImode, BP_REG);
13792 gcc_assert (m->fs.fp_valid);
13793 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
13795 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
13796 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
13797 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
13798 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
13799 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
13801 else
13803 /* If no hard frame pointer, we set R10 to the SP restore value. */
13804 gcc_assert (!m->fs.fp_valid);
13805 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
13806 gcc_assert (m->fs.sp_valid);
13808 r10 = gen_rtx_REG (DImode, R10_REG);
13809 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
13810 emit_insn (gen_rtx_SET (r10, tmp));
13812 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
13816 /* Generate frame load insns and restore notes. */
13817 for (i = 0; i < ncregs; ++i)
13819 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13820 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
13821 rtx reg, frame_load;
13823 reg = gen_rtx_REG (mode, r.regno);
13824 frame_load = gen_frame_load (reg, rsi, r.offset);
13826 /* Save RSI frame load insn & note to add last. */
13827 if (r.regno == SI_REG)
13829 gcc_assert (!rsi_frame_load);
13830 rsi_frame_load = frame_load;
13831 rsi_restore_offset = r.offset;
13833 else
13835 RTVEC_ELT (v, vi++) = frame_load;
13836 ix86_add_cfa_restore_note (NULL, reg, r.offset);
13840 /* Add RSI frame load & restore note at the end. */
13841 gcc_assert (rsi_frame_load);
13842 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
13843 RTVEC_ELT (v, vi++) = rsi_frame_load;
13844 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
13845 rsi_restore_offset);
13847 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
13848 if (!use_call && !frame_pointer_needed)
13850 gcc_assert (m->fs.sp_valid);
13851 gcc_assert (!m->fs.sp_realigned);
13853 /* At this point, R10 should point to frame.stack_realign_offset. */
13854 if (m->fs.cfa_reg == stack_pointer_rtx)
13855 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
13856 m->fs.sp_offset = frame.stack_realign_offset;
13859 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
13860 tmp = gen_rtx_PARALLEL (VOIDmode, v);
13861 if (use_call)
13862 insn = emit_insn (tmp);
13863 else
13865 insn = emit_jump_insn (tmp);
13866 JUMP_LABEL (insn) = ret_rtx;
13868 if (frame_pointer_needed)
13869 ix86_emit_leave (insn);
13870 else
13872 /* Need CFA adjust note. */
13873 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
13874 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
13878 RTX_FRAME_RELATED_P (insn) = true;
13879 ix86_add_queued_cfa_restore_notes (insn);
13881 /* If we're not doing a tail-call, we need to adjust the stack. */
13882 if (use_call && m->fs.sp_valid)
13884 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
13885 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13886 GEN_INT (dealloc), style,
13887 m->fs.cfa_reg == stack_pointer_rtx);
13891 /* Restore function stack, frame, and registers. */
13893 void
13894 ix86_expand_epilogue (int style)
13896 struct machine_function *m = cfun->machine;
13897 struct machine_frame_state frame_state_save = m->fs;
13898 struct ix86_frame frame;
13899 bool restore_regs_via_mov;
13900 bool using_drap;
13901 bool restore_stub_is_tail = false;
13903 if (ix86_function_naked (current_function_decl))
13905 /* The program should not reach this point. */
13906 emit_insn (gen_ud2 ());
13907 return;
13910 ix86_finalize_stack_frame_flags ();
13911 frame = m->frame;
13913 m->fs.sp_realigned = stack_realign_fp;
13914 m->fs.sp_valid = stack_realign_fp
13915 || !frame_pointer_needed
13916 || crtl->sp_is_unchanging;
13917 gcc_assert (!m->fs.sp_valid
13918 || m->fs.sp_offset == frame.stack_pointer_offset);
13920 /* The FP must be valid if and only if the frame pointer is needed. */
13921 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
13922 gcc_assert (!m->fs.fp_valid
13923 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
13925 /* We must have *some* valid pointer to the stack frame. */
13926 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
13928 /* The DRAP is never valid at this point. */
13929 gcc_assert (!m->fs.drap_valid);
13931 /* See the comment about red zone and frame
13932 pointer usage in ix86_expand_prologue. */
13933 if (frame_pointer_needed && frame.red_zone_size)
13934 emit_insn (gen_memory_blockage ());
13936 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
13937 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
13939 /* Determine the CFA offset of the end of the red-zone. */
13940 m->fs.red_zone_offset = 0;
13941 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
13943 /* The red-zone begins below return address and error code in
13944 exception handler. */
13945 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
13947 /* When the register save area is in the aligned portion of
13948 the stack, determine the maximum runtime displacement that
13949 matches up with the aligned frame. */
13950 if (stack_realign_drap)
13951 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
13952 + UNITS_PER_WORD);
13955 /* Special care must be taken for the normal return case of a function
13956 using eh_return: the eax and edx registers are marked as saved, but
13957 not restored along this path. Adjust the save location to match. */
13958 if (crtl->calls_eh_return && style != 2)
13959 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
13961 /* EH_RETURN requires the use of moves to function properly. */
13962 if (crtl->calls_eh_return)
13963 restore_regs_via_mov = true;
13964 /* SEH requires the use of pops to identify the epilogue. */
13965 else if (TARGET_SEH)
13966 restore_regs_via_mov = false;
13967 /* If we're only restoring one register and sp cannot be used, then
13968 use a move instruction to restore the register, since it's
13969 less work than reloading sp and popping the register. */
13970 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
13971 restore_regs_via_mov = true;
13972 else if (TARGET_EPILOGUE_USING_MOVE
13973 && cfun->machine->use_fast_prologue_epilogue
13974 && (frame.nregs > 1
13975 || m->fs.sp_offset != frame.reg_save_offset))
13976 restore_regs_via_mov = true;
13977 else if (frame_pointer_needed
13978 && !frame.nregs
13979 && m->fs.sp_offset != frame.reg_save_offset)
13980 restore_regs_via_mov = true;
13981 else if (frame_pointer_needed
13982 && TARGET_USE_LEAVE
13983 && cfun->machine->use_fast_prologue_epilogue
13984 && frame.nregs == 1)
13985 restore_regs_via_mov = true;
13986 else
13987 restore_regs_via_mov = false;
13989 if (restore_regs_via_mov || frame.nsseregs)
13991 /* Ensure that the entire register save area is addressable via
13992 the stack pointer, if we will restore SSE regs via sp. */
13993 if (TARGET_64BIT
13994 && m->fs.sp_offset > 0x7fffffff
13995 && sp_valid_at (frame.stack_realign_offset + 1)
13996 && (frame.nsseregs + frame.nregs) != 0)
13998 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13999 GEN_INT (m->fs.sp_offset
14000 - frame.sse_reg_save_offset),
14001 style,
14002 m->fs.cfa_reg == stack_pointer_rtx);
14006 /* If there are any SSE registers to restore, then we have to do it
14007 via moves, since there's obviously no pop for SSE regs. */
14008 if (frame.nsseregs)
14009 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14010 style == 2);
14012 if (m->call_ms2sysv)
14014 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
14016 /* We cannot use a tail-call for the stub if:
14017 1. We have to pop incoming args,
14018 2. We have additional int regs to restore,
14019 3. A sibling call will be the tail-call, or
14020 4. We are emitting an eh_return_internal epilogue.
14022 TODO: Item 4 has not yet been tested!
14024 If any of the above are true, we will call the stub rather than
14025 jump to it. */
14026 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
14027 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
14030 /* If using an out-of-line stub that is a tail call, then... */
14031 if (m->call_ms2sysv && restore_stub_is_tail)
14033 /* TODO: paranoid tests. (remove eventually) */
14034 gcc_assert (m->fs.sp_valid);
14035 gcc_assert (!m->fs.sp_realigned);
14036 gcc_assert (!m->fs.fp_valid);
14037 gcc_assert (!m->fs.realigned);
14038 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
14039 gcc_assert (!crtl->drap_reg);
14040 gcc_assert (!frame.nregs);
14042 else if (restore_regs_via_mov)
14044 rtx t;
14046 if (frame.nregs)
14047 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
14049 /* eh_return epilogues need %ecx added to the stack pointer. */
14050 if (style == 2)
14052 rtx sa = EH_RETURN_STACKADJ_RTX;
14053 rtx_insn *insn;
14055 /* %ecx can't be used for both DRAP register and eh_return. */
14056 if (crtl->drap_reg)
14057 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14059 /* regparm nested functions don't work with eh_return. */
14060 gcc_assert (!ix86_static_chain_on_stack);
14062 if (frame_pointer_needed)
14064 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14065 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14066 emit_insn (gen_rtx_SET (sa, t));
14068 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14069 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14071 /* Note that we use SA as a temporary CFA, as the return
14072 address is at the proper place relative to it. We
14073 pretend this happens at the FP restore insn because
14074 prior to this insn the FP would be stored at the wrong
14075 offset relative to SA, and after this insn we have no
14076 other reasonable register to use for the CFA. We don't
14077 bother resetting the CFA to the SP for the duration of
14078 the return insn, unless the control flow instrumentation
14079 is done. In this case the SP is used later and we have
14080 to reset CFA to SP. */
14081 add_reg_note (insn, REG_CFA_DEF_CFA,
14082 plus_constant (Pmode, sa, UNITS_PER_WORD));
14083 ix86_add_queued_cfa_restore_notes (insn);
14084 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14085 RTX_FRAME_RELATED_P (insn) = 1;
14087 m->fs.cfa_reg = sa;
14088 m->fs.cfa_offset = UNITS_PER_WORD;
14089 m->fs.fp_valid = false;
14091 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14092 const0_rtx, style,
14093 flag_cf_protection);
14095 else
14097 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14098 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14099 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14100 ix86_add_queued_cfa_restore_notes (insn);
14102 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14103 if (m->fs.cfa_offset != UNITS_PER_WORD)
14105 m->fs.cfa_offset = UNITS_PER_WORD;
14106 add_reg_note (insn, REG_CFA_DEF_CFA,
14107 plus_constant (Pmode, stack_pointer_rtx,
14108 UNITS_PER_WORD));
14109 RTX_FRAME_RELATED_P (insn) = 1;
14112 m->fs.sp_offset = UNITS_PER_WORD;
14113 m->fs.sp_valid = true;
14114 m->fs.sp_realigned = false;
14117 else
14119 /* SEH requires that the function end with (1) a stack adjustment
14120 if necessary, (2) a sequence of pops, and (3) a return or
14121 jump instruction. Prevent insns from the function body from
14122 being scheduled into this sequence. */
14123 if (TARGET_SEH)
14125 /* Prevent a catch region from being adjacent to the standard
14126 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14127 several other flags that would be interesting to test are
14128 yet set up. */
14129 if (flag_non_call_exceptions)
14130 emit_insn (gen_nops (const1_rtx));
14131 else
14132 emit_insn (gen_blockage ());
14135 /* First step is to deallocate the stack frame so that we can
14136 pop the registers. If the stack pointer was realigned, it needs
14137 to be restored now. Also do it on SEH target for very large
14138 frame as the emitted instructions aren't allowed by the ABI
14139 in epilogues. */
14140 if (!m->fs.sp_valid || m->fs.sp_realigned
14141 || (TARGET_SEH
14142 && (m->fs.sp_offset - frame.reg_save_offset
14143 >= SEH_MAX_FRAME_SIZE)))
14145 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14146 GEN_INT (m->fs.fp_offset
14147 - frame.reg_save_offset),
14148 style, false);
14150 else if (m->fs.sp_offset != frame.reg_save_offset)
14152 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14153 GEN_INT (m->fs.sp_offset
14154 - frame.reg_save_offset),
14155 style,
14156 m->fs.cfa_reg == stack_pointer_rtx);
14159 ix86_emit_restore_regs_using_pop ();
14162 /* If we used a frame pointer and haven't already got rid of it,
14163 then do so now. */
14164 if (m->fs.fp_valid)
14166 /* If the stack pointer is valid and pointing at the frame
14167 pointer store address, then we only need a pop. */
14168 if (sp_valid_at (frame.hfp_save_offset)
14169 && m->fs.sp_offset == frame.hfp_save_offset)
14170 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14171 /* Leave results in shorter dependency chains on CPUs that are
14172 able to grok it fast. */
14173 else if (TARGET_USE_LEAVE
14174 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14175 || !cfun->machine->use_fast_prologue_epilogue)
14176 ix86_emit_leave (NULL);
14177 else
14179 pro_epilogue_adjust_stack (stack_pointer_rtx,
14180 hard_frame_pointer_rtx,
14181 const0_rtx, style, !using_drap);
14182 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14186 if (using_drap)
14188 int param_ptr_offset = UNITS_PER_WORD;
14189 rtx_insn *insn;
14191 gcc_assert (stack_realign_drap);
14193 if (ix86_static_chain_on_stack)
14194 param_ptr_offset += UNITS_PER_WORD;
14195 if (!call_used_regs[REGNO (crtl->drap_reg)])
14196 param_ptr_offset += UNITS_PER_WORD;
14198 insn = emit_insn (gen_rtx_SET
14199 (stack_pointer_rtx,
14200 gen_rtx_PLUS (Pmode,
14201 crtl->drap_reg,
14202 GEN_INT (-param_ptr_offset))));
14203 m->fs.cfa_reg = stack_pointer_rtx;
14204 m->fs.cfa_offset = param_ptr_offset;
14205 m->fs.sp_offset = param_ptr_offset;
14206 m->fs.realigned = false;
14208 add_reg_note (insn, REG_CFA_DEF_CFA,
14209 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14210 GEN_INT (param_ptr_offset)));
14211 RTX_FRAME_RELATED_P (insn) = 1;
14213 if (!call_used_regs[REGNO (crtl->drap_reg)])
14214 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14217 /* At this point the stack pointer must be valid, and we must have
14218 restored all of the registers. We may not have deallocated the
14219 entire stack frame. We've delayed this until now because it may
14220 be possible to merge the local stack deallocation with the
14221 deallocation forced by ix86_static_chain_on_stack. */
14222 gcc_assert (m->fs.sp_valid);
14223 gcc_assert (!m->fs.sp_realigned);
14224 gcc_assert (!m->fs.fp_valid);
14225 gcc_assert (!m->fs.realigned);
14226 if (m->fs.sp_offset != UNITS_PER_WORD)
14228 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14229 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14230 style, true);
14232 else
14233 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14235 /* Sibcall epilogues don't want a return instruction. */
14236 if (style == 0)
14238 m->fs = frame_state_save;
14239 return;
14242 if (cfun->machine->func_type != TYPE_NORMAL)
14243 emit_jump_insn (gen_interrupt_return ());
14244 else if (crtl->args.pops_args && crtl->args.size)
14246 rtx popc = GEN_INT (crtl->args.pops_args);
14248 /* i386 can only pop 64K bytes. If asked to pop more, pop return
14249 address, do explicit add, and jump indirectly to the caller. */
14251 if (crtl->args.pops_args >= 65536)
14253 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14254 rtx_insn *insn;
14256 /* There is no "pascal" calling convention in any 64-bit ABI. */
14257 gcc_assert (!TARGET_64BIT);
14259 insn = emit_insn (gen_pop (ecx));
14260 m->fs.cfa_offset -= UNITS_PER_WORD;
14261 m->fs.sp_offset -= UNITS_PER_WORD;
14263 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14264 x = gen_rtx_SET (stack_pointer_rtx, x);
14265 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14266 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14267 RTX_FRAME_RELATED_P (insn) = 1;
14269 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14270 popc, -1, true);
14271 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14273 else
14274 emit_jump_insn (gen_simple_return_pop_internal (popc));
14276 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14278 /* In case of a return from EH, a simple return cannot be used,
14279 as the return address will be compared with a shadow stack
14280 return address. Use an indirect jump instead. */
14281 if (style == 2 && flag_cf_protection)
14283 /* Register used in indirect jump must be in word_mode. But
14284 Pmode may not be the same as word_mode for x32. */
14285 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14286 rtx_insn *insn;
14288 insn = emit_insn (gen_pop (ecx));
14289 m->fs.cfa_offset -= UNITS_PER_WORD;
14290 m->fs.sp_offset -= UNITS_PER_WORD;
14292 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14293 x = gen_rtx_SET (stack_pointer_rtx, x);
14294 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14295 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14296 RTX_FRAME_RELATED_P (insn) = 1;
14298 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14300 else
14301 emit_jump_insn (gen_simple_return_internal ());
14304 /* Restore the state back to the state from the prologue,
14305 so that it's correct for the next epilogue. */
14306 m->fs = frame_state_save;
14309 /* Reset from the function's potential modifications. */
14311 static void
14312 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14314 if (pic_offset_table_rtx
14315 && !ix86_use_pseudo_pic_reg ())
14316 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14318 if (TARGET_MACHO)
14320 rtx_insn *insn = get_last_insn ();
14321 rtx_insn *deleted_debug_label = NULL;
14323 /* Mach-O doesn't support labels at the end of objects, so if
14324 it looks like we might want one, take special action.
14325 First, collect any sequence of deleted debug labels. */
14326 while (insn
14327 && NOTE_P (insn)
14328 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14330 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14331 notes only, instead set their CODE_LABEL_NUMBER to -1,
14332 otherwise there would be code generation differences
14333 in between -g and -g0. */
14334 if (NOTE_P (insn) && NOTE_KIND (insn)
14335 == NOTE_INSN_DELETED_DEBUG_LABEL)
14336 deleted_debug_label = insn;
14337 insn = PREV_INSN (insn);
14340 /* If we have:
14341 label:
14342 barrier
14343 then this needs to be detected, so skip past the barrier. */
14345 if (insn && BARRIER_P (insn))
14346 insn = PREV_INSN (insn);
14348 /* Up to now we've only seen notes or barriers. */
14349 if (insn)
14351 if (LABEL_P (insn)
14352 || (NOTE_P (insn)
14353 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14354 /* Trailing label. */
14355 fputs ("\tnop\n", file);
14356 else if (cfun && ! cfun->is_thunk)
14358 /* See if we have a completely empty function body, skipping
14359 the special case of the picbase thunk emitted as asm. */
14360 while (insn && ! INSN_P (insn))
14361 insn = PREV_INSN (insn);
14362 /* If we don't find any insns, we've got an empty function body;
14363 i.e. completely empty, without a return or branch. This is
14364 taken as the case where a function body has been removed
14365 because it contains an inline __builtin_unreachable(). GCC
14366 declares that reaching __builtin_unreachable() means UB so
14367 we're not obliged to do anything special; however, we want
14368 non-zero-sized function bodies. To meet this, and help the
14369 user out, let's trap the case. */
14370 if (insn == NULL)
14371 fputs ("\tud2\n", file);
14374 else if (deleted_debug_label)
14375 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14376 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14377 CODE_LABEL_NUMBER (insn) = -1;
14381 /* Return a scratch register to use in the split stack prologue. The
14382 split stack prologue is used for -fsplit-stack. It is the first
14383 instructions in the function, even before the regular prologue.
14384 The scratch register can be any caller-saved register which is not
14385 used for parameters or for the static chain. */
14387 static unsigned int
14388 split_stack_prologue_scratch_regno (void)
14390 if (TARGET_64BIT)
14391 return R11_REG;
14392 else
14394 bool is_fastcall, is_thiscall;
14395 int regparm;
14397 is_fastcall = (lookup_attribute ("fastcall",
14398 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14399 != NULL);
14400 is_thiscall = (lookup_attribute ("thiscall",
14401 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14402 != NULL);
14403 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14405 if (is_fastcall)
14407 if (DECL_STATIC_CHAIN (cfun->decl))
14409 sorry ("-fsplit-stack does not support fastcall with "
14410 "nested function");
14411 return INVALID_REGNUM;
14413 return AX_REG;
14415 else if (is_thiscall)
14417 if (!DECL_STATIC_CHAIN (cfun->decl))
14418 return DX_REG;
14419 return AX_REG;
14421 else if (regparm < 3)
14423 if (!DECL_STATIC_CHAIN (cfun->decl))
14424 return CX_REG;
14425 else
14427 if (regparm >= 2)
14429 sorry ("-fsplit-stack does not support 2 register "
14430 "parameters for a nested function");
14431 return INVALID_REGNUM;
14433 return DX_REG;
14436 else
14438 /* FIXME: We could make this work by pushing a register
14439 around the addition and comparison. */
14440 sorry ("-fsplit-stack does not support 3 register parameters");
14441 return INVALID_REGNUM;
14446 /* A SYMBOL_REF for the function which allocates new stack space for
14447 -fsplit-stack. */
14449 static GTY(()) rtx split_stack_fn;
14451 /* A SYMBOL_REF for the __morestack function when using the large
14452 model. */
14454 static GTY(()) rtx split_stack_fn_large;
14456 /* Return location of the stack guard value in the TLS block. */
14459 ix86_split_stack_guard (void)
14461 int offset;
14462 addr_space_t as = DEFAULT_TLS_SEG_REG;
14463 rtx r;
14465 gcc_assert (flag_split_stack);
14467 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14468 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14469 #else
14470 gcc_unreachable ();
14471 #endif
14473 r = GEN_INT (offset);
14474 r = gen_const_mem (Pmode, r);
14475 set_mem_addr_space (r, as);
14477 return r;
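/* Example (illustrative; the offset and segment register are defined by the
   target headers): with the glibc TCB layout on x86-64 GNU/Linux,
   TARGET_THREAD_SPLIT_STACK_OFFSET is 0x70 and DEFAULT_TLS_SEG_REG selects
   %fs, so the returned mem is printed as %fs:0x70.  */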
14480 /* Handle -fsplit-stack. These are the first instructions in the
14481 function, even before the regular prologue. */
14483 void
14484 ix86_expand_split_stack_prologue (void)
14486 HOST_WIDE_INT allocate;
14487 unsigned HOST_WIDE_INT args_size;
14488 rtx_code_label *label;
14489 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14490 rtx scratch_reg = NULL_RTX;
14491 rtx_code_label *varargs_label = NULL;
14492 rtx fn;
14494 gcc_assert (flag_split_stack && reload_completed);
14496 ix86_finalize_stack_frame_flags ();
14497 struct ix86_frame &frame = cfun->machine->frame;
14498 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14500 /* This is the label we will branch to if we have enough stack
14501 space. We expect the basic block reordering pass to reverse this
14502 branch if optimizing, so that we branch in the unlikely case. */
14503 label = gen_label_rtx ();
14505 /* We need to compare the stack pointer minus the frame size with
14506 the stack boundary in the TCB. The stack boundary always gives
14507 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14508 can compare directly. Otherwise we need to do an addition. */
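  /* Illustrative 64-bit shape of the check emitted below, assuming the
     guard lives at %fs:0x70 and the frame is small enough to compare
     %rsp directly:

	 cmp	%fs:0x70, %rsp
	 jae	.Lenough		; predicted taken
	 ...pass the sizes in %r10/%r11 and call __morestack...
     .Lenough:  */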
14510 limit = ix86_split_stack_guard ();
14512 if (allocate < SPLIT_STACK_AVAILABLE)
14513 current = stack_pointer_rtx;
14514 else
14516 unsigned int scratch_regno;
14517 rtx offset;
14519 /* We need a scratch register to hold the stack pointer minus
14520 the required frame size. Since this is the very start of the
14521 function, the scratch register can be any caller-saved
14522 register which is not used for parameters. */
14523 offset = GEN_INT (- allocate);
14524 scratch_regno = split_stack_prologue_scratch_regno ();
14525 if (scratch_regno == INVALID_REGNUM)
14526 return;
14527 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14528 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14530 /* We don't use ix86_gen_add3 in this case because it will
14531 want to split to lea, but when not optimizing the insn
14532 will not be split after this point. */
14533 emit_insn (gen_rtx_SET (scratch_reg,
14534 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14535 offset)));
14537 else
14539 emit_move_insn (scratch_reg, offset);
14540 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14541 stack_pointer_rtx));
14543 current = scratch_reg;
14546 ix86_expand_branch (GEU, current, limit, label);
14547 rtx_insn *jump_insn = get_last_insn ();
14548 JUMP_LABEL (jump_insn) = label;
14550 /* Mark the jump as very likely to be taken. */
14551 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
14553 if (split_stack_fn == NULL_RTX)
14555 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14556 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14558 fn = split_stack_fn;
14560 /* Get more stack space. We pass in the desired stack space and the
14561 size of the arguments to copy to the new stack. In 32-bit mode
14562 we push the parameters; __morestack will return on a new stack
14563 anyhow. In 64-bit mode we pass the parameters in r10 and
14564 r11. */
14565 allocate_rtx = GEN_INT (allocate);
14566 args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
14567 call_fusage = NULL_RTX;
14568 rtx pop = NULL_RTX;
14569 if (TARGET_64BIT)
14571 rtx reg10, reg11;
14573 reg10 = gen_rtx_REG (Pmode, R10_REG);
14574 reg11 = gen_rtx_REG (Pmode, R11_REG);
14576 /* If this function uses a static chain, it will be in %r10.
14577 Preserve it across the call to __morestack. */
14578 if (DECL_STATIC_CHAIN (cfun->decl))
14580 rtx rax;
14582 rax = gen_rtx_REG (word_mode, AX_REG);
14583 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14584 use_reg (&call_fusage, rax);
14587 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14588 && !TARGET_PECOFF)
14590 HOST_WIDE_INT argval;
14592 gcc_assert (Pmode == DImode);
14593 /* When using the large model we need to load the address
14594 into a register, and we've run out of registers. So we
14595 switch to a different calling convention, and we call a
14596 different function: __morestack_large. We pass the
14597 argument size in the upper 32 bits of r10 and pass the
14598 frame size in the lower 32 bits. */
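	  /* Worked example (illustrative): args_size == 0x20 and
	     allocate == 0x1000 yield argval == 0x0000002000001000,
	     i.e. %r10 carries both values at once.  */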
14599 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14600 gcc_assert ((args_size & 0xffffffff) == args_size);
14602 if (split_stack_fn_large == NULL_RTX)
14604 split_stack_fn_large =
14605 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14606 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14608 if (ix86_cmodel == CM_LARGE_PIC)
14610 rtx_code_label *label;
14611 rtx x;
14613 label = gen_label_rtx ();
14614 emit_label (label);
14615 LABEL_PRESERVE_P (label) = 1;
14616 emit_insn (gen_set_rip_rex64 (reg10, label));
14617 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14618 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14619 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14620 UNSPEC_GOT);
14621 x = gen_rtx_CONST (Pmode, x);
14622 emit_move_insn (reg11, x);
14623 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14624 x = gen_const_mem (Pmode, x);
14625 emit_move_insn (reg11, x);
14627 else
14628 emit_move_insn (reg11, split_stack_fn_large);
14630 fn = reg11;
14632 argval = ((args_size << 16) << 16) + allocate;
14633 emit_move_insn (reg10, GEN_INT (argval));
14635 else
14637 emit_move_insn (reg10, allocate_rtx);
14638 emit_move_insn (reg11, GEN_INT (args_size));
14639 use_reg (&call_fusage, reg11);
14642 use_reg (&call_fusage, reg10);
14644 else
14646 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
14647 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
14648 insn = emit_insn (gen_push (allocate_rtx));
14649 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
14650 pop = GEN_INT (2 * UNITS_PER_WORD);
14652 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14653 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14654 pop, false);
14655 add_function_usage_to (call_insn, call_fusage);
14656 if (!TARGET_64BIT)
14657 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
14658 /* Indicate that this function can't jump to non-local gotos. */
14659 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
14661 /* In order to make call/return prediction work right, we now need
14662 to execute a return instruction. See
14663 libgcc/config/i386/morestack.S for the details on how this works.
14665 For flow purposes gcc must not see this as a return
14666 instruction--we need control flow to continue at the subsequent
14667 label. Therefore, we use an unspec. */
14668 gcc_assert (crtl->args.pops_args < 65536);
14669 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14671 /* If we are in 64-bit mode and this function uses a static chain,
14672 we saved %r10 in %rax before calling __morestack. */
14673 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14674 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14675 gen_rtx_REG (word_mode, AX_REG));
14677 /* If this function calls va_start, we need to store a pointer to
14678 the arguments on the old stack, because they may not have been
14679 all copied to the new stack. At this point the old stack can be
14680 found at the frame pointer value used by __morestack, because
14681 __morestack has set that up before calling back to us. Here we
14682 store that pointer in a scratch register, and in
14683 ix86_expand_prologue we store the scratch register in a stack
14684 slot. */
14685 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14687 unsigned int scratch_regno;
14688 rtx frame_reg;
14689 int words;
14691 scratch_regno = split_stack_prologue_scratch_regno ();
14692 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14693 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14695 /* 64-bit:
14696 fp -> old fp value
14697 return address within this function
14698 return address of caller of this function
14699 stack arguments
14700 So we add three words to get to the stack arguments.
14702 32-bit:
14703 fp -> old fp value
14704 return address within this function
14705 first argument to __morestack
14706 second argument to __morestack
14707 return address of caller of this function
14708 stack arguments
14709 So we add five words to get to the stack arguments.
14711 words = TARGET_64BIT ? 3 : 5;
14712 emit_insn (gen_rtx_SET (scratch_reg,
14713 gen_rtx_PLUS (Pmode, frame_reg,
14714 GEN_INT (words * UNITS_PER_WORD))));
14716 varargs_label = gen_label_rtx ();
14717 emit_jump_insn (gen_jump (varargs_label));
14718 JUMP_LABEL (get_last_insn ()) = varargs_label;
14720 emit_barrier ();
14723 emit_label (label);
14724 LABEL_NUSES (label) = 1;
14726 /* If this function calls va_start, we now have to set the scratch
14727 register for the case where we do not call __morestack. In this
14728 case we need to set it based on the stack pointer. */
14729 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14731 emit_insn (gen_rtx_SET (scratch_reg,
14732 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14733 GEN_INT (UNITS_PER_WORD))));
14735 emit_label (varargs_label);
14736 LABEL_NUSES (varargs_label) = 1;
14740 /* We may have to tell the dataflow pass that the split stack prologue
14741 is initializing a scratch register. */
14743 static void
14744 ix86_live_on_entry (bitmap regs)
14746 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14748 gcc_assert (flag_split_stack);
14749 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14753 /* Extract the parts of an RTL expression that is a valid memory address
14754 for an instruction. Return 0 if the structure of the address is
14755 grossly off. Return -1 if the address contains ASHIFT, so it is not
14756 strictly valid, but is still used for computing the length of the lea instruction. */
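/* Illustrative example (not taken from the source): the DImode address
       (plus (plus (mult (reg %rax) (const_int 4)) (reg %rbx)) (const_int 8))
   decomposes into base = %rbx, index = %rax, scale = 4, disp = 8, i.e. the
   operand that prints as 8(%rbx,%rax,4).  */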
14759 ix86_decompose_address (rtx addr, struct ix86_address *out)
14761 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14762 rtx base_reg, index_reg;
14763 HOST_WIDE_INT scale = 1;
14764 rtx scale_rtx = NULL_RTX;
14765 rtx tmp;
14766 int retval = 1;
14767 addr_space_t seg = ADDR_SPACE_GENERIC;
14769 /* Allow zero-extended SImode addresses; they will be
14770 emitted with the addr32 prefix. */
14771 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14773 if (GET_CODE (addr) == ZERO_EXTEND
14774 && GET_MODE (XEXP (addr, 0)) == SImode)
14776 addr = XEXP (addr, 0);
14777 if (CONST_INT_P (addr))
14778 return 0;
14780 else if (GET_CODE (addr) == AND
14781 && const_32bit_mask (XEXP (addr, 1), DImode))
14783 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14784 if (addr == NULL_RTX)
14785 return 0;
14787 if (CONST_INT_P (addr))
14788 return 0;
14792 /* Allow SImode subregs of DImode addresses;
14793 they will be emitted with the addr32 prefix. */
14794 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14796 if (SUBREG_P (addr)
14797 && GET_MODE (SUBREG_REG (addr)) == DImode)
14799 addr = SUBREG_REG (addr);
14800 if (CONST_INT_P (addr))
14801 return 0;
14805 if (REG_P (addr))
14806 base = addr;
14807 else if (SUBREG_P (addr))
14809 if (REG_P (SUBREG_REG (addr)))
14810 base = addr;
14811 else
14812 return 0;
14814 else if (GET_CODE (addr) == PLUS)
14816 rtx addends[4], op;
14817 int n = 0, i;
14819 op = addr;
14822 if (n >= 4)
14823 return 0;
14824 addends[n++] = XEXP (op, 1);
14825 op = XEXP (op, 0);
14827 while (GET_CODE (op) == PLUS);
14828 if (n >= 4)
14829 return 0;
14830 addends[n] = op;
14832 for (i = n; i >= 0; --i)
14834 op = addends[i];
14835 switch (GET_CODE (op))
14837 case MULT:
14838 if (index)
14839 return 0;
14840 index = XEXP (op, 0);
14841 scale_rtx = XEXP (op, 1);
14842 break;
14844 case ASHIFT:
14845 if (index)
14846 return 0;
14847 index = XEXP (op, 0);
14848 tmp = XEXP (op, 1);
14849 if (!CONST_INT_P (tmp))
14850 return 0;
14851 scale = INTVAL (tmp);
14852 if ((unsigned HOST_WIDE_INT) scale > 3)
14853 return 0;
14854 scale = 1 << scale;
14855 break;
14857 case ZERO_EXTEND:
14858 op = XEXP (op, 0);
14859 if (GET_CODE (op) != UNSPEC)
14860 return 0;
14861 /* FALLTHRU */
14863 case UNSPEC:
14864 if (XINT (op, 1) == UNSPEC_TP
14865 && TARGET_TLS_DIRECT_SEG_REFS
14866 && seg == ADDR_SPACE_GENERIC)
14867 seg = DEFAULT_TLS_SEG_REG;
14868 else
14869 return 0;
14870 break;
14872 case SUBREG:
14873 if (!REG_P (SUBREG_REG (op)))
14874 return 0;
14875 /* FALLTHRU */
14877 case REG:
14878 if (!base)
14879 base = op;
14880 else if (!index)
14881 index = op;
14882 else
14883 return 0;
14884 break;
14886 case CONST:
14887 case CONST_INT:
14888 case SYMBOL_REF:
14889 case LABEL_REF:
14890 if (disp)
14891 return 0;
14892 disp = op;
14893 break;
14895 default:
14896 return 0;
14900 else if (GET_CODE (addr) == MULT)
14902 index = XEXP (addr, 0); /* index*scale */
14903 scale_rtx = XEXP (addr, 1);
14905 else if (GET_CODE (addr) == ASHIFT)
14907 /* We're called for lea too, which implements ashift on occasion. */
14908 index = XEXP (addr, 0);
14909 tmp = XEXP (addr, 1);
14910 if (!CONST_INT_P (tmp))
14911 return 0;
14912 scale = INTVAL (tmp);
14913 if ((unsigned HOST_WIDE_INT) scale > 3)
14914 return 0;
14915 scale = 1 << scale;
14916 retval = -1;
14918 else
14919 disp = addr; /* displacement */
14921 if (index)
14923 if (REG_P (index))
14925 else if (SUBREG_P (index)
14926 && REG_P (SUBREG_REG (index)))
14928 else
14929 return 0;
14932 /* Extract the integral value of scale. */
14933 if (scale_rtx)
14935 if (!CONST_INT_P (scale_rtx))
14936 return 0;
14937 scale = INTVAL (scale_rtx);
14940 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
14941 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
14943 /* Avoid useless 0 displacement. */
14944 if (disp == const0_rtx && (base || index))
14945 disp = NULL_RTX;
14947 /* Allow arg pointer and stack pointer as index if there is no scaling. */
14948 if (base_reg && index_reg && scale == 1
14949 && (REGNO (index_reg) == ARG_POINTER_REGNUM
14950 || REGNO (index_reg) == FRAME_POINTER_REGNUM
14951 || REGNO (index_reg) == SP_REG))
14953 std::swap (base, index);
14954 std::swap (base_reg, index_reg);
14957 /* Special case: %ebp cannot be encoded as a base without a displacement.
14958 Similarly %r13. */
14959 if (!disp && base_reg
14960 && (REGNO (base_reg) == ARG_POINTER_REGNUM
14961 || REGNO (base_reg) == FRAME_POINTER_REGNUM
14962 || REGNO (base_reg) == BP_REG
14963 || REGNO (base_reg) == R13_REG))
14964 disp = const0_rtx;
14966 /* Special case: on K6, [%esi] makes the instruction vector decoded.
14967 Avoid this by transforming to [%esi+0].
14968 Reload calls address legitimization without cfun defined, so we need
14969 to test cfun for being non-NULL. */
14970 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
14971 && base_reg && !index_reg && !disp
14972 && REGNO (base_reg) == SI_REG)
14973 disp = const0_rtx;
14975 /* Special case: encode reg+reg instead of reg*2. */
14976 if (!base && index && scale == 2)
14977 base = index, base_reg = index_reg, scale = 1;
14979 /* Special case: scaling cannot be encoded without base or displacement. */
14980 if (!base && !disp && index && scale != 1)
14981 disp = const0_rtx;
14983 out->base = base;
14984 out->index = index;
14985 out->disp = disp;
14986 out->scale = scale;
14987 out->seg = seg;
14989 return retval;
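/* For illustration: an address such as
     (plus (plus (mult (reg A) (const_int 4)) (reg B)) (const_int 16))
   decomposes into base = B, index = A, scale = 4, disp = 16 and returns 1,
   while a bare (ashift (reg A) (const_int 3)) decomposes into index = A,
   scale = 8 and returns -1, since the ASHIFT form is only accepted for
   computing the length of an lea instruction. */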
14992 /* Return cost of the memory address x.
14993 For i386, it is better to use a complex address than let gcc copy
14994 the address into a reg and make a new pseudo. But not if the address
14995 requires two regs - that would mean more pseudos with longer
14996 lifetimes. */
14997 static int
14998 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15000 struct ix86_address parts;
15001 int cost = 1;
15002 int ok = ix86_decompose_address (x, &parts);
15004 gcc_assert (ok);
15006 if (parts.base && SUBREG_P (parts.base))
15007 parts.base = SUBREG_REG (parts.base);
15008 if (parts.index && SUBREG_P (parts.index))
15009 parts.index = SUBREG_REG (parts.index);
15011 /* Attempt to minimize number of registers in the address by increasing
15012 address cost for each used register. We don't increase address cost
15013 for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
15014 is not invariant itself it most likely means that base or index is not
15015 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15016 which is not profitable for x86. */
15017 if (parts.base
15018 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15019 && (current_pass->type == GIMPLE_PASS
15020 || !pic_offset_table_rtx
15021 || !REG_P (parts.base)
15022 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15023 cost++;
15025 if (parts.index
15026 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15027 && (current_pass->type == GIMPLE_PASS
15028 || !pic_offset_table_rtx
15029 || !REG_P (parts.index)
15030 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15031 cost++;
15033 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15034 since its predecode logic can't detect the length of such instructions
15035 and decoding degenerates to the vector path. Increase the cost of such
15036 addresses here. The penalty is at least 2 cycles. It may be worthwhile
15037 to split such addresses or even refuse them altogether.
15039 The following addressing modes are affected:
15040 [base+scale*index]
15041 [scale*index+disp]
15042 [base+index]
15044 The first and last case may be avoidable by explicitly coding the zero in
15045 the memory address, but I don't have an AMD-K6 machine handy to check this
15046 theory. */
15048 if (TARGET_K6
15049 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15050 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15051 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15052 cost += 10;
15054 return cost;
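/* For illustration of the cost model above: a plain (reg) address using a
   hard register costs 1 and the same address using a pseudo costs 2, a
   pseudo base plus a pseudo index costs 3, and on K6 a base-plus-index
   form without a displacement additionally pays the penalty of 10. */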
15057 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15058 this is used to form addresses of local data when -fPIC is in
15059 use. */
15061 static bool
15062 darwin_local_data_pic (rtx disp)
15064 return (GET_CODE (disp) == UNSPEC
15065 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15068 /* True if operand X should be loaded from GOT. */
15070 bool
15071 ix86_force_load_from_GOT_p (rtx x)
15073 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15074 && !TARGET_PECOFF && !TARGET_MACHO
15075 && !flag_plt && !flag_pic
15076 && ix86_cmodel != CM_LARGE
15077 && GET_CODE (x) == SYMBOL_REF
15078 && SYMBOL_REF_FUNCTION_P (x)
15079 && !SYMBOL_REF_LOCAL_P (x));
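/* For illustration: with -fno-plt and without -fPIC, a call to an external
   function foo on x86-64 is typically emitted as
     call  *foo@GOTPCREL(%rip)
   instead of a direct "call foo" resolved through the PLT, which is why
   such SYMBOL_REFs must be loaded from the GOT rather than used as
   immediates. */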
15082 /* Determine if a given RTX is a valid constant. We already know this
15083 satisfies CONSTANT_P. */
15085 static bool
15086 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15088 /* Pointer bounds constants are not valid. */
15089 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15090 return false;
15092 switch (GET_CODE (x))
15094 case CONST:
15095 x = XEXP (x, 0);
15097 if (GET_CODE (x) == PLUS)
15099 if (!CONST_INT_P (XEXP (x, 1)))
15100 return false;
15101 x = XEXP (x, 0);
15104 if (TARGET_MACHO && darwin_local_data_pic (x))
15105 return true;
15107 /* Only some unspecs are valid as "constants". */
15108 if (GET_CODE (x) == UNSPEC)
15109 switch (XINT (x, 1))
15111 case UNSPEC_GOT:
15112 case UNSPEC_GOTOFF:
15113 case UNSPEC_PLTOFF:
15114 return TARGET_64BIT;
15115 case UNSPEC_TPOFF:
15116 case UNSPEC_NTPOFF:
15117 x = XVECEXP (x, 0, 0);
15118 return (GET_CODE (x) == SYMBOL_REF
15119 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15120 case UNSPEC_DTPOFF:
15121 x = XVECEXP (x, 0, 0);
15122 return (GET_CODE (x) == SYMBOL_REF
15123 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15124 default:
15125 return false;
15128 /* We must have drilled down to a symbol. */
15129 if (GET_CODE (x) == LABEL_REF)
15130 return true;
15131 if (GET_CODE (x) != SYMBOL_REF)
15132 return false;
15133 /* FALLTHRU */
15135 case SYMBOL_REF:
15136 /* TLS symbols are never valid. */
15137 if (SYMBOL_REF_TLS_MODEL (x))
15138 return false;
15140 /* DLLIMPORT symbols are never valid. */
15141 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15142 && SYMBOL_REF_DLLIMPORT_P (x))
15143 return false;
15145 #if TARGET_MACHO
15146 /* mdynamic-no-pic */
15147 if (MACHO_DYNAMIC_NO_PIC_P)
15148 return machopic_symbol_defined_p (x);
15149 #endif
15151 /* An external function address should be loaded
15152 via the GOT slot to avoid the PLT. */
15153 if (ix86_force_load_from_GOT_p (x))
15154 return false;
15156 break;
15158 CASE_CONST_SCALAR_INT:
15159 switch (mode)
15161 case E_TImode:
15162 if (TARGET_64BIT)
15163 return true;
15164 /* FALLTHRU */
15165 case E_OImode:
15166 case E_XImode:
15167 if (!standard_sse_constant_p (x, mode))
15168 return false;
15169 default:
15170 break;
15172 break;
15174 case CONST_VECTOR:
15175 if (!standard_sse_constant_p (x, mode))
15176 return false;
15178 default:
15179 break;
15182 /* Otherwise we handle everything else in the move patterns. */
15183 return true;
15186 /* Determine if it's legal to put X into the constant pool. This
15187 is not possible for the address of thread-local symbols, which
15188 is checked above. */
15190 static bool
15191 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15193 /* We can put any immediate constant in memory. */
15194 switch (GET_CODE (x))
15196 CASE_CONST_ANY:
15197 return false;
15199 default:
15200 break;
15203 return !ix86_legitimate_constant_p (mode, x);
15206 /* Nonzero if the symbol is marked as dllimport or as a stub variable,
15207 otherwise zero. */
15209 static bool
15210 is_imported_p (rtx x)
15212 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15213 || GET_CODE (x) != SYMBOL_REF)
15214 return false;
15216 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15220 /* Nonzero if the constant value X is a legitimate general operand
15221 when generating PIC code. It is given that flag_pic is on and
15222 that X satisfies CONSTANT_P. */
15224 bool
15225 legitimate_pic_operand_p (rtx x)
15227 rtx inner;
15229 switch (GET_CODE (x))
15231 case CONST:
15232 inner = XEXP (x, 0);
15233 if (GET_CODE (inner) == PLUS
15234 && CONST_INT_P (XEXP (inner, 1)))
15235 inner = XEXP (inner, 0);
15237 /* Only some unspecs are valid as "constants". */
15238 if (GET_CODE (inner) == UNSPEC)
15239 switch (XINT (inner, 1))
15241 case UNSPEC_GOT:
15242 case UNSPEC_GOTOFF:
15243 case UNSPEC_PLTOFF:
15244 return TARGET_64BIT;
15245 case UNSPEC_TPOFF:
15246 x = XVECEXP (inner, 0, 0);
15247 return (GET_CODE (x) == SYMBOL_REF
15248 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15249 case UNSPEC_MACHOPIC_OFFSET:
15250 return legitimate_pic_address_disp_p (x);
15251 default:
15252 return false;
15254 /* FALLTHRU */
15256 case SYMBOL_REF:
15257 case LABEL_REF:
15258 return legitimate_pic_address_disp_p (x);
15260 default:
15261 return true;
15265 /* Determine if a given CONST RTX is a valid memory displacement
15266 in PIC mode. */
15268 bool
15269 legitimate_pic_address_disp_p (rtx disp)
15271 bool saw_plus;
15273 /* In 64bit mode we can allow direct addresses of symbols and labels
15274 when they are not dynamic symbols. */
15275 if (TARGET_64BIT)
15277 rtx op0 = disp, op1;
15279 switch (GET_CODE (disp))
15281 case LABEL_REF:
15282 return true;
15284 case CONST:
15285 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15286 break;
15287 op0 = XEXP (XEXP (disp, 0), 0);
15288 op1 = XEXP (XEXP (disp, 0), 1);
15289 if (!CONST_INT_P (op1))
15290 break;
15291 if (GET_CODE (op0) == UNSPEC
15292 && (XINT (op0, 1) == UNSPEC_DTPOFF
15293 || XINT (op0, 1) == UNSPEC_NTPOFF)
15294 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15295 return true;
15296 if (INTVAL (op1) >= 16*1024*1024
15297 || INTVAL (op1) < -16*1024*1024)
15298 break;
15299 if (GET_CODE (op0) == LABEL_REF)
15300 return true;
15301 if (GET_CODE (op0) == CONST
15302 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15303 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15304 return true;
15305 if (GET_CODE (op0) == UNSPEC
15306 && XINT (op0, 1) == UNSPEC_PCREL)
15307 return true;
15308 if (GET_CODE (op0) != SYMBOL_REF)
15309 break;
15310 /* FALLTHRU */
15312 case SYMBOL_REF:
15313 /* TLS references should always be enclosed in UNSPEC.
15314 A dllimported symbol always needs to be resolved. */
15315 if (SYMBOL_REF_TLS_MODEL (op0)
15316 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15317 return false;
15319 if (TARGET_PECOFF)
15321 if (is_imported_p (op0))
15322 return true;
15324 if (SYMBOL_REF_FAR_ADDR_P (op0)
15325 || !SYMBOL_REF_LOCAL_P (op0))
15326 break;
15328 /* Function symbols need to be resolved only for
15329 the large model.
15330 For the small model we don't need to resolve anything
15331 here. */
15332 if ((ix86_cmodel != CM_LARGE_PIC
15333 && SYMBOL_REF_FUNCTION_P (op0))
15334 || ix86_cmodel == CM_SMALL_PIC)
15335 return true;
15336 /* Non-external symbols don't need to be resolved for
15337 the large and medium models. */
15338 if ((ix86_cmodel == CM_LARGE_PIC
15339 || ix86_cmodel == CM_MEDIUM_PIC)
15340 && !SYMBOL_REF_EXTERNAL_P (op0))
15341 return true;
15343 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15344 && (SYMBOL_REF_LOCAL_P (op0)
15345 || (HAVE_LD_PIE_COPYRELOC
15346 && flag_pie
15347 && !SYMBOL_REF_WEAK (op0)
15348 && !SYMBOL_REF_FUNCTION_P (op0)))
15349 && ix86_cmodel != CM_LARGE_PIC)
15350 return true;
15351 break;
15353 default:
15354 break;
15357 if (GET_CODE (disp) != CONST)
15358 return false;
15359 disp = XEXP (disp, 0);
15361 if (TARGET_64BIT)
15363 /* It is unsafe to allow PLUS expressions; this limits the allowed
15364 distance of GOT table references, but we should not need these anyway. */
15365 if (GET_CODE (disp) != UNSPEC
15366 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15367 && XINT (disp, 1) != UNSPEC_GOTOFF
15368 && XINT (disp, 1) != UNSPEC_PCREL
15369 && XINT (disp, 1) != UNSPEC_PLTOFF))
15370 return false;
15372 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15373 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15374 return false;
15375 return true;
15378 saw_plus = false;
15379 if (GET_CODE (disp) == PLUS)
15381 if (!CONST_INT_P (XEXP (disp, 1)))
15382 return false;
15383 disp = XEXP (disp, 0);
15384 saw_plus = true;
15387 if (TARGET_MACHO && darwin_local_data_pic (disp))
15388 return true;
15390 if (GET_CODE (disp) != UNSPEC)
15391 return false;
15393 switch (XINT (disp, 1))
15395 case UNSPEC_GOT:
15396 if (saw_plus)
15397 return false;
15398 /* We need to check for both symbols and labels because VxWorks loads
15399 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15400 details. */
15401 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15402 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15403 case UNSPEC_GOTOFF:
15404 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15405 While the ABI also specifies a 32bit relocation, we don't produce it in
15406 the small PIC model at all. */
15407 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15408 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15409 && !TARGET_64BIT)
15410 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15411 return false;
15412 case UNSPEC_GOTTPOFF:
15413 case UNSPEC_GOTNTPOFF:
15414 case UNSPEC_INDNTPOFF:
15415 if (saw_plus)
15416 return false;
15417 disp = XVECEXP (disp, 0, 0);
15418 return (GET_CODE (disp) == SYMBOL_REF
15419 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15420 case UNSPEC_NTPOFF:
15421 disp = XVECEXP (disp, 0, 0);
15422 return (GET_CODE (disp) == SYMBOL_REF
15423 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15424 case UNSPEC_DTPOFF:
15425 disp = XVECEXP (disp, 0, 0);
15426 return (GET_CODE (disp) == SYMBOL_REF
15427 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15430 return false;
15433 /* Determine if OP is a suitable RTX for an address register.
15434 Return the naked register if a register or a register subreg is
15435 found, otherwise return NULL_RTX. */
15437 static rtx
15438 ix86_validate_address_register (rtx op)
15440 machine_mode mode = GET_MODE (op);
15442 /* Only SImode or DImode registers can form the address. */
15443 if (mode != SImode && mode != DImode)
15444 return NULL_RTX;
15446 if (REG_P (op))
15447 return op;
15448 else if (SUBREG_P (op))
15450 rtx reg = SUBREG_REG (op);
15452 if (!REG_P (reg))
15453 return NULL_RTX;
15455 mode = GET_MODE (reg);
15457 /* Don't allow SUBREGs that span more than a word. It can
15458 lead to spill failures when the register is one word out
15459 of a two word structure. */
15460 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15461 return NULL_RTX;
15463 /* Allow only SUBREGs of non-eliminable hard registers. */
15464 if (register_no_elim_operand (reg, mode))
15465 return reg;
15468 /* Op is not a register. */
15469 return NULL_RTX;
15472 /* Recognizes RTL expressions that are valid memory addresses for an
15473 instruction. The MODE argument is the machine mode for the MEM
15474 expression that wants to use this address.
15476 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15477 convert common non-canonical forms to canonical form so that they will
15478 be recognized. */
15480 static bool
15481 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15483 struct ix86_address parts;
15484 rtx base, index, disp;
15485 HOST_WIDE_INT scale;
15486 addr_space_t seg;
15488 if (ix86_decompose_address (addr, &parts) <= 0)
15489 /* Decomposition failed. */
15490 return false;
15492 base = parts.base;
15493 index = parts.index;
15494 disp = parts.disp;
15495 scale = parts.scale;
15496 seg = parts.seg;
15498 /* Validate base register. */
15499 if (base)
15501 rtx reg = ix86_validate_address_register (base);
15503 if (reg == NULL_RTX)
15504 return false;
15506 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15507 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15508 /* Base is not valid. */
15509 return false;
15512 /* Validate index register. */
15513 if (index)
15515 rtx reg = ix86_validate_address_register (index);
15517 if (reg == NULL_RTX)
15518 return false;
15520 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15521 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15522 /* Index is not valid. */
15523 return false;
15526 /* Index and base should have the same mode. */
15527 if (base && index
15528 && GET_MODE (base) != GET_MODE (index))
15529 return false;
15531 /* Address override works only on the (%reg) part of %fs:(%reg). */
15532 if (seg != ADDR_SPACE_GENERIC
15533 && ((base && GET_MODE (base) != word_mode)
15534 || (index && GET_MODE (index) != word_mode)))
15535 return false;
15537 /* Validate scale factor. */
15538 if (scale != 1)
15540 if (!index)
15541 /* Scale without index. */
15542 return false;
15544 if (scale != 2 && scale != 4 && scale != 8)
15545 /* Scale is not a valid multiplier. */
15546 return false;
15549 /* Validate displacement. */
15550 if (disp)
15552 if (GET_CODE (disp) == CONST
15553 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15554 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15555 switch (XINT (XEXP (disp, 0), 1))
15557 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15558 when used. While the ABI also specifies 32bit relocations, we
15559 don't produce them at all and use IP-relative addressing instead.
15560 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15561 should be loaded via the GOT. */
15562 case UNSPEC_GOT:
15563 if (!TARGET_64BIT
15564 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15565 goto is_legitimate_pic;
15566 /* FALLTHRU */
15567 case UNSPEC_GOTOFF:
15568 gcc_assert (flag_pic);
15569 if (!TARGET_64BIT)
15570 goto is_legitimate_pic;
15572 /* 64bit address unspec. */
15573 return false;
15575 case UNSPEC_GOTPCREL:
15576 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15577 goto is_legitimate_pic;
15578 /* FALLTHRU */
15579 case UNSPEC_PCREL:
15580 gcc_assert (flag_pic);
15581 goto is_legitimate_pic;
15583 case UNSPEC_GOTTPOFF:
15584 case UNSPEC_GOTNTPOFF:
15585 case UNSPEC_INDNTPOFF:
15586 case UNSPEC_NTPOFF:
15587 case UNSPEC_DTPOFF:
15588 break;
15590 default:
15591 /* Invalid address unspec. */
15592 return false;
15595 else if (SYMBOLIC_CONST (disp)
15596 && (flag_pic
15597 || (TARGET_MACHO
15598 #if TARGET_MACHO
15599 && MACHOPIC_INDIRECT
15600 && !machopic_operand_p (disp)
15601 #endif
15605 is_legitimate_pic:
15606 if (TARGET_64BIT && (index || base))
15608 /* foo@dtpoff(%rX) is ok. */
15609 if (GET_CODE (disp) != CONST
15610 || GET_CODE (XEXP (disp, 0)) != PLUS
15611 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15612 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15613 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15614 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15615 /* Non-constant pic memory reference. */
15616 return false;
15618 else if ((!TARGET_MACHO || flag_pic)
15619 && ! legitimate_pic_address_disp_p (disp))
15620 /* Displacement is an invalid pic construct. */
15621 return false;
15622 #if TARGET_MACHO
15623 else if (MACHO_DYNAMIC_NO_PIC_P
15624 && !ix86_legitimate_constant_p (Pmode, disp))
15625 /* displacement must be referenced via non_lazy_pointer */
15626 return false;
15627 #endif
15629 /* This code used to verify that a symbolic pic displacement
15630 includes the pic_offset_table_rtx register.
15632 While this is a good idea, unfortunately these constructs may
15633 be created by the "adds using lea" optimization for incorrect
15634 code like:
15636 int a;
15637 int foo(int i)
15639 return *(&a+i);
15642 This code is nonsensical, but results in addressing the
15643 GOT table with a pic_offset_table_rtx base. We can't
15644 just refuse it easily, since it gets matched by the
15645 "addsi3" pattern, which later gets split to lea when the
15646 output register differs from the input. While this
15647 could be handled by a separate addsi pattern for this case
15648 that never results in lea, disabling this test seems to be
15649 the easier and correct fix for the crash. */
15651 else if (GET_CODE (disp) != LABEL_REF
15652 && !CONST_INT_P (disp)
15653 && (GET_CODE (disp) != CONST
15654 || !ix86_legitimate_constant_p (Pmode, disp))
15655 && (GET_CODE (disp) != SYMBOL_REF
15656 || !ix86_legitimate_constant_p (Pmode, disp)))
15657 /* Displacement is not constant. */
15658 return false;
15659 else if (TARGET_64BIT
15660 && !x86_64_immediate_operand (disp, VOIDmode))
15661 /* Displacement is out of range. */
15662 return false;
15663 /* In x32 mode, constant addresses are sign-extended to 64 bits, so
15664 we have to reject addresses from 0x80000000 to 0xffffffff. */
15665 else if (TARGET_X32 && !(index || base)
15666 && CONST_INT_P (disp)
15667 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15668 return false;
15671 /* Everything looks valid. */
15672 return true;
15675 /* Determine if a given RTX is a valid constant address. */
15677 bool
15678 constant_address_p (rtx x)
15680 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15683 /* Return a unique alias set for the GOT. */
15685 static alias_set_type
15686 ix86_GOT_alias_set (void)
15688 static alias_set_type set = -1;
15689 if (set == -1)
15690 set = new_alias_set ();
15691 return set;
15694 /* Return a legitimate reference for ORIG (an address) using the
15695 register REG. If REG is 0, a new pseudo is generated.
15697 There are two types of references that must be handled:
15699 1. Global data references must load the address from the GOT, via
15700 the PIC reg. An insn is emitted to do this load, and the reg is
15701 returned.
15703 2. Static data references, constant pool addresses, and code labels
15704 compute the address as an offset from the GOT, whose base is in
15705 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15706 differentiate them from global data objects. The returned
15707 address is the PIC reg + an unspec constant.
15709 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15710 reg also appears in the address. */
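/* For illustration (32-bit, %ebx holding the PIC base): case 1 typically
   becomes a load of the address from the GOT,
     movl  g@GOT(%ebx), %eax
   while case 2 addresses the object relative to the PIC base,
     leal  s@GOTOFF(%ebx), %eax
   corresponding to the UNSPEC_GOT and UNSPEC_GOTOFF forms constructed
   below. */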
15712 static rtx
15713 legitimize_pic_address (rtx orig, rtx reg)
15715 rtx addr = orig;
15716 rtx new_rtx = orig;
15718 #if TARGET_MACHO
15719 if (TARGET_MACHO && !TARGET_64BIT)
15721 if (reg == 0)
15722 reg = gen_reg_rtx (Pmode);
15723 /* Use the generic Mach-O PIC machinery. */
15724 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15726 #endif
15728 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15730 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15731 if (tmp)
15732 return tmp;
15735 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15736 new_rtx = addr;
15737 else if ((!TARGET_64BIT
15738 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15739 && !TARGET_PECOFF
15740 && gotoff_operand (addr, Pmode))
15742 /* This symbol may be referenced via a displacement
15743 from the PIC base address (@GOTOFF). */
15744 if (GET_CODE (addr) == CONST)
15745 addr = XEXP (addr, 0);
15747 if (GET_CODE (addr) == PLUS)
15749 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15750 UNSPEC_GOTOFF);
15751 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15753 else
15754 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15756 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15758 if (TARGET_64BIT)
15759 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15761 if (reg != 0)
15763 gcc_assert (REG_P (reg));
15764 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15765 new_rtx, reg, 1, OPTAB_DIRECT);
15767 else
15768 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15770 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15771 /* We can't use @GOTOFF for text labels
15772 on VxWorks, see gotoff_operand. */
15773 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15775 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15776 if (tmp)
15777 return tmp;
15779 /* For x64 PE-COFF there is no GOT table,
15780 so we use the address directly. */
15781 if (TARGET_64BIT && TARGET_PECOFF)
15783 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15784 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15786 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15788 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15789 UNSPEC_GOTPCREL);
15790 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15791 new_rtx = gen_const_mem (Pmode, new_rtx);
15792 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15794 else
15796 /* This symbol must be referenced via a load
15797 from the Global Offset Table (@GOT). */
15798 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15799 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15800 if (TARGET_64BIT)
15801 new_rtx = force_reg (Pmode, new_rtx);
15802 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15803 new_rtx = gen_const_mem (Pmode, new_rtx);
15804 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15807 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15809 else
15811 if (CONST_INT_P (addr)
15812 && !x86_64_immediate_operand (addr, VOIDmode))
15813 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
15814 else if (GET_CODE (addr) == CONST)
15816 addr = XEXP (addr, 0);
15818 /* We must match stuff we generated before. Assume the only
15819 unspecs that can get here are ours. Not that we could do
15820 anything with them anyway.... */
15821 if (GET_CODE (addr) == UNSPEC
15822 || (GET_CODE (addr) == PLUS
15823 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
15824 return orig;
15825 gcc_assert (GET_CODE (addr) == PLUS);
15828 if (GET_CODE (addr) == PLUS)
15830 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
15832 /* Check first to see if this is a constant
15833 offset from a @GOTOFF symbol reference. */
15834 if (!TARGET_PECOFF
15835 && gotoff_operand (op0, Pmode)
15836 && CONST_INT_P (op1))
15838 if (!TARGET_64BIT)
15840 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
15841 UNSPEC_GOTOFF);
15842 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
15843 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15845 if (reg != 0)
15847 gcc_assert (REG_P (reg));
15848 new_rtx = expand_simple_binop (Pmode, PLUS,
15849 pic_offset_table_rtx,
15850 new_rtx, reg, 1,
15851 OPTAB_DIRECT);
15853 else
15854 new_rtx
15855 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15857 else
15859 if (INTVAL (op1) < -16*1024*1024
15860 || INTVAL (op1) >= 16*1024*1024)
15862 if (!x86_64_immediate_operand (op1, Pmode))
15863 op1 = force_reg (Pmode, op1);
15865 new_rtx
15866 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
15870 else
15872 rtx base = legitimize_pic_address (op0, reg);
15873 machine_mode mode = GET_MODE (base);
15874 new_rtx
15875 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
15877 if (CONST_INT_P (new_rtx))
15879 if (INTVAL (new_rtx) < -16*1024*1024
15880 || INTVAL (new_rtx) >= 16*1024*1024)
15882 if (!x86_64_immediate_operand (new_rtx, mode))
15883 new_rtx = force_reg (mode, new_rtx);
15885 new_rtx
15886 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
15888 else
15889 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
15891 else
15893 /* For %rip addressing, we have to use
15894 just disp32, with neither base nor index. */
15895 if (TARGET_64BIT
15896 && (GET_CODE (base) == SYMBOL_REF
15897 || GET_CODE (base) == LABEL_REF))
15898 base = force_reg (mode, base);
15899 if (GET_CODE (new_rtx) == PLUS
15900 && CONSTANT_P (XEXP (new_rtx, 1)))
15902 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
15903 new_rtx = XEXP (new_rtx, 1);
15905 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
15910 return new_rtx;
15913 /* Load the thread pointer. If TO_REG is true, force it into a register. */
15915 static rtx
15916 get_thread_pointer (machine_mode tp_mode, bool to_reg)
15918 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
15920 if (GET_MODE (tp) != tp_mode)
15922 gcc_assert (GET_MODE (tp) == SImode);
15923 gcc_assert (tp_mode == DImode);
15925 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
15928 if (to_reg)
15929 tp = copy_to_mode_reg (tp_mode, tp);
15931 return tp;
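/* For illustration: the UNSPEC_TP value materializes as the TLS segment
   base, e.g. %fs:0 on x86-64 GNU/Linux and %gs:0 on 32-bit GNU/Linux,
   which is the segment that DEFAULT_TLS_SEG_REG denotes. */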
15934 /* Construct the SYMBOL_REF for the tls_get_addr function. */
15936 static GTY(()) rtx ix86_tls_symbol;
15938 static rtx
15939 ix86_tls_get_addr (void)
15941 if (!ix86_tls_symbol)
15943 const char *sym
15944 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
15945 ? "___tls_get_addr" : "__tls_get_addr");
15947 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
15950 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
15952 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
15953 UNSPEC_PLTOFF);
15954 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
15955 gen_rtx_CONST (Pmode, unspec));
15958 return ix86_tls_symbol;
15961 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
15963 static GTY(()) rtx ix86_tls_module_base_symbol;
15966 ix86_tls_module_base (void)
15968 if (!ix86_tls_module_base_symbol)
15970 ix86_tls_module_base_symbol
15971 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
15973 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
15974 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
15977 return ix86_tls_module_base_symbol;
15980 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
15981 false if we expect this to be used for a memory address and true if
15982 we expect to load the address into a register. */
15984 static rtx
15985 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
15987 rtx dest, base, off;
15988 rtx pic = NULL_RTX, tp = NULL_RTX;
15989 machine_mode tp_mode = Pmode;
15990 int type;
15992 /* Fall back to the global dynamic model if the toolchain cannot support
15993 local dynamic. */
15994 if (TARGET_SUN_TLS && !TARGET_64BIT
15995 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
15996 && model == TLS_MODEL_LOCAL_DYNAMIC)
15997 model = TLS_MODEL_GLOBAL_DYNAMIC;
15999 switch (model)
16001 case TLS_MODEL_GLOBAL_DYNAMIC:
16002 dest = gen_reg_rtx (Pmode);
16004 if (!TARGET_64BIT)
16006 if (flag_pic && !TARGET_PECOFF)
16007 pic = pic_offset_table_rtx;
16008 else
16010 pic = gen_reg_rtx (Pmode);
16011 emit_insn (gen_set_got (pic));
16015 if (TARGET_GNU2_TLS)
16017 if (TARGET_64BIT)
16018 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16019 else
16020 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16022 tp = get_thread_pointer (Pmode, true);
16023 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16025 if (GET_MODE (x) != Pmode)
16026 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16028 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16030 else
16032 rtx caddr = ix86_tls_get_addr ();
16034 if (TARGET_64BIT)
16036 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16037 rtx_insn *insns;
16039 start_sequence ();
16040 emit_call_insn
16041 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16042 insns = get_insns ();
16043 end_sequence ();
16045 if (GET_MODE (x) != Pmode)
16046 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16048 RTL_CONST_CALL_P (insns) = 1;
16049 emit_libcall_block (insns, dest, rax, x);
16051 else
16052 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16054 break;
16056 case TLS_MODEL_LOCAL_DYNAMIC:
16057 base = gen_reg_rtx (Pmode);
16059 if (!TARGET_64BIT)
16061 if (flag_pic)
16062 pic = pic_offset_table_rtx;
16063 else
16065 pic = gen_reg_rtx (Pmode);
16066 emit_insn (gen_set_got (pic));
16070 if (TARGET_GNU2_TLS)
16072 rtx tmp = ix86_tls_module_base ();
16074 if (TARGET_64BIT)
16075 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16076 else
16077 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16079 tp = get_thread_pointer (Pmode, true);
16080 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16081 gen_rtx_MINUS (Pmode, tmp, tp));
16083 else
16085 rtx caddr = ix86_tls_get_addr ();
16087 if (TARGET_64BIT)
16089 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16090 rtx_insn *insns;
16091 rtx eqv;
16093 start_sequence ();
16094 emit_call_insn
16095 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16096 insns = get_insns ();
16097 end_sequence ();
16099 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16100 share the LD_BASE result with other LD model accesses. */
16101 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16102 UNSPEC_TLS_LD_BASE);
16104 RTL_CONST_CALL_P (insns) = 1;
16105 emit_libcall_block (insns, base, rax, eqv);
16107 else
16108 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16111 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16112 off = gen_rtx_CONST (Pmode, off);
16114 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16116 if (TARGET_GNU2_TLS)
16118 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16120 if (GET_MODE (x) != Pmode)
16121 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16123 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16125 break;
16127 case TLS_MODEL_INITIAL_EXEC:
16128 if (TARGET_64BIT)
16130 if (TARGET_SUN_TLS && !TARGET_X32)
16132 /* The Sun linker took the AMD64 TLS spec literally
16133 and can only handle %rax as the destination of the
16134 initial executable code sequence. */
16136 dest = gen_reg_rtx (DImode);
16137 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16138 return dest;
16141 /* Generate DImode references to avoid %fs:(%reg32)
16142 problems and the linker IE->LE relaxation bug. */
16143 tp_mode = DImode;
16144 pic = NULL;
16145 type = UNSPEC_GOTNTPOFF;
16147 else if (flag_pic)
16149 pic = pic_offset_table_rtx;
16150 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16152 else if (!TARGET_ANY_GNU_TLS)
16154 pic = gen_reg_rtx (Pmode);
16155 emit_insn (gen_set_got (pic));
16156 type = UNSPEC_GOTTPOFF;
16158 else
16160 pic = NULL;
16161 type = UNSPEC_INDNTPOFF;
16164 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16165 off = gen_rtx_CONST (tp_mode, off);
16166 if (pic)
16167 off = gen_rtx_PLUS (tp_mode, pic, off);
16168 off = gen_const_mem (tp_mode, off);
16169 set_mem_alias_set (off, ix86_GOT_alias_set ());
16171 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16173 base = get_thread_pointer (tp_mode,
16174 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16175 off = force_reg (tp_mode, off);
16176 dest = gen_rtx_PLUS (tp_mode, base, off);
16177 if (tp_mode != Pmode)
16178 dest = convert_to_mode (Pmode, dest, 1);
16180 else
16182 base = get_thread_pointer (Pmode, true);
16183 dest = gen_reg_rtx (Pmode);
16184 emit_insn (ix86_gen_sub3 (dest, base, off));
16186 break;
16188 case TLS_MODEL_LOCAL_EXEC:
16189 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16190 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16191 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16192 off = gen_rtx_CONST (Pmode, off);
16194 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16196 base = get_thread_pointer (Pmode,
16197 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16198 return gen_rtx_PLUS (Pmode, base, off);
16200 else
16202 base = get_thread_pointer (Pmode, true);
16203 dest = gen_reg_rtx (Pmode);
16204 emit_insn (ix86_gen_sub3 (dest, base, off));
16206 break;
16208 default:
16209 gcc_unreachable ();
16212 return dest;
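/* For illustration, simplified x86-64 GNU TLS sequences produced from the
   models handled above (see the "ELF Handling For Thread-Local Storage"
   document for the exact forms):

     global dynamic:  leaq  x@tlsgd(%rip), %rdi
                      call  __tls_get_addr@PLT

     initial exec:    movq  x@gottpoff(%rip), %rax
                      movq  %fs:(%rax), %rdx

     local exec:      movq  %fs:0, %rax
                      movq  x@tpoff(%rax), %rdx  */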
16215 /* Return true if OP refers to a TLS address. */
16216 bool
16217 ix86_tls_address_pattern_p (rtx op)
16219 subrtx_var_iterator::array_type array;
16220 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16222 rtx op = *iter;
16223 if (MEM_P (op))
16225 rtx *x = &XEXP (op, 0);
16226 while (GET_CODE (*x) == PLUS)
16228 int i;
16229 for (i = 0; i < 2; i++)
16231 rtx u = XEXP (*x, i);
16232 if (GET_CODE (u) == ZERO_EXTEND)
16233 u = XEXP (u, 0);
16234 if (GET_CODE (u) == UNSPEC
16235 && XINT (u, 1) == UNSPEC_TP)
16236 return true;
16238 x = &XEXP (*x, 0);
16241 iter.skip_subrtxes ();
16245 return false;
16248 /* Rewrite *LOC so that it refers to the default TLS address space. */
16249 void
16250 ix86_rewrite_tls_address_1 (rtx *loc)
16252 subrtx_ptr_iterator::array_type array;
16253 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16255 rtx *loc = *iter;
16256 if (MEM_P (*loc))
16258 rtx addr = XEXP (*loc, 0);
16259 rtx *x = &addr;
16260 while (GET_CODE (*x) == PLUS)
16262 int i;
16263 for (i = 0; i < 2; i++)
16265 rtx u = XEXP (*x, i);
16266 if (GET_CODE (u) == ZERO_EXTEND)
16267 u = XEXP (u, 0);
16268 if (GET_CODE (u) == UNSPEC
16269 && XINT (u, 1) == UNSPEC_TP)
16271 addr_space_t as = DEFAULT_TLS_SEG_REG;
16273 *x = XEXP (*x, 1 - i);
16275 *loc = replace_equiv_address_nv (*loc, addr, true);
16276 set_mem_addr_space (*loc, as);
16277 return;
16280 x = &XEXP (*x, 0);
16283 iter.skip_subrtxes ();
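/* For illustration: a MEM whose address is
     (plus (reg R) (unspec [(const_int 0)] UNSPEC_TP))
   is rewritten so that the UNSPEC_TP term is dropped and the MEM is put
   into the TLS segment address space, i.e. the explicit thread-pointer
   addition becomes an %fs:/%gs: segment override on the remaining
   address. */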
16288 /* Rewrite an instruction pattern involving a TLS address
16289 so that it refers to the default TLS address space. */
16291 ix86_rewrite_tls_address (rtx pattern)
16293 pattern = copy_insn (pattern);
16294 ix86_rewrite_tls_address_1 (&pattern);
16295 return pattern;
16298 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16299 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16300 unique refptr-DECL symbol corresponding to symbol DECL. */
16302 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16304 static inline hashval_t hash (tree_map *m) { return m->hash; }
16305 static inline bool
16306 equal (tree_map *a, tree_map *b)
16308 return a->base.from == b->base.from;
16311 static int
16312 keep_cache_entry (tree_map *&m)
16314 return ggc_marked_p (m->base.from);
16318 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16320 static tree
16321 get_dllimport_decl (tree decl, bool beimport)
16323 struct tree_map *h, in;
16324 const char *name;
16325 const char *prefix;
16326 size_t namelen, prefixlen;
16327 char *imp_name;
16328 tree to;
16329 rtx rtl;
16331 if (!dllimport_map)
16332 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16334 in.hash = htab_hash_pointer (decl);
16335 in.base.from = decl;
16336 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16337 h = *loc;
16338 if (h)
16339 return h->to;
16341 *loc = h = ggc_alloc<tree_map> ();
16342 h->hash = in.hash;
16343 h->base.from = decl;
16344 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16345 VAR_DECL, NULL, ptr_type_node);
16346 DECL_ARTIFICIAL (to) = 1;
16347 DECL_IGNORED_P (to) = 1;
16348 DECL_EXTERNAL (to) = 1;
16349 TREE_READONLY (to) = 1;
16351 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16352 name = targetm.strip_name_encoding (name);
16353 if (beimport)
16354 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16355 ? "*__imp_" : "*__imp__";
16356 else
16357 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16358 namelen = strlen (name);
16359 prefixlen = strlen (prefix);
16360 imp_name = (char *) alloca (namelen + prefixlen + 1);
16361 memcpy (imp_name, prefix, prefixlen);
16362 memcpy (imp_name + prefixlen, name, namelen + 1);
16364 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16365 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16366 SET_SYMBOL_REF_DECL (rtl, to);
16367 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16368 if (!beimport)
16370 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16371 #ifdef SUB_TARGET_RECORD_STUB
16372 SUB_TARGET_RECORD_STUB (name);
16373 #endif
16376 rtl = gen_const_mem (Pmode, rtl);
16377 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16379 SET_DECL_RTL (to, rtl);
16380 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16382 return to;
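/* For illustration: for a dllimported function or variable foo this
   yields a reference to the import table slot, e.g. "__imp__foo" on
   32-bit targets whose user labels carry a leading underscore and
   "__imp_foo" where user_label_prefix is empty; in the !beimport case
   the generated name has the form ".refptr.foo" instead. */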
16385 /* Expand SYMBOL into its corresponding far-address symbol.
16386 WANT_REG is true if we require the result be a register. */
16388 static rtx
16389 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16391 tree imp_decl;
16392 rtx x;
16394 gcc_assert (SYMBOL_REF_DECL (symbol));
16395 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16397 x = DECL_RTL (imp_decl);
16398 if (want_reg)
16399 x = force_reg (Pmode, x);
16400 return x;
16403 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16404 true if we require the result be a register. */
16406 static rtx
16407 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16409 tree imp_decl;
16410 rtx x;
16412 gcc_assert (SYMBOL_REF_DECL (symbol));
16413 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16415 x = DECL_RTL (imp_decl);
16416 if (want_reg)
16417 x = force_reg (Pmode, x);
16418 return x;
16421 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16422 is true if we require the result be a register. */
16424 static rtx
16425 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16427 if (!TARGET_PECOFF)
16428 return NULL_RTX;
16430 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16432 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16433 return legitimize_dllimport_symbol (addr, inreg);
16434 if (GET_CODE (addr) == CONST
16435 && GET_CODE (XEXP (addr, 0)) == PLUS
16436 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16437 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16439 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16440 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16444 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16445 return NULL_RTX;
16446 if (GET_CODE (addr) == SYMBOL_REF
16447 && !is_imported_p (addr)
16448 && SYMBOL_REF_EXTERNAL_P (addr)
16449 && SYMBOL_REF_DECL (addr))
16450 return legitimize_pe_coff_extern_decl (addr, inreg);
16452 if (GET_CODE (addr) == CONST
16453 && GET_CODE (XEXP (addr, 0)) == PLUS
16454 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16455 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16456 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16457 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16459 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16460 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16462 return NULL_RTX;
16465 /* Try machine-dependent ways of modifying an illegitimate address
16466 to be legitimate. If we find one, return the new, valid address.
16467 This macro is used in only one place: `memory_address' in explow.c.
16469 OLDX is the address as it was before break_out_memory_refs was called.
16470 In some cases it is useful to look at this to decide what needs to be done.
16472 It is always safe for this macro to do nothing. It exists to recognize
16473 opportunities to optimize the output.
16475 For the 80386, we handle X+REG by loading X into a register R and
16476 using R+REG. R will go in a general reg and indexing will be used.
16477 However, if REG is a broken-out memory address or multiplication,
16478 nothing needs to be done because REG can certainly go in a general reg.
16480 When -fpic is used, special handling is needed for symbolic references.
16481 See comments by legitimize_pic_address in i386.c for details. */
16483 static rtx
16484 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16486 bool changed = false;
16487 unsigned log;
16489 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16490 if (log)
16491 return legitimize_tls_address (x, (enum tls_model) log, false);
16492 if (GET_CODE (x) == CONST
16493 && GET_CODE (XEXP (x, 0)) == PLUS
16494 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16495 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16497 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16498 (enum tls_model) log, false);
16499 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16502 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16504 rtx tmp = legitimize_pe_coff_symbol (x, true);
16505 if (tmp)
16506 return tmp;
16509 if (flag_pic && SYMBOLIC_CONST (x))
16510 return legitimize_pic_address (x, 0);
16512 #if TARGET_MACHO
16513 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16514 return machopic_indirect_data_reference (x, 0);
16515 #endif
16517 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
16518 if (GET_CODE (x) == ASHIFT
16519 && CONST_INT_P (XEXP (x, 1))
16520 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16522 changed = true;
16523 log = INTVAL (XEXP (x, 1));
16524 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16525 GEN_INT (1 << log));
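/* For example, (ashift (reg) (const_int 2)) becomes
   (mult (reg) (const_int 4)), matching the index*scale form that
   ix86_decompose_address accepts. */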
16528 if (GET_CODE (x) == PLUS)
16530 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16532 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16533 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16534 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16536 changed = true;
16537 log = INTVAL (XEXP (XEXP (x, 0), 1));
16538 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16539 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16540 GEN_INT (1 << log));
16543 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16544 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16545 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16547 changed = true;
16548 log = INTVAL (XEXP (XEXP (x, 1), 1));
16549 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16550 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16551 GEN_INT (1 << log));
16554 /* Put multiply first if it isn't already. */
16555 if (GET_CODE (XEXP (x, 1)) == MULT)
16557 std::swap (XEXP (x, 0), XEXP (x, 1));
16558 changed = true;
16561 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16562 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16563 created by virtual register instantiation, register elimination, and
16564 similar optimizations. */
16565 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16567 changed = true;
16568 x = gen_rtx_PLUS (Pmode,
16569 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16570 XEXP (XEXP (x, 1), 0)),
16571 XEXP (XEXP (x, 1), 1));
16574 /* Canonicalize
16575 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16576 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16577 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16578 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16579 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16580 && CONSTANT_P (XEXP (x, 1)))
16582 rtx constant;
16583 rtx other = NULL_RTX;
16585 if (CONST_INT_P (XEXP (x, 1)))
16587 constant = XEXP (x, 1);
16588 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16590 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16592 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16593 other = XEXP (x, 1);
16595 else
16596 constant = 0;
16598 if (constant)
16600 changed = true;
16601 x = gen_rtx_PLUS (Pmode,
16602 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16603 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16604 plus_constant (Pmode, other,
16605 INTVAL (constant)));
16609 if (changed && ix86_legitimate_address_p (mode, x, false))
16610 return x;
16612 if (GET_CODE (XEXP (x, 0)) == MULT)
16614 changed = true;
16615 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16618 if (GET_CODE (XEXP (x, 1)) == MULT)
16620 changed = true;
16621 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16624 if (changed
16625 && REG_P (XEXP (x, 1))
16626 && REG_P (XEXP (x, 0)))
16627 return x;
16629 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16631 changed = true;
16632 x = legitimize_pic_address (x, 0);
16635 if (changed && ix86_legitimate_address_p (mode, x, false))
16636 return x;
16638 if (REG_P (XEXP (x, 0)))
16640 rtx temp = gen_reg_rtx (Pmode);
16641 rtx val = force_operand (XEXP (x, 1), temp);
16642 if (val != temp)
16644 val = convert_to_mode (Pmode, val, 1);
16645 emit_move_insn (temp, val);
16648 XEXP (x, 1) = temp;
16649 return x;
16652 else if (REG_P (XEXP (x, 1)))
16654 rtx temp = gen_reg_rtx (Pmode);
16655 rtx val = force_operand (XEXP (x, 0), temp);
16656 if (val != temp)
16658 val = convert_to_mode (Pmode, val, 1);
16659 emit_move_insn (temp, val);
16662 XEXP (x, 0) = temp;
16663 return x;
16667 return x;
16670 /* Print an integer constant expression in assembler syntax. Addition
16671 and subtraction are the only arithmetic that may appear in these
16672 expressions. FILE is the stdio stream to write to, X is the rtx, and
16673 CODE is the operand print code from the output string. */
16675 static void
16676 output_pic_addr_const (FILE *file, rtx x, int code)
16678 char buf[256];
16680 switch (GET_CODE (x))
16682 case PC:
16683 gcc_assert (flag_pic);
16684 putc ('.', file);
16685 break;
16687 case SYMBOL_REF:
16688 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16689 output_addr_const (file, x);
16690 else
16692 const char *name = XSTR (x, 0);
16694 /* Mark the decl as referenced so that cgraph will
16695 output the function. */
16696 if (SYMBOL_REF_DECL (x))
16697 mark_decl_referenced (SYMBOL_REF_DECL (x));
16699 #if TARGET_MACHO
16700 if (MACHOPIC_INDIRECT
16701 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16702 name = machopic_indirection_name (x, /*stub_p=*/true);
16703 #endif
16704 assemble_name (file, name);
16706 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16707 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16708 fputs ("@PLT", file);
16709 break;
16711 case LABEL_REF:
16712 x = XEXP (x, 0);
16713 /* FALLTHRU */
16714 case CODE_LABEL:
16715 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16716 assemble_name (asm_out_file, buf);
16717 break;
16719 case CONST_INT:
16720 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16721 break;
16723 case CONST:
16724 /* This used to output parentheses around the expression,
16725 but that does not work on the 386 (either ATT or BSD assembler). */
16726 output_pic_addr_const (file, XEXP (x, 0), code);
16727 break;
16729 case CONST_DOUBLE:
16730 /* We can't handle floating point constants;
16731 TARGET_PRINT_OPERAND must handle them. */
16732 output_operand_lossage ("floating constant misused");
16733 break;
16735 case PLUS:
16736 /* Some assemblers need integer constants to appear first. */
16737 if (CONST_INT_P (XEXP (x, 0)))
16739 output_pic_addr_const (file, XEXP (x, 0), code);
16740 putc ('+', file);
16741 output_pic_addr_const (file, XEXP (x, 1), code);
16743 else
16745 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16746 output_pic_addr_const (file, XEXP (x, 1), code);
16747 putc ('+', file);
16748 output_pic_addr_const (file, XEXP (x, 0), code);
16750 break;
16752 case MINUS:
16753 if (!TARGET_MACHO)
16754 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16755 output_pic_addr_const (file, XEXP (x, 0), code);
16756 putc ('-', file);
16757 output_pic_addr_const (file, XEXP (x, 1), code);
16758 if (!TARGET_MACHO)
16759 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16760 break;
16762 case UNSPEC:
16763 gcc_assert (XVECLEN (x, 0) == 1);
16764 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16765 switch (XINT (x, 1))
16767 case UNSPEC_GOT:
16768 fputs ("@GOT", file);
16769 break;
16770 case UNSPEC_GOTOFF:
16771 fputs ("@GOTOFF", file);
16772 break;
16773 case UNSPEC_PLTOFF:
16774 fputs ("@PLTOFF", file);
16775 break;
16776 case UNSPEC_PCREL:
16777 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16778 "(%rip)" : "[rip]", file);
16779 break;
16780 case UNSPEC_GOTPCREL:
16781 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16782 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16783 break;
16784 case UNSPEC_GOTTPOFF:
16785 /* FIXME: This might be @TPOFF in Sun ld too. */
16786 fputs ("@gottpoff", file);
16787 break;
16788 case UNSPEC_TPOFF:
16789 fputs ("@tpoff", file);
16790 break;
16791 case UNSPEC_NTPOFF:
16792 if (TARGET_64BIT)
16793 fputs ("@tpoff", file);
16794 else
16795 fputs ("@ntpoff", file);
16796 break;
16797 case UNSPEC_DTPOFF:
16798 fputs ("@dtpoff", file);
16799 break;
16800 case UNSPEC_GOTNTPOFF:
16801 if (TARGET_64BIT)
16802 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16803 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16804 else
16805 fputs ("@gotntpoff", file);
16806 break;
16807 case UNSPEC_INDNTPOFF:
16808 fputs ("@indntpoff", file);
16809 break;
16810 #if TARGET_MACHO
16811 case UNSPEC_MACHOPIC_OFFSET:
16812 putc ('-', file);
16813 machopic_output_function_base_name (file);
16814 break;
16815 #endif
16816 default:
16817 output_operand_lossage ("invalid UNSPEC as operand");
16818 break;
16820 break;
16822 default:
16823 output_operand_lossage ("invalid expression as operand");
16827 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16828 We need to emit DTP-relative relocations. */
16830 static void ATTRIBUTE_UNUSED
16831 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16833 fputs (ASM_LONG, file);
16834 output_addr_const (file, x);
16835 fputs ("@dtpoff", file);
16836 switch (size)
16838 case 4:
16839 break;
16840 case 8:
16841 fputs (", 0", file);
16842 break;
16843 default:
16844 gcc_unreachable ();
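/* For illustration: for a thread-local variable x this emits
   ".long x@dtpoff" when SIZE is 4 and ".long x@dtpoff, 0" when SIZE is 8
   (the upper 32 bits are emitted as zero). */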
16848 /* Return true if X is a representation of the PIC register. This copes
16849 with calls from ix86_find_base_term, where the register might have
16850 been replaced by a cselib value. */
16852 static bool
16853 ix86_pic_register_p (rtx x)
16855 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16856 return (pic_offset_table_rtx
16857 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16858 else if (!REG_P (x))
16859 return false;
16860 else if (pic_offset_table_rtx)
16862 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16863 return true;
16864 if (HARD_REGISTER_P (x)
16865 && !HARD_REGISTER_P (pic_offset_table_rtx)
16866 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16867 return true;
16868 return false;
16870 else
16871 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16874 /* Helper function for ix86_delegitimize_address.
16875 Attempt to delegitimize TLS local-exec accesses. */
16877 static rtx
16878 ix86_delegitimize_tls_address (rtx orig_x)
16880 rtx x = orig_x, unspec;
16881 struct ix86_address addr;
16883 if (!TARGET_TLS_DIRECT_SEG_REFS)
16884 return orig_x;
16885 if (MEM_P (x))
16886 x = XEXP (x, 0);
16887 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
16888 return orig_x;
16889 if (ix86_decompose_address (x, &addr) == 0
16890 || addr.seg != DEFAULT_TLS_SEG_REG
16891 || addr.disp == NULL_RTX
16892 || GET_CODE (addr.disp) != CONST)
16893 return orig_x;
16894 unspec = XEXP (addr.disp, 0);
16895 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
16896 unspec = XEXP (unspec, 0);
16897 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
16898 return orig_x;
16899 x = XVECEXP (unspec, 0, 0);
16900 gcc_assert (GET_CODE (x) == SYMBOL_REF);
16901 if (unspec != XEXP (addr.disp, 0))
16902 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
16903 if (addr.index)
16905 rtx idx = addr.index;
16906 if (addr.scale != 1)
16907 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
16908 x = gen_rtx_PLUS (Pmode, idx, x);
16910 if (addr.base)
16911 x = gen_rtx_PLUS (Pmode, addr.base, x);
16912 if (MEM_P (orig_x))
16913 x = replace_equiv_address_nv (orig_x, x);
16914 return x;
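/* Illustrative, out-of-tree sketch (not GCC API): it mirrors the order in
   which ix86_delegitimize_tls_address above reassembles an address from its
   decomposed parts, i.e. (symbol + constant displacement) + index * scale
   + base, using plain integers in place of RTL.  The struct and helper are
   assumptions made for this sketch only.  */
struct addr_parts_sketch { long base; long index; int scale; long sym; long disp; };

static long
reassemble_address_sketch (const struct addr_parts_sketch *p)
{
  long x = p->sym + p->disp;	/* symbol plus constant offset first */
  if (p->index)
    x += p->index * p->scale;	/* then the scaled index */
  if (p->base)
    x += p->base;		/* and finally the base register */
  return x;
}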
16917 /* In the name of slightly smaller debug output, and to cater to
16918 general assembler lossage, recognize PIC+GOTOFF and turn it back
16919 into a direct symbol reference.
16921 On Darwin, this is necessary to avoid a crash, because Darwin
16922 has a different PIC label for each routine but the DWARF debugging
16923 information is not associated with any particular routine, so it's
16924 necessary to remove references to the PIC label from RTL stored by
16925 the DWARF output code.
16927 This helper is used in the normal ix86_delegitimize_address
16928 entrypoint (e.g. used in the target delegitimization hook) and
16929 in ix86_find_base_term. As a compile-time memory optimization, we
16930 avoid allocating rtxes that would not change the outcome for the
16931 callers (find_base_value and find_base_term). */
16933 static inline rtx
16934 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
16936 rtx orig_x = delegitimize_mem_from_attrs (x);
16937 /* addend is NULL or some rtx if x is something+GOTOFF where
16938 something doesn't include the PIC register. */
16939 rtx addend = NULL_RTX;
16940 /* reg_addend is NULL or a multiple of some register. */
16941 rtx reg_addend = NULL_RTX;
16942 /* const_addend is NULL or a const_int. */
16943 rtx const_addend = NULL_RTX;
16944 /* This is the result, or NULL. */
16945 rtx result = NULL_RTX;
16947 x = orig_x;
16949 if (MEM_P (x))
16950 x = XEXP (x, 0);
16952 if (TARGET_64BIT)
16954 if (GET_CODE (x) == CONST
16955 && GET_CODE (XEXP (x, 0)) == PLUS
16956 && GET_MODE (XEXP (x, 0)) == Pmode
16957 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16958 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
16959 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
16961 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
16962 base. A CONST can't be arg_pointer_rtx based. */
16963 if (base_term_p && MEM_P (orig_x))
16964 return orig_x;
16965 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
16966 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
16967 if (MEM_P (orig_x))
16968 x = replace_equiv_address_nv (orig_x, x);
16969 return x;
16972 if (GET_CODE (x) == CONST
16973 && GET_CODE (XEXP (x, 0)) == UNSPEC
16974 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
16975 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
16976 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
16978 x = XVECEXP (XEXP (x, 0), 0, 0);
16979 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
16981 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
16982 if (x == NULL_RTX)
16983 return orig_x;
16985 return x;
16988 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
16989 return ix86_delegitimize_tls_address (orig_x);
16991 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
16992 and -mcmodel=medium -fpic. */
16995 if (GET_CODE (x) != PLUS
16996 || GET_CODE (XEXP (x, 1)) != CONST)
16997 return ix86_delegitimize_tls_address (orig_x);
16999 if (ix86_pic_register_p (XEXP (x, 0)))
17000 /* %ebx + GOT/GOTOFF */
17002 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17004 /* %ebx + %reg * scale + GOT/GOTOFF */
17005 reg_addend = XEXP (x, 0);
17006 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17007 reg_addend = XEXP (reg_addend, 1);
17008 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17009 reg_addend = XEXP (reg_addend, 0);
17010 else
17012 reg_addend = NULL_RTX;
17013 addend = XEXP (x, 0);
17016 else
17017 addend = XEXP (x, 0);
17019 x = XEXP (XEXP (x, 1), 0);
17020 if (GET_CODE (x) == PLUS
17021 && CONST_INT_P (XEXP (x, 1)))
17023 const_addend = XEXP (x, 1);
17024 x = XEXP (x, 0);
17027 if (GET_CODE (x) == UNSPEC
17028 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17029 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17030 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17031 && !MEM_P (orig_x) && !addend)))
17032 result = XVECEXP (x, 0, 0);
17034 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17035 && !MEM_P (orig_x))
17036 result = XVECEXP (x, 0, 0);
17038 if (! result)
17039 return ix86_delegitimize_tls_address (orig_x);
17041 /* For (PLUS something CONST_INT) both find_base_{value,term} just
17042 recurse on the first operand. */
17043 if (const_addend && !base_term_p)
17044 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17045 if (reg_addend)
17046 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17047 if (addend)
17049 /* If the rest of original X doesn't involve the PIC register, add
17050 addend and subtract pic_offset_table_rtx. This can happen e.g.
17051 for code like:
17052 leal (%ebx, %ecx, 4), %ecx
17054 movl foo@GOTOFF(%ecx), %edx
17055 in which case we return (%ecx - %ebx) + foo
17056 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17057 and reload has completed. Don't do the latter for debug,
17058 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17059 if (pic_offset_table_rtx
17060 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17061 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17062 pic_offset_table_rtx),
17063 result);
17064 else if (base_term_p
17065 && pic_offset_table_rtx
17066 && !TARGET_MACHO
17067 && !TARGET_VXWORKS_RTP)
17069 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17070 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17071 result = gen_rtx_PLUS (Pmode, tmp, result);
17073 else
17074 return orig_x;
17076 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17078 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17079 if (result == NULL_RTX)
17080 return orig_x;
17082 return result;
17085 /* The normal instantiation of the above template. */
17087 static rtx
17088 ix86_delegitimize_address (rtx x)
17090 return ix86_delegitimize_address_1 (x, false);
17093 /* If X is a machine specific address (i.e. a symbol or label being
17094 referenced as a displacement from the GOT implemented using an
17095 UNSPEC), then return the base term. Otherwise return X. */
17098 ix86_find_base_term (rtx x)
17100 rtx term;
17102 if (TARGET_64BIT)
17104 if (GET_CODE (x) != CONST)
17105 return x;
17106 term = XEXP (x, 0);
17107 if (GET_CODE (term) == PLUS
17108 && CONST_INT_P (XEXP (term, 1)))
17109 term = XEXP (term, 0);
17110 if (GET_CODE (term) != UNSPEC
17111 || (XINT (term, 1) != UNSPEC_GOTPCREL
17112 && XINT (term, 1) != UNSPEC_PCREL))
17113 return x;
17115 return XVECEXP (term, 0, 0);
17118 return ix86_delegitimize_address_1 (x, true);
17121 /* Return true if X shouldn't be emitted into the debug info.
17122 Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_
17123 symbol easily into the .debug_info section, so we do not
17124 delegitimize it, but instead assemble it as @gotoff.
17125 Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17126 assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */
17128 static bool
17129 ix86_const_not_ok_for_debug_p (rtx x)
17131 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17132 return true;
17134 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17135 return true;
17137 return false;
17140 static void
17141 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17142 bool fp, FILE *file)
17144 const char *suffix;
17146 if (mode == CCFPmode)
17148 code = ix86_fp_compare_code_to_integer (code);
17149 mode = CCmode;
17151 if (reverse)
17152 code = reverse_condition (code);
17154 switch (code)
17156 case EQ:
17157 gcc_assert (mode != CCGZmode);
17158 switch (mode)
17160 case E_CCAmode:
17161 suffix = "a";
17162 break;
17163 case E_CCCmode:
17164 suffix = "c";
17165 break;
17166 case E_CCOmode:
17167 suffix = "o";
17168 break;
17169 case E_CCPmode:
17170 suffix = "p";
17171 break;
17172 case E_CCSmode:
17173 suffix = "s";
17174 break;
17175 default:
17176 suffix = "e";
17177 break;
17179 break;
17180 case NE:
17181 gcc_assert (mode != CCGZmode);
17182 switch (mode)
17184 case E_CCAmode:
17185 suffix = "na";
17186 break;
17187 case E_CCCmode:
17188 suffix = "nc";
17189 break;
17190 case E_CCOmode:
17191 suffix = "no";
17192 break;
17193 case E_CCPmode:
17194 suffix = "np";
17195 break;
17196 case E_CCSmode:
17197 suffix = "ns";
17198 break;
17199 default:
17200 suffix = "ne";
17201 break;
17203 break;
17204 case GT:
17205 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17206 suffix = "g";
17207 break;
17208 case GTU:
17209 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17210 Those same assemblers have the same but opposite lossage on cmov. */
17211 if (mode == CCmode)
17212 suffix = fp ? "nbe" : "a";
17213 else
17214 gcc_unreachable ();
17215 break;
17216 case LT:
17217 switch (mode)
17219 case E_CCNOmode:
17220 case E_CCGOCmode:
17221 suffix = "s";
17222 break;
17224 case E_CCmode:
17225 case E_CCGCmode:
17226 case E_CCGZmode:
17227 suffix = "l";
17228 break;
17230 default:
17231 gcc_unreachable ();
17233 break;
17234 case LTU:
17235 if (mode == CCmode || mode == CCGZmode)
17236 suffix = "b";
17237 else if (mode == CCCmode)
17238 suffix = fp ? "b" : "c";
17239 else
17240 gcc_unreachable ();
17241 break;
17242 case GE:
17243 switch (mode)
17245 case E_CCNOmode:
17246 case E_CCGOCmode:
17247 suffix = "ns";
17248 break;
17250 case E_CCmode:
17251 case E_CCGCmode:
17252 case E_CCGZmode:
17253 suffix = "ge";
17254 break;
17256 default:
17257 gcc_unreachable ();
17259 break;
17260 case GEU:
17261 if (mode == CCmode || mode == CCGZmode)
17262 suffix = "nb";
17263 else if (mode == CCCmode)
17264 suffix = fp ? "nb" : "nc";
17265 else
17266 gcc_unreachable ();
17267 break;
17268 case LE:
17269 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17270 suffix = "le";
17271 break;
17272 case LEU:
17273 if (mode == CCmode)
17274 suffix = "be";
17275 else
17276 gcc_unreachable ();
17277 break;
17278 case UNORDERED:
17279 suffix = fp ? "u" : "p";
17280 break;
17281 case ORDERED:
17282 suffix = fp ? "nu" : "np";
17283 break;
17284 default:
17285 gcc_unreachable ();
17287 fputs (suffix, file);
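/* Illustrative sketch (not GCC API): the core signed vs. unsigned suffix
   choice that put_condition_code above makes for plain CCmode comparisons.
   The specialized CC modes (CCA, CCC, CCGC, CCGOC, CCGZ, ...) and the fp
   operand handling are deliberately omitted; the enum and helper are
   assumptions made for this sketch only.  */
enum cmp_sketch { CMP_EQ, CMP_NE, CMP_LT, CMP_LTU, CMP_GE, CMP_GEU,
		  CMP_LE, CMP_LEU, CMP_GT, CMP_GTU };

static const char *
cc_suffix_sketch (enum cmp_sketch code)
{
  switch (code)
    {
    case CMP_EQ:  return "e";	/* equal */
    case CMP_NE:  return "ne";	/* not equal */
    case CMP_LT:  return "l";	/* signed less */
    case CMP_LTU: return "b";	/* unsigned less (below) */
    case CMP_GE:  return "ge";	/* signed greater or equal */
    case CMP_GEU: return "nb";	/* unsigned greater or equal (not below) */
    case CMP_LE:  return "le";	/* signed less or equal */
    case CMP_LEU: return "be";	/* unsigned less or equal (below or equal) */
    case CMP_GT:  return "g";	/* signed greater */
    case CMP_GTU: return "a";	/* unsigned greater (above) */
    default:      return "?";
    }
}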
17290 /* Print the name of register X to FILE based on its machine mode and number.
17291 If CODE is 'w', pretend the mode is HImode.
17292 If CODE is 'b', pretend the mode is QImode.
17293 If CODE is 'k', pretend the mode is SImode.
17294 If CODE is 'q', pretend the mode is DImode.
17295 If CODE is 'x', pretend the mode is V4SFmode.
17296 If CODE is 't', pretend the mode is V8SFmode.
17297 If CODE is 'g', pretend the mode is V16SFmode.
17298 If CODE is 'h', pretend the reg is the 'high' byte register.
17299 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
17300 If CODE is 'd', duplicate the operand for an AVX instruction.
17303 void
17304 print_reg (rtx x, int code, FILE *file)
17306 const char *reg;
17307 int msize;
17308 unsigned int regno;
17309 bool duplicated;
17311 if (ASSEMBLER_DIALECT == ASM_ATT)
17312 putc ('%', file);
17314 if (x == pc_rtx)
17316 gcc_assert (TARGET_64BIT);
17317 fputs ("rip", file);
17318 return;
17321 if (code == 'y' && STACK_TOP_P (x))
17323 fputs ("st(0)", file);
17324 return;
17327 if (code == 'w')
17328 msize = 2;
17329 else if (code == 'b')
17330 msize = 1;
17331 else if (code == 'k')
17332 msize = 4;
17333 else if (code == 'q')
17334 msize = 8;
17335 else if (code == 'h')
17336 msize = 0;
17337 else if (code == 'x')
17338 msize = 16;
17339 else if (code == 't')
17340 msize = 32;
17341 else if (code == 'g')
17342 msize = 64;
17343 else
17344 msize = GET_MODE_SIZE (GET_MODE (x));
17346 regno = REGNO (x);
17348 if (regno == ARG_POINTER_REGNUM
17349 || regno == FRAME_POINTER_REGNUM
17350 || regno == FPSR_REG
17351 || regno == FPCR_REG)
17353 output_operand_lossage
17354 ("invalid use of register '%s'", reg_names[regno]);
17355 return;
17357 else if (regno == FLAGS_REG)
17359 output_operand_lossage ("invalid use of asm flag output");
17360 return;
17363 duplicated = code == 'd' && TARGET_AVX;
17365 switch (msize)
17367 case 16:
17368 case 12:
17369 case 8:
17370 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17371 warning (0, "unsupported size for integer register");
17372 /* FALLTHRU */
17373 case 4:
17374 if (LEGACY_INT_REGNO_P (regno))
17375 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17376 /* FALLTHRU */
17377 case 2:
17378 normal:
17379 reg = hi_reg_name[regno];
17380 break;
17381 case 1:
17382 if (regno >= ARRAY_SIZE (qi_reg_name))
17383 goto normal;
17384 if (!ANY_QI_REGNO_P (regno))
17385 error ("unsupported size for integer register");
17386 reg = qi_reg_name[regno];
17387 break;
17388 case 0:
17389 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17390 goto normal;
17391 reg = qi_high_reg_name[regno];
17392 break;
17393 case 32:
17394 case 64:
17395 if (SSE_REGNO_P (regno))
17397 gcc_assert (!duplicated);
17398 putc (msize == 32 ? 'y' : 'z', file);
17399 reg = hi_reg_name[regno] + 1;
17400 break;
17402 goto normal;
17403 default:
17404 gcc_unreachable ();
17407 fputs (reg, file);
17409 /* Irritatingly, AMD extended registers use a
17410 different naming convention: "r%d[bwd]". */
17411 if (REX_INT_REGNO_P (regno))
17413 gcc_assert (TARGET_64BIT);
17414 switch (msize)
17416 case 0:
17417 error ("extended registers have no high halves");
17418 break;
17419 case 1:
17420 putc ('b', file);
17421 break;
17422 case 2:
17423 putc ('w', file);
17424 break;
17425 case 4:
17426 putc ('d', file);
17427 break;
17428 case 8:
17429 /* no suffix */
17430 break;
17431 default:
17432 error ("unsupported operand size for extended register");
17433 break;
17435 return;
17438 if (duplicated)
17440 if (ASSEMBLER_DIALECT == ASM_ATT)
17441 fprintf (file, ", %%%s", reg);
17442 else
17443 fprintf (file, ", %s", reg);
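/* Illustrative sketch (not GCC API): how one legacy integer register is
   spelled at each access size, which is what print_reg above derives from
   hi_reg_name/qi_reg_name/qi_high_reg_name plus the 'e'/'r' prefix logic.
   The helper name and the fallback choice are assumptions of this sketch.  */
static const char *
ax_name_for_size_sketch (int size_in_bytes)
{
  switch (size_in_bytes)
    {
    case 1: return "al";	/* QImode ('b') */
    case 2: return "ax";	/* HImode ('w') */
    case 4: return "eax";	/* SImode ('k') */
    case 8: return "rax";	/* DImode ('q'), 64-bit only */
    default: return "ax";	/* fall back to the word-sized name */
    }
}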
17447 /* Meaning of CODE:
17448 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17449 C -- print opcode suffix for set/cmov insn.
17450 c -- like C, but print reversed condition
17451 F,f -- likewise, but for floating-point.
17452 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17453 otherwise nothing
17454 R -- print embedded rounding and sae.
17455 r -- print only sae.
17456 z -- print the opcode suffix for the size of the current operand.
17457 Z -- likewise, with special suffixes for x87 instructions.
17458 * -- print a star (in certain assembler syntax)
17459 A -- print an absolute memory reference.
17460 E -- print address with DImode register names if TARGET_64BIT.
17461 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17462 s -- print a shift double count, followed by the assembler's argument
17463 delimiter.
17464 b -- print the QImode name of the register for the indicated operand.
17465 %b0 would print %al if operands[0] is reg 0.
17466 w -- likewise, print the HImode name of the register.
17467 k -- likewise, print the SImode name of the register.
17468 q -- likewise, print the DImode name of the register.
17469 x -- likewise, print the V4SFmode name of the register.
17470 t -- likewise, print the V8SFmode name of the register.
17471 g -- likewise, print the V16SFmode name of the register.
17472 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17473 y -- print "st(0)" instead of "st" as a register.
17474 d -- print duplicated register operand for AVX instruction.
17475 D -- print condition for SSE cmp instruction.
17476 P -- if PIC, print an @PLT suffix.
17477 p -- print raw symbol name.
17478 X -- don't print any sort of PIC '@' suffix for a symbol.
17479 & -- print some in-use local-dynamic symbol name.
17480 H -- print a memory address offset by 8; used for sse high-parts
17481 Y -- print condition for XOP pcom* instruction.
17482 + -- print a branch hint as 'cs' or 'ds' prefix
17483 ; -- print a semicolon (after prefixes due to bug in older gas).
17484 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17485 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17486 ! -- print MPX prefix for jxx/call/ret instructions if required.
17489 void
17490 ix86_print_operand (FILE *file, rtx x, int code)
17492 if (code)
17494 switch (code)
17496 case 'A':
17497 switch (ASSEMBLER_DIALECT)
17499 case ASM_ATT:
17500 putc ('*', file);
17501 break;
17503 case ASM_INTEL:
17504 /* Intel syntax. For absolute addresses, registers should not
17505 be surrounded by brackets. */
17506 if (!REG_P (x))
17508 putc ('[', file);
17509 ix86_print_operand (file, x, 0);
17510 putc (']', file);
17511 return;
17513 break;
17515 default:
17516 gcc_unreachable ();
17519 ix86_print_operand (file, x, 0);
17520 return;
17522 case 'E':
17523 /* Wrap address in an UNSPEC to declare special handling. */
17524 if (TARGET_64BIT)
17525 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17527 output_address (VOIDmode, x);
17528 return;
17530 case 'L':
17531 if (ASSEMBLER_DIALECT == ASM_ATT)
17532 putc ('l', file);
17533 return;
17535 case 'W':
17536 if (ASSEMBLER_DIALECT == ASM_ATT)
17537 putc ('w', file);
17538 return;
17540 case 'B':
17541 if (ASSEMBLER_DIALECT == ASM_ATT)
17542 putc ('b', file);
17543 return;
17545 case 'Q':
17546 if (ASSEMBLER_DIALECT == ASM_ATT)
17547 putc ('l', file);
17548 return;
17550 case 'S':
17551 if (ASSEMBLER_DIALECT == ASM_ATT)
17552 putc ('s', file);
17553 return;
17555 case 'T':
17556 if (ASSEMBLER_DIALECT == ASM_ATT)
17557 putc ('t', file);
17558 return;
17560 case 'O':
17561 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17562 if (ASSEMBLER_DIALECT != ASM_ATT)
17563 return;
17565 switch (GET_MODE_SIZE (GET_MODE (x)))
17567 case 2:
17568 putc ('w', file);
17569 break;
17571 case 4:
17572 putc ('l', file);
17573 break;
17575 case 8:
17576 putc ('q', file);
17577 break;
17579 default:
17580 output_operand_lossage ("invalid operand size for operand "
17581 "code 'O'");
17582 return;
17585 putc ('.', file);
17586 #endif
17587 return;
17589 case 'z':
17590 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17592 /* Opcodes don't get size suffixes if using Intel syntax. */
17593 if (ASSEMBLER_DIALECT == ASM_INTEL)
17594 return;
17596 switch (GET_MODE_SIZE (GET_MODE (x)))
17598 case 1:
17599 putc ('b', file);
17600 return;
17602 case 2:
17603 putc ('w', file);
17604 return;
17606 case 4:
17607 putc ('l', file);
17608 return;
17610 case 8:
17611 putc ('q', file);
17612 return;
17614 default:
17615 output_operand_lossage ("invalid operand size for operand "
17616 "code 'z'");
17617 return;
17621 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17622 warning (0, "non-integer operand used with operand code 'z'");
17623 /* FALLTHRU */
17625 case 'Z':
17626 /* 387 opcodes don't get size suffixes if using Intel syntax. */
17627 if (ASSEMBLER_DIALECT == ASM_INTEL)
17628 return;
17630 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17632 switch (GET_MODE_SIZE (GET_MODE (x)))
17634 case 2:
17635 #ifdef HAVE_AS_IX86_FILDS
17636 putc ('s', file);
17637 #endif
17638 return;
17640 case 4:
17641 putc ('l', file);
17642 return;
17644 case 8:
17645 #ifdef HAVE_AS_IX86_FILDQ
17646 putc ('q', file);
17647 #else
17648 fputs ("ll", file);
17649 #endif
17650 return;
17652 default:
17653 break;
17656 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17658 /* 387 opcodes don't get size suffixes
17659 if the operands are registers. */
17660 if (STACK_REG_P (x))
17661 return;
17663 switch (GET_MODE_SIZE (GET_MODE (x)))
17665 case 4:
17666 putc ('s', file);
17667 return;
17669 case 8:
17670 putc ('l', file);
17671 return;
17673 case 12:
17674 case 16:
17675 putc ('t', file);
17676 return;
17678 default:
17679 break;
17682 else
17684 output_operand_lossage ("invalid operand type used with "
17685 "operand code 'Z'");
17686 return;
17689 output_operand_lossage ("invalid operand size for operand code 'Z'");
17690 return;
17692 case 'd':
17693 case 'b':
17694 case 'w':
17695 case 'k':
17696 case 'q':
17697 case 'h':
17698 case 't':
17699 case 'g':
17700 case 'y':
17701 case 'x':
17702 case 'X':
17703 case 'P':
17704 case 'p':
17705 break;
17707 case 's':
17708 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17710 ix86_print_operand (file, x, 0);
17711 fputs (", ", file);
17713 return;
17715 case 'Y':
17716 switch (GET_CODE (x))
17718 case NE:
17719 fputs ("neq", file);
17720 break;
17721 case EQ:
17722 fputs ("eq", file);
17723 break;
17724 case GE:
17725 case GEU:
17726 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17727 break;
17728 case GT:
17729 case GTU:
17730 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17731 break;
17732 case LE:
17733 case LEU:
17734 fputs ("le", file);
17735 break;
17736 case LT:
17737 case LTU:
17738 fputs ("lt", file);
17739 break;
17740 case UNORDERED:
17741 fputs ("unord", file);
17742 break;
17743 case ORDERED:
17744 fputs ("ord", file);
17745 break;
17746 case UNEQ:
17747 fputs ("ueq", file);
17748 break;
17749 case UNGE:
17750 fputs ("nlt", file);
17751 break;
17752 case UNGT:
17753 fputs ("nle", file);
17754 break;
17755 case UNLE:
17756 fputs ("ule", file);
17757 break;
17758 case UNLT:
17759 fputs ("ult", file);
17760 break;
17761 case LTGT:
17762 fputs ("une", file);
17763 break;
17764 default:
17765 output_operand_lossage ("operand is not a condition code, "
17766 "invalid operand code 'Y'");
17767 return;
17769 return;
17771 case 'D':
17772 /* A little bit of braindamage here. The SSE compare instructions
17773 use completely different names for the comparisons than the
17774 fp conditional moves do. */
17775 switch (GET_CODE (x))
17777 case UNEQ:
17778 if (TARGET_AVX)
17780 fputs ("eq_us", file);
17781 break;
17783 /* FALLTHRU */
17784 case EQ:
17785 fputs ("eq", file);
17786 break;
17787 case UNLT:
17788 if (TARGET_AVX)
17790 fputs ("nge", file);
17791 break;
17793 /* FALLTHRU */
17794 case LT:
17795 fputs ("lt", file);
17796 break;
17797 case UNLE:
17798 if (TARGET_AVX)
17800 fputs ("ngt", file);
17801 break;
17803 /* FALLTHRU */
17804 case LE:
17805 fputs ("le", file);
17806 break;
17807 case UNORDERED:
17808 fputs ("unord", file);
17809 break;
17810 case LTGT:
17811 if (TARGET_AVX)
17813 fputs ("neq_oq", file);
17814 break;
17816 /* FALLTHRU */
17817 case NE:
17818 fputs ("neq", file);
17819 break;
17820 case GE:
17821 if (TARGET_AVX)
17823 fputs ("ge", file);
17824 break;
17826 /* FALLTHRU */
17827 case UNGE:
17828 fputs ("nlt", file);
17829 break;
17830 case GT:
17831 if (TARGET_AVX)
17833 fputs ("gt", file);
17834 break;
17836 /* FALLTHRU */
17837 case UNGT:
17838 fputs ("nle", file);
17839 break;
17840 case ORDERED:
17841 fputs ("ord", file);
17842 break;
17843 default:
17844 output_operand_lossage ("operand is not a condition code, "
17845 "invalid operand code 'D'");
17846 return;
17848 return;
17850 case 'F':
17851 case 'f':
17852 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17853 if (ASSEMBLER_DIALECT == ASM_ATT)
17854 putc ('.', file);
17855 gcc_fallthrough ();
17856 #endif
17858 case 'C':
17859 case 'c':
17860 if (!COMPARISON_P (x))
17862 output_operand_lossage ("operand is not a condition code, "
17863 "invalid operand code '%c'", code);
17864 return;
17866 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17867 code == 'c' || code == 'f',
17868 code == 'F' || code == 'f',
17869 file);
17870 return;
17872 case 'H':
17873 if (!offsettable_memref_p (x))
17875 output_operand_lossage ("operand is not an offsettable memory "
17876 "reference, invalid operand code 'H'");
17877 return;
17879 /* It doesn't actually matter what mode we use here, as we're
17880 only going to use this for printing. */
17881 x = adjust_address_nv (x, DImode, 8);
17882 /* Output 'qword ptr' for intel assembler dialect. */
17883 if (ASSEMBLER_DIALECT == ASM_INTEL)
17884 code = 'q';
17885 break;
17887 case 'K':
17888 if (!CONST_INT_P (x))
17890 output_operand_lossage ("operand is not an integer, invalid "
17891 "operand code 'K'");
17892 return;
17895 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17896 #ifdef HAVE_AS_IX86_HLE
17897 fputs ("xacquire ", file);
17898 #else
17899 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17900 #endif
17901 else if (INTVAL (x) & IX86_HLE_RELEASE)
17902 #ifdef HAVE_AS_IX86_HLE
17903 fputs ("xrelease ", file);
17904 #else
17905 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17906 #endif
17907 /* We do not want to print the value of the operand. */
17908 return;
17910 case 'N':
17911 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17912 fputs ("{z}", file);
17913 return;
17915 case 'r':
17916 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
17918 output_operand_lossage ("operand is not a specific integer, "
17919 "invalid operand code 'r'");
17920 return;
17923 if (ASSEMBLER_DIALECT == ASM_INTEL)
17924 fputs (", ", file);
17926 fputs ("{sae}", file);
17928 if (ASSEMBLER_DIALECT == ASM_ATT)
17929 fputs (", ", file);
17931 return;
17933 case 'R':
17934 if (!CONST_INT_P (x))
17936 output_operand_lossage ("operand is not an integer, invalid "
17937 "operand code 'R'");
17938 return;
17941 if (ASSEMBLER_DIALECT == ASM_INTEL)
17942 fputs (", ", file);
17944 switch (INTVAL (x))
17946 case ROUND_NEAREST_INT | ROUND_SAE:
17947 fputs ("{rn-sae}", file);
17948 break;
17949 case ROUND_NEG_INF | ROUND_SAE:
17950 fputs ("{rd-sae}", file);
17951 break;
17952 case ROUND_POS_INF | ROUND_SAE:
17953 fputs ("{ru-sae}", file);
17954 break;
17955 case ROUND_ZERO | ROUND_SAE:
17956 fputs ("{rz-sae}", file);
17957 break;
17958 default:
17959 output_operand_lossage ("operand is not a specific integer, "
17960 "invalid operand code 'R'");
17963 if (ASSEMBLER_DIALECT == ASM_ATT)
17964 fputs (", ", file);
17966 return;
17968 case '*':
17969 if (ASSEMBLER_DIALECT == ASM_ATT)
17970 putc ('*', file);
17971 return;
17973 case '&':
17975 const char *name = get_some_local_dynamic_name ();
17976 if (name == NULL)
17977 output_operand_lossage ("'%%&' used without any "
17978 "local dynamic TLS references");
17979 else
17980 assemble_name (file, name);
17981 return;
17984 case '+':
17986 rtx x;
17988 if (!optimize
17989 || optimize_function_for_size_p (cfun)
17990 || !TARGET_BRANCH_PREDICTION_HINTS)
17991 return;
17993 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
17994 if (x)
17996 int pred_val = profile_probability::from_reg_br_prob_note
17997 (XINT (x, 0)).to_reg_br_prob_base ();
17999 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18000 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18002 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18003 bool cputaken
18004 = final_forward_branch_p (current_output_insn) == 0;
18006 /* Emit hints only in the case default branch prediction
18007 heuristics would fail. */
18008 if (taken != cputaken)
18010 /* We use 3e (DS) prefix for taken branches and
18011 2e (CS) prefix for not taken branches. */
18012 if (taken)
18013 fputs ("ds ; ", file);
18014 else
18015 fputs ("cs ; ", file);
18019 return;
18022 case ';':
18023 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18024 putc (';', file);
18025 #endif
18026 return;
18028 case '~':
18029 putc (TARGET_AVX2 ? 'i' : 'f', file);
18030 return;
18032 case '^':
18033 if (TARGET_64BIT && Pmode != word_mode)
18034 fputs ("addr32 ", file);
18035 return;
18037 case '!':
18038 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18039 fputs ("bnd ", file);
18040 if (ix86_notrack_prefixed_insn_p (current_output_insn))
18041 fputs ("notrack ", file);
18042 return;
18044 default:
18045 output_operand_lossage ("invalid operand code '%c'", code);
18049 if (REG_P (x))
18050 print_reg (x, code, file);
18052 else if (MEM_P (x))
18054 rtx addr = XEXP (x, 0);
18056 /* No `byte ptr' prefix for call instructions ... */
18057 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18059 machine_mode mode = GET_MODE (x);
18060 const char *size;
18062 /* Check for explicit size override codes. */
18063 if (code == 'b')
18064 size = "BYTE";
18065 else if (code == 'w')
18066 size = "WORD";
18067 else if (code == 'k')
18068 size = "DWORD";
18069 else if (code == 'q')
18070 size = "QWORD";
18071 else if (code == 'x')
18072 size = "XMMWORD";
18073 else if (code == 't')
18074 size = "YMMWORD";
18075 else if (code == 'g')
18076 size = "ZMMWORD";
18077 else if (mode == BLKmode)
18078 /* ... or BLKmode operands, when not overridden. */
18079 size = NULL;
18080 else
18081 switch (GET_MODE_SIZE (mode))
18083 case 1: size = "BYTE"; break;
18084 case 2: size = "WORD"; break;
18085 case 4: size = "DWORD"; break;
18086 case 8: size = "QWORD"; break;
18087 case 12: size = "TBYTE"; break;
18088 case 16:
18089 if (mode == XFmode)
18090 size = "TBYTE";
18091 else
18092 size = "XMMWORD";
18093 break;
18094 case 32: size = "YMMWORD"; break;
18095 case 64: size = "ZMMWORD"; break;
18096 default:
18097 gcc_unreachable ();
18099 if (size)
18101 fputs (size, file);
18102 fputs (" PTR ", file);
18106 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18107 output_operand_lossage ("invalid constraints for operand");
18108 else
18109 ix86_print_operand_address_as
18110 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18113 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18115 long l;
18117 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18119 if (ASSEMBLER_DIALECT == ASM_ATT)
18120 putc ('$', file);
18121 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18122 if (code == 'q')
18123 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18124 (unsigned long long) (int) l);
18125 else
18126 fprintf (file, "0x%08x", (unsigned int) l);
18129 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18131 long l[2];
18133 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18135 if (ASSEMBLER_DIALECT == ASM_ATT)
18136 putc ('$', file);
18137 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18140 /* These float cases don't actually occur as immediate operands. */
18141 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18143 char dstr[30];
18145 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18146 fputs (dstr, file);
18149 else
18151 /* We have patterns that allow zero sets of memory, for instance.
18152 In 64-bit mode, we should probably support all 8-byte vectors,
18153 since we can in fact encode that into an immediate. */
18154 if (GET_CODE (x) == CONST_VECTOR)
18156 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18157 x = const0_rtx;
18160 if (code != 'P' && code != 'p')
18162 if (CONST_INT_P (x))
18164 if (ASSEMBLER_DIALECT == ASM_ATT)
18165 putc ('$', file);
18167 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18168 || GET_CODE (x) == LABEL_REF)
18170 if (ASSEMBLER_DIALECT == ASM_ATT)
18171 putc ('$', file);
18172 else
18173 fputs ("OFFSET FLAT:", file);
18176 if (CONST_INT_P (x))
18177 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18178 else if (flag_pic || MACHOPIC_INDIRECT)
18179 output_pic_addr_const (file, x, code);
18180 else
18181 output_addr_const (file, x);
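/* Illustrative, out-of-tree sketch: how the 32-bit pattern of an SFmode
   immediate is obtained and then sign-extended to 8 bytes for the 'q'
   operand code, as in the CONST_DOUBLE/SFmode case above.  This standalone
   version assumes the host 'float' is IEEE binary32 and copies the bits
   with memcpy; GCC itself uses REAL_VALUE_TO_TARGET_SINGLE instead.  */
#include <stdint.h>
#include <string.h>

static uint64_t
sfmode_immediate_bits_sketch (float value, int sign_extend_to_64)
{
  uint32_t bits;
  memcpy (&bits, &value, sizeof bits);		/* raw IEEE bit pattern */
  if (sign_extend_to_64)
    return (uint64_t) (int64_t) (int32_t) bits;	/* as for the 'q' code */
  return bits;
}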
18185 static bool
18186 ix86_print_operand_punct_valid_p (unsigned char code)
18188 return (code == '*' || code == '+' || code == '&' || code == ';'
18189 || code == '~' || code == '^' || code == '!');
18192 /* Print a memory operand whose address is ADDR. */
18194 static void
18195 ix86_print_operand_address_as (FILE *file, rtx addr,
18196 addr_space_t as, bool no_rip)
18198 struct ix86_address parts;
18199 rtx base, index, disp;
18200 int scale;
18201 int ok;
18202 bool vsib = false;
18203 int code = 0;
18205 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18207 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18208 gcc_assert (parts.index == NULL_RTX);
18209 parts.index = XVECEXP (addr, 0, 1);
18210 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18211 addr = XVECEXP (addr, 0, 0);
18212 vsib = true;
18214 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18216 gcc_assert (TARGET_64BIT);
18217 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18218 code = 'q';
18220 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18222 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18223 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18224 if (parts.base != NULL_RTX)
18226 parts.index = parts.base;
18227 parts.scale = 1;
18229 parts.base = XVECEXP (addr, 0, 0);
18230 addr = XVECEXP (addr, 0, 0);
18232 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18234 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18235 gcc_assert (parts.index == NULL_RTX);
18236 parts.index = XVECEXP (addr, 0, 1);
18237 addr = XVECEXP (addr, 0, 0);
18239 else
18240 ok = ix86_decompose_address (addr, &parts);
18242 gcc_assert (ok);
18244 base = parts.base;
18245 index = parts.index;
18246 disp = parts.disp;
18247 scale = parts.scale;
18249 if (ADDR_SPACE_GENERIC_P (as))
18250 as = parts.seg;
18251 else
18252 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18254 if (!ADDR_SPACE_GENERIC_P (as))
18256 const char *string;
18258 if (as == ADDR_SPACE_SEG_FS)
18259 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18260 else if (as == ADDR_SPACE_SEG_GS)
18261 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18262 else
18263 gcc_unreachable ();
18264 fputs (string, file);
18267 /* Use the one-byte-shorter RIP-relative addressing in 64-bit mode. */
18268 if (TARGET_64BIT && !base && !index && !no_rip)
18270 rtx symbol = disp;
18272 if (GET_CODE (disp) == CONST
18273 && GET_CODE (XEXP (disp, 0)) == PLUS
18274 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18275 symbol = XEXP (XEXP (disp, 0), 0);
18277 if (GET_CODE (symbol) == LABEL_REF
18278 || (GET_CODE (symbol) == SYMBOL_REF
18279 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18280 base = pc_rtx;
18283 if (!base && !index)
18285 /* Displacement only requires special attention. */
18286 if (CONST_INT_P (disp))
18288 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18289 fputs ("ds:", file);
18290 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18292 /* Load the external function address via the GOT slot to avoid PLT. */
18293 else if (GET_CODE (disp) == CONST
18294 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18295 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18296 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18297 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18298 output_pic_addr_const (file, disp, 0);
18299 else if (flag_pic)
18300 output_pic_addr_const (file, disp, 0);
18301 else
18302 output_addr_const (file, disp);
18304 else
18306 /* Print SImode register names to force addr32 prefix. */
18307 if (SImode_address_operand (addr, VOIDmode))
18309 if (flag_checking)
18311 gcc_assert (TARGET_64BIT);
18312 switch (GET_CODE (addr))
18314 case SUBREG:
18315 gcc_assert (GET_MODE (addr) == SImode);
18316 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18317 break;
18318 case ZERO_EXTEND:
18319 case AND:
18320 gcc_assert (GET_MODE (addr) == DImode);
18321 break;
18322 default:
18323 gcc_unreachable ();
18326 gcc_assert (!code);
18327 code = 'k';
18329 else if (code == 0
18330 && TARGET_X32
18331 && disp
18332 && CONST_INT_P (disp)
18333 && INTVAL (disp) < -16*1024*1024)
18335 /* X32 runs in 64-bit mode, where displacement, DISP, in
18336 address DISP(%r64), is encoded as a 32-bit immediate sign-
18337 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18338 address is %r64 + 0xffffffffbffffd00. When %r64 <
18339 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18340 which is invalid for x32. The correct address is %r64
18341 - 0x40000300 == 0xf7ffdd64. To properly encode
18342 -0x40000300(%r64) for x32, we zero-extend negative
18343 displacement by forcing addr32 prefix which truncates
18344 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18345 zero-extend all negative displacements, including -1(%rsp).
18346 However, for small negative displacements, sign-extension
18347 won't cause overflow. We only zero-extend negative
18348 displacements if they are < -16*1024*1024, which is also used
18349 to check legitimate address displacements for PIC. */
18350 code = 'k';
18353 /* Since the upper 32 bits of RSP are always zero for x32,
18354 we can encode %esp as %rsp to avoid 0x67 prefix if
18355 there is no index register. */
18356 if (TARGET_X32 && Pmode == SImode
18357 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18358 code = 'q';
18360 if (ASSEMBLER_DIALECT == ASM_ATT)
18362 if (disp)
18364 if (flag_pic)
18365 output_pic_addr_const (file, disp, 0);
18366 else if (GET_CODE (disp) == LABEL_REF)
18367 output_asm_label (disp);
18368 else
18369 output_addr_const (file, disp);
18372 putc ('(', file);
18373 if (base)
18374 print_reg (base, code, file);
18375 if (index)
18377 putc (',', file);
18378 print_reg (index, vsib ? 0 : code, file);
18379 if (scale != 1 || vsib)
18380 fprintf (file, ",%d", scale);
18382 putc (')', file);
18384 else
18386 rtx offset = NULL_RTX;
18388 if (disp)
18390 /* Pull out the offset of a symbol; print any symbol itself. */
18391 if (GET_CODE (disp) == CONST
18392 && GET_CODE (XEXP (disp, 0)) == PLUS
18393 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18395 offset = XEXP (XEXP (disp, 0), 1);
18396 disp = gen_rtx_CONST (VOIDmode,
18397 XEXP (XEXP (disp, 0), 0));
18400 if (flag_pic)
18401 output_pic_addr_const (file, disp, 0);
18402 else if (GET_CODE (disp) == LABEL_REF)
18403 output_asm_label (disp);
18404 else if (CONST_INT_P (disp))
18405 offset = disp;
18406 else
18407 output_addr_const (file, disp);
18410 putc ('[', file);
18411 if (base)
18413 print_reg (base, code, file);
18414 if (offset)
18416 if (INTVAL (offset) >= 0)
18417 putc ('+', file);
18418 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18421 else if (offset)
18422 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18423 else
18424 putc ('0', file);
18426 if (index)
18428 putc ('+', file);
18429 print_reg (index, vsib ? 0 : code, file);
18430 if (scale != 1 || vsib)
18431 fprintf (file, "*%d", scale);
18433 putc (']', file);
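/* Illustrative sketch (not GCC API): the two textual layouts the function
   above produces from one decomposed address -- AT&T "disp(base,index,scale)"
   versus Intel "[base+index*scale+disp]".  Segment overrides, missing
   base/index parts and PIC displacements are omitted; the helper and its
   parameters are assumptions of this sketch.  */
#include <stdio.h>

static void
format_address_sketch (char *buf, size_t len, int att_syntax,
		       const char *base, const char *index, int scale,
		       long disp)
{
  if (att_syntax)
    snprintf (buf, len, "%ld(%%%s,%%%s,%d)", disp, base, index, scale);
  else
    snprintf (buf, len, "[%s+%s*%d%+ld]", base, index, scale, disp);
}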
18438 static void
18439 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18441 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18444 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18446 static bool
18447 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18449 rtx op;
18451 if (GET_CODE (x) != UNSPEC)
18452 return false;
18454 op = XVECEXP (x, 0, 0);
18455 switch (XINT (x, 1))
18457 case UNSPEC_GOTOFF:
18458 output_addr_const (file, op);
18459 fputs ("@gotoff", file);
18460 break;
18461 case UNSPEC_GOTTPOFF:
18462 output_addr_const (file, op);
18463 /* FIXME: This might be @TPOFF in Sun ld. */
18464 fputs ("@gottpoff", file);
18465 break;
18466 case UNSPEC_TPOFF:
18467 output_addr_const (file, op);
18468 fputs ("@tpoff", file);
18469 break;
18470 case UNSPEC_NTPOFF:
18471 output_addr_const (file, op);
18472 if (TARGET_64BIT)
18473 fputs ("@tpoff", file);
18474 else
18475 fputs ("@ntpoff", file);
18476 break;
18477 case UNSPEC_DTPOFF:
18478 output_addr_const (file, op);
18479 fputs ("@dtpoff", file);
18480 break;
18481 case UNSPEC_GOTNTPOFF:
18482 output_addr_const (file, op);
18483 if (TARGET_64BIT)
18484 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18485 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18486 else
18487 fputs ("@gotntpoff", file);
18488 break;
18489 case UNSPEC_INDNTPOFF:
18490 output_addr_const (file, op);
18491 fputs ("@indntpoff", file);
18492 break;
18493 #if TARGET_MACHO
18494 case UNSPEC_MACHOPIC_OFFSET:
18495 output_addr_const (file, op);
18496 putc ('-', file);
18497 machopic_output_function_base_name (file);
18498 break;
18499 #endif
18501 default:
18502 return false;
18505 return true;
18508 /* Split one or more double-mode RTL references into pairs of half-mode
18509 references. The RTL can be REG, offsettable MEM, integer constant, or
18510 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18511 split and "num" is its length. lo_half and hi_half are output arrays
18512 that parallel "operands". */
18514 void
18515 split_double_mode (machine_mode mode, rtx operands[],
18516 int num, rtx lo_half[], rtx hi_half[])
18518 machine_mode half_mode;
18519 unsigned int byte;
18521 switch (mode)
18523 case E_TImode:
18524 half_mode = DImode;
18525 break;
18526 case E_DImode:
18527 half_mode = SImode;
18528 break;
18529 default:
18530 gcc_unreachable ();
18533 byte = GET_MODE_SIZE (half_mode);
18535 while (num--)
18537 rtx op = operands[num];
18539 /* simplify_subreg refuses to split volatile memory addresses,
18540 but we still have to handle them. */
18541 if (MEM_P (op))
18543 lo_half[num] = adjust_address (op, half_mode, 0);
18544 hi_half[num] = adjust_address (op, half_mode, byte);
18546 else
18548 lo_half[num] = simplify_gen_subreg (half_mode, op,
18549 GET_MODE (op) == VOIDmode
18550 ? mode : GET_MODE (op), 0);
18551 hi_half[num] = simplify_gen_subreg (half_mode, op,
18552 GET_MODE (op) == VOIDmode
18553 ? mode : GET_MODE (op), byte);
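/* Illustrative sketch (not GCC API): the constant case of the split above
   for DImode on a little-endian target -- the low half lives at byte
   offset 0 and the high half at byte offset 4 (GET_MODE_SIZE (SImode)).
   The helper name is an assumption of this sketch.  */
#include <stdint.h>

static void
split_di_constant_sketch (uint64_t value, uint32_t *lo, uint32_t *hi)
{
  *lo = (uint32_t) value;		/* half at byte offset 0 */
  *hi = (uint32_t) (value >> 32);	/* half at byte offset 4 */
}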
18558 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18559 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18560 is the expression of the binary operation. The output may either be
18561 emitted here, or returned to the caller, like all output_* functions.
18563 There is no guarantee that the operands are the same mode, as they
18564 might be within FLOAT or FLOAT_EXTEND expressions. */
18566 #ifndef SYSV386_COMPAT
18567 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18568 wants to fix the assemblers because that causes incompatibility
18569 with gcc. No-one wants to fix gcc because that causes
18570 incompatibility with assemblers... You can use the option of
18571 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18572 #define SYSV386_COMPAT 1
18573 #endif
18575 const char *
18576 output_387_binary_op (rtx_insn *insn, rtx *operands)
18578 static char buf[40];
18579 const char *p;
18580 bool is_sse
18581 = (SSE_REG_P (operands[0])
18582 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
18584 if (is_sse)
18585 p = "%v";
18586 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18587 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18588 p = "fi";
18589 else
18590 p = "f";
18592 strcpy (buf, p);
18594 switch (GET_CODE (operands[3]))
18596 case PLUS:
18597 p = "add"; break;
18598 case MINUS:
18599 p = "sub"; break;
18600 case MULT:
18601 p = "mul"; break;
18602 case DIV:
18603 p = "div"; break;
18604 default:
18605 gcc_unreachable ();
18608 strcat (buf, p);
18610 if (is_sse)
18612 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
18613 strcat (buf, p);
18615 if (TARGET_AVX)
18616 p = "\t{%2, %1, %0|%0, %1, %2}";
18617 else
18618 p = "\t{%2, %0|%0, %2}";
18620 strcat (buf, p);
18621 return buf;
18624 /* Even if we do not want to check the inputs, this documents the input
18625 constraints, which helps in understanding the following code. */
18626 if (flag_checking)
18628 if (STACK_REG_P (operands[0])
18629 && ((REG_P (operands[1])
18630 && REGNO (operands[0]) == REGNO (operands[1])
18631 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18632 || (REG_P (operands[2])
18633 && REGNO (operands[0]) == REGNO (operands[2])
18634 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18635 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18636 ; /* ok */
18637 else
18638 gcc_unreachable ();
18641 switch (GET_CODE (operands[3]))
18643 case MULT:
18644 case PLUS:
18645 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18646 std::swap (operands[1], operands[2]);
18648 /* We now know operands[0] == operands[1]. */
18650 if (MEM_P (operands[2]))
18652 p = "%Z2\t%2";
18653 break;
18656 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18658 if (STACK_TOP_P (operands[0]))
18659 /* How is it that we are storing to a dead operand[2]?
18660 Well, presumably operands[1] is dead too. We can't
18661 store the result to st(0) as st(0) gets popped on this
18662 instruction. Instead store to operands[2] (which I
18663 think has to be st(1)). st(1) will be popped later.
18664 gcc <= 2.8.1 didn't have this check and generated
18665 assembly code that the Unixware assembler rejected. */
18666 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18667 else
18668 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18669 break;
18672 if (STACK_TOP_P (operands[0]))
18673 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18674 else
18675 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18676 break;
18678 case MINUS:
18679 case DIV:
18680 if (MEM_P (operands[1]))
18682 p = "r%Z1\t%1";
18683 break;
18686 if (MEM_P (operands[2]))
18688 p = "%Z2\t%2";
18689 break;
18692 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18694 #if SYSV386_COMPAT
18695 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18696 derived assemblers, confusingly reverse the direction of
18697 the operation for fsub{r} and fdiv{r} when the
18698 destination register is not st(0). The Intel assembler
18699 doesn't have this brain damage. Read !SYSV386_COMPAT to
18700 figure out what the hardware really does. */
18701 if (STACK_TOP_P (operands[0]))
18702 p = "{p\t%0, %2|rp\t%2, %0}";
18703 else
18704 p = "{rp\t%2, %0|p\t%0, %2}";
18705 #else
18706 if (STACK_TOP_P (operands[0]))
18707 /* As above for fmul/fadd, we can't store to st(0). */
18708 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18709 else
18710 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18711 #endif
18712 break;
18715 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18717 #if SYSV386_COMPAT
18718 if (STACK_TOP_P (operands[0]))
18719 p = "{rp\t%0, %1|p\t%1, %0}";
18720 else
18721 p = "{p\t%1, %0|rp\t%0, %1}";
18722 #else
18723 if (STACK_TOP_P (operands[0]))
18724 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18725 else
18726 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18727 #endif
18728 break;
18731 if (STACK_TOP_P (operands[0]))
18733 if (STACK_TOP_P (operands[1]))
18734 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18735 else
18736 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18737 break;
18739 else if (STACK_TOP_P (operands[1]))
18741 #if SYSV386_COMPAT
18742 p = "{\t%1, %0|r\t%0, %1}";
18743 #else
18744 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18745 #endif
18747 else
18749 #if SYSV386_COMPAT
18750 p = "{r\t%2, %0|\t%0, %2}";
18751 #else
18752 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18753 #endif
18755 break;
18757 default:
18758 gcc_unreachable ();
18761 strcat (buf, p);
18762 return buf;
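/* Illustrative sketch (not GCC API): how the template returned above is
   assembled piecewise for the SSE path -- the "%v" prefix, the operation
   name, the scalar width suffix and the AT&T/Intel operand template.  The
   x87 paths with their pop/reverse variants are omitted; the helper and
   its parameters are assumptions of this sketch.  */
#include <stdio.h>

static const char *
sse_binop_template_sketch (char *buf, size_t len, const char *op,
			   int is_double, int have_avx)
{
  /* op is "add", "sub", "mul" or "div"; e.g. "%vaddsd\t{%2, %1, %0|%0, %1, %2}".  */
  snprintf (buf, len, "%%v%s%s%s", op, is_double ? "sd" : "ss",
	    have_avx ? "\t{%2, %1, %0|%0, %1, %2}" : "\t{%2, %0|%0, %2}");
  return buf;
}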
18765 /* Return needed mode for entity in optimize_mode_switching pass. */
18767 static int
18768 ix86_dirflag_mode_needed (rtx_insn *insn)
18770 if (CALL_P (insn))
18772 if (cfun->machine->func_type == TYPE_NORMAL)
18773 return X86_DIRFLAG_ANY;
18774 else
18775 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18776 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18779 if (recog_memoized (insn) < 0)
18780 return X86_DIRFLAG_ANY;
18782 if (get_attr_type (insn) == TYPE_STR)
18784 /* Emit cld instruction if stringops are used in the function. */
18785 if (cfun->machine->func_type == TYPE_NORMAL)
18786 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18787 else
18788 return X86_DIRFLAG_RESET;
18791 return X86_DIRFLAG_ANY;
18794 /* Check if a 256bit or 512bit AVX register is referenced inside of EXP. */
18796 static bool
18797 ix86_check_avx_upper_register (const_rtx exp)
18799 if (SUBREG_P (exp))
18800 exp = SUBREG_REG (exp);
18802 return (REG_P (exp)
18803 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
18804 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
18807 /* Return needed mode for entity in optimize_mode_switching pass. */
18809 static int
18810 ix86_avx_u128_mode_needed (rtx_insn *insn)
18812 if (CALL_P (insn))
18814 rtx link;
18816 /* Needed mode is set to AVX_U128_CLEAN if there are
18817 no 256bit or 512bit modes used in function arguments. */
18818 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18819 link;
18820 link = XEXP (link, 1))
18822 if (GET_CODE (XEXP (link, 0)) == USE)
18824 rtx arg = XEXP (XEXP (link, 0), 0);
18826 if (ix86_check_avx_upper_register (arg))
18827 return AVX_U128_DIRTY;
18831 return AVX_U128_CLEAN;
18834 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
18835 Hardware changes state only when a 256bit register is written to,
18836 but we need to prevent the compiler from moving the optimal insertion
18837 point above an eventual read from a 256bit or 512bit register. */
18838 subrtx_iterator::array_type array;
18839 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18840 if (ix86_check_avx_upper_register (*iter))
18841 return AVX_U128_DIRTY;
18843 return AVX_U128_ANY;
18846 /* Return mode that i387 must be switched into
18847 prior to the execution of insn. */
18849 static int
18850 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18852 enum attr_i387_cw mode;
18854 /* The mode UNINITIALIZED is used to store the control word after a
18855 function call or ASM pattern. The mode ANY specifies that the function
18856 has no requirements on the control word and makes no changes to the
18857 bits we are interested in. */
18859 if (CALL_P (insn)
18860 || (NONJUMP_INSN_P (insn)
18861 && (asm_noperands (PATTERN (insn)) >= 0
18862 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18863 return I387_CW_UNINITIALIZED;
18865 if (recog_memoized (insn) < 0)
18866 return I387_CW_ANY;
18868 mode = get_attr_i387_cw (insn);
18870 switch (entity)
18872 case I387_TRUNC:
18873 if (mode == I387_CW_TRUNC)
18874 return mode;
18875 break;
18877 case I387_FLOOR:
18878 if (mode == I387_CW_FLOOR)
18879 return mode;
18880 break;
18882 case I387_CEIL:
18883 if (mode == I387_CW_CEIL)
18884 return mode;
18885 break;
18887 case I387_MASK_PM:
18888 if (mode == I387_CW_MASK_PM)
18889 return mode;
18890 break;
18892 default:
18893 gcc_unreachable ();
18896 return I387_CW_ANY;
18899 /* Return mode that entity must be switched into
18900 prior to the execution of insn. */
18902 static int
18903 ix86_mode_needed (int entity, rtx_insn *insn)
18905 switch (entity)
18907 case X86_DIRFLAG:
18908 return ix86_dirflag_mode_needed (insn);
18909 case AVX_U128:
18910 return ix86_avx_u128_mode_needed (insn);
18911 case I387_TRUNC:
18912 case I387_FLOOR:
18913 case I387_CEIL:
18914 case I387_MASK_PM:
18915 return ix86_i387_mode_needed (entity, insn);
18916 default:
18917 gcc_unreachable ();
18919 return 0;
18922 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
18924 static void
18925 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
18927 if (ix86_check_avx_upper_register (dest))
18929 bool *used = (bool *) data;
18930 *used = true;
18934 /* Calculate mode of upper 128bit AVX registers after the insn. */
18936 static int
18937 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
18939 rtx pat = PATTERN (insn);
18941 if (vzeroupper_operation (pat, VOIDmode)
18942 || vzeroall_operation (pat, VOIDmode))
18943 return AVX_U128_CLEAN;
18945 /* We know that the state is clean after a CALL insn if there are no
18946 256bit or 512bit registers used in the function return register. */
18947 if (CALL_P (insn))
18949 bool avx_upper_reg_found = false;
18950 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
18952 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
18955 /* Otherwise, return current mode. Remember that if insn
18956 references AVX 256bit or 512bit registers, the mode was already
18957 changed to DIRTY from MODE_NEEDED. */
18958 return mode;
18961 /* Return the mode that an insn results in. */
18963 static int
18964 ix86_mode_after (int entity, int mode, rtx_insn *insn)
18966 switch (entity)
18968 case X86_DIRFLAG:
18969 return mode;
18970 case AVX_U128:
18971 return ix86_avx_u128_mode_after (mode, insn);
18972 case I387_TRUNC:
18973 case I387_FLOOR:
18974 case I387_CEIL:
18975 case I387_MASK_PM:
18976 return mode;
18977 default:
18978 gcc_unreachable ();
18982 static int
18983 ix86_dirflag_mode_entry (void)
18985 /* For TARGET_CLD or in the interrupt handler we can't assume
18986 direction flag state at function entry. */
18987 if (TARGET_CLD
18988 || cfun->machine->func_type != TYPE_NORMAL)
18989 return X86_DIRFLAG_ANY;
18991 return X86_DIRFLAG_RESET;
18994 static int
18995 ix86_avx_u128_mode_entry (void)
18997 tree arg;
18999 /* Entry mode is set to AVX_U128_DIRTY if there are
19000 256bit or 512bit modes used in function arguments. */
19001 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19002 arg = TREE_CHAIN (arg))
19004 rtx incoming = DECL_INCOMING_RTL (arg);
19006 if (incoming && ix86_check_avx_upper_register (incoming))
19007 return AVX_U128_DIRTY;
19010 return AVX_U128_CLEAN;
19013 /* Return a mode that ENTITY is assumed to be
19014 switched to at function entry. */
19016 static int
19017 ix86_mode_entry (int entity)
19019 switch (entity)
19021 case X86_DIRFLAG:
19022 return ix86_dirflag_mode_entry ();
19023 case AVX_U128:
19024 return ix86_avx_u128_mode_entry ();
19025 case I387_TRUNC:
19026 case I387_FLOOR:
19027 case I387_CEIL:
19028 case I387_MASK_PM:
19029 return I387_CW_ANY;
19030 default:
19031 gcc_unreachable ();
19035 static int
19036 ix86_avx_u128_mode_exit (void)
19038 rtx reg = crtl->return_rtx;
19040 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
19041 or 512bit modes used in the function return register. */
19042 if (reg && ix86_check_avx_upper_register (reg))
19043 return AVX_U128_DIRTY;
19045 return AVX_U128_CLEAN;
19048 /* Return a mode that ENTITY is assumed to be
19049 switched to at function exit. */
19051 static int
19052 ix86_mode_exit (int entity)
19054 switch (entity)
19056 case X86_DIRFLAG:
19057 return X86_DIRFLAG_ANY;
19058 case AVX_U128:
19059 return ix86_avx_u128_mode_exit ();
19060 case I387_TRUNC:
19061 case I387_FLOOR:
19062 case I387_CEIL:
19063 case I387_MASK_PM:
19064 return I387_CW_ANY;
19065 default:
19066 gcc_unreachable ();
19070 static int
19071 ix86_mode_priority (int, int n)
19073 return n;
19076 /* Output code to initialize control word copies used by trunc?f?i and
19077 rounding patterns. CURRENT_MODE is set to the current control word,
19078 while NEW_MODE is set to the new control word. */
19080 static void
19081 emit_i387_cw_initialization (int mode)
19083 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19084 rtx new_mode;
19086 enum ix86_stack_slot slot;
19088 rtx reg = gen_reg_rtx (HImode);
19090 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19091 emit_move_insn (reg, copy_rtx (stored_mode));
19093 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19094 || optimize_insn_for_size_p ())
19096 switch (mode)
19098 case I387_CW_TRUNC:
19099 /* round toward zero (truncate) */
19100 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19101 slot = SLOT_CW_TRUNC;
19102 break;
19104 case I387_CW_FLOOR:
19105 /* round down toward -oo */
19106 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19107 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19108 slot = SLOT_CW_FLOOR;
19109 break;
19111 case I387_CW_CEIL:
19112 /* round up toward +oo */
19113 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19114 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19115 slot = SLOT_CW_CEIL;
19116 break;
19118 case I387_CW_MASK_PM:
19119 /* mask precision exception for nearbyint() */
19120 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19121 slot = SLOT_CW_MASK_PM;
19122 break;
19124 default:
19125 gcc_unreachable ();
19128 else
19130 switch (mode)
19132 case I387_CW_TRUNC:
19133 /* round toward zero (truncate) */
19134 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19135 slot = SLOT_CW_TRUNC;
19136 break;
19138 case I387_CW_FLOOR:
19139 /* round down toward -oo */
19140 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19141 slot = SLOT_CW_FLOOR;
19142 break;
19144 case I387_CW_CEIL:
19145 /* round up toward +oo */
19146 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19147 slot = SLOT_CW_CEIL;
19148 break;
19150 case I387_CW_MASK_PM:
19151 /* mask precision exception for nearbyint() */
19152 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19153 slot = SLOT_CW_MASK_PM;
19154 break;
19156 default:
19157 gcc_unreachable ();
19161 gcc_assert (slot < MAX_386_STACK_LOCALS);
19163 new_mode = assign_386_stack_local (HImode, slot);
19164 emit_move_insn (new_mode, reg);
19167 /* Emit vzeroupper. */
19169 void
19170 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19172 int i;
19174 /* Cancel automatic vzeroupper insertion if there are
19175 live call-saved SSE registers at the insertion point. */
19177 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19178 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19179 return;
19181 if (TARGET_64BIT)
19182 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19183 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19184 return;
19186 emit_insn (gen_avx_vzeroupper ());
19191 /* Generate one or more insns to set ENTITY to MODE. REGS_LIVE
19192 is the set of hard registers live at the point where the insn(s)
19193 are to be inserted. */
19195 static void
19196 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19197 HARD_REG_SET regs_live)
19199 switch (entity)
19201 case X86_DIRFLAG:
19202 if (mode == X86_DIRFLAG_RESET)
19203 emit_insn (gen_cld ());
19204 break;
19205 case AVX_U128:
19206 if (mode == AVX_U128_CLEAN)
19207 ix86_avx_emit_vzeroupper (regs_live);
19208 break;
19209 case I387_TRUNC:
19210 case I387_FLOOR:
19211 case I387_CEIL:
19212 case I387_MASK_PM:
19213 if (mode != I387_CW_ANY
19214 && mode != I387_CW_UNINITIALIZED)
19215 emit_i387_cw_initialization (mode);
19216 break;
19217 default:
19218 gcc_unreachable ();
19222 /* Output code for INSN to convert a float to a signed int. OPERANDS
19223 are the insn operands. The output may be [HSD]Imode and the input
19224 operand may be [SDX]Fmode. */
19226 const char *
19227 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19229 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19230 bool dimode_p = GET_MODE (operands[0]) == DImode;
19231 int round_mode = get_attr_i387_cw (insn);
19233 static char buf[40];
19234 const char *p;
19236 /* Jump through a hoop or two for DImode, since the hardware has no
19237 non-popping instruction. We used to do this a different way, but
19238 that was somewhat fragile and broke with post-reload splitters. */
19239 if ((dimode_p || fisttp) && !stack_top_dies)
19240 output_asm_insn ("fld\t%y1", operands);
19242 gcc_assert (STACK_TOP_P (operands[1]));
19243 gcc_assert (MEM_P (operands[0]));
19244 gcc_assert (GET_MODE (operands[1]) != TFmode);
19246 if (fisttp)
19247 return "fisttp%Z0\t%0";
19249 strcpy (buf, "fist");
19251 if (round_mode != I387_CW_ANY)
19252 output_asm_insn ("fldcw\t%3", operands);
19254 p = "p%Z0\t%0";
19255 strcat (buf, p + !(stack_top_dies || dimode_p));
19257 output_asm_insn (buf, operands);
19259 if (round_mode != I387_CW_ANY)
19260 output_asm_insn ("fldcw\t%2", operands);
19262 return "";
19265 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19266 have the values zero or one, indicates the ffreep insn's operand
19267 from the OPERANDS array. */
19269 static const char *
19270 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19272 if (TARGET_USE_FFREEP)
19273 #ifdef HAVE_AS_IX86_FFREEP
19274 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19275 #else
19277 static char retval[32];
19278 int regno = REGNO (operands[opno]);
19280 gcc_assert (STACK_REGNO_P (regno));
19282 regno -= FIRST_STACK_REG;
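/* No assembler support for the mnemonic, so emit the raw opcode bytes
   as data; the encoding of ffreep %st(regno) is assumed to be
   0xdf 0xc0+regno, which the little-endian .short below produces.  */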
19284 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19285 return retval;
19287 #endif
19289 return opno ? "fstp\t%y1" : "fstp\t%y0";
19293 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19294 should be used. UNORDERED_P is true when fucom should be used. */
19296 const char *
19297 output_fp_compare (rtx_insn *insn, rtx *operands,
19298 bool eflags_p, bool unordered_p)
19300 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19301 bool stack_top_dies;
19303 static char buf[40];
19304 const char *p;
19306 gcc_assert (STACK_TOP_P (xops[0]));
19308 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19310 if (eflags_p)
19312 p = unordered_p ? "fucomi" : "fcomi";
19313 strcpy (buf, p);
19315 p = "p\t{%y1, %0|%0, %y1}";
19316 strcat (buf, p + !stack_top_dies);
19318 return buf;
19321 if (STACK_REG_P (xops[1])
19322 && stack_top_dies
19323 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19325 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19327 /* If the top of the 387 stack dies, and the other operand is also
19328 a stack register that dies, then this must be a
19329 `fcompp' float compare. */
19330 p = unordered_p ? "fucompp" : "fcompp";
19331 strcpy (buf, p);
19333 else if (const0_operand (xops[1], VOIDmode))
19335 gcc_assert (!unordered_p);
19336 strcpy (buf, "ftst");
19338 else
19340 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19342 gcc_assert (!unordered_p);
19343 p = "ficom";
19345 else
19346 p = unordered_p ? "fucom" : "fcom";
19348 strcpy (buf, p);
19350 p = "p%Z2\t%y2";
19351 strcat (buf, p + !stack_top_dies);
19354 output_asm_insn (buf, operands);
19355 return "fnstsw\t%0";
19358 void
19359 ix86_output_addr_vec_elt (FILE *file, int value)
19361 const char *directive = ASM_LONG;
19363 #ifdef ASM_QUAD
19364 if (TARGET_LP64)
19365 directive = ASM_QUAD;
19366 #else
19367 gcc_assert (!TARGET_64BIT);
19368 #endif
19370 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19373 void
19374 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19376 const char *directive = ASM_LONG;
19378 #ifdef ASM_QUAD
19379 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19380 directive = ASM_QUAD;
19381 #else
19382 gcc_assert (!TARGET_64BIT);
19383 #endif
19384 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19385 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19386 fprintf (file, "%s%s%d-%s%d\n",
19387 directive, LPREFIX, value, LPREFIX, rel);
19388 else if (HAVE_AS_GOTOFF_IN_DATA)
19389 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19390 #if TARGET_MACHO
19391 else if (TARGET_MACHO)
19393 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19394 machopic_output_function_base_name (file);
19395 putc ('\n', file);
19397 #endif
19398 else
19399 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19400 GOT_SYMBOL_NAME, LPREFIX, value);
19403 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19404 for the target. */
19406 void
19407 ix86_expand_clear (rtx dest)
19409 rtx tmp;
19411 /* We play register width games, which are only valid after reload. */
19412 gcc_assert (reload_completed);
19414 /* Avoid HImode and its attendant prefix byte. */
19415 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19416 dest = gen_rtx_REG (SImode, REGNO (dest));
19417 tmp = gen_rtx_SET (dest, const0_rtx);
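/* xor modifies the flags register, so when it is selected the set must
   be wrapped in a parallel with a clobber of FLAGS_REG.  */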
19419 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19421 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19422 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19425 emit_insn (tmp);
19428 void
19429 ix86_expand_move (machine_mode mode, rtx operands[])
19431 rtx op0, op1;
19432 rtx tmp, addend = NULL_RTX;
19433 enum tls_model model;
19435 op0 = operands[0];
19436 op1 = operands[1];
19438 switch (GET_CODE (op1))
19440 case CONST:
19441 tmp = XEXP (op1, 0);
19443 if (GET_CODE (tmp) != PLUS
19444 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19445 break;
19447 op1 = XEXP (tmp, 0);
19448 addend = XEXP (tmp, 1);
19449 /* FALLTHRU */
19451 case SYMBOL_REF:
19452 model = SYMBOL_REF_TLS_MODEL (op1);
19454 if (model)
19455 op1 = legitimize_tls_address (op1, model, true);
19456 else if (ix86_force_load_from_GOT_p (op1))
19458 /* Load the external function address via GOT slot to avoid PLT. */
19459 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19460 (TARGET_64BIT
19461 ? UNSPEC_GOTPCREL
19462 : UNSPEC_GOT));
19463 op1 = gen_rtx_CONST (Pmode, op1);
19464 op1 = gen_const_mem (Pmode, op1);
19465 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19467 else
19469 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19470 if (tmp)
19472 op1 = tmp;
19473 if (!addend)
19474 break;
19476 else
19478 op1 = operands[1];
19479 break;
19483 if (addend)
19485 op1 = force_operand (op1, NULL_RTX);
19486 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19487 op0, 1, OPTAB_DIRECT);
19489 else
19490 op1 = force_operand (op1, op0);
19492 if (op1 == op0)
19493 return;
19495 op1 = convert_to_mode (mode, op1, 1);
19497 default:
19498 break;
19501 if ((flag_pic || MACHOPIC_INDIRECT)
19502 && symbolic_operand (op1, mode))
19504 if (TARGET_MACHO && !TARGET_64BIT)
19506 #if TARGET_MACHO
19507 /* dynamic-no-pic */
19508 if (MACHOPIC_INDIRECT)
19510 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19511 ? op0 : gen_reg_rtx (Pmode);
19512 op1 = machopic_indirect_data_reference (op1, temp);
19513 if (MACHOPIC_PURE)
19514 op1 = machopic_legitimize_pic_address (op1, mode,
19515 temp == op1 ? 0 : temp);
19517 if (op0 != op1 && GET_CODE (op0) != MEM)
19519 rtx insn = gen_rtx_SET (op0, op1);
19520 emit_insn (insn);
19521 return;
19523 if (GET_CODE (op0) == MEM)
19524 op1 = force_reg (Pmode, op1);
19525 else
19527 rtx temp = op0;
19528 if (GET_CODE (temp) != REG)
19529 temp = gen_reg_rtx (Pmode);
19530 temp = legitimize_pic_address (op1, temp);
19531 if (temp == op0)
19532 return;
19533 op1 = temp;
19535 /* dynamic-no-pic */
19536 #endif
19538 else
19540 if (MEM_P (op0))
19541 op1 = force_reg (mode, op1);
19542 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19544 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19545 op1 = legitimize_pic_address (op1, reg);
19546 if (op0 == op1)
19547 return;
19548 op1 = convert_to_mode (mode, op1, 1);
19552 else
19554 if (MEM_P (op0)
19555 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19556 || !push_operand (op0, mode))
19557 && MEM_P (op1))
19558 op1 = force_reg (mode, op1);
19560 if (push_operand (op0, mode)
19561 && ! general_no_elim_operand (op1, mode))
19562 op1 = copy_to_mode_reg (mode, op1);
19564 /* Force large constants in 64bit compilation into register
19565 to get them CSEed. */
19566 if (can_create_pseudo_p ()
19567 && (mode == DImode) && TARGET_64BIT
19568 && immediate_operand (op1, mode)
19569 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19570 && !register_operand (op0, mode)
19571 && optimize)
19572 op1 = copy_to_mode_reg (mode, op1);
19574 if (can_create_pseudo_p ()
19575 && CONST_DOUBLE_P (op1))
19577 /* If we are loading a floating point constant to a register,
19578 force the value to memory now, since we'll get better code
19579 out the back end. */
19581 op1 = validize_mem (force_const_mem (mode, op1));
19582 if (!register_operand (op0, mode))
19584 rtx temp = gen_reg_rtx (mode);
19585 emit_insn (gen_rtx_SET (temp, op1));
19586 emit_move_insn (op0, temp);
19587 return;
19592 emit_insn (gen_rtx_SET (op0, op1));
19595 void
19596 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19598 rtx op0 = operands[0], op1 = operands[1];
19599 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
19600 psABI since the biggest alignment is 4 bytes for IA MCU psABI. */
19601 unsigned int align = (TARGET_IAMCU
19602 ? GET_MODE_BITSIZE (mode)
19603 : GET_MODE_ALIGNMENT (mode));
19605 if (push_operand (op0, VOIDmode))
19606 op0 = emit_move_resolve_push (mode, op0);
19608 /* Force constants other than zero into memory. We do not know how
19609 the instructions used to build constants modify the upper 64 bits
19610 of the register; once we have that information we may be able
19611 to handle some of them more efficiently. */
19612 if (can_create_pseudo_p ()
19613 && (CONSTANT_P (op1)
19614 || (SUBREG_P (op1)
19615 && CONSTANT_P (SUBREG_REG (op1))))
19616 && ((register_operand (op0, mode)
19617 && !standard_sse_constant_p (op1, mode))
19618 /* ix86_expand_vector_move_misalign() does not like constants. */
19619 || (SSE_REG_MODE_P (mode)
19620 && MEM_P (op0)
19621 && MEM_ALIGN (op0) < align)))
19623 if (SUBREG_P (op1))
19625 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19626 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19627 if (r)
19628 r = validize_mem (r);
19629 else
19630 r = force_reg (imode, SUBREG_REG (op1));
19631 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19633 else
19634 op1 = validize_mem (force_const_mem (mode, op1));
19637 /* We need to check memory alignment for SSE mode since attribute
19638 can make operands unaligned. */
19639 if (can_create_pseudo_p ()
19640 && SSE_REG_MODE_P (mode)
19641 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19642 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19644 rtx tmp[2];
19646 /* ix86_expand_vector_move_misalign() does not like both
19647 arguments in memory. */
19648 if (!register_operand (op0, mode)
19649 && !register_operand (op1, mode))
19650 op1 = force_reg (mode, op1);
19652 tmp[0] = op0; tmp[1] = op1;
19653 ix86_expand_vector_move_misalign (mode, tmp);
19654 return;
19657 /* If neither operand is a register, force operand1 into a register. */
19658 if (can_create_pseudo_p ()
19659 && !register_operand (op0, mode)
19660 && !register_operand (op1, mode))
19662 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19663 return;
19666 emit_insn (gen_rtx_SET (op0, op1));
19669 /* Split 32-byte AVX unaligned load and store if needed. */
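/* When the move is split, a misaligned 256-bit load is done as two
   16-byte loads whose halves are recombined via a VEC_CONCAT, and a
   misaligned store as two 16-byte stores using the vextractf128
   patterns selected below.  */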
19671 static void
19672 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19674 rtx m;
19675 rtx (*extract) (rtx, rtx, rtx);
19676 machine_mode mode;
19678 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19679 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19681 emit_insn (gen_rtx_SET (op0, op1));
19682 return;
19685 rtx orig_op0 = NULL_RTX;
19686 mode = GET_MODE (op0);
19687 switch (GET_MODE_CLASS (mode))
19689 case MODE_VECTOR_INT:
19690 case MODE_INT:
19691 if (mode != V32QImode)
19693 if (!MEM_P (op0))
19695 orig_op0 = op0;
19696 op0 = gen_reg_rtx (V32QImode);
19698 else
19699 op0 = gen_lowpart (V32QImode, op0);
19700 op1 = gen_lowpart (V32QImode, op1);
19701 mode = V32QImode;
19703 break;
19704 case MODE_VECTOR_FLOAT:
19705 break;
19706 default:
19707 gcc_unreachable ();
19710 switch (mode)
19712 default:
19713 gcc_unreachable ();
19714 case E_V32QImode:
19715 extract = gen_avx_vextractf128v32qi;
19716 mode = V16QImode;
19717 break;
19718 case E_V8SFmode:
19719 extract = gen_avx_vextractf128v8sf;
19720 mode = V4SFmode;
19721 break;
19722 case E_V4DFmode:
19723 extract = gen_avx_vextractf128v4df;
19724 mode = V2DFmode;
19725 break;
19728 if (MEM_P (op1))
19730 rtx r = gen_reg_rtx (mode);
19731 m = adjust_address (op1, mode, 0);
19732 emit_move_insn (r, m);
19733 m = adjust_address (op1, mode, 16);
19734 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19735 emit_move_insn (op0, r);
19737 else if (MEM_P (op0))
19739 m = adjust_address (op0, mode, 0);
19740 emit_insn (extract (m, op1, const0_rtx));
19741 m = adjust_address (op0, mode, 16);
19742 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19744 else
19745 gcc_unreachable ();
19747 if (orig_op0)
19748 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
19751 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19752 straight to ix86_expand_vector_move. */
19753 /* Code generation for scalar reg-reg moves of single and double precision data:
19754 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19755 movaps reg, reg
19756 else
19757 movss reg, reg
19758 if (x86_sse_partial_reg_dependency == true)
19759 movapd reg, reg
19760 else
19761 movsd reg, reg
19763 Code generation for scalar loads of double precision data:
19764 if (x86_sse_split_regs == true)
19765 movlpd mem, reg (gas syntax)
19766 else
19767 movsd mem, reg
19769 Code generation for unaligned packed loads of single precision data
19770 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19771 if (x86_sse_unaligned_move_optimal)
19772 movups mem, reg
19774 if (x86_sse_partial_reg_dependency == true)
19776 xorps reg, reg
19777 movlps mem, reg
19778 movhps mem+8, reg
19780 else
19782 movlps mem, reg
19783 movhps mem+8, reg
19786 Code generation for unaligned packed loads of double precision data
19787 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19788 if (x86_sse_unaligned_move_optimal)
19789 movupd mem, reg
19791 if (x86_sse_split_regs == true)
19793 movlpd mem, reg
19794 movhpd mem+8, reg
19796 else
19798 movsd mem, reg
19799 movhpd mem+8, reg
19803 void
19804 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19806 rtx op0, op1, m;
19808 op0 = operands[0];
19809 op1 = operands[1];
19811 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19812 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19814 emit_insn (gen_rtx_SET (op0, op1));
19815 return;
19818 if (TARGET_AVX)
19820 if (GET_MODE_SIZE (mode) == 32)
19821 ix86_avx256_split_vector_move_misalign (op0, op1);
19822 else
19823 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19824 emit_insn (gen_rtx_SET (op0, op1));
19825 return;
19828 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19829 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19831 emit_insn (gen_rtx_SET (op0, op1));
19832 return;
19835 /* ??? If we have typed data, then it would appear that using
19836 movdqu is the only way to get unaligned data loaded with
19837 integer type. */
19838 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19840 emit_insn (gen_rtx_SET (op0, op1));
19841 return;
19844 if (MEM_P (op1))
19846 if (TARGET_SSE2 && mode == V2DFmode)
19848 rtx zero;
19850 /* When SSE registers are split into halves, we can avoid
19851 writing to the top half twice. */
19852 if (TARGET_SSE_SPLIT_REGS)
19854 emit_clobber (op0);
19855 zero = op0;
19857 else
19859 /* ??? Not sure about the best option for the Intel chips.
19860 The following would seem to satisfy; the register is
19861 entirely cleared, breaking the dependency chain. We
19862 then store to the upper half, with a dependency depth
19863 of one. A rumor has it that Intel recommends two movsd
19864 followed by an unpacklpd, but this is unconfirmed. And
19865 given that the dependency depth of the unpacklpd would
19866 still be one, I'm not sure why this would be better. */
19867 zero = CONST0_RTX (V2DFmode);
19870 m = adjust_address (op1, DFmode, 0);
19871 emit_insn (gen_sse2_loadlpd (op0, zero, m));
19872 m = adjust_address (op1, DFmode, 8);
19873 emit_insn (gen_sse2_loadhpd (op0, op0, m));
19875 else
19877 rtx t;
19879 if (mode != V4SFmode)
19880 t = gen_reg_rtx (V4SFmode);
19881 else
19882 t = op0;
19884 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
19885 emit_move_insn (t, CONST0_RTX (V4SFmode));
19886 else
19887 emit_clobber (t);
19889 m = adjust_address (op1, V2SFmode, 0);
19890 emit_insn (gen_sse_loadlps (t, t, m));
19891 m = adjust_address (op1, V2SFmode, 8);
19892 emit_insn (gen_sse_loadhps (t, t, m));
19893 if (mode != V4SFmode)
19894 emit_move_insn (op0, gen_lowpart (mode, t));
19897 else if (MEM_P (op0))
19899 if (TARGET_SSE2 && mode == V2DFmode)
19901 m = adjust_address (op0, DFmode, 0);
19902 emit_insn (gen_sse2_storelpd (m, op1));
19903 m = adjust_address (op0, DFmode, 8);
19904 emit_insn (gen_sse2_storehpd (m, op1));
19906 else
19908 if (mode != V4SFmode)
19909 op1 = gen_lowpart (V4SFmode, op1);
19911 m = adjust_address (op0, V2SFmode, 0);
19912 emit_insn (gen_sse_storelps (m, op1));
19913 m = adjust_address (op0, V2SFmode, 8);
19914 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
19917 else
19918 gcc_unreachable ();
19921 /* Helper function of ix86_fixup_binary_operands to canonicalize
19922 operand order. Returns true if the operands should be swapped. */
19924 static bool
19925 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
19926 rtx operands[])
19928 rtx dst = operands[0];
19929 rtx src1 = operands[1];
19930 rtx src2 = operands[2];
19932 /* If the operation is not commutative, we can't do anything. */
19933 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
19934 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
19935 return false;
19937 /* Highest priority is that src1 should match dst. */
19938 if (rtx_equal_p (dst, src1))
19939 return false;
19940 if (rtx_equal_p (dst, src2))
19941 return true;
19943 /* Next highest priority is that immediate constants come second. */
19944 if (immediate_operand (src2, mode))
19945 return false;
19946 if (immediate_operand (src1, mode))
19947 return true;
19949 /* Lowest priority is that memory references should come second. */
19950 if (MEM_P (src2))
19951 return false;
19952 if (MEM_P (src1))
19953 return true;
19955 return false;
19959 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
19960 destination to use for the operation. If different from the true
19961 destination in operands[0], a copy operation will be required. */
19964 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
19965 rtx operands[])
19967 rtx dst = operands[0];
19968 rtx src1 = operands[1];
19969 rtx src2 = operands[2];
19971 /* Canonicalize operand order. */
19972 if (ix86_swap_binary_operands_p (code, mode, operands))
19974 /* It is invalid to swap operands of different modes. */
19975 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
19977 std::swap (src1, src2);
19980 /* Both source operands cannot be in memory. */
19981 if (MEM_P (src1) && MEM_P (src2))
19983 /* Optimization: Only read from memory once. */
19984 if (rtx_equal_p (src1, src2))
19986 src2 = force_reg (mode, src2);
19987 src1 = src2;
19989 else if (rtx_equal_p (dst, src1))
19990 src2 = force_reg (mode, src2);
19991 else
19992 src1 = force_reg (mode, src1);
19995 /* If the destination is memory, and we do not have matching source
19996 operands, do things in registers. */
19997 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
19998 dst = gen_reg_rtx (mode);
20000 /* Source 1 cannot be a constant. */
20001 if (CONSTANT_P (src1))
20002 src1 = force_reg (mode, src1);
20004 /* Source 1 cannot be a non-matching memory. */
20005 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20006 src1 = force_reg (mode, src1);
20008 /* Improve address combine. */
20009 if (code == PLUS
20010 && GET_MODE_CLASS (mode) == MODE_INT
20011 && MEM_P (src2))
20012 src2 = force_reg (mode, src2);
20014 operands[1] = src1;
20015 operands[2] = src2;
20016 return dst;
20019 /* Similarly, but assume that the destination has already been
20020 set up properly. */
20022 void
20023 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20024 machine_mode mode, rtx operands[])
20026 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20027 gcc_assert (dst == operands[0]);
20030 /* Attempt to expand a binary operator. Make the expansion closer to the
20031 actual machine, than just general_operand, which will allow 3 separate
20032 memory references (one output, two input) in a single insn. */
20034 void
20035 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20036 rtx operands[])
20038 rtx src1, src2, dst, op, clob;
20040 dst = ix86_fixup_binary_operands (code, mode, operands);
20041 src1 = operands[1];
20042 src2 = operands[2];
20044 /* Emit the instruction. */
20046 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20048 if (reload_completed
20049 && code == PLUS
20050 && !rtx_equal_p (dst, src1))
20052 /* This is going to be an LEA; avoid splitting it later. */
20053 emit_insn (op);
20055 else
20057 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20058 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20061 /* Fix up the destination if needed. */
20062 if (dst != operands[0])
20063 emit_move_insn (operands[0], dst);
20066 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20067 the given OPERANDS. */
20069 void
20070 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20071 rtx operands[])
20073 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20074 if (SUBREG_P (operands[1]))
20076 op1 = operands[1];
20077 op2 = operands[2];
20079 else if (SUBREG_P (operands[2]))
20081 op1 = operands[2];
20082 op2 = operands[1];
20084 /* Optimize (__m128i) d | (__m128i) e and similar code
20085 when d and e are float vectors into a float vector logical
20086 insn. In C/C++ without using intrinsics there is no other way
20087 to express a vector logical operation on float vectors than
20088 to cast them temporarily to integer vectors. */
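/* Illustrative example of user code this catches (assuming d and e are
   __m128 values):
     __m128i x = (__m128i) d | (__m128i) e;
   which can then be emitted as a float-domain logical insn such as orps
   instead of its integer-domain counterpart.  */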
20089 if (op1
20090 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20091 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20092 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20093 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20094 && SUBREG_BYTE (op1) == 0
20095 && (GET_CODE (op2) == CONST_VECTOR
20096 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20097 && SUBREG_BYTE (op2) == 0))
20098 && can_create_pseudo_p ())
20100 rtx dst;
20101 switch (GET_MODE (SUBREG_REG (op1)))
20103 case E_V4SFmode:
20104 case E_V8SFmode:
20105 case E_V16SFmode:
20106 case E_V2DFmode:
20107 case E_V4DFmode:
20108 case E_V8DFmode:
20109 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20110 if (GET_CODE (op2) == CONST_VECTOR)
20112 op2 = gen_lowpart (GET_MODE (dst), op2);
20113 op2 = force_reg (GET_MODE (dst), op2);
20115 else
20117 op1 = operands[1];
20118 op2 = SUBREG_REG (operands[2]);
20119 if (!vector_operand (op2, GET_MODE (dst)))
20120 op2 = force_reg (GET_MODE (dst), op2);
20122 op1 = SUBREG_REG (op1);
20123 if (!vector_operand (op1, GET_MODE (dst)))
20124 op1 = force_reg (GET_MODE (dst), op1);
20125 emit_insn (gen_rtx_SET (dst,
20126 gen_rtx_fmt_ee (code, GET_MODE (dst),
20127 op1, op2)));
20128 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20129 return;
20130 default:
20131 break;
20134 if (!vector_operand (operands[1], mode))
20135 operands[1] = force_reg (mode, operands[1]);
20136 if (!vector_operand (operands[2], mode))
20137 operands[2] = force_reg (mode, operands[2]);
20138 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20139 emit_insn (gen_rtx_SET (operands[0],
20140 gen_rtx_fmt_ee (code, mode, operands[1],
20141 operands[2])));
20144 /* Return TRUE or FALSE depending on whether the binary operator meets the
20145 appropriate constraints. */
20147 bool
20148 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20149 rtx operands[3])
20151 rtx dst = operands[0];
20152 rtx src1 = operands[1];
20153 rtx src2 = operands[2];
20155 /* Both source operands cannot be in memory. */
20156 if (MEM_P (src1) && MEM_P (src2))
20157 return false;
20159 /* Canonicalize operand order for commutative operators. */
20160 if (ix86_swap_binary_operands_p (code, mode, operands))
20161 std::swap (src1, src2);
20163 /* If the destination is memory, we must have a matching source operand. */
20164 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20165 return false;
20167 /* Source 1 cannot be a constant. */
20168 if (CONSTANT_P (src1))
20169 return false;
20171 /* Source 1 cannot be a non-matching memory. */
20172 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20173 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20174 return (code == AND
20175 && (mode == HImode
20176 || mode == SImode
20177 || (TARGET_64BIT && mode == DImode))
20178 && satisfies_constraint_L (src2));
20180 return true;
20183 /* Attempt to expand a unary operator. Make the expansion closer to the
20184 actual machine, than just general_operand, which will allow 2 separate
20185 memory references (one output, one input) in a single insn. */
20187 void
20188 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20189 rtx operands[])
20191 bool matching_memory = false;
20192 rtx src, dst, op, clob;
20194 dst = operands[0];
20195 src = operands[1];
20197 /* If the destination is memory, and we do not have matching source
20198 operands, do things in registers. */
20199 if (MEM_P (dst))
20201 if (rtx_equal_p (dst, src))
20202 matching_memory = true;
20203 else
20204 dst = gen_reg_rtx (mode);
20207 /* When source operand is memory, destination must match. */
20208 if (MEM_P (src) && !matching_memory)
20209 src = force_reg (mode, src);
20211 /* Emit the instruction. */
20213 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20215 if (code == NOT)
20216 emit_insn (op);
20217 else
20219 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20220 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20223 /* Fix up the destination if needed. */
20224 if (dst != operands[0])
20225 emit_move_insn (operands[0], dst);
20228 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20229 divisor are within the range [0-255]. */
20231 void
20232 ix86_split_idivmod (machine_mode mode, rtx operands[],
20233 bool signed_p)
20235 rtx_code_label *end_label, *qimode_label;
20236 rtx div, mod;
20237 rtx_insn *insn;
20238 rtx scratch, tmp0, tmp1, tmp2;
20239 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20240 rtx (*gen_zero_extend) (rtx, rtx);
20241 rtx (*gen_test_ccno_1) (rtx, rtx);
20243 switch (mode)
20245 case E_SImode:
20246 if (GET_MODE (operands[0]) == SImode)
20248 if (GET_MODE (operands[1]) == SImode)
20249 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20250 else
20251 gen_divmod4_1
20252 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20253 gen_zero_extend = gen_zero_extendqisi2;
20255 else
20257 gen_divmod4_1
20258 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20259 gen_zero_extend = gen_zero_extendqidi2;
20261 gen_test_ccno_1 = gen_testsi_ccno_1;
20262 break;
20263 case E_DImode:
20264 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20265 gen_test_ccno_1 = gen_testdi_ccno_1;
20266 gen_zero_extend = gen_zero_extendqidi2;
20267 break;
20268 default:
20269 gcc_unreachable ();
20272 end_label = gen_label_rtx ();
20273 qimode_label = gen_label_rtx ();
20275 scratch = gen_reg_rtx (mode);
20277 /* Use 8bit unsigned divmod if dividend and divisor are within
20278 the range [0-255]. */
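/* OR the dividend and divisor together and test the result against ~0xff;
   the conditional jump below is taken only when no bit above bit 7 is set
   in either operand, i.e. both values fit in 8 bits.  */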
20279 emit_move_insn (scratch, operands[2]);
20280 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20281 scratch, 1, OPTAB_DIRECT);
20282 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20283 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20284 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20285 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20286 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20287 pc_rtx);
20288 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20289 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20290 JUMP_LABEL (insn) = qimode_label;
20292 /* Generate original signed/unsigned divmod. */
20293 div = gen_divmod4_1 (operands[0], operands[1],
20294 operands[2], operands[3]);
20295 emit_insn (div);
20297 /* Branch to the end. */
20298 emit_jump_insn (gen_jump (end_label));
20299 emit_barrier ();
20301 /* Generate 8bit unsigned divide. */
20302 emit_label (qimode_label);
20303 /* Don't use operands[0] for result of 8bit divide since not all
20304 registers support QImode ZERO_EXTRACT. */
20305 tmp0 = lowpart_subreg (HImode, scratch, mode);
20306 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20307 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20308 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20310 if (signed_p)
20312 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20313 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20315 else
20317 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20318 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20320 if (mode == SImode)
20322 if (GET_MODE (operands[0]) != SImode)
20323 div = gen_rtx_ZERO_EXTEND (DImode, div);
20324 if (GET_MODE (operands[1]) != SImode)
20325 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20328 /* Extract remainder from AH. */
20329 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20330 tmp0, GEN_INT (8), GEN_INT (8));
20331 if (REG_P (operands[1]))
20332 insn = emit_move_insn (operands[1], tmp1);
20333 else
20335 /* Need a new scratch register since the old one has result
20336 of 8bit divide. */
20337 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20338 emit_move_insn (scratch, tmp1);
20339 insn = emit_move_insn (operands[1], scratch);
20341 set_unique_reg_note (insn, REG_EQUAL, mod);
20343 /* Zero extend quotient from AL. */
20344 tmp1 = gen_lowpart (QImode, tmp0);
20345 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20346 set_unique_reg_note (insn, REG_EQUAL, div);
20348 emit_label (end_label);
20351 #define LEA_MAX_STALL (3)
20352 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
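/* Distances in the AGU/LEA heuristics below are measured in half-cycles
   (see increase_distance and the final ">> 1" in the distance functions),
   so the search threshold is twice the maximal tolerated stall.  */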
20354 /* Increase given DISTANCE in half-cycles according to
20355 dependencies between PREV and NEXT instructions.
20356 Add 1 half-cycle if there is no dependency and
20357 go to the next cycle if there is some dependency. */
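/* The expression "distance + (distance & 1) + 2" below rounds an odd
   half-cycle count up to an even one and then advances by one full cycle
   (two half-cycles).  */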
20359 static unsigned int
20360 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20362 df_ref def, use;
20364 if (!prev || !next)
20365 return distance + (distance & 1) + 2;
20367 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20368 return distance + 1;
20370 FOR_EACH_INSN_USE (use, next)
20371 FOR_EACH_INSN_DEF (def, prev)
20372 if (!DF_REF_IS_ARTIFICIAL (def)
20373 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20374 return distance + (distance & 1) + 2;
20376 return distance + 1;
20379 /* Function checks if instruction INSN defines register number
20380 REGNO1 or REGNO2. */
20382 static bool
20383 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20384 rtx_insn *insn)
20386 df_ref def;
20388 FOR_EACH_INSN_DEF (def, insn)
20389 if (DF_REF_REG_DEF_P (def)
20390 && !DF_REF_IS_ARTIFICIAL (def)
20391 && (regno1 == DF_REF_REGNO (def)
20392 || regno2 == DF_REF_REGNO (def)))
20393 return true;
20395 return false;
20398 /* Function checks if instruction INSN uses register number
20399 REGNO as a part of address expression. */
20401 static bool
20402 insn_uses_reg_mem (unsigned int regno, rtx insn)
20404 df_ref use;
20406 FOR_EACH_INSN_USE (use, insn)
20407 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20408 return true;
20410 return false;
20413 /* Search backward for non-agu definition of register number REGNO1
20414 or register number REGNO2 in basic block starting from instruction
20415 START up to head of basic block or instruction INSN.
20417 Function puts true value into *FOUND var if definition was found
20418 and false otherwise.
20420 Distance in half-cycles between START and found instruction or head
20421 of BB is added to DISTANCE and returned. */
20423 static int
20424 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20425 rtx_insn *insn, int distance,
20426 rtx_insn *start, bool *found)
20428 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20429 rtx_insn *prev = start;
20430 rtx_insn *next = NULL;
20432 *found = false;
20434 while (prev
20435 && prev != insn
20436 && distance < LEA_SEARCH_THRESHOLD)
20438 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20440 distance = increase_distance (prev, next, distance);
20441 if (insn_defines_reg (regno1, regno2, prev))
20443 if (recog_memoized (prev) < 0
20444 || get_attr_type (prev) != TYPE_LEA)
20446 *found = true;
20447 return distance;
20451 next = prev;
20453 if (prev == BB_HEAD (bb))
20454 break;
20456 prev = PREV_INSN (prev);
20459 return distance;
20462 /* Search backward for non-agu definition of register number REGNO1
20463 or register number REGNO2 in INSN's basic block until
20464 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20465 2. Reach neighbor BBs boundary, or
20466 3. Reach agu definition.
20467 Returns the distance between the non-agu definition point and INSN.
20468 If no definition point, returns -1. */
20470 static int
20471 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20472 rtx_insn *insn)
20474 basic_block bb = BLOCK_FOR_INSN (insn);
20475 int distance = 0;
20476 bool found = false;
20478 if (insn != BB_HEAD (bb))
20479 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20480 distance, PREV_INSN (insn),
20481 &found);
20483 if (!found && distance < LEA_SEARCH_THRESHOLD)
20485 edge e;
20486 edge_iterator ei;
20487 bool simple_loop = false;
20489 FOR_EACH_EDGE (e, ei, bb->preds)
20490 if (e->src == bb)
20492 simple_loop = true;
20493 break;
20496 if (simple_loop)
20497 distance = distance_non_agu_define_in_bb (regno1, regno2,
20498 insn, distance,
20499 BB_END (bb), &found);
20500 else
20502 int shortest_dist = -1;
20503 bool found_in_bb = false;
20505 FOR_EACH_EDGE (e, ei, bb->preds)
20507 int bb_dist
20508 = distance_non_agu_define_in_bb (regno1, regno2,
20509 insn, distance,
20510 BB_END (e->src),
20511 &found_in_bb);
20512 if (found_in_bb)
20514 if (shortest_dist < 0)
20515 shortest_dist = bb_dist;
20516 else if (bb_dist > 0)
20517 shortest_dist = MIN (bb_dist, shortest_dist);
20519 found = true;
20523 distance = shortest_dist;
20527 /* get_attr_type may modify recog data. We want to make sure
20528 that recog data is valid for instruction INSN, on which
20529 distance_non_agu_define is called. INSN is unchanged here. */
20530 extract_insn_cached (insn);
20532 if (!found)
20533 return -1;
20535 return distance >> 1;
20538 /* Return the distance in half-cycles between INSN and the next
20539 insn that uses register number REGNO in a memory address, added
20540 to DISTANCE. Return -1 if REGNO is set.
20542 Put true value into *FOUND if register usage was found and
20543 false otherwise.
20544 Put true value into *REDEFINED if register redefinition was
20545 found and false otherwise. */
20547 static int
20548 distance_agu_use_in_bb (unsigned int regno,
20549 rtx_insn *insn, int distance, rtx_insn *start,
20550 bool *found, bool *redefined)
20552 basic_block bb = NULL;
20553 rtx_insn *next = start;
20554 rtx_insn *prev = NULL;
20556 *found = false;
20557 *redefined = false;
20559 if (start != NULL_RTX)
20561 bb = BLOCK_FOR_INSN (start);
20562 if (start != BB_HEAD (bb))
20563 /* If insn and start belong to the same bb, set prev to insn,
20564 so the call to increase_distance will increase the distance
20565 between insns by 1. */
20566 prev = insn;
20569 while (next
20570 && next != insn
20571 && distance < LEA_SEARCH_THRESHOLD)
20573 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20575 distance = increase_distance(prev, next, distance);
20576 if (insn_uses_reg_mem (regno, next))
20578 /* Return DISTANCE if OP0 is used in memory
20579 address in NEXT. */
20580 *found = true;
20581 return distance;
20584 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20586 /* Return -1 if OP0 is set in NEXT. */
20587 *redefined = true;
20588 return -1;
20591 prev = next;
20594 if (next == BB_END (bb))
20595 break;
20597 next = NEXT_INSN (next);
20600 return distance;
20603 /* Return the distance between INSN and the next insn that uses
20604 register number REGNO0 in a memory address. Return -1 if no such
20605 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20607 static int
20608 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20610 basic_block bb = BLOCK_FOR_INSN (insn);
20611 int distance = 0;
20612 bool found = false;
20613 bool redefined = false;
20615 if (insn != BB_END (bb))
20616 distance = distance_agu_use_in_bb (regno0, insn, distance,
20617 NEXT_INSN (insn),
20618 &found, &redefined);
20620 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20622 edge e;
20623 edge_iterator ei;
20624 bool simple_loop = false;
20626 FOR_EACH_EDGE (e, ei, bb->succs)
20627 if (e->dest == bb)
20629 simple_loop = true;
20630 break;
20633 if (simple_loop)
20634 distance = distance_agu_use_in_bb (regno0, insn,
20635 distance, BB_HEAD (bb),
20636 &found, &redefined);
20637 else
20639 int shortest_dist = -1;
20640 bool found_in_bb = false;
20641 bool redefined_in_bb = false;
20643 FOR_EACH_EDGE (e, ei, bb->succs)
20645 int bb_dist
20646 = distance_agu_use_in_bb (regno0, insn,
20647 distance, BB_HEAD (e->dest),
20648 &found_in_bb, &redefined_in_bb);
20649 if (found_in_bb)
20651 if (shortest_dist < 0)
20652 shortest_dist = bb_dist;
20653 else if (bb_dist > 0)
20654 shortest_dist = MIN (bb_dist, shortest_dist);
20656 found = true;
20660 distance = shortest_dist;
20664 if (!found || redefined)
20665 return -1;
20667 return distance >> 1;
20670 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20671 there is a dilemma of choosing LEA or ADD.
20672 Negative value: ADD is preferred over LEA.
20673 Zero: Neutral.
20674 Positive value: LEA is preferred over ADD. */
20675 #define IX86_LEA_PRIORITY 0
20677 /* Return true if using lea INSN has a performance advantage
20678 over a sequence of instructions. The instruction sequence has
20679 SPLIT_COST cycles higher latency than the lea latency. */
20681 static bool
20682 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20683 unsigned int regno2, int split_cost, bool has_scale)
20685 int dist_define, dist_use;
20687 /* For Silvermont, if using a 2-source or 3-source LEA for
20688 non-destructive destination purposes, or due to wanting the
20689 ability to use SCALE, the use of LEA is justified. */
20690 if (TARGET_SILVERMONT || TARGET_INTEL)
20692 if (has_scale)
20693 return true;
20694 if (split_cost < 1)
20695 return false;
20696 if (regno0 == regno1 || regno0 == regno2)
20697 return false;
20698 return true;
20701 dist_define = distance_non_agu_define (regno1, regno2, insn);
20702 dist_use = distance_agu_use (regno0, insn);
20704 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20706 /* If there is no non-AGU operand definition, no AGU
20707 operand usage and the split cost is 0, then both the lea
20708 and non-lea variants have the same priority. Currently
20709 we prefer lea for 64-bit code and non-lea for 32-bit
20710 code. */
20711 if (dist_use < 0 && split_cost == 0)
20712 return TARGET_64BIT || IX86_LEA_PRIORITY;
20713 else
20714 return true;
20717 /* The longer the definition distance, the more preferable lea is.
20718 Here we change it to take into account splitting cost and
20719 lea priority. */
20720 dist_define += split_cost + IX86_LEA_PRIORITY;
20722 /* If there is no use in memory address then we just check
20723 that split cost exceeds AGU stall. */
20724 if (dist_use < 0)
20725 return dist_define > LEA_MAX_STALL;
20727 /* If this insn has both backward non-agu dependence and forward
20728 agu dependence, the one with short distance takes effect. */
20729 return dist_define >= dist_use;
20732 /* Return true if it is legal to clobber flags by INSN and
20733 false otherwise. */
20735 static bool
20736 ix86_ok_to_clobber_flags (rtx_insn *insn)
20738 basic_block bb = BLOCK_FOR_INSN (insn);
20739 df_ref use;
20740 bitmap live;
20742 while (insn)
20744 if (NONDEBUG_INSN_P (insn))
20746 FOR_EACH_INSN_USE (use, insn)
20747 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20748 return false;
20750 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20751 return true;
20754 if (insn == BB_END (bb))
20755 break;
20757 insn = NEXT_INSN (insn);
20760 live = df_get_live_out(bb);
20761 return !REGNO_REG_SET_P (live, FLAGS_REG);
20764 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20765 move and add to avoid AGU stalls. */
20767 bool
20768 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20770 unsigned int regno0, regno1, regno2;
20772 /* Check if we need to optimize. */
20773 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20774 return false;
20776 /* Check it is correct to split here. */
20777 if (!ix86_ok_to_clobber_flags(insn))
20778 return false;
20780 regno0 = true_regnum (operands[0]);
20781 regno1 = true_regnum (operands[1]);
20782 regno2 = true_regnum (operands[2]);
20784 /* We need to split only adds with non-destructive
20785 destination operand. */
20786 if (regno0 == regno1 || regno0 == regno2)
20787 return false;
20788 else
20789 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20792 /* Return true if we should emit lea instruction instead of mov
20793 instruction. */
20795 bool
20796 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20798 unsigned int regno0, regno1;
20800 /* Check if we need to optimize. */
20801 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20802 return false;
20804 /* Use lea for reg to reg moves only. */
20805 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20806 return false;
20808 regno0 = true_regnum (operands[0]);
20809 regno1 = true_regnum (operands[1]);
20811 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20814 /* Return true if we need to split lea into a sequence of
20815 instructions to avoid AGU stalls. */
20817 bool
20818 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20820 unsigned int regno0, regno1, regno2;
20821 int split_cost;
20822 struct ix86_address parts;
20823 int ok;
20825 /* Check we need to optimize. */
20826 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20827 return false;
20829 /* The "at least two components" test below might not catch simple
20830 move or zero extension insns if parts.base is non-NULL and parts.disp
20831 is const0_rtx as the only components in the address, e.g. if the
20832 register is %rbp or %r13. As this test is much cheaper and moves or
20833 zero extensions are the common case, do this check first. */
20834 if (REG_P (operands[1])
20835 || (SImode_address_operand (operands[1], VOIDmode)
20836 && REG_P (XEXP (operands[1], 0))))
20837 return false;
20839 /* Check if it is OK to split here. */
20840 if (!ix86_ok_to_clobber_flags (insn))
20841 return false;
20843 ok = ix86_decompose_address (operands[1], &parts);
20844 gcc_assert (ok);
20846 /* There should be at least two components in the address. */
20847 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20848 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20849 return false;
20851 /* We should not split into add if a non-legitimate PIC
20852 operand is used as the displacement. */
20853 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20854 return false;
20856 regno0 = true_regnum (operands[0]) ;
20857 regno1 = INVALID_REGNUM;
20858 regno2 = INVALID_REGNUM;
20860 if (parts.base)
20861 regno1 = true_regnum (parts.base);
20862 if (parts.index)
20863 regno2 = true_regnum (parts.index);
20865 split_cost = 0;
20867 /* Compute how many cycles we will add to execution time
20868 if we split the lea into a sequence of instructions. */
20869 if (parts.base || parts.index)
20871 /* Have to use mov instruction if non-destructive
20872 destination form is used. */
20873 if (regno1 != regno0 && regno2 != regno0)
20874 split_cost += 1;
20876 /* Have to add index to base if both exist. */
20877 if (parts.base && parts.index)
20878 split_cost += 1;
20880 /* Have to use shift and adds if scale is 2 or greater. */
20881 if (parts.scale > 1)
20883 if (regno0 != regno1)
20884 split_cost += 1;
20885 else if (regno2 == regno0)
20886 split_cost += 4;
20887 else
20888 split_cost += parts.scale;
20891 /* Have to use add instruction with immediate if
20892 disp is non zero. */
20893 if (parts.disp && parts.disp != const0_rtx)
20894 split_cost += 1;
20896 /* Subtract the price of lea. */
20897 split_cost -= 1;
20900 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
20901 parts.scale > 1);
20904 /* Emit x86 binary operand CODE in mode MODE, where the first operand
20905 matches destination. RTX includes clobber of FLAGS_REG. */
20907 static void
20908 ix86_emit_binop (enum rtx_code code, machine_mode mode,
20909 rtx dst, rtx src)
20911 rtx op, clob;
20913 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
20914 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20916 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20919 /* Return true if regno1 def is nearest to the insn. */
20921 static bool
20922 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
20924 rtx_insn *prev = insn;
20925 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
20927 if (insn == start)
20928 return false;
20929 while (prev && prev != start)
20931 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
20933 prev = PREV_INSN (prev);
20934 continue;
20936 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
20937 return true;
20938 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
20939 return false;
20940 prev = PREV_INSN (prev);
20943 /* None of the regs is defined in the bb. */
20944 return false;
20947 /* Split lea instructions into a sequence of instructions
20948 which are executed on the ALU to avoid AGU stalls.
20949 It is assumed that it is allowed to clobber the flags register
20950 at the lea position. */
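/* Illustrative example: with a destination distinct from both sources,
   "lea disp(%base,%index,4), %dst" may be split into
       mov  %index, %dst
       shl  $2, %dst
       add  %base, %dst
       add  $disp, %dst
   following the scale/base/displacement handling below.  */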
20952 void
20953 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
20955 unsigned int regno0, regno1, regno2;
20956 struct ix86_address parts;
20957 rtx target, tmp;
20958 int ok, adds;
20960 ok = ix86_decompose_address (operands[1], &parts);
20961 gcc_assert (ok);
20963 target = gen_lowpart (mode, operands[0]);
20965 regno0 = true_regnum (target);
20966 regno1 = INVALID_REGNUM;
20967 regno2 = INVALID_REGNUM;
20969 if (parts.base)
20971 parts.base = gen_lowpart (mode, parts.base);
20972 regno1 = true_regnum (parts.base);
20975 if (parts.index)
20977 parts.index = gen_lowpart (mode, parts.index);
20978 regno2 = true_regnum (parts.index);
20981 if (parts.disp)
20982 parts.disp = gen_lowpart (mode, parts.disp);
20984 if (parts.scale > 1)
20986 /* Case r1 = r1 + ... */
20987 if (regno1 == regno0)
20989 /* If we have a case r1 = r1 + C * r2 then we
20990 should use multiplication which is very
20991 expensive. Assume the cost model is wrong if we
20992 have such a case here. */
20993 gcc_assert (regno2 != regno0);
20995 for (adds = parts.scale; adds > 0; adds--)
20996 ix86_emit_binop (PLUS, mode, target, parts.index);
20998 else
21000 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21001 if (regno0 != regno2)
21002 emit_insn (gen_rtx_SET (target, parts.index));
21004 /* Use shift for scaling. */
21005 ix86_emit_binop (ASHIFT, mode, target,
21006 GEN_INT (exact_log2 (parts.scale)));
21008 if (parts.base)
21009 ix86_emit_binop (PLUS, mode, target, parts.base);
21011 if (parts.disp && parts.disp != const0_rtx)
21012 ix86_emit_binop (PLUS, mode, target, parts.disp);
21015 else if (!parts.base && !parts.index)
21017 gcc_assert(parts.disp);
21018 emit_insn (gen_rtx_SET (target, parts.disp));
21020 else
21022 if (!parts.base)
21024 if (regno0 != regno2)
21025 emit_insn (gen_rtx_SET (target, parts.index));
21027 else if (!parts.index)
21029 if (regno0 != regno1)
21030 emit_insn (gen_rtx_SET (target, parts.base));
21032 else
21034 if (regno0 == regno1)
21035 tmp = parts.index;
21036 else if (regno0 == regno2)
21037 tmp = parts.base;
21038 else
21040 rtx tmp1;
21042 /* Find better operand for SET instruction, depending
21043 on which definition is farther from the insn. */
21044 if (find_nearest_reg_def (insn, regno1, regno2))
21045 tmp = parts.index, tmp1 = parts.base;
21046 else
21047 tmp = parts.base, tmp1 = parts.index;
21049 emit_insn (gen_rtx_SET (target, tmp));
21051 if (parts.disp && parts.disp != const0_rtx)
21052 ix86_emit_binop (PLUS, mode, target, parts.disp);
21054 ix86_emit_binop (PLUS, mode, target, tmp1);
21055 return;
21058 ix86_emit_binop (PLUS, mode, target, tmp);
21061 if (parts.disp && parts.disp != const0_rtx)
21062 ix86_emit_binop (PLUS, mode, target, parts.disp);
21066 /* Return true if it is ok to optimize an ADD operation to LEA
21067 operation to avoid flag register consumption. For most processors,
21068 ADD is faster than LEA. For processors like BONNELL, if the
21069 destination register of LEA holds an actual address which will be
21070 used soon, LEA is better and otherwise ADD is better. */
21072 bool
21073 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21075 unsigned int regno0 = true_regnum (operands[0]);
21076 unsigned int regno1 = true_regnum (operands[1]);
21077 unsigned int regno2 = true_regnum (operands[2]);
21079 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21080 if (regno0 != regno1 && regno0 != regno2)
21081 return true;
21083 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21084 return false;
21086 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21089 /* Return true if destination reg of SET_BODY is shift count of
21090 USE_BODY. */
21092 static bool
21093 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21095 rtx set_dest;
21096 rtx shift_rtx;
21097 int i;
21099 /* Retrieve destination of SET_BODY. */
21100 switch (GET_CODE (set_body))
21102 case SET:
21103 set_dest = SET_DEST (set_body);
21104 if (!set_dest || !REG_P (set_dest))
21105 return false;
21106 break;
21107 case PARALLEL:
21108 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21109 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21110 use_body))
21111 return true;
21112 /* FALLTHROUGH */
21113 default:
21114 return false;
21117 /* Retrieve shift count of USE_BODY. */
21118 switch (GET_CODE (use_body))
21120 case SET:
21121 shift_rtx = XEXP (use_body, 1);
21122 break;
21123 case PARALLEL:
21124 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21125 if (ix86_dep_by_shift_count_body (set_body,
21126 XVECEXP (use_body, 0, i)))
21127 return true;
21128 /* FALLTHROUGH */
21129 default:
21130 return false;
21133 if (shift_rtx
21134 && (GET_CODE (shift_rtx) == ASHIFT
21135 || GET_CODE (shift_rtx) == LSHIFTRT
21136 || GET_CODE (shift_rtx) == ASHIFTRT
21137 || GET_CODE (shift_rtx) == ROTATE
21138 || GET_CODE (shift_rtx) == ROTATERT))
21140 rtx shift_count = XEXP (shift_rtx, 1);
21142 /* Return true if shift count is dest of SET_BODY. */
21143 if (REG_P (shift_count))
21145 /* Check reload_completed since this can be invoked before register
21146 allocation by the pre-reload scheduler. */
21147 if (reload_completed
21148 && true_regnum (set_dest) == true_regnum (shift_count))
21149 return true;
21150 else if (REGNO(set_dest) == REGNO(shift_count))
21151 return true;
21155 return false;
21158 /* Return true if destination reg of SET_INSN is shift count of
21159 USE_INSN. */
21161 bool
21162 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21164 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21165 PATTERN (use_insn));
21168 /* Return TRUE or FALSE depending on whether the unary operator meets the
21169 appropriate constraints. */
21171 bool
21172 ix86_unary_operator_ok (enum rtx_code,
21173 machine_mode,
21174 rtx operands[2])
21176 /* If one of operands is memory, source and destination must match. */
21177 if ((MEM_P (operands[0])
21178 || MEM_P (operands[1]))
21179 && ! rtx_equal_p (operands[0], operands[1]))
21180 return false;
21181 return true;
21184 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21185 are ok, keeping in mind the possible movddup alternative. */
21187 bool
21188 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21190 if (MEM_P (operands[0]))
21191 return rtx_equal_p (operands[0], operands[1 + high]);
21192 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21193 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21194 return true;
21197 /* Post-reload splitter for converting an SF or DFmode value in an
21198 SSE register into an unsigned SImode. */
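/* Roughly, the sequence emitted below computes
     mask = (value >= 2^31);
     value -= mask ? 2^31 : 0;
   converts the result with a signed cvtt insn, and then xors it with the
   mask shifted into bit 31, which re-adds 2^31 for the large inputs.  */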
21200 void
21201 ix86_split_convert_uns_si_sse (rtx operands[])
21203 machine_mode vecmode;
21204 rtx value, large, zero_or_two31, input, two31, x;
21206 large = operands[1];
21207 zero_or_two31 = operands[2];
21208 input = operands[3];
21209 two31 = operands[4];
21210 vecmode = GET_MODE (large);
21211 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21213 /* Load up the value into the low element. We must ensure that the other
21214 elements are valid floats -- zero is the easiest such value. */
21215 if (MEM_P (input))
21217 if (vecmode == V4SFmode)
21218 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21219 else
21220 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21222 else
21224 input = gen_rtx_REG (vecmode, REGNO (input));
21225 emit_move_insn (value, CONST0_RTX (vecmode));
21226 if (vecmode == V4SFmode)
21227 emit_insn (gen_sse_movss (value, value, input));
21228 else
21229 emit_insn (gen_sse2_movsd (value, value, input));
21232 emit_move_insn (large, two31);
21233 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21235 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21236 emit_insn (gen_rtx_SET (large, x));
21238 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21239 emit_insn (gen_rtx_SET (zero_or_two31, x));
21241 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21242 emit_insn (gen_rtx_SET (value, x));
21244 large = gen_rtx_REG (V4SImode, REGNO (large));
21245 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21247 x = gen_rtx_REG (V4SImode, REGNO (value));
21248 if (vecmode == V4SFmode)
21249 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21250 else
21251 emit_insn (gen_sse2_cvttpd2dq (x, value));
21252 value = x;
21254 emit_insn (gen_xorv4si3 (value, value, large));
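/* Rough worked example of the sequence above (illustration only): for an
input of 3.0e9, which is >= 0x1p31, the LE compare produces an all-ones
mask, so 0x1p31 is subtracted before the signed truncation, yielding
3000000000 - 2147483648 = 852516352; the final xor with the mask shifted
into bit 31 adds 0x80000000 back, giving the unsigned result 3000000000.
Inputs below 0x1p31 see a zero mask and are truncated unchanged. */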
21257 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21258 Expects the 64-bit DImode to be supplied in a pair of integral
21259 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21260 -mfpmath=sse, !optimize_size only. */
21262 void
21263 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21265 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21266 rtx int_xmm, fp_xmm;
21267 rtx biases, exponents;
21268 rtx x;
21270 int_xmm = gen_reg_rtx (V4SImode);
21271 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21272 emit_insn (gen_movdi_to_sse (int_xmm, input));
21273 else if (TARGET_SSE_SPLIT_REGS)
21275 emit_clobber (int_xmm);
21276 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21278 else
21280 x = gen_reg_rtx (V2DImode);
21281 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21282 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21285 x = gen_rtx_CONST_VECTOR (V4SImode,
21286 gen_rtvec (4, GEN_INT (0x43300000UL),
21287 GEN_INT (0x45300000UL),
21288 const0_rtx, const0_rtx));
21289 exponents = validize_mem (force_const_mem (V4SImode, x));
21291 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21292 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21294 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21295 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21296 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21297 (0x1.0p84 + double(fp_value_hi_xmm)).
21298 Note these exponents differ by 32. */
21300 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21302 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21303 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21304 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21305 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21306 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21307 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21308 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21309 biases = validize_mem (force_const_mem (V2DFmode, biases));
21310 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21312 /* Add the upper and lower DFmode values together. */
21313 if (TARGET_SSE3)
21314 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21315 else
21317 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21318 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21319 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21322 ix86_expand_vector_extract (false, target, fp_xmm, 0);
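/* A rough scalar model of the bias trick above, for illustration only;
the helper name model_uns_didf and the <stdint.h> types are assumptions,
not part of this file:

double
model_uns_didf (uint64_t x)
{
double lo = (double) (uint32_t) x;
double hi = (double) (uint32_t) (x >> 32);
return hi * 0x1p32 + lo;
}

Both halves are exact in DFmode once the 0x1.0p52 / 0x1.0p84 biases are
subtracted; only the final add (the haddpd or unpckhpd+addpd step) can
round. */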
21325 /* Not used, but eases macroization of patterns. */
21326 void
21327 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21329 gcc_unreachable ();
21332 /* Convert an unsigned SImode value into a DFmode value. Currently only
21333 used for SSE, but applicable anywhere. */
21335 void
21336 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21338 REAL_VALUE_TYPE TWO31r;
21339 rtx x, fp;
21341 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21342 NULL, 1, OPTAB_DIRECT);
21344 fp = gen_reg_rtx (DFmode);
21345 emit_insn (gen_floatsidf2 (fp, x));
21347 real_ldexp (&TWO31r, &dconst1, 31);
21348 x = const_double_from_real_value (TWO31r, DFmode);
21350 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21351 if (x != target)
21352 emit_move_insn (target, x);
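/* Scalar intuition for the sequence above (illustration only): adding
-2147483648 to the unsigned input just flips bit 31, so the signed value
seen by floatsidf2 equals the original minus 0x1p31; adding the 0x1p31
constant back in DFmode restores the exact unsigned value, since every
32-bit integer is representable in a double. */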
21355 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21356 32-bit mode; otherwise we have a direct convert instruction. */
21358 void
21359 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21361 REAL_VALUE_TYPE TWO32r;
21362 rtx fp_lo, fp_hi, x;
21364 fp_lo = gen_reg_rtx (DFmode);
21365 fp_hi = gen_reg_rtx (DFmode);
21367 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21369 real_ldexp (&TWO32r, &dconst1, 32);
21370 x = const_double_from_real_value (TWO32r, DFmode);
21371 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21373 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21375 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21376 0, OPTAB_DIRECT);
21377 if (x != target)
21378 emit_move_insn (target, x);
21381 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21382 For x86_32, -mfpmath=sse, !optimize_size only. */
21383 void
21384 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21386 REAL_VALUE_TYPE ONE16r;
21387 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21389 real_ldexp (&ONE16r, &dconst1, 16);
21390 x = const_double_from_real_value (ONE16r, SFmode);
21391 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
21392 NULL, 0, OPTAB_DIRECT);
21393 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
21394 NULL, 0, OPTAB_DIRECT);
21395 fp_hi = gen_reg_rtx (SFmode);
21396 fp_lo = gen_reg_rtx (SFmode);
21397 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21398 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21399 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21400 0, OPTAB_DIRECT);
21401 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21402 0, OPTAB_DIRECT);
21403 if (!rtx_equal_p (target, fp_hi))
21404 emit_move_insn (target, fp_hi);
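/* Illustrative scalar model, not part of the original code: the input is
split as x = hi * 65536 + lo with hi, lo < 0x10000; each half converts to
SFmode exactly, and only the final multiply by 0x1p16 and the add can
round. E.g. 0xdeadbeef becomes 57005.0f * 65536.0f + 48879.0f. */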
21407 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21408 a vector of unsigned ints VAL to vector of floats TARGET. */
21410 void
21411 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21413 rtx tmp[8];
21414 REAL_VALUE_TYPE TWO16r;
21415 machine_mode intmode = GET_MODE (val);
21416 machine_mode fltmode = GET_MODE (target);
21417 rtx (*cvt) (rtx, rtx);
21419 if (intmode == V4SImode)
21420 cvt = gen_floatv4siv4sf2;
21421 else
21422 cvt = gen_floatv8siv8sf2;
21423 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21424 tmp[0] = force_reg (intmode, tmp[0]);
21425 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21426 OPTAB_DIRECT);
21427 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21428 NULL_RTX, 1, OPTAB_DIRECT);
21429 tmp[3] = gen_reg_rtx (fltmode);
21430 emit_insn (cvt (tmp[3], tmp[1]));
21431 tmp[4] = gen_reg_rtx (fltmode);
21432 emit_insn (cvt (tmp[4], tmp[2]));
21433 real_ldexp (&TWO16r, &dconst1, 16);
21434 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21435 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21436 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21437 OPTAB_DIRECT);
21438 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21439 OPTAB_DIRECT);
21440 if (tmp[7] != target)
21441 emit_move_insn (target, tmp[7]);
21444 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21445 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21446 This is done by doing just a signed conversion if < 0x1p31, and otherwise by
21447 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21449 rtx
21450 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21452 REAL_VALUE_TYPE TWO31r;
21453 rtx two31r, tmp[4];
21454 machine_mode mode = GET_MODE (val);
21455 machine_mode scalarmode = GET_MODE_INNER (mode);
21456 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21457 rtx (*cmp) (rtx, rtx, rtx, rtx);
21458 int i;
21460 for (i = 0; i < 3; i++)
21461 tmp[i] = gen_reg_rtx (mode);
21462 real_ldexp (&TWO31r, &dconst1, 31);
21463 two31r = const_double_from_real_value (TWO31r, scalarmode);
21464 two31r = ix86_build_const_vector (mode, 1, two31r);
21465 two31r = force_reg (mode, two31r);
21466 switch (mode)
21468 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21469 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21470 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21471 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21472 default: gcc_unreachable ();
21474 tmp[3] = gen_rtx_LE (mode, two31r, val);
21475 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21476 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21477 0, OPTAB_DIRECT);
21478 if (intmode == V4SImode || TARGET_AVX2)
21479 *xorp = expand_simple_binop (intmode, ASHIFT,
21480 gen_lowpart (intmode, tmp[0]),
21481 GEN_INT (31), NULL_RTX, 0,
21482 OPTAB_DIRECT);
21483 else
21485 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21486 two31 = ix86_build_const_vector (intmode, 1, two31);
21487 *xorp = expand_simple_binop (intmode, AND,
21488 gen_lowpart (intmode, tmp[0]),
21489 two31, NULL_RTX, 0,
21490 OPTAB_DIRECT);
21492 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21493 0, OPTAB_DIRECT);
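/* Rough scalar model of the adjustment above (illustration only):
if (v < 0x1p31) use the signed truncation directly and leave the *XORP
lane zero; otherwise truncate (v - 0x1p31) and set the *XORP lane to
0x80000000, so that xoring the truncated result with *XORP afterwards
restores the full unsigned range. */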
21496 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21497 then replicate the value for all elements of the vector
21498 register. */
21500 rtx
21501 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21503 int i, n_elt;
21504 rtvec v;
21505 machine_mode scalar_mode;
21507 switch (mode)
21509 case E_V64QImode:
21510 case E_V32QImode:
21511 case E_V16QImode:
21512 case E_V32HImode:
21513 case E_V16HImode:
21514 case E_V8HImode:
21515 case E_V16SImode:
21516 case E_V8SImode:
21517 case E_V4SImode:
21518 case E_V8DImode:
21519 case E_V4DImode:
21520 case E_V2DImode:
21521 gcc_assert (vect);
21522 /* FALLTHRU */
21523 case E_V16SFmode:
21524 case E_V8SFmode:
21525 case E_V4SFmode:
21526 case E_V8DFmode:
21527 case E_V4DFmode:
21528 case E_V2DFmode:
21529 n_elt = GET_MODE_NUNITS (mode);
21530 v = rtvec_alloc (n_elt);
21531 scalar_mode = GET_MODE_INNER (mode);
21533 RTVEC_ELT (v, 0) = value;
21535 for (i = 1; i < n_elt; ++i)
21536 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21538 return gen_rtx_CONST_VECTOR (mode, v);
21540 default:
21541 gcc_unreachable ();
21545 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21546 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21547 for an SSE register. If VECT is true, then replicate the mask for
21548 all elements of the vector register. If INVERT is true, then create
21549 a mask excluding the sign bit. */
21551 rtx
21552 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21554 machine_mode vec_mode, imode;
21555 wide_int w;
21556 rtx mask, v;
21558 switch (mode)
21560 case E_V16SImode:
21561 case E_V16SFmode:
21562 case E_V8SImode:
21563 case E_V4SImode:
21564 case E_V8SFmode:
21565 case E_V4SFmode:
21566 vec_mode = mode;
21567 imode = SImode;
21568 break;
21570 case E_V8DImode:
21571 case E_V4DImode:
21572 case E_V2DImode:
21573 case E_V8DFmode:
21574 case E_V4DFmode:
21575 case E_V2DFmode:
21576 vec_mode = mode;
21577 imode = DImode;
21578 break;
21580 case E_TImode:
21581 case E_TFmode:
21582 vec_mode = VOIDmode;
21583 imode = TImode;
21584 break;
21586 default:
21587 gcc_unreachable ();
21590 machine_mode inner_mode = GET_MODE_INNER (mode);
21591 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21592 GET_MODE_BITSIZE (inner_mode));
21593 if (invert)
21594 w = wi::bit_not (w);
21596 /* Force this value into the low part of a fp vector constant. */
21597 mask = immed_wide_int_const (w, imode);
21598 mask = gen_lowpart (inner_mode, mask);
21600 if (vec_mode == VOIDmode)
21601 return force_reg (inner_mode, mask);
21603 v = ix86_build_const_vector (vec_mode, vect, mask);
21604 return force_reg (vec_mode, v);
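/* Example of the masks produced above (illustration only): for V4SFmode
with VECT set, the constant is { 0x80000000, 0x80000000, 0x80000000,
0x80000000 } reinterpreted as floats, i.e. four copies of -0.0f; with
INVERT set each element is 0x7fffffff instead, which clears only the
sign bit when ANDed in. */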
21607 /* Generate code for floating point ABS or NEG. */
21609 void
21610 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21611 rtx operands[])
21613 rtx mask, set, dst, src;
21614 bool use_sse = false;
21615 bool vector_mode = VECTOR_MODE_P (mode);
21616 machine_mode vmode = mode;
21618 if (vector_mode)
21619 use_sse = true;
21620 else if (mode == TFmode)
21621 use_sse = true;
21622 else if (TARGET_SSE_MATH)
21624 use_sse = SSE_FLOAT_MODE_P (mode);
21625 if (mode == SFmode)
21626 vmode = V4SFmode;
21627 else if (mode == DFmode)
21628 vmode = V2DFmode;
21631 /* NEG and ABS performed with SSE use bitwise mask operations.
21632 Create the appropriate mask now. */
21633 if (use_sse)
21634 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21635 else
21636 mask = NULL_RTX;
21638 dst = operands[0];
21639 src = operands[1];
21641 set = gen_rtx_fmt_e (code, mode, src);
21642 set = gen_rtx_SET (dst, set);
21644 if (mask)
21646 rtx use, clob;
21647 rtvec par;
21649 use = gen_rtx_USE (VOIDmode, mask);
21650 if (vector_mode)
21651 par = gen_rtvec (2, set, use);
21652 else
21654 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21655 par = gen_rtvec (3, set, use, clob);
21657 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21659 else
21660 emit_insn (set);
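/* Illustrative note, not from the original sources: with SSE in use the
PARALLEL built above is intended to end up as a plain bitwise operation,
e.g. NEG:SF as an xorps with the { -0.0f, ... } mask and ABS:SF as an
andps with the inverted mask, so no arithmetic flags or x87 stack
traffic are involved. */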
21663 /* Expand a copysign operation. Special case operand 0 being a constant. */
21665 void
21666 ix86_expand_copysign (rtx operands[])
21668 machine_mode mode, vmode;
21669 rtx dest, op0, op1, mask, nmask;
21671 dest = operands[0];
21672 op0 = operands[1];
21673 op1 = operands[2];
21675 mode = GET_MODE (dest);
21677 if (mode == SFmode)
21678 vmode = V4SFmode;
21679 else if (mode == DFmode)
21680 vmode = V2DFmode;
21681 else
21682 vmode = mode;
21684 if (CONST_DOUBLE_P (op0))
21686 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21688 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21689 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21691 if (mode == SFmode || mode == DFmode)
21693 if (op0 == CONST0_RTX (mode))
21694 op0 = CONST0_RTX (vmode);
21695 else
21697 rtx v = ix86_build_const_vector (vmode, false, op0);
21699 op0 = force_reg (vmode, v);
21702 else if (op0 != CONST0_RTX (mode))
21703 op0 = force_reg (mode, op0);
21705 mask = ix86_build_signbit_mask (vmode, 0, 0);
21707 if (mode == SFmode)
21708 copysign_insn = gen_copysignsf3_const;
21709 else if (mode == DFmode)
21710 copysign_insn = gen_copysigndf3_const;
21711 else
21712 copysign_insn = gen_copysigntf3_const;
21714 emit_insn (copysign_insn (dest, op0, op1, mask));
21716 else
21718 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21720 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21721 mask = ix86_build_signbit_mask (vmode, 0, 0);
21723 if (mode == SFmode)
21724 copysign_insn = gen_copysignsf3_var;
21725 else if (mode == DFmode)
21726 copysign_insn = gen_copysigndf3_var;
21727 else
21728 copysign_insn = gen_copysigntf3_var;
21730 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
21734 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21735 be a constant, and so has already been expanded into a vector constant. */
21737 void
21738 ix86_split_copysign_const (rtx operands[])
21740 machine_mode mode, vmode;
21741 rtx dest, op0, mask, x;
21743 dest = operands[0];
21744 op0 = operands[1];
21745 mask = operands[3];
21747 mode = GET_MODE (dest);
21748 vmode = GET_MODE (mask);
21750 dest = lowpart_subreg (vmode, dest, mode);
21751 x = gen_rtx_AND (vmode, dest, mask);
21752 emit_insn (gen_rtx_SET (dest, x));
21754 if (op0 != CONST0_RTX (vmode))
21756 x = gen_rtx_IOR (vmode, dest, op0);
21757 emit_insn (gen_rtx_SET (dest, x));
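/* Bit-level identity behind both copysign splitters, for illustration
only: copysign (x, y) == (x & ~SIGNMASK) | (y & SIGNMASK), where SIGNMASK
has only the sign bit of each element set. The _const variant can drop
the AND on the magnitude because that operand is already a nonnegative
constant. */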
21761 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21762 so we have to do two masks. */
21764 void
21765 ix86_split_copysign_var (rtx operands[])
21767 machine_mode mode, vmode;
21768 rtx dest, scratch, op0, op1, mask, nmask, x;
21770 dest = operands[0];
21771 scratch = operands[1];
21772 op0 = operands[2];
21773 op1 = operands[3];
21774 nmask = operands[4];
21775 mask = operands[5];
21777 mode = GET_MODE (dest);
21778 vmode = GET_MODE (mask);
21780 if (rtx_equal_p (op0, op1))
21782 /* Shouldn't happen often (it's useless, obviously), but when it does
21783 we'd generate incorrect code if we continue below. */
21784 emit_move_insn (dest, op0);
21785 return;
21788 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21790 gcc_assert (REGNO (op1) == REGNO (scratch));
21792 x = gen_rtx_AND (vmode, scratch, mask);
21793 emit_insn (gen_rtx_SET (scratch, x));
21795 dest = mask;
21796 op0 = lowpart_subreg (vmode, op0, mode);
21797 x = gen_rtx_NOT (vmode, dest);
21798 x = gen_rtx_AND (vmode, x, op0);
21799 emit_insn (gen_rtx_SET (dest, x));
21801 else
21803 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21805 x = gen_rtx_AND (vmode, scratch, mask);
21807 else /* alternative 2,4 */
21809 gcc_assert (REGNO (mask) == REGNO (scratch));
21810 op1 = lowpart_subreg (vmode, op1, mode);
21811 x = gen_rtx_AND (vmode, scratch, op1);
21813 emit_insn (gen_rtx_SET (scratch, x));
21815 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21817 dest = lowpart_subreg (vmode, op0, mode);
21818 x = gen_rtx_AND (vmode, dest, nmask);
21820 else /* alternative 3,4 */
21822 gcc_assert (REGNO (nmask) == REGNO (dest));
21823 dest = nmask;
21824 op0 = lowpart_subreg (vmode, op0, mode);
21825 x = gen_rtx_AND (vmode, dest, op0);
21827 emit_insn (gen_rtx_SET (dest, x));
21830 x = gen_rtx_IOR (vmode, dest, scratch);
21831 emit_insn (gen_rtx_SET (dest, x));
21834 /* Return TRUE or FALSE depending on whether the first SET in INSN
21835 has source and destination with matching CC modes, and that the
21836 CC mode is at least as constrained as REQ_MODE. */
21838 bool
21839 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21841 rtx set;
21842 machine_mode set_mode;
21844 set = PATTERN (insn);
21845 if (GET_CODE (set) == PARALLEL)
21846 set = XVECEXP (set, 0, 0);
21847 gcc_assert (GET_CODE (set) == SET);
21848 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21850 set_mode = GET_MODE (SET_DEST (set));
21851 switch (set_mode)
21853 case E_CCNOmode:
21854 if (req_mode != CCNOmode
21855 && (req_mode != CCmode
21856 || XEXP (SET_SRC (set), 1) != const0_rtx))
21857 return false;
21858 break;
21859 case E_CCmode:
21860 if (req_mode == CCGCmode)
21861 return false;
21862 /* FALLTHRU */
21863 case E_CCGCmode:
21864 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21865 return false;
21866 /* FALLTHRU */
21867 case E_CCGOCmode:
21868 if (req_mode == CCZmode)
21869 return false;
21870 /* FALLTHRU */
21871 case E_CCZmode:
21872 break;
21874 case E_CCGZmode:
21876 case E_CCAmode:
21877 case E_CCCmode:
21878 case E_CCOmode:
21879 case E_CCPmode:
21880 case E_CCSmode:
21881 if (set_mode != req_mode)
21882 return false;
21883 break;
21885 default:
21886 gcc_unreachable ();
21889 return GET_MODE (SET_SRC (set)) == set_mode;
21892 /* Generate insn patterns to do an integer compare of OPERANDS. */
21894 static rtx
21895 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
21897 machine_mode cmpmode;
21898 rtx tmp, flags;
21900 cmpmode = SELECT_CC_MODE (code, op0, op1);
21901 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
21903 /* This is very simple, but making the interface the same as in the
21904 FP case makes the rest of the code easier. */
21905 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
21906 emit_insn (gen_rtx_SET (flags, tmp));
21908 /* Return the test that should be put into the flags user, i.e.
21909 the bcc, scc, or cmov instruction. */
21910 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
21913 /* Figure out whether to use unordered fp comparisons. */
21915 static bool
21916 ix86_unordered_fp_compare (enum rtx_code code)
21918 if (!TARGET_IEEE_FP)
21919 return false;
21921 switch (code)
21923 case GT:
21924 case GE:
21925 case LT:
21926 case LE:
21927 return false;
21929 case EQ:
21930 case NE:
21932 case LTGT:
21933 case UNORDERED:
21934 case ORDERED:
21935 case UNLT:
21936 case UNLE:
21937 case UNGT:
21938 case UNGE:
21939 case UNEQ:
21940 return true;
21942 default:
21943 gcc_unreachable ();
21947 machine_mode
21948 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
21950 machine_mode mode = GET_MODE (op0);
21952 if (SCALAR_FLOAT_MODE_P (mode))
21954 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
21955 return CCFPmode;
21958 switch (code)
21960 /* Only zero flag is needed. */
21961 case EQ: /* ZF=0 */
21962 case NE: /* ZF!=0 */
21963 return CCZmode;
21964 /* Codes needing carry flag. */
21965 case GEU: /* CF=0 */
21966 case LTU: /* CF=1 */
21967 /* Detect overflow checks. They need just the carry flag. */
21968 if (GET_CODE (op0) == PLUS
21969 && (rtx_equal_p (op1, XEXP (op0, 0))
21970 || rtx_equal_p (op1, XEXP (op0, 1))))
21971 return CCCmode;
21972 else
21973 return CCmode;
21974 case GTU: /* CF=0 & ZF=0 */
21975 case LEU: /* CF=1 | ZF=1 */
21976 return CCmode;
21977 /* Codes possibly doable only with sign flag when
21978 comparing against zero. */
21979 case GE: /* SF=OF or SF=0 */
21980 case LT: /* SF<>OF or SF=1 */
21981 if (op1 == const0_rtx)
21982 return CCGOCmode;
21983 else
21984 /* For other cases Carry flag is not required. */
21985 return CCGCmode;
21986 /* Codes doable only with the sign flag when comparing
21987 against zero, but we miss the jump instruction for it,
21988 so we need to use relational tests against overflow,
21989 which thus needs to be zero. */
21990 case GT: /* ZF=0 & SF=OF */
21991 case LE: /* ZF=1 | SF<>OF */
21992 if (op1 == const0_rtx)
21993 return CCNOmode;
21994 else
21995 return CCGCmode;
21996 /* The strcmp pattern does (use flags), and combine may ask us for the
21997 proper mode. */
21998 case USE:
21999 return CCmode;
22000 default:
22001 gcc_unreachable ();
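/* A few concrete examples of the mapping above (illustration only):
a == b or a != b only needs CCZmode; an overflow check such as
(a + b) <u b needs just the carry flag, hence CCCmode; a < 0 or a >= 0
can be tested from the sign flag alone, hence CCGOCmode; a general
signed a < b needs CCGCmode. */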
22005 /* Return the fixed registers used for condition codes. */
22007 static bool
22008 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22010 *p1 = FLAGS_REG;
22011 *p2 = FPSR_REG;
22012 return true;
22015 /* If two condition code modes are compatible, return a condition code
22016 mode which is compatible with both. Otherwise, return
22017 VOIDmode. */
22019 static machine_mode
22020 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22022 if (m1 == m2)
22023 return m1;
22025 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22026 return VOIDmode;
22028 if ((m1 == CCGCmode && m2 == CCGOCmode)
22029 || (m1 == CCGOCmode && m2 == CCGCmode))
22030 return CCGCmode;
22032 if ((m1 == CCNOmode && m2 == CCGOCmode)
22033 || (m1 == CCGOCmode && m2 == CCNOmode))
22034 return CCNOmode;
22036 if (m1 == CCZmode
22037 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
22038 return m2;
22039 else if (m2 == CCZmode
22040 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
22041 return m1;
22043 switch (m1)
22045 default:
22046 gcc_unreachable ();
22048 case E_CCmode:
22049 case E_CCGCmode:
22050 case E_CCGOCmode:
22051 case E_CCNOmode:
22052 case E_CCAmode:
22053 case E_CCCmode:
22054 case E_CCOmode:
22055 case E_CCPmode:
22056 case E_CCSmode:
22057 case E_CCZmode:
22058 switch (m2)
22060 default:
22061 return VOIDmode;
22063 case E_CCmode:
22064 case E_CCGCmode:
22065 case E_CCGOCmode:
22066 case E_CCNOmode:
22067 case E_CCAmode:
22068 case E_CCCmode:
22069 case E_CCOmode:
22070 case E_CCPmode:
22071 case E_CCSmode:
22072 case E_CCZmode:
22073 return CCmode;
22076 case E_CCFPmode:
22077 /* These are only compatible with themselves, which we already
22078 checked above. */
22079 return VOIDmode;
22084 /* Return a comparison we can do and that it is equivalent to
22085 swap_condition (code) apart possibly from orderedness.
22086 But, never change orderedness if TARGET_IEEE_FP, returning
22087 UNKNOWN in that case if necessary. */
22089 static enum rtx_code
22090 ix86_fp_swap_condition (enum rtx_code code)
22092 switch (code)
22094 case GT: /* GTU - CF=0 & ZF=0 */
22095 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22096 case GE: /* GEU - CF=0 */
22097 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22098 case UNLT: /* LTU - CF=1 */
22099 return TARGET_IEEE_FP ? UNKNOWN : GT;
22100 case UNLE: /* LEU - CF=1 | ZF=1 */
22101 return TARGET_IEEE_FP ? UNKNOWN : GE;
22102 default:
22103 return swap_condition (code);
22107 /* Return the cost of comparison CODE using the best strategy for performance.
22108 All following functions use the number of instructions as a cost metric.
22109 In the future this should be tweaked to compute bytes for optimize_size and
22110 take into account the performance of various instructions on various CPUs. */
22112 static int
22113 ix86_fp_comparison_cost (enum rtx_code code)
22115 int arith_cost;
22117 /* The cost of code using bit-twiddling on %ah. */
22118 switch (code)
22120 case UNLE:
22121 case UNLT:
22122 case LTGT:
22123 case GT:
22124 case GE:
22125 case UNORDERED:
22126 case ORDERED:
22127 case UNEQ:
22128 arith_cost = 4;
22129 break;
22130 case LT:
22131 case NE:
22132 case EQ:
22133 case UNGE:
22134 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22135 break;
22136 case LE:
22137 case UNGT:
22138 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22139 break;
22140 default:
22141 gcc_unreachable ();
22144 switch (ix86_fp_comparison_strategy (code))
22146 case IX86_FPCMP_COMI:
22147 return arith_cost > 4 ? 3 : 2;
22148 case IX86_FPCMP_SAHF:
22149 return arith_cost > 4 ? 4 : 3;
22150 default:
22151 return arith_cost;
22155 /* Return the strategy to use for floating-point comparisons. We assume that
22156 fcomi is always preferable where available, since that is also true when looking
22157 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22159 enum ix86_fpcmp_strategy
22160 ix86_fp_comparison_strategy (enum rtx_code)
22162 /* Do fcomi/sahf based test when profitable. */
22164 if (TARGET_CMOVE)
22165 return IX86_FPCMP_COMI;
22167 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22168 return IX86_FPCMP_SAHF;
22170 return IX86_FPCMP_ARITH;
22173 /* Swap, force into registers, or otherwise massage the two operands
22174 to a fp comparison. The operands are updated in place; the new
22175 comparison code is returned. */
22177 static enum rtx_code
22178 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22180 bool unordered_compare = ix86_unordered_fp_compare (code);
22181 rtx op0 = *pop0, op1 = *pop1;
22182 machine_mode op_mode = GET_MODE (op0);
22183 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22185 /* All of the unordered compare instructions only work on registers.
22186 The same is true of the fcomi compare instructions. The XFmode
22187 compare instructions require registers except when comparing
22188 against zero or when converting operand 1 from fixed point to
22189 floating point. */
22191 if (!is_sse
22192 && (unordered_compare
22193 || (op_mode == XFmode
22194 && ! (standard_80387_constant_p (op0) == 1
22195 || standard_80387_constant_p (op1) == 1)
22196 && GET_CODE (op1) != FLOAT)
22197 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22199 op0 = force_reg (op_mode, op0);
22200 op1 = force_reg (op_mode, op1);
22202 else
22204 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22205 things around if they appear profitable, otherwise force op0
22206 into a register. */
22208 if (standard_80387_constant_p (op0) == 0
22209 || (MEM_P (op0)
22210 && ! (standard_80387_constant_p (op1) == 0
22211 || MEM_P (op1))))
22213 enum rtx_code new_code = ix86_fp_swap_condition (code);
22214 if (new_code != UNKNOWN)
22216 std::swap (op0, op1);
22217 code = new_code;
22221 if (!REG_P (op0))
22222 op0 = force_reg (op_mode, op0);
22224 if (CONSTANT_P (op1))
22226 int tmp = standard_80387_constant_p (op1);
22227 if (tmp == 0)
22228 op1 = validize_mem (force_const_mem (op_mode, op1));
22229 else if (tmp == 1)
22231 if (TARGET_CMOVE)
22232 op1 = force_reg (op_mode, op1);
22234 else
22235 op1 = force_reg (op_mode, op1);
22239 /* Try to rearrange the comparison to make it cheaper. */
22240 if (ix86_fp_comparison_cost (code)
22241 > ix86_fp_comparison_cost (swap_condition (code))
22242 && (REG_P (op1) || can_create_pseudo_p ()))
22244 std::swap (op0, op1);
22245 code = swap_condition (code);
22246 if (!REG_P (op0))
22247 op0 = force_reg (op_mode, op0);
22250 *pop0 = op0;
22251 *pop1 = op1;
22252 return code;
22255 /* Convert comparison codes we use to represent FP comparison to integer
22256 code that will result in proper branch. Return UNKNOWN if no such code
22257 is available. */
22259 enum rtx_code
22260 ix86_fp_compare_code_to_integer (enum rtx_code code)
22262 switch (code)
22264 case GT:
22265 return GTU;
22266 case GE:
22267 return GEU;
22268 case ORDERED:
22269 case UNORDERED:
22270 return code;
22271 case UNEQ:
22272 return EQ;
22273 case UNLT:
22274 return LTU;
22275 case UNLE:
22276 return LEU;
22277 case LTGT:
22278 return NE;
22279 default:
22280 return UNKNOWN;
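/* Rationale for the mapping above (illustration only): fcomi/comi set
ZF, PF and CF the way an unsigned integer compare sets ZF and CF, with
CF=1 when op0 < op1 or the operands are unordered. An FP GT test on
those flags is therefore the integer GTU test, GE becomes GEU, and so
on; UNORDERED/ORDERED are keyed off PF and keep their own codes. */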
22284 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22286 static rtx
22287 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22289 bool unordered_compare = ix86_unordered_fp_compare (code);
22290 machine_mode intcmp_mode;
22291 rtx tmp, tmp2;
22293 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22295 /* Do fcomi/sahf based test when profitable. */
22296 switch (ix86_fp_comparison_strategy (code))
22298 case IX86_FPCMP_COMI:
22299 intcmp_mode = CCFPmode;
22300 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22301 if (unordered_compare)
22302 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22303 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22304 break;
22306 case IX86_FPCMP_SAHF:
22307 intcmp_mode = CCFPmode;
22308 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22309 if (unordered_compare)
22310 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22311 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22312 if (!scratch)
22313 scratch = gen_reg_rtx (HImode);
22314 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22315 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22316 break;
22318 case IX86_FPCMP_ARITH:
22319 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22320 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22321 if (unordered_compare)
22322 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22323 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22324 if (!scratch)
22325 scratch = gen_reg_rtx (HImode);
22326 emit_insn (gen_rtx_SET (scratch, tmp));
22328 /* In the unordered case, we have to check C2 for NaNs, which
22329 doesn't happen to work out to anything nice combination-wise.
22330 So do some bit twiddling on the value we've got in AH to come
22331 up with an appropriate set of condition codes. */
22333 intcmp_mode = CCNOmode;
22334 switch (code)
22336 case GT:
22337 case UNGT:
22338 if (code == GT || !TARGET_IEEE_FP)
22340 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22341 code = EQ;
22343 else
22345 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22346 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22347 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22348 intcmp_mode = CCmode;
22349 code = GEU;
22351 break;
22352 case LT:
22353 case UNLT:
22354 if (code == LT && TARGET_IEEE_FP)
22356 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22357 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22358 intcmp_mode = CCmode;
22359 code = EQ;
22361 else
22363 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22364 code = NE;
22366 break;
22367 case GE:
22368 case UNGE:
22369 if (code == GE || !TARGET_IEEE_FP)
22371 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22372 code = EQ;
22374 else
22376 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22377 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22378 code = NE;
22380 break;
22381 case LE:
22382 case UNLE:
22383 if (code == LE && TARGET_IEEE_FP)
22385 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22386 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22387 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22388 intcmp_mode = CCmode;
22389 code = LTU;
22391 else
22393 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22394 code = NE;
22396 break;
22397 case EQ:
22398 case UNEQ:
22399 if (code == EQ && TARGET_IEEE_FP)
22401 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22402 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22403 intcmp_mode = CCmode;
22404 code = EQ;
22406 else
22408 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22409 code = NE;
22411 break;
22412 case NE:
22413 case LTGT:
22414 if (code == NE && TARGET_IEEE_FP)
22416 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22417 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22418 GEN_INT (0x40)));
22419 code = NE;
22421 else
22423 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22424 code = EQ;
22426 break;
22428 case UNORDERED:
22429 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22430 code = NE;
22431 break;
22432 case ORDERED:
22433 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22434 code = EQ;
22435 break;
22437 default:
22438 gcc_unreachable ();
22440 break;
22442 default:
22443 gcc_unreachable();
22446 /* Return the test that should be put into the flags user, i.e.
22447 the bcc, scc, or cmov instruction. */
22448 return gen_rtx_fmt_ee (code, VOIDmode,
22449 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22450 const0_rtx);
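/* Key to the magic constants in the IX86_FPCMP_ARITH case above
(illustration only): fnstsw stores the x87 status word, whose condition
bits land in %ah as C0 = 0x01, C2 = 0x04 and C3 = 0x40. Thus 0x45 tests
C0|C2|C3, 0x05 tests C0|C2, and the compares against 0x40 and 0x44
separate the "equal"/"ordered" encodings from the NaN patterns. */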
22453 static rtx
22454 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22456 rtx ret;
22458 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22459 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22461 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22463 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22464 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22466 else
22467 ret = ix86_expand_int_compare (code, op0, op1);
22469 return ret;
22472 void
22473 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22475 machine_mode mode = GET_MODE (op0);
22476 rtx tmp;
22478 /* Handle the special case of a vector comparison with a boolean result;
22479 transform it using the ptest instruction. */
22480 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22482 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22483 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22485 gcc_assert (code == EQ || code == NE);
22486 /* Generate XOR since we can't check that one operand is zero vector. */
22487 tmp = gen_reg_rtx (mode);
22488 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22489 tmp = gen_lowpart (p_mode, tmp);
22490 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22491 gen_rtx_UNSPEC (CCmode,
22492 gen_rtvec (2, tmp, tmp),
22493 UNSPEC_PTEST)));
22494 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22495 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22496 gen_rtx_LABEL_REF (VOIDmode, label),
22497 pc_rtx);
22498 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22499 return;
22502 switch (mode)
22504 case E_SFmode:
22505 case E_DFmode:
22506 case E_XFmode:
22507 case E_QImode:
22508 case E_HImode:
22509 case E_SImode:
22510 simple:
22511 tmp = ix86_expand_compare (code, op0, op1);
22512 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22513 gen_rtx_LABEL_REF (VOIDmode, label),
22514 pc_rtx);
22515 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22516 return;
22518 case E_DImode:
22519 if (TARGET_64BIT)
22520 goto simple;
22521 /* For a 32-bit target, a DImode comparison may be performed on
22522 SSE registers. To allow this we should avoid the split
22523 into SImode, which is achieved by doing the xor in DImode
22524 and then comparing against zero (which is recognized by the
22525 STV pass). We don't compare using xor when optimizing
22526 for size. */
22527 if (!optimize_insn_for_size_p ()
22528 && TARGET_STV
22529 && (code == EQ || code == NE))
22531 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22532 op1 = const0_rtx;
22534 /* FALLTHRU */
22535 case E_TImode:
22536 /* Expand DImode branch into multiple compare+branch. */
22538 rtx lo[2], hi[2];
22539 rtx_code_label *label2;
22540 enum rtx_code code1, code2, code3;
22541 machine_mode submode;
22543 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22545 std::swap (op0, op1);
22546 code = swap_condition (code);
22549 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22550 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22552 submode = mode == DImode ? SImode : DImode;
22554 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22555 avoid two branches. This costs one extra insn, so disable when
22556 optimizing for size. */
22558 if ((code == EQ || code == NE)
22559 && (!optimize_insn_for_size_p ()
22560 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22562 rtx xor0, xor1;
22564 xor1 = hi[0];
22565 if (hi[1] != const0_rtx)
22566 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22567 NULL_RTX, 0, OPTAB_WIDEN);
22569 xor0 = lo[0];
22570 if (lo[1] != const0_rtx)
22571 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22572 NULL_RTX, 0, OPTAB_WIDEN);
22574 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22575 NULL_RTX, 0, OPTAB_WIDEN);
22577 ix86_expand_branch (code, tmp, const0_rtx, label);
22578 return;
22581 /* Otherwise, if we are doing less-than or greater-or-equal-than,
22582 op1 is a constant and the low word is zero, then we can just
22583 examine the high word. Similarly for low word -1 and
22584 less-or-equal-than or greater-than. */
22586 if (CONST_INT_P (hi[1]))
22587 switch (code)
22589 case LT: case LTU: case GE: case GEU:
22590 if (lo[1] == const0_rtx)
22592 ix86_expand_branch (code, hi[0], hi[1], label);
22593 return;
22595 break;
22596 case LE: case LEU: case GT: case GTU:
22597 if (lo[1] == constm1_rtx)
22599 ix86_expand_branch (code, hi[0], hi[1], label);
22600 return;
22602 break;
22603 default:
22604 break;
22607 /* Emulate comparisons that do not depend on Zero flag with
22608 double-word subtraction. Note that only Overflow, Sign
22609 and Carry flags are valid, so swap arguments and condition
22610 of comparisons that would otherwise test Zero flag. */
22612 switch (code)
22614 case LE: case LEU: case GT: case GTU:
22615 std::swap (lo[0], lo[1]);
22616 std::swap (hi[0], hi[1]);
22617 code = swap_condition (code);
22618 /* FALLTHRU */
22620 case LT: case LTU: case GE: case GEU:
22622 rtx (*cmp_insn) (rtx, rtx);
22623 rtx (*sbb_insn) (rtx, rtx, rtx);
22624 bool uns = (code == LTU || code == GEU);
22626 if (TARGET_64BIT)
22628 cmp_insn = gen_cmpdi_1;
22629 sbb_insn
22630 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
22632 else
22634 cmp_insn = gen_cmpsi_1;
22635 sbb_insn
22636 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
22639 if (!nonimmediate_operand (lo[0], submode))
22640 lo[0] = force_reg (submode, lo[0]);
22641 if (!x86_64_general_operand (lo[1], submode))
22642 lo[1] = force_reg (submode, lo[1]);
22644 if (!register_operand (hi[0], submode))
22645 hi[0] = force_reg (submode, hi[0]);
22646 if ((uns && !nonimmediate_operand (hi[1], submode))
22647 || (!uns && !x86_64_general_operand (hi[1], submode)))
22648 hi[1] = force_reg (submode, hi[1]);
22650 emit_insn (cmp_insn (lo[0], lo[1]));
22651 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
22653 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
22655 ix86_expand_branch (code, tmp, const0_rtx, label);
22656 return;
22659 default:
22660 break;
22663 /* Otherwise, we need two or three jumps. */
22665 label2 = gen_label_rtx ();
22667 code1 = code;
22668 code2 = swap_condition (code);
22669 code3 = unsigned_condition (code);
22671 switch (code)
22673 case LT: case GT: case LTU: case GTU:
22674 break;
22676 case LE: code1 = LT; code2 = GT; break;
22677 case GE: code1 = GT; code2 = LT; break;
22678 case LEU: code1 = LTU; code2 = GTU; break;
22679 case GEU: code1 = GTU; code2 = LTU; break;
22681 case EQ: code1 = UNKNOWN; code2 = NE; break;
22682 case NE: code2 = UNKNOWN; break;
22684 default:
22685 gcc_unreachable ();
22688 /*
22689 * a < b =>
22690 * if (hi(a) < hi(b)) goto true;
22691 * if (hi(a) > hi(b)) goto false;
22692 * if (lo(a) < lo(b)) goto true;
22693 * false:
22694 */
22696 if (code1 != UNKNOWN)
22697 ix86_expand_branch (code1, hi[0], hi[1], label);
22698 if (code2 != UNKNOWN)
22699 ix86_expand_branch (code2, hi[0], hi[1], label2);
22701 ix86_expand_branch (code3, lo[0], lo[1], label);
22703 if (code2 != UNKNOWN)
22704 emit_label (label2);
22705 return;
22708 default:
22709 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22710 goto simple;
22714 void
22715 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22717 rtx ret;
22719 gcc_assert (GET_MODE (dest) == QImode);
22721 ret = ix86_expand_compare (code, op0, op1);
22722 PUT_MODE (ret, QImode);
22723 emit_insn (gen_rtx_SET (dest, ret));
22726 /* Expand comparison setting or clearing carry flag. Return true when
22727 successful and set pop for the operation. */
22728 static bool
22729 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22731 machine_mode mode =
22732 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22734 /* Do not handle double-mode compares that go through special path. */
22735 if (mode == (TARGET_64BIT ? TImode : DImode))
22736 return false;
22738 if (SCALAR_FLOAT_MODE_P (mode))
22740 rtx compare_op;
22741 rtx_insn *compare_seq;
22743 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22745 /* Shortcut: the following common codes never translate
22746 into carry flag compares. */
22747 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22748 || code == ORDERED || code == UNORDERED)
22749 return false;
22751 /* These comparisons require zero flag; swap operands so they won't. */
22752 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22753 && !TARGET_IEEE_FP)
22755 std::swap (op0, op1);
22756 code = swap_condition (code);
22759 /* Try to expand the comparison and verify that we end up with
22760 a carry-flag-based comparison. This fails only when we decide
22761 to expand the comparison using arithmetic, which is not too
22762 common a scenario. */
22763 start_sequence ();
22764 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22765 compare_seq = get_insns ();
22766 end_sequence ();
22768 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
22769 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22770 else
22771 code = GET_CODE (compare_op);
22773 if (code != LTU && code != GEU)
22774 return false;
22776 emit_insn (compare_seq);
22777 *pop = compare_op;
22778 return true;
22781 if (!INTEGRAL_MODE_P (mode))
22782 return false;
22784 switch (code)
22786 case LTU:
22787 case GEU:
22788 break;
22790 /* Convert a==0 into (unsigned)a<1. */
22791 case EQ:
22792 case NE:
22793 if (op1 != const0_rtx)
22794 return false;
22795 op1 = const1_rtx;
22796 code = (code == EQ ? LTU : GEU);
22797 break;
22799 /* Convert a>b into b<a or a>=b-1. */
22800 case GTU:
22801 case LEU:
22802 if (CONST_INT_P (op1))
22804 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22805 /* Bail out on overflow. We can still swap the operands but that
22806 would force loading the constant into a register. */
22807 if (op1 == const0_rtx
22808 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22809 return false;
22810 code = (code == GTU ? GEU : LTU);
22812 else
22814 std::swap (op0, op1);
22815 code = (code == GTU ? LTU : GEU);
22817 break;
22819 /* Convert a>=0 into (unsigned)a<0x80000000. */
22820 case LT:
22821 case GE:
22822 if (mode == DImode || op1 != const0_rtx)
22823 return false;
22824 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22825 code = (code == LT ? GEU : LTU);
22826 break;
22827 case LE:
22828 case GT:
22829 if (mode == DImode || op1 != constm1_rtx)
22830 return false;
22831 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22832 code = (code == LE ? GEU : LTU);
22833 break;
22835 default:
22836 return false;
22838 /* Swapping operands may cause a constant to appear as the first operand. */
22839 if (!nonimmediate_operand (op0, VOIDmode))
22841 if (!can_create_pseudo_p ())
22842 return false;
22843 op0 = force_reg (mode, op0);
22845 *pop = ix86_expand_compare (code, op0, op1);
22846 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22847 return true;
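/* Examples of the rewrites performed above (illustration only): a == 0
becomes (unsigned) a < 1; a >u 41 becomes a >=u 42; a >= 0 becomes
(unsigned) a < 0x80000000. In each case the result is a plain LTU/GEU
test whose outcome lives entirely in the carry flag, which is what the
sbb-based sequences in ix86_expand_int_movcc rely on. */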
22850 bool
22851 ix86_expand_int_movcc (rtx operands[])
22853 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22854 rtx_insn *compare_seq;
22855 rtx compare_op;
22856 machine_mode mode = GET_MODE (operands[0]);
22857 bool sign_bit_compare_p = false;
22858 rtx op0 = XEXP (operands[1], 0);
22859 rtx op1 = XEXP (operands[1], 1);
22861 if (GET_MODE (op0) == TImode
22862 || (GET_MODE (op0) == DImode
22863 && !TARGET_64BIT))
22864 return false;
22866 start_sequence ();
22867 compare_op = ix86_expand_compare (code, op0, op1);
22868 compare_seq = get_insns ();
22869 end_sequence ();
22871 compare_code = GET_CODE (compare_op);
22873 if ((op1 == const0_rtx && (code == GE || code == LT))
22874 || (op1 == constm1_rtx && (code == GT || code == LE)))
22875 sign_bit_compare_p = true;
22877 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22878 HImode insns, we'd be swallowed in word prefix ops. */
22880 if ((mode != HImode || TARGET_FAST_PREFIX)
22881 && (mode != (TARGET_64BIT ? TImode : DImode))
22882 && CONST_INT_P (operands[2])
22883 && CONST_INT_P (operands[3]))
22885 rtx out = operands[0];
22886 HOST_WIDE_INT ct = INTVAL (operands[2]);
22887 HOST_WIDE_INT cf = INTVAL (operands[3]);
22888 HOST_WIDE_INT diff;
22890 diff = ct - cf;
22891 /* Sign bit compares are better done using shifts than by using
22892 sbb. */
22893 if (sign_bit_compare_p
22894 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22896 /* Detect overlap between destination and compare sources. */
22897 rtx tmp = out;
22899 if (!sign_bit_compare_p)
22901 rtx flags;
22902 bool fpcmp = false;
22904 compare_code = GET_CODE (compare_op);
22906 flags = XEXP (compare_op, 0);
22908 if (GET_MODE (flags) == CCFPmode)
22910 fpcmp = true;
22911 compare_code
22912 = ix86_fp_compare_code_to_integer (compare_code);
22915 /* To simplify the rest of the code, restrict to the GEU case. */
22916 if (compare_code == LTU)
22918 std::swap (ct, cf);
22919 compare_code = reverse_condition (compare_code);
22920 code = reverse_condition (code);
22922 else
22924 if (fpcmp)
22925 PUT_CODE (compare_op,
22926 reverse_condition_maybe_unordered
22927 (GET_CODE (compare_op)));
22928 else
22929 PUT_CODE (compare_op,
22930 reverse_condition (GET_CODE (compare_op)));
22932 diff = ct - cf;
22934 if (reg_overlap_mentioned_p (out, op0)
22935 || reg_overlap_mentioned_p (out, op1))
22936 tmp = gen_reg_rtx (mode);
22938 if (mode == DImode)
22939 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
22940 else
22941 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
22942 flags, compare_op));
22944 else
22946 if (code == GT || code == GE)
22947 code = reverse_condition (code);
22948 else
22950 std::swap (ct, cf);
22951 diff = ct - cf;
22953 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
22956 if (diff == 1)
22958 /*
22959 * cmpl op0,op1
22960 * sbbl dest,dest
22961 * [addl dest, ct]
22962 *
22963 * Size 5 - 8.
22964 */
22965 if (ct)
22966 tmp = expand_simple_binop (mode, PLUS,
22967 tmp, GEN_INT (ct),
22968 copy_rtx (tmp), 1, OPTAB_DIRECT);
22970 else if (cf == -1)
22972 /*
22973 * cmpl op0,op1
22974 * sbbl dest,dest
22975 * orl $ct, dest
22976 *
22977 * Size 8.
22978 */
22979 tmp = expand_simple_binop (mode, IOR,
22980 tmp, GEN_INT (ct),
22981 copy_rtx (tmp), 1, OPTAB_DIRECT);
22983 else if (diff == -1 && ct)
22985 /*
22986 * cmpl op0,op1
22987 * sbbl dest,dest
22988 * notl dest
22989 * [addl dest, cf]
22990 *
22991 * Size 8 - 11.
22992 */
22993 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22994 if (cf)
22995 tmp = expand_simple_binop (mode, PLUS,
22996 copy_rtx (tmp), GEN_INT (cf),
22997 copy_rtx (tmp), 1, OPTAB_DIRECT);
22999 else
23001 /*
23002 * cmpl op0,op1
23003 * sbbl dest,dest
23004 * [notl dest]
23005 * andl cf - ct, dest
23006 * [addl dest, ct]
23007 *
23008 * Size 8 - 11.
23009 */
23011 if (cf == 0)
23013 cf = ct;
23014 ct = 0;
23015 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23018 tmp = expand_simple_binop (mode, AND,
23019 copy_rtx (tmp),
23020 gen_int_mode (cf - ct, mode),
23021 copy_rtx (tmp), 1, OPTAB_DIRECT);
23022 if (ct)
23023 tmp = expand_simple_binop (mode, PLUS,
23024 copy_rtx (tmp), GEN_INT (ct),
23025 copy_rtx (tmp), 1, OPTAB_DIRECT);
23028 if (!rtx_equal_p (tmp, out))
23029 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23031 return true;
23034 if (diff < 0)
23036 machine_mode cmp_mode = GET_MODE (op0);
23037 enum rtx_code new_code;
23039 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23041 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23043 /* We may be reversing an unordered compare to a normal compare, which
23044 is not valid in general (we may convert a non-trapping condition
23045 to a trapping one); however, on i386 we currently emit all
23046 comparisons unordered. */
23047 new_code = reverse_condition_maybe_unordered (code);
23049 else
23050 new_code = ix86_reverse_condition (code, cmp_mode);
23051 if (new_code != UNKNOWN)
23053 std::swap (ct, cf);
23054 diff = -diff;
23055 code = new_code;
23059 compare_code = UNKNOWN;
23060 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23061 && CONST_INT_P (op1))
23063 if (op1 == const0_rtx
23064 && (code == LT || code == GE))
23065 compare_code = code;
23066 else if (op1 == constm1_rtx)
23068 if (code == LE)
23069 compare_code = LT;
23070 else if (code == GT)
23071 compare_code = GE;
23075 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23076 if (compare_code != UNKNOWN
23077 && GET_MODE (op0) == GET_MODE (out)
23078 && (cf == -1 || ct == -1))
23080 /* If lea code below could be used, only optimize
23081 if it results in a 2 insn sequence. */
23083 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23084 || diff == 3 || diff == 5 || diff == 9)
23085 || (compare_code == LT && ct == -1)
23086 || (compare_code == GE && cf == -1))
23088 /*
23089 * notl op1 (if necessary)
23090 * sarl $31, op1
23091 * orl cf, op1
23092 */
23093 if (ct != -1)
23095 cf = ct;
23096 ct = -1;
23097 code = reverse_condition (code);
23100 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23102 out = expand_simple_binop (mode, IOR,
23103 out, GEN_INT (cf),
23104 out, 1, OPTAB_DIRECT);
23105 if (out != operands[0])
23106 emit_move_insn (operands[0], out);
23108 return true;
23113 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23114 || diff == 3 || diff == 5 || diff == 9)
23115 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23116 && (mode != DImode
23117 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23119 /*
23120 * xorl dest,dest
23121 * cmpl op1,op2
23122 * setcc dest
23123 * lea cf(dest*(ct-cf)),dest
23124 *
23125 * Size 14.
23126 *
23127 * This also catches the degenerate setcc-only case.
23128 */
23130 rtx tmp;
23131 int nops;
23133 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23135 nops = 0;
23136 /* On x86_64 the lea instruction operates on Pmode, so we need
23137 to get the arithmetic done in the proper mode to match. */
23138 if (diff == 1)
23139 tmp = copy_rtx (out);
23140 else
23142 rtx out1;
23143 out1 = copy_rtx (out);
23144 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23145 nops++;
23146 if (diff & 1)
23148 tmp = gen_rtx_PLUS (mode, tmp, out1);
23149 nops++;
23152 if (cf != 0)
23154 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23155 nops++;
23157 if (!rtx_equal_p (tmp, out))
23159 if (nops == 1)
23160 out = force_operand (tmp, copy_rtx (out));
23161 else
23162 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23164 if (!rtx_equal_p (out, operands[0]))
23165 emit_move_insn (operands[0], copy_rtx (out));
23167 return true;
23170 /*
23171 * General case:                  Jumpful:
23172 * xorl dest,dest                 cmpl op1, op2
23173 * cmpl op1, op2                  movl ct, dest
23174 * setcc dest                     jcc 1f
23175 * decl dest                      movl cf, dest
23176 * andl (cf-ct),dest              1:
23177 * addl ct,dest
23178 *
23179 * Size 20.                       Size 14.
23180 *
23181 * This is reasonably steep, but branch mispredict costs are
23182 * high on modern cpus, so consider failing only if optimizing
23183 * for space.
23184 */
23186 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23187 && BRANCH_COST (optimize_insn_for_speed_p (),
23188 false) >= 2)
23190 if (cf == 0)
23192 machine_mode cmp_mode = GET_MODE (op0);
23193 enum rtx_code new_code;
23195 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23197 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23199 /* We may be reversing an unordered compare to a normal compare,
23200 which is not valid in general (we may convert a non-trapping
23201 condition to a trapping one); however, on i386 we currently
23202 emit all comparisons unordered. */
23203 new_code = reverse_condition_maybe_unordered (code);
23205 else
23207 new_code = ix86_reverse_condition (code, cmp_mode);
23208 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23209 compare_code = reverse_condition (compare_code);
23212 if (new_code != UNKNOWN)
23214 cf = ct;
23215 ct = 0;
23216 code = new_code;
23220 if (compare_code != UNKNOWN)
23222 /* notl op1 (if needed)
23223 sarl $31, op1
23224 andl (cf-ct), op1
23225 addl ct, op1
23227 For x < 0 (resp. x <= -1) there will be no notl,
23228 so if possible swap the constants to get rid of the
23229 complement.
23230 True/false will be -1/0 while code below (store flag
23231 followed by decrement) is 0/-1, so the constants need
23232 to be exchanged once more. */
23234 if (compare_code == GE || !cf)
23236 code = reverse_condition (code);
23237 compare_code = LT;
23239 else
23240 std::swap (ct, cf);
23242 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23244 else
23246 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23248 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23249 constm1_rtx,
23250 copy_rtx (out), 1, OPTAB_DIRECT);
23253 out = expand_simple_binop (mode, AND, copy_rtx (out),
23254 gen_int_mode (cf - ct, mode),
23255 copy_rtx (out), 1, OPTAB_DIRECT);
23256 if (ct)
23257 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23258 copy_rtx (out), 1, OPTAB_DIRECT);
23259 if (!rtx_equal_p (out, operands[0]))
23260 emit_move_insn (operands[0], copy_rtx (out));
23262 return true;
23266 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23268 /* Try a few things more with specific constants and a variable. */
23270 optab op;
23271 rtx var, orig_out, out, tmp;
23273 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23274 return false;
23276 /* If one of the two operands is an interesting constant, load a
23277 constant with the above and mask it in with a logical operation. */
23279 if (CONST_INT_P (operands[2]))
23281 var = operands[3];
23282 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23283 operands[3] = constm1_rtx, op = and_optab;
23284 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23285 operands[3] = const0_rtx, op = ior_optab;
23286 else
23287 return false;
23289 else if (CONST_INT_P (operands[3]))
23291 var = operands[2];
23292 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23293 operands[2] = constm1_rtx, op = and_optab;
23294 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23295 operands[2] = const0_rtx, op = ior_optab;
23296 else
23297 return false;
23299 else
23300 return false;
23302 orig_out = operands[0];
23303 tmp = gen_reg_rtx (mode);
23304 operands[0] = tmp;
23306 /* Recurse to get the constant loaded. */
23307 if (!ix86_expand_int_movcc (operands))
23308 return false;
23310 /* Mask in the interesting variable. */
23311 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23312 OPTAB_WIDEN);
23313 if (!rtx_equal_p (out, orig_out))
23314 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23316 return true;
23319 /*
23320 * For comparison with above,
23321 *
23322 * movl cf,dest
23323 * movl ct,tmp
23324 * cmpl op1,op2
23325 * cmovcc tmp,dest
23326 *
23327 * Size 15.
23328 */
23330 if (! nonimmediate_operand (operands[2], mode))
23331 operands[2] = force_reg (mode, operands[2]);
23332 if (! nonimmediate_operand (operands[3], mode))
23333 operands[3] = force_reg (mode, operands[3]);
23335 if (! register_operand (operands[2], VOIDmode)
23336 && (mode == QImode
23337 || ! register_operand (operands[3], VOIDmode)))
23338 operands[2] = force_reg (mode, operands[2]);
23340 if (mode == QImode
23341 && ! register_operand (operands[3], VOIDmode))
23342 operands[3] = force_reg (mode, operands[3]);
23344 emit_insn (compare_seq);
23345 emit_insn (gen_rtx_SET (operands[0],
23346 gen_rtx_IF_THEN_ELSE (mode,
23347 compare_op, operands[2],
23348 operands[3])));
23349 return true;
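/* Summary of the branch-free pattern used repeatedly above (illustration
only): after a carry-flag compare, "sbb reg,reg" yields 0 or -1, and the
following not/and/add steps turn that all-zeros/all-ones value into the
requested ct/cf constants, so the conditional move needs no jump; the
cmov path at the end is only used when those constant tricks do not
apply. */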
23352 /* Swap, force into registers, or otherwise massage the two operands
23353 to an sse comparison with a mask result. Thus we differ a bit from
23354 ix86_prepare_fp_compare_args which expects to produce a flags result.
23356 The DEST operand exists to help determine whether to commute commutative
23357 operators. The POP0/POP1 operands are updated in place. The new
23358 comparison code is returned, or UNKNOWN if not implementable. */
23360 static enum rtx_code
23361 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23362 rtx *pop0, rtx *pop1)
23364 switch (code)
23366 case LTGT:
23367 case UNEQ:
23368 /* AVX supports all the needed comparisons. */
23369 if (TARGET_AVX)
23370 break;
23371 /* We have no LTGT as an operator. We could implement it with
23372 NE & ORDERED, but this requires an extra temporary. It's
23373 not clear that it's worth it. */
23374 return UNKNOWN;
23376 case LT:
23377 case LE:
23378 case UNGT:
23379 case UNGE:
23380 /* These are supported directly. */
23381 break;
23383 case EQ:
23384 case NE:
23385 case UNORDERED:
23386 case ORDERED:
23387 /* AVX has 3 operand comparisons, no need to swap anything. */
23388 if (TARGET_AVX)
23389 break;
23390 /* For commutative operators, try to canonicalize the destination
23391 operand to be first in the comparison - this helps reload to
23392 avoid extra moves. */
23393 if (!dest || !rtx_equal_p (dest, *pop1))
23394 break;
23395 /* FALLTHRU */
23397 case GE:
23398 case GT:
23399 case UNLE:
23400 case UNLT:
23401 /* These are not supported directly before AVX, and furthermore
23402 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23403 comparison operands to transform into something that is
23404 supported. */
23405 std::swap (*pop0, *pop1);
23406 code = swap_condition (code);
23407 break;
23409 default:
23410 gcc_unreachable ();
23413 return code;
23416 /* Detect conditional moves that exactly match min/max operational
23417 semantics. Note that this is IEEE safe, as long as we don't
23418 interchange the operands.
23420 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23421 and TRUE if the operation is successful and instructions are emitted. */
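/* As an illustration: "x = a < b ? a : b" is recognized as a MIN.  When
   NaNs or signed zeros must be honored, the UNSPEC_IEEE_MIN/MAX patterns
   are used instead of plain SMIN/SMAX so that the operand order is kept;
   MINSS/MAXSS return their second source operand when an input is a NaN,
   which is why interchanging the operands would not be IEEE safe. */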
23423 static bool
23424 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23425 rtx cmp_op1, rtx if_true, rtx if_false)
23427 machine_mode mode;
23428 bool is_min;
23429 rtx tmp;
23431 if (code == LT)
23433 else if (code == UNGE)
23434 std::swap (if_true, if_false);
23435 else
23436 return false;
23438 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23439 is_min = true;
23440 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23441 is_min = false;
23442 else
23443 return false;
23445 mode = GET_MODE (dest);
23447 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23448 but MODE may be a vector mode and thus not appropriate. */
23449 if (!flag_finite_math_only || flag_signed_zeros)
23451 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23452 rtvec v;
23454 if_true = force_reg (mode, if_true);
23455 v = gen_rtvec (2, if_true, if_false);
23456 tmp = gen_rtx_UNSPEC (mode, v, u);
23458 else
23460 code = is_min ? SMIN : SMAX;
23461 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23464 emit_insn (gen_rtx_SET (dest, tmp));
23465 return true;
23468 /* Expand an sse vector comparison. Return the register with the result. */
23470 static rtx
23471 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23472 rtx op_true, rtx op_false)
23474 machine_mode mode = GET_MODE (dest);
23475 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23477 /* In the general case the result of the comparison can differ from the operands' type. */
23478 machine_mode cmp_mode;
23480 /* In AVX512F the result of comparison is an integer mask. */
23481 bool maskcmp = false;
23482 rtx x;
23484 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23486 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23487 cmp_mode = int_mode_for_size (nbits, 0).require ();
23488 maskcmp = true;
23490 else
23491 cmp_mode = cmp_ops_mode;
23494 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23495 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23496 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23498 if (optimize
23499 || (maskcmp && cmp_mode != mode)
23500 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23501 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23502 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23504 /* Compare patterns for int modes are unspec in AVX512F only. */
23505 if (maskcmp && (code == GT || code == EQ))
23507 rtx (*gen)(rtx, rtx, rtx);
23509 switch (cmp_ops_mode)
23511 case E_V64QImode:
23512 gcc_assert (TARGET_AVX512BW);
23513 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23514 break;
23515 case E_V32HImode:
23516 gcc_assert (TARGET_AVX512BW);
23517 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23518 break;
23519 case E_V16SImode:
23520 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23521 break;
23522 case E_V8DImode:
23523 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23524 break;
23525 default:
23526 gen = NULL;
23529 if (gen)
23531 emit_insn (gen (dest, cmp_op0, cmp_op1));
23532 return dest;
23535 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23537 if (cmp_mode != mode && !maskcmp)
23539 x = force_reg (cmp_ops_mode, x);
23540 convert_move (dest, x, false);
23542 else
23543 emit_insn (gen_rtx_SET (dest, x));
23545 return dest;
23548 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23549 operations. This is used for both scalar and vector conditional moves. */
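/* In the absence of blend instructions the fallback at the end of the
   function below computes dest = (cmp AND op_true) OR (NOT cmp AND op_false);
   SSE4.1 blendv*, AVX2 vpblendvb and AVX-512 vblendm* are used instead
   when available. */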
23551 void
23552 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23554 machine_mode mode = GET_MODE (dest);
23555 machine_mode cmpmode = GET_MODE (cmp);
23557 /* In AVX512F the result of comparison is an integer mask. */
23558 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23560 rtx t2, t3, x;
23562 /* If we have an integer mask and an FP value then we need
23563 to cast the mask to FP mode. */
23564 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23566 cmp = force_reg (cmpmode, cmp);
23567 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23570 if (vector_all_ones_operand (op_true, mode)
23571 && rtx_equal_p (op_false, CONST0_RTX (mode))
23572 && !maskcmp)
23574 emit_insn (gen_rtx_SET (dest, cmp));
23576 else if (op_false == CONST0_RTX (mode)
23577 && !maskcmp)
23579 op_true = force_reg (mode, op_true);
23580 x = gen_rtx_AND (mode, cmp, op_true);
23581 emit_insn (gen_rtx_SET (dest, x));
23583 else if (op_true == CONST0_RTX (mode)
23584 && !maskcmp)
23586 op_false = force_reg (mode, op_false);
23587 x = gen_rtx_NOT (mode, cmp);
23588 x = gen_rtx_AND (mode, x, op_false);
23589 emit_insn (gen_rtx_SET (dest, x));
23591 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23592 && !maskcmp)
23594 op_false = force_reg (mode, op_false);
23595 x = gen_rtx_IOR (mode, cmp, op_false);
23596 emit_insn (gen_rtx_SET (dest, x));
23598 else if (TARGET_XOP
23599 && !maskcmp)
23601 op_true = force_reg (mode, op_true);
23603 if (!nonimmediate_operand (op_false, mode))
23604 op_false = force_reg (mode, op_false);
23606 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23607 op_true,
23608 op_false)));
23610 else
23612 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23613 rtx d = dest;
23615 if (!nonimmediate_operand (op_true, mode))
23616 op_true = force_reg (mode, op_true);
23618 op_false = force_reg (mode, op_false);
23620 switch (mode)
23622 case E_V4SFmode:
23623 if (TARGET_SSE4_1)
23624 gen = gen_sse4_1_blendvps;
23625 break;
23626 case E_V2DFmode:
23627 if (TARGET_SSE4_1)
23628 gen = gen_sse4_1_blendvpd;
23629 break;
23630 case E_V16QImode:
23631 case E_V8HImode:
23632 case E_V4SImode:
23633 case E_V2DImode:
23634 if (TARGET_SSE4_1)
23636 gen = gen_sse4_1_pblendvb;
23637 if (mode != V16QImode)
23638 d = gen_reg_rtx (V16QImode);
23639 op_false = gen_lowpart (V16QImode, op_false);
23640 op_true = gen_lowpart (V16QImode, op_true);
23641 cmp = gen_lowpart (V16QImode, cmp);
23643 break;
23644 case E_V8SFmode:
23645 if (TARGET_AVX)
23646 gen = gen_avx_blendvps256;
23647 break;
23648 case E_V4DFmode:
23649 if (TARGET_AVX)
23650 gen = gen_avx_blendvpd256;
23651 break;
23652 case E_V32QImode:
23653 case E_V16HImode:
23654 case E_V8SImode:
23655 case E_V4DImode:
23656 if (TARGET_AVX2)
23658 gen = gen_avx2_pblendvb;
23659 if (mode != V32QImode)
23660 d = gen_reg_rtx (V32QImode);
23661 op_false = gen_lowpart (V32QImode, op_false);
23662 op_true = gen_lowpart (V32QImode, op_true);
23663 cmp = gen_lowpart (V32QImode, cmp);
23665 break;
23667 case E_V64QImode:
23668 gen = gen_avx512bw_blendmv64qi;
23669 break;
23670 case E_V32HImode:
23671 gen = gen_avx512bw_blendmv32hi;
23672 break;
23673 case E_V16SImode:
23674 gen = gen_avx512f_blendmv16si;
23675 break;
23676 case E_V8DImode:
23677 gen = gen_avx512f_blendmv8di;
23678 break;
23679 case E_V8DFmode:
23680 gen = gen_avx512f_blendmv8df;
23681 break;
23682 case E_V16SFmode:
23683 gen = gen_avx512f_blendmv16sf;
23684 break;
23686 default:
23687 break;
23690 if (gen != NULL)
23692 emit_insn (gen (d, op_false, op_true, cmp));
23693 if (d != dest)
23694 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23696 else
23698 op_true = force_reg (mode, op_true);
23700 t2 = gen_reg_rtx (mode);
23701 if (optimize)
23702 t3 = gen_reg_rtx (mode);
23703 else
23704 t3 = dest;
23706 x = gen_rtx_AND (mode, op_true, cmp);
23707 emit_insn (gen_rtx_SET (t2, x));
23709 x = gen_rtx_NOT (mode, cmp);
23710 x = gen_rtx_AND (mode, x, op_false);
23711 emit_insn (gen_rtx_SET (t3, x));
23713 x = gen_rtx_IOR (mode, t3, t2);
23714 emit_insn (gen_rtx_SET (dest, x));
23719 /* Expand a floating-point conditional move. Return true if successful. */
23721 bool
23722 ix86_expand_fp_movcc (rtx operands[])
23724 machine_mode mode = GET_MODE (operands[0]);
23725 enum rtx_code code = GET_CODE (operands[1]);
23726 rtx tmp, compare_op;
23727 rtx op0 = XEXP (operands[1], 0);
23728 rtx op1 = XEXP (operands[1], 1);
23730 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23732 machine_mode cmode;
23734 /* Since we've no cmove for sse registers, don't force bad register
23735 allocation just to gain access to it. Deny movcc when the
23736 comparison mode doesn't match the move mode. */
23737 cmode = GET_MODE (op0);
23738 if (cmode == VOIDmode)
23739 cmode = GET_MODE (op1);
23740 if (cmode != mode)
23741 return false;
23743 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23744 if (code == UNKNOWN)
23745 return false;
23747 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23748 operands[2], operands[3]))
23749 return true;
23751 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23752 operands[2], operands[3]);
23753 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23754 return true;
23757 if (GET_MODE (op0) == TImode
23758 || (GET_MODE (op0) == DImode
23759 && !TARGET_64BIT))
23760 return false;
23762 /* The floating point conditional move instructions don't directly
23763 support conditions resulting from a signed integer comparison. */
23765 compare_op = ix86_expand_compare (code, op0, op1);
23766 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23768 tmp = gen_reg_rtx (QImode);
23769 ix86_expand_setcc (tmp, code, op0, op1);
23771 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23774 emit_insn (gen_rtx_SET (operands[0],
23775 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23776 operands[2], operands[3])));
23778 return true;
23781 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
23783 static int
23784 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23786 switch (code)
23788 case EQ:
23789 return 0;
23790 case LT:
23791 case LTU:
23792 return 1;
23793 case LE:
23794 case LEU:
23795 return 2;
23796 case NE:
23797 return 4;
23798 case GE:
23799 case GEU:
23800 return 5;
23801 case GT:
23802 case GTU:
23803 return 6;
23804 default:
23805 gcc_unreachable ();
23809 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
23811 static int
23812 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23814 switch (code)
23816 case EQ:
23817 return 0x00;
23818 case NE:
23819 return 0x04;
23820 case GT:
23821 return 0x0e;
23822 case LE:
23823 return 0x02;
23824 case GE:
23825 return 0x0d;
23826 case LT:
23827 return 0x01;
23828 case UNLE:
23829 return 0x0a;
23830 case UNLT:
23831 return 0x09;
23832 case UNGE:
23833 return 0x05;
23834 case UNGT:
23835 return 0x06;
23836 case UNEQ:
23837 return 0x18;
23838 case LTGT:
23839 return 0x0c;
23840 case ORDERED:
23841 return 0x07;
23842 case UNORDERED:
23843 return 0x03;
23844 default:
23845 gcc_unreachable ();
23849 /* Return immediate value to be used in UNSPEC_PCMP
23850 for comparison CODE in MODE. */
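/* These values appear to follow the VPCMP/VCMPPS immediate predicate
   encodings (e.g. 0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT, 6 = NLE for the
   integer forms); see the two helpers above. */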
23852 static int
23853 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23855 if (FLOAT_MODE_P (mode))
23856 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23857 return ix86_int_cmp_code_to_pcmp_immediate (code);
23860 /* Expand AVX-512 vector comparison. */
23862 bool
23863 ix86_expand_mask_vec_cmp (rtx operands[])
23865 machine_mode mask_mode = GET_MODE (operands[0]);
23866 machine_mode cmp_mode = GET_MODE (operands[2]);
23867 enum rtx_code code = GET_CODE (operands[1]);
23868 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23869 int unspec_code;
23870 rtx unspec;
23872 switch (code)
23874 case LEU:
23875 case GTU:
23876 case GEU:
23877 case LTU:
23878 unspec_code = UNSPEC_UNSIGNED_PCMP;
23879 break;
23881 default:
23882 unspec_code = UNSPEC_PCMP;
23885 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23886 operands[3], imm),
23887 unspec_code);
23888 emit_insn (gen_rtx_SET (operands[0], unspec));
23890 return true;
23893 /* Expand fp vector comparison. */
23895 bool
23896 ix86_expand_fp_vec_cmp (rtx operands[])
23898 enum rtx_code code = GET_CODE (operands[1]);
23899 rtx cmp;
23901 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23902 &operands[2], &operands[3]);
23903 if (code == UNKNOWN)
23905 rtx temp;
23906 switch (GET_CODE (operands[1]))
23908 case LTGT:
23909 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23910 operands[3], NULL, NULL);
23911 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23912 operands[3], NULL, NULL);
23913 code = AND;
23914 break;
23915 case UNEQ:
23916 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23917 operands[3], NULL, NULL);
23918 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23919 operands[3], NULL, NULL);
23920 code = IOR;
23921 break;
23922 default:
23923 gcc_unreachable ();
23925 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23926 OPTAB_DIRECT);
23928 else
23929 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
23930 operands[1], operands[2]);
23932 if (operands[0] != cmp)
23933 emit_move_insn (operands[0], cmp);
23935 return true;
23938 static rtx
23939 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
23940 rtx op_true, rtx op_false, bool *negate)
23942 machine_mode data_mode = GET_MODE (dest);
23943 machine_mode mode = GET_MODE (cop0);
23944 rtx x;
23946 *negate = false;
23948 /* XOP supports all of the comparisons on all 128-bit vector int types. */
23949 if (TARGET_XOP
23950 && (mode == V16QImode || mode == V8HImode
23951 || mode == V4SImode || mode == V2DImode))
23953 else
23955 /* Canonicalize the comparison to EQ, GT, GTU. */
23956 switch (code)
23958 case EQ:
23959 case GT:
23960 case GTU:
23961 break;
23963 case NE:
23964 case LE:
23965 case LEU:
23966 code = reverse_condition (code);
23967 *negate = true;
23968 break;
23970 case GE:
23971 case GEU:
23972 code = reverse_condition (code);
23973 *negate = true;
23974 /* FALLTHRU */
23976 case LT:
23977 case LTU:
23978 std::swap (cop0, cop1);
23979 code = swap_condition (code);
23980 break;
23982 default:
23983 gcc_unreachable ();
23986 /* Only SSE4.1/SSE4.2 support V2DImode. */
23987 if (mode == V2DImode)
23989 switch (code)
23991 case EQ:
23992 /* SSE4.1 supports EQ. */
23993 if (!TARGET_SSE4_1)
23994 return NULL;
23995 break;
23997 case GT:
23998 case GTU:
23999 /* SSE4.2 supports GT/GTU. */
24000 if (!TARGET_SSE4_2)
24001 return NULL;
24002 break;
24004 default:
24005 gcc_unreachable ();
24009 /* Unsigned parallel compare is not supported by the hardware.
24010 Play some tricks to turn this into a signed comparison
24011 against 0. */
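/* A sketch of the trick for SImode/DImode elements: "x GTU y" is equivalent
   to "(x - 0x80..0) GT (y - 0x80..0)", i.e. biasing both operands by the
   sign bit turns the unsigned comparison into a signed one.  The narrow
   element modes use a saturating subtraction instead (see below). */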
24012 if (code == GTU)
24014 cop0 = force_reg (mode, cop0);
24016 switch (mode)
24018 case E_V16SImode:
24019 case E_V8DImode:
24020 case E_V8SImode:
24021 case E_V4DImode:
24022 case E_V4SImode:
24023 case E_V2DImode:
24025 rtx t1, t2, mask;
24026 rtx (*gen_sub3) (rtx, rtx, rtx);
24028 switch (mode)
24030 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
24031 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
24032 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
24033 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
24034 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
24035 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
24036 default:
24037 gcc_unreachable ();
24039 /* Subtract (-(INT MAX) - 1) from both operands to make
24040 them signed. */
24041 mask = ix86_build_signbit_mask (mode, true, false);
24042 t1 = gen_reg_rtx (mode);
24043 emit_insn (gen_sub3 (t1, cop0, mask));
24045 t2 = gen_reg_rtx (mode);
24046 emit_insn (gen_sub3 (t2, cop1, mask));
24048 cop0 = t1;
24049 cop1 = t2;
24050 code = GT;
24052 break;
24054 case E_V64QImode:
24055 case E_V32HImode:
24056 case E_V32QImode:
24057 case E_V16HImode:
24058 case E_V16QImode:
24059 case E_V8HImode:
24060 /* Perform a parallel unsigned saturating subtraction. */
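/* Sketch of the idea: for byte/word elements "x GTU y" holds exactly when
   the unsigned saturating difference x -us y is nonzero, so the code below
   compares that difference against zero with EQ and relies on *negate to
   invert the result. */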
24061 x = gen_reg_rtx (mode);
24062 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24063 cop1)));
24065 cop0 = x;
24066 cop1 = CONST0_RTX (mode);
24067 code = EQ;
24068 *negate = !*negate;
24069 break;
24071 default:
24072 gcc_unreachable ();
24077 if (*negate)
24078 std::swap (op_true, op_false);
24080 /* Allow the comparison to be done in one mode, but the movcc to
24081 happen in another mode. */
24082 if (data_mode == mode)
24084 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24085 op_true, op_false);
24087 else
24089 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24090 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24091 op_true, op_false);
24092 if (GET_MODE (x) == mode)
24093 x = gen_lowpart (data_mode, x);
24096 return x;
24099 /* Expand integer vector comparison. */
24101 bool
24102 ix86_expand_int_vec_cmp (rtx operands[])
24104 rtx_code code = GET_CODE (operands[1]);
24105 bool negate = false;
24106 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24107 operands[3], NULL, NULL, &negate);
24109 if (!cmp)
24110 return false;
24112 if (negate)
24113 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24114 CONST0_RTX (GET_MODE (cmp)),
24115 NULL, NULL, &negate);
24117 gcc_assert (!negate);
24119 if (operands[0] != cmp)
24120 emit_move_insn (operands[0], cmp);
24122 return true;
24125 /* Expand a floating-point vector conditional move; a vcond operation
24126 rather than a movcc operation. */
24128 bool
24129 ix86_expand_fp_vcond (rtx operands[])
24131 enum rtx_code code = GET_CODE (operands[3]);
24132 rtx cmp;
24134 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24135 &operands[4], &operands[5]);
24136 if (code == UNKNOWN)
24138 rtx temp;
24139 switch (GET_CODE (operands[3]))
24141 case LTGT:
24142 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24143 operands[5], operands[0], operands[0]);
24144 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24145 operands[5], operands[1], operands[2]);
24146 code = AND;
24147 break;
24148 case UNEQ:
24149 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24150 operands[5], operands[0], operands[0]);
24151 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24152 operands[5], operands[1], operands[2]);
24153 code = IOR;
24154 break;
24155 default:
24156 gcc_unreachable ();
24158 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24159 OPTAB_DIRECT);
24160 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24161 return true;
24164 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24165 operands[5], operands[1], operands[2]))
24166 return true;
24168 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24169 operands[1], operands[2]);
24170 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24171 return true;
24174 /* Expand a signed/unsigned integral vector conditional move. */
24176 bool
24177 ix86_expand_int_vcond (rtx operands[])
24179 machine_mode data_mode = GET_MODE (operands[0]);
24180 machine_mode mode = GET_MODE (operands[4]);
24181 enum rtx_code code = GET_CODE (operands[3]);
24182 bool negate = false;
24183 rtx x, cop0, cop1;
24185 cop0 = operands[4];
24186 cop1 = operands[5];
24188 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24189 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24190 if ((code == LT || code == GE)
24191 && data_mode == mode
24192 && cop1 == CONST0_RTX (mode)
24193 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24194 && GET_MODE_UNIT_SIZE (data_mode) > 1
24195 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24196 && (GET_MODE_SIZE (data_mode) == 16
24197 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24199 rtx negop = operands[2 - (code == LT)];
24200 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24201 if (negop == CONST1_RTX (data_mode))
24203 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24204 operands[0], 1, OPTAB_DIRECT);
24205 if (res != operands[0])
24206 emit_move_insn (operands[0], res);
24207 return true;
24209 else if (GET_MODE_INNER (data_mode) != DImode
24210 && vector_all_ones_operand (negop, data_mode))
24212 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24213 operands[0], 0, OPTAB_DIRECT);
24214 if (res != operands[0])
24215 emit_move_insn (operands[0], res);
24216 return true;
24220 if (!nonimmediate_operand (cop1, mode))
24221 cop1 = force_reg (mode, cop1);
24222 if (!general_operand (operands[1], data_mode))
24223 operands[1] = force_reg (data_mode, operands[1]);
24224 if (!general_operand (operands[2], data_mode))
24225 operands[2] = force_reg (data_mode, operands[2]);
24227 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24228 operands[1], operands[2], &negate);
24230 if (!x)
24231 return false;
24233 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24234 operands[2-negate]);
24235 return true;
24238 /* AVX512F does support 64-byte integer vector operations,
24239 thus the longest vector we are faced with is V64QImode. */
24240 #define MAX_VECT_LEN 64
24242 struct expand_vec_perm_d
24244 rtx target, op0, op1;
24245 unsigned char perm[MAX_VECT_LEN];
24246 machine_mode vmode;
24247 unsigned char nelt;
24248 bool one_operand_p;
24249 bool testing_p;
24252 static bool
24253 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24254 struct expand_vec_perm_d *d)
24256 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24257 expander, so args are either in d, or in op0, op1 etc. */
24258 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24259 machine_mode maskmode = mode;
24260 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24262 switch (mode)
24264 case E_V8HImode:
24265 if (TARGET_AVX512VL && TARGET_AVX512BW)
24266 gen = gen_avx512vl_vpermt2varv8hi3;
24267 break;
24268 case E_V16HImode:
24269 if (TARGET_AVX512VL && TARGET_AVX512BW)
24270 gen = gen_avx512vl_vpermt2varv16hi3;
24271 break;
24272 case E_V64QImode:
24273 if (TARGET_AVX512VBMI)
24274 gen = gen_avx512bw_vpermt2varv64qi3;
24275 break;
24276 case E_V32HImode:
24277 if (TARGET_AVX512BW)
24278 gen = gen_avx512bw_vpermt2varv32hi3;
24279 break;
24280 case E_V4SImode:
24281 if (TARGET_AVX512VL)
24282 gen = gen_avx512vl_vpermt2varv4si3;
24283 break;
24284 case E_V8SImode:
24285 if (TARGET_AVX512VL)
24286 gen = gen_avx512vl_vpermt2varv8si3;
24287 break;
24288 case E_V16SImode:
24289 if (TARGET_AVX512F)
24290 gen = gen_avx512f_vpermt2varv16si3;
24291 break;
24292 case E_V4SFmode:
24293 if (TARGET_AVX512VL)
24295 gen = gen_avx512vl_vpermt2varv4sf3;
24296 maskmode = V4SImode;
24298 break;
24299 case E_V8SFmode:
24300 if (TARGET_AVX512VL)
24302 gen = gen_avx512vl_vpermt2varv8sf3;
24303 maskmode = V8SImode;
24305 break;
24306 case E_V16SFmode:
24307 if (TARGET_AVX512F)
24309 gen = gen_avx512f_vpermt2varv16sf3;
24310 maskmode = V16SImode;
24312 break;
24313 case E_V2DImode:
24314 if (TARGET_AVX512VL)
24315 gen = gen_avx512vl_vpermt2varv2di3;
24316 break;
24317 case E_V4DImode:
24318 if (TARGET_AVX512VL)
24319 gen = gen_avx512vl_vpermt2varv4di3;
24320 break;
24321 case E_V8DImode:
24322 if (TARGET_AVX512F)
24323 gen = gen_avx512f_vpermt2varv8di3;
24324 break;
24325 case E_V2DFmode:
24326 if (TARGET_AVX512VL)
24328 gen = gen_avx512vl_vpermt2varv2df3;
24329 maskmode = V2DImode;
24331 break;
24332 case E_V4DFmode:
24333 if (TARGET_AVX512VL)
24335 gen = gen_avx512vl_vpermt2varv4df3;
24336 maskmode = V4DImode;
24338 break;
24339 case E_V8DFmode:
24340 if (TARGET_AVX512F)
24342 gen = gen_avx512f_vpermt2varv8df3;
24343 maskmode = V8DImode;
24345 break;
24346 default:
24347 break;
24350 if (gen == NULL)
24351 return false;
24353 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24354 expander, so args are either in d, or in op0, op1 etc. */
24355 if (d)
24357 rtx vec[64];
24358 target = d->target;
24359 op0 = d->op0;
24360 op1 = d->op1;
24361 for (int i = 0; i < d->nelt; ++i)
24362 vec[i] = GEN_INT (d->perm[i]);
24363 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24366 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24367 return true;
24370 /* Expand a variable vector permutation. */
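/* That is, roughly dest[i] = { op0, op1 }[mask[i]], where MASK is itself a
   vector rather than a compile-time constant.  As the masking below makes
   explicit, each index is taken modulo the number of elements (or twice
   that with XOP's three-input VPPERM). */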
24372 void
24373 ix86_expand_vec_perm (rtx operands[])
24375 rtx target = operands[0];
24376 rtx op0 = operands[1];
24377 rtx op1 = operands[2];
24378 rtx mask = operands[3];
24379 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24380 machine_mode mode = GET_MODE (op0);
24381 machine_mode maskmode = GET_MODE (mask);
24382 int w, e, i;
24383 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24385 /* Number of elements in the vector. */
24386 w = GET_MODE_NUNITS (mode);
24387 e = GET_MODE_UNIT_SIZE (mode);
24388 gcc_assert (w <= 64);
24390 if (TARGET_AVX512F && one_operand_shuffle)
24392 rtx (*gen) (rtx, rtx, rtx) = NULL;
24393 switch (mode)
24395 case E_V16SImode:
24396 gen = gen_avx512f_permvarv16si;
24397 break;
24398 case E_V16SFmode:
24399 gen = gen_avx512f_permvarv16sf;
24400 break;
24401 case E_V8DImode:
24402 gen = gen_avx512f_permvarv8di;
24403 break;
24404 case E_V8DFmode:
24405 gen = gen_avx512f_permvarv8df;
24406 break;
24407 default:
24408 break;
24410 if (gen != NULL)
24412 emit_insn (gen (target, op0, mask));
24413 return;
24417 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24418 return;
24420 if (TARGET_AVX2)
24422 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24424 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24425 a constant shuffle operand. With a tiny bit of effort we can
24426 use VPERMD instead. A re-interpretation stall for V4DFmode is
24427 unfortunate but there's no avoiding it.
24428 Similarly for V16HImode we don't have instructions for variable
24429 shuffling, while for V32QImode we can, after preparing suitable
24430 masks, use vpshufb; vpshufb; vpermq; vpor. */
24432 if (mode == V16HImode)
24434 maskmode = mode = V32QImode;
24435 w = 32;
24436 e = 1;
24438 else
24440 maskmode = mode = V8SImode;
24441 w = 8;
24442 e = 4;
24444 t1 = gen_reg_rtx (maskmode);
24446 /* Replicate the low bits of the V4DImode mask into V8SImode:
24447 mask = { A B C D }
24448 t1 = { A A B B C C D D }. */
24449 for (i = 0; i < w / 2; ++i)
24450 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24451 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24452 vt = force_reg (maskmode, vt);
24453 mask = gen_lowpart (maskmode, mask);
24454 if (maskmode == V8SImode)
24455 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24456 else
24457 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24459 /* Multiply the shuffle indices by two. */
24460 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24461 OPTAB_DIRECT);
24463 /* Add one to the odd shuffle indices:
24464 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24465 for (i = 0; i < w / 2; ++i)
24467 vec[i * 2] = const0_rtx;
24468 vec[i * 2 + 1] = const1_rtx;
24470 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24471 vt = validize_mem (force_const_mem (maskmode, vt));
24472 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24473 OPTAB_DIRECT);
24475 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24476 operands[3] = mask = t1;
24477 target = gen_reg_rtx (mode);
24478 op0 = gen_lowpart (mode, op0);
24479 op1 = gen_lowpart (mode, op1);
24482 switch (mode)
24484 case E_V8SImode:
24485 /* The VPERMD and VPERMPS instructions already properly ignore
24486 the high bits of the shuffle elements. No need for us to
24487 perform an AND ourselves. */
24488 if (one_operand_shuffle)
24490 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24491 if (target != operands[0])
24492 emit_move_insn (operands[0],
24493 gen_lowpart (GET_MODE (operands[0]), target));
24495 else
24497 t1 = gen_reg_rtx (V8SImode);
24498 t2 = gen_reg_rtx (V8SImode);
24499 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24500 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24501 goto merge_two;
24503 return;
24505 case E_V8SFmode:
24506 mask = gen_lowpart (V8SImode, mask);
24507 if (one_operand_shuffle)
24508 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24509 else
24511 t1 = gen_reg_rtx (V8SFmode);
24512 t2 = gen_reg_rtx (V8SFmode);
24513 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24514 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24515 goto merge_two;
24517 return;
24519 case E_V4SImode:
24520 /* By combining the two 128-bit input vectors into one 256-bit
24521 input vector, we can use VPERMD and VPERMPS for the full
24522 two-operand shuffle. */
24523 t1 = gen_reg_rtx (V8SImode);
24524 t2 = gen_reg_rtx (V8SImode);
24525 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24526 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24527 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24528 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24529 return;
24531 case E_V4SFmode:
24532 t1 = gen_reg_rtx (V8SFmode);
24533 t2 = gen_reg_rtx (V8SImode);
24534 mask = gen_lowpart (V4SImode, mask);
24535 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24536 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24537 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24538 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24539 return;
24541 case E_V32QImode:
24542 t1 = gen_reg_rtx (V32QImode);
24543 t2 = gen_reg_rtx (V32QImode);
24544 t3 = gen_reg_rtx (V32QImode);
24545 vt2 = GEN_INT (-128);
24546 vt = gen_const_vec_duplicate (V32QImode, vt2);
24547 vt = force_reg (V32QImode, vt);
24548 for (i = 0; i < 32; i++)
24549 vec[i] = i < 16 ? vt2 : const0_rtx;
24550 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24551 vt2 = force_reg (V32QImode, vt2);
24552 /* From mask create two adjusted masks, which contain the same
24553 bits as mask in the low 7 bits of each vector element.
24554 The first mask will have the most significant bit clear
24555 if it requests element from the same 128-bit lane
24556 and MSB set if it requests element from the other 128-bit lane.
24557 The second mask will have the opposite values of the MSB,
24558 and additionally will have its 128-bit lanes swapped.
24559 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24560 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24561 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24562 stands for other 12 bytes. */
24563 /* The bit that tells whether an element comes from the same lane or the
24564 other lane is bit 4, so shift it up by 3 to the MSB position. */
24565 t5 = gen_reg_rtx (V4DImode);
24566 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24567 GEN_INT (3)));
24568 /* Clear MSB bits from the mask just in case it had them set. */
24569 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24570 /* After this t1 will have MSB set for elements from other lane. */
24571 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24572 /* Clear bits other than MSB. */
24573 emit_insn (gen_andv32qi3 (t1, t1, vt));
24574 /* Or in the lower bits from mask into t3. */
24575 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24576 /* And invert MSB bits in t1, so MSB is set for elements from the same
24577 lane. */
24578 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24579 /* Swap 128-bit lanes in t3. */
24580 t6 = gen_reg_rtx (V4DImode);
24581 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24582 const2_rtx, GEN_INT (3),
24583 const0_rtx, const1_rtx));
24584 /* And or in the lower bits from mask into t1. */
24585 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24586 if (one_operand_shuffle)
24588 /* Each of these shuffles will put 0s in places where
24589 element from the other 128-bit lane is needed, otherwise
24590 will shuffle in the requested value. */
24591 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24592 gen_lowpart (V32QImode, t6)));
24593 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24594 /* For t3 the 128-bit lanes are swapped again. */
24595 t7 = gen_reg_rtx (V4DImode);
24596 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24597 const2_rtx, GEN_INT (3),
24598 const0_rtx, const1_rtx));
24599 /* And oring both together leads to the result. */
24600 emit_insn (gen_iorv32qi3 (target, t1,
24601 gen_lowpart (V32QImode, t7)));
24602 if (target != operands[0])
24603 emit_move_insn (operands[0],
24604 gen_lowpart (GET_MODE (operands[0]), target));
24605 return;
24608 t4 = gen_reg_rtx (V32QImode);
24609 /* Similar to the one_operand_shuffle code above, just repeated twice
24610 for each operand; the merge_two: code below will merge the two
24611 results together. */
24612 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24613 gen_lowpart (V32QImode, t6)));
24614 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24615 gen_lowpart (V32QImode, t6)));
24616 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24617 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24618 t7 = gen_reg_rtx (V4DImode);
24619 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24620 const2_rtx, GEN_INT (3),
24621 const0_rtx, const1_rtx));
24622 t8 = gen_reg_rtx (V4DImode);
24623 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24624 const2_rtx, GEN_INT (3),
24625 const0_rtx, const1_rtx));
24626 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24627 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24628 t1 = t4;
24629 t2 = t3;
24630 goto merge_two;
24632 default:
24633 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24634 break;
24638 if (TARGET_XOP)
24640 /* The XOP VPPERM insn supports three inputs. By ignoring the
24641 one_operand_shuffle special case, we avoid creating another
24642 set of constant vectors in memory. */
24643 one_operand_shuffle = false;
24645 /* mask = mask & {2*w-1, ...} */
24646 vt = GEN_INT (2*w - 1);
24648 else
24650 /* mask = mask & {w-1, ...} */
24651 vt = GEN_INT (w - 1);
24654 vt = gen_const_vec_duplicate (maskmode, vt);
24655 mask = expand_simple_binop (maskmode, AND, mask, vt,
24656 NULL_RTX, 0, OPTAB_DIRECT);
24658 /* For non-QImode operations, convert the word permutation control
24659 into a byte permutation control. */
24660 if (mode != V16QImode)
24662 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24663 GEN_INT (exact_log2 (e)),
24664 NULL_RTX, 0, OPTAB_DIRECT);
24666 /* Convert mask to vector of chars. */
24667 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24669 /* Replicate each of the input bytes into byte positions:
24670 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24671 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24672 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24673 for (i = 0; i < 16; ++i)
24674 vec[i] = GEN_INT (i/e * e);
24675 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24676 vt = validize_mem (force_const_mem (V16QImode, vt));
24677 if (TARGET_XOP)
24678 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24679 else
24680 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24682 /* Convert it into the byte positions by doing
24683 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24684 for (i = 0; i < 16; ++i)
24685 vec[i] = GEN_INT (i % e);
24686 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24687 vt = validize_mem (force_const_mem (V16QImode, vt));
24688 emit_insn (gen_addv16qi3 (mask, mask, vt));
24691 /* The actual shuffle operations all operate on V16QImode. */
24692 op0 = gen_lowpart (V16QImode, op0);
24693 op1 = gen_lowpart (V16QImode, op1);
24695 if (TARGET_XOP)
24697 if (GET_MODE (target) != V16QImode)
24698 target = gen_reg_rtx (V16QImode);
24699 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24700 if (target != operands[0])
24701 emit_move_insn (operands[0],
24702 gen_lowpart (GET_MODE (operands[0]), target));
24704 else if (one_operand_shuffle)
24706 if (GET_MODE (target) != V16QImode)
24707 target = gen_reg_rtx (V16QImode);
24708 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24709 if (target != operands[0])
24710 emit_move_insn (operands[0],
24711 gen_lowpart (GET_MODE (operands[0]), target));
24713 else
24715 rtx xops[6];
24716 bool ok;
24718 /* Shuffle the two input vectors independently. */
24719 t1 = gen_reg_rtx (V16QImode);
24720 t2 = gen_reg_rtx (V16QImode);
24721 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24722 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24724 merge_two:
24725 /* Then merge them together. The key is whether any given control
24726 element contained a bit set that indicates the second word. */
24727 mask = operands[3];
24728 vt = GEN_INT (w);
24729 if (maskmode == V2DImode && !TARGET_SSE4_1)
24731 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24732 more shuffle to convert the V2DI input mask into a V4SI
24733 input mask. At that point the masking that expand_int_vcond
24734 performs will work as desired. */
24735 rtx t3 = gen_reg_rtx (V4SImode);
24736 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24737 const0_rtx, const0_rtx,
24738 const2_rtx, const2_rtx));
24739 mask = t3;
24740 maskmode = V4SImode;
24741 e = w = 4;
24744 vt = gen_const_vec_duplicate (maskmode, vt);
24745 vt = force_reg (maskmode, vt);
24746 mask = expand_simple_binop (maskmode, AND, mask, vt,
24747 NULL_RTX, 0, OPTAB_DIRECT);
24749 if (GET_MODE (target) != mode)
24750 target = gen_reg_rtx (mode);
24751 xops[0] = target;
24752 xops[1] = gen_lowpart (mode, t2);
24753 xops[2] = gen_lowpart (mode, t1);
24754 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24755 xops[4] = mask;
24756 xops[5] = vt;
24757 ok = ix86_expand_int_vcond (xops);
24758 gcc_assert (ok);
24759 if (target != operands[0])
24760 emit_move_insn (operands[0],
24761 gen_lowpart (GET_MODE (operands[0]), target));
24765 /* Unpack SRC into the next wider integer vector type, storing the result
24766 in DEST. UNSIGNED_P is true if we should do zero extension, else sign
24767 extension. HIGH_P is true if we want the N/2 high elements, else the low elements. */
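/* For example, a V16QImode SRC with HIGH_P false yields the low eight
   elements widened to V8HImode, zero- or sign-extended according to
   UNSIGNED_P. */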
24769 void
24770 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24772 machine_mode imode = GET_MODE (src);
24773 rtx tmp;
24775 if (TARGET_SSE4_1)
24777 rtx (*unpack)(rtx, rtx);
24778 rtx (*extract)(rtx, rtx) = NULL;
24779 machine_mode halfmode = BLKmode;
24781 switch (imode)
24783 case E_V64QImode:
24784 if (unsigned_p)
24785 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24786 else
24787 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24788 halfmode = V32QImode;
24789 extract
24790 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24791 break;
24792 case E_V32QImode:
24793 if (unsigned_p)
24794 unpack = gen_avx2_zero_extendv16qiv16hi2;
24795 else
24796 unpack = gen_avx2_sign_extendv16qiv16hi2;
24797 halfmode = V16QImode;
24798 extract
24799 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24800 break;
24801 case E_V32HImode:
24802 if (unsigned_p)
24803 unpack = gen_avx512f_zero_extendv16hiv16si2;
24804 else
24805 unpack = gen_avx512f_sign_extendv16hiv16si2;
24806 halfmode = V16HImode;
24807 extract
24808 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24809 break;
24810 case E_V16HImode:
24811 if (unsigned_p)
24812 unpack = gen_avx2_zero_extendv8hiv8si2;
24813 else
24814 unpack = gen_avx2_sign_extendv8hiv8si2;
24815 halfmode = V8HImode;
24816 extract
24817 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24818 break;
24819 case E_V16SImode:
24820 if (unsigned_p)
24821 unpack = gen_avx512f_zero_extendv8siv8di2;
24822 else
24823 unpack = gen_avx512f_sign_extendv8siv8di2;
24824 halfmode = V8SImode;
24825 extract
24826 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24827 break;
24828 case E_V8SImode:
24829 if (unsigned_p)
24830 unpack = gen_avx2_zero_extendv4siv4di2;
24831 else
24832 unpack = gen_avx2_sign_extendv4siv4di2;
24833 halfmode = V4SImode;
24834 extract
24835 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24836 break;
24837 case E_V16QImode:
24838 if (unsigned_p)
24839 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24840 else
24841 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24842 break;
24843 case E_V8HImode:
24844 if (unsigned_p)
24845 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24846 else
24847 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24848 break;
24849 case E_V4SImode:
24850 if (unsigned_p)
24851 unpack = gen_sse4_1_zero_extendv2siv2di2;
24852 else
24853 unpack = gen_sse4_1_sign_extendv2siv2di2;
24854 break;
24855 default:
24856 gcc_unreachable ();
24859 if (GET_MODE_SIZE (imode) >= 32)
24861 tmp = gen_reg_rtx (halfmode);
24862 emit_insn (extract (tmp, src));
24864 else if (high_p)
24866 /* Shift higher 8 bytes to lower 8 bytes. */
24867 tmp = gen_reg_rtx (V1TImode);
24868 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24869 GEN_INT (64)));
24870 tmp = gen_lowpart (imode, tmp);
24872 else
24873 tmp = src;
24875 emit_insn (unpack (dest, tmp));
24877 else
24879 rtx (*unpack)(rtx, rtx, rtx);
24881 switch (imode)
24883 case E_V16QImode:
24884 if (high_p)
24885 unpack = gen_vec_interleave_highv16qi;
24886 else
24887 unpack = gen_vec_interleave_lowv16qi;
24888 break;
24889 case E_V8HImode:
24890 if (high_p)
24891 unpack = gen_vec_interleave_highv8hi;
24892 else
24893 unpack = gen_vec_interleave_lowv8hi;
24894 break;
24895 case E_V4SImode:
24896 if (high_p)
24897 unpack = gen_vec_interleave_highv4si;
24898 else
24899 unpack = gen_vec_interleave_lowv4si;
24900 break;
24901 default:
24902 gcc_unreachable ();
24905 if (unsigned_p)
24906 tmp = force_reg (imode, CONST0_RTX (imode));
24907 else
24908 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24909 src, pc_rtx, pc_rtx);
24911 rtx tmp2 = gen_reg_rtx (imode);
24912 emit_insn (unpack (tmp2, src, tmp));
24913 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
24917 /* Expand conditional increment or decrement using adc/sbb instructions.
24918 The default case using setcc followed by the conditional move can be
24919 done by generic code. */
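/* A rough sketch of the intended output (AT&T syntax, register names
   illustrative): for an unsigned "x = a < b ? x + 1 : x" the comparison
   sets the carry flag and the statement becomes "cmp b, a; adc $0, x";
   the decrement case uses sbb instead. */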
24920 bool
24921 ix86_expand_int_addcc (rtx operands[])
24923 enum rtx_code code = GET_CODE (operands[1]);
24924 rtx flags;
24925 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
24926 rtx compare_op;
24927 rtx val = const0_rtx;
24928 bool fpcmp = false;
24929 machine_mode mode;
24930 rtx op0 = XEXP (operands[1], 0);
24931 rtx op1 = XEXP (operands[1], 1);
24933 if (operands[3] != const1_rtx
24934 && operands[3] != constm1_rtx)
24935 return false;
24936 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24937 return false;
24938 code = GET_CODE (compare_op);
24940 flags = XEXP (compare_op, 0);
24942 if (GET_MODE (flags) == CCFPmode)
24944 fpcmp = true;
24945 code = ix86_fp_compare_code_to_integer (code);
24948 if (code != LTU)
24950 val = constm1_rtx;
24951 if (fpcmp)
24952 PUT_CODE (compare_op,
24953 reverse_condition_maybe_unordered
24954 (GET_CODE (compare_op)));
24955 else
24956 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
24959 mode = GET_MODE (operands[0]);
24961 /* Construct either adc or sbb insn. */
24962 if ((code == LTU) == (operands[3] == constm1_rtx))
24964 switch (mode)
24966 case E_QImode:
24967 insn = gen_subqi3_carry;
24968 break;
24969 case E_HImode:
24970 insn = gen_subhi3_carry;
24971 break;
24972 case E_SImode:
24973 insn = gen_subsi3_carry;
24974 break;
24975 case E_DImode:
24976 insn = gen_subdi3_carry;
24977 break;
24978 default:
24979 gcc_unreachable ();
24982 else
24984 switch (mode)
24986 case E_QImode:
24987 insn = gen_addqi3_carry;
24988 break;
24989 case E_HImode:
24990 insn = gen_addhi3_carry;
24991 break;
24992 case E_SImode:
24993 insn = gen_addsi3_carry;
24994 break;
24995 case E_DImode:
24996 insn = gen_adddi3_carry;
24997 break;
24998 default:
24999 gcc_unreachable ();
25002 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25004 return true;
25008 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25009 but works for floating point parameters and non-offsettable memories.
25010 For pushes, it returns just stack offsets; the values will be saved
25011 in the right order. At most four parts are generated. */
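/* For instance, on ia32 a DFmode operand comes back as two SImode parts
   and an XFmode operand as three, while for 64-bit targets a TFmode
   operand splits into two DImode parts. */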
25013 static int
25014 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25016 int size;
25018 if (!TARGET_64BIT)
25019 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25020 else
25021 size = (GET_MODE_SIZE (mode) + 4) / 8;
25023 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25024 gcc_assert (size >= 2 && size <= 4);
25026 /* Optimize constant pool reference to immediates. This is used by fp
25027 moves, that force all constants to memory to allow combining. */
25028 if (MEM_P (operand) && MEM_READONLY_P (operand))
25029 operand = avoid_constant_pool_reference (operand);
25031 if (MEM_P (operand) && !offsettable_memref_p (operand))
25033 /* The only non-offsettable memories we handle are pushes. */
25034 int ok = push_operand (operand, VOIDmode);
25036 gcc_assert (ok);
25038 operand = copy_rtx (operand);
25039 PUT_MODE (operand, word_mode);
25040 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25041 return size;
25044 if (GET_CODE (operand) == CONST_VECTOR)
25046 scalar_int_mode imode = int_mode_for_mode (mode).require ();
25047 /* Caution: if we looked through a constant pool memory above,
25048 the operand may actually have a different mode now. That's
25049 ok, since we want to pun this all the way back to an integer. */
25050 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25051 gcc_assert (operand != NULL);
25052 mode = imode;
25055 if (!TARGET_64BIT)
25057 if (mode == DImode)
25058 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25059 else
25061 int i;
25063 if (REG_P (operand))
25065 gcc_assert (reload_completed);
25066 for (i = 0; i < size; i++)
25067 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25069 else if (offsettable_memref_p (operand))
25071 operand = adjust_address (operand, SImode, 0);
25072 parts[0] = operand;
25073 for (i = 1; i < size; i++)
25074 parts[i] = adjust_address (operand, SImode, 4 * i);
25076 else if (CONST_DOUBLE_P (operand))
25078 const REAL_VALUE_TYPE *r;
25079 long l[4];
25081 r = CONST_DOUBLE_REAL_VALUE (operand);
25082 switch (mode)
25084 case E_TFmode:
25085 real_to_target (l, r, mode);
25086 parts[3] = gen_int_mode (l[3], SImode);
25087 parts[2] = gen_int_mode (l[2], SImode);
25088 break;
25089 case E_XFmode:
25090 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25091 long double may not be 80-bit. */
25092 real_to_target (l, r, mode);
25093 parts[2] = gen_int_mode (l[2], SImode);
25094 break;
25095 case E_DFmode:
25096 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25097 break;
25098 default:
25099 gcc_unreachable ();
25101 parts[1] = gen_int_mode (l[1], SImode);
25102 parts[0] = gen_int_mode (l[0], SImode);
25104 else
25105 gcc_unreachable ();
25108 else
25110 if (mode == TImode)
25111 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25112 if (mode == XFmode || mode == TFmode)
25114 machine_mode upper_mode = mode == XFmode ? SImode : DImode;
25115 if (REG_P (operand))
25117 gcc_assert (reload_completed);
25118 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25119 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25121 else if (offsettable_memref_p (operand))
25123 operand = adjust_address (operand, DImode, 0);
25124 parts[0] = operand;
25125 parts[1] = adjust_address (operand, upper_mode, 8);
25127 else if (CONST_DOUBLE_P (operand))
25129 long l[4];
25131 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25133 /* real_to_target puts 32-bit pieces in each long. */
25134 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25135 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25136 << 32), DImode);
25138 if (upper_mode == SImode)
25139 parts[1] = gen_int_mode (l[2], SImode);
25140 else
25141 parts[1]
25142 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25143 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25144 << 32), DImode);
25146 else
25147 gcc_unreachable ();
25151 return size;
25154 /* Emit insns to perform a move or push of DI, DF, XF, and TF values;
25155 all required insns are emitted here rather than returned to the caller.
25156 The destination parts are staged in operands 2-5 and the source parts
25157 in operands 6-9, in the correct order. */
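/* For example, an ia32 DImode move becomes two SImode moves, emitted in
   whichever order avoids clobbering a source address register that
   overlaps the destination. */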
25159 void
25160 ix86_split_long_move (rtx operands[])
25162 rtx part[2][4];
25163 int nparts, i, j;
25164 int push = 0;
25165 int collisions = 0;
25166 machine_mode mode = GET_MODE (operands[0]);
25167 bool collisionparts[4];
25169 /* The DFmode expanders may ask us to move a double.
25170 For a 64-bit target this is a single move. By hiding that fact
25171 here we simplify the i386.md splitters. */
25172 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25174 /* Optimize constant pool reference to immediates. This is used by
25175 fp moves, that force all constants to memory to allow combining. */
25177 if (MEM_P (operands[1])
25178 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25179 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25180 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25181 if (push_operand (operands[0], VOIDmode))
25183 operands[0] = copy_rtx (operands[0]);
25184 PUT_MODE (operands[0], word_mode);
25186 else
25187 operands[0] = gen_lowpart (DImode, operands[0]);
25188 operands[1] = gen_lowpart (DImode, operands[1]);
25189 emit_move_insn (operands[0], operands[1]);
25190 return;
25193 /* The only non-offsettable memory we handle is push. */
25194 if (push_operand (operands[0], VOIDmode))
25195 push = 1;
25196 else
25197 gcc_assert (!MEM_P (operands[0])
25198 || offsettable_memref_p (operands[0]));
25200 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25201 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25203 /* When emitting a push, take care of source operands living on the stack. */
25204 if (push && MEM_P (operands[1])
25205 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25207 rtx src_base = XEXP (part[1][nparts - 1], 0);
25209 /* Compensate for the stack decrement by 4. */
25210 if (!TARGET_64BIT && nparts == 3
25211 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25212 src_base = plus_constant (Pmode, src_base, 4);
25214 /* src_base refers to the stack pointer and is
25215 automatically decreased by emitted push. */
25216 for (i = 0; i < nparts; i++)
25217 part[1][i] = change_address (part[1][i],
25218 GET_MODE (part[1][i]), src_base);
25221 /* We need to do copy in the right order in case an address register
25222 of the source overlaps the destination. */
25223 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25225 rtx tmp;
25227 for (i = 0; i < nparts; i++)
25229 collisionparts[i]
25230 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25231 if (collisionparts[i])
25232 collisions++;
25235 /* Collision in the middle part can be handled by reordering. */
25236 if (collisions == 1 && nparts == 3 && collisionparts [1])
25238 std::swap (part[0][1], part[0][2]);
25239 std::swap (part[1][1], part[1][2]);
25241 else if (collisions == 1
25242 && nparts == 4
25243 && (collisionparts [1] || collisionparts [2]))
25245 if (collisionparts [1])
25247 std::swap (part[0][1], part[0][2]);
25248 std::swap (part[1][1], part[1][2]);
25250 else
25252 std::swap (part[0][2], part[0][3]);
25253 std::swap (part[1][2], part[1][3]);
25257 /* If there are more collisions, we can't handle it by reordering.
25258 Do an lea to the last part and use only one colliding move. */
25259 else if (collisions > 1)
25261 rtx base, addr;
25263 collisions = 1;
25265 base = part[0][nparts - 1];
25267 /* Handle the case when the last part isn't valid for lea.
25268 Happens in 64-bit mode storing the 12-byte XFmode. */
25269 if (GET_MODE (base) != Pmode)
25270 base = gen_rtx_REG (Pmode, REGNO (base));
25272 addr = XEXP (part[1][0], 0);
25273 if (TARGET_TLS_DIRECT_SEG_REFS)
25275 struct ix86_address parts;
25276 int ok = ix86_decompose_address (addr, &parts);
25277 gcc_assert (ok);
25278 /* It is not valid to use %gs: or %fs: in lea. */
25279 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25281 emit_insn (gen_rtx_SET (base, addr));
25282 part[1][0] = replace_equiv_address (part[1][0], base);
25283 for (i = 1; i < nparts; i++)
25285 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25286 part[1][i] = replace_equiv_address (part[1][i], tmp);
25291 if (push)
25293 if (!TARGET_64BIT)
25295 if (nparts == 3)
25297 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25298 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25299 stack_pointer_rtx, GEN_INT (-4)));
25300 emit_move_insn (part[0][2], part[1][2]);
25302 else if (nparts == 4)
25304 emit_move_insn (part[0][3], part[1][3]);
25305 emit_move_insn (part[0][2], part[1][2]);
25308 else
25310 /* In 64-bit mode we don't have a 32-bit push available. If the operand is
25311 a register, that is OK - we just use the larger counterpart. We also
25312 retype memory operands - these come from an attempt to avoid a REX prefix
25313 when moving the second half of a TFmode value. */
25314 if (GET_MODE (part[1][1]) == SImode)
25316 switch (GET_CODE (part[1][1]))
25318 case MEM:
25319 part[1][1] = adjust_address (part[1][1], DImode, 0);
25320 break;
25322 case REG:
25323 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25324 break;
25326 default:
25327 gcc_unreachable ();
25330 if (GET_MODE (part[1][0]) == SImode)
25331 part[1][0] = part[1][1];
25334 emit_move_insn (part[0][1], part[1][1]);
25335 emit_move_insn (part[0][0], part[1][0]);
25336 return;
25339 /* Choose correct order to not overwrite the source before it is copied. */
25340 if ((REG_P (part[0][0])
25341 && REG_P (part[1][1])
25342 && (REGNO (part[0][0]) == REGNO (part[1][1])
25343 || (nparts == 3
25344 && REGNO (part[0][0]) == REGNO (part[1][2]))
25345 || (nparts == 4
25346 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25347 || (collisions > 0
25348 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25350 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25352 operands[2 + i] = part[0][j];
25353 operands[6 + i] = part[1][j];
25356 else
25358 for (i = 0; i < nparts; i++)
25360 operands[2 + i] = part[0][i];
25361 operands[6 + i] = part[1][i];
25365 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25366 if (optimize_insn_for_size_p ())
25368 for (j = 0; j < nparts - 1; j++)
25369 if (CONST_INT_P (operands[6 + j])
25370 && operands[6 + j] != const0_rtx
25371 && REG_P (operands[2 + j]))
25372 for (i = j; i < nparts - 1; i++)
25373 if (CONST_INT_P (operands[7 + i])
25374 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25375 operands[7 + i] = operands[2 + j];
25378 for (i = 0; i < nparts; i++)
25379 emit_move_insn (operands[2 + i], operands[6 + i]);
25381 return;
25384 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25385 left shift by a constant, either using a single shift or
25386 a sequence of add instructions. */
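/* Sketch: a shift left by 2 of a register is emitted as two self-adds
   (each add doubles the value) when two adds are no more expensive than
   one immediate shift and we are not optimizing for size; otherwise a
   single shift-by-2 is used. */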
25388 static void
25389 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25391 rtx (*insn)(rtx, rtx, rtx);
25393 if (count == 1
25394 || (count * ix86_cost->add <= ix86_cost->shift_const
25395 && !optimize_insn_for_size_p ()))
25397 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25398 while (count-- > 0)
25399 emit_insn (insn (operand, operand, operand));
25401 else
25403 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25404 emit_insn (insn (operand, operand, GEN_INT (count)));
25408 void
25409 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25411 rtx (*gen_ashl3)(rtx, rtx, rtx);
25412 rtx (*gen_shld)(rtx, rtx, rtx);
25413 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25415 rtx low[2], high[2];
25416 int count;
25418 if (CONST_INT_P (operands[2]))
25420 split_double_mode (mode, operands, 2, low, high);
25421 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25423 if (count >= half_width)
25425 emit_move_insn (high[0], low[1]);
25426 emit_move_insn (low[0], const0_rtx);
25428 if (count > half_width)
25429 ix86_expand_ashl_const (high[0], count - half_width, mode);
25431 else
25433 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25435 if (!rtx_equal_p (operands[0], operands[1]))
25436 emit_move_insn (operands[0], operands[1]);
25438 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25439 ix86_expand_ashl_const (low[0], count, mode);
25441 return;
25444 split_double_mode (mode, operands, 1, low, high);
25446 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25448 if (operands[1] == const1_rtx)
25450 /* Assuming we've chosen QImode-capable registers, 1 << N
25451 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25452 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25454 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25456 ix86_expand_clear (low[0]);
25457 ix86_expand_clear (high[0]);
25458 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25460 d = gen_lowpart (QImode, low[0]);
25461 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25462 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25463 emit_insn (gen_rtx_SET (d, s));
25465 d = gen_lowpart (QImode, high[0]);
25466 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25467 s = gen_rtx_NE (QImode, flags, const0_rtx);
25468 emit_insn (gen_rtx_SET (d, s));
25471 /* Otherwise, we can get the same results by manually performing
25472 a bit extract operation on bit 5/6, and then performing the two
25473 shifts. The two methods of getting 0/1 into low/high are exactly
25474 the same size. Avoiding the shift in the bit extract case helps
25475 pentium4 a bit; no one else seems to care much either way. */
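/* Illustrative sketch (added commentary, not generated code): for a
   DImode "1 << n" on a 32-bit target the bit-extract variant below
   computes roughly

       high = (n >> 5) & 1;        (1 iff n >= 32)
       low  = high ^ 1;            (1 iff n <  32)
       low  <<= n;                 (SImode shifts use n mod 32)
       high <<= n;

   so exactly one half ends up holding the single set bit.  */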
25476 else
25478 machine_mode half_mode;
25479 rtx (*gen_lshr3)(rtx, rtx, rtx);
25480 rtx (*gen_and3)(rtx, rtx, rtx);
25481 rtx (*gen_xor3)(rtx, rtx, rtx);
25482 HOST_WIDE_INT bits;
25483 rtx x;
25485 if (mode == DImode)
25487 half_mode = SImode;
25488 gen_lshr3 = gen_lshrsi3;
25489 gen_and3 = gen_andsi3;
25490 gen_xor3 = gen_xorsi3;
25491 bits = 5;
25493 else
25495 half_mode = DImode;
25496 gen_lshr3 = gen_lshrdi3;
25497 gen_and3 = gen_anddi3;
25498 gen_xor3 = gen_xordi3;
25499 bits = 6;
25502 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25503 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25504 else
25505 x = gen_lowpart (half_mode, operands[2]);
25506 emit_insn (gen_rtx_SET (high[0], x));
25508 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25509 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25510 emit_move_insn (low[0], high[0]);
25511 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25514 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25515 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25516 return;
25519 if (operands[1] == constm1_rtx)
25521 /* For -1 << N, we can avoid the shld instruction, because we
25522 know that we're shifting 0...31/63 ones into a -1. */
25523 emit_move_insn (low[0], constm1_rtx);
25524 if (optimize_insn_for_size_p ())
25525 emit_move_insn (high[0], low[0]);
25526 else
25527 emit_move_insn (high[0], constm1_rtx);
25529 else
25531 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25533 if (!rtx_equal_p (operands[0], operands[1]))
25534 emit_move_insn (operands[0], operands[1]);
25536 split_double_mode (mode, operands, 1, low, high);
25537 emit_insn (gen_shld (high[0], low[0], operands[2]));
25540 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25542 if (TARGET_CMOVE && scratch)
25544 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25545 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25547 ix86_expand_clear (scratch);
25548 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25550 else
25552 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25553 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25555 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25559 void
25560 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25562 rtx (*gen_ashr3)(rtx, rtx, rtx)
25563 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25564 rtx (*gen_shrd)(rtx, rtx, rtx);
25565 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25567 rtx low[2], high[2];
25568 int count;
25570 if (CONST_INT_P (operands[2]))
25572 split_double_mode (mode, operands, 2, low, high);
25573 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25575 if (count == GET_MODE_BITSIZE (mode) - 1)
25577 emit_move_insn (high[0], high[1]);
25578 emit_insn (gen_ashr3 (high[0], high[0],
25579 GEN_INT (half_width - 1)));
25580 emit_move_insn (low[0], high[0]);
25583 else if (count >= half_width)
25585 emit_move_insn (low[0], high[1]);
25586 emit_move_insn (high[0], low[0]);
25587 emit_insn (gen_ashr3 (high[0], high[0],
25588 GEN_INT (half_width - 1)));
25590 if (count > half_width)
25591 emit_insn (gen_ashr3 (low[0], low[0],
25592 GEN_INT (count - half_width)));
25594 else
25596 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25598 if (!rtx_equal_p (operands[0], operands[1]))
25599 emit_move_insn (operands[0], operands[1]);
25601 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25602 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25605 else
25607 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25609 if (!rtx_equal_p (operands[0], operands[1]))
25610 emit_move_insn (operands[0], operands[1]);
25612 split_double_mode (mode, operands, 1, low, high);
25614 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25615 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25617 if (TARGET_CMOVE && scratch)
25619 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25620 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25622 emit_move_insn (scratch, high[0]);
25623 emit_insn (gen_ashr3 (scratch, scratch,
25624 GEN_INT (half_width - 1)));
25625 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25626 scratch));
25628 else
25630 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25631 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25633 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25638 void
25639 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25641 rtx (*gen_lshr3)(rtx, rtx, rtx)
25642 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25643 rtx (*gen_shrd)(rtx, rtx, rtx);
25644 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25646 rtx low[2], high[2];
25647 int count;
25649 if (CONST_INT_P (operands[2]))
25651 split_double_mode (mode, operands, 2, low, high);
25652 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25654 if (count >= half_width)
25656 emit_move_insn (low[0], high[1]);
25657 ix86_expand_clear (high[0]);
25659 if (count > half_width)
25660 emit_insn (gen_lshr3 (low[0], low[0],
25661 GEN_INT (count - half_width)));
25663 else
25665 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25667 if (!rtx_equal_p (operands[0], operands[1]))
25668 emit_move_insn (operands[0], operands[1]);
25670 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25671 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25674 else
25676 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25678 if (!rtx_equal_p (operands[0], operands[1]))
25679 emit_move_insn (operands[0], operands[1]);
25681 split_double_mode (mode, operands, 1, low, high);
25683 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25684 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25686 if (TARGET_CMOVE && scratch)
25688 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25689 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25691 ix86_expand_clear (scratch);
25692 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25693 scratch));
25695 else
25697 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25698 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25700 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25705 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
25706 static void
25707 predict_jump (int prob)
25709 rtx_insn *insn = get_last_insn ();
25710 gcc_assert (JUMP_P (insn));
25711 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
25714 /* Helper function for the string operations below. Test whether VARIABLE
25715 is aligned to VALUE bytes. If so, jump to the label that is returned. */
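/* Informal shape of the emitted test (added commentary): the helper
   computes VARIABLE & VALUE into a scratch register and emits a
   conditional jump to the returned label when the result is zero,
   roughly

       if ((variable & value) == 0) goto label;

   so callers place the move for that alignment step before the label.  */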
25716 static rtx_code_label *
25717 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25719 rtx_code_label *label = gen_label_rtx ();
25720 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25721 if (GET_MODE (variable) == DImode)
25722 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25723 else
25724 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25725 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25726 1, label);
25727 if (epilogue)
25728 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25729 else
25730 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25731 return label;
25734 /* Adjust COUNTER by the VALUE. */
25735 static void
25736 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25738 rtx (*gen_add)(rtx, rtx, rtx)
25739 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25741 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25744 /* Zero extend the possibly SImode EXP to a Pmode register. */
25745 static rtx
25746 ix86_zero_extend_to_Pmode (rtx exp)
25748 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25751 /* Divide COUNTREG by SCALE. */
25752 static rtx
25753 scale_counter (rtx countreg, int scale)
25755 rtx sc;
25757 if (scale == 1)
25758 return countreg;
25759 if (CONST_INT_P (countreg))
25760 return GEN_INT (INTVAL (countreg) / scale);
25761 gcc_assert (REG_P (countreg));
25763 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25764 GEN_INT (exact_log2 (scale)),
25765 NULL, 1, OPTAB_DIRECT);
25766 return sc;
25769 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25770 DImode for constant loop counts. */
25772 static machine_mode
25773 counter_mode (rtx count_exp)
25775 if (GET_MODE (count_exp) != VOIDmode)
25776 return GET_MODE (count_exp);
25777 if (!CONST_INT_P (count_exp))
25778 return Pmode;
25779 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25780 return DImode;
25781 return SImode;
25784 /* Copy the address to a Pmode register. This is used for x32 to
25785 truncate a DImode TLS address to an SImode register. */
25787 static rtx
25788 ix86_copy_addr_to_reg (rtx addr)
25790 rtx reg;
25791 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25793 reg = copy_addr_to_reg (addr);
25794 REG_POINTER (reg) = 1;
25795 return reg;
25797 else
25799 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25800 reg = copy_to_mode_reg (DImode, addr);
25801 REG_POINTER (reg) = 1;
25802 return gen_rtx_SUBREG (SImode, reg, 0);
25806 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
25807 by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
25808 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
25809 equivalent loop to set memory with VALUE (supposed to be in MODE).
25811 The size is rounded down to a whole number of chunks moved at once.
25812 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
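/* Informal sketch of the emitted loop (added commentary), assuming
   ISSETMEM is false, MODE is SImode and UNROLL is 2, so chunks are
   8 bytes:

       size = count & -8;
       iter = 0;
     top:
       copy 8 bytes from src + iter to dest + iter (two SImode moves);
       iter += 8;
       if (iter < size) goto top;
       destptr += iter;  srcptr += iter;

   A "size == 0" guard is additionally emitted when the chunk size is
   a single byte.  */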
25815 static void
25816 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25817 rtx destptr, rtx srcptr, rtx value,
25818 rtx count, machine_mode mode, int unroll,
25819 int expected_size, bool issetmem)
25821 rtx_code_label *out_label, *top_label;
25822 rtx iter, tmp;
25823 machine_mode iter_mode = counter_mode (count);
25824 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25825 rtx piece_size = GEN_INT (piece_size_n);
25826 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25827 rtx size;
25828 int i;
25830 top_label = gen_label_rtx ();
25831 out_label = gen_label_rtx ();
25832 iter = gen_reg_rtx (iter_mode);
25834 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25835 NULL, 1, OPTAB_DIRECT);
25836 /* Those two should combine. */
25837 if (piece_size == const1_rtx)
25839 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25840 true, out_label);
25841 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25843 emit_move_insn (iter, const0_rtx);
25845 emit_label (top_label);
25847 tmp = convert_modes (Pmode, iter_mode, iter, true);
25849 /* This assert could be relaxed - in that case we'll need to compute
25850 the smallest power of two contained in PIECE_SIZE_N and pass it to
25851 offset_address. */
25852 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25853 destmem = offset_address (destmem, tmp, piece_size_n);
25854 destmem = adjust_address (destmem, mode, 0);
25856 if (!issetmem)
25858 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25859 srcmem = adjust_address (srcmem, mode, 0);
25861 /* When unrolling for chips that reorder memory reads and writes,
25862 we can save registers by using a single temporary.
25863 Also, using 4 temporaries is overkill in 32-bit mode. */
25864 if (!TARGET_64BIT && 0)
25866 for (i = 0; i < unroll; i++)
25868 if (i)
25870 destmem =
25871 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25872 srcmem =
25873 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25875 emit_move_insn (destmem, srcmem);
25878 else
25880 rtx tmpreg[4];
25881 gcc_assert (unroll <= 4);
25882 for (i = 0; i < unroll; i++)
25884 tmpreg[i] = gen_reg_rtx (mode);
25885 if (i)
25887 srcmem =
25888 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25890 emit_move_insn (tmpreg[i], srcmem);
25892 for (i = 0; i < unroll; i++)
25894 if (i)
25896 destmem =
25897 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25899 emit_move_insn (destmem, tmpreg[i]);
25903 else
25904 for (i = 0; i < unroll; i++)
25906 if (i)
25907 destmem =
25908 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25909 emit_move_insn (destmem, value);
25912 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
25913 true, OPTAB_LIB_WIDEN);
25914 if (tmp != iter)
25915 emit_move_insn (iter, tmp);
25917 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
25918 true, top_label);
25919 if (expected_size != -1)
25921 expected_size /= GET_MODE_SIZE (mode) * unroll;
25922 if (expected_size == 0)
25923 predict_jump (0);
25924 else if (expected_size > REG_BR_PROB_BASE)
25925 predict_jump (REG_BR_PROB_BASE - 1);
25926 else
25927 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
25929 else
25930 predict_jump (REG_BR_PROB_BASE * 80 / 100);
25931 iter = ix86_zero_extend_to_Pmode (iter);
25932 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
25933 true, OPTAB_LIB_WIDEN);
25934 if (tmp != destptr)
25935 emit_move_insn (destptr, tmp);
25936 if (!issetmem)
25938 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
25939 true, OPTAB_LIB_WIDEN);
25940 if (tmp != srcptr)
25941 emit_move_insn (srcptr, tmp);
25943 emit_label (out_label);
25946 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM argument.
25947 When ISSETMEM is true, the arguments SRCMEM and SRCPTR are ignored.
25948 When ISSETMEM is false, the arguments VALUE and ORIG_VALUE are ignored.
25949 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
25950 ORIG_VALUE is the original value passed to memset to fill the memory with.
25951 Other arguments have the same meaning as for the previous function. */
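/* Informal example of the emitted sequence (added commentary): for a
   memset of COUNT bytes with a zero value and COUNT a multiple of 4,
   MODE is widened to SImode and the expansion is essentially

       movl $COUNT/4, %ecx
       rep stosl                  (memcpy uses "rep movsl" with esi/edi)

   with DESTEXP/SRCEXP describing the final pointer values for the
   rep insn pattern.  */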
25953 static void
25954 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
25955 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
25956 rtx count,
25957 machine_mode mode, bool issetmem)
25959 rtx destexp;
25960 rtx srcexp;
25961 rtx countreg;
25962 HOST_WIDE_INT rounded_count;
25964 /* If possible, it is shorter to use rep movs.
25965 TODO: Maybe it is better to move this logic to decide_alg. */
25966 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
25967 && (!issetmem || orig_value == const0_rtx))
25968 mode = SImode;
25970 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
25971 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
25973 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
25974 GET_MODE_SIZE (mode)));
25975 if (mode != QImode)
25977 destexp = gen_rtx_ASHIFT (Pmode, countreg,
25978 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25979 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
25981 else
25982 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
25983 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
25985 rounded_count
25986 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25987 destmem = shallow_copy_rtx (destmem);
25988 set_mem_size (destmem, rounded_count);
25990 else if (MEM_SIZE_KNOWN_P (destmem))
25991 clear_mem_size (destmem);
25993 if (issetmem)
25995 value = force_reg (mode, gen_lowpart (mode, value));
25996 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
25998 else
26000 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26001 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26002 if (mode != QImode)
26004 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26005 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26006 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26008 else
26009 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26010 if (CONST_INT_P (count))
26012 rounded_count
26013 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26014 srcmem = shallow_copy_rtx (srcmem);
26015 set_mem_size (srcmem, rounded_count);
26017 else
26019 if (MEM_SIZE_KNOWN_P (srcmem))
26020 clear_mem_size (srcmem);
26022 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26023 destexp, srcexp));
26027 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26028 DESTMEM.
26029 SRCMEM is passed by pointer so that it can be updated on return.
26030 The return value is the updated DESTMEM. */
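/* Illustrative example (added commentary): asked to move 4 bytes, the
   helper picks SImode (the widest integer mode not larger than the size
   whose move pattern is available) and emits, via a temporary register,

       tmp = load 4 bytes from srcptr;
       store tmp to destptr;
       destptr += 4;  srcptr += 4;

   and returns DESTMEM re-based at the advanced pointer.  */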
26031 static rtx
26032 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26033 HOST_WIDE_INT size_to_move)
26035 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26036 enum insn_code code;
26037 machine_mode move_mode;
26038 int piece_size, i;
26040 /* Find the widest mode in which we could perform moves.
26041 Start with the biggest power of 2 not larger than SIZE_TO_MOVE and halve
26042 it until a move of that size is supported. */
26043 piece_size = 1 << floor_log2 (size_to_move);
26044 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
26045 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26047 gcc_assert (piece_size > 1);
26048 piece_size >>= 1;
26051 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26052 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26053 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26055 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26056 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
26057 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26059 move_mode = word_mode;
26060 piece_size = GET_MODE_SIZE (move_mode);
26061 code = optab_handler (mov_optab, move_mode);
26064 gcc_assert (code != CODE_FOR_nothing);
26066 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26067 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26069 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26070 gcc_assert (size_to_move % piece_size == 0);
26071 adjust = GEN_INT (piece_size);
26072 for (i = 0; i < size_to_move; i += piece_size)
26074 /* We move from memory to memory, so we'll need to do it via
26075 a temporary register. */
26076 tempreg = gen_reg_rtx (move_mode);
26077 emit_insn (GEN_FCN (code) (tempreg, src));
26078 emit_insn (GEN_FCN (code) (dst, tempreg));
26080 emit_move_insn (destptr,
26081 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26082 emit_move_insn (srcptr,
26083 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26085 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26086 piece_size);
26087 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26088 piece_size);
26091 /* Update DST and SRC rtx. */
26092 *srcmem = src;
26093 return dst;
26096 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
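/* Illustrative example (added commentary): with a constant COUNT and
   MAX_SIZE of 16, a remainder of COUNT % 16 == 11 bytes is copied as
   one 8-byte, one 2-byte and one 1-byte move (the set bits of the
   remainder), using emit_memmov for each piece.  */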
26097 static void
26098 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26099 rtx destptr, rtx srcptr, rtx count, int max_size)
26101 rtx src, dest;
26102 if (CONST_INT_P (count))
26104 HOST_WIDE_INT countval = INTVAL (count);
26105 HOST_WIDE_INT epilogue_size = countval % max_size;
26106 int i;
26108 /* For now MAX_SIZE should be a power of 2. This assert could be
26109 relaxed, but it would require somewhat more complicated epilogue
26110 expansion. */
26111 gcc_assert ((max_size & (max_size - 1)) == 0);
26112 for (i = max_size; i >= 1; i >>= 1)
26114 if (epilogue_size & i)
26115 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26117 return;
26119 if (max_size > 8)
26121 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26122 count, 1, OPTAB_DIRECT);
26123 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26124 count, QImode, 1, 4, false);
26125 return;
26128 /* When there are stringops, we can cheaply increase dest and src pointers.
26129 Otherwise we save code size by maintaining an offset (zero is readily
26130 available from the preceding rep operation) and using x86 addressing modes.
26131 */
26132 if (TARGET_SINGLE_STRINGOP)
26134 if (max_size > 4)
26136 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26137 src = change_address (srcmem, SImode, srcptr);
26138 dest = change_address (destmem, SImode, destptr);
26139 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26140 emit_label (label);
26141 LABEL_NUSES (label) = 1;
26143 if (max_size > 2)
26145 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26146 src = change_address (srcmem, HImode, srcptr);
26147 dest = change_address (destmem, HImode, destptr);
26148 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26149 emit_label (label);
26150 LABEL_NUSES (label) = 1;
26152 if (max_size > 1)
26154 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26155 src = change_address (srcmem, QImode, srcptr);
26156 dest = change_address (destmem, QImode, destptr);
26157 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26158 emit_label (label);
26159 LABEL_NUSES (label) = 1;
26162 else
26164 rtx offset = force_reg (Pmode, const0_rtx);
26165 rtx tmp;
26167 if (max_size > 4)
26169 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26170 src = change_address (srcmem, SImode, srcptr);
26171 dest = change_address (destmem, SImode, destptr);
26172 emit_move_insn (dest, src);
26173 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26174 true, OPTAB_LIB_WIDEN);
26175 if (tmp != offset)
26176 emit_move_insn (offset, tmp);
26177 emit_label (label);
26178 LABEL_NUSES (label) = 1;
26180 if (max_size > 2)
26182 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26183 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26184 src = change_address (srcmem, HImode, tmp);
26185 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26186 dest = change_address (destmem, HImode, tmp);
26187 emit_move_insn (dest, src);
26188 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26189 true, OPTAB_LIB_WIDEN);
26190 if (tmp != offset)
26191 emit_move_insn (offset, tmp);
26192 emit_label (label);
26193 LABEL_NUSES (label) = 1;
26195 if (max_size > 1)
26197 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26198 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26199 src = change_address (srcmem, QImode, tmp);
26200 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26201 dest = change_address (destmem, QImode, tmp);
26202 emit_move_insn (dest, src);
26203 emit_label (label);
26204 LABEL_NUSES (label) = 1;
26209 /* This function emits moves to fill SIZE_TO_MOVE bytes starting at DESTMEM
26210 with the value PROMOTED_VAL.
26211 Unlike emit_memmov there is no source to update;
26212 the return value is the updated DESTMEM. */
26213 static rtx
26214 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26215 HOST_WIDE_INT size_to_move)
26217 rtx dst = destmem, adjust;
26218 enum insn_code code;
26219 machine_mode move_mode;
26220 int piece_size, i;
26222 /* Find the widest mode in which we could perform moves.
26223 Start with the mode of PROMOTED_VAL and narrow it if SIZE_TO_MOVE
26224 is smaller than that mode's size. */
26225 move_mode = GET_MODE (promoted_val);
26226 if (move_mode == VOIDmode)
26227 move_mode = QImode;
26228 if (size_to_move < GET_MODE_SIZE (move_mode))
26230 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26231 move_mode = int_mode_for_size (move_bits, 0).require ();
26232 promoted_val = gen_lowpart (move_mode, promoted_val);
26234 piece_size = GET_MODE_SIZE (move_mode);
26235 code = optab_handler (mov_optab, move_mode);
26236 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26238 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26240 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26241 gcc_assert (size_to_move % piece_size == 0);
26242 adjust = GEN_INT (piece_size);
26243 for (i = 0; i < size_to_move; i += piece_size)
26245 if (piece_size <= GET_MODE_SIZE (word_mode))
26247 emit_insn (gen_strset (destptr, dst, promoted_val));
26248 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26249 piece_size);
26250 continue;
26253 emit_insn (GEN_FCN (code) (dst, promoted_val));
26255 emit_move_insn (destptr,
26256 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26258 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26259 piece_size);
26262 /* Update DST rtx. */
26263 return dst;
26265 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26266 static void
26267 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26268 rtx count, int max_size)
26270 count =
26271 expand_simple_binop (counter_mode (count), AND, count,
26272 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26273 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26274 gen_lowpart (QImode, value), count, QImode,
26275 1, max_size / 2, true);
26278 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26279 static void
26280 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26281 rtx count, int max_size)
26283 rtx dest;
26285 if (CONST_INT_P (count))
26287 HOST_WIDE_INT countval = INTVAL (count);
26288 HOST_WIDE_INT epilogue_size = countval % max_size;
26289 int i;
26291 /* For now MAX_SIZE should be a power of 2. This assert could be
26292 relaxed, but it would require somewhat more complicated epilogue
26293 expansion. */
26294 gcc_assert ((max_size & (max_size - 1)) == 0);
26295 for (i = max_size; i >= 1; i >>= 1)
26297 if (epilogue_size & i)
26299 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26300 destmem = emit_memset (destmem, destptr, vec_value, i);
26301 else
26302 destmem = emit_memset (destmem, destptr, value, i);
26305 return;
26307 if (max_size > 32)
26309 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26310 return;
26312 if (max_size > 16)
26314 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26315 if (TARGET_64BIT)
26317 dest = change_address (destmem, DImode, destptr);
26318 emit_insn (gen_strset (destptr, dest, value));
26319 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26320 emit_insn (gen_strset (destptr, dest, value));
26322 else
26324 dest = change_address (destmem, SImode, destptr);
26325 emit_insn (gen_strset (destptr, dest, value));
26326 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26327 emit_insn (gen_strset (destptr, dest, value));
26328 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26329 emit_insn (gen_strset (destptr, dest, value));
26330 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26331 emit_insn (gen_strset (destptr, dest, value));
26333 emit_label (label);
26334 LABEL_NUSES (label) = 1;
26336 if (max_size > 8)
26338 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26339 if (TARGET_64BIT)
26341 dest = change_address (destmem, DImode, destptr);
26342 emit_insn (gen_strset (destptr, dest, value));
26344 else
26346 dest = change_address (destmem, SImode, destptr);
26347 emit_insn (gen_strset (destptr, dest, value));
26348 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26349 emit_insn (gen_strset (destptr, dest, value));
26351 emit_label (label);
26352 LABEL_NUSES (label) = 1;
26354 if (max_size > 4)
26356 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26357 dest = change_address (destmem, SImode, destptr);
26358 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26359 emit_label (label);
26360 LABEL_NUSES (label) = 1;
26362 if (max_size > 2)
26364 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26365 dest = change_address (destmem, HImode, destptr);
26366 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26367 emit_label (label);
26368 LABEL_NUSES (label) = 1;
26370 if (max_size > 1)
26372 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26373 dest = change_address (destmem, QImode, destptr);
26374 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26375 emit_label (label);
26376 LABEL_NUSES (label) = 1;
26380 /* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or store
26381 enough bytes to DESTMEM, to align it to DESIRED_ALIGNMENT. The original
26382 alignment is ALIGN. Depending on ISSETMEM, either the SRCMEM/SRCPTR or the
26383 VALUE/VEC_VALUE arguments are
26384 ignored. The return value is the updated DESTMEM. */
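/* Illustrative example (added commentary): with ALIGN == 1 and
   DESIRED_ALIGNMENT == 8 the prologue emits, for i = 1, 2 and 4,

       if (destptr & i) { copy or store i bytes; count -= i; }

   (each guarded by ix86_expand_aligntest), after which the known
   alignment of DESTMEM is raised accordingly.  */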
26385 static rtx
26386 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26387 rtx destptr, rtx srcptr, rtx value,
26388 rtx vec_value, rtx count, int align,
26389 int desired_alignment, bool issetmem)
26391 int i;
26392 for (i = 1; i < desired_alignment; i <<= 1)
26394 if (align <= i)
26396 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26397 if (issetmem)
26399 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26400 destmem = emit_memset (destmem, destptr, vec_value, i);
26401 else
26402 destmem = emit_memset (destmem, destptr, value, i);
26404 else
26405 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26406 ix86_adjust_counter (count, i);
26407 emit_label (label);
26408 LABEL_NUSES (label) = 1;
26409 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26412 return destmem;
26415 /* Test if COUNT & SIZE is nonzero and if so, expand a movmem
26416 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26417 and jump to DONE_LABEL. */
26418 static void
26419 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26420 rtx destptr, rtx srcptr,
26421 rtx value, rtx vec_value,
26422 rtx count, int size,
26423 rtx done_label, bool issetmem)
26425 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26426 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26427 rtx modesize;
26428 int n;
26430 /* If we do not have a vector value to copy, we must reduce the size. */
26431 if (issetmem)
26433 if (!vec_value)
26435 if (GET_MODE (value) == VOIDmode && size > 8)
26436 mode = Pmode;
26437 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26438 mode = GET_MODE (value);
26440 else
26441 mode = GET_MODE (vec_value), value = vec_value;
26443 else
26445 /* Choose appropriate vector mode. */
26446 if (size >= 32)
26447 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26448 else if (size >= 16)
26449 mode = TARGET_SSE ? V16QImode : DImode;
26450 srcmem = change_address (srcmem, mode, srcptr);
26452 destmem = change_address (destmem, mode, destptr);
26453 modesize = GEN_INT (GET_MODE_SIZE (mode));
26454 gcc_assert (GET_MODE_SIZE (mode) <= size);
26455 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26457 if (issetmem)
26458 emit_move_insn (destmem, gen_lowpart (mode, value));
26459 else
26461 emit_move_insn (destmem, srcmem);
26462 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26464 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26467 destmem = offset_address (destmem, count, 1);
26468 destmem = offset_address (destmem, GEN_INT (-2 * size),
26469 GET_MODE_SIZE (mode));
26470 if (!issetmem)
26472 srcmem = offset_address (srcmem, count, 1);
26473 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26474 GET_MODE_SIZE (mode));
26476 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26478 if (issetmem)
26479 emit_move_insn (destmem, gen_lowpart (mode, value));
26480 else
26482 emit_move_insn (destmem, srcmem);
26483 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26485 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26487 emit_jump_insn (gen_jump (done_label));
26488 emit_barrier ();
26490 emit_label (label);
26491 LABEL_NUSES (label) = 1;
26494 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26495 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26496 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that lets us
26497 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26498 DONE_LABEL is a label after the whole copying sequence. The label is created
26499 on demand if *DONE_LABEL is NULL.
26500 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for
26501 the new bounds after the initial copies.
26503 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26504 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26505 we will dispatch to a library call for large blocks.
26507 In pseudocode we do:
26509 if (COUNT < SIZE)
26511 Assume that SIZE is 4. Bigger sizes are handled analogously
26512 if (COUNT & 4)
26514 copy 4 bytes from SRCPTR to DESTPTR
26515 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26516 goto done_label
26518 if (!COUNT)
26519 goto done_label;
26520 copy 1 byte from SRCPTR to DESTPTR
26521 if (COUNT & 2)
26523 copy 2 bytes from SRCPTR to DESTPTR
26524 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26527 else
26529 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26530 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26532 OLD_DESTPTR = DESTPTR;
26533 Align DESTPTR up to DESIRED_ALIGN
26534 SRCPTR += DESTPTR - OLD_DESTPTR
26535 COUNT -= DESTPTR - OLD_DESTPTR
26536 if (DYNAMIC_CHECK)
26537 Round COUNT down to multiple of SIZE
26538 << optional caller supplied zero size guard is here >>
26539 << optional caller supplied dynamic check is here >>
26540 << caller supplied main copy loop is here >>
26542 done_label:
26544 static void
26545 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26546 rtx *destptr, rtx *srcptr,
26547 machine_mode mode,
26548 rtx value, rtx vec_value,
26549 rtx *count,
26550 rtx_code_label **done_label,
26551 int size,
26552 int desired_align,
26553 int align,
26554 unsigned HOST_WIDE_INT *min_size,
26555 bool dynamic_check,
26556 bool issetmem)
26558 rtx_code_label *loop_label = NULL, *label;
26559 int n;
26560 rtx modesize;
26561 int prolog_size = 0;
26562 rtx mode_value;
26564 /* Choose the proper value to copy. */
26565 if (issetmem && VECTOR_MODE_P (mode))
26566 mode_value = vec_value;
26567 else
26568 mode_value = value;
26569 gcc_assert (GET_MODE_SIZE (mode) <= size);
26571 /* See if block is big or small, handle small blocks. */
26572 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26574 int size2 = size;
26575 loop_label = gen_label_rtx ();
26577 if (!*done_label)
26578 *done_label = gen_label_rtx ();
26580 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26581 1, loop_label);
26582 size2 >>= 1;
26584 /* Handle sizes > 3. */
26585 for (;size2 > 2; size2 >>= 1)
26586 expand_small_movmem_or_setmem (destmem, srcmem,
26587 *destptr, *srcptr,
26588 value, vec_value,
26589 *count,
26590 size2, *done_label, issetmem);
26591 /* Nothing to copy? Jump to DONE_LABEL if so. */
26592 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26593 1, *done_label);
26595 /* Do a byte copy. */
26596 destmem = change_address (destmem, QImode, *destptr);
26597 if (issetmem)
26598 emit_move_insn (destmem, gen_lowpart (QImode, value));
26599 else
26601 srcmem = change_address (srcmem, QImode, *srcptr);
26602 emit_move_insn (destmem, srcmem);
26605 /* Handle sizes 2 and 3. */
26606 label = ix86_expand_aligntest (*count, 2, false);
26607 destmem = change_address (destmem, HImode, *destptr);
26608 destmem = offset_address (destmem, *count, 1);
26609 destmem = offset_address (destmem, GEN_INT (-2), 2);
26610 if (issetmem)
26611 emit_move_insn (destmem, gen_lowpart (HImode, value));
26612 else
26614 srcmem = change_address (srcmem, HImode, *srcptr);
26615 srcmem = offset_address (srcmem, *count, 1);
26616 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26617 emit_move_insn (destmem, srcmem);
26620 emit_label (label);
26621 LABEL_NUSES (label) = 1;
26622 emit_jump_insn (gen_jump (*done_label));
26623 emit_barrier ();
26625 else
26626 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26627 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26629 /* Start memcpy for COUNT >= SIZE. */
26630 if (loop_label)
26632 emit_label (loop_label);
26633 LABEL_NUSES (loop_label) = 1;
26636 /* Copy first desired_align bytes. */
26637 if (!issetmem)
26638 srcmem = change_address (srcmem, mode, *srcptr);
26639 destmem = change_address (destmem, mode, *destptr);
26640 modesize = GEN_INT (GET_MODE_SIZE (mode));
26641 for (n = 0; prolog_size < desired_align - align; n++)
26643 if (issetmem)
26644 emit_move_insn (destmem, mode_value);
26645 else
26647 emit_move_insn (destmem, srcmem);
26648 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26650 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26651 prolog_size += GET_MODE_SIZE (mode);
26655 /* Copy last SIZE bytes. */
26656 destmem = offset_address (destmem, *count, 1);
26657 destmem = offset_address (destmem,
26658 GEN_INT (-size - prolog_size),
26660 if (issetmem)
26661 emit_move_insn (destmem, mode_value);
26662 else
26664 srcmem = offset_address (srcmem, *count, 1);
26665 srcmem = offset_address (srcmem,
26666 GEN_INT (-size - prolog_size),
26668 emit_move_insn (destmem, srcmem);
26670 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26672 destmem = offset_address (destmem, modesize, 1);
26673 if (issetmem)
26674 emit_move_insn (destmem, mode_value);
26675 else
26677 srcmem = offset_address (srcmem, modesize, 1);
26678 emit_move_insn (destmem, srcmem);
26682 /* Align destination. */
26683 if (desired_align > 1 && desired_align > align)
26685 rtx saveddest = *destptr;
26687 gcc_assert (desired_align <= size);
26688 /* Align destptr up, placing the result in a new register. */
26689 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26690 GEN_INT (prolog_size),
26691 NULL_RTX, 1, OPTAB_DIRECT);
26692 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26693 REG_POINTER (*destptr) = 1;
26694 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26695 GEN_INT (-desired_align),
26696 *destptr, 1, OPTAB_DIRECT);
26697 /* See how many bytes we skipped. */
26698 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26699 *destptr,
26700 saveddest, 1, OPTAB_DIRECT);
26701 /* Adjust srcptr and count. */
26702 if (!issetmem)
26703 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26704 saveddest, *srcptr, 1, OPTAB_DIRECT);
26705 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26706 saveddest, *count, 1, OPTAB_DIRECT);
26707 /* We copied at most size + prolog_size. */
26708 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26709 *min_size
26710 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26711 else
26712 *min_size = 0;
26714 /* Our loops always round down the block size, but for dispatch to a
26715 library call we need the precise value. */
26716 if (dynamic_check)
26717 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26718 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26720 else
26722 gcc_assert (prolog_size == 0);
26723 /* Decrease the count so we won't end up copying the last word twice. */
26724 if (!CONST_INT_P (*count))
26725 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26726 constm1_rtx, *count, 1, OPTAB_DIRECT);
26727 else
26728 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26729 (unsigned HOST_WIDE_INT)size));
26730 if (*min_size)
26731 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26736 /* This function is like the previous one, except here we know how many bytes
26737 need to be copied. That allows us to update alignment not only of DST, which
26738 is returned, but also of SRC, which is passed as a pointer for that
26739 reason. */
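/* Illustrative example (added commentary): with DESIRED_ALIGN == 8 and
   ALIGN_BYTES == 7 the prologue unconditionally emits a 1-byte, a
   2-byte and a 4-byte piece (the set bits of ALIGN_BYTES), after which
   DST is marked 8-byte aligned and, for memcpy, SRC gets whatever
   alignment its offset agrees with modulo that value.  */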
26740 static rtx
26741 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26742 rtx srcreg, rtx value, rtx vec_value,
26743 int desired_align, int align_bytes,
26744 bool issetmem)
26746 rtx src = NULL;
26747 rtx orig_dst = dst;
26748 rtx orig_src = NULL;
26749 int piece_size = 1;
26750 int copied_bytes = 0;
26752 if (!issetmem)
26754 gcc_assert (srcp != NULL);
26755 src = *srcp;
26756 orig_src = src;
26759 for (piece_size = 1;
26760 piece_size <= desired_align && copied_bytes < align_bytes;
26761 piece_size <<= 1)
26763 if (align_bytes & piece_size)
26765 if (issetmem)
26767 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26768 dst = emit_memset (dst, destreg, vec_value, piece_size);
26769 else
26770 dst = emit_memset (dst, destreg, value, piece_size);
26772 else
26773 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26774 copied_bytes += piece_size;
26777 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26778 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26779 if (MEM_SIZE_KNOWN_P (orig_dst))
26780 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26782 if (!issetmem)
26784 int src_align_bytes = get_mem_align_offset (src, desired_align
26785 * BITS_PER_UNIT);
26786 if (src_align_bytes >= 0)
26787 src_align_bytes = desired_align - src_align_bytes;
26788 if (src_align_bytes >= 0)
26790 unsigned int src_align;
26791 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26793 if ((src_align_bytes & (src_align - 1))
26794 == (align_bytes & (src_align - 1)))
26795 break;
26797 if (src_align > (unsigned int) desired_align)
26798 src_align = desired_align;
26799 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26800 set_mem_align (src, src_align * BITS_PER_UNIT);
26802 if (MEM_SIZE_KNOWN_P (orig_src))
26803 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26804 *srcp = src;
26807 return dst;
26810 /* Return true if ALG can be used in the current context.
26811 Assume we expand memset if MEMSET is true. */
26812 static bool
26813 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26815 if (alg == no_stringop)
26816 return false;
26817 if (alg == vector_loop)
26818 return TARGET_SSE || TARGET_AVX;
26819 /* Algorithms using the rep prefix want at least edi and ecx;
26820 additionally, memset wants eax and memcpy wants esi. Don't
26821 consider such algorithms if the user has appropriated those
26822 registers for their own purposes, or if we have a non-default
26823 address space, since some string insns cannot override the segment. */
26824 if (alg == rep_prefix_1_byte
26825 || alg == rep_prefix_4_byte
26826 || alg == rep_prefix_8_byte)
26828 if (have_as)
26829 return false;
26830 if (fixed_regs[CX_REG]
26831 || fixed_regs[DI_REG]
26832 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26833 return false;
26835 return true;
26838 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
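/* Illustrative example (added commentary, shape only): the per-tuning
   cost tables provide entries of roughly the form

       {libcall, {{32, loop, false},
                  {8192, rep_prefix_8_byte, false},
                  {-1, libcall, false}}}

   meaning: blocks up to 32 bytes use a plain loop, up to 8192 bytes use
   a rep-prefixed move/store, and larger or unknown-sized blocks go to a
   library call; decide_alg below walks these entries against
   EXPECTED_SIZE and the usability checks above.  */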
26839 static enum stringop_alg
26840 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26841 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26842 bool memset, bool zero_memset, bool have_as,
26843 int *dynamic_check, bool *noalign, bool recur)
26845 const struct stringop_algs *algs;
26846 bool optimize_for_speed;
26847 int max = 0;
26848 const struct processor_costs *cost;
26849 int i;
26850 bool any_alg_usable_p = false;
26852 *noalign = false;
26853 *dynamic_check = -1;
26855 /* Even if the string operation call is cold, we still might spend a lot
26856 of time processing large blocks. */
26857 if (optimize_function_for_size_p (cfun)
26858 || (optimize_insn_for_size_p ()
26859 && (max_size < 256
26860 || (expected_size != -1 && expected_size < 256))))
26861 optimize_for_speed = false;
26862 else
26863 optimize_for_speed = true;
26865 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26866 if (memset)
26867 algs = &cost->memset[TARGET_64BIT != 0];
26868 else
26869 algs = &cost->memcpy[TARGET_64BIT != 0];
26871 /* See maximal size for user defined algorithm. */
26872 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26874 enum stringop_alg candidate = algs->size[i].alg;
26875 bool usable = alg_usable_p (candidate, memset, have_as);
26876 any_alg_usable_p |= usable;
26878 if (candidate != libcall && candidate && usable)
26879 max = algs->size[i].max;
26882 /* If the expected size is not known but the max size is small enough
26883 that the inline version is a win, set the expected size into
26884 the range. */
26885 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
26886 && expected_size == -1)
26887 expected_size = min_size / 2 + max_size / 2;
26889 /* If user specified the algorithm, honor it if possible. */
26890 if (ix86_stringop_alg != no_stringop
26891 && alg_usable_p (ix86_stringop_alg, memset, have_as))
26892 return ix86_stringop_alg;
26893 /* rep; movq or rep; movl is the smallest variant. */
26894 else if (!optimize_for_speed)
26896 *noalign = true;
26897 if (!count || (count & 3) || (memset && !zero_memset))
26898 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
26899 ? rep_prefix_1_byte : loop_1_byte;
26900 else
26901 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
26902 ? rep_prefix_4_byte : loop;
26904 /* Very tiny blocks are best handled via the loop; REP is expensive to
26905 set up. */
26906 else if (expected_size != -1 && expected_size < 4)
26907 return loop_1_byte;
26908 else if (expected_size != -1)
26910 enum stringop_alg alg = libcall;
26911 bool alg_noalign = false;
26912 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26914 /* We get here if the algorithms that were not libcall-based
26915 were rep-prefix based and we are unable to use rep prefixes
26916 based on global register usage. Break out of the loop and
26917 use the heuristic below. */
26918 if (algs->size[i].max == 0)
26919 break;
26920 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
26922 enum stringop_alg candidate = algs->size[i].alg;
26924 if (candidate != libcall
26925 && alg_usable_p (candidate, memset, have_as))
26927 alg = candidate;
26928 alg_noalign = algs->size[i].noalign;
26930 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
26931 the last non-libcall inline algorithm. */
26932 if (TARGET_INLINE_ALL_STRINGOPS)
26934 /* When the current size is best copied by a libcall,
26935 but we are still forced to inline, run the heuristic below
26936 that will pick code for medium-sized blocks. */
26937 if (alg != libcall)
26939 *noalign = alg_noalign;
26940 return alg;
26942 else if (!any_alg_usable_p)
26943 break;
26945 else if (alg_usable_p (candidate, memset, have_as))
26947 *noalign = algs->size[i].noalign;
26948 return candidate;
26953 /* When asked to inline the call anyway, try to pick a meaningful choice.
26954 We look for the maximal size of a block that is faster to copy by hand and
26955 take blocks of at most that size, guessing that the average size will
26956 be roughly half of the maximum.
26958 If this turns out to be bad, we might simply specify the preferred
26959 choice in ix86_costs. */
26960 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26961 && (algs->unknown_size == libcall
26962 || !alg_usable_p (algs->unknown_size, memset, have_as)))
26964 enum stringop_alg alg;
26965 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
26967 /* If there aren't any usable algorithms or if recursing already,
26968 then recursing on smaller sizes or same size isn't going to
26969 find anything. Just return the simple byte-at-a-time copy loop. */
26970 if (!any_alg_usable_p || recur)
26972 /* Pick something reasonable. */
26973 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
26974 *dynamic_check = 128;
26975 return loop_1_byte;
26977 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
26978 zero_memset, have_as, dynamic_check, noalign, true);
26979 gcc_assert (*dynamic_check == -1);
26980 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26981 *dynamic_check = max;
26982 else
26983 gcc_assert (alg != libcall);
26984 return alg;
26986 return (alg_usable_p (algs->unknown_size, memset, have_as)
26987 ? algs->unknown_size : libcall);
26990 /* Decide on alignment. We know that the operand is already aligned to ALIGN
26991 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
26992 static int
26993 decide_alignment (int align,
26994 enum stringop_alg alg,
26995 int expected_size,
26996 machine_mode move_mode)
26998 int desired_align = 0;
27000 gcc_assert (alg != no_stringop);
27002 if (alg == libcall)
27003 return 0;
27004 if (move_mode == VOIDmode)
27005 return 0;
27007 desired_align = GET_MODE_SIZE (move_mode);
27008 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
27009 copying a whole cacheline at once. */
27010 if (TARGET_PENTIUMPRO
27011 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27012 desired_align = 8;
27014 if (optimize_size)
27015 desired_align = 1;
27016 if (desired_align < align)
27017 desired_align = align;
27018 if (expected_size != -1 && expected_size < 4)
27019 desired_align = align;
27021 return desired_align;
27025 /* Helper function for memset. For the QImode value 0xXY produce
27026 0xXYXYXYXY of the width specified by MODE. This is essentially
27027 VAL * 0x01010101, but we can do slightly better than
27028 synth_mult by unwinding the sequence by hand on CPUs with
27029 slow multiply. */
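/* Illustrative expansion (added commentary) for SImode with VAL = 0xAB
   on a CPU where the shift/or sequence beats a multiply:

       reg  = 0x000000AB;
       reg |= reg << 8;          (0x0000ABAB)
       reg |= reg << 16;         (0xABABABAB)

   DImode adds one more "reg |= reg << 32" step; with a fast multiply
   the same value is produced as reg * 0x01010101 (0x0101010101010101
   for DImode).  */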
27030 static rtx
27031 promote_duplicated_reg (machine_mode mode, rtx val)
27033 machine_mode valmode = GET_MODE (val);
27034 rtx tmp;
27035 int nops = mode == DImode ? 3 : 2;
27037 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27038 if (val == const0_rtx)
27039 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27040 if (CONST_INT_P (val))
27042 HOST_WIDE_INT v = INTVAL (val) & 255;
27044 v |= v << 8;
27045 v |= v << 16;
27046 if (mode == DImode)
27047 v |= (v << 16) << 16;
27048 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27051 if (valmode == VOIDmode)
27052 valmode = QImode;
27053 if (valmode != QImode)
27054 val = gen_lowpart (QImode, val);
27055 if (mode == QImode)
27056 return val;
27057 if (!TARGET_PARTIAL_REG_STALL)
27058 nops--;
27059 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27060 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27061 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27062 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27064 rtx reg = convert_modes (mode, QImode, val, true);
27065 tmp = promote_duplicated_reg (mode, const1_rtx);
27066 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27067 OPTAB_DIRECT);
27069 else
27071 rtx reg = convert_modes (mode, QImode, val, true);
27073 if (!TARGET_PARTIAL_REG_STALL)
27074 if (mode == SImode)
27075 emit_insn (gen_insvsi_1 (reg, reg));
27076 else
27077 emit_insn (gen_insvdi_1 (reg, reg));
27078 else
27080 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27081 NULL, 1, OPTAB_DIRECT);
27082 reg =
27083 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27085 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27086 NULL, 1, OPTAB_DIRECT);
27087 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27088 if (mode == SImode)
27089 return reg;
27090 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27091 NULL, 1, OPTAB_DIRECT);
27092 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27093 return reg;
27097 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size that
27098 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
27099 getting the alignment from ALIGN to DESIRED_ALIGN. */
27100 static rtx
27101 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27102 int align)
27104 rtx promoted_val;
27106 if (TARGET_64BIT
27107 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27108 promoted_val = promote_duplicated_reg (DImode, val);
27109 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27110 promoted_val = promote_duplicated_reg (SImode, val);
27111 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27112 promoted_val = promote_duplicated_reg (HImode, val);
27113 else
27114 promoted_val = val;
27116 return promoted_val;
27119 /* Expand a string move (memcpy) or store (memset) operation. Use i386 string
27120 operations when profitable. The code depends upon architecture, block size
27121 and alignment, but always has one of the following overall structures:
27123 Aligned move sequence:
27125 1) Prologue guard: Conditional that jumps up to epilogues for small
27126 blocks that can be handled by the epilogue alone. This is faster
27127 but also needed for correctness, since the prologue assumes the block
27128 is larger than the desired alignment.
27130 An optional dynamic check for size and a libcall for large
27131 blocks is emitted here too, with -minline-stringops-dynamically.
27133 2) Prologue: copy first few bytes in order to get destination
27134 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27135 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27136 copied. We emit either a jump tree on power of two sized
27137 blocks, or a byte loop.
27139 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27140 with the specified algorithm.
27142 4) Epilogue: code copying the tail of the block that is too small to be
27143 handled by the main body (or up to size guarded by the prologue guard).
27145 Misaligned move sequence:
27147 1) Misaligned move prologue/epilogue containing:
27148 a) Prologue handling small memory blocks and jumping to done_label
27149 (skipped if blocks are known to be large enough)
27150 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
27151 needed by a single possibly misaligned move
27152 (skipped if alignment is not needed)
27153 c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
27155 2) Zero size guard dispatching to done_label, if needed
27157 3) Dispatch to a library call, if needed
27159 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27160 with the specified algorithm. */
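/* Informal end-to-end example (added commentary): for a memcpy of
   unknown COUNT using the rep_prefix_4_byte algorithm with
   DESIRED_ALIGN 4, the aligned sequence is roughly

       if (count < 4) goto epilogue;             (prologue guard)
       while (dest & 3)                          (prologue)
         { *dest++ = *src++; count--; }
       rep movsl with ecx = count >> 2;          (main body)
     epilogue:
       copy the remaining count & 3 bytes;       (epilogue)

   The misaligned variant instead copies the first and last chunks with
   possibly overlapping unaligned moves and skips the byte prologue.  */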
27161 bool
27162 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27163 rtx align_exp, rtx expected_align_exp,
27164 rtx expected_size_exp, rtx min_size_exp,
27165 rtx max_size_exp, rtx probable_max_size_exp,
27166 bool issetmem)
27168 rtx destreg;
27169 rtx srcreg = NULL;
27170 rtx_code_label *label = NULL;
27171 rtx tmp;
27172 rtx_code_label *jump_around_label = NULL;
27173 HOST_WIDE_INT align = 1;
27174 unsigned HOST_WIDE_INT count = 0;
27175 HOST_WIDE_INT expected_size = -1;
27176 int size_needed = 0, epilogue_size_needed;
27177 int desired_align = 0, align_bytes = 0;
27178 enum stringop_alg alg;
27179 rtx promoted_val = NULL;
27180 rtx vec_promoted_val = NULL;
27181 bool force_loopy_epilogue = false;
27182 int dynamic_check;
27183 bool need_zero_guard = false;
27184 bool noalign;
27185 machine_mode move_mode = VOIDmode;
27186 machine_mode wider_mode;
27187 int unroll_factor = 1;
27188 /* TODO: Once value ranges are available, fill in proper data. */
27189 unsigned HOST_WIDE_INT min_size = 0;
27190 unsigned HOST_WIDE_INT max_size = -1;
27191 unsigned HOST_WIDE_INT probable_max_size = -1;
27192 bool misaligned_prologue_used = false;
27193 bool have_as;
27195 if (CONST_INT_P (align_exp))
27196 align = INTVAL (align_exp);
27197 /* i386 can do misaligned access at a reasonably increased cost. */
27198 if (CONST_INT_P (expected_align_exp)
27199 && INTVAL (expected_align_exp) > align)
27200 align = INTVAL (expected_align_exp);
27201 /* ALIGN is the minimum of destination and source alignment, but we care here
27202 just about destination alignment. */
27203 else if (!issetmem
27204 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27205 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27207 if (CONST_INT_P (count_exp))
27209 min_size = max_size = probable_max_size = count = expected_size
27210 = INTVAL (count_exp);
27211 /* When COUNT is 0, there is nothing to do. */
27212 if (!count)
27213 return true;
27215 else
27217 if (min_size_exp)
27218 min_size = INTVAL (min_size_exp);
27219 if (max_size_exp)
27220 max_size = INTVAL (max_size_exp);
27221 if (probable_max_size_exp)
27222 probable_max_size = INTVAL (probable_max_size_exp);
27223 if (CONST_INT_P (expected_size_exp))
27224 expected_size = INTVAL (expected_size_exp);
27227 /* Make sure we don't need to care about overflow later on. */
27228 if (count > (HOST_WIDE_INT_1U << 30))
27229 return false;
27231 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27232 if (!issetmem)
27233 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27235 /* Step 0: Decide on preferred algorithm, desired alignment and
27236 size of chunks to be copied by main loop. */
27237 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27238 issetmem,
27239 issetmem && val_exp == const0_rtx, have_as,
27240 &dynamic_check, &noalign, false);
27241 if (alg == libcall)
27242 return false;
27243 gcc_assert (alg != no_stringop);
27245 /* For now the vector version of memset is generated only for memory zeroing, as
27246 creating the promoted vector value is very cheap in this case. */
27247 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27248 alg = unrolled_loop;
27250 if (!count)
27251 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27252 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27253 if (!issetmem)
27254 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27256 unroll_factor = 1;
27257 move_mode = word_mode;
27258 switch (alg)
27260 case libcall:
27261 case no_stringop:
27262 case last_alg:
27263 gcc_unreachable ();
27264 case loop_1_byte:
27265 need_zero_guard = true;
27266 move_mode = QImode;
27267 break;
27268 case loop:
27269 need_zero_guard = true;
27270 break;
27271 case unrolled_loop:
27272 need_zero_guard = true;
27273 unroll_factor = (TARGET_64BIT ? 4 : 2);
27274 break;
27275 case vector_loop:
27276 need_zero_guard = true;
27277 unroll_factor = 4;
27278 /* Find the widest supported mode. */
27279 move_mode = word_mode;
27280 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27281 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27282 move_mode = wider_mode;
27284 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27285 move_mode = TImode;
27287 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27288 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27289 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27291 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27292 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27293 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27294 move_mode = word_mode;
27296 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27297 break;
27298 case rep_prefix_8_byte:
27299 move_mode = DImode;
27300 break;
27301 case rep_prefix_4_byte:
27302 move_mode = SImode;
27303 break;
27304 case rep_prefix_1_byte:
27305 move_mode = QImode;
27306 break;
27308 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27309 epilogue_size_needed = size_needed;
27311 /* If we are going to make any library calls conditionally, make sure any
27312 pending stack adjustment happens before the first conditional branch,
27313 otherwise they will be emitted before the library call only and won't
27314 happen from the other branches. */
27315 if (dynamic_check != -1)
27316 do_pending_stack_adjust ();
27318 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27319 if (!TARGET_ALIGN_STRINGOPS || noalign)
27320 align = desired_align;
27322 /* Step 1: Prologue guard. */
27324 /* Alignment code needs count to be in register. */
27325 if (CONST_INT_P (count_exp) && desired_align > align)
27327 if (INTVAL (count_exp) > desired_align
27328 && INTVAL (count_exp) > size_needed)
27330 align_bytes
27331 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27332 if (align_bytes <= 0)
27333 align_bytes = 0;
27334 else
27335 align_bytes = desired_align - align_bytes;
27337 if (align_bytes == 0)
27338 count_exp = force_reg (counter_mode (count_exp), count_exp);
27340 gcc_assert (desired_align >= 1 && align >= 1);
27342 /* Misaligned move sequences handle both prologue and epilogue at once.
27343 Default code generation results in smaller code for large alignments
27344 and also avoids redundant work when sizes are known precisely. */
27345 misaligned_prologue_used
27346 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27347 && MAX (desired_align, epilogue_size_needed) <= 32
27348 && desired_align <= epilogue_size_needed
27349 && ((desired_align > align && !align_bytes)
27350 || (!count && epilogue_size_needed > 1)));
27352 /* Do the cheap promotion to allow better CSE across the
27353 main loop and epilogue (i.e. one load of the big constant in
27354 front of all the code).
27355 For now the misaligned move sequences do not have a fast path
27356 without broadcasting. */
27357 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27359 if (alg == vector_loop)
27361 gcc_assert (val_exp == const0_rtx);
27362 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27363 promoted_val = promote_duplicated_reg_to_size (val_exp,
27364 GET_MODE_SIZE (word_mode),
27365 desired_align, align);
27367 else
27369 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27370 desired_align, align);
27373 /* Misaligned move sequences handle both prologues and epilogues at once.
27374 Default code generation results in smaller code for large alignments and
27375 also avoids redundant work when sizes are known precisely. */
27376 if (misaligned_prologue_used)
27379 /* The misaligned move prologue handles small blocks by itself. */
27379 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27380 (dst, src, &destreg, &srcreg,
27381 move_mode, promoted_val, vec_promoted_val,
27382 &count_exp,
27383 &jump_around_label,
27384 desired_align < align
27385 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27386 desired_align, align, &min_size, dynamic_check, issetmem);
27387 if (!issetmem)
27388 src = change_address (src, BLKmode, srcreg);
27389 dst = change_address (dst, BLKmode, destreg);
27390 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27391 epilogue_size_needed = 0;
27392 if (need_zero_guard
27393 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27395 /* It is possible that we copied enough so the main loop will not
27396 execute. */
27397 gcc_assert (size_needed > 1);
27398 if (jump_around_label == NULL_RTX)
27399 jump_around_label = gen_label_rtx ();
27400 emit_cmp_and_jump_insns (count_exp,
27401 GEN_INT (size_needed),
27402 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27403 if (expected_size == -1
27404 || expected_size < (desired_align - align) / 2 + size_needed)
27405 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27406 else
27407 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27410 /* Ensure that the alignment prologue won't copy past the end of the block. */
27411 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27413 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27414 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27415 Make sure it is power of 2. */
27416 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27418 /* To improve performance of small blocks, we jump around the VAL
27419 promoting code. This means that if the promoted VAL is not constant,
27420 we might not use it in the epilogue and have to use the byte
27421 loop variant. */
27422 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27423 force_loopy_epilogue = true;
27424 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27425 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27427 /* If main algorithm works on QImode, no epilogue is needed.
27428 For small sizes just don't align anything. */
27429 if (size_needed == 1)
27430 desired_align = align;
27431 else
27432 goto epilogue;
27434 else if (!count
27435 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27437 label = gen_label_rtx ();
27438 emit_cmp_and_jump_insns (count_exp,
27439 GEN_INT (epilogue_size_needed),
27440 LTU, 0, counter_mode (count_exp), 1, label);
27441 if (expected_size == -1 || expected_size < epilogue_size_needed)
27442 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27443 else
27444 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27448 /* Emit code to decide at runtime whether a library call or inline code
27449 should be used. */
27450 if (dynamic_check != -1)
27452 if (!issetmem && CONST_INT_P (count_exp))
27454 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27456 emit_block_copy_via_libcall (dst, src, count_exp);
27457 count_exp = const0_rtx;
27458 goto epilogue;
27461 else
27463 rtx_code_label *hot_label = gen_label_rtx ();
27464 if (jump_around_label == NULL_RTX)
27465 jump_around_label = gen_label_rtx ();
27466 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27467 LEU, 0, counter_mode (count_exp),
27468 1, hot_label);
27469 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27470 if (issetmem)
27471 set_storage_via_libcall (dst, count_exp, val_exp);
27472 else
27473 emit_block_copy_via_libcall (dst, src, count_exp);
27474 emit_jump (jump_around_label);
27475 emit_label (hot_label);
27479 /* Step 2: Alignment prologue. */
27480 /* Do the expensive promotion once we branched off the small blocks. */
27481 if (issetmem && !promoted_val)
27482 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27483 desired_align, align);
27485 if (desired_align > align && !misaligned_prologue_used)
27487 if (align_bytes == 0)
27489 /* Except for the first move in the prologue, we no longer know
27490 the constant offset in aliasing info. It doesn't seem worth
27491 the pain to maintain it for the first move, so throw away
27492 the info early. */
27493 dst = change_address (dst, BLKmode, destreg);
27494 if (!issetmem)
27495 src = change_address (src, BLKmode, srcreg);
27496 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27497 promoted_val, vec_promoted_val,
27498 count_exp, align, desired_align,
27499 issetmem);
27500 /* At most desired_align - align bytes are copied. */
27501 if (min_size < (unsigned)(desired_align - align))
27502 min_size = 0;
27503 else
27504 min_size -= desired_align - align;
27506 else
27508 /* If we know how many bytes need to be stored before dst is
27509 sufficiently aligned, maintain aliasing info accurately. */
27510 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27511 srcreg,
27512 promoted_val,
27513 vec_promoted_val,
27514 desired_align,
27515 align_bytes,
27516 issetmem);
27518 count_exp = plus_constant (counter_mode (count_exp),
27519 count_exp, -align_bytes);
27520 count -= align_bytes;
27521 min_size -= align_bytes;
27522 max_size -= align_bytes;
27524 if (need_zero_guard
27525 && min_size < (unsigned HOST_WIDE_INT) size_needed
27526 && (count < (unsigned HOST_WIDE_INT) size_needed
27527 || (align_bytes == 0
27528 && count < ((unsigned HOST_WIDE_INT) size_needed
27529 + desired_align - align))))
27531 /* It is possible that we copied enough so the main loop will not
27532 execute. */
27533 gcc_assert (size_needed > 1);
27534 if (label == NULL_RTX)
27535 label = gen_label_rtx ();
27536 emit_cmp_and_jump_insns (count_exp,
27537 GEN_INT (size_needed),
27538 LTU, 0, counter_mode (count_exp), 1, label);
27539 if (expected_size == -1
27540 || expected_size < (desired_align - align) / 2 + size_needed)
27541 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27542 else
27543 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27546 if (label && size_needed == 1)
27548 emit_label (label);
27549 LABEL_NUSES (label) = 1;
27550 label = NULL;
27551 epilogue_size_needed = 1;
27552 if (issetmem)
27553 promoted_val = val_exp;
27555 else if (label == NULL_RTX && !misaligned_prologue_used)
27556 epilogue_size_needed = size_needed;
27558 /* Step 3: Main loop. */
27560 switch (alg)
27562 case libcall:
27563 case no_stringop:
27564 case last_alg:
27565 gcc_unreachable ();
27566 case loop_1_byte:
27567 case loop:
27568 case unrolled_loop:
27569 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27570 count_exp, move_mode, unroll_factor,
27571 expected_size, issetmem);
27572 break;
27573 case vector_loop:
27574 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27575 vec_promoted_val, count_exp, move_mode,
27576 unroll_factor, expected_size, issetmem);
27577 break;
27578 case rep_prefix_8_byte:
27579 case rep_prefix_4_byte:
27580 case rep_prefix_1_byte:
27581 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27582 val_exp, count_exp, move_mode, issetmem);
27583 break;
27585 /* Properly adjust the offset of src and dest memory for aliasing. */
27586 if (CONST_INT_P (count_exp))
27588 if (!issetmem)
27589 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27590 (count / size_needed) * size_needed);
27591 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27592 (count / size_needed) * size_needed);
27594 else
27596 if (!issetmem)
27597 src = change_address (src, BLKmode, srcreg);
27598 dst = change_address (dst, BLKmode, destreg);
27601 /* Step 4: Epilogue to copy the remaining bytes. */
27602 epilogue:
27603 if (label)
27605 /* When the main loop is done, COUNT_EXP might hold the original count,
27606 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27607 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27608 bytes. Compensate if needed. */
27610 if (size_needed < epilogue_size_needed)
27612 tmp =
27613 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27614 GEN_INT (size_needed - 1), count_exp, 1,
27615 OPTAB_DIRECT);
27616 if (tmp != count_exp)
27617 emit_move_insn (count_exp, tmp);
27619 emit_label (label);
27620 LABEL_NUSES (label) = 1;
27623 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27625 if (force_loopy_epilogue)
27626 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27627 epilogue_size_needed);
27628 else
27630 if (issetmem)
27631 expand_setmem_epilogue (dst, destreg, promoted_val,
27632 vec_promoted_val, count_exp,
27633 epilogue_size_needed);
27634 else
27635 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27636 epilogue_size_needed);
27639 if (jump_around_label)
27640 emit_label (jump_around_label);
27641 return true;
27645 /* Expand the appropriate insns for doing strlen if not just doing
27646 repnz; scasb
27648 out = result, initialized with the start address
27649 align_rtx = alignment of the address.
27650 scratch = scratch register, initialized with the start address when
27651 not aligned, otherwise undefined
27653 This is just the body. It needs the initializations mentioned above and
27654 some address computing at the end. These things are done in i386.md. */
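/* The expansion below first checks up to 3 leading bytes one at a time
   until OUT is 4-byte aligned, then scans the string 4 bytes per iteration
   using a bit trick to detect a zero byte, and finally adjusts OUT to point
   at the terminating zero. */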
27656 static void
27657 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27659 int align;
27660 rtx tmp;
27661 rtx_code_label *align_2_label = NULL;
27662 rtx_code_label *align_3_label = NULL;
27663 rtx_code_label *align_4_label = gen_label_rtx ();
27664 rtx_code_label *end_0_label = gen_label_rtx ();
27665 rtx mem;
27666 rtx tmpreg = gen_reg_rtx (SImode);
27667 rtx scratch = gen_reg_rtx (SImode);
27668 rtx cmp;
27670 align = 0;
27671 if (CONST_INT_P (align_rtx))
27672 align = INTVAL (align_rtx);
27674 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27676 /* Is there a known alignment and is it less than 4? */
27677 if (align < 4)
27679 rtx scratch1 = gen_reg_rtx (Pmode);
27680 emit_move_insn (scratch1, out);
27681 /* Is there a known alignment and is it not 2? */
27682 if (align != 2)
27684 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27685 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27687 /* Leave just the 3 lower bits. */
27688 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27689 NULL_RTX, 0, OPTAB_WIDEN);
27691 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27692 Pmode, 1, align_4_label);
27693 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27694 Pmode, 1, align_2_label);
27695 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27696 Pmode, 1, align_3_label);
27698 else
27700 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27701 check whether it is aligned to a 4-byte boundary. */
27703 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27704 NULL_RTX, 0, OPTAB_WIDEN);
27706 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27707 Pmode, 1, align_4_label);
27710 mem = change_address (src, QImode, out);
27712 /* Now compare the bytes. */
27714 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
27715 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27716 QImode, 1, end_0_label);
27718 /* Increment the address. */
27719 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27721 /* Not needed with an alignment of 2 */
27722 if (align != 2)
27724 emit_label (align_2_label);
27726 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27727 end_0_label);
27729 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27731 emit_label (align_3_label);
27734 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27735 end_0_label);
27737 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27740 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27741 align this loop: it only makes the program larger and does not help
27742 to speed it up. */
27743 emit_label (align_4_label);
27745 mem = change_address (src, SImode, out);
27746 emit_move_insn (scratch, mem);
27747 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27749 /* This formula yields a nonzero result iff one of the bytes is zero.
27750 This saves three branches inside the loop and many cycles. */
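/* The value computed below is (scratch - 0x01010101) & ~scratch & 0x80808080,
   which is nonzero exactly when some byte of SCRATCH is zero. For example,
   scratch = 0x41420043 gives 0x4040FF42 & 0xBEBDFFBC & 0x80808080 = 0x00008000,
   while scratch = 0x41424344 (no zero byte) gives 0. */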
27752 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27753 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27754 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27755 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27756 gen_int_mode (0x80808080, SImode)));
27757 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27758 align_4_label);
27760 if (TARGET_CMOVE)
27762 rtx reg = gen_reg_rtx (SImode);
27763 rtx reg2 = gen_reg_rtx (Pmode);
27764 emit_move_insn (reg, tmpreg);
27765 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27767 /* If zero is not in the first two bytes, move two bytes forward. */
27768 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27769 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27770 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27771 emit_insn (gen_rtx_SET (tmpreg,
27772 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27773 reg,
27774 tmpreg)));
27775 /* Emit lea manually to avoid clobbering of flags. */
27776 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27778 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27779 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27780 emit_insn (gen_rtx_SET (out,
27781 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27782 reg2,
27783 out)));
27785 else
27787 rtx_code_label *end_2_label = gen_label_rtx ();
27788 /* Is zero in the first two bytes? */
27790 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27791 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27792 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27793 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27794 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27795 pc_rtx);
27796 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27797 JUMP_LABEL (tmp) = end_2_label;
27799 /* Not in the first two. Move two bytes forward. */
27800 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27801 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27803 emit_label (end_2_label);
27807 /* Avoid a branch in fixing the byte. */
27808 tmpreg = gen_lowpart (QImode, tmpreg);
27809 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27810 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27811 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27812 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27814 emit_label (end_0_label);
27817 /* Expand strlen. */
27819 bool
27820 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27822 rtx addr, scratch1, scratch2, scratch3, scratch4;
27824 /* The generic case of the strlen expander is long. Avoid expanding
27825 it unless TARGET_INLINE_ALL_STRINGOPS. */
27827 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27828 && !TARGET_INLINE_ALL_STRINGOPS
27829 && !optimize_insn_for_size_p ()
27830 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27831 return false;
27833 addr = force_reg (Pmode, XEXP (src, 0));
27834 scratch1 = gen_reg_rtx (Pmode);
27836 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27837 && !optimize_insn_for_size_p ())
27839 /* Well it seems that some optimizer does not combine a call like
27840 foo(strlen(bar), strlen(bar));
27841 when the move and the subtraction are done here. It calculates
27842 the length just once when these instructions are done inside
27843 output_strlen_unroll(). But since &bar[strlen(bar)] is
27844 often used and this uses one fewer register for the lifetime of
27845 output_strlen_unroll(), this is better. */
27847 emit_move_insn (out, addr);
27849 ix86_expand_strlensi_unroll_1 (out, src, align);
27851 /* strlensi_unroll_1 returns the address of the zero at the end of
27852 the string, like memchr(), so compute the length by subtracting
27853 the start address. */
27854 emit_insn (ix86_gen_sub3 (out, out, addr));
27856 else
27858 rtx unspec;
27860 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27861 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27862 return false;
27863 /* Can't use this for non-default address spaces. */
27864 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27865 return false;
27867 scratch2 = gen_reg_rtx (Pmode);
27868 scratch3 = gen_reg_rtx (Pmode);
27869 scratch4 = force_reg (Pmode, constm1_rtx);
27871 emit_move_insn (scratch3, addr);
27872 eoschar = force_reg (QImode, eoschar);
27874 src = replace_equiv_address_nv (src, scratch3);
27876 /* If .md starts supporting :P, this can be done in .md. */
27877 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27878 scratch4), UNSPEC_SCAS);
27879 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27880 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27881 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
27883 return true;
27886 /* For a given symbol (function), construct code to compute the address of its
27887 PLT entry in the large x86-64 PIC model. */
27888 static rtx
27889 construct_plt_address (rtx symbol)
27891 rtx tmp, unspec;
27893 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
27894 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
27895 gcc_assert (Pmode == DImode);
27897 tmp = gen_reg_rtx (Pmode);
27898 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
27900 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
27901 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
27902 return tmp;
27906 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
27907 rtx callarg2,
27908 rtx pop, bool sibcall)
27910 rtx vec[3];
27911 rtx use = NULL, call;
27912 unsigned int vec_len = 0;
27913 tree fndecl;
27915 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27917 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
27918 if (fndecl
27919 && (lookup_attribute ("interrupt",
27920 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
27921 error ("interrupt service routine can't be called directly");
27923 else
27924 fndecl = NULL_TREE;
27926 if (pop == const0_rtx)
27927 pop = NULL;
27928 gcc_assert (!TARGET_64BIT || !pop);
27930 if (TARGET_MACHO && !TARGET_64BIT)
27932 #if TARGET_MACHO
27933 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27934 fnaddr = machopic_indirect_call_target (fnaddr);
27935 #endif
27937 else
27939 /* Static functions and indirect calls don't need the pic register. Also,
27940 check if the PLT was explicitly avoided via -fno-plt or the "noplt" attribute, making
27941 it an indirect call. */
27942 rtx addr = XEXP (fnaddr, 0);
27943 if (flag_pic
27944 && GET_CODE (addr) == SYMBOL_REF
27945 && !SYMBOL_REF_LOCAL_P (addr))
27947 if (flag_plt
27948 && (SYMBOL_REF_DECL (addr) == NULL_TREE
27949 || !lookup_attribute ("noplt",
27950 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
27952 if (!TARGET_64BIT
27953 || (ix86_cmodel == CM_LARGE_PIC
27954 && DEFAULT_ABI != MS_ABI))
27956 use_reg (&use, gen_rtx_REG (Pmode,
27957 REAL_PIC_OFFSET_TABLE_REGNUM));
27958 if (ix86_use_pseudo_pic_reg ())
27959 emit_move_insn (gen_rtx_REG (Pmode,
27960 REAL_PIC_OFFSET_TABLE_REGNUM),
27961 pic_offset_table_rtx);
27964 else if (!TARGET_PECOFF && !TARGET_MACHO)
27966 if (TARGET_64BIT)
27968 fnaddr = gen_rtx_UNSPEC (Pmode,
27969 gen_rtvec (1, addr),
27970 UNSPEC_GOTPCREL);
27971 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27973 else
27975 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
27976 UNSPEC_GOT);
27977 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27978 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
27979 fnaddr);
27981 fnaddr = gen_const_mem (Pmode, fnaddr);
27982 /* Pmode may not be the same as word_mode for x32, which
27983 doesn't support indirect branch via 32-bit memory slot.
27984 Since x32 GOT slot is 64 bit with zero upper 32 bits,
27985 indirect branch via x32 GOT slot is OK. */
27986 if (GET_MODE (fnaddr) != word_mode)
27987 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
27988 fnaddr = gen_rtx_MEM (QImode, fnaddr);
27993 /* Skip setting up RAX register for -mskip-rax-setup when there are no
27994 parameters passed in vector registers. */
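/* CALLARG2 holds the number of SSE registers used for the variable
   arguments; the SysV x86-64 psABI expects that count in %al before a
   varargs call. */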
27995 if (TARGET_64BIT
27996 && (INTVAL (callarg2) > 0
27997 || (INTVAL (callarg2) == 0
27998 && (TARGET_SSE || !flag_skip_rax_setup))))
28000 rtx al = gen_rtx_REG (QImode, AX_REG);
28001 emit_move_insn (al, callarg2);
28002 use_reg (&use, al);
28005 if (ix86_cmodel == CM_LARGE_PIC
28006 && !TARGET_PECOFF
28007 && MEM_P (fnaddr)
28008 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28009 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28010 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28011 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28012 branch via x32 GOT slot is OK. */
28013 else if (!(TARGET_X32
28014 && MEM_P (fnaddr)
28015 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28016 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28017 && (sibcall
28018 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28019 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28021 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28022 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28025 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28027 if (retval)
28029 /* We should add bounds as a destination register in case
28030 a pointer with bounds may be returned. */
28031 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28033 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28034 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28035 if (GET_CODE (retval) == PARALLEL)
28037 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28038 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28039 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28040 retval = chkp_join_splitted_slot (retval, par);
28042 else
28044 retval = gen_rtx_PARALLEL (VOIDmode,
28045 gen_rtvec (3, retval, b0, b1));
28046 chkp_put_regs_to_expr_list (retval);
28050 call = gen_rtx_SET (retval, call);
28052 vec[vec_len++] = call;
28054 if (pop)
28056 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28057 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28058 vec[vec_len++] = pop;
28061 if (cfun->machine->no_caller_saved_registers
28062 && (!fndecl
28063 || (!TREE_THIS_VOLATILE (fndecl)
28064 && !lookup_attribute ("no_caller_saved_registers",
28065 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28067 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28068 bool is_64bit_ms_abi = (TARGET_64BIT
28069 && ix86_function_abi (fndecl) == MS_ABI);
28070 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28072 /* If there are no caller-saved registers, add all registers
28073 that are clobbered by the call which returns. */
28074 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28075 if (!fixed_regs[i]
28076 && (ix86_call_used_regs[i] == 1
28077 || (ix86_call_used_regs[i] & c_mask))
28078 && !STACK_REGNO_P (i)
28079 && !MMX_REGNO_P (i))
28080 clobber_reg (&use,
28081 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28083 else if (TARGET_64BIT_MS_ABI
28084 && (!callarg2 || INTVAL (callarg2) != -2))
28086 unsigned i;
28088 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28090 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28091 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28093 clobber_reg (&use, gen_rtx_REG (mode, regno));
28096 /* Set here, but it may get cleared later. */
28097 if (TARGET_CALL_MS2SYSV_XLOGUES)
28099 if (!TARGET_SSE)
28102 /* Don't break hot-patched functions. */
28103 else if (ix86_function_ms_hook_prologue (current_function_decl))
28106 /* TODO: Cases not yet examined. */
28107 else if (flag_split_stack)
28108 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28110 else
28112 gcc_assert (!reload_completed);
28113 cfun->machine->call_ms2sysv = true;
28118 if (vec_len > 1)
28119 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28120 call = emit_call_insn (call);
28121 if (use)
28122 CALL_INSN_FUNCTION_USAGE (call) = use;
28124 return call;
28127 /* Return true if the function being called was marked with attribute
28128 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28129 to handle the non-PIC case in the backend because there is no easy
28130 interface for the front-end to force non-PLT calls to use the GOT.
28131 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28132 to call the function marked "noplt" indirectly. */
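/* In that case the call is emitted through the GOT, e.g. as
   call *foo@GOTPCREL(%rip) on 64-bit targets (see ix86_output_call_insn
   below), instead of going through the PLT. */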
28134 static bool
28135 ix86_nopic_noplt_attribute_p (rtx call_op)
28137 if (flag_pic || ix86_cmodel == CM_LARGE
28138 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28139 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28140 || SYMBOL_REF_LOCAL_P (call_op))
28141 return false;
28143 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28145 if (!flag_plt
28146 || (symbol_decl != NULL_TREE
28147 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28148 return true;
28150 return false;
28153 /* Output the assembly for a call instruction. */
28155 const char *
28156 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28158 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28159 bool seh_nop_p = false;
28160 const char *xasm;
28162 if (SIBLING_CALL_P (insn))
28164 if (direct_p)
28166 if (ix86_nopic_noplt_attribute_p (call_op))
28168 if (TARGET_64BIT)
28169 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28170 else
28171 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28173 else
28174 xasm = "%!jmp\t%P0";
28176 /* SEH epilogue detection requires the indirect branch case
28177 to include REX.W. */
28178 else if (TARGET_SEH)
28179 xasm = "%!rex.W jmp\t%A0";
28180 else
28181 xasm = "%!jmp\t%A0";
28183 output_asm_insn (xasm, &call_op);
28184 return "";
28187 /* SEH unwinding can require an extra nop to be emitted in several
28188 circumstances. Determine if we have one of those. */
28189 if (TARGET_SEH)
28191 rtx_insn *i;
28193 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28195 /* If we get to another real insn, we don't need the nop. */
28196 if (INSN_P (i))
28197 break;
28199 /* If we get to the epilogue note, prevent a catch region from
28200 being adjacent to the standard epilogue sequence. With non-call
28201 exceptions, we'll have done this during epilogue emission. */
28202 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28203 && !flag_non_call_exceptions
28204 && !can_throw_internal (insn))
28206 seh_nop_p = true;
28207 break;
28211 /* If we didn't find a real insn following the call, prevent the
28212 unwinder from looking into the next function. */
28213 if (i == NULL)
28214 seh_nop_p = true;
28217 if (direct_p)
28219 if (ix86_nopic_noplt_attribute_p (call_op))
28221 if (TARGET_64BIT)
28222 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28223 else
28224 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28226 else
28227 xasm = "%!call\t%P0";
28229 else
28230 xasm = "%!call\t%A0";
28232 output_asm_insn (xasm, &call_op);
28234 if (seh_nop_p)
28235 return "nop";
28237 return "";
28240 /* Clear stack slot assignments remembered from previous functions.
28241 This is called from INIT_EXPANDERS once before RTL is emitted for each
28242 function. */
28244 static struct machine_function *
28245 ix86_init_machine_status (void)
28247 struct machine_function *f;
28249 f = ggc_cleared_alloc<machine_function> ();
28250 f->call_abi = ix86_abi;
28252 return f;
28255 /* Return a MEM corresponding to a stack slot with mode MODE.
28256 Allocate a new slot if necessary.
28258 The RTL for a function can have several slots available: N is
28259 which slot to use. */
28262 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28264 struct stack_local_entry *s;
28266 gcc_assert (n < MAX_386_STACK_LOCALS);
28268 for (s = ix86_stack_locals; s; s = s->next)
28269 if (s->mode == mode && s->n == n)
28270 return validize_mem (copy_rtx (s->rtl));
28272 s = ggc_alloc<stack_local_entry> ();
28273 s->n = n;
28274 s->mode = mode;
28275 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28277 s->next = ix86_stack_locals;
28278 ix86_stack_locals = s;
28279 return validize_mem (copy_rtx (s->rtl));
28282 static void
28283 ix86_instantiate_decls (void)
28285 struct stack_local_entry *s;
28287 for (s = ix86_stack_locals; s; s = s->next)
28288 if (s->rtl != NULL_RTX)
28289 instantiate_decl_rtl (s->rtl);
28292 /* Return the number used for encoding REG, in the range 0..7. */
28294 static int
28295 reg_encoded_number (rtx reg)
28297 unsigned regno = REGNO (reg);
28298 switch (regno)
28300 case AX_REG:
28301 return 0;
28302 case CX_REG:
28303 return 1;
28304 case DX_REG:
28305 return 2;
28306 case BX_REG:
28307 return 3;
28308 case SP_REG:
28309 return 4;
28310 case BP_REG:
28311 return 5;
28312 case SI_REG:
28313 return 6;
28314 case DI_REG:
28315 return 7;
28316 default:
28317 break;
28319 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28320 return regno - FIRST_STACK_REG;
28321 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28322 return regno - FIRST_SSE_REG;
28323 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28324 return regno - FIRST_MMX_REG;
28325 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28326 return regno - FIRST_REX_SSE_REG;
28327 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28328 return regno - FIRST_REX_INT_REG;
28329 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28330 return regno - FIRST_MASK_REG;
28331 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28332 return regno - FIRST_BND_REG;
28333 return -1;
28336 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28337 in its encoding if it could be relevant for ROP mitigation, otherwise
28338 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28339 used for calculating it into them. */
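/* For example, a register-to-register instruction with operand 0 in %eax
   (encoding 0) and the other operand in %ecx (encoding 1) yields
   0xc0 + (1 << 3) + 0 = 0xc8. */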
28341 static int
28342 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28343 int *popno0 = 0, int *popno1 = 0)
28345 if (asm_noperands (PATTERN (insn)) >= 0)
28346 return -1;
28347 int has_modrm = get_attr_modrm (insn);
28348 if (!has_modrm)
28349 return -1;
28350 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28351 rtx op0, op1;
28352 switch (cls)
28354 case MODRM_CLASS_OP02:
28355 gcc_assert (noperands >= 3);
28356 if (popno0)
28358 *popno0 = 0;
28359 *popno1 = 2;
28361 op0 = operands[0];
28362 op1 = operands[2];
28363 break;
28364 case MODRM_CLASS_OP01:
28365 gcc_assert (noperands >= 2);
28366 if (popno0)
28368 *popno0 = 0;
28369 *popno1 = 1;
28371 op0 = operands[0];
28372 op1 = operands[1];
28373 break;
28374 default:
28375 return -1;
28377 if (REG_P (op0) && REG_P (op1))
28379 int enc0 = reg_encoded_number (op0);
28380 int enc1 = reg_encoded_number (op1);
28381 return 0xc0 + (enc1 << 3) + enc0;
28383 return -1;
28386 /* Check whether x86 address PARTS is a pc-relative address. */
28388 bool
28389 ix86_rip_relative_addr_p (struct ix86_address *parts)
28391 rtx base, index, disp;
28393 base = parts->base;
28394 index = parts->index;
28395 disp = parts->disp;
28397 if (disp && !base && !index)
28399 if (TARGET_64BIT)
28401 rtx symbol = disp;
28403 if (GET_CODE (disp) == CONST)
28404 symbol = XEXP (disp, 0);
28405 if (GET_CODE (symbol) == PLUS
28406 && CONST_INT_P (XEXP (symbol, 1)))
28407 symbol = XEXP (symbol, 0);
28409 if (GET_CODE (symbol) == LABEL_REF
28410 || (GET_CODE (symbol) == SYMBOL_REF
28411 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28412 || (GET_CODE (symbol) == UNSPEC
28413 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28414 || XINT (symbol, 1) == UNSPEC_PCREL
28415 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28416 return true;
28419 return false;
28422 /* Calculate the length of the memory address in the instruction encoding.
28423 Includes the addr32 prefix, but does not include the one-byte modrm, opcode,
28424 or other prefixes. We never generate the addr32 prefix for LEA insns. */
28427 memory_address_length (rtx addr, bool lea)
28429 struct ix86_address parts;
28430 rtx base, index, disp;
28431 int len;
28432 int ok;
28434 if (GET_CODE (addr) == PRE_DEC
28435 || GET_CODE (addr) == POST_INC
28436 || GET_CODE (addr) == PRE_MODIFY
28437 || GET_CODE (addr) == POST_MODIFY)
28438 return 0;
28440 ok = ix86_decompose_address (addr, &parts);
28441 gcc_assert (ok);
28443 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28445 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
28446 if (TARGET_64BIT && !lea
28447 && (SImode_address_operand (addr, VOIDmode)
28448 || (parts.base && GET_MODE (parts.base) == SImode)
28449 || (parts.index && GET_MODE (parts.index) == SImode)))
28450 len++;
28452 base = parts.base;
28453 index = parts.index;
28454 disp = parts.disp;
28456 if (base && SUBREG_P (base))
28457 base = SUBREG_REG (base);
28458 if (index && SUBREG_P (index))
28459 index = SUBREG_REG (index);
28461 gcc_assert (base == NULL_RTX || REG_P (base));
28462 gcc_assert (index == NULL_RTX || REG_P (index));
28464 /* Rule of thumb:
28465 - esp as the base always wants an index,
28466 - ebp as the base always wants a displacement,
28467 - r12 as the base always wants an index,
28468 - r13 as the base always wants a displacement. */
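/* For example, (%esp) needs a SIB byte and (%ebp) needs a disp8, so both
   are encoded one byte longer than (%eax). */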
28470 /* Register Indirect. */
28471 if (base && !index && !disp)
28473 /* esp (for its index) and ebp (for its displacement) need
28474 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28475 code. */
28476 if (base == arg_pointer_rtx
28477 || base == frame_pointer_rtx
28478 || REGNO (base) == SP_REG
28479 || REGNO (base) == BP_REG
28480 || REGNO (base) == R12_REG
28481 || REGNO (base) == R13_REG)
28482 len++;
28485 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28486 is not disp32, but disp32(%rip), so for disp32 a
28487 SIB byte is needed, unless print_operand_address
28488 optimizes it into disp32(%rip) or (%rip) is implied
28489 by UNSPEC. */
28490 else if (disp && !base && !index)
28492 len += 4;
28493 if (!ix86_rip_relative_addr_p (&parts))
28494 len++;
28496 else
28498 /* Find the length of the displacement constant. */
28499 if (disp)
28501 if (base && satisfies_constraint_K (disp))
28502 len += 1;
28503 else
28504 len += 4;
28506 /* ebp always wants a displacement. Similarly r13. */
28507 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28508 len++;
28510 /* An index requires the two-byte modrm form.... */
28511 if (index
28512 /* ...like esp (or r12), which always wants an index. */
28513 || base == arg_pointer_rtx
28514 || base == frame_pointer_rtx
28515 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28516 len++;
28519 return len;
28522 /* Compute default value for "length_immediate" attribute. When SHORTFORM
28523 is set, expect that the insn has an 8-bit immediate alternative. */
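/* For example, with SHORTFORM set an immediate in [-128, 127] counts as a
   single byte, while a full SImode immediate counts as 4 bytes (DImode
   immediates also count as 4 bytes, since they are sign-extended from
   32 bits). */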
28525 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28527 int len = 0;
28528 int i;
28529 extract_insn_cached (insn);
28530 for (i = recog_data.n_operands - 1; i >= 0; --i)
28531 if (CONSTANT_P (recog_data.operand[i]))
28533 enum attr_mode mode = get_attr_mode (insn);
28535 gcc_assert (!len);
28536 if (shortform && CONST_INT_P (recog_data.operand[i]))
28538 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28539 switch (mode)
28541 case MODE_QI:
28542 len = 1;
28543 continue;
28544 case MODE_HI:
28545 ival = trunc_int_for_mode (ival, HImode);
28546 break;
28547 case MODE_SI:
28548 ival = trunc_int_for_mode (ival, SImode);
28549 break;
28550 default:
28551 break;
28553 if (IN_RANGE (ival, -128, 127))
28555 len = 1;
28556 continue;
28559 switch (mode)
28561 case MODE_QI:
28562 len = 1;
28563 break;
28564 case MODE_HI:
28565 len = 2;
28566 break;
28567 case MODE_SI:
28568 len = 4;
28569 break;
28570 /* Immediates for DImode instructions are encoded
28571 as 32-bit sign-extended values. */
28572 case MODE_DI:
28573 len = 4;
28574 break;
28575 default:
28576 fatal_insn ("unknown insn mode", insn);
28579 return len;
28582 /* Compute default value for "length_address" attribute. */
28584 ix86_attr_length_address_default (rtx_insn *insn)
28586 int i;
28588 if (get_attr_type (insn) == TYPE_LEA)
28590 rtx set = PATTERN (insn), addr;
28592 if (GET_CODE (set) == PARALLEL)
28593 set = XVECEXP (set, 0, 0);
28595 gcc_assert (GET_CODE (set) == SET);
28597 addr = SET_SRC (set);
28599 return memory_address_length (addr, true);
28602 extract_insn_cached (insn);
28603 for (i = recog_data.n_operands - 1; i >= 0; --i)
28605 rtx op = recog_data.operand[i];
28606 if (MEM_P (op))
28608 constrain_operands_cached (insn, reload_completed);
28609 if (which_alternative != -1)
28611 const char *constraints = recog_data.constraints[i];
28612 int alt = which_alternative;
28614 while (*constraints == '=' || *constraints == '+')
28615 constraints++;
28616 while (alt-- > 0)
28617 while (*constraints++ != ',')
28619 /* Skip ignored operands. */
28620 if (*constraints == 'X')
28621 continue;
28624 int len = memory_address_length (XEXP (op, 0), false);
28626 /* Account for segment prefix for non-default addr spaces. */
28627 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28628 len++;
28630 return len;
28633 return 0;
28636 /* Compute default value for "length_vex" attribute. It includes
28637 the 2- or 3-byte VEX prefix and 1 opcode byte. */
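/* So, e.g., an instruction from the 0f opcode map that needs neither VEX.W
   nor the X/B register-extension bits gets 2 + 1 = 3 bytes here, otherwise
   3 + 1 = 4. */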
28640 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28641 bool has_vex_w)
28643 int i;
28645 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit requires
28646 the 3-byte VEX prefix. */
28647 if (!has_0f_opcode || has_vex_w)
28648 return 3 + 1;
28650 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
28651 if (!TARGET_64BIT)
28652 return 2 + 1;
28654 extract_insn_cached (insn);
28656 for (i = recog_data.n_operands - 1; i >= 0; --i)
28657 if (REG_P (recog_data.operand[i]))
28659 /* REX.W bit uses 3 byte VEX prefix. */
28660 if (GET_MODE (recog_data.operand[i]) == DImode
28661 && GENERAL_REG_P (recog_data.operand[i]))
28662 return 3 + 1;
28664 else
28666 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28667 if (MEM_P (recog_data.operand[i])
28668 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28669 return 3 + 1;
28672 return 2 + 1;
28676 static bool
28677 ix86_class_likely_spilled_p (reg_class_t);
28679 /* Return true if the lhs of INSN is a HW function argument register; set
28680 *IS_SPILLED to true if it is a likely-spilled HW register. */
28681 static bool
28682 insn_is_function_arg (rtx insn, bool* is_spilled)
28684 rtx dst;
28686 if (!NONDEBUG_INSN_P (insn))
28687 return false;
28688 /* Call instructions are not movable, ignore them. */
28689 if (CALL_P (insn))
28690 return false;
28691 insn = PATTERN (insn);
28692 if (GET_CODE (insn) == PARALLEL)
28693 insn = XVECEXP (insn, 0, 0);
28694 if (GET_CODE (insn) != SET)
28695 return false;
28696 dst = SET_DEST (insn);
28697 if (REG_P (dst) && HARD_REGISTER_P (dst)
28698 && ix86_function_arg_regno_p (REGNO (dst)))
28700 /* Is it a likely-spilled HW register? */
28701 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
28702 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
28703 *is_spilled = true;
28704 return true;
28706 return false;
28709 /* Add output dependencies for a chain of adjacent function arguments, but only
28710 if there is a move to a likely-spilled HW register. Return the first argument
28711 if at least one dependence was added, or NULL otherwise. */
28712 static rtx_insn *
28713 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
28715 rtx_insn *insn;
28716 rtx_insn *last = call;
28717 rtx_insn *first_arg = NULL;
28718 bool is_spilled = false;
28720 head = PREV_INSN (head);
28722 /* Find the argument-passing instruction nearest to the call. */
28723 while (true)
28725 last = PREV_INSN (last);
28726 if (last == head)
28727 return NULL;
28728 if (!NONDEBUG_INSN_P (last))
28729 continue;
28730 if (insn_is_function_arg (last, &is_spilled))
28731 break;
28732 return NULL;
28735 first_arg = last;
28736 while (true)
28738 insn = PREV_INSN (last);
28739 if (!INSN_P (insn))
28740 break;
28741 if (insn == head)
28742 break;
28743 if (!NONDEBUG_INSN_P (insn))
28745 last = insn;
28746 continue;
28748 if (insn_is_function_arg (insn, &is_spilled))
28750 /* Add an output dependence between two function arguments if the chain
28751 of output arguments contains likely-spilled HW registers. */
28752 if (is_spilled)
28753 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28754 first_arg = last = insn;
28756 else
28757 break;
28759 if (!is_spilled)
28760 return NULL;
28761 return first_arg;
28764 /* Add an output or anti dependency from INSN to FIRST_ARG to restrict its code
28765 motion. */
28766 static void
28767 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
28769 rtx set;
28770 rtx tmp;
28772 /* Add anti dependencies for bounds stores. */
28773 if (INSN_P (insn)
28774 && GET_CODE (PATTERN (insn)) == PARALLEL
28775 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
28776 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
28778 add_dependence (first_arg, insn, REG_DEP_ANTI);
28779 return;
28782 set = single_set (insn);
28783 if (!set)
28784 return;
28785 tmp = SET_DEST (set);
28786 if (REG_P (tmp))
28788 /* Add output dependency to the first function argument. */
28789 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28790 return;
28792 /* Add anti dependency. */
28793 add_dependence (first_arg, insn, REG_DEP_ANTI);
28796 /* Avoid cross-block motion of a function argument by adding a dependency
28797 from the first non-jump instruction in BB. */
28798 static void
28799 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
28801 rtx_insn *insn = BB_END (bb);
28803 while (insn)
28805 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
28807 rtx set = single_set (insn);
28808 if (set)
28810 avoid_func_arg_motion (arg, insn);
28811 return;
28814 if (insn == BB_HEAD (bb))
28815 return;
28816 insn = PREV_INSN (insn);
28820 /* Hook for pre-reload schedule - avoid motion of function arguments
28821 passed in likely spilled HW registers. */
28822 static void
28823 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
28825 rtx_insn *insn;
28826 rtx_insn *first_arg = NULL;
28827 if (reload_completed)
28828 return;
28829 while (head != tail && DEBUG_INSN_P (head))
28830 head = NEXT_INSN (head);
28831 for (insn = tail; insn != head; insn = PREV_INSN (insn))
28832 if (INSN_P (insn) && CALL_P (insn))
28834 first_arg = add_parameter_dependencies (insn, head);
28835 if (first_arg)
28837 /* Add a dependee for the first argument to predecessors, but only if the
28838 region contains more than one block. */
28839 basic_block bb = BLOCK_FOR_INSN (insn);
28840 int rgn = CONTAINING_RGN (bb->index);
28841 int nr_blks = RGN_NR_BLOCKS (rgn);
28842 /* Skip trivial regions and region head blocks that can have
28843 predecessors outside of the region. */
28844 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
28846 edge e;
28847 edge_iterator ei;
28849 /* Regions are SCCs with the exception of selective
28850 scheduling with pipelining of outer blocks enabled.
28851 So also check that immediate predecessors of a non-head
28852 block are in the same region. */
28853 FOR_EACH_EDGE (e, ei, bb->preds)
28855 /* Avoid creating loop-carried dependencies by using the
28856 topological ordering in the region. */
28857 if (rgn == CONTAINING_RGN (e->src->index)
28858 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
28859 add_dependee_for_func_arg (first_arg, e->src);
28862 insn = first_arg;
28863 if (insn == head)
28864 break;
28867 else if (first_arg)
28868 avoid_func_arg_motion (first_arg, insn);
28871 /* Hook for pre-reload schedule - set the priority of moves from likely-spilled
28872 HW registers to maximum, to schedule them as soon as possible. These are
28873 moves from function argument registers at the top of the function entry
28874 and moves from function return value registers after a call. */
28875 static int
28876 ix86_adjust_priority (rtx_insn *insn, int priority)
28878 rtx set;
28880 if (reload_completed)
28881 return priority;
28883 if (!NONDEBUG_INSN_P (insn))
28884 return priority;
28886 set = single_set (insn);
28887 if (set)
28889 rtx tmp = SET_SRC (set);
28890 if (REG_P (tmp)
28891 && HARD_REGISTER_P (tmp)
28892 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
28893 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
28894 return current_sched_info->sched_max_insns_priority;
28897 return priority;
28900 /* Prepare for scheduling pass. */
28901 static void
28902 ix86_sched_init_global (FILE *, int, int)
28904 /* Install scheduling hooks for current CPU. Some of these hooks are used
28905 in time-critical parts of the scheduler, so we only set them up when
28906 they are actually used. */
28907 switch (ix86_tune)
28909 case PROCESSOR_CORE2:
28910 case PROCESSOR_NEHALEM:
28911 case PROCESSOR_SANDYBRIDGE:
28912 case PROCESSOR_HASWELL:
28913 case PROCESSOR_GENERIC:
28914 /* Do not perform multipass scheduling for pre-reload schedule
28915 to save compile time. */
28916 if (reload_completed)
28918 ix86_core2i7_init_hooks ();
28919 break;
28921 /* Fall through. */
28922 default:
28923 targetm.sched.dfa_post_advance_cycle = NULL;
28924 targetm.sched.first_cycle_multipass_init = NULL;
28925 targetm.sched.first_cycle_multipass_begin = NULL;
28926 targetm.sched.first_cycle_multipass_issue = NULL;
28927 targetm.sched.first_cycle_multipass_backtrack = NULL;
28928 targetm.sched.first_cycle_multipass_end = NULL;
28929 targetm.sched.first_cycle_multipass_fini = NULL;
28930 break;
28935 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
28937 static HOST_WIDE_INT
28938 ix86_static_rtx_alignment (machine_mode mode)
28940 if (mode == DFmode)
28941 return 64;
28942 if (ALIGN_MODE_128 (mode))
28943 return MAX (128, GET_MODE_ALIGNMENT (mode));
28944 return GET_MODE_ALIGNMENT (mode);
28947 /* Implement TARGET_CONSTANT_ALIGNMENT. */
28949 static HOST_WIDE_INT
28950 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
28952 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
28953 || TREE_CODE (exp) == INTEGER_CST)
28955 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
28956 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
28957 return MAX (mode_align, align);
28959 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
28960 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
28961 return BITS_PER_WORD;
28963 return align;
28966 /* Implement TARGET_EMPTY_RECORD_P. */
28968 static bool
28969 ix86_is_empty_record (const_tree type)
28971 if (!TARGET_64BIT)
28972 return false;
28973 return default_is_empty_record (type);
28976 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
28978 static void
28979 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
28981 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
28983 if (!cum->warn_empty)
28984 return;
28986 if (!TYPE_EMPTY_P (type))
28987 return;
28989 const_tree ctx = get_ultimate_context (cum->decl);
28990 if (ctx != NULL_TREE
28991 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
28992 return;
28994 /* If the actual size of the type is zero, then there is no change
28995 in how objects of this size are passed. */
28996 if (int_size_in_bytes (type) == 0)
28997 return;
28999 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
29000 "changes in -fabi-version=12 (GCC 8)", type);
29002 /* Only warn once. */
29003 cum->warn_empty = false;
29006 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
29007 the data type, and ALIGN is the alignment that the object would
29008 ordinarily have. */
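/* For example, a double, which would ordinarily be aligned to 64 bits, is
   given only 32-bit alignment under the IAMCU psABI. */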
29010 static int
29011 iamcu_alignment (tree type, int align)
29013 machine_mode mode;
29015 if (align < 32 || TYPE_USER_ALIGN (type))
29016 return align;
29018 /* The Intel MCU psABI specifies that scalar types larger than 4 bytes are
29019 aligned to 4 bytes. */
29020 mode = TYPE_MODE (strip_array_types (type));
29021 switch (GET_MODE_CLASS (mode))
29023 case MODE_INT:
29024 case MODE_COMPLEX_INT:
29025 case MODE_COMPLEX_FLOAT:
29026 case MODE_FLOAT:
29027 case MODE_DECIMAL_FLOAT:
29028 return 32;
29029 default:
29030 return align;
29034 /* Compute the alignment for a static variable.
29035 TYPE is the data type, and ALIGN is the alignment that
29036 the object would ordinarily have. The value of this function is used
29037 instead of that alignment to align the object. */
29040 ix86_data_alignment (tree type, int align, bool opt)
29042 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
29043 for symbols from other compilation units or symbols that don't need
29044 to bind locally. In order to preserve some ABI compatibility with
29045 those compilers, ensure we don't decrease alignment from what we
29046 used to assume. */
29048 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
29050 /* A data structure equal to or greater than the size of a cache line
29051 (64 bytes in the Pentium 4 and other recent Intel processors, including
29052 processors based on the Intel Core microarchitecture) should be aligned
29053 so that its base address is a multiple of the cache line size. */
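/* With a 64-byte prefetch block this limit is 512 bits, so aggregates of at
   least a cache line in size get cache-line alignment when optimizing. */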
29055 int max_align
29056 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
29058 if (max_align < BITS_PER_WORD)
29059 max_align = BITS_PER_WORD;
29061 switch (ix86_align_data_type)
29063 case ix86_align_data_type_abi: opt = false; break;
29064 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
29065 case ix86_align_data_type_cacheline: break;
29068 if (TARGET_IAMCU)
29069 align = iamcu_alignment (type, align);
29071 if (opt
29072 && AGGREGATE_TYPE_P (type)
29073 && TYPE_SIZE (type)
29074 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
29076 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
29077 && align < max_align_compat)
29078 align = max_align_compat;
29079 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
29080 && align < max_align)
29081 align = max_align;
29084 /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
29085 to a 16-byte boundary. */
29086 if (TARGET_64BIT)
29088 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
29089 && TYPE_SIZE (type)
29090 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29091 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29092 && align < 128)
29093 return 128;
29096 if (!opt)
29097 return align;
29099 if (TREE_CODE (type) == ARRAY_TYPE)
29101 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29102 return 64;
29103 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29104 return 128;
29106 else if (TREE_CODE (type) == COMPLEX_TYPE)
29109 if (TYPE_MODE (type) == DCmode && align < 64)
29110 return 64;
29111 if ((TYPE_MODE (type) == XCmode
29112 || TYPE_MODE (type) == TCmode) && align < 128)
29113 return 128;
29115 else if ((TREE_CODE (type) == RECORD_TYPE
29116 || TREE_CODE (type) == UNION_TYPE
29117 || TREE_CODE (type) == QUAL_UNION_TYPE)
29118 && TYPE_FIELDS (type))
29120 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29121 return 64;
29122 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29123 return 128;
29125 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29126 || TREE_CODE (type) == INTEGER_TYPE)
29128 if (TYPE_MODE (type) == DFmode && align < 64)
29129 return 64;
29130 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29131 return 128;
29134 return align;
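/* For example (illustrative only), a static object such as

     static int big[64];     /* 256 bytes */

   is raised by the function above to at least 16-byte alignment on
   x86-64 (it is an array of 16 bytes or more) and, when OPT is true,
   further up to the cache-line-derived MAX_ALIGN, so that it starts on
   a cache-line boundary and aligned SSE accesses can be used on it.  */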
29137 /* Compute the alignment for a local variable or a stack slot. EXP is
29138 the data type or decl itself, MODE is the widest mode available and
29139 ALIGN is the alignment that the object would ordinarily have. The
29140 value of this macro is used instead of that alignment to align the
29141 object. */
29143 unsigned int
29144 ix86_local_alignment (tree exp, machine_mode mode,
29145 unsigned int align)
29147 tree type, decl;
29149 if (exp && DECL_P (exp))
29151 type = TREE_TYPE (exp);
29152 decl = exp;
29154 else
29156 type = exp;
29157 decl = NULL;
29160 /* Don't do dynamic stack realignment for long long objects with
29161 -mpreferred-stack-boundary=2. */
29162 if (!TARGET_64BIT
29163 && align == 64
29164 && ix86_preferred_stack_boundary < 64
29165 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
29166 && (!type || !TYPE_USER_ALIGN (type))
29167 && (!decl || !DECL_USER_ALIGN (decl)))
29168 align = 32;
29170 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
29171 register in MODE. Return the larger of the XF and DF
29172 alignments. */
29173 if (!type)
29175 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
29176 align = GET_MODE_ALIGNMENT (DFmode);
29177 return align;
29180 /* Don't increase alignment for Intel MCU psABI. */
29181 if (TARGET_IAMCU)
29182 return align;
29184 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
29185 to a 16-byte boundary. The exact wording is:
29187 An array uses the same alignment as its elements, except that a local or
29188 global array variable of length at least 16 bytes or
29189 a C99 variable-length array variable always has alignment of at least 16 bytes.
29191 This was added to allow use of aligned SSE instructions on arrays. The
29192 rule is meant for static storage (where the compiler cannot do the analysis
29193 by itself). We follow it for automatic variables only when convenient:
29194 we fully control everything in the function being compiled, and functions
29195 from other units cannot rely on the alignment.
29197 Exclude the va_list type. It is the common case of a local array where
29198 we cannot benefit from the alignment.
29200 TODO: Probably one should optimize for size only when the variable does not escape. */
29201 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
29202 && TARGET_SSE)
29204 if (AGGREGATE_TYPE_P (type)
29205 && (va_list_type_node == NULL_TREE
29206 || (TYPE_MAIN_VARIANT (type)
29207 != TYPE_MAIN_VARIANT (va_list_type_node)))
29208 && TYPE_SIZE (type)
29209 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29210 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29211 && align < 128)
29212 return 128;
29214 if (TREE_CODE (type) == ARRAY_TYPE)
29216 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29217 return 64;
29218 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29219 return 128;
29221 else if (TREE_CODE (type) == COMPLEX_TYPE)
29223 if (TYPE_MODE (type) == DCmode && align < 64)
29224 return 64;
29225 if ((TYPE_MODE (type) == XCmode
29226 || TYPE_MODE (type) == TCmode) && align < 128)
29227 return 128;
29229 else if ((TREE_CODE (type) == RECORD_TYPE
29230 || TREE_CODE (type) == UNION_TYPE
29231 || TREE_CODE (type) == QUAL_UNION_TYPE)
29232 && TYPE_FIELDS (type))
29234 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29235 return 64;
29236 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29237 return 128;
29239 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29240 || TREE_CODE (type) == INTEGER_TYPE)
29243 if (TYPE_MODE (type) == DFmode && align < 64)
29244 return 64;
29245 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29246 return 128;
29248 return align;
29251 /* Compute the minimum required alignment for dynamic stack realignment
29252 purposes for a local variable, parameter or a stack slot. EXP is
29253 the data type or decl itself, MODE is its mode and ALIGN is the
29254 alignment that the object would ordinarily have. */
29256 unsigned int
29257 ix86_minimum_alignment (tree exp, machine_mode mode,
29258 unsigned int align)
29260 tree type, decl;
29262 if (exp && DECL_P (exp))
29264 type = TREE_TYPE (exp);
29265 decl = exp;
29267 else
29269 type = exp;
29270 decl = NULL;
29273 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
29274 return align;
29276 /* Don't do dynamic stack realignment for long long objects with
29277 -mpreferred-stack-boundary=2. */
29278 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
29279 && (!type || !TYPE_USER_ALIGN (type))
29280 && (!decl || !DECL_USER_ALIGN (decl)))
29282 gcc_checking_assert (!TARGET_STV);
29283 return 32;
29286 return align;
29289 /* Find a location for the static chain incoming to a nested function.
29290 This is a register, unless all free registers are used by arguments. */
29292 static rtx
29293 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
29295 unsigned regno;
29297 if (TARGET_64BIT)
29299 /* We always use R10 in 64-bit mode. */
29300 regno = R10_REG;
29302 else
29304 const_tree fntype, fndecl;
29305 unsigned int ccvt;
29307 /* By default in 32-bit mode we use ECX to pass the static chain. */
29308 regno = CX_REG;
29310 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
29312 fntype = TREE_TYPE (fndecl_or_type);
29313 fndecl = fndecl_or_type;
29315 else
29317 fntype = fndecl_or_type;
29318 fndecl = NULL;
29321 ccvt = ix86_get_callcvt (fntype);
29322 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
29324 /* Fastcall functions use ecx/edx for arguments, which leaves
29325 us with EAX for the static chain.
29326 Thiscall functions use ecx for arguments, which also
29327 leaves us with EAX for the static chain. */
29328 regno = AX_REG;
29330 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
29332 /* Thiscall functions use ecx for arguments, which leaves
29333 us with EAX and EDX for the static chain.
29334 For ABI compatibility we use EAX. */
29335 regno = AX_REG;
29337 else if (ix86_function_regparm (fntype, fndecl) == 3)
29339 /* For regparm 3, we have no free call-clobbered registers in
29340 which to store the static chain. In order to implement this,
29341 we have the trampoline push the static chain to the stack.
29342 However, we can't push a value below the return address when
29343 we call the nested function directly, so we have to use an
29344 alternate entry point. For this we use ESI, and have the
29345 alternate entry point push ESI, so that things appear the
29346 same once we're executing the nested function. */
29347 if (incoming_p)
29349 if (fndecl == current_function_decl
29350 && !ix86_static_chain_on_stack)
29352 gcc_assert (!reload_completed);
29353 ix86_static_chain_on_stack = true;
29355 return gen_frame_mem (SImode,
29356 plus_constant (Pmode,
29357 arg_pointer_rtx, -8));
29359 regno = SI_REG;
29363 return gen_rtx_REG (Pmode, regno);
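/* For example (illustrative only), a GNU C nested function receives its
   enclosing frame through the register chosen above:

     int outer (int x)
     {
       int inner (int y) { return x + y; }   /* reads X via the static chain */
       return inner (1);
     }

   In 64-bit mode the chain arrives in R10; in 32-bit mode it is normally
   ECX, with EAX used for fastcall/thiscall and, for regparm(3) functions,
   a stack slot (with ESI and an alternate entry point used for direct
   calls), as described above.  */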
29366 /* Emit RTL insns to initialize the variable parts of a trampoline.
29367 FNDECL is the decl of the target address; M_TRAMP is a MEM for
29368 the trampoline, and CHAIN_VALUE is an RTX for the static chain
29369 to be passed to the target function. */
29371 static void
29372 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
29374 rtx mem, fnaddr;
29375 int opcode;
29376 int offset = 0;
29378 fnaddr = XEXP (DECL_RTL (fndecl), 0);
29380 if (TARGET_64BIT)
29382 int size;
29384 /* Load the function address into r11. Try to load the address
29385 using the shorter movl instead of movabs. We may want to support
29386 movq for kernel mode, but the kernel does not use trampolines at
29387 the moment. FNADDR is a 32-bit address and may not be in
29388 DImode when ptr_mode == SImode. Always use movl in that
29389 case. */
29390 if (ptr_mode == SImode
29391 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
29393 fnaddr = copy_addr_to_reg (fnaddr);
29395 mem = adjust_address (m_tramp, HImode, offset);
29396 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
29398 mem = adjust_address (m_tramp, SImode, offset + 2);
29399 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
29400 offset += 6;
29402 else
29404 mem = adjust_address (m_tramp, HImode, offset);
29405 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
29407 mem = adjust_address (m_tramp, DImode, offset + 2);
29408 emit_move_insn (mem, fnaddr);
29409 offset += 10;
29412 /* Load static chain using movabs to r10. Use the shorter movl
29413 instead of movabs when ptr_mode == SImode. */
29414 if (ptr_mode == SImode)
29416 opcode = 0xba41;
29417 size = 6;
29419 else
29421 opcode = 0xba49;
29422 size = 10;
29425 mem = adjust_address (m_tramp, HImode, offset);
29426 emit_move_insn (mem, gen_int_mode (opcode, HImode));
29428 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
29429 emit_move_insn (mem, chain_value);
29430 offset += size;
29432 /* Jump to r11; the last (unused) byte is a nop, only there to
29433 pad the write out to a single 32-bit store. */
29434 mem = adjust_address (m_tramp, SImode, offset);
29435 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
29436 offset += 4;
29438 else
29440 rtx disp, chain;
29442 /* Depending on the static chain location, either load a register
29443 with a constant, or push the constant to the stack. All of the
29444 instructions are the same size. */
29445 chain = ix86_static_chain (fndecl, true);
29446 if (REG_P (chain))
29448 switch (REGNO (chain))
29450 case AX_REG:
29451 opcode = 0xb8; break;
29452 case CX_REG:
29453 opcode = 0xb9; break;
29454 default:
29455 gcc_unreachable ();
29458 else
29459 opcode = 0x68;
29461 mem = adjust_address (m_tramp, QImode, offset);
29462 emit_move_insn (mem, gen_int_mode (opcode, QImode));
29464 mem = adjust_address (m_tramp, SImode, offset + 1);
29465 emit_move_insn (mem, chain_value);
29466 offset += 5;
29468 mem = adjust_address (m_tramp, QImode, offset);
29469 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
29471 mem = adjust_address (m_tramp, SImode, offset + 1);
29473 /* Compute the offset from the end of the jmp to the target function.
29474 When the trampoline stores the static chain on the stack, we need
29475 to skip the first insn, which pushes the (call-saved) static-chain
29476 register; this push is 1 byte. */
29477 offset += 5;
29478 disp = expand_binop (SImode, sub_optab, fnaddr,
29479 plus_constant (Pmode, XEXP (m_tramp, 0),
29480 offset - (MEM_P (chain) ? 1 : 0)),
29481 NULL_RTX, 1, OPTAB_DIRECT);
29482 emit_move_insn (mem, disp);
29485 gcc_assert (offset <= TRAMPOLINE_SIZE);
29487 #ifdef HAVE_ENABLE_EXECUTE_STACK
29488 #ifdef CHECK_EXECUTE_STACK_ENABLED
29489 if (CHECK_EXECUTE_STACK_ENABLED)
29490 #endif
29491 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
29492 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
29493 #endif
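/* For example (illustrative only), with 64-bit addresses the bytes
   written above decode to roughly:

     49 bb <imm64>    movabs $<fnaddr>, %r11
     49 ba <imm64>    movabs $<chain>,  %r10
     49 ff e3         jmp    *%r11
     90               nop            ; pads the final 32-bit store

   with the shorter 41 bb / 41 ba movl encodings used instead when the
   respective immediates fit in 32 bits.  */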
29496 static bool
29497 ix86_allocate_stack_slots_for_args (void)
29499 /* Naked functions should not allocate stack slots for arguments. */
29500 return !ix86_function_naked (current_function_decl);
29503 static bool
29504 ix86_warn_func_return (tree decl)
29506 /* Naked functions are implemented entirely in assembly, including the
29507 return sequence, so suppress warnings about this. */
29508 return !ix86_function_naked (decl);
29511 /* The following file contains several enumerations and data structures
29512 built from the definitions in i386-builtin-types.def. */
29514 #include "i386-builtin-types.inc"
29516 /* Table for the ix86 builtin non-function types. */
29517 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
29519 /* Retrieve an element from the above table, building some of
29520 the types lazily. */
29522 static tree
29523 ix86_get_builtin_type (enum ix86_builtin_type tcode)
29525 unsigned int index;
29526 tree type, itype;
29528 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
29530 type = ix86_builtin_type_tab[(int) tcode];
29531 if (type != NULL)
29532 return type;
29534 gcc_assert (tcode > IX86_BT_LAST_PRIM);
29535 if (tcode <= IX86_BT_LAST_VECT)
29537 machine_mode mode;
29539 index = tcode - IX86_BT_LAST_PRIM - 1;
29540 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
29541 mode = ix86_builtin_type_vect_mode[index];
29543 type = build_vector_type_for_mode (itype, mode);
29545 else
29547 int quals;
29549 index = tcode - IX86_BT_LAST_VECT - 1;
29550 if (tcode <= IX86_BT_LAST_PTR)
29551 quals = TYPE_UNQUALIFIED;
29552 else
29553 quals = TYPE_QUAL_CONST;
29555 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
29556 if (quals != TYPE_UNQUALIFIED)
29557 itype = build_qualified_type (itype, quals);
29559 type = build_pointer_type (itype);
29562 ix86_builtin_type_tab[(int) tcode] = type;
29563 return type;
29566 /* Table for the ix86 builtin function types. */
29567 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
29569 /* Retrieve an element from the above table, building some of
29570 the types lazily. */
29572 static tree
29573 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
29575 tree type;
29577 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
29579 type = ix86_builtin_func_type_tab[(int) tcode];
29580 if (type != NULL)
29581 return type;
29583 if (tcode <= IX86_BT_LAST_FUNC)
29585 unsigned start = ix86_builtin_func_start[(int) tcode];
29586 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
29587 tree rtype, atype, args = void_list_node;
29588 unsigned i;
29590 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
29591 for (i = after - 1; i > start; --i)
29593 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
29594 args = tree_cons (NULL, atype, args);
29597 type = build_function_type (rtype, args);
29599 else
29601 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
29602 enum ix86_builtin_func_type icode;
29604 icode = ix86_builtin_func_alias_base[index];
29605 type = ix86_get_builtin_func_type (icode);
29608 ix86_builtin_func_type_tab[(int) tcode] = type;
29609 return type;
29613 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
29614 bdesc_* arrays below should come first, then builtins for each bdesc_*
29615 array in ascending order, so that we can use direct array accesses. */
29616 enum ix86_builtins
29618 IX86_BUILTIN_MASKMOVQ,
29619 IX86_BUILTIN_LDMXCSR,
29620 IX86_BUILTIN_STMXCSR,
29621 IX86_BUILTIN_MASKMOVDQU,
29622 IX86_BUILTIN_PSLLDQ128,
29623 IX86_BUILTIN_CLFLUSH,
29624 IX86_BUILTIN_MONITOR,
29625 IX86_BUILTIN_MWAIT,
29626 IX86_BUILTIN_CLZERO,
29627 IX86_BUILTIN_VEC_INIT_V2SI,
29628 IX86_BUILTIN_VEC_INIT_V4HI,
29629 IX86_BUILTIN_VEC_INIT_V8QI,
29630 IX86_BUILTIN_VEC_EXT_V2DF,
29631 IX86_BUILTIN_VEC_EXT_V2DI,
29632 IX86_BUILTIN_VEC_EXT_V4SF,
29633 IX86_BUILTIN_VEC_EXT_V4SI,
29634 IX86_BUILTIN_VEC_EXT_V8HI,
29635 IX86_BUILTIN_VEC_EXT_V2SI,
29636 IX86_BUILTIN_VEC_EXT_V4HI,
29637 IX86_BUILTIN_VEC_EXT_V16QI,
29638 IX86_BUILTIN_VEC_SET_V2DI,
29639 IX86_BUILTIN_VEC_SET_V4SF,
29640 IX86_BUILTIN_VEC_SET_V4SI,
29641 IX86_BUILTIN_VEC_SET_V8HI,
29642 IX86_BUILTIN_VEC_SET_V4HI,
29643 IX86_BUILTIN_VEC_SET_V16QI,
29644 IX86_BUILTIN_GATHERSIV2DF,
29645 IX86_BUILTIN_GATHERSIV4DF,
29646 IX86_BUILTIN_GATHERDIV2DF,
29647 IX86_BUILTIN_GATHERDIV4DF,
29648 IX86_BUILTIN_GATHERSIV4SF,
29649 IX86_BUILTIN_GATHERSIV8SF,
29650 IX86_BUILTIN_GATHERDIV4SF,
29651 IX86_BUILTIN_GATHERDIV8SF,
29652 IX86_BUILTIN_GATHERSIV2DI,
29653 IX86_BUILTIN_GATHERSIV4DI,
29654 IX86_BUILTIN_GATHERDIV2DI,
29655 IX86_BUILTIN_GATHERDIV4DI,
29656 IX86_BUILTIN_GATHERSIV4SI,
29657 IX86_BUILTIN_GATHERSIV8SI,
29658 IX86_BUILTIN_GATHERDIV4SI,
29659 IX86_BUILTIN_GATHERDIV8SI,
29660 IX86_BUILTIN_VFMSUBSD3_MASK3,
29661 IX86_BUILTIN_VFMSUBSS3_MASK3,
29662 IX86_BUILTIN_GATHER3SIV8SF,
29663 IX86_BUILTIN_GATHER3SIV4SF,
29664 IX86_BUILTIN_GATHER3SIV4DF,
29665 IX86_BUILTIN_GATHER3SIV2DF,
29666 IX86_BUILTIN_GATHER3DIV8SF,
29667 IX86_BUILTIN_GATHER3DIV4SF,
29668 IX86_BUILTIN_GATHER3DIV4DF,
29669 IX86_BUILTIN_GATHER3DIV2DF,
29670 IX86_BUILTIN_GATHER3SIV8SI,
29671 IX86_BUILTIN_GATHER3SIV4SI,
29672 IX86_BUILTIN_GATHER3SIV4DI,
29673 IX86_BUILTIN_GATHER3SIV2DI,
29674 IX86_BUILTIN_GATHER3DIV8SI,
29675 IX86_BUILTIN_GATHER3DIV4SI,
29676 IX86_BUILTIN_GATHER3DIV4DI,
29677 IX86_BUILTIN_GATHER3DIV2DI,
29678 IX86_BUILTIN_SCATTERSIV8SF,
29679 IX86_BUILTIN_SCATTERSIV4SF,
29680 IX86_BUILTIN_SCATTERSIV4DF,
29681 IX86_BUILTIN_SCATTERSIV2DF,
29682 IX86_BUILTIN_SCATTERDIV8SF,
29683 IX86_BUILTIN_SCATTERDIV4SF,
29684 IX86_BUILTIN_SCATTERDIV4DF,
29685 IX86_BUILTIN_SCATTERDIV2DF,
29686 IX86_BUILTIN_SCATTERSIV8SI,
29687 IX86_BUILTIN_SCATTERSIV4SI,
29688 IX86_BUILTIN_SCATTERSIV4DI,
29689 IX86_BUILTIN_SCATTERSIV2DI,
29690 IX86_BUILTIN_SCATTERDIV8SI,
29691 IX86_BUILTIN_SCATTERDIV4SI,
29692 IX86_BUILTIN_SCATTERDIV4DI,
29693 IX86_BUILTIN_SCATTERDIV2DI,
29694 /* Alternate 4 and 8 element gather/scatter for the vectorizer
29695 where all operands are 32-byte or 64-byte wide respectively. */
29696 IX86_BUILTIN_GATHERALTSIV4DF,
29697 IX86_BUILTIN_GATHERALTDIV8SF,
29698 IX86_BUILTIN_GATHERALTSIV4DI,
29699 IX86_BUILTIN_GATHERALTDIV8SI,
29700 IX86_BUILTIN_GATHER3ALTDIV16SF,
29701 IX86_BUILTIN_GATHER3ALTDIV16SI,
29702 IX86_BUILTIN_GATHER3ALTSIV4DF,
29703 IX86_BUILTIN_GATHER3ALTDIV8SF,
29704 IX86_BUILTIN_GATHER3ALTSIV4DI,
29705 IX86_BUILTIN_GATHER3ALTDIV8SI,
29706 IX86_BUILTIN_GATHER3ALTSIV8DF,
29707 IX86_BUILTIN_GATHER3ALTSIV8DI,
29708 IX86_BUILTIN_GATHER3DIV16SF,
29709 IX86_BUILTIN_GATHER3DIV16SI,
29710 IX86_BUILTIN_GATHER3DIV8DF,
29711 IX86_BUILTIN_GATHER3DIV8DI,
29712 IX86_BUILTIN_GATHER3SIV16SF,
29713 IX86_BUILTIN_GATHER3SIV16SI,
29714 IX86_BUILTIN_GATHER3SIV8DF,
29715 IX86_BUILTIN_GATHER3SIV8DI,
29716 IX86_BUILTIN_SCATTERALTSIV8DF,
29717 IX86_BUILTIN_SCATTERALTDIV16SF,
29718 IX86_BUILTIN_SCATTERALTSIV8DI,
29719 IX86_BUILTIN_SCATTERALTDIV16SI,
29720 IX86_BUILTIN_SCATTERDIV16SF,
29721 IX86_BUILTIN_SCATTERDIV16SI,
29722 IX86_BUILTIN_SCATTERDIV8DF,
29723 IX86_BUILTIN_SCATTERDIV8DI,
29724 IX86_BUILTIN_SCATTERSIV16SF,
29725 IX86_BUILTIN_SCATTERSIV16SI,
29726 IX86_BUILTIN_SCATTERSIV8DF,
29727 IX86_BUILTIN_SCATTERSIV8DI,
29728 IX86_BUILTIN_GATHERPFQPD,
29729 IX86_BUILTIN_GATHERPFDPS,
29730 IX86_BUILTIN_GATHERPFDPD,
29731 IX86_BUILTIN_GATHERPFQPS,
29732 IX86_BUILTIN_SCATTERPFDPD,
29733 IX86_BUILTIN_SCATTERPFDPS,
29734 IX86_BUILTIN_SCATTERPFQPD,
29735 IX86_BUILTIN_SCATTERPFQPS,
29736 IX86_BUILTIN_CLWB,
29737 IX86_BUILTIN_CLFLUSHOPT,
29738 IX86_BUILTIN_INFQ,
29739 IX86_BUILTIN_HUGE_VALQ,
29740 IX86_BUILTIN_NANQ,
29741 IX86_BUILTIN_NANSQ,
29742 IX86_BUILTIN_XABORT,
29743 IX86_BUILTIN_ADDCARRYX32,
29744 IX86_BUILTIN_ADDCARRYX64,
29745 IX86_BUILTIN_SBB32,
29746 IX86_BUILTIN_SBB64,
29747 IX86_BUILTIN_RDRAND16_STEP,
29748 IX86_BUILTIN_RDRAND32_STEP,
29749 IX86_BUILTIN_RDRAND64_STEP,
29750 IX86_BUILTIN_RDSEED16_STEP,
29751 IX86_BUILTIN_RDSEED32_STEP,
29752 IX86_BUILTIN_RDSEED64_STEP,
29753 IX86_BUILTIN_MONITORX,
29754 IX86_BUILTIN_MWAITX,
29755 IX86_BUILTIN_CFSTRING,
29756 IX86_BUILTIN_CPU_INIT,
29757 IX86_BUILTIN_CPU_IS,
29758 IX86_BUILTIN_CPU_SUPPORTS,
29759 IX86_BUILTIN_READ_FLAGS,
29760 IX86_BUILTIN_WRITE_FLAGS,
29762 /* All the remaining builtins are tracked in bdesc_* arrays in
29763 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
29764 this point. */
29765 #define BDESC(mask, icode, name, code, comparison, flag) \
29766 code,
29767 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
29768 code, \
29769 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
29770 #define BDESC_END(kind, next_kind)
29772 #include "i386-builtin.def"
29774 #undef BDESC
29775 #undef BDESC_FIRST
29776 #undef BDESC_END
29778 IX86_BUILTIN_MAX,
29780 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
29782 /* Now just the aliases for bdesc_* start/end. */
29783 #define BDESC(mask, icode, name, code, comparison, flag)
29784 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
29785 #define BDESC_END(kind, next_kind) \
29786 IX86_BUILTIN__BDESC_##kind##_LAST \
29787 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
29789 #include "i386-builtin.def"
29791 #undef BDESC
29792 #undef BDESC_FIRST
29793 #undef BDESC_END
29795 /* Just to make sure there is no comma after the last enumerator. */
29796 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
29799 /* Table for the ix86 builtin decls. */
29800 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
29802 /* Table of all the builtin functions that are possible with different ISAs
29803 but are waiting to be built until a function is declared to use that
29804 ISA. */
29805 struct builtin_isa {
29806 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
29807 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
29808 const char *name; /* function name */
29809 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
29810 unsigned char const_p:1; /* true if the declaration is constant */
29811 unsigned char pure_p:1; /* true if the declaration has pure attribute */
29812 bool leaf_p; /* true if the declaration has leaf attribute */
29813 bool nothrow_p; /* true if the declaration has nothrow attribute */
29814 bool set_and_not_built_p;
29817 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
29819 /* Bits that can still enable any inclusion of a builtin. */
29820 static HOST_WIDE_INT deferred_isa_values = 0;
29821 static HOST_WIDE_INT deferred_isa_values2 = 0;
29823 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
29824 of which isa_flags to use in the ix86_builtins_isa array. Stores the
29825 function decl in the ix86_builtins array. Returns the function decl, or
29826 NULL_TREE if the builtin was not added.
29828 If the front end has a special hook for builtin functions, delay adding
29829 builtin functions that aren't in the current ISA until the ISA is changed
29830 with function-specific optimization. Doing so can save about 300K for the
29831 default compiler. When the builtin is expanded, check at that time whether
29832 it is valid.
29834 If the front end doesn't have a special hook, record all builtins, even
29835 those not in the current ISA, in case the user uses function-specific
29836 options for a different ISA, so that we don't get scope errors if a
29837 builtin is added in the middle of a function scope. */
29839 static inline tree
29840 def_builtin (HOST_WIDE_INT mask, const char *name,
29841 enum ix86_builtin_func_type tcode,
29842 enum ix86_builtins code)
29844 tree decl = NULL_TREE;
29846 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
29848 ix86_builtins_isa[(int) code].isa = mask;
29850 mask &= ~OPTION_MASK_ISA_64BIT;
29852 /* Filter out the masks most often ORed together with others. */
29853 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
29854 && mask != OPTION_MASK_ISA_AVX512VL)
29855 mask &= ~OPTION_MASK_ISA_AVX512VL;
29856 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
29857 && mask != OPTION_MASK_ISA_AVX512BW)
29858 mask &= ~OPTION_MASK_ISA_AVX512BW;
29860 if (mask == 0
29861 || (mask & ix86_isa_flags) != 0
29862 || (lang_hooks.builtin_function
29863 == lang_hooks.builtin_function_ext_scope))
29865 tree type = ix86_get_builtin_func_type (tcode);
29866 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29867 NULL, NULL_TREE);
29868 ix86_builtins[(int) code] = decl;
29869 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29871 else
29873 /* Just a MASK where set_and_not_built_p == true can potentially
29874 include a builtin. */
29875 deferred_isa_values |= mask;
29876 ix86_builtins[(int) code] = NULL_TREE;
29877 ix86_builtins_isa[(int) code].tcode = tcode;
29878 ix86_builtins_isa[(int) code].name = name;
29879 ix86_builtins_isa[(int) code].leaf_p = false;
29880 ix86_builtins_isa[(int) code].nothrow_p = false;
29881 ix86_builtins_isa[(int) code].const_p = false;
29882 ix86_builtins_isa[(int) code].pure_p = false;
29883 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29887 return decl;
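/* For example (illustrative only; the builtin name and code below are
   hypothetical), a call such as

     def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_example",
		  V4DI_FTYPE_V4DI_V4DI, IX86_BUILTIN_EXAMPLE);

   creates the decl immediately when AVX2 is enabled (or when the front
   end registers builtins in the external scope); otherwise only the
   name, type code and ISA mask are recorded in ix86_builtins_isa, and
   the decl is materialized later by ix86_add_new_builtins when a
   function selects AVX2 via a target attribute or pragma.  */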
29890 /* Like def_builtin, but also marks the function decl "const". */
29892 static inline tree
29893 def_builtin_const (HOST_WIDE_INT mask, const char *name,
29894 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29896 tree decl = def_builtin (mask, name, tcode, code);
29897 if (decl)
29898 TREE_READONLY (decl) = 1;
29899 else
29900 ix86_builtins_isa[(int) code].const_p = true;
29902 return decl;
29905 /* Like def_builtin, but also marks the function decl "pure". */
29907 static inline tree
29908 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
29909 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29911 tree decl = def_builtin (mask, name, tcode, code);
29912 if (decl)
29913 DECL_PURE_P (decl) = 1;
29914 else
29915 ix86_builtins_isa[(int) code].pure_p = true;
29917 return decl;
29920 /* Like def_builtin, but for additional isa2 flags. */
29922 static inline tree
29923 def_builtin2 (HOST_WIDE_INT mask, const char *name,
29924 enum ix86_builtin_func_type tcode,
29925 enum ix86_builtins code)
29927 tree decl = NULL_TREE;
29929 ix86_builtins_isa[(int) code].isa2 = mask;
29931 if (mask == 0
29932 || (mask & ix86_isa_flags2) != 0
29933 || (lang_hooks.builtin_function
29934 == lang_hooks.builtin_function_ext_scope))
29937 tree type = ix86_get_builtin_func_type (tcode);
29938 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29939 NULL, NULL_TREE);
29940 ix86_builtins[(int) code] = decl;
29941 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29943 else
29945 /* Just a MASK where set_and_not_built_p == true can potentially
29946 include a builtin. */
29947 deferred_isa_values2 |= mask;
29948 ix86_builtins[(int) code] = NULL_TREE;
29949 ix86_builtins_isa[(int) code].tcode = tcode;
29950 ix86_builtins_isa[(int) code].name = name;
29951 ix86_builtins_isa[(int) code].leaf_p = false;
29952 ix86_builtins_isa[(int) code].nothrow_p = false;
29953 ix86_builtins_isa[(int) code].const_p = false;
29954 ix86_builtins_isa[(int) code].pure_p = false;
29955 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29958 return decl;
29961 /* Like def_builtin, but also marks the function decl "const". */
29963 static inline tree
29964 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
29965 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29967 tree decl = def_builtin2 (mask, name, tcode, code);
29968 if (decl)
29969 TREE_READONLY (decl) = 1;
29970 else
29971 ix86_builtins_isa[(int) code].const_p = true;
29973 return decl;
29976 /* Like def_builtin, but also marks the function decl "pure". */
29978 static inline tree
29979 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
29980 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29982 tree decl = def_builtin2 (mask, name, tcode, code);
29983 if (decl)
29984 DECL_PURE_P (decl) = 1;
29985 else
29986 ix86_builtins_isa[(int) code].pure_p = true;
29988 return decl;
29991 /* Add any new builtin functions for a given ISA that may not have been
29992 declared. This saves a bit of space compared to adding all of the
29993 declarations to the tree, even if we didn't use them. */
29995 static void
29996 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
29998 isa &= ~OPTION_MASK_ISA_64BIT;
30000 if ((isa & deferred_isa_values) == 0
30001 && (isa2 & deferred_isa_values2) == 0)
30002 return;
30004 /* Bits in ISA value can be removed from potential isa values. */
30005 deferred_isa_values &= ~isa;
30006 deferred_isa_values2 &= ~isa2;
30008 int i;
30009 tree saved_current_target_pragma = current_target_pragma;
30010 current_target_pragma = NULL_TREE;
30012 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
30014 if (((ix86_builtins_isa[i].isa & isa) != 0
30015 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
30016 && ix86_builtins_isa[i].set_and_not_built_p)
30018 tree decl, type;
30020 /* Don't define the builtin again. */
30021 ix86_builtins_isa[i].set_and_not_built_p = false;
30023 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
30024 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
30025 type, i, BUILT_IN_MD, NULL,
30026 NULL_TREE);
30028 ix86_builtins[i] = decl;
30029 if (ix86_builtins_isa[i].const_p)
30030 TREE_READONLY (decl) = 1;
30031 if (ix86_builtins_isa[i].pure_p)
30032 DECL_PURE_P (decl) = 1;
30033 if (ix86_builtins_isa[i].leaf_p)
30034 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30035 NULL_TREE);
30036 if (ix86_builtins_isa[i].nothrow_p)
30037 TREE_NOTHROW (decl) = 1;
30041 current_target_pragma = saved_current_target_pragma;
30044 /* Bits for builtin_description.flag. */
30046 /* Set when we don't support the comparison natively, and should
30047 swap_comparison in order to support it. */
30048 #define BUILTIN_DESC_SWAP_OPERANDS 1
30050 struct builtin_description
30052 const HOST_WIDE_INT mask;
30053 const enum insn_code icode;
30054 const char *const name;
30055 const enum ix86_builtins code;
30056 const enum rtx_code comparison;
30057 const int flag;
30060 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30061 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30062 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30063 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30064 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30065 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30066 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30067 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30068 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30069 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30070 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30071 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30072 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30073 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30074 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30075 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30076 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30077 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30078 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30079 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30080 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30081 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30082 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30083 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30084 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30085 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30086 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30087 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30088 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30089 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30090 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30091 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30092 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30093 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30094 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30095 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30096 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
30097 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
30098 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
30099 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
30100 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
30101 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
30102 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
30103 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
30104 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
30105 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
30106 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
30107 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
30108 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
30109 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
30110 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
30111 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
30113 #define BDESC(mask, icode, name, code, comparison, flag) \
30114 { mask, icode, name, code, comparison, flag },
30115 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30116 static const struct builtin_description bdesc_##kind[] = \
30118 BDESC (mask, icode, name, code, comparison, flag)
30119 #define BDESC_END(kind, next_kind) \
30122 #include "i386-builtin.def"
30124 #undef BDESC
30125 #undef BDESC_FIRST
30126 #undef BDESC_END
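/* For example (illustrative only; the entry below is hypothetical), an
   i386-builtin.def line such as

     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_example, "__builtin_ia32_example",
	    IX86_BUILTIN_EXAMPLE, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI)

   expands here into the initializer

     { OPTION_MASK_ISA_SSE2, CODE_FOR_example, "__builtin_ia32_example",
       IX86_BUILTIN_EXAMPLE, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },

   of the bdesc_* array opened by the preceding BDESC_FIRST, while the
   earlier inclusion of the same file (inside enum ix86_builtins above)
   turned the same entry into an enumerator.  */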
30128 /* TM vector builtins. */
30130 /* Reuse the existing x86-specific `struct builtin_description' because
30131 we're lazy. Add casts to make them fit. */
30132 static const struct builtin_description bdesc_tm[] =
30134 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30135 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30136 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30137 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30138 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30139 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30140 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30142 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30143 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30144 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30145 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30146 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30147 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30148 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30150 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30151 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30152 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30153 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30154 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30155 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30156 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30158 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
30159 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
30160 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
30163 /* Initialize the transactional memory vector load/store builtins. */
30165 static void
30166 ix86_init_tm_builtins (void)
30168 enum ix86_builtin_func_type ftype;
30169 const struct builtin_description *d;
30170 size_t i;
30171 tree decl;
30172 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
30173 tree attrs_log, attrs_type_log;
30175 if (!flag_tm)
30176 return;
30178 /* If there are no builtins defined, we must be compiling in a
30179 language without trans-mem support. */
30180 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
30181 return;
30183 /* Use whatever attributes a normal TM load has. */
30184 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
30185 attrs_load = DECL_ATTRIBUTES (decl);
30186 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30187 /* Use whatever attributes a normal TM store has. */
30188 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
30189 attrs_store = DECL_ATTRIBUTES (decl);
30190 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30191 /* Use whatever attributes a normal TM log has. */
30192 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
30193 attrs_log = DECL_ATTRIBUTES (decl);
30194 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30196 for (i = 0, d = bdesc_tm;
30197 i < ARRAY_SIZE (bdesc_tm);
30198 i++, d++)
30200 if ((d->mask & ix86_isa_flags) != 0
30201 || (lang_hooks.builtin_function
30202 == lang_hooks.builtin_function_ext_scope))
30204 tree type, attrs, attrs_type;
30205 enum built_in_function code = (enum built_in_function) d->code;
30207 ftype = (enum ix86_builtin_func_type) d->flag;
30208 type = ix86_get_builtin_func_type (ftype);
30210 if (BUILTIN_TM_LOAD_P (code))
30212 attrs = attrs_load;
30213 attrs_type = attrs_type_load;
30215 else if (BUILTIN_TM_STORE_P (code))
30217 attrs = attrs_store;
30218 attrs_type = attrs_type_store;
30220 else
30222 attrs = attrs_log;
30223 attrs_type = attrs_type_log;
30225 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
30226 /* The builtin without the prefix for
30227 calling it directly. */
30228 d->name + strlen ("__builtin_"),
30229 attrs);
30230 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
30231 set the TYPE_ATTRIBUTES. */
30232 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
30234 set_builtin_decl (code, decl, false);
30239 /* Macros for verification of enum ix86_builtins order. */
30240 #define BDESC_VERIFY(x, y, z) \
30241 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
30242 #define BDESC_VERIFYS(x, y, z) \
30243 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
30245 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30246 IX86_BUILTIN__BDESC_COMI_LAST, 1);
30247 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30248 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
30249 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30250 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
30251 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
30252 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
30253 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30254 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
30255 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
30256 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
30257 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
30258 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
30259 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30260 IX86_BUILTIN__BDESC_MPX_LAST, 1);
30261 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30262 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
30263 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
30264 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
30265 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30266 IX86_BUILTIN__BDESC_CET_LAST, 1);
30267 BDESC_VERIFYS (IX86_BUILTIN_MAX,
30268 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
30270 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
30271 in the current target ISA, to allow the user to compile particular modules
30272 with target-specific options that differ from the command-line
30273 options. */
30274 static void
30275 ix86_init_mmx_sse_builtins (void)
30277 const struct builtin_description * d;
30278 enum ix86_builtin_func_type ftype;
30279 size_t i;
30281 /* Add all special builtins with variable number of operands. */
30282 for (i = 0, d = bdesc_special_args;
30283 i < ARRAY_SIZE (bdesc_special_args);
30284 i++, d++)
30286 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
30287 if (d->name == 0)
30288 continue;
30290 ftype = (enum ix86_builtin_func_type) d->flag;
30291 def_builtin (d->mask, d->name, ftype, d->code);
30293 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
30294 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30295 ARRAY_SIZE (bdesc_special_args) - 1);
30297 /* Add all builtins with variable number of operands. */
30298 for (i = 0, d = bdesc_args;
30299 i < ARRAY_SIZE (bdesc_args);
30300 i++, d++)
30302 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
30303 if (d->name == 0)
30304 continue;
30306 ftype = (enum ix86_builtin_func_type) d->flag;
30307 def_builtin_const (d->mask, d->name, ftype, d->code);
30309 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
30310 IX86_BUILTIN__BDESC_ARGS_FIRST,
30311 ARRAY_SIZE (bdesc_args) - 1);
30313 /* Add all builtins with variable number of operands. */
30314 for (i = 0, d = bdesc_args2;
30315 i < ARRAY_SIZE (bdesc_args2);
30316 i++, d++)
30318 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
30319 if (d->name == 0)
30320 continue;
30322 ftype = (enum ix86_builtin_func_type) d->flag;
30323 def_builtin_const2 (d->mask, d->name, ftype, d->code);
30325 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
30326 IX86_BUILTIN__BDESC_ARGS2_FIRST,
30327 ARRAY_SIZE (bdesc_args2) - 1);
30329 /* Add all builtins with rounding. */
30330 for (i = 0, d = bdesc_round_args;
30331 i < ARRAY_SIZE (bdesc_round_args);
30332 i++, d++)
30334 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
30335 if (d->name == 0)
30336 continue;
30338 ftype = (enum ix86_builtin_func_type) d->flag;
30339 def_builtin_const (d->mask, d->name, ftype, d->code);
30341 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
30342 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30343 ARRAY_SIZE (bdesc_round_args) - 1);
30345 /* pcmpestr[im] insns. */
30346 for (i = 0, d = bdesc_pcmpestr;
30347 i < ARRAY_SIZE (bdesc_pcmpestr);
30348 i++, d++)
30350 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
30351 if (d->code == IX86_BUILTIN_PCMPESTRM128)
30352 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
30353 else
30354 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
30355 def_builtin_const (d->mask, d->name, ftype, d->code);
30357 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
30358 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30359 ARRAY_SIZE (bdesc_pcmpestr) - 1);
30361 /* pcmpistr[im] insns. */
30362 for (i = 0, d = bdesc_pcmpistr;
30363 i < ARRAY_SIZE (bdesc_pcmpistr);
30364 i++, d++)
30366 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
30367 if (d->code == IX86_BUILTIN_PCMPISTRM128)
30368 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
30369 else
30370 ftype = INT_FTYPE_V16QI_V16QI_INT;
30371 def_builtin_const (d->mask, d->name, ftype, d->code);
30373 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
30374 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30375 ARRAY_SIZE (bdesc_pcmpistr) - 1);
30377 /* comi/ucomi insns. */
30378 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30380 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
30381 if (d->mask == OPTION_MASK_ISA_SSE2)
30382 ftype = INT_FTYPE_V2DF_V2DF;
30383 else
30384 ftype = INT_FTYPE_V4SF_V4SF;
30385 def_builtin_const (d->mask, d->name, ftype, d->code);
30387 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
30388 IX86_BUILTIN__BDESC_COMI_FIRST,
30389 ARRAY_SIZE (bdesc_comi) - 1);
30391 /* SSE */
30392 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
30393 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
30394 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
30395 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
30397 /* SSE or 3DNow!A */
30398 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30399 /* As it uses V4HImode, we have to require -mmmx too. */
30400 | OPTION_MASK_ISA_MMX,
30401 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
30402 IX86_BUILTIN_MASKMOVQ);
30404 /* SSE2 */
30405 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
30406 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
30408 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
30409 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
30410 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
30411 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
30413 /* SSE3. */
30414 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
30415 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
30416 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
30417 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
30419 /* AES */
30420 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
30421 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
30422 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
30423 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
30424 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
30425 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
30426 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
30427 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
30428 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
30429 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
30430 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
30431 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
30433 /* PCLMUL */
30434 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
30435 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
30437 /* RDRND */
30438 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
30439 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
30440 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
30441 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
30442 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
30443 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
30444 IX86_BUILTIN_RDRAND64_STEP);
30446 /* AVX2 */
30447 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
30448 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
30449 IX86_BUILTIN_GATHERSIV2DF);
30451 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
30452 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
30453 IX86_BUILTIN_GATHERSIV4DF);
30455 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
30456 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
30457 IX86_BUILTIN_GATHERDIV2DF);
30459 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
30460 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
30461 IX86_BUILTIN_GATHERDIV4DF);
30463 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
30464 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
30465 IX86_BUILTIN_GATHERSIV4SF);
30467 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
30468 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
30469 IX86_BUILTIN_GATHERSIV8SF);
30471 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
30472 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
30473 IX86_BUILTIN_GATHERDIV4SF);
30475 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
30476 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
30477 IX86_BUILTIN_GATHERDIV8SF);
30479 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
30480 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
30481 IX86_BUILTIN_GATHERSIV2DI);
30483 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
30484 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
30485 IX86_BUILTIN_GATHERSIV4DI);
30487 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
30488 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
30489 IX86_BUILTIN_GATHERDIV2DI);
30491 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
30492 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
30493 IX86_BUILTIN_GATHERDIV4DI);
30495 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
30496 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
30497 IX86_BUILTIN_GATHERSIV4SI);
30499 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
30500 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
30501 IX86_BUILTIN_GATHERSIV8SI);
30503 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
30504 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
30505 IX86_BUILTIN_GATHERDIV4SI);
30507 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
30508 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
30509 IX86_BUILTIN_GATHERDIV8SI);
30511 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
30512 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
30513 IX86_BUILTIN_GATHERALTSIV4DF);
30515 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
30516 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
30517 IX86_BUILTIN_GATHERALTDIV8SF);
30519 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
30520 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
30521 IX86_BUILTIN_GATHERALTSIV4DI);
30523 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
30524 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
30525 IX86_BUILTIN_GATHERALTDIV8SI);
30527 /* AVX512F */
30528 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
30529 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
30530 IX86_BUILTIN_GATHER3SIV16SF);
30532 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
30533 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
30534 IX86_BUILTIN_GATHER3SIV8DF);
30536 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
30537 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
30538 IX86_BUILTIN_GATHER3DIV16SF);
30540 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
30541 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
30542 IX86_BUILTIN_GATHER3DIV8DF);
30544 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
30545 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
30546 IX86_BUILTIN_GATHER3SIV16SI);
30548 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
30549 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
30550 IX86_BUILTIN_GATHER3SIV8DI);
30552 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
30553 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
30554 IX86_BUILTIN_GATHER3DIV16SI);
30556 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
30557 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
30558 IX86_BUILTIN_GATHER3DIV8DI);
30560 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
30561 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
30562 IX86_BUILTIN_GATHER3ALTSIV8DF);
30564 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
30565 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
30566 IX86_BUILTIN_GATHER3ALTDIV16SF);
30568 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
30569 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
30570 IX86_BUILTIN_GATHER3ALTSIV8DI);
30572 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
30573 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
30574 IX86_BUILTIN_GATHER3ALTDIV16SI);
30576 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
30577 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
30578 IX86_BUILTIN_SCATTERSIV16SF);
30580 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
30581 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
30582 IX86_BUILTIN_SCATTERSIV8DF);
30584 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
30585 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
30586 IX86_BUILTIN_SCATTERDIV16SF);
30588 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
30589 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
30590 IX86_BUILTIN_SCATTERDIV8DF);
30592 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
30593 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
30594 IX86_BUILTIN_SCATTERSIV16SI);
30596 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
30597 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
30598 IX86_BUILTIN_SCATTERSIV8DI);
30600 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
30601 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
30602 IX86_BUILTIN_SCATTERDIV16SI);
30604 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
30605 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
30606 IX86_BUILTIN_SCATTERDIV8DI);
30608 /* AVX512VL */
30609 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
30610 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
30611 IX86_BUILTIN_GATHER3SIV2DF);
30613 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
30614 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
30615 IX86_BUILTIN_GATHER3SIV4DF);
30617 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
30618 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
30619 IX86_BUILTIN_GATHER3DIV2DF);
30621 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
30622 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
30623 IX86_BUILTIN_GATHER3DIV4DF);
30625 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
30626 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
30627 IX86_BUILTIN_GATHER3SIV4SF);
30629 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
30630 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
30631 IX86_BUILTIN_GATHER3SIV8SF);
30633 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
30634 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
30635 IX86_BUILTIN_GATHER3DIV4SF);
30637 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
30638 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
30639 IX86_BUILTIN_GATHER3DIV8SF);
30641 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
30642 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
30643 IX86_BUILTIN_GATHER3SIV2DI);
30645 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
30646 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
30647 IX86_BUILTIN_GATHER3SIV4DI);
30649 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
30650 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
30651 IX86_BUILTIN_GATHER3DIV2DI);
30653 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
30654 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
30655 IX86_BUILTIN_GATHER3DIV4DI);
30657 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
30658 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
30659 IX86_BUILTIN_GATHER3SIV4SI);
30661 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
30662 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
30663 IX86_BUILTIN_GATHER3SIV8SI);
30665 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
30666 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
30667 IX86_BUILTIN_GATHER3DIV4SI);
30669 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
30670 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
30671 IX86_BUILTIN_GATHER3DIV8SI);
30673 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
30674 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
30675 IX86_BUILTIN_GATHER3ALTSIV4DF);
30677 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
30678 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
30679 IX86_BUILTIN_GATHER3ALTDIV8SF);
30681 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
30682 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
30683 IX86_BUILTIN_GATHER3ALTSIV4DI);
30685 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
30686 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
30687 IX86_BUILTIN_GATHER3ALTDIV8SI);
30689 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
30690 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
30691 IX86_BUILTIN_SCATTERSIV8SF);
30693 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
30694 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
30695 IX86_BUILTIN_SCATTERSIV4SF);
30697 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
30698 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
30699 IX86_BUILTIN_SCATTERSIV4DF);
30701 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
30702 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
30703 IX86_BUILTIN_SCATTERSIV2DF);
30705 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
30706 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
30707 IX86_BUILTIN_SCATTERDIV8SF);
30709 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
30710 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
30711 IX86_BUILTIN_SCATTERDIV4SF);
30713 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
30714 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
30715 IX86_BUILTIN_SCATTERDIV4DF);
30717 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
30718 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
30719 IX86_BUILTIN_SCATTERDIV2DF);
30721 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
30722 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
30723 IX86_BUILTIN_SCATTERSIV8SI);
30725 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
30726 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
30727 IX86_BUILTIN_SCATTERSIV4SI);
30729 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
30730 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
30731 IX86_BUILTIN_SCATTERSIV4DI);
30733 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
30734 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
30735 IX86_BUILTIN_SCATTERSIV2DI);
30737 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
30738 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
30739 IX86_BUILTIN_SCATTERDIV8SI);
30741 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
30742 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
30743 IX86_BUILTIN_SCATTERDIV4SI);
30745 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
30746 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
30747 IX86_BUILTIN_SCATTERDIV4DI);
30749 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
30750 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
30751 IX86_BUILTIN_SCATTERDIV2DI);
30752 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
30753 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
30754 IX86_BUILTIN_SCATTERALTSIV8DF);
30756 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
30757 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
30758 IX86_BUILTIN_SCATTERALTDIV16SF);
30760 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
30761 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
30762 IX86_BUILTIN_SCATTERALTSIV8DI);
30764 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
30765 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
30766 IX86_BUILTIN_SCATTERALTDIV16SI);
30768 /* AVX512PF */
30769 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
30770 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30771 IX86_BUILTIN_GATHERPFDPD);
30772 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
30773 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30774 IX86_BUILTIN_GATHERPFDPS);
30775 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
30776 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30777 IX86_BUILTIN_GATHERPFQPD);
30778 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
30779 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30780 IX86_BUILTIN_GATHERPFQPS);
30781 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
30782 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30783 IX86_BUILTIN_SCATTERPFDPD);
30784 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
30785 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30786 IX86_BUILTIN_SCATTERPFDPS);
30787 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
30788 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30789 IX86_BUILTIN_SCATTERPFQPD);
30790 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
30791 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30792 IX86_BUILTIN_SCATTERPFQPS);
30794 /* SHA */
30795 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
30796 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
30797 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
30798 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
30799 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
30800 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
30801 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
30802 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
30803 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
30804 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
30805 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
30806 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
30807 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
30808 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
30810 /* RTM. */
30811 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
30812 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
30814 /* MMX access to the vec_init patterns. */
30815 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
30816 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
30818 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
30819 V4HI_FTYPE_HI_HI_HI_HI,
30820 IX86_BUILTIN_VEC_INIT_V4HI);
30822 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
30823 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
30824 IX86_BUILTIN_VEC_INIT_V8QI);
30826 /* Access to the vec_extract patterns. */
30827 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
30828 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
30829 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
30830 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
30831 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
30832 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
30833 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
30834 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
30835 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
30836 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
30838 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30839 /* As it uses V4HImode, we have to require -mmmx too. */
30840 | OPTION_MASK_ISA_MMX,
30841 "__builtin_ia32_vec_ext_v4hi",
30842 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
30844 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
30845 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
30847 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
30848 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
30850 /* Access to the vec_set patterns. */
30851 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
30852 "__builtin_ia32_vec_set_v2di",
30853 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
30855 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
30856 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
30858 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
30859 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
30861 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
30862 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
30864 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30865 /* As it uses V4HImode, we have to require -mmmx too. */
30866 | OPTION_MASK_ISA_MMX,
30867 "__builtin_ia32_vec_set_v4hi",
30868 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
30870 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
30871 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
30873 /* RDSEED */
30874 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
30875 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
30876 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
30877 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
30878 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
30879 "__builtin_ia32_rdseed_di_step",
30880 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
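/* A minimal usage sketch for the RDSEED step builtins defined above
   (illustrative only, assuming compilation with -mrdseed; example_rdseed32
   is a hypothetical name): the builtin stores a hardware random value
   through its pointer argument and returns nonzero on success, which is
   how the _rdseed32_step wrapper in <immintrin.h> uses it.  */

static unsigned int
example_rdseed32 (void)
{
  unsigned int seed = 0;
  while (!__builtin_ia32_rdseed_si_step (&seed))
    ;		/* Retry until the hardware has entropy available.  */
  return seed;
}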
30882 /* ADCX */
30883 def_builtin (0, "__builtin_ia32_addcarryx_u32",
30884 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
30885 def_builtin (OPTION_MASK_ISA_64BIT,
30886 "__builtin_ia32_addcarryx_u64",
30887 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30888 IX86_BUILTIN_ADDCARRYX64);
30890 /* SBB */
30891 def_builtin (0, "__builtin_ia32_sbb_u32",
30892 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
30893 def_builtin (OPTION_MASK_ISA_64BIT,
30894 "__builtin_ia32_sbb_u64",
30895 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30896 IX86_BUILTIN_SBB64);
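/* A hedged sketch of how the add-with-carry builtins above compose into a
   wider addition (example_add64 is an illustrative name, not part of GCC):
   the builtin returns the carry-out and stores the 32-bit sum through the
   last argument.  */

static void
example_add64 (const unsigned int a[2], const unsigned int b[2],
	       unsigned int sum[2])
{
  unsigned char carry
    = __builtin_ia32_addcarryx_u32 (0, a[0], b[0], &sum[0]);
  (void) __builtin_ia32_addcarryx_u32 (carry, a[1], b[1], &sum[1]);
}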
30898 /* Read/write FLAGS. */
30899 def_builtin (0, "__builtin_ia32_readeflags_u32",
30900 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30901 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
30902 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30903 def_builtin (0, "__builtin_ia32_writeeflags_u32",
30904 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
30905 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
30906 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
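/* An illustrative sketch (assuming a 64-bit target; the function name is
   hypothetical) of the FLAGS builtins just defined: read the flags
   register, run code that may clobber the arithmetic flags, then restore
   the saved value.  */

static unsigned long long
example_save_restore_flags (void)
{
  unsigned long long flags = __builtin_ia32_readeflags_u64 ();
  /* ... code that may clobber the arithmetic flags ...  */
  __builtin_ia32_writeeflags_u64 (flags);
  return flags;
}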
30908 /* CLFLUSHOPT. */
30909 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
30910 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
30912 /* CLWB. */
30913 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
30914 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
30916 /* MONITORX and MWAITX. */
30917 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
30918 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
30919 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
30920 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
30922 /* CLZERO. */
30923 def_builtin2 (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
30924 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
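/* A hedged usage sketch for __builtin_ia32_clzero (assuming -mclzero; the
   wrapper name is hypothetical): the builtin zeroes the cache line
   containing the given address, as the _mm_clzero intrinsic does.  */

static void
example_clzero_line (void *line)
{
  __builtin_ia32_clzero (line);
}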
30926 /* Add FMA4 multi-arg instructions.  */
30927 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30929 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
30930 if (d->name == 0)
30931 continue;
30933 ftype = (enum ix86_builtin_func_type) d->flag;
30934 def_builtin_const (d->mask, d->name, ftype, d->code);
30936 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
30937 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30938 ARRAY_SIZE (bdesc_multi_arg) - 1);
30940 /* Add CET intrinsics. */
30941 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
30943 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
30944 if (d->name == 0)
30945 continue;
30947 ftype = (enum ix86_builtin_func_type) d->flag;
30948 def_builtin (d->mask, d->name, ftype, d->code);
30950 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
30951 IX86_BUILTIN__BDESC_CET_FIRST,
30952 ARRAY_SIZE (bdesc_cet) - 1);
30954 for (i = 0, d = bdesc_cet_rdssp;
30955 i < ARRAY_SIZE (bdesc_cet_rdssp);
30956 i++, d++)
30958 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
30959 if (d->name == 0)
30960 continue;
30962 ftype = (enum ix86_builtin_func_type) d->flag;
30963 def_builtin (d->mask, d->name, ftype, d->code);
30965 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
30966 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30967 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
30970 static void
30971 ix86_init_mpx_builtins ()
30973 const struct builtin_description * d;
30974 enum ix86_builtin_func_type ftype;
30975 tree decl;
30976 size_t i;
30978 for (i = 0, d = bdesc_mpx;
30979 i < ARRAY_SIZE (bdesc_mpx);
30980 i++, d++)
30982 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
30983 if (d->name == 0)
30984 continue;
30986 ftype = (enum ix86_builtin_func_type) d->flag;
30987 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
30989 /* Without the leaf and nothrow flags, abnormal edges may
30990 follow calls to MPX builtins when setjmp is present in
30991 the function.  Since there may be many MPX builtin
30992 calls, this creates lots of useless edges and enormous
30993 PHI nodes.  To avoid this, mark MPX builtins as leaf
30994 and nothrow. */
30995 if (decl)
30997 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30998 NULL_TREE);
30999 TREE_NOTHROW (decl) = 1;
31001 else
31003 ix86_builtins_isa[(int)d->code].leaf_p = true;
31004 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31007 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
31008 IX86_BUILTIN__BDESC_MPX_FIRST,
31009 ARRAY_SIZE (bdesc_mpx) - 1);
31011 for (i = 0, d = bdesc_mpx_const;
31012 i < ARRAY_SIZE (bdesc_mpx_const);
31013 i++, d++)
31015 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
31016 if (d->name == 0)
31017 continue;
31019 ftype = (enum ix86_builtin_func_type) d->flag;
31020 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
31022 if (decl)
31024 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31025 NULL_TREE);
31026 TREE_NOTHROW (decl) = 1;
31028 else
31030 ix86_builtins_isa[(int)d->code].leaf_p = true;
31031 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31034 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
31035 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31036 ARRAY_SIZE (bdesc_mpx_const) - 1);
31038 #undef BDESC_VERIFY
31039 #undef BDESC_VERIFYS
31041 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31042 to return a pointer to VERSION_DECL if the outcome of the expression
31043 formed by PREDICATE_CHAIN is true. This function will be called during
31044 version dispatch to decide which function version to execute. It returns
31045 the basic block at the end, to which more conditions can be added. */
31047 static basic_block
31048 add_condition_to_bb (tree function_decl, tree version_decl,
31049 tree predicate_chain, basic_block new_bb)
31051 gimple *return_stmt;
31052 tree convert_expr, result_var;
31053 gimple *convert_stmt;
31054 gimple *call_cond_stmt;
31055 gimple *if_else_stmt;
31057 basic_block bb1, bb2, bb3;
31058 edge e12, e23;
31060 tree cond_var, and_expr_var = NULL_TREE;
31061 gimple_seq gseq;
31063 tree predicate_decl, predicate_arg;
31065 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31067 gcc_assert (new_bb != NULL);
31068 gseq = bb_seq (new_bb);
31071 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31072 build_fold_addr_expr (version_decl));
31073 result_var = create_tmp_var (ptr_type_node);
31074 convert_stmt = gimple_build_assign (result_var, convert_expr);
31075 return_stmt = gimple_build_return (result_var);
31077 if (predicate_chain == NULL_TREE)
31079 gimple_seq_add_stmt (&gseq, convert_stmt);
31080 gimple_seq_add_stmt (&gseq, return_stmt);
31081 set_bb_seq (new_bb, gseq);
31082 gimple_set_bb (convert_stmt, new_bb);
31083 gimple_set_bb (return_stmt, new_bb);
31084 pop_cfun ();
31085 return new_bb;
31088 while (predicate_chain != NULL)
31090 cond_var = create_tmp_var (integer_type_node);
31091 predicate_decl = TREE_PURPOSE (predicate_chain);
31092 predicate_arg = TREE_VALUE (predicate_chain);
31093 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31094 gimple_call_set_lhs (call_cond_stmt, cond_var);
31096 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31097 gimple_set_bb (call_cond_stmt, new_bb);
31098 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31100 predicate_chain = TREE_CHAIN (predicate_chain);
31102 if (and_expr_var == NULL)
31103 and_expr_var = cond_var;
31104 else
31106 gimple *assign_stmt;
31107 /* Use MIN_EXPR to check whether any integer is zero:
31108 and_expr_var = min_expr <cond_var, and_expr_var>.  */
31109 assign_stmt = gimple_build_assign (and_expr_var,
31110 build2 (MIN_EXPR, integer_type_node,
31111 cond_var, and_expr_var));
31113 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31114 gimple_set_bb (assign_stmt, new_bb);
31115 gimple_seq_add_stmt (&gseq, assign_stmt);
31119 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31120 integer_zero_node,
31121 NULL_TREE, NULL_TREE);
31122 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31123 gimple_set_bb (if_else_stmt, new_bb);
31124 gimple_seq_add_stmt (&gseq, if_else_stmt);
31126 gimple_seq_add_stmt (&gseq, convert_stmt);
31127 gimple_seq_add_stmt (&gseq, return_stmt);
31128 set_bb_seq (new_bb, gseq);
31130 bb1 = new_bb;
31131 e12 = split_block (bb1, if_else_stmt);
31132 bb2 = e12->dest;
31133 e12->flags &= ~EDGE_FALLTHRU;
31134 e12->flags |= EDGE_TRUE_VALUE;
31136 e23 = split_block (bb2, return_stmt);
31138 gimple_set_bb (convert_stmt, bb2);
31139 gimple_set_bb (return_stmt, bb2);
31141 bb3 = e23->dest;
31142 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31144 remove_edge (e23);
31145 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
31147 pop_cfun ();
31149 return bb3;
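/* For illustration, the GIMPLE built above corresponds roughly to the
   following C (a hedged sketch; foo_avx2 and foo_default are hypothetical
   version decls, and the real code combines the predicate results with
   MIN_EXPR rather than &&).  Further conditions are chained onto the
   returned fall-through block.  */

extern void foo_avx2 (void);		/* hypothetical version decls */
extern void foo_default (void);

static void *
example_resolver_shape (void)
{
  if (__builtin_cpu_is ("core2") && __builtin_cpu_supports ("avx2"))
    return (void *) foo_avx2;
  return (void *) foo_default;
}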
31152 /* This parses the attribute arguments to target in DECL and determines
31153 the right builtin to use to match the platform specification.
31154 It returns the priority value for this version decl. If PREDICATE_LIST
31155 is not NULL, it stores the list of cpu features that need to be checked
31156 before dispatching this function. */
31158 static unsigned int
31159 get_builtin_code_for_version (tree decl, tree *predicate_list)
31161 tree attrs;
31162 struct cl_target_option cur_target;
31163 tree target_node;
31164 struct cl_target_option *new_target;
31165 const char *arg_str = NULL;
31166 const char *attrs_str = NULL;
31167 char *tok_str = NULL;
31168 char *token;
31170 /* Priority of i386 features, greater value is higher priority. This is
31171 used to decide the order in which function dispatch must happen. For
31172 instance, a version specialized for SSE4.2 should be checked for dispatch
31173 before a version for SSE3, as SSE4.2 implies SSE3. */
31174 enum feature_priority
31176 P_ZERO = 0,
31177 P_MMX,
31178 P_SSE,
31179 P_SSE2,
31180 P_SSE3,
31181 P_SSSE3,
31182 P_PROC_SSSE3,
31183 P_SSE4_A,
31184 P_PROC_SSE4_A,
31185 P_SSE4_1,
31186 P_SSE4_2,
31187 P_PROC_SSE4_2,
31188 P_POPCNT,
31189 P_AES,
31190 P_PCLMUL,
31191 P_AVX,
31192 P_PROC_AVX,
31193 P_BMI,
31194 P_PROC_BMI,
31195 P_FMA4,
31196 P_XOP,
31197 P_PROC_XOP,
31198 P_FMA,
31199 P_PROC_FMA,
31200 P_BMI2,
31201 P_AVX2,
31202 P_PROC_AVX2,
31203 P_AVX512F,
31204 P_PROC_AVX512F
31207 enum feature_priority priority = P_ZERO;
31209 /* These are the target attribute strings for which a dispatcher is
31210 available, from fold_builtin_cpu. */
31212 static struct _feature_list
31214 const char *const name;
31215 const enum feature_priority priority;
31217 const feature_list[] =
31219 {"mmx", P_MMX},
31220 {"sse", P_SSE},
31221 {"sse2", P_SSE2},
31222 {"sse3", P_SSE3},
31223 {"sse4a", P_SSE4_A},
31224 {"ssse3", P_SSSE3},
31225 {"sse4.1", P_SSE4_1},
31226 {"sse4.2", P_SSE4_2},
31227 {"popcnt", P_POPCNT},
31228 {"aes", P_AES},
31229 {"pclmul", P_PCLMUL},
31230 {"avx", P_AVX},
31231 {"bmi", P_BMI},
31232 {"fma4", P_FMA4},
31233 {"xop", P_XOP},
31234 {"fma", P_FMA},
31235 {"bmi2", P_BMI2},
31236 {"avx2", P_AVX2},
31237 {"avx512f", P_AVX512F}
31241 static unsigned int NUM_FEATURES
31242 = sizeof (feature_list) / sizeof (struct _feature_list);
31244 unsigned int i;
31246 tree predicate_chain = NULL_TREE;
31247 tree predicate_decl, predicate_arg;
31249 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31250 gcc_assert (attrs != NULL);
31252 attrs = TREE_VALUE (TREE_VALUE (attrs));
31254 gcc_assert (TREE_CODE (attrs) == STRING_CST);
31255 attrs_str = TREE_STRING_POINTER (attrs);
31257 /* Return priority zero for default function. */
31258 if (strcmp (attrs_str, "default") == 0)
31259 return 0;
31261 /* Handle arch= if specified. For priority, set it to be 1 more than
31262 the best instruction set the processor can handle. For instance, if
31263 there is a version for atom and a version for ssse3 (the highest ISA
31264 priority for atom), the atom version must be checked for dispatch
31265 before the ssse3 version. */
31266 if (strstr (attrs_str, "arch=") != NULL)
31268 cl_target_option_save (&cur_target, &global_options);
31269 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
31270 &global_options_set);
31272 gcc_assert (target_node);
31273 new_target = TREE_TARGET_OPTION (target_node);
31274 gcc_assert (new_target);
31276 if (new_target->arch_specified && new_target->arch > 0)
31278 switch (new_target->arch)
31280 case PROCESSOR_CORE2:
31281 arg_str = "core2";
31282 priority = P_PROC_SSSE3;
31283 break;
31284 case PROCESSOR_NEHALEM:
31285 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
31287 arg_str = "westmere";
31288 priority = P_AES;
31290 else
31292 /* We translate "arch=corei7" and "arch=nehalem" to
31293 "corei7" so that it will be mapped to M_INTEL_COREI7
31294 as cpu type to cover all M_INTEL_COREI7_XXXs. */
31295 arg_str = "corei7";
31296 priority = P_PROC_SSE4_2;
31298 break;
31299 case PROCESSOR_SANDYBRIDGE:
31300 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
31301 arg_str = "ivybridge";
31302 else
31303 arg_str = "sandybridge";
31304 priority = P_PROC_AVX;
31305 break;
31306 case PROCESSOR_HASWELL:
31307 case PROCESSOR_SKYLAKE_AVX512:
31308 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VBMI)
31309 arg_str = "cannonlake";
31310 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31311 arg_str = "skylake-avx512";
31312 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
31313 arg_str = "skylake";
31314 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
31315 arg_str = "broadwell";
31316 else
31317 arg_str = "haswell";
31318 priority = P_PROC_AVX2;
31319 break;
31320 case PROCESSOR_BONNELL:
31321 arg_str = "bonnell";
31322 priority = P_PROC_SSSE3;
31323 break;
31324 case PROCESSOR_KNL:
31325 arg_str = "knl";
31326 priority = P_PROC_AVX512F;
31327 break;
31328 case PROCESSOR_KNM:
31329 arg_str = "knm";
31330 priority = P_PROC_AVX512F;
31331 break;
31332 case PROCESSOR_SILVERMONT:
31333 arg_str = "silvermont";
31334 priority = P_PROC_SSE4_2;
31335 break;
31336 case PROCESSOR_AMDFAM10:
31337 arg_str = "amdfam10h";
31338 priority = P_PROC_SSE4_A;
31339 break;
31340 case PROCESSOR_BTVER1:
31341 arg_str = "btver1";
31342 priority = P_PROC_SSE4_A;
31343 break;
31344 case PROCESSOR_BTVER2:
31345 arg_str = "btver2";
31346 priority = P_PROC_BMI;
31347 break;
31348 case PROCESSOR_BDVER1:
31349 arg_str = "bdver1";
31350 priority = P_PROC_XOP;
31351 break;
31352 case PROCESSOR_BDVER2:
31353 arg_str = "bdver2";
31354 priority = P_PROC_FMA;
31355 break;
31356 case PROCESSOR_BDVER3:
31357 arg_str = "bdver3";
31358 priority = P_PROC_FMA;
31359 break;
31360 case PROCESSOR_BDVER4:
31361 arg_str = "bdver4";
31362 priority = P_PROC_AVX2;
31363 break;
31364 case PROCESSOR_ZNVER1:
31365 arg_str = "znver1";
31366 priority = P_PROC_AVX2;
31367 break;
31371 cl_target_option_restore (&global_options, &cur_target);
31373 if (predicate_list && arg_str == NULL)
31375 error_at (DECL_SOURCE_LOCATION (decl),
31376 "No dispatcher found for the versioning attributes");
31377 return 0;
31380 if (predicate_list)
31382 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
31383 /* For a C string literal the length includes the trailing NULL. */
31384 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
31385 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31386 predicate_chain);
31390 /* Process feature name. */
31391 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
31392 strcpy (tok_str, attrs_str);
31393 token = strtok (tok_str, ",");
31394 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
31396 while (token != NULL)
31398 /* Do not process "arch=" */
31399 if (strncmp (token, "arch=", 5) == 0)
31401 token = strtok (NULL, ",");
31402 continue;
31404 for (i = 0; i < NUM_FEATURES; ++i)
31406 if (strcmp (token, feature_list[i].name) == 0)
31408 if (predicate_list)
31410 predicate_arg = build_string_literal (
31411 strlen (feature_list[i].name) + 1,
31412 feature_list[i].name);
31413 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31414 predicate_chain);
31416 /* Find the maximum priority feature. */
31417 if (feature_list[i].priority > priority)
31418 priority = feature_list[i].priority;
31420 break;
31423 if (predicate_list && i == NUM_FEATURES)
31425 error_at (DECL_SOURCE_LOCATION (decl),
31426 "No dispatcher found for %s", token);
31427 return 0;
31429 token = strtok (NULL, ",");
31431 free (tok_str);
31433 if (predicate_list && predicate_chain == NULL_TREE)
31435 error_at (DECL_SOURCE_LOCATION (decl),
31436 "No dispatcher found for the versioning attributes : %s",
31437 attrs_str);
31438 return 0;
31440 else if (predicate_list)
31442 predicate_chain = nreverse (predicate_chain);
31443 *predicate_list = predicate_chain;
31446 return priority;
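/* A worked example of the priorities computed above (hedged; function
   multiversioning itself is driven by the front end, the attribute strings
   are only shown to illustrate the parsing):

     target ("default")        -> priority 0 (returned early)
     target ("arch=core2")     -> P_PROC_SSSE3
     target ("sse4.2,popcnt")  -> P_POPCNT, the maximum of the two features

   so dispatch_function_versions below will test the sse4.2/popcnt version
   before the core2 version, and the default version last.  */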
31449 /* This compares the priority of target features in function DECL1
31450 and DECL2.  It returns a positive value if DECL1 has higher priority,
31451 a negative value if DECL2 has higher priority, and 0 if they are the
31452 same. */
31454 static int
31455 ix86_compare_version_priority (tree decl1, tree decl2)
31457 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
31458 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
31460 return (int)priority1 - (int)priority2;
31463 /* V1 and V2 point to function versions with different priorities
31464 based on the target ISA. This function compares their priorities. */
31466 static int
31467 feature_compare (const void *v1, const void *v2)
31469 typedef struct _function_version_info
31471 tree version_decl;
31472 tree predicate_chain;
31473 unsigned int dispatch_priority;
31474 } function_version_info;
31476 const function_version_info c1 = *(const function_version_info *)v1;
31477 const function_version_info c2 = *(const function_version_info *)v2;
31478 return (c2.dispatch_priority - c1.dispatch_priority);
31481 /* This function generates the dispatch function for
31482 multi-versioned functions.  DISPATCH_DECL is the function which will
31483 contain the dispatch logic.  FNDECLS_P is the vector of function
31484 choices for dispatch.  EMPTY_BB is the basic block pointer
31485 in DISPATCH_DECL in which the dispatch code is generated. */
31487 static int
31488 dispatch_function_versions (tree dispatch_decl,
31489 void *fndecls_p,
31490 basic_block *empty_bb)
31492 tree default_decl;
31493 gimple *ifunc_cpu_init_stmt;
31494 gimple_seq gseq;
31495 int ix;
31496 tree ele;
31497 vec<tree> *fndecls;
31498 unsigned int num_versions = 0;
31499 unsigned int actual_versions = 0;
31500 unsigned int i;
31502 struct _function_version_info
31504 tree version_decl;
31505 tree predicate_chain;
31506 unsigned int dispatch_priority;
31507 }*function_version_info;
31509 gcc_assert (dispatch_decl != NULL
31510 && fndecls_p != NULL
31511 && empty_bb != NULL);
31513 /* fndecls_p is actually a vector.  */
31514 fndecls = static_cast<vec<tree> *> (fndecls_p);
31516 /* At least one more version other than the default. */
31517 num_versions = fndecls->length ();
31518 gcc_assert (num_versions >= 2);
31520 function_version_info = (struct _function_version_info *)
31521 XNEWVEC (struct _function_version_info, (num_versions - 1));
31523 /* The first version in the vector is the default decl. */
31524 default_decl = (*fndecls)[0];
31526 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
31528 gseq = bb_seq (*empty_bb);
31529 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
31530 constructors, so explicitly call __builtin_cpu_init here. */
31531 ifunc_cpu_init_stmt = gimple_build_call_vec (
31532 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
31533 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
31534 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
31535 set_bb_seq (*empty_bb, gseq);
31537 pop_cfun ();
31540 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
31542 tree version_decl = ele;
31543 tree predicate_chain = NULL_TREE;
31544 unsigned int priority;
31545 /* Get attribute string, parse it and find the right predicate decl.
31546 The predicate function could be a lengthy combination of many
31547 features, like arch-type and various isa-variants. */
31548 priority = get_builtin_code_for_version (version_decl,
31549 &predicate_chain);
31551 if (predicate_chain == NULL_TREE)
31552 continue;
31554 function_version_info [actual_versions].version_decl = version_decl;
31555 function_version_info [actual_versions].predicate_chain
31556 = predicate_chain;
31557 function_version_info [actual_versions].dispatch_priority = priority;
31558 actual_versions++;
31561 /* Sort the versions according to descending order of dispatch priority. The
31562 priority is based on the ISA. This is not a perfect solution. There
31563 could still be ambiguity. If more than one function version is suitable
31564 to execute, which one should be dispatched? In future, allow the user
31565 to specify a dispatch priority next to the version. */
31566 qsort (function_version_info, actual_versions,
31567 sizeof (struct _function_version_info), feature_compare);
31569 for (i = 0; i < actual_versions; ++i)
31570 *empty_bb = add_condition_to_bb (dispatch_decl,
31571 function_version_info[i].version_decl,
31572 function_version_info[i].predicate_chain,
31573 *empty_bb);
31575 /* Dispatch the default version at the end. */
31576 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
31577 NULL, *empty_bb);
31579 free (function_version_info);
31580 return 0;
31583 /* This function changes the assembler name for functions that are
31584 versions. If DECL is a function version and has a "target"
31585 attribute, it appends the attribute string to its assembler name. */
31587 static tree
31588 ix86_mangle_function_version_assembler_name (tree decl, tree id)
31590 tree version_attr;
31591 const char *orig_name, *version_string;
31592 char *attr_str, *assembler_name;
31594 if (DECL_DECLARED_INLINE_P (decl)
31595 && lookup_attribute ("gnu_inline",
31596 DECL_ATTRIBUTES (decl)))
31597 error_at (DECL_SOURCE_LOCATION (decl),
31598 "Function versions cannot be marked as gnu_inline,"
31599 " bodies have to be generated");
31601 if (DECL_VIRTUAL_P (decl)
31602 || DECL_VINDEX (decl))
31603 sorry ("Virtual function multiversioning not supported");
31605 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31607 /* target attribute string cannot be NULL. */
31608 gcc_assert (version_attr != NULL_TREE);
31610 orig_name = IDENTIFIER_POINTER (id);
31611 version_string
31612 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
31614 if (strcmp (version_string, "default") == 0)
31615 return id;
31617 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
31618 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
31620 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
31622 /* Allow assembler name to be modified if already set. */
31623 if (DECL_ASSEMBLER_NAME_SET_P (decl))
31624 SET_DECL_RTL (decl, NULL);
31626 tree ret = get_identifier (assembler_name);
31627 XDELETEVEC (attr_str);
31628 XDELETEVEC (assembler_name);
31629 return ret;
31633 static tree
31634 ix86_mangle_decl_assembler_name (tree decl, tree id)
31636 /* For function version, add the target suffix to the assembler name. */
31637 if (TREE_CODE (decl) == FUNCTION_DECL
31638 && DECL_FUNCTION_VERSIONED (decl))
31639 id = ix86_mangle_function_version_assembler_name (decl, id);
31640 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
31641 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
31642 #endif
31644 return id;
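/* For example (hedged, with an illustrative symbol name): with the
   mangling above, a version of foo declared with
   __attribute__ ((target ("avx2"))) gets the assembler name "foo.avx2"
   (the suffix comes from sorted_attr_string, so multiple attributes are
   sorted first), while the "default" version keeps its original
   assembler name.  */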
31647 /* Make a dispatcher declaration for the multi-versioned function DECL.
31648 Calls to DECL will be replaced with calls to the dispatcher
31649 by the front end.  Returns the decl of the dispatcher function. */
31651 static tree
31652 ix86_get_function_versions_dispatcher (void *decl)
31654 tree fn = (tree) decl;
31655 struct cgraph_node *node = NULL;
31656 struct cgraph_node *default_node = NULL;
31657 struct cgraph_function_version_info *node_v = NULL;
31658 struct cgraph_function_version_info *first_v = NULL;
31660 tree dispatch_decl = NULL;
31662 struct cgraph_function_version_info *default_version_info = NULL;
31664 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
31666 node = cgraph_node::get (fn);
31667 gcc_assert (node != NULL);
31669 node_v = node->function_version ();
31670 gcc_assert (node_v != NULL);
31672 if (node_v->dispatcher_resolver != NULL)
31673 return node_v->dispatcher_resolver;
31675 /* Find the default version and make it the first node. */
31676 first_v = node_v;
31677 /* Go to the beginning of the chain. */
31678 while (first_v->prev != NULL)
31679 first_v = first_v->prev;
31680 default_version_info = first_v;
31681 while (default_version_info != NULL)
31683 if (is_function_default_version
31684 (default_version_info->this_node->decl))
31685 break;
31686 default_version_info = default_version_info->next;
31689 /* If there is no default node, just return NULL. */
31690 if (default_version_info == NULL)
31691 return NULL;
31693 /* Make default info the first node. */
31694 if (first_v != default_version_info)
31696 default_version_info->prev->next = default_version_info->next;
31697 if (default_version_info->next)
31698 default_version_info->next->prev = default_version_info->prev;
31699 first_v->prev = default_version_info;
31700 default_version_info->next = first_v;
31701 default_version_info->prev = NULL;
31704 default_node = default_version_info->this_node;
31706 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
31707 if (targetm.has_ifunc_p ())
31709 struct cgraph_function_version_info *it_v = NULL;
31710 struct cgraph_node *dispatcher_node = NULL;
31711 struct cgraph_function_version_info *dispatcher_version_info = NULL;
31713 /* Right now, the dispatching is done via ifunc. */
31714 dispatch_decl = make_dispatcher_decl (default_node->decl);
31716 dispatcher_node = cgraph_node::get_create (dispatch_decl);
31717 gcc_assert (dispatcher_node != NULL);
31718 dispatcher_node->dispatcher_function = 1;
31719 dispatcher_version_info
31720 = dispatcher_node->insert_new_function_version ();
31721 dispatcher_version_info->next = default_version_info;
31722 dispatcher_node->definition = 1;
31724 /* Set the dispatcher for all the versions. */
31725 it_v = default_version_info;
31726 while (it_v != NULL)
31728 it_v->dispatcher_resolver = dispatch_decl;
31729 it_v = it_v->next;
31732 else
31733 #endif
31735 error_at (DECL_SOURCE_LOCATION (default_node->decl),
31736 "multiversioning needs ifunc which is not supported "
31737 "on this target");
31740 return dispatch_decl;
31743 /* Make the resolver function decl to dispatch the versions of
31744 a multi-versioned function, DEFAULT_DECL.  IFUNC_ALIAS_DECL is
31745 the ifunc alias that will point to the created resolver.  Create an
31746 empty basic block in the resolver and store the pointer in
31747 EMPTY_BB. Return the decl of the resolver function. */
31749 static tree
31750 make_resolver_func (const tree default_decl,
31751 const tree ifunc_alias_decl,
31752 basic_block *empty_bb)
31754 char *resolver_name;
31755 tree decl, type, decl_name, t;
31757 /* IFUNCs have to be globally visible.  So, if the default_decl is
31758 not, then the name of the IFUNC should be made unique. */
31759 if (TREE_PUBLIC (default_decl) == 0)
31761 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
31762 symtab->change_decl_assembler_name (ifunc_alias_decl,
31763 get_identifier (ifunc_name));
31764 XDELETEVEC (ifunc_name);
31767 resolver_name = make_unique_name (default_decl, "resolver", false);
31769 /* The resolver function should return a (void *). */
31770 type = build_function_type_list (ptr_type_node, NULL_TREE);
31772 decl = build_fn_decl (resolver_name, type);
31773 decl_name = get_identifier (resolver_name);
31774 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
31776 DECL_NAME (decl) = decl_name;
31777 TREE_USED (decl) = 1;
31778 DECL_ARTIFICIAL (decl) = 1;
31779 DECL_IGNORED_P (decl) = 1;
31780 TREE_PUBLIC (decl) = 0;
31781 DECL_UNINLINABLE (decl) = 1;
31783 /* Resolver is not external, body is generated. */
31784 DECL_EXTERNAL (decl) = 0;
31785 DECL_EXTERNAL (ifunc_alias_decl) = 0;
31787 DECL_CONTEXT (decl) = NULL_TREE;
31788 DECL_INITIAL (decl) = make_node (BLOCK);
31789 DECL_STATIC_CONSTRUCTOR (decl) = 0;
31791 if (DECL_COMDAT_GROUP (default_decl)
31792 || TREE_PUBLIC (default_decl))
31794 /* In this case, each translation unit with a call to this
31795 versioned function will emit a resolver.  Ensure it
31796 is comdat to keep just one copy. */
31797 DECL_COMDAT (decl) = 1;
31798 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
31800 /* Build result decl and add to function_decl. */
31801 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
31802 DECL_ARTIFICIAL (t) = 1;
31803 DECL_IGNORED_P (t) = 1;
31804 DECL_RESULT (decl) = t;
31806 gimplify_function_tree (decl);
31807 push_cfun (DECL_STRUCT_FUNCTION (decl));
31808 *empty_bb = init_lowered_empty_function (decl, false,
31809 profile_count::uninitialized ());
31811 cgraph_node::add_new_function (decl, true);
31812 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
31814 pop_cfun ();
31816 gcc_assert (ifunc_alias_decl != NULL);
31817 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
31818 DECL_ATTRIBUTES (ifunc_alias_decl)
31819 = make_attribute ("ifunc", resolver_name,
31820 DECL_ATTRIBUTES (ifunc_alias_decl));
31822 /* Create the alias for dispatch to resolver here. */
31823 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
31824 XDELETEVEC (resolver_name);
31825 return decl;
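/* A hedged sketch, expressed as user-level C, of the construct this
   builds: the dispatched symbol becomes an ifunc whose resolver is the
   newly created function (the names below are illustrative; the resolver
   body is generated later by dispatch_function_versions).  */

static void *
foo_resolver (void)
{
  return 0;	/* Placeholder; the generated body returns a version's address.  */
}

void foo (void) __attribute__ ((ifunc ("foo_resolver")));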
31828 /* Generate the dispatching code body to dispatch multi-versioned function
31829 DECL. The target hook is called to process the "target" attributes and
31830 provide the code to dispatch the right function at run-time. NODE points
31831 to the dispatcher decl whose body will be created. */
31833 static tree
31834 ix86_generate_version_dispatcher_body (void *node_p)
31836 tree resolver_decl;
31837 basic_block empty_bb;
31838 tree default_ver_decl;
31839 struct cgraph_node *versn;
31840 struct cgraph_node *node;
31842 struct cgraph_function_version_info *node_version_info = NULL;
31843 struct cgraph_function_version_info *versn_info = NULL;
31845 node = (cgraph_node *)node_p;
31847 node_version_info = node->function_version ();
31848 gcc_assert (node->dispatcher_function
31849 && node_version_info != NULL);
31851 if (node_version_info->dispatcher_resolver)
31852 return node_version_info->dispatcher_resolver;
31854 /* The first version in the chain corresponds to the default version. */
31855 default_ver_decl = node_version_info->next->this_node->decl;
31857 /* node is going to be an alias, so remove the finalized bit. */
31858 node->definition = false;
31860 resolver_decl = make_resolver_func (default_ver_decl,
31861 node->decl, &empty_bb);
31863 node_version_info->dispatcher_resolver = resolver_decl;
31865 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
31867 auto_vec<tree, 2> fn_ver_vec;
31869 for (versn_info = node_version_info->next; versn_info;
31870 versn_info = versn_info->next)
31872 versn = versn_info->this_node;
31873 /* Check for virtual functions here again, as by this time it should
31874 have been determined if this function needs a vtable index or
31875 not. This happens for methods in derived classes that override
31876 virtual methods in base classes but are not explicitly marked as
31877 virtual. */
31878 if (DECL_VINDEX (versn->decl))
31879 sorry ("Virtual function multiversioning not supported");
31881 fn_ver_vec.safe_push (versn->decl);
31884 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
31885 cgraph_edge::rebuild_edges ();
31886 pop_cfun ();
31887 return resolver_decl;
31889 /* This builds the processor_model struct type defined in
31890 libgcc/config/i386/cpuinfo.c */
31892 static tree
31893 build_processor_model_struct (void)
31895 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
31896 "__cpu_features"};
31897 tree field = NULL_TREE, field_chain = NULL_TREE;
31898 int i;
31899 tree type = make_node (RECORD_TYPE);
31901 /* The first 3 fields are unsigned int. */
31902 for (i = 0; i < 3; ++i)
31904 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31905 get_identifier (field_name[i]), unsigned_type_node);
31906 if (field_chain != NULL_TREE)
31907 DECL_CHAIN (field) = field_chain;
31908 field_chain = field;
31911 /* The last field is an array of unsigned integers of size one. */
31912 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31913 get_identifier (field_name[3]),
31914 build_array_type (unsigned_type_node,
31915 build_index_type (size_one_node)));
31916 if (field_chain != NULL_TREE)
31917 DECL_CHAIN (field) = field_chain;
31918 field_chain = field;
31920 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
31921 return type;
31924 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
31926 static tree
31927 make_var_decl (tree type, const char *name)
31929 tree new_decl;
31931 new_decl = build_decl (UNKNOWN_LOCATION,
31932 VAR_DECL,
31933 get_identifier(name),
31934 type);
31936 DECL_EXTERNAL (new_decl) = 1;
31937 TREE_STATIC (new_decl) = 1;
31938 TREE_PUBLIC (new_decl) = 1;
31939 DECL_INITIAL (new_decl) = 0;
31940 DECL_ARTIFICIAL (new_decl) = 0;
31941 DECL_PRESERVE_P (new_decl) = 1;
31943 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
31944 assemble_variable (new_decl, 0, 0, 0);
31946 return new_decl;
31949 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
31950 into a check against the data defined in libgcc/config/i386/cpuinfo.c.  */
31952 static tree
31953 fold_builtin_cpu (tree fndecl, tree *args)
31955 unsigned int i;
31956 enum ix86_builtins fn_code = (enum ix86_builtins)
31957 DECL_FUNCTION_CODE (fndecl);
31958 tree param_string_cst = NULL;
31960 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
31961 enum processor_features
31963 F_CMOV = 0,
31964 F_MMX,
31965 F_POPCNT,
31966 F_SSE,
31967 F_SSE2,
31968 F_SSE3,
31969 F_SSSE3,
31970 F_SSE4_1,
31971 F_SSE4_2,
31972 F_AVX,
31973 F_AVX2,
31974 F_SSE4_A,
31975 F_FMA4,
31976 F_XOP,
31977 F_FMA,
31978 F_AVX512F,
31979 F_BMI,
31980 F_BMI2,
31981 F_AES,
31982 F_PCLMUL,
31983 F_AVX512VL,
31984 F_AVX512BW,
31985 F_AVX512DQ,
31986 F_AVX512CD,
31987 F_AVX512ER,
31988 F_AVX512PF,
31989 F_AVX512VBMI,
31990 F_AVX512IFMA,
31991 F_AVX5124VNNIW,
31992 F_AVX5124FMAPS,
31993 F_AVX512VPOPCNTDQ,
31994 F_MAX
31997 /* These are the values for vendor types, CPU types and subtypes
31998 in cpuinfo.c.  CPU types and subtypes must have the corresponding
31999 start value subtracted before use. */
32000 enum processor_model
32002 M_INTEL = 1,
32003 M_AMD,
32004 M_CPU_TYPE_START,
32005 M_INTEL_BONNELL,
32006 M_INTEL_CORE2,
32007 M_INTEL_COREI7,
32008 M_AMDFAM10H,
32009 M_AMDFAM15H,
32010 M_INTEL_SILVERMONT,
32011 M_INTEL_KNL,
32012 M_AMD_BTVER1,
32013 M_AMD_BTVER2,
32014 M_AMDFAM17H,
32015 M_INTEL_KNM,
32016 M_CPU_SUBTYPE_START,
32017 M_INTEL_COREI7_NEHALEM,
32018 M_INTEL_COREI7_WESTMERE,
32019 M_INTEL_COREI7_SANDYBRIDGE,
32020 M_AMDFAM10H_BARCELONA,
32021 M_AMDFAM10H_SHANGHAI,
32022 M_AMDFAM10H_ISTANBUL,
32023 M_AMDFAM15H_BDVER1,
32024 M_AMDFAM15H_BDVER2,
32025 M_AMDFAM15H_BDVER3,
32026 M_AMDFAM15H_BDVER4,
32027 M_AMDFAM17H_ZNVER1,
32028 M_INTEL_COREI7_IVYBRIDGE,
32029 M_INTEL_COREI7_HASWELL,
32030 M_INTEL_COREI7_BROADWELL,
32031 M_INTEL_COREI7_SKYLAKE,
32032 M_INTEL_COREI7_SKYLAKE_AVX512,
32033 M_INTEL_COREI7_CANNONLAKE
32036 static struct _arch_names_table
32038 const char *const name;
32039 const enum processor_model model;
32041 const arch_names_table[] =
32043 {"amd", M_AMD},
32044 {"intel", M_INTEL},
32045 {"atom", M_INTEL_BONNELL},
32046 {"slm", M_INTEL_SILVERMONT},
32047 {"core2", M_INTEL_CORE2},
32048 {"corei7", M_INTEL_COREI7},
32049 {"nehalem", M_INTEL_COREI7_NEHALEM},
32050 {"westmere", M_INTEL_COREI7_WESTMERE},
32051 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
32052 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
32053 {"haswell", M_INTEL_COREI7_HASWELL},
32054 {"broadwell", M_INTEL_COREI7_BROADWELL},
32055 {"skylake", M_INTEL_COREI7_SKYLAKE},
32056 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
32057 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
32058 {"bonnell", M_INTEL_BONNELL},
32059 {"silvermont", M_INTEL_SILVERMONT},
32060 {"knl", M_INTEL_KNL},
32061 {"knm", M_INTEL_KNM},
32062 {"amdfam10h", M_AMDFAM10H},
32063 {"barcelona", M_AMDFAM10H_BARCELONA},
32064 {"shanghai", M_AMDFAM10H_SHANGHAI},
32065 {"istanbul", M_AMDFAM10H_ISTANBUL},
32066 {"btver1", M_AMD_BTVER1},
32067 {"amdfam15h", M_AMDFAM15H},
32068 {"bdver1", M_AMDFAM15H_BDVER1},
32069 {"bdver2", M_AMDFAM15H_BDVER2},
32070 {"bdver3", M_AMDFAM15H_BDVER3},
32071 {"bdver4", M_AMDFAM15H_BDVER4},
32072 {"btver2", M_AMD_BTVER2},
32073 {"amdfam17h", M_AMDFAM17H},
32074 {"znver1", M_AMDFAM17H_ZNVER1},
32077 static struct _isa_names_table
32079 const char *const name;
32080 const enum processor_features feature;
32082 const isa_names_table[] =
32084 {"cmov", F_CMOV},
32085 {"mmx", F_MMX},
32086 {"popcnt", F_POPCNT},
32087 {"sse", F_SSE},
32088 {"sse2", F_SSE2},
32089 {"sse3", F_SSE3},
32090 {"ssse3", F_SSSE3},
32091 {"sse4a", F_SSE4_A},
32092 {"sse4.1", F_SSE4_1},
32093 {"sse4.2", F_SSE4_2},
32094 {"avx", F_AVX},
32095 {"fma4", F_FMA4},
32096 {"xop", F_XOP},
32097 {"fma", F_FMA},
32098 {"avx2", F_AVX2},
32099 {"avx512f", F_AVX512F},
32100 {"bmi", F_BMI},
32101 {"bmi2", F_BMI2},
32102 {"aes", F_AES},
32103 {"pclmul", F_PCLMUL},
32104 {"avx512vl",F_AVX512VL},
32105 {"avx512bw",F_AVX512BW},
32106 {"avx512dq",F_AVX512DQ},
32107 {"avx512cd",F_AVX512CD},
32108 {"avx512er",F_AVX512ER},
32109 {"avx512pf",F_AVX512PF},
32110 {"avx512vbmi",F_AVX512VBMI},
32111 {"avx512ifma",F_AVX512IFMA},
32112 {"avx5124vnniw",F_AVX5124VNNIW},
32113 {"avx5124fmaps",F_AVX5124FMAPS},
32114 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
32117 tree __processor_model_type = build_processor_model_struct ();
32118 tree __cpu_model_var = make_var_decl (__processor_model_type,
32119 "__cpu_model");
32122 varpool_node::add (__cpu_model_var);
32124 gcc_assert ((args != NULL) && (*args != NULL));
32126 param_string_cst = *args;
32127 while (param_string_cst
32128 && TREE_CODE (param_string_cst) != STRING_CST)
32130 /* *args must be an expr that can contain other EXPRs leading to a
32131 STRING_CST. */
32132 if (!EXPR_P (param_string_cst))
32134 error ("Parameter to builtin must be a string constant or literal");
32135 return integer_zero_node;
32137 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
32140 gcc_assert (param_string_cst);
32142 if (fn_code == IX86_BUILTIN_CPU_IS)
32144 tree ref;
32145 tree field;
32146 tree final;
32148 unsigned int field_val = 0;
32149 unsigned int NUM_ARCH_NAMES
32150 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
32152 for (i = 0; i < NUM_ARCH_NAMES; i++)
32153 if (strcmp (arch_names_table[i].name,
32154 TREE_STRING_POINTER (param_string_cst)) == 0)
32155 break;
32157 if (i == NUM_ARCH_NAMES)
32159 error ("Parameter to builtin not valid: %s",
32160 TREE_STRING_POINTER (param_string_cst));
32161 return integer_zero_node;
32164 field = TYPE_FIELDS (__processor_model_type);
32165 field_val = arch_names_table[i].model;
32167 /* CPU types are stored in the next field. */
32168 if (field_val > M_CPU_TYPE_START
32169 && field_val < M_CPU_SUBTYPE_START)
32171 field = DECL_CHAIN (field);
32172 field_val -= M_CPU_TYPE_START;
32175 /* CPU subtypes are stored in the next field. */
32176 if (field_val > M_CPU_SUBTYPE_START)
32178 field = DECL_CHAIN ( DECL_CHAIN (field));
32179 field_val -= M_CPU_SUBTYPE_START;
32182 /* Get the appropriate field in __cpu_model. */
32183 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32184 field, NULL_TREE);
32186 /* Check the value. */
32187 final = build2 (EQ_EXPR, unsigned_type_node, ref,
32188 build_int_cstu (unsigned_type_node, field_val));
32189 return build1 (CONVERT_EXPR, integer_type_node, final);
32191 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
32193 tree ref;
32194 tree array_elt;
32195 tree field;
32196 tree final;
32198 unsigned int field_val = 0;
32199 unsigned int NUM_ISA_NAMES
32200 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
32202 for (i = 0; i < NUM_ISA_NAMES; i++)
32203 if (strcmp (isa_names_table[i].name,
32204 TREE_STRING_POINTER (param_string_cst)) == 0)
32205 break;
32207 if (i == NUM_ISA_NAMES)
32209 error ("Parameter to builtin not valid: %s",
32210 TREE_STRING_POINTER (param_string_cst));
32211 return integer_zero_node;
32214 field = TYPE_FIELDS (__processor_model_type);
32215 /* Get the last field, which is __cpu_features. */
32216 while (DECL_CHAIN (field))
32217 field = DECL_CHAIN (field);
32219 /* Get the appropriate field: __cpu_model.__cpu_features */
32220 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32221 field, NULL_TREE);
32223 /* Access the 0th element of __cpu_features array. */
32224 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
32225 integer_zero_node, NULL_TREE, NULL_TREE);
32227 field_val = (1 << isa_names_table[i].feature);
32228 /* Return __cpu_model.__cpu_features[0] & field_val */
32229 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
32230 build_int_cstu (unsigned_type_node, field_val));
32231 return build1 (CONVERT_EXPR, integer_type_node, final);
32233 gcc_unreachable ();
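/* A hedged illustration of the folding above: once libgcc's cpuinfo.c has
   filled in __cpu_model, the two builtins reduce to a compare or a bit
   test on that structure, e.g.

     __builtin_cpu_is ("amd")        -> __cpu_model.__cpu_vendor == M_AMD
     __builtin_cpu_supports ("avx2") -> __cpu_model.__cpu_features[0]
                                        & (1 << F_AVX2)

   so a caller can simply branch on the result (example_cpu_checks is an
   illustrative name):  */

static int
example_cpu_checks (void)
{
  if (__builtin_cpu_is ("amd"))
    return 2;
  return __builtin_cpu_supports ("avx2") != 0;
}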
32236 static tree
32237 ix86_fold_builtin (tree fndecl, int n_args,
32238 tree *args, bool ignore ATTRIBUTE_UNUSED)
32240 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32242 enum ix86_builtins fn_code = (enum ix86_builtins)
32243 DECL_FUNCTION_CODE (fndecl);
32244 switch (fn_code)
32246 case IX86_BUILTIN_CPU_IS:
32247 case IX86_BUILTIN_CPU_SUPPORTS:
32248 gcc_assert (n_args == 1);
32249 return fold_builtin_cpu (fndecl, args);
32251 case IX86_BUILTIN_NANQ:
32252 case IX86_BUILTIN_NANSQ:
32254 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32255 const char *str = c_getstr (*args);
32256 int quiet = fn_code == IX86_BUILTIN_NANQ;
32257 REAL_VALUE_TYPE real;
32259 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
32260 return build_real (type, real);
32261 return NULL_TREE;
32264 case IX86_BUILTIN_INFQ:
32265 case IX86_BUILTIN_HUGE_VALQ:
32267 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32268 REAL_VALUE_TYPE inf;
32269 real_inf (&inf);
32270 return build_real (type, inf);
32273 case IX86_BUILTIN_TZCNT16:
32274 case IX86_BUILTIN_CTZS:
32275 case IX86_BUILTIN_TZCNT32:
32276 case IX86_BUILTIN_TZCNT64:
32277 gcc_assert (n_args == 1);
32278 if (TREE_CODE (args[0]) == INTEGER_CST)
32280 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32281 tree arg = args[0];
32282 if (fn_code == IX86_BUILTIN_TZCNT16
32283 || fn_code == IX86_BUILTIN_CTZS)
32284 arg = fold_convert (short_unsigned_type_node, arg);
32285 if (integer_zerop (arg))
32286 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32287 else
32288 return fold_const_call (CFN_CTZ, type, arg);
32290 break;
32292 case IX86_BUILTIN_LZCNT16:
32293 case IX86_BUILTIN_CLZS:
32294 case IX86_BUILTIN_LZCNT32:
32295 case IX86_BUILTIN_LZCNT64:
32296 gcc_assert (n_args == 1);
32297 if (TREE_CODE (args[0]) == INTEGER_CST)
32299 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32300 tree arg = args[0];
32301 if (fn_code == IX86_BUILTIN_LZCNT16
32302 || fn_code == IX86_BUILTIN_CLZS)
32303 arg = fold_convert (short_unsigned_type_node, arg);
32304 if (integer_zerop (arg))
32305 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32306 else
32307 return fold_const_call (CFN_CLZ, type, arg);
32309 break;
32311 case IX86_BUILTIN_BEXTR32:
32312 case IX86_BUILTIN_BEXTR64:
32313 case IX86_BUILTIN_BEXTRI32:
32314 case IX86_BUILTIN_BEXTRI64:
32315 gcc_assert (n_args == 2);
32316 if (tree_fits_uhwi_p (args[1]))
32318 unsigned HOST_WIDE_INT res = 0;
32319 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
32320 unsigned int start = tree_to_uhwi (args[1]);
32321 unsigned int len = (start & 0xff00) >> 8;
32322 start &= 0xff;
32323 if (start >= prec || len == 0)
32324 res = 0;
32325 else if (!tree_fits_uhwi_p (args[0]))
32326 break;
32327 else
32328 res = tree_to_uhwi (args[0]) >> start;
32329 if (len > prec)
32330 len = prec;
32331 if (len < HOST_BITS_PER_WIDE_INT)
32332 res &= (HOST_WIDE_INT_1U << len) - 1;
32333 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32335 break;
32337 case IX86_BUILTIN_BZHI32:
32338 case IX86_BUILTIN_BZHI64:
32339 gcc_assert (n_args == 2);
32340 if (tree_fits_uhwi_p (args[1]))
32342 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
32343 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
32344 return args[0];
32345 if (!tree_fits_uhwi_p (args[0]))
32346 break;
32347 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
32348 res &= ~(HOST_WIDE_INT_M1U << idx);
32349 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32351 break;
32353 case IX86_BUILTIN_PDEP32:
32354 case IX86_BUILTIN_PDEP64:
32355 gcc_assert (n_args == 2);
32356 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32358 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32359 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32360 unsigned HOST_WIDE_INT res = 0;
32361 unsigned HOST_WIDE_INT m, k = 1;
32362 for (m = 1; m; m <<= 1)
32363 if ((mask & m) != 0)
32365 if ((src & k) != 0)
32366 res |= m;
32367 k <<= 1;
32369 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32371 break;
32373 case IX86_BUILTIN_PEXT32:
32374 case IX86_BUILTIN_PEXT64:
32375 gcc_assert (n_args == 2);
32376 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32378 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32379 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32380 unsigned HOST_WIDE_INT res = 0;
32381 unsigned HOST_WIDE_INT m, k = 1;
32382 for (m = 1; m; m <<= 1)
32383 if ((mask & m) != 0)
32385 if ((src & m) != 0)
32386 res |= k;
32387 k <<= 1;
32389 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32391 break;
32393 default:
32394 break;
32398 #ifdef SUBTARGET_FOLD_BUILTIN
32399 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
32400 #endif
32402 return NULL_TREE;
32405 /* Fold an MD builtin (use ix86_fold_builtin for folding into
32406 a constant) in GIMPLE. */
32408 bool
32409 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
32411 gimple *stmt = gsi_stmt (*gsi);
32412 tree fndecl = gimple_call_fndecl (stmt);
32413 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
32414 int n_args = gimple_call_num_args (stmt);
32415 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
32416 tree decl = NULL_TREE;
32417 tree arg0, arg1;
32419 switch (fn_code)
32421 case IX86_BUILTIN_TZCNT32:
32422 decl = builtin_decl_implicit (BUILT_IN_CTZ);
32423 goto fold_tzcnt_lzcnt;
32425 case IX86_BUILTIN_TZCNT64:
32426 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
32427 goto fold_tzcnt_lzcnt;
32429 case IX86_BUILTIN_LZCNT32:
32430 decl = builtin_decl_implicit (BUILT_IN_CLZ);
32431 goto fold_tzcnt_lzcnt;
32433 case IX86_BUILTIN_LZCNT64:
32434 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
32435 goto fold_tzcnt_lzcnt;
32437 fold_tzcnt_lzcnt:
32438 gcc_assert (n_args == 1);
32439 arg0 = gimple_call_arg (stmt, 0);
32440 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
32442 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
32443 /* If arg0 is provably non-zero, optimize into the generic
32444 __builtin_c[tl]z{,ll} functions, which the middle-end handles
32445 better. */
32446 if (!expr_not_equal_to (arg0, wi::zero (prec)))
32447 return false;
32449 location_t loc = gimple_location (stmt);
32450 gimple *g = gimple_build_call (decl, 1, arg0);
32451 gimple_set_location (g, loc);
32452 tree lhs = make_ssa_name (integer_type_node);
32453 gimple_call_set_lhs (g, lhs);
32454 gsi_insert_before (gsi, g, GSI_SAME_STMT);
32455 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
32456 gimple_set_location (g, loc);
32457 gsi_replace (gsi, g, false);
32458 return true;
32460 break;
32462 case IX86_BUILTIN_BZHI32:
32463 case IX86_BUILTIN_BZHI64:
32464 gcc_assert (n_args == 2);
32465 arg1 = gimple_call_arg (stmt, 1);
32466 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
32468 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
32469 arg0 = gimple_call_arg (stmt, 0);
32470 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
32471 break;
32472 location_t loc = gimple_location (stmt);
32473 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32474 gimple_set_location (g, loc);
32475 gsi_replace (gsi, g, false);
32476 return true;
32478 break;
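/* Editorial note (not from the original source): BZHI copies the source
   and clears all bits from the index given in the low byte of the second
   operand upwards; when that constant index is >= the precision of the
   first operand, nothing is cleared.  So, assuming the usual intrinsic
   spelling, _bzhi_u32 (x, 37) is folded here to a plain copy of x.  */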
32480 case IX86_BUILTIN_PDEP32:
32481 case IX86_BUILTIN_PDEP64:
32482 case IX86_BUILTIN_PEXT32:
32483 case IX86_BUILTIN_PEXT64:
32484 gcc_assert (n_args == 2);
32485 arg1 = gimple_call_arg (stmt, 1);
32486 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
32488 location_t loc = gimple_location (stmt);
32489 arg0 = gimple_call_arg (stmt, 0);
32490 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32491 gimple_set_location (g, loc);
32492 gsi_replace (gsi, g, false);
32493 return true;
32495 break;
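/* Editorial note (not from the original source): with an all-ones mask
   both PDEP and PEXT act as the identity on their first operand, so
   e.g. _pext_u32 (x, ~0U) is folded here to a plain copy of x.  */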
32497 default:
32498 break;
32501 return false;
32504 /* Make builtins to detect cpu type and features supported. NAME is
32505 the builtin name, CODE is the builtin code, and FTYPE is the function
32506 type of the builtin. */
32508 static void
32509 make_cpu_type_builtin (const char* name, int code,
32510 enum ix86_builtin_func_type ftype, bool is_const)
32512 tree decl;
32513 tree type;
32515 type = ix86_get_builtin_func_type (ftype);
32516 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32517 NULL, NULL_TREE);
32518 gcc_assert (decl != NULL_TREE);
32519 ix86_builtins[(int) code] = decl;
32520 TREE_READONLY (decl) = is_const;
32523 /* Make builtins to get CPU type and features supported. The created
32524 builtins are:
32526 __builtin_cpu_init (), to detect cpu type and features,
32527 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
32528 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>.  */
32531 static void
32532 ix86_init_platform_type_builtins (void)
32534 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
32535 INT_FTYPE_VOID, false);
32536 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
32537 INT_FTYPE_PCCHAR, true);
32538 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
32539 INT_FTYPE_PCCHAR, true);
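/* Editorial note (not from the original source): typical use of the
   builtins created above, in hypothetical user code:

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
       run_avx2_version ();
     else
       run_generic_version ();

   run_avx2_version and run_generic_version are made-up user functions;
   the string arguments are matched against the CPU and feature tables
   when the builtins are folded or expanded.  */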
32542 /* Internal method for ix86_init_builtins. */
32544 static void
32545 ix86_init_builtins_va_builtins_abi (void)
32547 tree ms_va_ref, sysv_va_ref;
32548 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
32549 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
32550 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
32551 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
32553 if (!TARGET_64BIT)
32554 return;
32555 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
32556 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
32557 ms_va_ref = build_reference_type (ms_va_list_type_node);
32558 sysv_va_ref =
32559 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
32561 fnvoid_va_end_ms =
32562 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32563 fnvoid_va_start_ms =
32564 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32565 fnvoid_va_end_sysv =
32566 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
32567 fnvoid_va_start_sysv =
32568 build_varargs_function_type_list (void_type_node, sysv_va_ref,
32569 NULL_TREE);
32570 fnvoid_va_copy_ms =
32571 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
32572 NULL_TREE);
32573 fnvoid_va_copy_sysv =
32574 build_function_type_list (void_type_node, sysv_va_ref,
32575 sysv_va_ref, NULL_TREE);
32577 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
32578 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
32579 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
32580 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
32581 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
32582 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
32583 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
32584 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32585 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
32586 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32587 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
32588 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
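/* Editorial note (not from the original source): a minimal sketch of how
   the builtins registered above are meant to be used, assuming a 64-bit
   target and hypothetical user code:

     void __attribute__ ((ms_abi)) f (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       int v = __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
     }

   The __builtin_sysv_va_* variants work the same way for functions using
   the System V calling convention.  */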
32591 static void
32592 ix86_init_builtin_types (void)
32594 tree float80_type_node, const_string_type_node;
32596 /* The __float80 type. */
32597 float80_type_node = long_double_type_node;
32598 if (TYPE_MODE (float80_type_node) != XFmode)
32600 if (float64x_type_node != NULL_TREE
32601 && TYPE_MODE (float64x_type_node) == XFmode)
32602 float80_type_node = float64x_type_node;
32603 else
32605 /* The __float80 type. */
32606 float80_type_node = make_node (REAL_TYPE);
32608 TYPE_PRECISION (float80_type_node) = 80;
32609 layout_type (float80_type_node);
32612 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
32614 /* The __float128 type. The node has already been created as
32615 _Float128, so we only need to register the __float128 name for
32616 it. */
32617 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
32619 const_string_type_node
32620 = build_pointer_type (build_qualified_type
32621 (char_type_node, TYPE_QUAL_CONST));
32623 /* This macro is built by i386-builtin-types.awk. */
32624 DEFINE_BUILTIN_PRIMITIVE_TYPES;
32627 static void
32628 ix86_init_builtins (void)
32630 tree ftype, decl;
32632 ix86_init_builtin_types ();
32634 /* Builtins to get CPU type and features. */
32635 ix86_init_platform_type_builtins ();
32637 /* TFmode support builtins. */
32638 def_builtin_const (0, "__builtin_infq",
32639 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
32640 def_builtin_const (0, "__builtin_huge_valq",
32641 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
32643 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
32644 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
32645 BUILT_IN_MD, "nanq", NULL_TREE);
32646 TREE_READONLY (decl) = 1;
32647 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
32649 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
32650 BUILT_IN_MD, "nansq", NULL_TREE);
32651 TREE_READONLY (decl) = 1;
32652 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
32654 /* We will expand them to a normal call if SSE isn't available, since
32655 they are used by libgcc. */
32656 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
32657 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
32658 BUILT_IN_MD, "__fabstf2", NULL_TREE);
32659 TREE_READONLY (decl) = 1;
32660 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
32662 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
32663 decl = add_builtin_function ("__builtin_copysignq", ftype,
32664 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
32665 "__copysigntf3", NULL_TREE);
32666 TREE_READONLY (decl) = 1;
32667 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
32669 ix86_init_tm_builtins ();
32670 ix86_init_mmx_sse_builtins ();
32671 ix86_init_mpx_builtins ();
32673 if (TARGET_LP64)
32674 ix86_init_builtins_va_builtins_abi ();
32676 #ifdef SUBTARGET_INIT_BUILTINS
32677 SUBTARGET_INIT_BUILTINS;
32678 #endif
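/* Editorial note (not from the original source): the TFmode builtins
   defined above back the documented __float128 helpers, e.g. in
   hypothetical user code:

     __float128 x = 1.0Q;
     __float128 inf = __builtin_infq ();
     __float128 y = __builtin_copysignq (__builtin_fabsq (x), -1.0Q);

   When SSE is not available, the fabsq/copysignq calls expand to the
   libgcc routines (__fabstf2, __copysigntf3) named above.  */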
32681 /* Return the ix86 builtin for CODE. */
32683 static tree
32684 ix86_builtin_decl (unsigned code, bool)
32686 if (code >= IX86_BUILTIN_MAX)
32687 return error_mark_node;
32689 return ix86_builtins[code];
32692 /* Errors in the source file can cause expand_expr to return const0_rtx
32693 where we expect a vector. To avoid crashing, use one of the vector
32694 clear instructions. */
32695 static rtx
32696 safe_vector_operand (rtx x, machine_mode mode)
32698 if (x == const0_rtx)
32699 x = CONST0_RTX (mode);
32700 return x;
32703 /* Fix up modeless constants to fit the required mode. */
32704 static rtx
32705 fixup_modeless_constant (rtx x, machine_mode mode)
32707 if (GET_MODE (x) == VOIDmode)
32708 x = convert_to_mode (mode, x, 1);
32709 return x;
32712 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
32714 static rtx
32715 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
32717 rtx pat;
32718 tree arg0 = CALL_EXPR_ARG (exp, 0);
32719 tree arg1 = CALL_EXPR_ARG (exp, 1);
32720 rtx op0 = expand_normal (arg0);
32721 rtx op1 = expand_normal (arg1);
32722 machine_mode tmode = insn_data[icode].operand[0].mode;
32723 machine_mode mode0 = insn_data[icode].operand[1].mode;
32724 machine_mode mode1 = insn_data[icode].operand[2].mode;
32726 if (VECTOR_MODE_P (mode0))
32727 op0 = safe_vector_operand (op0, mode0);
32728 if (VECTOR_MODE_P (mode1))
32729 op1 = safe_vector_operand (op1, mode1);
32731 if (optimize || !target
32732 || GET_MODE (target) != tmode
32733 || !insn_data[icode].operand[0].predicate (target, tmode))
32734 target = gen_reg_rtx (tmode);
32736 if (GET_MODE (op1) == SImode && mode1 == TImode)
32738 rtx x = gen_reg_rtx (V4SImode);
32739 emit_insn (gen_sse2_loadd (x, op1));
32740 op1 = gen_lowpart (TImode, x);
32743 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32744 op0 = copy_to_mode_reg (mode0, op0);
32745 if (!insn_data[icode].operand[2].predicate (op1, mode1))
32746 op1 = copy_to_mode_reg (mode1, op1);
32748 pat = GEN_FCN (icode) (target, op0, op1);
32749 if (! pat)
32750 return 0;
32752 emit_insn (pat);
32754 return target;
32757 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
32759 static rtx
32760 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
32761 enum ix86_builtin_func_type m_type,
32762 enum rtx_code sub_code)
32764 rtx pat;
32765 int i;
32766 int nargs;
32767 bool comparison_p = false;
32768 bool tf_p = false;
32769 bool last_arg_constant = false;
32770 int num_memory = 0;
32771 struct {
32772 rtx op;
32773 machine_mode mode;
32774 } args[4];
32776 machine_mode tmode = insn_data[icode].operand[0].mode;
32778 switch (m_type)
32780 case MULTI_ARG_4_DF2_DI_I:
32781 case MULTI_ARG_4_DF2_DI_I1:
32782 case MULTI_ARG_4_SF2_SI_I:
32783 case MULTI_ARG_4_SF2_SI_I1:
32784 nargs = 4;
32785 last_arg_constant = true;
32786 break;
32788 case MULTI_ARG_3_SF:
32789 case MULTI_ARG_3_DF:
32790 case MULTI_ARG_3_SF2:
32791 case MULTI_ARG_3_DF2:
32792 case MULTI_ARG_3_DI:
32793 case MULTI_ARG_3_SI:
32794 case MULTI_ARG_3_SI_DI:
32795 case MULTI_ARG_3_HI:
32796 case MULTI_ARG_3_HI_SI:
32797 case MULTI_ARG_3_QI:
32798 case MULTI_ARG_3_DI2:
32799 case MULTI_ARG_3_SI2:
32800 case MULTI_ARG_3_HI2:
32801 case MULTI_ARG_3_QI2:
32802 nargs = 3;
32803 break;
32805 case MULTI_ARG_2_SF:
32806 case MULTI_ARG_2_DF:
32807 case MULTI_ARG_2_DI:
32808 case MULTI_ARG_2_SI:
32809 case MULTI_ARG_2_HI:
32810 case MULTI_ARG_2_QI:
32811 nargs = 2;
32812 break;
32814 case MULTI_ARG_2_DI_IMM:
32815 case MULTI_ARG_2_SI_IMM:
32816 case MULTI_ARG_2_HI_IMM:
32817 case MULTI_ARG_2_QI_IMM:
32818 nargs = 2;
32819 last_arg_constant = true;
32820 break;
32822 case MULTI_ARG_1_SF:
32823 case MULTI_ARG_1_DF:
32824 case MULTI_ARG_1_SF2:
32825 case MULTI_ARG_1_DF2:
32826 case MULTI_ARG_1_DI:
32827 case MULTI_ARG_1_SI:
32828 case MULTI_ARG_1_HI:
32829 case MULTI_ARG_1_QI:
32830 case MULTI_ARG_1_SI_DI:
32831 case MULTI_ARG_1_HI_DI:
32832 case MULTI_ARG_1_HI_SI:
32833 case MULTI_ARG_1_QI_DI:
32834 case MULTI_ARG_1_QI_SI:
32835 case MULTI_ARG_1_QI_HI:
32836 nargs = 1;
32837 break;
32839 case MULTI_ARG_2_DI_CMP:
32840 case MULTI_ARG_2_SI_CMP:
32841 case MULTI_ARG_2_HI_CMP:
32842 case MULTI_ARG_2_QI_CMP:
32843 nargs = 2;
32844 comparison_p = true;
32845 break;
32847 case MULTI_ARG_2_SF_TF:
32848 case MULTI_ARG_2_DF_TF:
32849 case MULTI_ARG_2_DI_TF:
32850 case MULTI_ARG_2_SI_TF:
32851 case MULTI_ARG_2_HI_TF:
32852 case MULTI_ARG_2_QI_TF:
32853 nargs = 2;
32854 tf_p = true;
32855 break;
32857 default:
32858 gcc_unreachable ();
32861 if (optimize || !target
32862 || GET_MODE (target) != tmode
32863 || !insn_data[icode].operand[0].predicate (target, tmode))
32864 target = gen_reg_rtx (tmode);
32865 else if (memory_operand (target, tmode))
32866 num_memory++;
32868 gcc_assert (nargs <= 4);
32870 for (i = 0; i < nargs; i++)
32872 tree arg = CALL_EXPR_ARG (exp, i);
32873 rtx op = expand_normal (arg);
32874 int adjust = (comparison_p) ? 1 : 0;
32875 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
32877 if (last_arg_constant && i == nargs - 1)
32879 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
32881 enum insn_code new_icode = icode;
32882 switch (icode)
32884 case CODE_FOR_xop_vpermil2v2df3:
32885 case CODE_FOR_xop_vpermil2v4sf3:
32886 case CODE_FOR_xop_vpermil2v4df3:
32887 case CODE_FOR_xop_vpermil2v8sf3:
32888 error ("the last argument must be a 2-bit immediate");
32889 return gen_reg_rtx (tmode);
32890 case CODE_FOR_xop_rotlv2di3:
32891 new_icode = CODE_FOR_rotlv2di3;
32892 goto xop_rotl;
32893 case CODE_FOR_xop_rotlv4si3:
32894 new_icode = CODE_FOR_rotlv4si3;
32895 goto xop_rotl;
32896 case CODE_FOR_xop_rotlv8hi3:
32897 new_icode = CODE_FOR_rotlv8hi3;
32898 goto xop_rotl;
32899 case CODE_FOR_xop_rotlv16qi3:
32900 new_icode = CODE_FOR_rotlv16qi3;
32901 xop_rotl:
32902 if (CONST_INT_P (op))
32904 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
32905 op = GEN_INT (INTVAL (op) & mask);
32906 gcc_checking_assert
32907 (insn_data[icode].operand[i + 1].predicate (op, mode));
32909 else
32911 gcc_checking_assert
32912 (nargs == 2
32913 && insn_data[new_icode].operand[0].mode == tmode
32914 && insn_data[new_icode].operand[1].mode == tmode
32915 && insn_data[new_icode].operand[2].mode == mode
32916 && insn_data[new_icode].operand[0].predicate
32917 == insn_data[icode].operand[0].predicate
32918 && insn_data[new_icode].operand[1].predicate
32919 == insn_data[icode].operand[1].predicate);
32920 icode = new_icode;
32921 goto non_constant;
32923 break;
32924 default:
32925 gcc_unreachable ();
32929 else
32931 non_constant:
32932 if (VECTOR_MODE_P (mode))
32933 op = safe_vector_operand (op, mode);
32935 /* If we aren't optimizing, only allow one memory operand to be
32936 generated. */
32937 if (memory_operand (op, mode))
32938 num_memory++;
32940 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
32942 if (optimize
32943 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
32944 || num_memory > 1)
32945 op = force_reg (mode, op);
32948 args[i].op = op;
32949 args[i].mode = mode;
32952 switch (nargs)
32954 case 1:
32955 pat = GEN_FCN (icode) (target, args[0].op);
32956 break;
32958 case 2:
32959 if (tf_p)
32960 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
32961 GEN_INT ((int)sub_code));
32962 else if (! comparison_p)
32963 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32964 else
32966 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
32967 args[0].op,
32968 args[1].op);
32970 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
32972 break;
32974 case 3:
32975 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32976 break;
32978 case 4:
32979 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
32980 break;
32982 default:
32983 gcc_unreachable ();
32986 if (! pat)
32987 return 0;
32989 emit_insn (pat);
32990 return target;
32993 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
32994 insns with vec_merge. */
32996 static rtx
32997 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
32998 rtx target)
33000 rtx pat;
33001 tree arg0 = CALL_EXPR_ARG (exp, 0);
33002 rtx op1, op0 = expand_normal (arg0);
33003 machine_mode tmode = insn_data[icode].operand[0].mode;
33004 machine_mode mode0 = insn_data[icode].operand[1].mode;
33006 if (optimize || !target
33007 || GET_MODE (target) != tmode
33008 || !insn_data[icode].operand[0].predicate (target, tmode))
33009 target = gen_reg_rtx (tmode);
33011 if (VECTOR_MODE_P (mode0))
33012 op0 = safe_vector_operand (op0, mode0);
33014 if ((optimize && !register_operand (op0, mode0))
33015 || !insn_data[icode].operand[1].predicate (op0, mode0))
33016 op0 = copy_to_mode_reg (mode0, op0);
33018 op1 = op0;
33019 if (!insn_data[icode].operand[2].predicate (op1, mode0))
33020 op1 = copy_to_mode_reg (mode0, op1);
33022 pat = GEN_FCN (icode) (target, op0, op1);
33023 if (! pat)
33024 return 0;
33025 emit_insn (pat);
33026 return target;
33029 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
33031 static rtx
33032 ix86_expand_sse_compare (const struct builtin_description *d,
33033 tree exp, rtx target, bool swap)
33035 rtx pat;
33036 tree arg0 = CALL_EXPR_ARG (exp, 0);
33037 tree arg1 = CALL_EXPR_ARG (exp, 1);
33038 rtx op0 = expand_normal (arg0);
33039 rtx op1 = expand_normal (arg1);
33040 rtx op2;
33041 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33042 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33043 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33044 enum rtx_code comparison = d->comparison;
33046 if (VECTOR_MODE_P (mode0))
33047 op0 = safe_vector_operand (op0, mode0);
33048 if (VECTOR_MODE_P (mode1))
33049 op1 = safe_vector_operand (op1, mode1);
33051 /* Swap operands if we have a comparison that isn't available in
33052 hardware. */
33053 if (swap)
33054 std::swap (op0, op1);
33056 if (optimize || !target
33057 || GET_MODE (target) != tmode
33058 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33059 target = gen_reg_rtx (tmode);
33061 if ((optimize && !register_operand (op0, mode0))
33062 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33063 op0 = copy_to_mode_reg (mode0, op0);
33064 if ((optimize && !register_operand (op1, mode1))
33065 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33066 op1 = copy_to_mode_reg (mode1, op1);
33068 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33069 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33070 if (! pat)
33071 return 0;
33072 emit_insn (pat);
33073 return target;
33076 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
33078 static rtx
33079 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33080 rtx target)
33082 rtx pat;
33083 tree arg0 = CALL_EXPR_ARG (exp, 0);
33084 tree arg1 = CALL_EXPR_ARG (exp, 1);
33085 rtx op0 = expand_normal (arg0);
33086 rtx op1 = expand_normal (arg1);
33087 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33088 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33089 enum rtx_code comparison = d->comparison;
33091 if (VECTOR_MODE_P (mode0))
33092 op0 = safe_vector_operand (op0, mode0);
33093 if (VECTOR_MODE_P (mode1))
33094 op1 = safe_vector_operand (op1, mode1);
33096 /* Swap operands if we have a comparison that isn't available in
33097 hardware. */
33098 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33099 std::swap (op0, op1);
33101 target = gen_reg_rtx (SImode);
33102 emit_move_insn (target, const0_rtx);
33103 target = gen_rtx_SUBREG (QImode, target, 0);
33105 if ((optimize && !register_operand (op0, mode0))
33106 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33107 op0 = copy_to_mode_reg (mode0, op0);
33108 if ((optimize && !register_operand (op1, mode1))
33109 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33110 op1 = copy_to_mode_reg (mode1, op1);
33112 pat = GEN_FCN (d->icode) (op0, op1);
33113 if (! pat)
33114 return 0;
33115 emit_insn (pat);
33116 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33117 gen_rtx_fmt_ee (comparison, QImode,
33118 SET_DEST (pat),
33119 const0_rtx)));
33121 return SUBREG_REG (target);
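/* Editorial note (not from the original source): for a comi-class builtin
   such as __builtin_ia32_comieq_ss (illustrative; any descriptor routed
   here behaves the same), the pattern emitted above only sets the flags
   register; the result is then materialized by zeroing an SImode pseudo
   and setting just its low byte from the flag test via STRICT_LOW_PART,
   so the returned SImode register holds 0 or 1.  */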
33124 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
33126 static rtx
33127 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33128 rtx target)
33130 rtx pat;
33131 tree arg0 = CALL_EXPR_ARG (exp, 0);
33132 rtx op1, op0 = expand_normal (arg0);
33133 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33134 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33136 if (optimize || target == 0
33137 || GET_MODE (target) != tmode
33138 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33139 target = gen_reg_rtx (tmode);
33141 if (VECTOR_MODE_P (mode0))
33142 op0 = safe_vector_operand (op0, mode0);
33144 if ((optimize && !register_operand (op0, mode0))
33145 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33146 op0 = copy_to_mode_reg (mode0, op0);
33148 op1 = GEN_INT (d->comparison);
33150 pat = GEN_FCN (d->icode) (target, op0, op1);
33151 if (! pat)
33152 return 0;
33153 emit_insn (pat);
33154 return target;
33157 static rtx
33158 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
33159 tree exp, rtx target)
33161 rtx pat;
33162 tree arg0 = CALL_EXPR_ARG (exp, 0);
33163 tree arg1 = CALL_EXPR_ARG (exp, 1);
33164 rtx op0 = expand_normal (arg0);
33165 rtx op1 = expand_normal (arg1);
33166 rtx op2;
33167 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33168 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33169 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33171 if (optimize || target == 0
33172 || GET_MODE (target) != tmode
33173 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33174 target = gen_reg_rtx (tmode);
33176 op0 = safe_vector_operand (op0, mode0);
33177 op1 = safe_vector_operand (op1, mode1);
33179 if ((optimize && !register_operand (op0, mode0))
33180 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33181 op0 = copy_to_mode_reg (mode0, op0);
33182 if ((optimize && !register_operand (op1, mode1))
33183 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33184 op1 = copy_to_mode_reg (mode1, op1);
33186 op2 = GEN_INT (d->comparison);
33188 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33189 if (! pat)
33190 return 0;
33191 emit_insn (pat);
33192 return target;
33195 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
33197 static rtx
33198 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
33199 rtx target)
33201 rtx pat;
33202 tree arg0 = CALL_EXPR_ARG (exp, 0);
33203 tree arg1 = CALL_EXPR_ARG (exp, 1);
33204 rtx op0 = expand_normal (arg0);
33205 rtx op1 = expand_normal (arg1);
33206 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33207 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33208 enum rtx_code comparison = d->comparison;
33210 if (VECTOR_MODE_P (mode0))
33211 op0 = safe_vector_operand (op0, mode0);
33212 if (VECTOR_MODE_P (mode1))
33213 op1 = safe_vector_operand (op1, mode1);
33215 target = gen_reg_rtx (SImode);
33216 emit_move_insn (target, const0_rtx);
33217 target = gen_rtx_SUBREG (QImode, target, 0);
33219 if ((optimize && !register_operand (op0, mode0))
33220 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33221 op0 = copy_to_mode_reg (mode0, op0);
33222 if ((optimize && !register_operand (op1, mode1))
33223 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33224 op1 = copy_to_mode_reg (mode1, op1);
33226 pat = GEN_FCN (d->icode) (op0, op1);
33227 if (! pat)
33228 return 0;
33229 emit_insn (pat);
33230 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33231 gen_rtx_fmt_ee (comparison, QImode,
33232 SET_DEST (pat),
33233 const0_rtx)));
33235 return SUBREG_REG (target);
33238 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
33240 static rtx
33241 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
33242 tree exp, rtx target)
33244 rtx pat;
33245 tree arg0 = CALL_EXPR_ARG (exp, 0);
33246 tree arg1 = CALL_EXPR_ARG (exp, 1);
33247 tree arg2 = CALL_EXPR_ARG (exp, 2);
33248 tree arg3 = CALL_EXPR_ARG (exp, 3);
33249 tree arg4 = CALL_EXPR_ARG (exp, 4);
33250 rtx scratch0, scratch1;
33251 rtx op0 = expand_normal (arg0);
33252 rtx op1 = expand_normal (arg1);
33253 rtx op2 = expand_normal (arg2);
33254 rtx op3 = expand_normal (arg3);
33255 rtx op4 = expand_normal (arg4);
33256 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
33258 tmode0 = insn_data[d->icode].operand[0].mode;
33259 tmode1 = insn_data[d->icode].operand[1].mode;
33260 modev2 = insn_data[d->icode].operand[2].mode;
33261 modei3 = insn_data[d->icode].operand[3].mode;
33262 modev4 = insn_data[d->icode].operand[4].mode;
33263 modei5 = insn_data[d->icode].operand[5].mode;
33264 modeimm = insn_data[d->icode].operand[6].mode;
33266 if (VECTOR_MODE_P (modev2))
33267 op0 = safe_vector_operand (op0, modev2);
33268 if (VECTOR_MODE_P (modev4))
33269 op2 = safe_vector_operand (op2, modev4);
33271 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33272 op0 = copy_to_mode_reg (modev2, op0);
33273 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
33274 op1 = copy_to_mode_reg (modei3, op1);
33275 if ((optimize && !register_operand (op2, modev4))
33276 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
33277 op2 = copy_to_mode_reg (modev4, op2);
33278 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
33279 op3 = copy_to_mode_reg (modei5, op3);
33281 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
33283 error ("the fifth argument must be an 8-bit immediate");
33284 return const0_rtx;
33287 if (d->code == IX86_BUILTIN_PCMPESTRI128)
33289 if (optimize || !target
33290 || GET_MODE (target) != tmode0
33291 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33292 target = gen_reg_rtx (tmode0);
33294 scratch1 = gen_reg_rtx (tmode1);
33296 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
33298 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
33300 if (optimize || !target
33301 || GET_MODE (target) != tmode1
33302 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33303 target = gen_reg_rtx (tmode1);
33305 scratch0 = gen_reg_rtx (tmode0);
33307 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
33309 else
33311 gcc_assert (d->flag);
33313 scratch0 = gen_reg_rtx (tmode0);
33314 scratch1 = gen_reg_rtx (tmode1);
33316 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
33319 if (! pat)
33320 return 0;
33322 emit_insn (pat);
33324 if (d->flag)
33326 target = gen_reg_rtx (SImode);
33327 emit_move_insn (target, const0_rtx);
33328 target = gen_rtx_SUBREG (QImode, target, 0);
33330 emit_insn
33331 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33332 gen_rtx_fmt_ee (EQ, QImode,
33333 gen_rtx_REG ((machine_mode) d->flag,
33334 FLAGS_REG),
33335 const0_rtx)));
33336 return SUBREG_REG (target);
33338 else
33339 return target;
33343 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
33345 static rtx
33346 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
33347 tree exp, rtx target)
33349 rtx pat;
33350 tree arg0 = CALL_EXPR_ARG (exp, 0);
33351 tree arg1 = CALL_EXPR_ARG (exp, 1);
33352 tree arg2 = CALL_EXPR_ARG (exp, 2);
33353 rtx scratch0, scratch1;
33354 rtx op0 = expand_normal (arg0);
33355 rtx op1 = expand_normal (arg1);
33356 rtx op2 = expand_normal (arg2);
33357 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
33359 tmode0 = insn_data[d->icode].operand[0].mode;
33360 tmode1 = insn_data[d->icode].operand[1].mode;
33361 modev2 = insn_data[d->icode].operand[2].mode;
33362 modev3 = insn_data[d->icode].operand[3].mode;
33363 modeimm = insn_data[d->icode].operand[4].mode;
33365 if (VECTOR_MODE_P (modev2))
33366 op0 = safe_vector_operand (op0, modev2);
33367 if (VECTOR_MODE_P (modev3))
33368 op1 = safe_vector_operand (op1, modev3);
33370 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33371 op0 = copy_to_mode_reg (modev2, op0);
33372 if ((optimize && !register_operand (op1, modev3))
33373 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
33374 op1 = copy_to_mode_reg (modev3, op1);
33376 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
33378 error ("the third argument must be an 8-bit immediate");
33379 return const0_rtx;
33382 if (d->code == IX86_BUILTIN_PCMPISTRI128)
33384 if (optimize || !target
33385 || GET_MODE (target) != tmode0
33386 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33387 target = gen_reg_rtx (tmode0);
33389 scratch1 = gen_reg_rtx (tmode1);
33391 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
33393 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
33395 if (optimize || !target
33396 || GET_MODE (target) != tmode1
33397 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33398 target = gen_reg_rtx (tmode1);
33400 scratch0 = gen_reg_rtx (tmode0);
33402 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
33404 else
33406 gcc_assert (d->flag);
33408 scratch0 = gen_reg_rtx (tmode0);
33409 scratch1 = gen_reg_rtx (tmode1);
33411 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
33414 if (! pat)
33415 return 0;
33417 emit_insn (pat);
33419 if (d->flag)
33421 target = gen_reg_rtx (SImode);
33422 emit_move_insn (target, const0_rtx);
33423 target = gen_rtx_SUBREG (QImode, target, 0);
33425 emit_insn
33426 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33427 gen_rtx_fmt_ee (EQ, QImode,
33428 gen_rtx_REG ((machine_mode) d->flag,
33429 FLAGS_REG),
33430 const0_rtx)));
33431 return SUBREG_REG (target);
33433 else
33434 return target;
33437 /* Subroutine of ix86_expand_builtin to take care of insns with a
33438 variable number of operands. */
33440 static rtx
33441 ix86_expand_args_builtin (const struct builtin_description *d,
33442 tree exp, rtx target)
33444 rtx pat, real_target;
33445 unsigned int i, nargs;
33446 unsigned int nargs_constant = 0;
33447 unsigned int mask_pos = 0;
33448 int num_memory = 0;
33449 struct
33451 rtx op;
33452 machine_mode mode;
33453 } args[6];
33454 bool second_arg_count = false;
33455 enum insn_code icode = d->icode;
33456 const struct insn_data_d *insn_p = &insn_data[icode];
33457 machine_mode tmode = insn_p->operand[0].mode;
33458 machine_mode rmode = VOIDmode;
33459 bool swap = false;
33460 enum rtx_code comparison = d->comparison;
33462 switch ((enum ix86_builtin_func_type) d->flag)
33464 case V2DF_FTYPE_V2DF_ROUND:
33465 case V4DF_FTYPE_V4DF_ROUND:
33466 case V8DF_FTYPE_V8DF_ROUND:
33467 case V4SF_FTYPE_V4SF_ROUND:
33468 case V8SF_FTYPE_V8SF_ROUND:
33469 case V16SF_FTYPE_V16SF_ROUND:
33470 case V4SI_FTYPE_V4SF_ROUND:
33471 case V8SI_FTYPE_V8SF_ROUND:
33472 case V16SI_FTYPE_V16SF_ROUND:
33473 return ix86_expand_sse_round (d, exp, target);
33474 case V4SI_FTYPE_V2DF_V2DF_ROUND:
33475 case V8SI_FTYPE_V4DF_V4DF_ROUND:
33476 case V16SI_FTYPE_V8DF_V8DF_ROUND:
33477 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
33478 case INT_FTYPE_V8SF_V8SF_PTEST:
33479 case INT_FTYPE_V4DI_V4DI_PTEST:
33480 case INT_FTYPE_V4DF_V4DF_PTEST:
33481 case INT_FTYPE_V4SF_V4SF_PTEST:
33482 case INT_FTYPE_V2DI_V2DI_PTEST:
33483 case INT_FTYPE_V2DF_V2DF_PTEST:
33484 return ix86_expand_sse_ptest (d, exp, target);
33485 case FLOAT128_FTYPE_FLOAT128:
33486 case FLOAT_FTYPE_FLOAT:
33487 case INT_FTYPE_INT:
33488 case UINT_FTYPE_UINT:
33489 case UINT16_FTYPE_UINT16:
33490 case UINT64_FTYPE_INT:
33491 case UINT64_FTYPE_UINT64:
33492 case INT64_FTYPE_INT64:
33493 case INT64_FTYPE_V4SF:
33494 case INT64_FTYPE_V2DF:
33495 case INT_FTYPE_V16QI:
33496 case INT_FTYPE_V8QI:
33497 case INT_FTYPE_V8SF:
33498 case INT_FTYPE_V4DF:
33499 case INT_FTYPE_V4SF:
33500 case INT_FTYPE_V2DF:
33501 case INT_FTYPE_V32QI:
33502 case V16QI_FTYPE_V16QI:
33503 case V8SI_FTYPE_V8SF:
33504 case V8SI_FTYPE_V4SI:
33505 case V8HI_FTYPE_V8HI:
33506 case V8HI_FTYPE_V16QI:
33507 case V8QI_FTYPE_V8QI:
33508 case V8SF_FTYPE_V8SF:
33509 case V8SF_FTYPE_V8SI:
33510 case V8SF_FTYPE_V4SF:
33511 case V8SF_FTYPE_V8HI:
33512 case V4SI_FTYPE_V4SI:
33513 case V4SI_FTYPE_V16QI:
33514 case V4SI_FTYPE_V4SF:
33515 case V4SI_FTYPE_V8SI:
33516 case V4SI_FTYPE_V8HI:
33517 case V4SI_FTYPE_V4DF:
33518 case V4SI_FTYPE_V2DF:
33519 case V4HI_FTYPE_V4HI:
33520 case V4DF_FTYPE_V4DF:
33521 case V4DF_FTYPE_V4SI:
33522 case V4DF_FTYPE_V4SF:
33523 case V4DF_FTYPE_V2DF:
33524 case V4SF_FTYPE_V4SF:
33525 case V4SF_FTYPE_V4SI:
33526 case V4SF_FTYPE_V8SF:
33527 case V4SF_FTYPE_V4DF:
33528 case V4SF_FTYPE_V8HI:
33529 case V4SF_FTYPE_V2DF:
33530 case V2DI_FTYPE_V2DI:
33531 case V2DI_FTYPE_V16QI:
33532 case V2DI_FTYPE_V8HI:
33533 case V2DI_FTYPE_V4SI:
33534 case V2DF_FTYPE_V2DF:
33535 case V2DF_FTYPE_V4SI:
33536 case V2DF_FTYPE_V4DF:
33537 case V2DF_FTYPE_V4SF:
33538 case V2DF_FTYPE_V2SI:
33539 case V2SI_FTYPE_V2SI:
33540 case V2SI_FTYPE_V4SF:
33541 case V2SI_FTYPE_V2SF:
33542 case V2SI_FTYPE_V2DF:
33543 case V2SF_FTYPE_V2SF:
33544 case V2SF_FTYPE_V2SI:
33545 case V32QI_FTYPE_V32QI:
33546 case V32QI_FTYPE_V16QI:
33547 case V16HI_FTYPE_V16HI:
33548 case V16HI_FTYPE_V8HI:
33549 case V8SI_FTYPE_V8SI:
33550 case V16HI_FTYPE_V16QI:
33551 case V8SI_FTYPE_V16QI:
33552 case V4DI_FTYPE_V16QI:
33553 case V8SI_FTYPE_V8HI:
33554 case V4DI_FTYPE_V8HI:
33555 case V4DI_FTYPE_V4SI:
33556 case V4DI_FTYPE_V2DI:
33557 case UQI_FTYPE_UQI:
33558 case UHI_FTYPE_UHI:
33559 case USI_FTYPE_USI:
33560 case USI_FTYPE_UQI:
33561 case USI_FTYPE_UHI:
33562 case UDI_FTYPE_UDI:
33563 case UHI_FTYPE_V16QI:
33564 case USI_FTYPE_V32QI:
33565 case UDI_FTYPE_V64QI:
33566 case V16QI_FTYPE_UHI:
33567 case V32QI_FTYPE_USI:
33568 case V64QI_FTYPE_UDI:
33569 case V8HI_FTYPE_UQI:
33570 case V16HI_FTYPE_UHI:
33571 case V32HI_FTYPE_USI:
33572 case V4SI_FTYPE_UQI:
33573 case V8SI_FTYPE_UQI:
33574 case V4SI_FTYPE_UHI:
33575 case V8SI_FTYPE_UHI:
33576 case UQI_FTYPE_V8HI:
33577 case UHI_FTYPE_V16HI:
33578 case USI_FTYPE_V32HI:
33579 case UQI_FTYPE_V4SI:
33580 case UQI_FTYPE_V8SI:
33581 case UHI_FTYPE_V16SI:
33582 case UQI_FTYPE_V2DI:
33583 case UQI_FTYPE_V4DI:
33584 case UQI_FTYPE_V8DI:
33585 case V16SI_FTYPE_UHI:
33586 case V2DI_FTYPE_UQI:
33587 case V4DI_FTYPE_UQI:
33588 case V16SI_FTYPE_INT:
33589 case V16SF_FTYPE_V8SF:
33590 case V16SI_FTYPE_V8SI:
33591 case V16SF_FTYPE_V4SF:
33592 case V16SI_FTYPE_V4SI:
33593 case V16SI_FTYPE_V16SF:
33594 case V16SI_FTYPE_V16SI:
33595 case V64QI_FTYPE_V64QI:
33596 case V32HI_FTYPE_V32HI:
33597 case V16SF_FTYPE_V16SF:
33598 case V8DI_FTYPE_UQI:
33599 case V8DI_FTYPE_V8DI:
33600 case V8DF_FTYPE_V4DF:
33601 case V8DF_FTYPE_V2DF:
33602 case V8DF_FTYPE_V8DF:
33603 case V4DI_FTYPE_V4DI:
33604 nargs = 1;
33605 break;
33606 case V4SF_FTYPE_V4SF_VEC_MERGE:
33607 case V2DF_FTYPE_V2DF_VEC_MERGE:
33608 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
33609 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
33610 case V16QI_FTYPE_V16QI_V16QI:
33611 case V16QI_FTYPE_V8HI_V8HI:
33612 case V16SF_FTYPE_V16SF_V16SF:
33613 case V8QI_FTYPE_V8QI_V8QI:
33614 case V8QI_FTYPE_V4HI_V4HI:
33615 case V8HI_FTYPE_V8HI_V8HI:
33616 case V8HI_FTYPE_V16QI_V16QI:
33617 case V8HI_FTYPE_V4SI_V4SI:
33618 case V8SF_FTYPE_V8SF_V8SF:
33619 case V8SF_FTYPE_V8SF_V8SI:
33620 case V8DF_FTYPE_V8DF_V8DF:
33621 case V4SI_FTYPE_V4SI_V4SI:
33622 case V4SI_FTYPE_V8HI_V8HI:
33623 case V4SI_FTYPE_V2DF_V2DF:
33624 case V4HI_FTYPE_V4HI_V4HI:
33625 case V4HI_FTYPE_V8QI_V8QI:
33626 case V4HI_FTYPE_V2SI_V2SI:
33627 case V4DF_FTYPE_V4DF_V4DF:
33628 case V4DF_FTYPE_V4DF_V4DI:
33629 case V4SF_FTYPE_V4SF_V4SF:
33630 case V4SF_FTYPE_V4SF_V4SI:
33631 case V4SF_FTYPE_V4SF_V2SI:
33632 case V4SF_FTYPE_V4SF_V2DF:
33633 case V4SF_FTYPE_V4SF_UINT:
33634 case V4SF_FTYPE_V4SF_DI:
33635 case V4SF_FTYPE_V4SF_SI:
33636 case V2DI_FTYPE_V2DI_V2DI:
33637 case V2DI_FTYPE_V16QI_V16QI:
33638 case V2DI_FTYPE_V4SI_V4SI:
33639 case V2DI_FTYPE_V2DI_V16QI:
33640 case V2SI_FTYPE_V2SI_V2SI:
33641 case V2SI_FTYPE_V4HI_V4HI:
33642 case V2SI_FTYPE_V2SF_V2SF:
33643 case V2DF_FTYPE_V2DF_V2DF:
33644 case V2DF_FTYPE_V2DF_V4SF:
33645 case V2DF_FTYPE_V2DF_V2DI:
33646 case V2DF_FTYPE_V2DF_DI:
33647 case V2DF_FTYPE_V2DF_SI:
33648 case V2DF_FTYPE_V2DF_UINT:
33649 case V2SF_FTYPE_V2SF_V2SF:
33650 case V1DI_FTYPE_V1DI_V1DI:
33651 case V1DI_FTYPE_V8QI_V8QI:
33652 case V1DI_FTYPE_V2SI_V2SI:
33653 case V32QI_FTYPE_V16HI_V16HI:
33654 case V16HI_FTYPE_V8SI_V8SI:
33655 case V64QI_FTYPE_V64QI_V64QI:
33656 case V32QI_FTYPE_V32QI_V32QI:
33657 case V16HI_FTYPE_V32QI_V32QI:
33658 case V16HI_FTYPE_V16HI_V16HI:
33659 case V8SI_FTYPE_V4DF_V4DF:
33660 case V8SI_FTYPE_V8SI_V8SI:
33661 case V8SI_FTYPE_V16HI_V16HI:
33662 case V4DI_FTYPE_V4DI_V4DI:
33663 case V4DI_FTYPE_V8SI_V8SI:
33664 case V8DI_FTYPE_V64QI_V64QI:
33665 if (comparison == UNKNOWN)
33666 return ix86_expand_binop_builtin (icode, exp, target);
33667 nargs = 2;
33668 break;
33669 case V4SF_FTYPE_V4SF_V4SF_SWAP:
33670 case V2DF_FTYPE_V2DF_V2DF_SWAP:
33671 gcc_assert (comparison != UNKNOWN);
33672 nargs = 2;
33673 swap = true;
33674 break;
33675 case V16HI_FTYPE_V16HI_V8HI_COUNT:
33676 case V16HI_FTYPE_V16HI_SI_COUNT:
33677 case V8SI_FTYPE_V8SI_V4SI_COUNT:
33678 case V8SI_FTYPE_V8SI_SI_COUNT:
33679 case V4DI_FTYPE_V4DI_V2DI_COUNT:
33680 case V4DI_FTYPE_V4DI_INT_COUNT:
33681 case V8HI_FTYPE_V8HI_V8HI_COUNT:
33682 case V8HI_FTYPE_V8HI_SI_COUNT:
33683 case V4SI_FTYPE_V4SI_V4SI_COUNT:
33684 case V4SI_FTYPE_V4SI_SI_COUNT:
33685 case V4HI_FTYPE_V4HI_V4HI_COUNT:
33686 case V4HI_FTYPE_V4HI_SI_COUNT:
33687 case V2DI_FTYPE_V2DI_V2DI_COUNT:
33688 case V2DI_FTYPE_V2DI_SI_COUNT:
33689 case V2SI_FTYPE_V2SI_V2SI_COUNT:
33690 case V2SI_FTYPE_V2SI_SI_COUNT:
33691 case V1DI_FTYPE_V1DI_V1DI_COUNT:
33692 case V1DI_FTYPE_V1DI_SI_COUNT:
33693 nargs = 2;
33694 second_arg_count = true;
33695 break;
33696 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
33697 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
33698 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
33699 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
33700 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
33701 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
33702 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
33703 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
33704 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
33705 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
33706 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
33707 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
33708 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
33709 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
33710 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
33711 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
33712 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
33713 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
33714 nargs = 4;
33715 second_arg_count = true;
33716 break;
33717 case UINT64_FTYPE_UINT64_UINT64:
33718 case UINT_FTYPE_UINT_UINT:
33719 case UINT_FTYPE_UINT_USHORT:
33720 case UINT_FTYPE_UINT_UCHAR:
33721 case UINT16_FTYPE_UINT16_INT:
33722 case UINT8_FTYPE_UINT8_INT:
33723 case UQI_FTYPE_UQI_UQI:
33724 case UHI_FTYPE_UHI_UHI:
33725 case USI_FTYPE_USI_USI:
33726 case UDI_FTYPE_UDI_UDI:
33727 case V16SI_FTYPE_V8DF_V8DF:
33728 nargs = 2;
33729 break;
33730 case V2DI_FTYPE_V2DI_INT_CONVERT:
33731 nargs = 2;
33732 rmode = V1TImode;
33733 nargs_constant = 1;
33734 break;
33735 case V4DI_FTYPE_V4DI_INT_CONVERT:
33736 nargs = 2;
33737 rmode = V2TImode;
33738 nargs_constant = 1;
33739 break;
33740 case V8DI_FTYPE_V8DI_INT_CONVERT:
33741 nargs = 2;
33742 rmode = V4TImode;
33743 nargs_constant = 1;
33744 break;
33745 case V8HI_FTYPE_V8HI_INT:
33746 case V8HI_FTYPE_V8SF_INT:
33747 case V16HI_FTYPE_V16SF_INT:
33748 case V8HI_FTYPE_V4SF_INT:
33749 case V8SF_FTYPE_V8SF_INT:
33750 case V4SF_FTYPE_V16SF_INT:
33751 case V16SF_FTYPE_V16SF_INT:
33752 case V4SI_FTYPE_V4SI_INT:
33753 case V4SI_FTYPE_V8SI_INT:
33754 case V4HI_FTYPE_V4HI_INT:
33755 case V4DF_FTYPE_V4DF_INT:
33756 case V4DF_FTYPE_V8DF_INT:
33757 case V4SF_FTYPE_V4SF_INT:
33758 case V4SF_FTYPE_V8SF_INT:
33759 case V2DI_FTYPE_V2DI_INT:
33760 case V2DF_FTYPE_V2DF_INT:
33761 case V2DF_FTYPE_V4DF_INT:
33762 case V16HI_FTYPE_V16HI_INT:
33763 case V8SI_FTYPE_V8SI_INT:
33764 case V16SI_FTYPE_V16SI_INT:
33765 case V4SI_FTYPE_V16SI_INT:
33766 case V4DI_FTYPE_V4DI_INT:
33767 case V2DI_FTYPE_V4DI_INT:
33768 case V4DI_FTYPE_V8DI_INT:
33769 case QI_FTYPE_V4SF_INT:
33770 case QI_FTYPE_V2DF_INT:
33771 case UQI_FTYPE_UQI_UQI_CONST:
33772 case UHI_FTYPE_UHI_UQI:
33773 case USI_FTYPE_USI_UQI:
33774 case UDI_FTYPE_UDI_UQI:
33775 nargs = 2;
33776 nargs_constant = 1;
33777 break;
33778 case V16QI_FTYPE_V16QI_V16QI_V16QI:
33779 case V8SF_FTYPE_V8SF_V8SF_V8SF:
33780 case V4DF_FTYPE_V4DF_V4DF_V4DF:
33781 case V4SF_FTYPE_V4SF_V4SF_V4SF:
33782 case V2DF_FTYPE_V2DF_V2DF_V2DF:
33783 case V32QI_FTYPE_V32QI_V32QI_V32QI:
33784 case UHI_FTYPE_V16SI_V16SI_UHI:
33785 case UQI_FTYPE_V8DI_V8DI_UQI:
33786 case V16HI_FTYPE_V16SI_V16HI_UHI:
33787 case V16QI_FTYPE_V16SI_V16QI_UHI:
33788 case V16QI_FTYPE_V8DI_V16QI_UQI:
33789 case V16SF_FTYPE_V16SF_V16SF_UHI:
33790 case V16SF_FTYPE_V4SF_V16SF_UHI:
33791 case V16SI_FTYPE_SI_V16SI_UHI:
33792 case V16SI_FTYPE_V16HI_V16SI_UHI:
33793 case V16SI_FTYPE_V16QI_V16SI_UHI:
33794 case V8SF_FTYPE_V4SF_V8SF_UQI:
33795 case V4DF_FTYPE_V2DF_V4DF_UQI:
33796 case V8SI_FTYPE_V4SI_V8SI_UQI:
33797 case V8SI_FTYPE_SI_V8SI_UQI:
33798 case V4SI_FTYPE_V4SI_V4SI_UQI:
33799 case V4SI_FTYPE_SI_V4SI_UQI:
33800 case V4DI_FTYPE_V2DI_V4DI_UQI:
33801 case V4DI_FTYPE_DI_V4DI_UQI:
33802 case V2DI_FTYPE_V2DI_V2DI_UQI:
33803 case V2DI_FTYPE_DI_V2DI_UQI:
33804 case V64QI_FTYPE_V64QI_V64QI_UDI:
33805 case V64QI_FTYPE_V16QI_V64QI_UDI:
33806 case V64QI_FTYPE_QI_V64QI_UDI:
33807 case V32QI_FTYPE_V32QI_V32QI_USI:
33808 case V32QI_FTYPE_V16QI_V32QI_USI:
33809 case V32QI_FTYPE_QI_V32QI_USI:
33810 case V16QI_FTYPE_V16QI_V16QI_UHI:
33811 case V16QI_FTYPE_QI_V16QI_UHI:
33812 case V32HI_FTYPE_V8HI_V32HI_USI:
33813 case V32HI_FTYPE_HI_V32HI_USI:
33814 case V16HI_FTYPE_V8HI_V16HI_UHI:
33815 case V16HI_FTYPE_HI_V16HI_UHI:
33816 case V8HI_FTYPE_V8HI_V8HI_UQI:
33817 case V8HI_FTYPE_HI_V8HI_UQI:
33818 case V8SF_FTYPE_V8HI_V8SF_UQI:
33819 case V4SF_FTYPE_V8HI_V4SF_UQI:
33820 case V8SI_FTYPE_V8SF_V8SI_UQI:
33821 case V4SI_FTYPE_V4SF_V4SI_UQI:
33822 case V4DI_FTYPE_V4SF_V4DI_UQI:
33823 case V2DI_FTYPE_V4SF_V2DI_UQI:
33824 case V4SF_FTYPE_V4DI_V4SF_UQI:
33825 case V4SF_FTYPE_V2DI_V4SF_UQI:
33826 case V4DF_FTYPE_V4DI_V4DF_UQI:
33827 case V2DF_FTYPE_V2DI_V2DF_UQI:
33828 case V16QI_FTYPE_V8HI_V16QI_UQI:
33829 case V16QI_FTYPE_V16HI_V16QI_UHI:
33830 case V16QI_FTYPE_V4SI_V16QI_UQI:
33831 case V16QI_FTYPE_V8SI_V16QI_UQI:
33832 case V8HI_FTYPE_V4SI_V8HI_UQI:
33833 case V8HI_FTYPE_V8SI_V8HI_UQI:
33834 case V16QI_FTYPE_V2DI_V16QI_UQI:
33835 case V16QI_FTYPE_V4DI_V16QI_UQI:
33836 case V8HI_FTYPE_V2DI_V8HI_UQI:
33837 case V8HI_FTYPE_V4DI_V8HI_UQI:
33838 case V4SI_FTYPE_V2DI_V4SI_UQI:
33839 case V4SI_FTYPE_V4DI_V4SI_UQI:
33840 case V32QI_FTYPE_V32HI_V32QI_USI:
33841 case UHI_FTYPE_V16QI_V16QI_UHI:
33842 case USI_FTYPE_V32QI_V32QI_USI:
33843 case UDI_FTYPE_V64QI_V64QI_UDI:
33844 case UQI_FTYPE_V8HI_V8HI_UQI:
33845 case UHI_FTYPE_V16HI_V16HI_UHI:
33846 case USI_FTYPE_V32HI_V32HI_USI:
33847 case UQI_FTYPE_V4SI_V4SI_UQI:
33848 case UQI_FTYPE_V8SI_V8SI_UQI:
33849 case UQI_FTYPE_V2DI_V2DI_UQI:
33850 case UQI_FTYPE_V4DI_V4DI_UQI:
33851 case V4SF_FTYPE_V2DF_V4SF_UQI:
33852 case V4SF_FTYPE_V4DF_V4SF_UQI:
33853 case V16SI_FTYPE_V16SI_V16SI_UHI:
33854 case V16SI_FTYPE_V4SI_V16SI_UHI:
33855 case V2DI_FTYPE_V4SI_V2DI_UQI:
33856 case V2DI_FTYPE_V8HI_V2DI_UQI:
33857 case V2DI_FTYPE_V16QI_V2DI_UQI:
33858 case V4DI_FTYPE_V4DI_V4DI_UQI:
33859 case V4DI_FTYPE_V4SI_V4DI_UQI:
33860 case V4DI_FTYPE_V8HI_V4DI_UQI:
33861 case V4DI_FTYPE_V16QI_V4DI_UQI:
33862 case V4DI_FTYPE_V4DF_V4DI_UQI:
33863 case V2DI_FTYPE_V2DF_V2DI_UQI:
33864 case V4SI_FTYPE_V4DF_V4SI_UQI:
33865 case V4SI_FTYPE_V2DF_V4SI_UQI:
33866 case V4SI_FTYPE_V8HI_V4SI_UQI:
33867 case V4SI_FTYPE_V16QI_V4SI_UQI:
33868 case V4DI_FTYPE_V4DI_V4DI_V4DI:
33869 case V8DF_FTYPE_V2DF_V8DF_UQI:
33870 case V8DF_FTYPE_V4DF_V8DF_UQI:
33871 case V8DF_FTYPE_V8DF_V8DF_UQI:
33872 case V8SF_FTYPE_V8SF_V8SF_UQI:
33873 case V8SF_FTYPE_V8SI_V8SF_UQI:
33874 case V4DF_FTYPE_V4DF_V4DF_UQI:
33875 case V4SF_FTYPE_V4SF_V4SF_UQI:
33876 case V2DF_FTYPE_V2DF_V2DF_UQI:
33877 case V2DF_FTYPE_V4SF_V2DF_UQI:
33878 case V2DF_FTYPE_V4SI_V2DF_UQI:
33879 case V4SF_FTYPE_V4SI_V4SF_UQI:
33880 case V4DF_FTYPE_V4SF_V4DF_UQI:
33881 case V4DF_FTYPE_V4SI_V4DF_UQI:
33882 case V8SI_FTYPE_V8SI_V8SI_UQI:
33883 case V8SI_FTYPE_V8HI_V8SI_UQI:
33884 case V8SI_FTYPE_V16QI_V8SI_UQI:
33885 case V8DF_FTYPE_V8SI_V8DF_UQI:
33886 case V8DI_FTYPE_DI_V8DI_UQI:
33887 case V16SF_FTYPE_V8SF_V16SF_UHI:
33888 case V16SI_FTYPE_V8SI_V16SI_UHI:
33889 case V16HI_FTYPE_V16HI_V16HI_UHI:
33890 case V8HI_FTYPE_V16QI_V8HI_UQI:
33891 case V16HI_FTYPE_V16QI_V16HI_UHI:
33892 case V32HI_FTYPE_V32HI_V32HI_USI:
33893 case V32HI_FTYPE_V32QI_V32HI_USI:
33894 case V8DI_FTYPE_V16QI_V8DI_UQI:
33895 case V8DI_FTYPE_V2DI_V8DI_UQI:
33896 case V8DI_FTYPE_V4DI_V8DI_UQI:
33897 case V8DI_FTYPE_V8DI_V8DI_UQI:
33898 case V8DI_FTYPE_V8HI_V8DI_UQI:
33899 case V8DI_FTYPE_V8SI_V8DI_UQI:
33900 case V8HI_FTYPE_V8DI_V8HI_UQI:
33901 case V8SI_FTYPE_V8DI_V8SI_UQI:
33902 case V4SI_FTYPE_V4SI_V4SI_V4SI:
33903 case V16SI_FTYPE_V16SI_V16SI_V16SI:
33904 case V8DI_FTYPE_V8DI_V8DI_V8DI:
33905 case V32HI_FTYPE_V32HI_V32HI_V32HI:
33906 case V2DI_FTYPE_V2DI_V2DI_V2DI:
33907 case V16HI_FTYPE_V16HI_V16HI_V16HI:
33908 case V8SI_FTYPE_V8SI_V8SI_V8SI:
33909 case V8HI_FTYPE_V8HI_V8HI_V8HI:
33910 nargs = 3;
33911 break;
33912 case V32QI_FTYPE_V32QI_V32QI_INT:
33913 case V16HI_FTYPE_V16HI_V16HI_INT:
33914 case V16QI_FTYPE_V16QI_V16QI_INT:
33915 case V4DI_FTYPE_V4DI_V4DI_INT:
33916 case V8HI_FTYPE_V8HI_V8HI_INT:
33917 case V8SI_FTYPE_V8SI_V8SI_INT:
33918 case V8SI_FTYPE_V8SI_V4SI_INT:
33919 case V8SF_FTYPE_V8SF_V8SF_INT:
33920 case V8SF_FTYPE_V8SF_V4SF_INT:
33921 case V4SI_FTYPE_V4SI_V4SI_INT:
33922 case V4DF_FTYPE_V4DF_V4DF_INT:
33923 case V16SF_FTYPE_V16SF_V16SF_INT:
33924 case V16SF_FTYPE_V16SF_V4SF_INT:
33925 case V16SI_FTYPE_V16SI_V4SI_INT:
33926 case V4DF_FTYPE_V4DF_V2DF_INT:
33927 case V4SF_FTYPE_V4SF_V4SF_INT:
33928 case V2DI_FTYPE_V2DI_V2DI_INT:
33929 case V4DI_FTYPE_V4DI_V2DI_INT:
33930 case V2DF_FTYPE_V2DF_V2DF_INT:
33931 case UQI_FTYPE_V8DI_V8UDI_INT:
33932 case UQI_FTYPE_V8DF_V8DF_INT:
33933 case UQI_FTYPE_V2DF_V2DF_INT:
33934 case UQI_FTYPE_V4SF_V4SF_INT:
33935 case UHI_FTYPE_V16SI_V16SI_INT:
33936 case UHI_FTYPE_V16SF_V16SF_INT:
33937 case V64QI_FTYPE_V64QI_V64QI_INT:
33938 case V32HI_FTYPE_V32HI_V32HI_INT:
33939 case V16SI_FTYPE_V16SI_V16SI_INT:
33940 case V8DI_FTYPE_V8DI_V8DI_INT:
33941 nargs = 3;
33942 nargs_constant = 1;
33943 break;
33944 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
33945 nargs = 3;
33946 rmode = V4DImode;
33947 nargs_constant = 1;
33948 break;
33949 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
33950 nargs = 3;
33951 rmode = V2DImode;
33952 nargs_constant = 1;
33953 break;
33954 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
33955 nargs = 3;
33956 rmode = DImode;
33957 nargs_constant = 1;
33958 break;
33959 case V2DI_FTYPE_V2DI_UINT_UINT:
33960 nargs = 3;
33961 nargs_constant = 2;
33962 break;
33963 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
33964 nargs = 3;
33965 rmode = V8DImode;
33966 nargs_constant = 1;
33967 break;
33968 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
33969 nargs = 5;
33970 rmode = V8DImode;
33971 mask_pos = 2;
33972 nargs_constant = 1;
33973 break;
33974 case QI_FTYPE_V8DF_INT_UQI:
33975 case QI_FTYPE_V4DF_INT_UQI:
33976 case QI_FTYPE_V2DF_INT_UQI:
33977 case HI_FTYPE_V16SF_INT_UHI:
33978 case QI_FTYPE_V8SF_INT_UQI:
33979 case QI_FTYPE_V4SF_INT_UQI:
33980 case UHI_FTYPE_V2DI_V2DI_UHI:
33981 case USI_FTYPE_V4DI_V4DI_USI:
33982 case V4SI_FTYPE_V4SI_V4SI_UHI:
33983 case V8SI_FTYPE_V8SI_V8SI_UHI:
33984 nargs = 3;
33985 mask_pos = 1;
33986 nargs_constant = 1;
33987 break;
33988 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
33989 nargs = 5;
33990 rmode = V4DImode;
33991 mask_pos = 2;
33992 nargs_constant = 1;
33993 break;
33994 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
33995 nargs = 5;
33996 rmode = V2DImode;
33997 mask_pos = 2;
33998 nargs_constant = 1;
33999 break;
34000 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
34001 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
34002 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
34003 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
34004 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
34005 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
34006 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
34007 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
34008 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
34009 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
34010 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
34011 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
34012 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
34013 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
34014 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
34015 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
34016 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
34017 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
34018 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
34019 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
34020 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
34021 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
34022 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
34023 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
34024 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
34025 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
34026 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
34027 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
34028 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
34029 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
34030 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
34031 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
34032 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
34033 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
34034 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
34035 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
34036 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
34037 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
34038 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
34039 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
34040 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
34041 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
34042 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
34043 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
34044 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
34045 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
34046 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
34047 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34048 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34049 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34050 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34051 nargs = 4;
34052 break;
34053 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34054 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34055 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34056 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34057 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34058 nargs = 4;
34059 nargs_constant = 1;
34060 break;
34061 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34062 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34063 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34064 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34065 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34066 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34067 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34068 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34069 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34070 case USI_FTYPE_V32QI_V32QI_INT_USI:
34071 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34072 case USI_FTYPE_V32HI_V32HI_INT_USI:
34073 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34074 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34075 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
34076 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
34077 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
34078 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
34079 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
34080 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
34081 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
34082 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
34083 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
34084 nargs = 4;
34085 mask_pos = 1;
34086 nargs_constant = 1;
34087 break;
34088 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34089 nargs = 4;
34090 nargs_constant = 2;
34091 break;
34092 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34093 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34094 nargs = 4;
34095 break;
34096 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34097 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34098 mask_pos = 1;
34099 nargs = 4;
34100 nargs_constant = 1;
34101 break;
34102 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34103 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34104 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34105 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34106 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34107 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34108 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34109 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34110 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34111 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34112 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34113 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34114 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34115 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34116 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34117 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34118 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34119 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34120 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34121 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34122 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34123 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34124 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34125 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34126 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34127 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34128 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34129 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34130 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34131 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34132 nargs = 4;
34133 mask_pos = 2;
34134 nargs_constant = 1;
34135 break;
34136 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34137 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34138 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34139 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34140 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34141 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34142 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34143 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34144 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34145 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34146 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34147 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34148 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34149 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34150 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34151 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34152 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34153 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34154 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34155 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34156 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34157 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34158 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34159 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34160 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34161 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34162 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34163 nargs = 5;
34164 mask_pos = 2;
34165 nargs_constant = 1;
34166 break;
34167 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34168 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34169 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34170 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34171 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34172 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34173 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34174 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34175 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34176 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34177 nargs = 5;
34178 mask_pos = 1;
34179 nargs_constant = 1;
34180 break;
34181 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
34182 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
34183 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
34184 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
34185 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
34186 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
34187 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
34188 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
34189 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
34190 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
34191 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
34192 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
34193 nargs = 5;
34194 mask_pos = 1;
34195 nargs_constant = 2;
34196 break;
34198 default:
34199 gcc_unreachable ();
34202 gcc_assert (nargs <= ARRAY_SIZE (args));
34204 if (comparison != UNKNOWN)
34206 gcc_assert (nargs == 2);
34207 return ix86_expand_sse_compare (d, exp, target, swap);
34210 if (rmode == VOIDmode || rmode == tmode)
34212 if (optimize
34213 || target == 0
34214 || GET_MODE (target) != tmode
34215 || !insn_p->operand[0].predicate (target, tmode))
34216 target = gen_reg_rtx (tmode);
34217 else if (memory_operand (target, tmode))
34218 num_memory++;
34219 real_target = target;
34221 else
34223 real_target = gen_reg_rtx (tmode);
34224 target = lowpart_subreg (rmode, real_target, tmode);
34227 for (i = 0; i < nargs; i++)
34229 tree arg = CALL_EXPR_ARG (exp, i);
34230 rtx op = expand_normal (arg);
34231 machine_mode mode = insn_p->operand[i + 1].mode;
34232 bool match = insn_p->operand[i + 1].predicate (op, mode);
34234 if (second_arg_count && i == 1)
34236 /* SIMD shift insns take either an 8-bit immediate or a
34237 register as the count, but the builtin functions take an int
34238 as the count. If the count doesn't match, we put it in a register.
34239 The instructions use a 64-bit count; if op is only
34240 32-bit, zero-extend it, as negative shift counts
34241 are undefined behavior and zero-extension is more
34242 efficient. */
34243 if (!match)
34245 if (SCALAR_INT_MODE_P (GET_MODE (op)))
34246 op = convert_modes (mode, GET_MODE (op), op, 1);
34247 else
34248 op = lowpart_subreg (mode, op, GET_MODE (op));
34249 if (!insn_p->operand[i + 1].predicate (op, mode))
34250 op = copy_to_reg (op);
34253 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
34254 || (!mask_pos && (nargs - i) <= nargs_constant))
34256 if (!match)
34257 switch (icode)
34259 case CODE_FOR_avx_vinsertf128v4di:
34260 case CODE_FOR_avx_vextractf128v4di:
34261 error ("the last argument must be a 1-bit immediate");
34262 return const0_rtx;
34264 case CODE_FOR_avx512f_cmpv8di3_mask:
34265 case CODE_FOR_avx512f_cmpv16si3_mask:
34266 case CODE_FOR_avx512f_ucmpv8di3_mask:
34267 case CODE_FOR_avx512f_ucmpv16si3_mask:
34268 case CODE_FOR_avx512vl_cmpv4di3_mask:
34269 case CODE_FOR_avx512vl_cmpv8si3_mask:
34270 case CODE_FOR_avx512vl_ucmpv4di3_mask:
34271 case CODE_FOR_avx512vl_ucmpv8si3_mask:
34272 case CODE_FOR_avx512vl_cmpv2di3_mask:
34273 case CODE_FOR_avx512vl_cmpv4si3_mask:
34274 case CODE_FOR_avx512vl_ucmpv2di3_mask:
34275 case CODE_FOR_avx512vl_ucmpv4si3_mask:
34276 error ("the last argument must be a 3-bit immediate");
34277 return const0_rtx;
34279 case CODE_FOR_sse4_1_roundsd:
34280 case CODE_FOR_sse4_1_roundss:
34282 case CODE_FOR_sse4_1_roundpd:
34283 case CODE_FOR_sse4_1_roundps:
34284 case CODE_FOR_avx_roundpd256:
34285 case CODE_FOR_avx_roundps256:
34287 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
34288 case CODE_FOR_sse4_1_roundps_sfix:
34289 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
34290 case CODE_FOR_avx_roundps_sfix256:
34292 case CODE_FOR_sse4_1_blendps:
34293 case CODE_FOR_avx_blendpd256:
34294 case CODE_FOR_avx_vpermilv4df:
34295 case CODE_FOR_avx_vpermilv4df_mask:
34296 case CODE_FOR_avx512f_getmantv8df_mask:
34297 case CODE_FOR_avx512f_getmantv16sf_mask:
34298 case CODE_FOR_avx512vl_getmantv8sf_mask:
34299 case CODE_FOR_avx512vl_getmantv4df_mask:
34300 case CODE_FOR_avx512vl_getmantv4sf_mask:
34301 case CODE_FOR_avx512vl_getmantv2df_mask:
34302 case CODE_FOR_avx512dq_rangepv8df_mask_round:
34303 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
34304 case CODE_FOR_avx512dq_rangepv4df_mask:
34305 case CODE_FOR_avx512dq_rangepv8sf_mask:
34306 case CODE_FOR_avx512dq_rangepv2df_mask:
34307 case CODE_FOR_avx512dq_rangepv4sf_mask:
34308 case CODE_FOR_avx_shufpd256_mask:
34309 error ("the last argument must be a 4-bit immediate");
34310 return const0_rtx;
34312 case CODE_FOR_sha1rnds4:
34313 case CODE_FOR_sse4_1_blendpd:
34314 case CODE_FOR_avx_vpermilv2df:
34315 case CODE_FOR_avx_vpermilv2df_mask:
34316 case CODE_FOR_xop_vpermil2v2df3:
34317 case CODE_FOR_xop_vpermil2v4sf3:
34318 case CODE_FOR_xop_vpermil2v4df3:
34319 case CODE_FOR_xop_vpermil2v8sf3:
34320 case CODE_FOR_avx512f_vinsertf32x4_mask:
34321 case CODE_FOR_avx512f_vinserti32x4_mask:
34322 case CODE_FOR_avx512f_vextractf32x4_mask:
34323 case CODE_FOR_avx512f_vextracti32x4_mask:
34324 case CODE_FOR_sse2_shufpd:
34325 case CODE_FOR_sse2_shufpd_mask:
34326 case CODE_FOR_avx512dq_shuf_f64x2_mask:
34327 case CODE_FOR_avx512dq_shuf_i64x2_mask:
34328 case CODE_FOR_avx512vl_shuf_i32x4_mask:
34329 case CODE_FOR_avx512vl_shuf_f32x4_mask:
34330 error ("the last argument must be a 2-bit immediate");
34331 return const0_rtx;
34333 case CODE_FOR_avx_vextractf128v4df:
34334 case CODE_FOR_avx_vextractf128v8sf:
34335 case CODE_FOR_avx_vextractf128v8si:
34336 case CODE_FOR_avx_vinsertf128v4df:
34337 case CODE_FOR_avx_vinsertf128v8sf:
34338 case CODE_FOR_avx_vinsertf128v8si:
34339 case CODE_FOR_avx512f_vinsertf64x4_mask:
34340 case CODE_FOR_avx512f_vinserti64x4_mask:
34341 case CODE_FOR_avx512f_vextractf64x4_mask:
34342 case CODE_FOR_avx512f_vextracti64x4_mask:
34343 case CODE_FOR_avx512dq_vinsertf32x8_mask:
34344 case CODE_FOR_avx512dq_vinserti32x8_mask:
34345 case CODE_FOR_avx512vl_vinsertv4df:
34346 case CODE_FOR_avx512vl_vinsertv4di:
34347 case CODE_FOR_avx512vl_vinsertv8sf:
34348 case CODE_FOR_avx512vl_vinsertv8si:
34349 error ("the last argument must be a 1-bit immediate");
34350 return const0_rtx;
34352 case CODE_FOR_avx_vmcmpv2df3:
34353 case CODE_FOR_avx_vmcmpv4sf3:
34354 case CODE_FOR_avx_cmpv2df3:
34355 case CODE_FOR_avx_cmpv4sf3:
34356 case CODE_FOR_avx_cmpv4df3:
34357 case CODE_FOR_avx_cmpv8sf3:
34358 case CODE_FOR_avx512f_cmpv8df3_mask:
34359 case CODE_FOR_avx512f_cmpv16sf3_mask:
34360 case CODE_FOR_avx512f_vmcmpv2df3_mask:
34361 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
34362 error ("the last argument must be a 5-bit immediate");
34363 return const0_rtx;
34365 default:
34366 switch (nargs_constant)
34368 case 2:
34369 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
34370 || (!mask_pos && (nargs - i) == nargs_constant))
34372 error ("the next to last argument must be an 8-bit immediate");
34373 break;
34375 /* FALLTHRU */
34376 case 1:
34377 error ("the last argument must be an 8-bit immediate");
34378 break;
34379 default:
34380 gcc_unreachable ();
34382 return const0_rtx;
34385 else
34387 if (VECTOR_MODE_P (mode))
34388 op = safe_vector_operand (op, mode);
34390 /* If we aren't optimizing, only allow one memory operand to
34391 be generated. */
34392 if (memory_operand (op, mode))
34393 num_memory++;
34395 op = fixup_modeless_constant (op, mode);
34397 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34399 if (optimize || !match || num_memory > 1)
34400 op = copy_to_mode_reg (mode, op);
34402 else
34404 op = copy_to_reg (op);
34405 op = lowpart_subreg (mode, op, GET_MODE (op));
34409 args[i].op = op;
34410 args[i].mode = mode;
34413 switch (nargs)
34415 case 1:
34416 pat = GEN_FCN (icode) (real_target, args[0].op);
34417 break;
34418 case 2:
34419 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
34420 break;
34421 case 3:
34422 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34423 args[2].op);
34424 break;
34425 case 4:
34426 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34427 args[2].op, args[3].op);
34428 break;
34429 case 5:
34430 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34431 args[2].op, args[3].op, args[4].op);
34432 break;
34433 case 6:
34434 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34435 args[2].op, args[3].op, args[4].op,
34436 args[5].op);
34437 break;
34438 default:
34439 gcc_unreachable ();
34442 if (! pat)
34443 return 0;
34445 emit_insn (pat);
34446 return target;
34449 /* Transform a pattern of the following layout:
34450 (set A
34451 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
34453 into:
34454 (set A B) */
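/* Schematically (an illustrative sketch, not an exact insn pattern):

     (set (reg) (unspec [(op) (const_int NO_ROUND)] UNSPEC_EMBEDDED_ROUNDING))

   is rewritten to (set (reg) (op)), so that the insn matches the
   corresponding non-rounding pattern.  */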
34456 static rtx
34457 ix86_erase_embedded_rounding (rtx pat)
34459 if (GET_CODE (pat) == INSN)
34460 pat = PATTERN (pat);
34462 gcc_assert (GET_CODE (pat) == SET);
34463 rtx src = SET_SRC (pat);
34464 gcc_assert (XVECLEN (src, 0) == 2);
34465 rtx p0 = XVECEXP (src, 0, 0);
34466 gcc_assert (GET_CODE (src) == UNSPEC
34467 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
34468 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
34469 return res;
34472 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
34473 with rounding. */
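/* A typical source-level entry point for this path (illustrative; the
   authoritative wrappers live in the *intrin.h headers) is the
   _mm_comi_round_ss/_mm_comi_round_sd family, which passes two vectors,
   a comparison predicate and a rounding/SAE immediate.  */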
34474 static rtx
34475 ix86_expand_sse_comi_round (const struct builtin_description *d,
34476 tree exp, rtx target)
34478 rtx pat, set_dst;
34479 tree arg0 = CALL_EXPR_ARG (exp, 0);
34480 tree arg1 = CALL_EXPR_ARG (exp, 1);
34481 tree arg2 = CALL_EXPR_ARG (exp, 2);
34482 tree arg3 = CALL_EXPR_ARG (exp, 3);
34483 rtx op0 = expand_normal (arg0);
34484 rtx op1 = expand_normal (arg1);
34485 rtx op2 = expand_normal (arg2);
34486 rtx op3 = expand_normal (arg3);
34487 enum insn_code icode = d->icode;
34488 const struct insn_data_d *insn_p = &insn_data[icode];
34489 machine_mode mode0 = insn_p->operand[0].mode;
34490 machine_mode mode1 = insn_p->operand[1].mode;
34491 enum rtx_code comparison = UNEQ;
34492 bool need_ucomi = false;
34494 /* See avxintrin.h for values. */
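  /* These two tables are indexed by the _CMP_* predicate value passed as
     the third argument; for instance index 0 (_CMP_EQ_OQ) selects UNEQ
     together with the quiet (ucomi) form.  Illustrative note only;
     avxintrin.h is the authoritative list.  */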
34495 enum rtx_code comi_comparisons[32] =
34497 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
34498 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
34499 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
34501 bool need_ucomi_values[32] =
34503 true, false, false, true, true, false, false, true,
34504 true, false, false, true, true, false, false, true,
34505 false, true, true, false, false, true, true, false,
34506 false, true, true, false, false, true, true, false
34509 if (!CONST_INT_P (op2))
34511 error ("the third argument must be a comparison constant");
34512 return const0_rtx;
34514 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
34516 error ("incorrect comparison mode");
34517 return const0_rtx;
34520 if (!insn_p->operand[2].predicate (op3, SImode))
34522 error ("incorrect rounding operand");
34523 return const0_rtx;
34526 comparison = comi_comparisons[INTVAL (op2)];
34527 need_ucomi = need_ucomi_values[INTVAL (op2)];
34529 if (VECTOR_MODE_P (mode0))
34530 op0 = safe_vector_operand (op0, mode0);
34531 if (VECTOR_MODE_P (mode1))
34532 op1 = safe_vector_operand (op1, mode1);
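  /* Descriptive note: the result is built as a QImode condition value
     stored into the low byte of a zeroed SImode pseudo; SUBREG_REG (target),
     i.e. the SImode value 0 or 1, is what gets returned below.  */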
34534 target = gen_reg_rtx (SImode);
34535 emit_move_insn (target, const0_rtx);
34536 target = gen_rtx_SUBREG (QImode, target, 0);
34538 if ((optimize && !register_operand (op0, mode0))
34539 || !insn_p->operand[0].predicate (op0, mode0))
34540 op0 = copy_to_mode_reg (mode0, op0);
34541 if ((optimize && !register_operand (op1, mode1))
34542 || !insn_p->operand[1].predicate (op1, mode1))
34543 op1 = copy_to_mode_reg (mode1, op1);
34545 if (need_ucomi)
34546 icode = icode == CODE_FOR_sse_comi_round
34547 ? CODE_FOR_sse_ucomi_round
34548 : CODE_FOR_sse2_ucomi_round;
34550 pat = GEN_FCN (icode) (op0, op1, op3);
34551 if (! pat)
34552 return 0;
34554 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
34555 if (INTVAL (op3) == NO_ROUND)
34557 pat = ix86_erase_embedded_rounding (pat);
34558 if (! pat)
34559 return 0;
34561 set_dst = SET_DEST (pat);
34563 else
34565 gcc_assert (GET_CODE (pat) == SET);
34566 set_dst = SET_DEST (pat);
34569 emit_insn (pat);
34570 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34571 gen_rtx_fmt_ee (comparison, QImode,
34572 set_dst,
34573 const0_rtx)));
34575 return SUBREG_REG (target);
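/* Expand a builtin whose last argument is an embedded rounding/SAE
   immediate; when that immediate is NO_ROUND the rounding unspec is
   stripped again via ix86_erase_embedded_rounding (descriptive note).  */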
34578 static rtx
34579 ix86_expand_round_builtin (const struct builtin_description *d,
34580 tree exp, rtx target)
34582 rtx pat;
34583 unsigned int i, nargs;
34584 struct
34586 rtx op;
34587 machine_mode mode;
34588 } args[6];
34589 enum insn_code icode = d->icode;
34590 const struct insn_data_d *insn_p = &insn_data[icode];
34591 machine_mode tmode = insn_p->operand[0].mode;
34592 unsigned int nargs_constant = 0;
34593 unsigned int redundant_embed_rnd = 0;
34595 switch ((enum ix86_builtin_func_type) d->flag)
34597 case UINT64_FTYPE_V2DF_INT:
34598 case UINT64_FTYPE_V4SF_INT:
34599 case UINT_FTYPE_V2DF_INT:
34600 case UINT_FTYPE_V4SF_INT:
34601 case INT64_FTYPE_V2DF_INT:
34602 case INT64_FTYPE_V4SF_INT:
34603 case INT_FTYPE_V2DF_INT:
34604 case INT_FTYPE_V4SF_INT:
34605 nargs = 2;
34606 break;
34607 case V4SF_FTYPE_V4SF_UINT_INT:
34608 case V4SF_FTYPE_V4SF_UINT64_INT:
34609 case V2DF_FTYPE_V2DF_UINT64_INT:
34610 case V4SF_FTYPE_V4SF_INT_INT:
34611 case V4SF_FTYPE_V4SF_INT64_INT:
34612 case V2DF_FTYPE_V2DF_INT64_INT:
34613 case V4SF_FTYPE_V4SF_V4SF_INT:
34614 case V2DF_FTYPE_V2DF_V2DF_INT:
34615 case V4SF_FTYPE_V4SF_V2DF_INT:
34616 case V2DF_FTYPE_V2DF_V4SF_INT:
34617 nargs = 3;
34618 break;
34619 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
34620 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
34621 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
34622 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
34623 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
34624 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
34625 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
34626 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
34627 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
34628 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
34629 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
34630 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
34631 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
34632 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
34633 nargs = 4;
34634 break;
34635 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
34636 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
34637 nargs_constant = 2;
34638 nargs = 4;
34639 break;
34640 case INT_FTYPE_V4SF_V4SF_INT_INT:
34641 case INT_FTYPE_V2DF_V2DF_INT_INT:
34642 return ix86_expand_sse_comi_round (d, exp, target);
34643 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
34644 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
34645 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
34646 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
34647 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
34648 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
34649 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
34650 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
34651 nargs = 5;
34652 break;
34653 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
34654 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
34655 nargs_constant = 4;
34656 nargs = 5;
34657 break;
34658 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
34659 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
34660 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
34661 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
34662 nargs_constant = 3;
34663 nargs = 5;
34664 break;
34665 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
34666 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
34667 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
34668 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
34669 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
34670 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
34671 nargs = 6;
34672 nargs_constant = 4;
34673 break;
34674 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
34675 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
34676 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
34677 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
34678 nargs = 6;
34679 nargs_constant = 3;
34680 break;
34681 default:
34682 gcc_unreachable ();
34684 gcc_assert (nargs <= ARRAY_SIZE (args));
34686 if (optimize
34687 || target == 0
34688 || GET_MODE (target) != tmode
34689 || !insn_p->operand[0].predicate (target, tmode))
34690 target = gen_reg_rtx (tmode);
34692 for (i = 0; i < nargs; i++)
34694 tree arg = CALL_EXPR_ARG (exp, i);
34695 rtx op = expand_normal (arg);
34696 machine_mode mode = insn_p->operand[i + 1].mode;
34697 bool match = insn_p->operand[i + 1].predicate (op, mode);
34699 if (i == nargs - nargs_constant)
34701 if (!match)
34703 switch (icode)
34705 case CODE_FOR_avx512f_getmantv8df_mask_round:
34706 case CODE_FOR_avx512f_getmantv16sf_mask_round:
34707 case CODE_FOR_avx512f_vgetmantv2df_round:
34708 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
34709 case CODE_FOR_avx512f_vgetmantv4sf_round:
34710 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
34711 error ("the immediate argument must be a 4-bit immediate");
34712 return const0_rtx;
34713 case CODE_FOR_avx512f_cmpv8df3_mask_round:
34714 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
34715 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
34716 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
34717 error ("the immediate argument must be a 5-bit immediate");
34718 return const0_rtx;
34719 default:
34720 error ("the immediate argument must be an 8-bit immediate");
34721 return const0_rtx;
34725 else if (i == nargs - 1)
34727 if (!insn_p->operand[nargs].predicate (op, SImode))
34729 error ("incorrect rounding operand");
34730 return const0_rtx;
34733 /* If there is no rounding, use the normal version of the pattern. */
34734 if (INTVAL (op) == NO_ROUND)
34735 redundant_embed_rnd = 1;
34737 else
34739 if (VECTOR_MODE_P (mode))
34740 op = safe_vector_operand (op, mode);
34742 op = fixup_modeless_constant (op, mode);
34744 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34746 if (optimize || !match)
34747 op = copy_to_mode_reg (mode, op);
34749 else
34751 op = copy_to_reg (op);
34752 op = lowpart_subreg (mode, op, GET_MODE (op));
34756 args[i].op = op;
34757 args[i].mode = mode;
34760 switch (nargs)
34762 case 1:
34763 pat = GEN_FCN (icode) (target, args[0].op);
34764 break;
34765 case 2:
34766 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34767 break;
34768 case 3:
34769 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34770 args[2].op);
34771 break;
34772 case 4:
34773 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34774 args[2].op, args[3].op);
34775 break;
34776 case 5:
34777 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34778 args[2].op, args[3].op, args[4].op);
34779 break;
34780 case 6:
34781 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34782 args[2].op, args[3].op, args[4].op,
34783 args[5].op);
34784 break;
34785 default:
34786 gcc_unreachable ();
34789 if (!pat)
34790 return 0;
34792 if (redundant_embed_rnd)
34793 pat = ix86_erase_embedded_rounding (pat);
34795 emit_insn (pat);
34796 return target;
34799 /* Subroutine of ix86_expand_builtin to take care of special insns
34800 with variable number of operands. */
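/* "Special" builtins handled here include, for example, the non-temporal
   move builtins and the masked vector load/store builtins.  KLASS records
   whether the memory operand is the destination (store) or a source (load);
   MEMORY is the index of the memory argument, or ARRAY_SIZE (args) when the
   memory operand is the target itself (descriptive note).  */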
34802 static rtx
34803 ix86_expand_special_args_builtin (const struct builtin_description *d,
34804 tree exp, rtx target)
34806 tree arg;
34807 rtx pat, op;
34808 unsigned int i, nargs, arg_adjust, memory;
34809 bool aligned_mem = false;
34810 struct
34812 rtx op;
34813 machine_mode mode;
34814 } args[3];
34815 enum insn_code icode = d->icode;
34816 bool last_arg_constant = false;
34817 const struct insn_data_d *insn_p = &insn_data[icode];
34818 machine_mode tmode = insn_p->operand[0].mode;
34819 enum { load, store } klass;
34821 switch ((enum ix86_builtin_func_type) d->flag)
34823 case VOID_FTYPE_VOID:
34824 emit_insn (GEN_FCN (icode) (target));
34825 return 0;
34826 case VOID_FTYPE_UINT64:
34827 case VOID_FTYPE_UNSIGNED:
34828 nargs = 0;
34829 klass = store;
34830 memory = 0;
34831 break;
34833 case INT_FTYPE_VOID:
34834 case USHORT_FTYPE_VOID:
34835 case UINT64_FTYPE_VOID:
34836 case UNSIGNED_FTYPE_VOID:
34837 nargs = 0;
34838 klass = load;
34839 memory = 0;
34840 break;
34841 case UINT64_FTYPE_PUNSIGNED:
34842 case V2DI_FTYPE_PV2DI:
34843 case V4DI_FTYPE_PV4DI:
34844 case V32QI_FTYPE_PCCHAR:
34845 case V16QI_FTYPE_PCCHAR:
34846 case V8SF_FTYPE_PCV4SF:
34847 case V8SF_FTYPE_PCFLOAT:
34848 case V4SF_FTYPE_PCFLOAT:
34849 case V4DF_FTYPE_PCV2DF:
34850 case V4DF_FTYPE_PCDOUBLE:
34851 case V2DF_FTYPE_PCDOUBLE:
34852 case VOID_FTYPE_PVOID:
34853 case V8DI_FTYPE_PV8DI:
34854 nargs = 1;
34855 klass = load;
34856 memory = 0;
34857 switch (icode)
34859 case CODE_FOR_sse4_1_movntdqa:
34860 case CODE_FOR_avx2_movntdqa:
34861 case CODE_FOR_avx512f_movntdqa:
34862 aligned_mem = true;
34863 break;
34864 default:
34865 break;
34867 break;
34868 case VOID_FTYPE_PV2SF_V4SF:
34869 case VOID_FTYPE_PV8DI_V8DI:
34870 case VOID_FTYPE_PV4DI_V4DI:
34871 case VOID_FTYPE_PV2DI_V2DI:
34872 case VOID_FTYPE_PCHAR_V32QI:
34873 case VOID_FTYPE_PCHAR_V16QI:
34874 case VOID_FTYPE_PFLOAT_V16SF:
34875 case VOID_FTYPE_PFLOAT_V8SF:
34876 case VOID_FTYPE_PFLOAT_V4SF:
34877 case VOID_FTYPE_PDOUBLE_V8DF:
34878 case VOID_FTYPE_PDOUBLE_V4DF:
34879 case VOID_FTYPE_PDOUBLE_V2DF:
34880 case VOID_FTYPE_PLONGLONG_LONGLONG:
34881 case VOID_FTYPE_PULONGLONG_ULONGLONG:
34882 case VOID_FTYPE_PINT_INT:
34883 nargs = 1;
34884 klass = store;
34885 /* Reserve memory operand for target. */
34886 memory = ARRAY_SIZE (args);
34887 switch (icode)
34889 /* These builtins and instructions require the memory
34890 to be properly aligned. */
34891 case CODE_FOR_avx_movntv4di:
34892 case CODE_FOR_sse2_movntv2di:
34893 case CODE_FOR_avx_movntv8sf:
34894 case CODE_FOR_sse_movntv4sf:
34895 case CODE_FOR_sse4a_vmmovntv4sf:
34896 case CODE_FOR_avx_movntv4df:
34897 case CODE_FOR_sse2_movntv2df:
34898 case CODE_FOR_sse4a_vmmovntv2df:
34899 case CODE_FOR_sse2_movntidi:
34900 case CODE_FOR_sse_movntq:
34901 case CODE_FOR_sse2_movntisi:
34902 case CODE_FOR_avx512f_movntv16sf:
34903 case CODE_FOR_avx512f_movntv8df:
34904 case CODE_FOR_avx512f_movntv8di:
34905 aligned_mem = true;
34906 break;
34907 default:
34908 break;
34910 break;
34911 case V4SF_FTYPE_V4SF_PCV2SF:
34912 case V2DF_FTYPE_V2DF_PCDOUBLE:
34913 nargs = 2;
34914 klass = load;
34915 memory = 1;
34916 break;
34917 case V8SF_FTYPE_PCV8SF_V8SI:
34918 case V4DF_FTYPE_PCV4DF_V4DI:
34919 case V4SF_FTYPE_PCV4SF_V4SI:
34920 case V2DF_FTYPE_PCV2DF_V2DI:
34921 case V8SI_FTYPE_PCV8SI_V8SI:
34922 case V4DI_FTYPE_PCV4DI_V4DI:
34923 case V4SI_FTYPE_PCV4SI_V4SI:
34924 case V2DI_FTYPE_PCV2DI_V2DI:
34925 case VOID_FTYPE_INT_INT64:
34926 nargs = 2;
34927 klass = load;
34928 memory = 0;
34929 break;
34930 case VOID_FTYPE_PV8DF_V8DF_UQI:
34931 case VOID_FTYPE_PV4DF_V4DF_UQI:
34932 case VOID_FTYPE_PV2DF_V2DF_UQI:
34933 case VOID_FTYPE_PV16SF_V16SF_UHI:
34934 case VOID_FTYPE_PV8SF_V8SF_UQI:
34935 case VOID_FTYPE_PV4SF_V4SF_UQI:
34936 case VOID_FTYPE_PV8DI_V8DI_UQI:
34937 case VOID_FTYPE_PV4DI_V4DI_UQI:
34938 case VOID_FTYPE_PV2DI_V2DI_UQI:
34939 case VOID_FTYPE_PV16SI_V16SI_UHI:
34940 case VOID_FTYPE_PV8SI_V8SI_UQI:
34941 case VOID_FTYPE_PV4SI_V4SI_UQI:
34942 case VOID_FTYPE_PV64QI_V64QI_UDI:
34943 case VOID_FTYPE_PV32HI_V32HI_USI:
34944 case VOID_FTYPE_PV32QI_V32QI_USI:
34945 case VOID_FTYPE_PV16QI_V16QI_UHI:
34946 case VOID_FTYPE_PV16HI_V16HI_UHI:
34947 case VOID_FTYPE_PV8HI_V8HI_UQI:
34948 switch (icode)
34950 /* These builtins and instructions require the memory
34951 to be properly aligned. */
34952 case CODE_FOR_avx512f_storev16sf_mask:
34953 case CODE_FOR_avx512f_storev16si_mask:
34954 case CODE_FOR_avx512f_storev8df_mask:
34955 case CODE_FOR_avx512f_storev8di_mask:
34956 case CODE_FOR_avx512vl_storev8sf_mask:
34957 case CODE_FOR_avx512vl_storev8si_mask:
34958 case CODE_FOR_avx512vl_storev4df_mask:
34959 case CODE_FOR_avx512vl_storev4di_mask:
34960 case CODE_FOR_avx512vl_storev4sf_mask:
34961 case CODE_FOR_avx512vl_storev4si_mask:
34962 case CODE_FOR_avx512vl_storev2df_mask:
34963 case CODE_FOR_avx512vl_storev2di_mask:
34964 aligned_mem = true;
34965 break;
34966 default:
34967 break;
34969 /* FALLTHRU */
34970 case VOID_FTYPE_PV8SF_V8SI_V8SF:
34971 case VOID_FTYPE_PV4DF_V4DI_V4DF:
34972 case VOID_FTYPE_PV4SF_V4SI_V4SF:
34973 case VOID_FTYPE_PV2DF_V2DI_V2DF:
34974 case VOID_FTYPE_PV8SI_V8SI_V8SI:
34975 case VOID_FTYPE_PV4DI_V4DI_V4DI:
34976 case VOID_FTYPE_PV4SI_V4SI_V4SI:
34977 case VOID_FTYPE_PV2DI_V2DI_V2DI:
34978 case VOID_FTYPE_PV8SI_V8DI_UQI:
34979 case VOID_FTYPE_PV8HI_V8DI_UQI:
34980 case VOID_FTYPE_PV16HI_V16SI_UHI:
34981 case VOID_FTYPE_PV16QI_V8DI_UQI:
34982 case VOID_FTYPE_PV16QI_V16SI_UHI:
34983 case VOID_FTYPE_PV4SI_V4DI_UQI:
34984 case VOID_FTYPE_PV4SI_V2DI_UQI:
34985 case VOID_FTYPE_PV8HI_V4DI_UQI:
34986 case VOID_FTYPE_PV8HI_V2DI_UQI:
34987 case VOID_FTYPE_PV8HI_V8SI_UQI:
34988 case VOID_FTYPE_PV8HI_V4SI_UQI:
34989 case VOID_FTYPE_PV16QI_V4DI_UQI:
34990 case VOID_FTYPE_PV16QI_V2DI_UQI:
34991 case VOID_FTYPE_PV16QI_V8SI_UQI:
34992 case VOID_FTYPE_PV16QI_V4SI_UQI:
34993 case VOID_FTYPE_PCHAR_V64QI_UDI:
34994 case VOID_FTYPE_PCHAR_V32QI_USI:
34995 case VOID_FTYPE_PCHAR_V16QI_UHI:
34996 case VOID_FTYPE_PSHORT_V32HI_USI:
34997 case VOID_FTYPE_PSHORT_V16HI_UHI:
34998 case VOID_FTYPE_PSHORT_V8HI_UQI:
34999 case VOID_FTYPE_PINT_V16SI_UHI:
35000 case VOID_FTYPE_PINT_V8SI_UQI:
35001 case VOID_FTYPE_PINT_V4SI_UQI:
35002 case VOID_FTYPE_PINT64_V8DI_UQI:
35003 case VOID_FTYPE_PINT64_V4DI_UQI:
35004 case VOID_FTYPE_PINT64_V2DI_UQI:
35005 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
35006 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
35007 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
35008 case VOID_FTYPE_PFLOAT_V16SF_UHI:
35009 case VOID_FTYPE_PFLOAT_V8SF_UQI:
35010 case VOID_FTYPE_PFLOAT_V4SF_UQI:
35011 case VOID_FTYPE_PV32QI_V32HI_USI:
35012 case VOID_FTYPE_PV16QI_V16HI_UHI:
35013 case VOID_FTYPE_PV8QI_V8HI_UQI:
35014 nargs = 2;
35015 klass = store;
35016 /* Reserve memory operand for target. */
35017 memory = ARRAY_SIZE (args);
35018 break;
35019 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
35020 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
35021 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
35022 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
35023 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
35024 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
35025 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
35026 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
35027 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
35028 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
35029 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
35030 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
35031 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
35032 case V32HI_FTYPE_PCV32HI_V32HI_USI:
35033 case V32QI_FTYPE_PCV32QI_V32QI_USI:
35034 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
35035 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
35036 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
35037 switch (icode)
35039 /* These builtins and instructions require the memory
35040 to be properly aligned. */
35041 case CODE_FOR_avx512f_loadv16sf_mask:
35042 case CODE_FOR_avx512f_loadv16si_mask:
35043 case CODE_FOR_avx512f_loadv8df_mask:
35044 case CODE_FOR_avx512f_loadv8di_mask:
35045 case CODE_FOR_avx512vl_loadv8sf_mask:
35046 case CODE_FOR_avx512vl_loadv8si_mask:
35047 case CODE_FOR_avx512vl_loadv4df_mask:
35048 case CODE_FOR_avx512vl_loadv4di_mask:
35049 case CODE_FOR_avx512vl_loadv4sf_mask:
35050 case CODE_FOR_avx512vl_loadv4si_mask:
35051 case CODE_FOR_avx512vl_loadv2df_mask:
35052 case CODE_FOR_avx512vl_loadv2di_mask:
35053 case CODE_FOR_avx512bw_loadv64qi_mask:
35054 case CODE_FOR_avx512vl_loadv32qi_mask:
35055 case CODE_FOR_avx512vl_loadv16qi_mask:
35056 case CODE_FOR_avx512bw_loadv32hi_mask:
35057 case CODE_FOR_avx512vl_loadv16hi_mask:
35058 case CODE_FOR_avx512vl_loadv8hi_mask:
35059 aligned_mem = true;
35060 break;
35061 default:
35062 break;
35064 /* FALLTHRU */
35065 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35066 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35067 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35068 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35069 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35070 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35071 case V16SI_FTYPE_PCINT_V16SI_UHI:
35072 case V8SI_FTYPE_PCINT_V8SI_UQI:
35073 case V4SI_FTYPE_PCINT_V4SI_UQI:
35074 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35075 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35076 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35077 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35078 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35079 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35080 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35081 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35082 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35083 nargs = 3;
35084 klass = load;
35085 memory = 0;
35086 break;
35087 case VOID_FTYPE_UINT_UINT_UINT:
35088 case VOID_FTYPE_UINT64_UINT_UINT:
35089 case UCHAR_FTYPE_UINT_UINT_UINT:
35090 case UCHAR_FTYPE_UINT64_UINT_UINT:
35091 nargs = 3;
35092 klass = load;
35093 memory = ARRAY_SIZE (args);
35094 last_arg_constant = true;
35095 break;
35096 default:
35097 gcc_unreachable ();
35100 gcc_assert (nargs <= ARRAY_SIZE (args));
35102 if (klass == store)
35104 arg = CALL_EXPR_ARG (exp, 0);
35105 op = expand_normal (arg);
35106 gcc_assert (target == 0);
35107 if (memory)
35109 op = ix86_zero_extend_to_Pmode (op);
35110 target = gen_rtx_MEM (tmode, op);
35111 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35112 on it. Try to improve it using get_pointer_alignment,
35113 and if the special builtin is one that requires strict
35114 mode alignment, also from its GET_MODE_ALIGNMENT.
35115 Failure to do so could lead to ix86_legitimate_combined_insn
35116 rejecting all changes to such insns. */
35117 unsigned int align = get_pointer_alignment (arg);
35118 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35119 align = GET_MODE_ALIGNMENT (tmode);
35120 if (MEM_ALIGN (target) < align)
35121 set_mem_align (target, align);
35123 else
35124 target = force_reg (tmode, op);
35125 arg_adjust = 1;
35127 else
35129 arg_adjust = 0;
35130 if (optimize
35131 || target == 0
35132 || !register_operand (target, tmode)
35133 || GET_MODE (target) != tmode)
35134 target = gen_reg_rtx (tmode);
35137 for (i = 0; i < nargs; i++)
35139 machine_mode mode = insn_p->operand[i + 1].mode;
35140 bool match;
35142 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35143 op = expand_normal (arg);
35144 match = insn_p->operand[i + 1].predicate (op, mode);
35146 if (last_arg_constant && (i + 1) == nargs)
35148 if (!match)
35150 if (icode == CODE_FOR_lwp_lwpvalsi3
35151 || icode == CODE_FOR_lwp_lwpinssi3
35152 || icode == CODE_FOR_lwp_lwpvaldi3
35153 || icode == CODE_FOR_lwp_lwpinsdi3)
35154 error ("the last argument must be a 32-bit immediate");
35155 else
35156 error ("the last argument must be an 8-bit immediate");
35157 return const0_rtx;
35160 else
35162 if (i == memory)
35164 /* This must be the memory operand. */
35165 op = ix86_zero_extend_to_Pmode (op);
35166 op = gen_rtx_MEM (mode, op);
35167 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35168 on it. Try to improve it using get_pointer_alignment,
35169 and if the special builtin is one that requires strict
35170 mode alignment, also from its GET_MODE_ALIGNMENT.
35171 Failure to do so could lead to ix86_legitimate_combined_insn
35172 rejecting all changes to such insns. */
35173 unsigned int align = get_pointer_alignment (arg);
35174 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35175 align = GET_MODE_ALIGNMENT (mode);
35176 if (MEM_ALIGN (op) < align)
35177 set_mem_align (op, align);
35179 else
35181 /* This must be a register. */
35182 if (VECTOR_MODE_P (mode))
35183 op = safe_vector_operand (op, mode);
35185 op = fixup_modeless_constant (op, mode);
35187 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35188 op = copy_to_mode_reg (mode, op);
35189 else
35191 op = copy_to_reg (op);
35192 op = lowpart_subreg (mode, op, GET_MODE (op));
35197 args[i].op = op;
35198 args[i].mode = mode;
35201 switch (nargs)
35203 case 0:
35204 pat = GEN_FCN (icode) (target);
35205 break;
35206 case 1:
35207 pat = GEN_FCN (icode) (target, args[0].op);
35208 break;
35209 case 2:
35210 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35211 break;
35212 case 3:
35213 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35214 break;
35215 default:
35216 gcc_unreachable ();
35219 if (! pat)
35220 return 0;
35221 emit_insn (pat);
35222 return klass == store ? 0 : target;
35225 /* Return the integer constant in ARG. Constrain it to be in the range
35226 of the subparts of VEC_TYPE; issue an error if not. */
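/* For example (illustrative), a selector of 5 passed for a four-element
   vector type such as V4SF is diagnosed here and 0 is returned instead.  */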
35228 static int
35229 get_element_number (tree vec_type, tree arg)
35231 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
35233 if (!tree_fits_uhwi_p (arg)
35234 || (elt = tree_to_uhwi (arg), elt > max))
35236 error ("selector must be an integer constant in the range 0..%wi", max);
35237 return 0;
35240 return elt;
35243 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35244 ix86_expand_vector_init. We DO have language-level syntax for this, in
35245 the form of (type){ init-list }. Except that since we can't place emms
35246 instructions from inside the compiler, we can't allow the use of MMX
35247 registers unless the user explicitly asks for it. So we do *not* define
35248 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
35249 we have builtins invoked by mmintrin.h that give us license to emit
35250 these sorts of instructions. */
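/* For instance (an illustrative example), the _mm_set_pi32 wrapper in
   mmintrin.h expands through one of these vec_init builtins instead of a
   (type){ init-list } constructor.  */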
35252 static rtx
35253 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
35255 machine_mode tmode = TYPE_MODE (type);
35256 machine_mode inner_mode = GET_MODE_INNER (tmode);
35257 int i, n_elt = GET_MODE_NUNITS (tmode);
35258 rtvec v = rtvec_alloc (n_elt);
35260 gcc_assert (VECTOR_MODE_P (tmode));
35261 gcc_assert (call_expr_nargs (exp) == n_elt);
35263 for (i = 0; i < n_elt; ++i)
35265 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
35266 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
35269 if (!target || !register_operand (target, tmode))
35270 target = gen_reg_rtx (tmode);
35272 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
35273 return target;
35276 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35277 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
35278 had a language-level syntax for referencing vector elements. */
35280 static rtx
35281 ix86_expand_vec_ext_builtin (tree exp, rtx target)
35283 machine_mode tmode, mode0;
35284 tree arg0, arg1;
35285 int elt;
35286 rtx op0;
35288 arg0 = CALL_EXPR_ARG (exp, 0);
35289 arg1 = CALL_EXPR_ARG (exp, 1);
35291 op0 = expand_normal (arg0);
35292 elt = get_element_number (TREE_TYPE (arg0), arg1);
35294 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35295 mode0 = TYPE_MODE (TREE_TYPE (arg0));
35296 gcc_assert (VECTOR_MODE_P (mode0));
35298 op0 = force_reg (mode0, op0);
35300 if (optimize || !target || !register_operand (target, tmode))
35301 target = gen_reg_rtx (tmode);
35303 ix86_expand_vector_extract (true, target, op0, elt);
35305 return target;
35308 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35309 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
35310 a language-level syntax for referencing vector elements. */
35312 static rtx
35313 ix86_expand_vec_set_builtin (tree exp)
35315 machine_mode tmode, mode1;
35316 tree arg0, arg1, arg2;
35317 int elt;
35318 rtx op0, op1, target;
35320 arg0 = CALL_EXPR_ARG (exp, 0);
35321 arg1 = CALL_EXPR_ARG (exp, 1);
35322 arg2 = CALL_EXPR_ARG (exp, 2);
35324 tmode = TYPE_MODE (TREE_TYPE (arg0));
35325 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35326 gcc_assert (VECTOR_MODE_P (tmode));
35328 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
35329 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
35330 elt = get_element_number (TREE_TYPE (arg0), arg2);
35332 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
35333 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
35335 op0 = force_reg (tmode, op0);
35336 op1 = force_reg (mode1, op1);
35338 /* OP0 is the source of these builtin functions and shouldn't be
35339 modified. Create a copy, use it and return it as target. */
35340 target = gen_reg_rtx (tmode);
35341 emit_move_insn (target, op0);
35342 ix86_expand_vector_set (true, target, op1, elt);
35344 return target;
35347 /* Emit conditional move of SRC to DST with condition
35348 OP1 CODE OP2. */
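/* Descriptive note: when TARGET_CMOVE is not available this falls back to
   a compare-and-jump around a plain move, so the emitted sequence is not
   necessarily branch-free.  */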
35349 static void
35350 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
35352 rtx t;
35354 if (TARGET_CMOVE)
35356 t = ix86_expand_compare (code, op1, op2);
35357 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
35358 src, dst)));
35360 else
35362 rtx_code_label *nomove = gen_label_rtx ();
35363 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
35364 const0_rtx, GET_MODE (op1), 1, nomove);
35365 emit_move_insn (dst, src);
35366 emit_label (nomove);
35370 /* Choose the unsigned max of DST and SRC and put it in DST. */
35371 static void
35372 ix86_emit_move_max (rtx dst, rtx src)
35374 ix86_emit_cmove (dst, src, LTU, dst, src);
35377 /* Expand an expression EXP that calls a built-in function,
35378 with result going to TARGET if that's convenient
35379 (and in mode MODE if that's convenient).
35380 SUBTARGET may be used as the target for computing one of EXP's operands.
35381 IGNORE is nonzero if the value is to be ignored. */
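/* Descriptive note on the overall flow: the __builtin_cpu_* builtins are
   folded or turned into a libgcc call first, then the ISA flags required
   by the builtin are checked against the enabled ISAs, and only after
   that does the code dispatch on the individual builtin code.  */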
35383 static rtx
35384 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
35385 machine_mode mode, int ignore)
35387 size_t i;
35388 enum insn_code icode, icode2;
35389 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
35390 tree arg0, arg1, arg2, arg3, arg4;
35391 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
35392 machine_mode mode0, mode1, mode2, mode3, mode4;
35393 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
35395 /* For CPU builtins that can be folded, fold first and expand the fold. */
35396 switch (fcode)
35398 case IX86_BUILTIN_CPU_INIT:
35400 /* Make it call __cpu_indicator_init in libgcc. */
35401 tree call_expr, fndecl, type;
35402 type = build_function_type_list (integer_type_node, NULL_TREE);
35403 fndecl = build_fn_decl ("__cpu_indicator_init", type);
35404 call_expr = build_call_expr (fndecl, 0);
35405 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
35407 case IX86_BUILTIN_CPU_IS:
35408 case IX86_BUILTIN_CPU_SUPPORTS:
35410 tree arg0 = CALL_EXPR_ARG (exp, 0);
35411 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
35412 gcc_assert (fold_expr != NULL_TREE);
35413 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
35417 HOST_WIDE_INT isa = ix86_isa_flags;
35418 HOST_WIDE_INT isa2 = ix86_isa_flags2;
35419 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
35420 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
35421 /* The general case is that we require all the ISAs specified in bisa{,2}
35422 to be enabled.
35423 The exceptions are:
35424 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
35425 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
35426 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
35427 where for each such pair it is sufficient if either of the ISAs is
35428 enabled; if the pair is ORed with other options, those others must be enabled too. */
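      /* For example (illustrative), a builtin whose mask contains both
	 OPTION_MASK_ISA_SSE and OPTION_MASK_ISA_3DNOW_A is treated as
	 available when either of those two ISAs is enabled.  */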
35429 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
35430 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
35431 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
35432 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
35433 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
35434 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
35435 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
35436 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
35437 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
35438 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
35439 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
35440 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
35441 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
35443 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
35444 (enum fpmath_unit) 0, false);
35445 if (!opts)
35446 error ("%qE needs unknown ISA option", fndecl);
35447 else
35449 gcc_assert (opts != NULL);
35450 error ("%qE needs ISA option %s", fndecl, opts);
35451 free (opts);
35453 return expand_call (exp, target, ignore);
35456 switch (fcode)
35458 case IX86_BUILTIN_BNDMK:
35459 if (!target
35460 || GET_MODE (target) != BNDmode
35461 || !register_operand (target, BNDmode))
35462 target = gen_reg_rtx (BNDmode);
35464 arg0 = CALL_EXPR_ARG (exp, 0);
35465 arg1 = CALL_EXPR_ARG (exp, 1);
35467 op0 = expand_normal (arg0);
35468 op1 = expand_normal (arg1);
35470 if (!register_operand (op0, Pmode))
35471 op0 = ix86_zero_extend_to_Pmode (op0);
35472 if (!register_operand (op1, Pmode))
35473 op1 = ix86_zero_extend_to_Pmode (op1);
35475 /* Builtin arg1 is size of block but instruction op1 should
35476 be (size - 1). */
35477 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
35478 NULL_RTX, 1, OPTAB_DIRECT);
35480 emit_insn (BNDmode == BND64mode
35481 ? gen_bnd64_mk (target, op0, op1)
35482 : gen_bnd32_mk (target, op0, op1));
35483 return target;
35485 case IX86_BUILTIN_BNDSTX:
35486 arg0 = CALL_EXPR_ARG (exp, 0);
35487 arg1 = CALL_EXPR_ARG (exp, 1);
35488 arg2 = CALL_EXPR_ARG (exp, 2);
35490 op0 = expand_normal (arg0);
35491 op1 = expand_normal (arg1);
35492 op2 = expand_normal (arg2);
35494 if (!register_operand (op0, Pmode))
35495 op0 = ix86_zero_extend_to_Pmode (op0);
35496 if (!register_operand (op1, BNDmode))
35497 op1 = copy_to_mode_reg (BNDmode, op1);
35498 if (!register_operand (op2, Pmode))
35499 op2 = ix86_zero_extend_to_Pmode (op2);
35501 emit_insn (BNDmode == BND64mode
35502 ? gen_bnd64_stx (op2, op0, op1)
35503 : gen_bnd32_stx (op2, op0, op1));
35504 return 0;
35506 case IX86_BUILTIN_BNDLDX:
35507 if (!target
35508 || GET_MODE (target) != BNDmode
35509 || !register_operand (target, BNDmode))
35510 target = gen_reg_rtx (BNDmode);
35512 arg0 = CALL_EXPR_ARG (exp, 0);
35513 arg1 = CALL_EXPR_ARG (exp, 1);
35515 op0 = expand_normal (arg0);
35516 op1 = expand_normal (arg1);
35518 if (!register_operand (op0, Pmode))
35519 op0 = ix86_zero_extend_to_Pmode (op0);
35520 if (!register_operand (op1, Pmode))
35521 op1 = ix86_zero_extend_to_Pmode (op1);
35523 emit_insn (BNDmode == BND64mode
35524 ? gen_bnd64_ldx (target, op0, op1)
35525 : gen_bnd32_ldx (target, op0, op1));
35526 return target;
35528 case IX86_BUILTIN_BNDCL:
35529 arg0 = CALL_EXPR_ARG (exp, 0);
35530 arg1 = CALL_EXPR_ARG (exp, 1);
35532 op0 = expand_normal (arg0);
35533 op1 = expand_normal (arg1);
35535 if (!register_operand (op0, Pmode))
35536 op0 = ix86_zero_extend_to_Pmode (op0);
35537 if (!register_operand (op1, BNDmode))
35538 op1 = copy_to_mode_reg (BNDmode, op1);
35540 emit_insn (BNDmode == BND64mode
35541 ? gen_bnd64_cl (op1, op0)
35542 : gen_bnd32_cl (op1, op0));
35543 return 0;
35545 case IX86_BUILTIN_BNDCU:
35546 arg0 = CALL_EXPR_ARG (exp, 0);
35547 arg1 = CALL_EXPR_ARG (exp, 1);
35549 op0 = expand_normal (arg0);
35550 op1 = expand_normal (arg1);
35552 if (!register_operand (op0, Pmode))
35553 op0 = ix86_zero_extend_to_Pmode (op0);
35554 if (!register_operand (op1, BNDmode))
35555 op1 = copy_to_mode_reg (BNDmode, op1);
35557 emit_insn (BNDmode == BND64mode
35558 ? gen_bnd64_cu (op1, op0)
35559 : gen_bnd32_cu (op1, op0));
35560 return 0;
35562 case IX86_BUILTIN_BNDRET:
35563 arg0 = CALL_EXPR_ARG (exp, 0);
35564 target = chkp_get_rtl_bounds (arg0);
35566 /* If no bounds were specified for the returned value,
35567 then use INIT bounds. This usually happens when
35568 some built-in function is expanded. */
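      /* Descriptive note: INIT bounds cover the whole address space, i.e.
	 lower bound 0 and upper bound all ones, which is what the bnd*_mk
	 sequence below builds from T1 = 0 and T2 = -1 (the size - 1
	 convention used for bndmk above).  */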
35569 if (!target)
35571 rtx t1 = gen_reg_rtx (Pmode);
35572 rtx t2 = gen_reg_rtx (Pmode);
35573 target = gen_reg_rtx (BNDmode);
35574 emit_move_insn (t1, const0_rtx);
35575 emit_move_insn (t2, constm1_rtx);
35576 emit_insn (BNDmode == BND64mode
35577 ? gen_bnd64_mk (target, t1, t2)
35578 : gen_bnd32_mk (target, t1, t2));
35581 gcc_assert (target && REG_P (target));
35582 return target;
35584 case IX86_BUILTIN_BNDNARROW:
35586 rtx m1, m1h1, m1h2, lb, ub, t1;
35588 /* Return value and lb. */
35589 arg0 = CALL_EXPR_ARG (exp, 0);
35590 /* Bounds. */
35591 arg1 = CALL_EXPR_ARG (exp, 1);
35592 /* Size. */
35593 arg2 = CALL_EXPR_ARG (exp, 2);
35595 lb = expand_normal (arg0);
35596 op1 = expand_normal (arg1);
35597 op2 = expand_normal (arg2);
35599 /* Size was passed but we need to use (size - 1) as for bndmk. */
35600 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
35601 NULL_RTX, 1, OPTAB_DIRECT);
35603 /* Add LB to size and invert to get UB. */
35604 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
35605 op2, 1, OPTAB_DIRECT);
35606 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
35608 if (!register_operand (lb, Pmode))
35609 lb = ix86_zero_extend_to_Pmode (lb);
35610 if (!register_operand (ub, Pmode))
35611 ub = ix86_zero_extend_to_Pmode (ub);
35613 /* We need to move bounds to memory before any computations. */
35614 if (MEM_P (op1))
35615 m1 = op1;
35616 else
35618 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
35619 emit_move_insn (m1, op1);
35622 /* Generate mem expression to be used for access to LB and UB. */
35623 m1h1 = adjust_address (m1, Pmode, 0);
35624 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
35626 t1 = gen_reg_rtx (Pmode);
35628 /* Compute LB. */
35629 emit_move_insn (t1, m1h1);
35630 ix86_emit_move_max (t1, lb);
35631 emit_move_insn (m1h1, t1);
35633 /* Compute UB. UB is stored in 1's complement form. Therefore
35634 we also use max here. */
35635 emit_move_insn (t1, m1h2);
35636 ix86_emit_move_max (t1, ub);
35637 emit_move_insn (m1h2, t1);
35639 op2 = gen_reg_rtx (BNDmode);
35640 emit_move_insn (op2, m1);
35642 return chkp_join_splitted_slot (lb, op2);
35645 case IX86_BUILTIN_BNDINT:
35647 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
35649 if (!target
35650 || GET_MODE (target) != BNDmode
35651 || !register_operand (target, BNDmode))
35652 target = gen_reg_rtx (BNDmode);
35654 arg0 = CALL_EXPR_ARG (exp, 0);
35655 arg1 = CALL_EXPR_ARG (exp, 1);
35657 op0 = expand_normal (arg0);
35658 op1 = expand_normal (arg1);
35660 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
35661 rh1 = adjust_address (res, Pmode, 0);
35662 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
35664 /* Put first bounds to temporaries. */
35665 lb1 = gen_reg_rtx (Pmode);
35666 ub1 = gen_reg_rtx (Pmode);
35667 if (MEM_P (op0))
35669 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
35670 emit_move_insn (ub1, adjust_address (op0, Pmode,
35671 GET_MODE_SIZE (Pmode)));
35673 else
35675 emit_move_insn (res, op0);
35676 emit_move_insn (lb1, rh1);
35677 emit_move_insn (ub1, rh2);
35680 /* Put second bounds to temporaries. */
35681 lb2 = gen_reg_rtx (Pmode);
35682 ub2 = gen_reg_rtx (Pmode);
35683 if (MEM_P (op1))
35685 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
35686 emit_move_insn (ub2, adjust_address (op1, Pmode,
35687 GET_MODE_SIZE (Pmode)));
35689 else
35691 emit_move_insn (res, op1);
35692 emit_move_insn (lb2, rh1);
35693 emit_move_insn (ub2, rh2);
35696 /* Compute LB. */
35697 ix86_emit_move_max (lb1, lb2);
35698 emit_move_insn (rh1, lb1);
35700 /* Compute UB. UB is stored in 1's complement form. Therefore
35701 we also use max here. */
35702 ix86_emit_move_max (ub1, ub2);
35703 emit_move_insn (rh2, ub1);
35705 emit_move_insn (target, res);
35707 return target;
35710 case IX86_BUILTIN_SIZEOF:
35712 tree name;
35713 rtx symbol;
35715 if (!target
35716 || GET_MODE (target) != Pmode
35717 || !register_operand (target, Pmode))
35718 target = gen_reg_rtx (Pmode);
35720 arg0 = CALL_EXPR_ARG (exp, 0);
35721 gcc_assert (VAR_P (arg0));
35723 name = DECL_ASSEMBLER_NAME (arg0);
35724 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
35726 emit_insn (Pmode == SImode
35727 ? gen_move_size_reloc_si (target, symbol)
35728 : gen_move_size_reloc_di (target, symbol));
35730 return target;
35733 case IX86_BUILTIN_BNDLOWER:
35735 rtx mem, hmem;
35737 if (!target
35738 || GET_MODE (target) != Pmode
35739 || !register_operand (target, Pmode))
35740 target = gen_reg_rtx (Pmode);
35742 arg0 = CALL_EXPR_ARG (exp, 0);
35743 op0 = expand_normal (arg0);
35745 /* We need to move bounds to memory first. */
35746 if (MEM_P (op0))
35747 mem = op0;
35748 else
35750 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35751 emit_move_insn (mem, op0);
35754 /* Generate mem expression to access LB and load it. */
35755 hmem = adjust_address (mem, Pmode, 0);
35756 emit_move_insn (target, hmem);
35758 return target;
35761 case IX86_BUILTIN_BNDUPPER:
35763 rtx mem, hmem, res;
35765 if (!target
35766 || GET_MODE (target) != Pmode
35767 || !register_operand (target, Pmode))
35768 target = gen_reg_rtx (Pmode);
35770 arg0 = CALL_EXPR_ARG (exp, 0);
35771 op0 = expand_normal (arg0);
35773 /* We need to move bounds to memory first. */
35774 if (MEM_P (op0))
35775 mem = op0;
35776 else
35778 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35779 emit_move_insn (mem, op0);
35782 /* Generate mem expression to access UB. */
35783 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
35785 /* We need to invert all bits of UB. */
35786 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
35788 if (res != target)
35789 emit_move_insn (target, res);
35791 return target;
35794 case IX86_BUILTIN_MASKMOVQ:
35795 case IX86_BUILTIN_MASKMOVDQU:
35796 icode = (fcode == IX86_BUILTIN_MASKMOVQ
35797 ? CODE_FOR_mmx_maskmovq
35798 : CODE_FOR_sse2_maskmovdqu);
35799 /* Note the arg order is different from the operand order. */
35800 arg1 = CALL_EXPR_ARG (exp, 0);
35801 arg2 = CALL_EXPR_ARG (exp, 1);
35802 arg0 = CALL_EXPR_ARG (exp, 2);
35803 op0 = expand_normal (arg0);
35804 op1 = expand_normal (arg1);
35805 op2 = expand_normal (arg2);
35806 mode0 = insn_data[icode].operand[0].mode;
35807 mode1 = insn_data[icode].operand[1].mode;
35808 mode2 = insn_data[icode].operand[2].mode;
35810 op0 = ix86_zero_extend_to_Pmode (op0);
35811 op0 = gen_rtx_MEM (mode1, op0);
35813 if (!insn_data[icode].operand[0].predicate (op0, mode0))
35814 op0 = copy_to_mode_reg (mode0, op0);
35815 if (!insn_data[icode].operand[1].predicate (op1, mode1))
35816 op1 = copy_to_mode_reg (mode1, op1);
35817 if (!insn_data[icode].operand[2].predicate (op2, mode2))
35818 op2 = copy_to_mode_reg (mode2, op2);
35819 pat = GEN_FCN (icode) (op0, op1, op2);
35820 if (! pat)
35821 return 0;
35822 emit_insn (pat);
35823 return 0;
35825 case IX86_BUILTIN_LDMXCSR:
35826 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
35827 target = assign_386_stack_local (SImode, SLOT_TEMP);
35828 emit_move_insn (target, op0);
35829 emit_insn (gen_sse_ldmxcsr (target));
35830 return 0;
35832 case IX86_BUILTIN_STMXCSR:
35833 target = assign_386_stack_local (SImode, SLOT_TEMP);
35834 emit_insn (gen_sse_stmxcsr (target));
35835 return copy_to_mode_reg (SImode, target);
35837 case IX86_BUILTIN_CLFLUSH:
35838 arg0 = CALL_EXPR_ARG (exp, 0);
35839 op0 = expand_normal (arg0);
35840 icode = CODE_FOR_sse2_clflush;
35841 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35842 op0 = ix86_zero_extend_to_Pmode (op0);
35844 emit_insn (gen_sse2_clflush (op0));
35845 return 0;
35847 case IX86_BUILTIN_CLWB:
35848 arg0 = CALL_EXPR_ARG (exp, 0);
35849 op0 = expand_normal (arg0);
35850 icode = CODE_FOR_clwb;
35851 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35852 op0 = ix86_zero_extend_to_Pmode (op0);
35854 emit_insn (gen_clwb (op0));
35855 return 0;
35857 case IX86_BUILTIN_CLFLUSHOPT:
35858 arg0 = CALL_EXPR_ARG (exp, 0);
35859 op0 = expand_normal (arg0);
35860 icode = CODE_FOR_clflushopt;
35861 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35862 op0 = ix86_zero_extend_to_Pmode (op0);
35864 emit_insn (gen_clflushopt (op0));
35865 return 0;
35867 case IX86_BUILTIN_MONITOR:
35868 case IX86_BUILTIN_MONITORX:
35869 arg0 = CALL_EXPR_ARG (exp, 0);
35870 arg1 = CALL_EXPR_ARG (exp, 1);
35871 arg2 = CALL_EXPR_ARG (exp, 2);
35872 op0 = expand_normal (arg0);
35873 op1 = expand_normal (arg1);
35874 op2 = expand_normal (arg2);
35875 if (!REG_P (op0))
35876 op0 = ix86_zero_extend_to_Pmode (op0);
35877 if (!REG_P (op1))
35878 op1 = copy_to_mode_reg (SImode, op1);
35879 if (!REG_P (op2))
35880 op2 = copy_to_mode_reg (SImode, op2);
35882 emit_insn (fcode == IX86_BUILTIN_MONITOR
35883 ? ix86_gen_monitor (op0, op1, op2)
35884 : ix86_gen_monitorx (op0, op1, op2));
35885 return 0;
35887 case IX86_BUILTIN_MWAIT:
35888 arg0 = CALL_EXPR_ARG (exp, 0);
35889 arg1 = CALL_EXPR_ARG (exp, 1);
35890 op0 = expand_normal (arg0);
35891 op1 = expand_normal (arg1);
35892 if (!REG_P (op0))
35893 op0 = copy_to_mode_reg (SImode, op0);
35894 if (!REG_P (op1))
35895 op1 = copy_to_mode_reg (SImode, op1);
35896 emit_insn (gen_sse3_mwait (op0, op1));
35897 return 0;
35899 case IX86_BUILTIN_MWAITX:
35900 arg0 = CALL_EXPR_ARG (exp, 0);
35901 arg1 = CALL_EXPR_ARG (exp, 1);
35902 arg2 = CALL_EXPR_ARG (exp, 2);
35903 op0 = expand_normal (arg0);
35904 op1 = expand_normal (arg1);
35905 op2 = expand_normal (arg2);
35906 if (!REG_P (op0))
35907 op0 = copy_to_mode_reg (SImode, op0);
35908 if (!REG_P (op1))
35909 op1 = copy_to_mode_reg (SImode, op1);
35910 if (!REG_P (op2))
35911 op2 = copy_to_mode_reg (SImode, op2);
35912 emit_insn (gen_mwaitx (op0, op1, op2));
35913 return 0;
35915 case IX86_BUILTIN_CLZERO:
35916 arg0 = CALL_EXPR_ARG (exp, 0);
35917 op0 = expand_normal (arg0);
35918 if (!REG_P (op0))
35919 op0 = ix86_zero_extend_to_Pmode (op0);
35920 emit_insn (ix86_gen_clzero (op0));
35921 return 0;
35923 case IX86_BUILTIN_VEC_INIT_V2SI:
35924 case IX86_BUILTIN_VEC_INIT_V4HI:
35925 case IX86_BUILTIN_VEC_INIT_V8QI:
35926 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
35928 case IX86_BUILTIN_VEC_EXT_V2DF:
35929 case IX86_BUILTIN_VEC_EXT_V2DI:
35930 case IX86_BUILTIN_VEC_EXT_V4SF:
35931 case IX86_BUILTIN_VEC_EXT_V4SI:
35932 case IX86_BUILTIN_VEC_EXT_V8HI:
35933 case IX86_BUILTIN_VEC_EXT_V2SI:
35934 case IX86_BUILTIN_VEC_EXT_V4HI:
35935 case IX86_BUILTIN_VEC_EXT_V16QI:
35936 return ix86_expand_vec_ext_builtin (exp, target);
35938 case IX86_BUILTIN_VEC_SET_V2DI:
35939 case IX86_BUILTIN_VEC_SET_V4SF:
35940 case IX86_BUILTIN_VEC_SET_V4SI:
35941 case IX86_BUILTIN_VEC_SET_V8HI:
35942 case IX86_BUILTIN_VEC_SET_V4HI:
35943 case IX86_BUILTIN_VEC_SET_V16QI:
35944 return ix86_expand_vec_set_builtin (exp);
35946 case IX86_BUILTIN_NANQ:
35947 case IX86_BUILTIN_NANSQ:
35948 return expand_call (exp, target, ignore);
35950 case IX86_BUILTIN_RDPMC:
35951 case IX86_BUILTIN_RDTSC:
35952 case IX86_BUILTIN_RDTSCP:
35953 case IX86_BUILTIN_XGETBV:
35955 op0 = gen_reg_rtx (DImode);
35956 op1 = gen_reg_rtx (DImode);
35958 if (fcode == IX86_BUILTIN_RDPMC)
35960 arg0 = CALL_EXPR_ARG (exp, 0);
35961 op2 = expand_normal (arg0);
35962 if (!register_operand (op2, SImode))
35963 op2 = copy_to_mode_reg (SImode, op2);
35965 insn = (TARGET_64BIT
35966 ? gen_rdpmc_rex64 (op0, op1, op2)
35967 : gen_rdpmc (op0, op2));
35968 emit_insn (insn);
35970 else if (fcode == IX86_BUILTIN_XGETBV)
35972 arg0 = CALL_EXPR_ARG (exp, 0);
35973 op2 = expand_normal (arg0);
35974 if (!register_operand (op2, SImode))
35975 op2 = copy_to_mode_reg (SImode, op2);
35977 insn = (TARGET_64BIT
35978 ? gen_xgetbv_rex64 (op0, op1, op2)
35979 : gen_xgetbv (op0, op2));
35980 emit_insn (insn);
35982 else if (fcode == IX86_BUILTIN_RDTSC)
35984 insn = (TARGET_64BIT
35985 ? gen_rdtsc_rex64 (op0, op1)
35986 : gen_rdtsc (op0));
35987 emit_insn (insn);
35989 else
35991 op2 = gen_reg_rtx (SImode);
35993 insn = (TARGET_64BIT
35994 ? gen_rdtscp_rex64 (op0, op1, op2)
35995 : gen_rdtscp (op0, op2));
35996 emit_insn (insn);
35998 arg0 = CALL_EXPR_ARG (exp, 0);
35999 op4 = expand_normal (arg0);
36000 if (!address_operand (op4, VOIDmode))
36002 op4 = convert_memory_address (Pmode, op4);
36003 op4 = copy_addr_to_reg (op4);
36005 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
36008 if (target == 0)
36010 /* mode is VOIDmode if __builtin_rd* has been called
36011 without an lhs. */
36012 if (mode == VOIDmode)
36013 return target;
36014 target = gen_reg_rtx (mode);
36017 if (TARGET_64BIT)
36019 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
36020 op1, 1, OPTAB_DIRECT);
36021 op0 = expand_simple_binop (DImode, IOR, op0, op1,
36022 op0, 1, OPTAB_DIRECT);
36025 emit_move_insn (target, op0);
36026 return target;
36028 case IX86_BUILTIN_FXSAVE:
36029 case IX86_BUILTIN_FXRSTOR:
36030 case IX86_BUILTIN_FXSAVE64:
36031 case IX86_BUILTIN_FXRSTOR64:
36032 case IX86_BUILTIN_FNSTENV:
36033 case IX86_BUILTIN_FLDENV:
36034 mode0 = BLKmode;
36035 switch (fcode)
36037 case IX86_BUILTIN_FXSAVE:
36038 icode = CODE_FOR_fxsave;
36039 break;
36040 case IX86_BUILTIN_FXRSTOR:
36041 icode = CODE_FOR_fxrstor;
36042 break;
36043 case IX86_BUILTIN_FXSAVE64:
36044 icode = CODE_FOR_fxsave64;
36045 break;
36046 case IX86_BUILTIN_FXRSTOR64:
36047 icode = CODE_FOR_fxrstor64;
36048 break;
36049 case IX86_BUILTIN_FNSTENV:
36050 icode = CODE_FOR_fnstenv;
36051 break;
36052 case IX86_BUILTIN_FLDENV:
36053 icode = CODE_FOR_fldenv;
36054 break;
36055 default:
36056 gcc_unreachable ();
36059 arg0 = CALL_EXPR_ARG (exp, 0);
36060 op0 = expand_normal (arg0);
36062 if (!address_operand (op0, VOIDmode))
36064 op0 = convert_memory_address (Pmode, op0);
36065 op0 = copy_addr_to_reg (op0);
36067 op0 = gen_rtx_MEM (mode0, op0);
36069 pat = GEN_FCN (icode) (op0);
36070 if (pat)
36071 emit_insn (pat);
36072 return 0;
36074 case IX86_BUILTIN_XSETBV:
36075 arg0 = CALL_EXPR_ARG (exp, 0);
36076 arg1 = CALL_EXPR_ARG (exp, 1);
36077 op0 = expand_normal (arg0);
36078 op1 = expand_normal (arg1);
36080 if (!REG_P (op0))
36081 op0 = copy_to_mode_reg (SImode, op0);
36083 if (TARGET_64BIT)
36085 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36086 NULL, 1, OPTAB_DIRECT);
36088 op2 = gen_lowpart (SImode, op2);
36089 op1 = gen_lowpart (SImode, op1);
36090 if (!REG_P (op1))
36091 op1 = copy_to_mode_reg (SImode, op1);
36092 if (!REG_P (op2))
36093 op2 = copy_to_mode_reg (SImode, op2);
36094 icode = CODE_FOR_xsetbv_rex64;
36095 pat = GEN_FCN (icode) (op0, op1, op2);
36097 else
36099 if (!REG_P (op1))
36100 op1 = copy_to_mode_reg (DImode, op1);
36101 icode = CODE_FOR_xsetbv;
36102 pat = GEN_FCN (icode) (op0, op1);
36104 if (pat)
36105 emit_insn (pat);
36106 return 0;
36108 case IX86_BUILTIN_XSAVE:
36109 case IX86_BUILTIN_XRSTOR:
36110 case IX86_BUILTIN_XSAVE64:
36111 case IX86_BUILTIN_XRSTOR64:
36112 case IX86_BUILTIN_XSAVEOPT:
36113 case IX86_BUILTIN_XSAVEOPT64:
36114 case IX86_BUILTIN_XSAVES:
36115 case IX86_BUILTIN_XRSTORS:
36116 case IX86_BUILTIN_XSAVES64:
36117 case IX86_BUILTIN_XRSTORS64:
36118 case IX86_BUILTIN_XSAVEC:
36119 case IX86_BUILTIN_XSAVEC64:
36120 arg0 = CALL_EXPR_ARG (exp, 0);
36121 arg1 = CALL_EXPR_ARG (exp, 1);
36122 op0 = expand_normal (arg0);
36123 op1 = expand_normal (arg1);
36125 if (!address_operand (op0, VOIDmode))
36127 op0 = convert_memory_address (Pmode, op0);
36128 op0 = copy_addr_to_reg (op0);
36130 op0 = gen_rtx_MEM (BLKmode, op0);
36132 op1 = force_reg (DImode, op1);
36134 if (TARGET_64BIT)
36136 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36137 NULL, 1, OPTAB_DIRECT);
36138 switch (fcode)
36140 case IX86_BUILTIN_XSAVE:
36141 icode = CODE_FOR_xsave_rex64;
36142 break;
36143 case IX86_BUILTIN_XRSTOR:
36144 icode = CODE_FOR_xrstor_rex64;
36145 break;
36146 case IX86_BUILTIN_XSAVE64:
36147 icode = CODE_FOR_xsave64;
36148 break;
36149 case IX86_BUILTIN_XRSTOR64:
36150 icode = CODE_FOR_xrstor64;
36151 break;
36152 case IX86_BUILTIN_XSAVEOPT:
36153 icode = CODE_FOR_xsaveopt_rex64;
36154 break;
36155 case IX86_BUILTIN_XSAVEOPT64:
36156 icode = CODE_FOR_xsaveopt64;
36157 break;
36158 case IX86_BUILTIN_XSAVES:
36159 icode = CODE_FOR_xsaves_rex64;
36160 break;
36161 case IX86_BUILTIN_XRSTORS:
36162 icode = CODE_FOR_xrstors_rex64;
36163 break;
36164 case IX86_BUILTIN_XSAVES64:
36165 icode = CODE_FOR_xsaves64;
36166 break;
36167 case IX86_BUILTIN_XRSTORS64:
36168 icode = CODE_FOR_xrstors64;
36169 break;
36170 case IX86_BUILTIN_XSAVEC:
36171 icode = CODE_FOR_xsavec_rex64;
36172 break;
36173 case IX86_BUILTIN_XSAVEC64:
36174 icode = CODE_FOR_xsavec64;
36175 break;
36176 default:
36177 gcc_unreachable ();
36180 op2 = gen_lowpart (SImode, op2);
36181 op1 = gen_lowpart (SImode, op1);
36182 pat = GEN_FCN (icode) (op0, op1, op2);
36184 else
36186 switch (fcode)
36188 case IX86_BUILTIN_XSAVE:
36189 icode = CODE_FOR_xsave;
36190 break;
36191 case IX86_BUILTIN_XRSTOR:
36192 icode = CODE_FOR_xrstor;
36193 break;
36194 case IX86_BUILTIN_XSAVEOPT:
36195 icode = CODE_FOR_xsaveopt;
36196 break;
36197 case IX86_BUILTIN_XSAVES:
36198 icode = CODE_FOR_xsaves;
36199 break;
36200 case IX86_BUILTIN_XRSTORS:
36201 icode = CODE_FOR_xrstors;
36202 break;
36203 case IX86_BUILTIN_XSAVEC:
36204 icode = CODE_FOR_xsavec;
36205 break;
36206 default:
36207 gcc_unreachable ();
36209 pat = GEN_FCN (icode) (op0, op1);
36212 if (pat)
36213 emit_insn (pat);
36214 return 0;
36216 case IX86_BUILTIN_LLWPCB:
36217 arg0 = CALL_EXPR_ARG (exp, 0);
36218 op0 = expand_normal (arg0);
36219 icode = CODE_FOR_lwp_llwpcb;
36220 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36221 op0 = ix86_zero_extend_to_Pmode (op0);
36222 emit_insn (gen_lwp_llwpcb (op0));
36223 return 0;
36225 case IX86_BUILTIN_SLWPCB:
36226 icode = CODE_FOR_lwp_slwpcb;
36227 if (!target
36228 || !insn_data[icode].operand[0].predicate (target, Pmode))
36229 target = gen_reg_rtx (Pmode);
36230 emit_insn (gen_lwp_slwpcb (target));
36231 return target;
36233 case IX86_BUILTIN_BEXTRI32:
36234 case IX86_BUILTIN_BEXTRI64:
36235 arg0 = CALL_EXPR_ARG (exp, 0);
36236 arg1 = CALL_EXPR_ARG (exp, 1);
36237 op0 = expand_normal (arg0);
36238 op1 = expand_normal (arg1);
36239 icode = (fcode == IX86_BUILTIN_BEXTRI32
36240 ? CODE_FOR_tbm_bextri_si
36241 : CODE_FOR_tbm_bextri_di);
36242 if (!CONST_INT_P (op1))
36244 error ("last argument must be an immediate");
36245 return const0_rtx;
36247 else
36249 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
36250 unsigned char lsb_index = INTVAL (op1) & 0xFF;
36251 op1 = GEN_INT (length);
36252 op2 = GEN_INT (lsb_index);
36253 pat = GEN_FCN (icode) (target, op0, op1, op2);
36254 if (pat)
36255 emit_insn (pat);
36256 return target;
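/* Worked example for the immediate decoding above: the control word packs
   the starting bit position in bits [7:0] and the field length in bits
   [15:8], so a control value of 0x0804 extracts an 8-bit field starting at
   bit 4.  */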
36259 case IX86_BUILTIN_RDRAND16_STEP:
36260 icode = CODE_FOR_rdrandhi_1;
36261 mode0 = HImode;
36262 goto rdrand_step;
36264 case IX86_BUILTIN_RDRAND32_STEP:
36265 icode = CODE_FOR_rdrandsi_1;
36266 mode0 = SImode;
36267 goto rdrand_step;
36269 case IX86_BUILTIN_RDRAND64_STEP:
36270 icode = CODE_FOR_rdranddi_1;
36271 mode0 = DImode;
36273 rdrand_step:
36274 arg0 = CALL_EXPR_ARG (exp, 0);
36275 op1 = expand_normal (arg0);
36276 if (!address_operand (op1, VOIDmode))
36278 op1 = convert_memory_address (Pmode, op1);
36279 op1 = copy_addr_to_reg (op1);
36282 op0 = gen_reg_rtx (mode0);
36283 emit_insn (GEN_FCN (icode) (op0));
36285 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36287 op1 = gen_reg_rtx (SImode);
36288 emit_move_insn (op1, CONST1_RTX (SImode));
36290 /* Emit SImode conditional move. */
36291 if (mode0 == HImode)
36293 if (TARGET_ZERO_EXTEND_WITH_AND
36294 && optimize_function_for_speed_p (cfun))
36296 op2 = force_reg (SImode, const0_rtx);
36298 emit_insn (gen_movstricthi
36299 (gen_lowpart (HImode, op2), op0));
36301 else
36303 op2 = gen_reg_rtx (SImode);
36305 emit_insn (gen_zero_extendhisi2 (op2, op0));
36308 else if (mode0 == SImode)
36309 op2 = op0;
36310 else
36311 op2 = gen_rtx_SUBREG (SImode, op0, 0);
36313 if (target == 0
36314 || !register_operand (target, SImode))
36315 target = gen_reg_rtx (SImode);
36317 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
36318 const0_rtx);
36319 emit_insn (gen_rtx_SET (target,
36320 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
36321 return target;
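/* Illustrative note: the conditional move above selects the constant 1 when
   CF was set (the hardware delivered a valid value) and the zeroed
   destination value otherwise, so the builtin returns nonzero exactly on
   success.  A typical retry loop, assuming the <immintrin.h> spelling
   _rdrand32_step, is:

     unsigned int r;
     while (!_rdrand32_step (&r))
       ;
 */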
36323 case IX86_BUILTIN_RDSEED16_STEP:
36324 icode = CODE_FOR_rdseedhi_1;
36325 mode0 = HImode;
36326 goto rdseed_step;
36328 case IX86_BUILTIN_RDSEED32_STEP:
36329 icode = CODE_FOR_rdseedsi_1;
36330 mode0 = SImode;
36331 goto rdseed_step;
36333 case IX86_BUILTIN_RDSEED64_STEP:
36334 icode = CODE_FOR_rdseeddi_1;
36335 mode0 = DImode;
36337 rdseed_step:
36338 arg0 = CALL_EXPR_ARG (exp, 0);
36339 op1 = expand_normal (arg0);
36340 if (!address_operand (op1, VOIDmode))
36342 op1 = convert_memory_address (Pmode, op1);
36343 op1 = copy_addr_to_reg (op1);
36346 op0 = gen_reg_rtx (mode0);
36347 emit_insn (GEN_FCN (icode) (op0));
36349 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36351 op2 = gen_reg_rtx (QImode);
36353 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
36354 const0_rtx);
36355 emit_insn (gen_rtx_SET (op2, pat));
36357 if (target == 0
36358 || !register_operand (target, SImode))
36359 target = gen_reg_rtx (SImode);
36361 emit_insn (gen_zero_extendqisi2 (target, op2));
36362 return target;
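/* Unlike the RDRAND expansion above, the carry flag is captured here
   directly with a QImode setcc and then zero-extended, so the builtin again
   returns 1 on success and 0 on failure.  */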
36364 case IX86_BUILTIN_SBB32:
36365 icode = CODE_FOR_subborrowsi;
36366 icode2 = CODE_FOR_subborrowsi_0;
36367 mode0 = SImode;
36368 mode1 = DImode;
36369 mode2 = CCmode;
36370 goto handlecarry;
36372 case IX86_BUILTIN_SBB64:
36373 icode = CODE_FOR_subborrowdi;
36374 icode2 = CODE_FOR_subborrowdi_0;
36375 mode0 = DImode;
36376 mode1 = TImode;
36377 mode2 = CCmode;
36378 goto handlecarry;
36380 case IX86_BUILTIN_ADDCARRYX32:
36381 icode = CODE_FOR_addcarrysi;
36382 icode2 = CODE_FOR_addcarrysi_0;
36383 mode0 = SImode;
36384 mode1 = DImode;
36385 mode2 = CCCmode;
36386 goto handlecarry;
36388 case IX86_BUILTIN_ADDCARRYX64:
36389 icode = CODE_FOR_addcarrydi;
36390 icode2 = CODE_FOR_addcarrydi_0;
36391 mode0 = DImode;
36392 mode1 = TImode;
36393 mode2 = CCCmode;
36395 handlecarry:
36396 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
36397 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
36398 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
36399 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
36401 op1 = expand_normal (arg0);
36402 if (!integer_zerop (arg0))
36403 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
36405 op2 = expand_normal (arg1);
36406 if (!register_operand (op2, mode0))
36407 op2 = copy_to_mode_reg (mode0, op2);
36409 op3 = expand_normal (arg2);
36410 if (!register_operand (op3, mode0))
36411 op3 = copy_to_mode_reg (mode0, op3);
36413 op4 = expand_normal (arg3);
36414 if (!address_operand (op4, VOIDmode))
36416 op4 = convert_memory_address (Pmode, op4);
36417 op4 = copy_addr_to_reg (op4);
36420 op0 = gen_reg_rtx (mode0);
36421 if (integer_zerop (arg0))
36423 /* If arg0 is 0, optimize right away into an add or sub
36424 instruction that sets the CCCmode flags. */
36425 op1 = gen_rtx_REG (mode2, FLAGS_REG);
36426 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
36428 else
36430 /* Generate CF from input operand. */
36431 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
36433 /* Generate instruction that consumes CF. */
36434 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
36435 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
36436 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
36437 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
36440 /* Return current CF value. */
36441 if (target == 0)
36442 target = gen_reg_rtx (QImode);
36444 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
36445 emit_insn (gen_rtx_SET (target, pat));
36447 /* Store the result. */
36448 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
36450 return target;
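/* Worked illustration for the expansion above, assuming the <immintrin.h>
   spellings: _addcarry_u32 (c_in, a, b, &out) stores a + b + c_in in *out
   and returns the carry out, and _subborrow_u32 is the analogous
   subtract-with-borrow.  The setcc just above returns that carry from the
   flags, and the final move stores the sum through the pointer argument.  */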
36452 case IX86_BUILTIN_READ_FLAGS:
36453 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
36455 if (optimize
36456 || target == NULL_RTX
36457 || !nonimmediate_operand (target, word_mode)
36458 || GET_MODE (target) != word_mode)
36459 target = gen_reg_rtx (word_mode);
36461 emit_insn (gen_pop (target));
36462 return target;
36464 case IX86_BUILTIN_WRITE_FLAGS:
36466 arg0 = CALL_EXPR_ARG (exp, 0);
36467 op0 = expand_normal (arg0);
36468 if (!general_no_elim_operand (op0, word_mode))
36469 op0 = copy_to_mode_reg (word_mode, op0);
36471 emit_insn (gen_push (op0));
36472 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
36473 return 0;
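/* Illustrative note: the two builtins above simply push or pop the flags
   register through the stack, which is how the user-level __readeflags and
   __writeeflags wrappers (an assumption about the <x86intrin.h> spelling)
   read and write EFLAGS/RFLAGS.  */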
36475 case IX86_BUILTIN_KTESTC8:
36476 icode = CODE_FOR_ktestqi;
36477 mode3 = CCCmode;
36478 goto kortest;
36480 case IX86_BUILTIN_KTESTZ8:
36481 icode = CODE_FOR_ktestqi;
36482 mode3 = CCZmode;
36483 goto kortest;
36485 case IX86_BUILTIN_KTESTC16:
36486 icode = CODE_FOR_ktesthi;
36487 mode3 = CCCmode;
36488 goto kortest;
36490 case IX86_BUILTIN_KTESTZ16:
36491 icode = CODE_FOR_ktesthi;
36492 mode3 = CCZmode;
36493 goto kortest;
36495 case IX86_BUILTIN_KTESTC32:
36496 icode = CODE_FOR_ktestsi;
36497 mode3 = CCCmode;
36498 goto kortest;
36500 case IX86_BUILTIN_KTESTZ32:
36501 icode = CODE_FOR_ktestsi;
36502 mode3 = CCZmode;
36503 goto kortest;
36505 case IX86_BUILTIN_KTESTC64:
36506 icode = CODE_FOR_ktestdi;
36507 mode3 = CCCmode;
36508 goto kortest;
36510 case IX86_BUILTIN_KTESTZ64:
36511 icode = CODE_FOR_ktestdi;
36512 mode3 = CCZmode;
36513 goto kortest;
36515 case IX86_BUILTIN_KORTESTC8:
36516 icode = CODE_FOR_kortestqi;
36517 mode3 = CCCmode;
36518 goto kortest;
36520 case IX86_BUILTIN_KORTESTZ8:
36521 icode = CODE_FOR_kortestqi;
36522 mode3 = CCZmode;
36523 goto kortest;
36525 case IX86_BUILTIN_KORTESTC16:
36526 icode = CODE_FOR_kortesthi;
36527 mode3 = CCCmode;
36528 goto kortest;
36530 case IX86_BUILTIN_KORTESTZ16:
36531 icode = CODE_FOR_kortesthi;
36532 mode3 = CCZmode;
36533 goto kortest;
36535 case IX86_BUILTIN_KORTESTC32:
36536 icode = CODE_FOR_kortestsi;
36537 mode3 = CCCmode;
36538 goto kortest;
36540 case IX86_BUILTIN_KORTESTZ32:
36541 icode = CODE_FOR_kortestsi;
36542 mode3 = CCZmode;
36543 goto kortest;
36545 case IX86_BUILTIN_KORTESTC64:
36546 icode = CODE_FOR_kortestdi;
36547 mode3 = CCCmode;
36548 goto kortest;
36550 case IX86_BUILTIN_KORTESTZ64:
36551 icode = CODE_FOR_kortestdi;
36552 mode3 = CCZmode;
36554 kortest:
36555 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
36556 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
36557 op0 = expand_normal (arg0);
36558 op1 = expand_normal (arg1);
36560 mode0 = insn_data[icode].operand[0].mode;
36561 mode1 = insn_data[icode].operand[1].mode;
36563 if (GET_MODE (op0) != VOIDmode)
36564 op0 = force_reg (GET_MODE (op0), op0);
36566 op0 = gen_lowpart (mode0, op0);
36568 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36569 op0 = copy_to_mode_reg (mode0, op0);
36571 if (GET_MODE (op1) != VOIDmode)
36572 op1 = force_reg (GET_MODE (op1), op1);
36574 op1 = gen_lowpart (mode1, op1);
36576 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36577 op1 = copy_to_mode_reg (mode1, op1);
36579 target = gen_reg_rtx (QImode);
36581 /* Emit kortest. */
36582 emit_insn (GEN_FCN (icode) (op0, op1));
36583 /* And use setcc to return result from flags. */
36584 ix86_expand_setcc (target, EQ,
36585 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
36586 return target;
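/* Illustrative note: the C-suffixed builtins above return 1 when CF is set
   and the Z-suffixed ones when ZF is set.  For example, assuming the
   <immintrin.h> spelling, _mm512_kortestz (a, b) returns 1 iff
   (a | b) == 0.  */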
36588 case IX86_BUILTIN_GATHERSIV2DF:
36589 icode = CODE_FOR_avx2_gathersiv2df;
36590 goto gather_gen;
36591 case IX86_BUILTIN_GATHERSIV4DF:
36592 icode = CODE_FOR_avx2_gathersiv4df;
36593 goto gather_gen;
36594 case IX86_BUILTIN_GATHERDIV2DF:
36595 icode = CODE_FOR_avx2_gatherdiv2df;
36596 goto gather_gen;
36597 case IX86_BUILTIN_GATHERDIV4DF:
36598 icode = CODE_FOR_avx2_gatherdiv4df;
36599 goto gather_gen;
36600 case IX86_BUILTIN_GATHERSIV4SF:
36601 icode = CODE_FOR_avx2_gathersiv4sf;
36602 goto gather_gen;
36603 case IX86_BUILTIN_GATHERSIV8SF:
36604 icode = CODE_FOR_avx2_gathersiv8sf;
36605 goto gather_gen;
36606 case IX86_BUILTIN_GATHERDIV4SF:
36607 icode = CODE_FOR_avx2_gatherdiv4sf;
36608 goto gather_gen;
36609 case IX86_BUILTIN_GATHERDIV8SF:
36610 icode = CODE_FOR_avx2_gatherdiv8sf;
36611 goto gather_gen;
36612 case IX86_BUILTIN_GATHERSIV2DI:
36613 icode = CODE_FOR_avx2_gathersiv2di;
36614 goto gather_gen;
36615 case IX86_BUILTIN_GATHERSIV4DI:
36616 icode = CODE_FOR_avx2_gathersiv4di;
36617 goto gather_gen;
36618 case IX86_BUILTIN_GATHERDIV2DI:
36619 icode = CODE_FOR_avx2_gatherdiv2di;
36620 goto gather_gen;
36621 case IX86_BUILTIN_GATHERDIV4DI:
36622 icode = CODE_FOR_avx2_gatherdiv4di;
36623 goto gather_gen;
36624 case IX86_BUILTIN_GATHERSIV4SI:
36625 icode = CODE_FOR_avx2_gathersiv4si;
36626 goto gather_gen;
36627 case IX86_BUILTIN_GATHERSIV8SI:
36628 icode = CODE_FOR_avx2_gathersiv8si;
36629 goto gather_gen;
36630 case IX86_BUILTIN_GATHERDIV4SI:
36631 icode = CODE_FOR_avx2_gatherdiv4si;
36632 goto gather_gen;
36633 case IX86_BUILTIN_GATHERDIV8SI:
36634 icode = CODE_FOR_avx2_gatherdiv8si;
36635 goto gather_gen;
36636 case IX86_BUILTIN_GATHERALTSIV4DF:
36637 icode = CODE_FOR_avx2_gathersiv4df;
36638 goto gather_gen;
36639 case IX86_BUILTIN_GATHERALTDIV8SF:
36640 icode = CODE_FOR_avx2_gatherdiv8sf;
36641 goto gather_gen;
36642 case IX86_BUILTIN_GATHERALTSIV4DI:
36643 icode = CODE_FOR_avx2_gathersiv4di;
36644 goto gather_gen;
36645 case IX86_BUILTIN_GATHERALTDIV8SI:
36646 icode = CODE_FOR_avx2_gatherdiv8si;
36647 goto gather_gen;
36648 case IX86_BUILTIN_GATHER3SIV16SF:
36649 icode = CODE_FOR_avx512f_gathersiv16sf;
36650 goto gather_gen;
36651 case IX86_BUILTIN_GATHER3SIV8DF:
36652 icode = CODE_FOR_avx512f_gathersiv8df;
36653 goto gather_gen;
36654 case IX86_BUILTIN_GATHER3DIV16SF:
36655 icode = CODE_FOR_avx512f_gatherdiv16sf;
36656 goto gather_gen;
36657 case IX86_BUILTIN_GATHER3DIV8DF:
36658 icode = CODE_FOR_avx512f_gatherdiv8df;
36659 goto gather_gen;
36660 case IX86_BUILTIN_GATHER3SIV16SI:
36661 icode = CODE_FOR_avx512f_gathersiv16si;
36662 goto gather_gen;
36663 case IX86_BUILTIN_GATHER3SIV8DI:
36664 icode = CODE_FOR_avx512f_gathersiv8di;
36665 goto gather_gen;
36666 case IX86_BUILTIN_GATHER3DIV16SI:
36667 icode = CODE_FOR_avx512f_gatherdiv16si;
36668 goto gather_gen;
36669 case IX86_BUILTIN_GATHER3DIV8DI:
36670 icode = CODE_FOR_avx512f_gatherdiv8di;
36671 goto gather_gen;
36672 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36673 icode = CODE_FOR_avx512f_gathersiv8df;
36674 goto gather_gen;
36675 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36676 icode = CODE_FOR_avx512f_gatherdiv16sf;
36677 goto gather_gen;
36678 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36679 icode = CODE_FOR_avx512f_gathersiv8di;
36680 goto gather_gen;
36681 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36682 icode = CODE_FOR_avx512f_gatherdiv16si;
36683 goto gather_gen;
36684 case IX86_BUILTIN_GATHER3SIV2DF:
36685 icode = CODE_FOR_avx512vl_gathersiv2df;
36686 goto gather_gen;
36687 case IX86_BUILTIN_GATHER3SIV4DF:
36688 icode = CODE_FOR_avx512vl_gathersiv4df;
36689 goto gather_gen;
36690 case IX86_BUILTIN_GATHER3DIV2DF:
36691 icode = CODE_FOR_avx512vl_gatherdiv2df;
36692 goto gather_gen;
36693 case IX86_BUILTIN_GATHER3DIV4DF:
36694 icode = CODE_FOR_avx512vl_gatherdiv4df;
36695 goto gather_gen;
36696 case IX86_BUILTIN_GATHER3SIV4SF:
36697 icode = CODE_FOR_avx512vl_gathersiv4sf;
36698 goto gather_gen;
36699 case IX86_BUILTIN_GATHER3SIV8SF:
36700 icode = CODE_FOR_avx512vl_gathersiv8sf;
36701 goto gather_gen;
36702 case IX86_BUILTIN_GATHER3DIV4SF:
36703 icode = CODE_FOR_avx512vl_gatherdiv4sf;
36704 goto gather_gen;
36705 case IX86_BUILTIN_GATHER3DIV8SF:
36706 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36707 goto gather_gen;
36708 case IX86_BUILTIN_GATHER3SIV2DI:
36709 icode = CODE_FOR_avx512vl_gathersiv2di;
36710 goto gather_gen;
36711 case IX86_BUILTIN_GATHER3SIV4DI:
36712 icode = CODE_FOR_avx512vl_gathersiv4di;
36713 goto gather_gen;
36714 case IX86_BUILTIN_GATHER3DIV2DI:
36715 icode = CODE_FOR_avx512vl_gatherdiv2di;
36716 goto gather_gen;
36717 case IX86_BUILTIN_GATHER3DIV4DI:
36718 icode = CODE_FOR_avx512vl_gatherdiv4di;
36719 goto gather_gen;
36720 case IX86_BUILTIN_GATHER3SIV4SI:
36721 icode = CODE_FOR_avx512vl_gathersiv4si;
36722 goto gather_gen;
36723 case IX86_BUILTIN_GATHER3SIV8SI:
36724 icode = CODE_FOR_avx512vl_gathersiv8si;
36725 goto gather_gen;
36726 case IX86_BUILTIN_GATHER3DIV4SI:
36727 icode = CODE_FOR_avx512vl_gatherdiv4si;
36728 goto gather_gen;
36729 case IX86_BUILTIN_GATHER3DIV8SI:
36730 icode = CODE_FOR_avx512vl_gatherdiv8si;
36731 goto gather_gen;
36732 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36733 icode = CODE_FOR_avx512vl_gathersiv4df;
36734 goto gather_gen;
36735 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36736 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36737 goto gather_gen;
36738 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36739 icode = CODE_FOR_avx512vl_gathersiv4di;
36740 goto gather_gen;
36741 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36742 icode = CODE_FOR_avx512vl_gatherdiv8si;
36743 goto gather_gen;
36744 case IX86_BUILTIN_SCATTERSIV16SF:
36745 icode = CODE_FOR_avx512f_scattersiv16sf;
36746 goto scatter_gen;
36747 case IX86_BUILTIN_SCATTERSIV8DF:
36748 icode = CODE_FOR_avx512f_scattersiv8df;
36749 goto scatter_gen;
36750 case IX86_BUILTIN_SCATTERDIV16SF:
36751 icode = CODE_FOR_avx512f_scatterdiv16sf;
36752 goto scatter_gen;
36753 case IX86_BUILTIN_SCATTERDIV8DF:
36754 icode = CODE_FOR_avx512f_scatterdiv8df;
36755 goto scatter_gen;
36756 case IX86_BUILTIN_SCATTERSIV16SI:
36757 icode = CODE_FOR_avx512f_scattersiv16si;
36758 goto scatter_gen;
36759 case IX86_BUILTIN_SCATTERSIV8DI:
36760 icode = CODE_FOR_avx512f_scattersiv8di;
36761 goto scatter_gen;
36762 case IX86_BUILTIN_SCATTERDIV16SI:
36763 icode = CODE_FOR_avx512f_scatterdiv16si;
36764 goto scatter_gen;
36765 case IX86_BUILTIN_SCATTERDIV8DI:
36766 icode = CODE_FOR_avx512f_scatterdiv8di;
36767 goto scatter_gen;
36768 case IX86_BUILTIN_SCATTERSIV8SF:
36769 icode = CODE_FOR_avx512vl_scattersiv8sf;
36770 goto scatter_gen;
36771 case IX86_BUILTIN_SCATTERSIV4SF:
36772 icode = CODE_FOR_avx512vl_scattersiv4sf;
36773 goto scatter_gen;
36774 case IX86_BUILTIN_SCATTERSIV4DF:
36775 icode = CODE_FOR_avx512vl_scattersiv4df;
36776 goto scatter_gen;
36777 case IX86_BUILTIN_SCATTERSIV2DF:
36778 icode = CODE_FOR_avx512vl_scattersiv2df;
36779 goto scatter_gen;
36780 case IX86_BUILTIN_SCATTERDIV8SF:
36781 icode = CODE_FOR_avx512vl_scatterdiv8sf;
36782 goto scatter_gen;
36783 case IX86_BUILTIN_SCATTERDIV4SF:
36784 icode = CODE_FOR_avx512vl_scatterdiv4sf;
36785 goto scatter_gen;
36786 case IX86_BUILTIN_SCATTERDIV4DF:
36787 icode = CODE_FOR_avx512vl_scatterdiv4df;
36788 goto scatter_gen;
36789 case IX86_BUILTIN_SCATTERDIV2DF:
36790 icode = CODE_FOR_avx512vl_scatterdiv2df;
36791 goto scatter_gen;
36792 case IX86_BUILTIN_SCATTERSIV8SI:
36793 icode = CODE_FOR_avx512vl_scattersiv8si;
36794 goto scatter_gen;
36795 case IX86_BUILTIN_SCATTERSIV4SI:
36796 icode = CODE_FOR_avx512vl_scattersiv4si;
36797 goto scatter_gen;
36798 case IX86_BUILTIN_SCATTERSIV4DI:
36799 icode = CODE_FOR_avx512vl_scattersiv4di;
36800 goto scatter_gen;
36801 case IX86_BUILTIN_SCATTERSIV2DI:
36802 icode = CODE_FOR_avx512vl_scattersiv2di;
36803 goto scatter_gen;
36804 case IX86_BUILTIN_SCATTERDIV8SI:
36805 icode = CODE_FOR_avx512vl_scatterdiv8si;
36806 goto scatter_gen;
36807 case IX86_BUILTIN_SCATTERDIV4SI:
36808 icode = CODE_FOR_avx512vl_scatterdiv4si;
36809 goto scatter_gen;
36810 case IX86_BUILTIN_SCATTERDIV4DI:
36811 icode = CODE_FOR_avx512vl_scatterdiv4di;
36812 goto scatter_gen;
36813 case IX86_BUILTIN_SCATTERDIV2DI:
36814 icode = CODE_FOR_avx512vl_scatterdiv2di;
36815 goto scatter_gen;
36816 case IX86_BUILTIN_GATHERPFDPD:
36817 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
36818 goto vec_prefetch_gen;
36819 case IX86_BUILTIN_SCATTERALTSIV8DF:
36820 icode = CODE_FOR_avx512f_scattersiv8df;
36821 goto scatter_gen;
36822 case IX86_BUILTIN_SCATTERALTDIV16SF:
36823 icode = CODE_FOR_avx512f_scatterdiv16sf;
36824 goto scatter_gen;
36825 case IX86_BUILTIN_SCATTERALTSIV8DI:
36826 icode = CODE_FOR_avx512f_scattersiv8di;
36827 goto scatter_gen;
36828 case IX86_BUILTIN_SCATTERALTDIV16SI:
36829 icode = CODE_FOR_avx512f_scatterdiv16si;
36830 goto scatter_gen;
36831 case IX86_BUILTIN_GATHERPFDPS:
36832 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
36833 goto vec_prefetch_gen;
36834 case IX86_BUILTIN_GATHERPFQPD:
36835 icode = CODE_FOR_avx512pf_gatherpfv8didf;
36836 goto vec_prefetch_gen;
36837 case IX86_BUILTIN_GATHERPFQPS:
36838 icode = CODE_FOR_avx512pf_gatherpfv8disf;
36839 goto vec_prefetch_gen;
36840 case IX86_BUILTIN_SCATTERPFDPD:
36841 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
36842 goto vec_prefetch_gen;
36843 case IX86_BUILTIN_SCATTERPFDPS:
36844 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
36845 goto vec_prefetch_gen;
36846 case IX86_BUILTIN_SCATTERPFQPD:
36847 icode = CODE_FOR_avx512pf_scatterpfv8didf;
36848 goto vec_prefetch_gen;
36849 case IX86_BUILTIN_SCATTERPFQPS:
36850 icode = CODE_FOR_avx512pf_scatterpfv8disf;
36851 goto vec_prefetch_gen;
36853 gather_gen:
36854 rtx half;
36855 rtx (*gen) (rtx, rtx);
36857 arg0 = CALL_EXPR_ARG (exp, 0);
36858 arg1 = CALL_EXPR_ARG (exp, 1);
36859 arg2 = CALL_EXPR_ARG (exp, 2);
36860 arg3 = CALL_EXPR_ARG (exp, 3);
36861 arg4 = CALL_EXPR_ARG (exp, 4);
36862 op0 = expand_normal (arg0);
36863 op1 = expand_normal (arg1);
36864 op2 = expand_normal (arg2);
36865 op3 = expand_normal (arg3);
36866 op4 = expand_normal (arg4);
36867 /* Note the arg order is different from the operand order. */
36868 mode0 = insn_data[icode].operand[1].mode;
36869 mode2 = insn_data[icode].operand[3].mode;
36870 mode3 = insn_data[icode].operand[4].mode;
36871 mode4 = insn_data[icode].operand[5].mode;
36873 if (target == NULL_RTX
36874 || GET_MODE (target) != insn_data[icode].operand[0].mode
36875 || !insn_data[icode].operand[0].predicate (target,
36876 GET_MODE (target)))
36877 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
36878 else
36879 subtarget = target;
36881 switch (fcode)
36883 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36884 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36885 half = gen_reg_rtx (V8SImode);
36886 if (!nonimmediate_operand (op2, V16SImode))
36887 op2 = copy_to_mode_reg (V16SImode, op2);
36888 emit_insn (gen_vec_extract_lo_v16si (half, op2));
36889 op2 = half;
36890 break;
36891 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36892 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36893 case IX86_BUILTIN_GATHERALTSIV4DF:
36894 case IX86_BUILTIN_GATHERALTSIV4DI:
36895 half = gen_reg_rtx (V4SImode);
36896 if (!nonimmediate_operand (op2, V8SImode))
36897 op2 = copy_to_mode_reg (V8SImode, op2);
36898 emit_insn (gen_vec_extract_lo_v8si (half, op2));
36899 op2 = half;
36900 break;
36901 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36902 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36903 half = gen_reg_rtx (mode0);
36904 if (mode0 == V8SFmode)
36905 gen = gen_vec_extract_lo_v16sf;
36906 else
36907 gen = gen_vec_extract_lo_v16si;
36908 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36909 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36910 emit_insn (gen (half, op0));
36911 op0 = half;
36912 if (GET_MODE (op3) != VOIDmode)
36914 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36915 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36916 emit_insn (gen (half, op3));
36917 op3 = half;
36919 break;
36920 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36921 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36922 case IX86_BUILTIN_GATHERALTDIV8SF:
36923 case IX86_BUILTIN_GATHERALTDIV8SI:
36924 half = gen_reg_rtx (mode0);
36925 if (mode0 == V4SFmode)
36926 gen = gen_vec_extract_lo_v8sf;
36927 else
36928 gen = gen_vec_extract_lo_v8si;
36929 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36930 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36931 emit_insn (gen (half, op0));
36932 op0 = half;
36933 if (GET_MODE (op3) != VOIDmode)
36935 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36936 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36937 emit_insn (gen (half, op3));
36938 op3 = half;
36940 break;
36941 default:
36942 break;
36945 /* Force the memory operand to use only a base register here; we
36946 do not want to do this for the memory operands of other builtin
36947 functions. */
36948 op1 = ix86_zero_extend_to_Pmode (op1);
36950 if (!insn_data[icode].operand[1].predicate (op0, mode0))
36951 op0 = copy_to_mode_reg (mode0, op0);
36952 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
36953 op1 = copy_to_mode_reg (Pmode, op1);
36954 if (!insn_data[icode].operand[3].predicate (op2, mode2))
36955 op2 = copy_to_mode_reg (mode2, op2);
36957 op3 = fixup_modeless_constant (op3, mode3);
36959 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
36961 if (!insn_data[icode].operand[4].predicate (op3, mode3))
36962 op3 = copy_to_mode_reg (mode3, op3);
36964 else
36966 op3 = copy_to_reg (op3);
36967 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
36969 if (!insn_data[icode].operand[5].predicate (op4, mode4))
36971 error ("the last argument must be scale 1, 2, 4, 8");
36972 return const0_rtx;
36975 /* Optimize. If the mask is known to have the high bit of every
36976 element set, replace op0 with pc_rtx to signal that the
36977 instruction overwrites the whole destination and does not use
36978 its previous contents. */
36979 if (optimize)
36981 if (TREE_CODE (arg3) == INTEGER_CST)
36983 if (integer_all_onesp (arg3))
36984 op0 = pc_rtx;
36986 else if (TREE_CODE (arg3) == VECTOR_CST)
36988 unsigned int negative = 0;
36989 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
36991 tree cst = VECTOR_CST_ELT (arg3, i);
36992 if (TREE_CODE (cst) == INTEGER_CST
36993 && tree_int_cst_sign_bit (cst))
36994 negative++;
36995 else if (TREE_CODE (cst) == REAL_CST
36996 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
36997 negative++;
36999 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
37000 op0 = pc_rtx;
37002 else if (TREE_CODE (arg3) == SSA_NAME
37003 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
37005 /* Also recognize when the mask is built like:
37006 __v2df src = _mm_setzero_pd ();
37007 __v2df mask = _mm_cmpeq_pd (src, src);
37008 or
37009 __v8sf src = _mm256_setzero_ps ();
37010 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
37011 as that is a cheaper way to load all ones into
37012 a register than having to load a constant from
37013 memory. */
37014 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
37015 if (is_gimple_call (def_stmt))
37017 tree fndecl = gimple_call_fndecl (def_stmt);
37018 if (fndecl
37019 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
37020 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
37022 case IX86_BUILTIN_CMPPD:
37023 case IX86_BUILTIN_CMPPS:
37024 case IX86_BUILTIN_CMPPD256:
37025 case IX86_BUILTIN_CMPPS256:
37026 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
37027 break;
37028 /* FALLTHRU */
37029 case IX86_BUILTIN_CMPEQPD:
37030 case IX86_BUILTIN_CMPEQPS:
37031 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
37032 && initializer_zerop (gimple_call_arg (def_stmt,
37033 1)))
37034 op0 = pc_rtx;
37035 break;
37036 default:
37037 break;
37043 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37044 if (! pat)
37045 return const0_rtx;
37046 emit_insn (pat);
37048 switch (fcode)
37050 case IX86_BUILTIN_GATHER3DIV16SF:
37051 if (target == NULL_RTX)
37052 target = gen_reg_rtx (V8SFmode);
37053 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37054 break;
37055 case IX86_BUILTIN_GATHER3DIV16SI:
37056 if (target == NULL_RTX)
37057 target = gen_reg_rtx (V8SImode);
37058 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37059 break;
37060 case IX86_BUILTIN_GATHER3DIV8SF:
37061 case IX86_BUILTIN_GATHERDIV8SF:
37062 if (target == NULL_RTX)
37063 target = gen_reg_rtx (V4SFmode);
37064 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37065 break;
37066 case IX86_BUILTIN_GATHER3DIV8SI:
37067 case IX86_BUILTIN_GATHERDIV8SI:
37068 if (target == NULL_RTX)
37069 target = gen_reg_rtx (V4SImode);
37070 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37071 break;
37072 default:
37073 target = subtarget;
37074 break;
37076 return target;
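/* Illustrative sketch of the mask optimization above, which recognizes the
   usual way of building an all-ones mask without a constant-pool load
   (assuming the AVX2 <immintrin.h> spellings, with base and idx of the
   appropriate pointer and index types):

     __m256d src = _mm256_setzero_pd ();
     __m256d mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
     __m256d v = _mm256_mask_i32gather_pd (src, base, idx, mask, 8);

   In that case op0 is replaced by pc_rtx and the gather is emitted as
   overwriting the whole destination.  */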
37078 scatter_gen:
37079 arg0 = CALL_EXPR_ARG (exp, 0);
37080 arg1 = CALL_EXPR_ARG (exp, 1);
37081 arg2 = CALL_EXPR_ARG (exp, 2);
37082 arg3 = CALL_EXPR_ARG (exp, 3);
37083 arg4 = CALL_EXPR_ARG (exp, 4);
37084 op0 = expand_normal (arg0);
37085 op1 = expand_normal (arg1);
37086 op2 = expand_normal (arg2);
37087 op3 = expand_normal (arg3);
37088 op4 = expand_normal (arg4);
37089 mode1 = insn_data[icode].operand[1].mode;
37090 mode2 = insn_data[icode].operand[2].mode;
37091 mode3 = insn_data[icode].operand[3].mode;
37092 mode4 = insn_data[icode].operand[4].mode;
37094 /* The scatter instruction stores operand op3 to memory with
37095 indices from op2 and scale from op4 under writemask op1.
37096 If index operand op2 has more elements than source operand
37097 op3, only its low half is used, and vice versa. */
37098 switch (fcode)
37100 case IX86_BUILTIN_SCATTERALTSIV8DF:
37101 case IX86_BUILTIN_SCATTERALTSIV8DI:
37102 half = gen_reg_rtx (V8SImode);
37103 if (!nonimmediate_operand (op2, V16SImode))
37104 op2 = copy_to_mode_reg (V16SImode, op2);
37105 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37106 op2 = half;
37107 break;
37108 case IX86_BUILTIN_SCATTERALTDIV16SF:
37109 case IX86_BUILTIN_SCATTERALTDIV16SI:
37110 half = gen_reg_rtx (mode3);
37111 if (mode3 == V8SFmode)
37112 gen = gen_vec_extract_lo_v16sf;
37113 else
37114 gen = gen_vec_extract_lo_v16si;
37115 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37116 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37117 emit_insn (gen (half, op3));
37118 op3 = half;
37119 break;
37120 default:
37121 break;
37124 /* Force the memory operand to use only a base register here; we
37125 do not want to do this for the memory operands of other builtin
37126 functions. */
37127 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37129 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37130 op0 = copy_to_mode_reg (Pmode, op0);
37132 op1 = fixup_modeless_constant (op1, mode1);
37134 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37136 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37137 op1 = copy_to_mode_reg (mode1, op1);
37139 else
37141 op1 = copy_to_reg (op1);
37142 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37145 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37146 op2 = copy_to_mode_reg (mode2, op2);
37148 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37149 op3 = copy_to_mode_reg (mode3, op3);
37151 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37153 error ("the last argument must be scale 1, 2, 4, 8");
37154 return const0_rtx;
37157 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37158 if (! pat)
37159 return const0_rtx;
37161 emit_insn (pat);
37162 return 0;
37164 vec_prefetch_gen:
37165 arg0 = CALL_EXPR_ARG (exp, 0);
37166 arg1 = CALL_EXPR_ARG (exp, 1);
37167 arg2 = CALL_EXPR_ARG (exp, 2);
37168 arg3 = CALL_EXPR_ARG (exp, 3);
37169 arg4 = CALL_EXPR_ARG (exp, 4);
37170 op0 = expand_normal (arg0);
37171 op1 = expand_normal (arg1);
37172 op2 = expand_normal (arg2);
37173 op3 = expand_normal (arg3);
37174 op4 = expand_normal (arg4);
37175 mode0 = insn_data[icode].operand[0].mode;
37176 mode1 = insn_data[icode].operand[1].mode;
37177 mode3 = insn_data[icode].operand[3].mode;
37178 mode4 = insn_data[icode].operand[4].mode;
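/* For the prefetch builtins, op0 is the mask, op1 the index vector, op2 the
   base address, op3 the scale immediate (1, 2, 4 or 8) and op4 the locality
   hint immediate, as checked below.  */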
37180 op0 = fixup_modeless_constant (op0, mode0);
37182 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37184 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37185 op0 = copy_to_mode_reg (mode0, op0);
37187 else
37189 op0 = copy_to_reg (op0);
37190 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37193 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37194 op1 = copy_to_mode_reg (mode1, op1);
37196 /* Force the memory operand to use only a base register here; we
37197 do not want to do this for the memory operands of other builtin
37198 functions. */
37199 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37201 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37202 op2 = copy_to_mode_reg (Pmode, op2);
37204 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37206 error ("the fourth argument must be scale 1, 2, 4, 8");
37207 return const0_rtx;
37210 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37212 error ("incorrect hint operand");
37213 return const0_rtx;
37216 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37217 if (! pat)
37218 return const0_rtx;
37220 emit_insn (pat);
37222 return 0;
37224 case IX86_BUILTIN_XABORT:
37225 icode = CODE_FOR_xabort;
37226 arg0 = CALL_EXPR_ARG (exp, 0);
37227 op0 = expand_normal (arg0);
37228 mode0 = insn_data[icode].operand[0].mode;
37229 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37231 error ("the argument to xabort must be an 8-bit immediate");
37232 return const0_rtx;
37234 emit_insn (gen_xabort (op0));
37235 return 0;
37237 case IX86_BUILTIN_RSTORSSP:
37238 case IX86_BUILTIN_CLRSSBSY:
37239 arg0 = CALL_EXPR_ARG (exp, 0);
37240 op0 = expand_normal (arg0);
37241 icode = (fcode == IX86_BUILTIN_RSTORSSP
37242 ? CODE_FOR_rstorssp
37243 : CODE_FOR_clrssbsy);
37244 if (!address_operand (op0, VOIDmode))
37246 op1 = convert_memory_address (Pmode, op0);
37247 op0 = copy_addr_to_reg (op1);
37249 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
37250 return 0;
37252 case IX86_BUILTIN_WRSSD:
37253 case IX86_BUILTIN_WRSSQ:
37254 case IX86_BUILTIN_WRUSSD:
37255 case IX86_BUILTIN_WRUSSQ:
37256 arg0 = CALL_EXPR_ARG (exp, 0);
37257 op0 = expand_normal (arg0);
37258 arg1 = CALL_EXPR_ARG (exp, 1);
37259 op1 = expand_normal (arg1);
37260 switch (fcode)
37262 case IX86_BUILTIN_WRSSD:
37263 icode = CODE_FOR_wrsssi;
37264 mode = SImode;
37265 break;
37266 case IX86_BUILTIN_WRSSQ:
37267 icode = CODE_FOR_wrssdi;
37268 mode = DImode;
37269 break;
37270 case IX86_BUILTIN_WRUSSD:
37271 icode = CODE_FOR_wrusssi;
37272 mode = SImode;
37273 break;
37274 case IX86_BUILTIN_WRUSSQ:
37275 icode = CODE_FOR_wrussdi;
37276 mode = DImode;
37277 break;
37279 op0 = force_reg (mode, op0);
37280 if (!address_operand (op1, VOIDmode))
37282 op2 = convert_memory_address (Pmode, op1);
37283 op1 = copy_addr_to_reg (op2);
37285 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
37286 return 0;
37288 default:
37289 break;
37292 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
37293 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
37295 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
37296 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
37297 target);
37300 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
37301 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
37303 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
37304 switch (fcode)
37306 case IX86_BUILTIN_FABSQ:
37307 case IX86_BUILTIN_COPYSIGNQ:
37308 if (!TARGET_SSE)
37309 /* Emit a normal call if SSE isn't available. */
37310 return expand_call (exp, target, ignore);
37311 /* FALLTHRU */
37312 default:
37313 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
37317 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
37318 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
37320 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
37321 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
37322 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
37323 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
37324 int masked = 1;
37325 machine_mode mode, wide_mode, nar_mode;
37327 nar_mode = V4SFmode;
37328 mode = V16SFmode;
37329 wide_mode = V64SFmode;
37330 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
37331 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
37333 switch (fcode)
37335 case IX86_BUILTIN_4FMAPS:
37336 fcn = gen_avx5124fmaddps_4fmaddps;
37337 masked = 0;
37338 goto v4fma_expand;
37340 case IX86_BUILTIN_4DPWSSD:
37341 nar_mode = V4SImode;
37342 mode = V16SImode;
37343 wide_mode = V64SImode;
37344 fcn = gen_avx5124vnniw_vp4dpwssd;
37345 masked = 0;
37346 goto v4fma_expand;
37348 case IX86_BUILTIN_4DPWSSDS:
37349 nar_mode = V4SImode;
37350 mode = V16SImode;
37351 wide_mode = V64SImode;
37352 fcn = gen_avx5124vnniw_vp4dpwssds;
37353 masked = 0;
37354 goto v4fma_expand;
37356 case IX86_BUILTIN_4FNMAPS:
37357 fcn = gen_avx5124fmaddps_4fnmaddps;
37358 masked = 0;
37359 goto v4fma_expand;
37361 case IX86_BUILTIN_4FNMAPS_MASK:
37362 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
37363 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
37364 goto v4fma_expand;
37366 case IX86_BUILTIN_4DPWSSD_MASK:
37367 nar_mode = V4SImode;
37368 mode = V16SImode;
37369 wide_mode = V64SImode;
37370 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
37371 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
37372 goto v4fma_expand;
37374 case IX86_BUILTIN_4DPWSSDS_MASK:
37375 nar_mode = V4SImode;
37376 mode = V16SImode;
37377 wide_mode = V64SImode;
37378 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
37379 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
37380 goto v4fma_expand;
37382 case IX86_BUILTIN_4FMAPS_MASK:
37384 tree args[4];
37385 rtx ops[4];
37386 rtx wide_reg;
37387 rtx accum;
37388 rtx addr;
37389 rtx mem;
37391 v4fma_expand:
37392 wide_reg = gen_reg_rtx (wide_mode);
37393 for (i = 0; i < 4; i++)
37395 args[i] = CALL_EXPR_ARG (exp, i);
37396 ops[i] = expand_normal (args[i]);
37398 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
37399 ops[i]);
37402 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37403 accum = force_reg (mode, accum);
37405 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37406 addr = force_reg (Pmode, addr);
37408 mem = gen_rtx_MEM (nar_mode, addr);
37410 target = gen_reg_rtx (mode);
37412 emit_move_insn (target, accum);
37414 if (! masked)
37415 emit_insn (fcn (target, accum, wide_reg, mem));
37416 else
37418 rtx merge, mask;
37419 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37421 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37423 if (CONST_INT_P (mask))
37424 mask = fixup_modeless_constant (mask, HImode);
37426 mask = force_reg (HImode, mask);
37428 if (GET_MODE (mask) != HImode)
37429 mask = gen_rtx_SUBREG (HImode, mask, 0);
37431 /* If merge is 0 then we're about to emit z-masked variant. */
37432 if (const0_operand (merge, mode))
37433 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37434 /* If merge is the same as accum then emit merge-masked variant. */
37435 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37437 merge = force_reg (mode, merge);
37438 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37440 /* Merging with something unknown can happen if we z-mask with -O0. */
37441 else
37443 target = gen_reg_rtx (mode);
37444 emit_move_insn (target, merge);
37445 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37448 return target;
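/* A note on the expansion above: the loop packs the four 512-bit source
   operands into a single V64SF/V64SI pseudo by storing each one at a
   64-byte SUBREG offset, and the masked paths pick the z-masked or
   merge-masked pattern depending on whether the merge operand is zero or
   identical to the accumulator, as the inline comments describe.  */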
37451 case IX86_BUILTIN_4FNMASS:
37452 fcn = gen_avx5124fmaddps_4fnmaddss;
37453 masked = 0;
37454 goto s4fma_expand;
37456 case IX86_BUILTIN_4FMASS:
37457 fcn = gen_avx5124fmaddps_4fmaddss;
37458 masked = 0;
37459 goto s4fma_expand;
37461 case IX86_BUILTIN_4FNMASS_MASK:
37462 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
37463 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
37464 goto s4fma_expand;
37466 case IX86_BUILTIN_4FMASS_MASK:
37468 tree args[4];
37469 rtx ops[4];
37470 rtx wide_reg;
37471 rtx accum;
37472 rtx addr;
37473 rtx mem;
37475 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
37476 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
37478 s4fma_expand:
37479 mode = V4SFmode;
37480 wide_reg = gen_reg_rtx (V64SFmode);
37481 for (i = 0; i < 4; i++)
37483 rtx tmp;
37484 args[i] = CALL_EXPR_ARG (exp, i);
37485 ops[i] = expand_normal (args[i]);
37487 tmp = gen_reg_rtx (SFmode);
37488 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
37490 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
37491 gen_rtx_SUBREG (V16SFmode, tmp, 0));
37494 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37495 accum = force_reg (V4SFmode, accum);
37497 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37498 addr = force_reg (Pmode, addr);
37500 mem = gen_rtx_MEM (V4SFmode, addr);
37502 target = gen_reg_rtx (V4SFmode);
37504 emit_move_insn (target, accum);
37506 if (! masked)
37507 emit_insn (fcn (target, accum, wide_reg, mem));
37508 else
37510 rtx merge, mask;
37511 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37513 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37515 if (CONST_INT_P (mask))
37516 mask = fixup_modeless_constant (mask, QImode);
37518 mask = force_reg (QImode, mask);
37520 if (GET_MODE (mask) != QImode)
37521 mask = gen_rtx_SUBREG (QImode, mask, 0);
37523 /* If merge is 0 then we're about to emit z-masked variant. */
37524 if (const0_operand (merge, mode))
37525 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37526 /* If merge is the same as accum then emit merge-masked
37527 variant. */
37528 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37530 merge = force_reg (mode, merge);
37531 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37533 /* Merging with something unknown can happen if we z-mask
37534 with -O0. */
37535 else
37537 target = gen_reg_rtx (mode);
37538 emit_move_insn (target, merge);
37539 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37542 return target;
37544 case IX86_BUILTIN_RDPID:
37545 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
37546 target);
37547 default:
37548 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
37552 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37553 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37555 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37556 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37559 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37560 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37562 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37563 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37566 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37567 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37569 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37570 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37573 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37574 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37576 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37577 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37580 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37581 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37583 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37584 const struct builtin_description *d = bdesc_multi_arg + i;
37585 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37586 (enum ix86_builtin_func_type)
37587 d->flag, d->comparison);
37590 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
37591 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
37593 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
37594 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
37595 target);
37598 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
37599 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
37601 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
37602 return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
37603 target);
37606 gcc_unreachable ();
37609 /* This returns the target-specific builtin with code CODE if
37610 current_function_decl has visibility on this builtin, which is checked
37611 using isa flags. Returns NULL_TREE otherwise. */
37613 static tree ix86_get_builtin (enum ix86_builtins code)
37615 struct cl_target_option *opts;
37616 tree target_tree = NULL_TREE;
37618 /* Determine the isa flags of current_function_decl. */
37620 if (current_function_decl)
37621 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37623 if (target_tree == NULL)
37624 target_tree = target_option_default_node;
37626 opts = TREE_TARGET_OPTION (target_tree);
37628 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37629 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
37630 return ix86_builtin_decl (code, true);
37631 else
37632 return NULL_TREE;
37635 /* Return the function decl for the target-specific builtin
37636 corresponding to the MPX builtin passed in FCODE. */
37637 static tree
37638 ix86_builtin_mpx_function (unsigned fcode)
37640 switch (fcode)
37642 case BUILT_IN_CHKP_BNDMK:
37643 return ix86_builtins[IX86_BUILTIN_BNDMK];
37645 case BUILT_IN_CHKP_BNDSTX:
37646 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37648 case BUILT_IN_CHKP_BNDLDX:
37649 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37651 case BUILT_IN_CHKP_BNDCL:
37652 return ix86_builtins[IX86_BUILTIN_BNDCL];
37654 case BUILT_IN_CHKP_BNDCU:
37655 return ix86_builtins[IX86_BUILTIN_BNDCU];
37657 case BUILT_IN_CHKP_BNDRET:
37658 return ix86_builtins[IX86_BUILTIN_BNDRET];
37660 case BUILT_IN_CHKP_INTERSECT:
37661 return ix86_builtins[IX86_BUILTIN_BNDINT];
37663 case BUILT_IN_CHKP_NARROW:
37664 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37666 case BUILT_IN_CHKP_SIZEOF:
37667 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37669 case BUILT_IN_CHKP_EXTRACT_LOWER:
37670 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37672 case BUILT_IN_CHKP_EXTRACT_UPPER:
37673 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37675 default:
37676 return NULL_TREE;
37679 gcc_unreachable ();
37682 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37684 Return an address to be used to load/store bounds for the pointer
37685 passed in SLOT.
37687 SLOT_NO is an integer constant holding the number of a
37688 target-dependent special slot to be used in case SLOT is not a memory.
37690 SPECIAL_BASE is a pointer to be used as the base of a fake address
37691 for accessing special slots in the Bounds Table. SPECIAL_BASE[-1],
37692 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37694 static rtx
37695 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
37697 rtx addr = NULL;
37699 /* A NULL slot means we pass bounds for a pointer that is not passed
37700 to the function at all. A register slot means we pass the pointer
37701 in a register. In both these cases bounds are passed via the
37702 Bounds Table. Since we do not have an actual pointer stored in
37703 memory, we have to use fake addresses to access the Bounds Table.
37704 We start with (special_base - sizeof (void *)) and decrease this
37705 address by the pointer size to get addresses for the other slots. */
37706 if (!slot || REG_P (slot))
37708 gcc_assert (CONST_INT_P (slot_no));
37709 addr = plus_constant (Pmode, special_base,
37710 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
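/* For example, with 64-bit pointers slot_no 0 yields special_base - 8 and
   slot_no 1 yields special_base - 16, matching the SPECIAL_BASE[-1],
   SPECIAL_BASE[-2] scheme described above.  */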
37712 /* If the pointer is passed in memory then its address is used to
37713 access the Bounds Table. */
37714 else if (MEM_P (slot))
37716 addr = XEXP (slot, 0);
37717 if (!register_operand (addr, Pmode))
37718 addr = copy_addr_to_reg (addr);
37720 else
37721 gcc_unreachable ();
37723 return addr;
37726 /* The expand pass uses this hook to load bounds for function parameter
37727 PTR passed in SLOT in case its bounds are not passed in a register.
37729 If SLOT is a memory, then bounds are loaded as for a regular pointer
37730 loaded from memory. PTR may be NULL in case SLOT is a memory.
37731 In such a case the value of PTR (if required) may be loaded from SLOT.
37733 If SLOT is NULL or a register then SLOT_NO is an integer constant
37734 holding the number of the target-dependent special slot which should
37735 be used to obtain the bounds.
37737 Return the loaded bounds. */
37739 static rtx
37740 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
37742 rtx reg = gen_reg_rtx (BNDmode);
37743 rtx addr;
37745 /* Get the address to be used to access the Bounds Table. Special slots
37746 start at the location of the return address of the current function. */
37747 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
37749 /* Load pointer value from a memory if we don't have it. */
37750 if (!ptr)
37752 gcc_assert (MEM_P (slot));
37753 ptr = copy_addr_to_reg (slot);
37756 if (!register_operand (ptr, Pmode))
37757 ptr = ix86_zero_extend_to_Pmode (ptr);
37759 emit_insn (BNDmode == BND64mode
37760 ? gen_bnd64_ldx (reg, addr, ptr)
37761 : gen_bnd32_ldx (reg, addr, ptr));
37763 return reg;
37766 /* The expand pass uses this hook to store BOUNDS for call argument PTR
37767 passed in SLOT in case BOUNDS are not passed in a register.
37769 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
37770 stored in memory. PTR may be NULL in case SLOT is a memory.
37771 In such a case the value of PTR (if required) may be loaded from SLOT.
37773 If SLOT is NULL or a register then SLOT_NO is an integer constant
37774 holding the number of the target-dependent special slot which should
37775 be used to store BOUNDS. */
37777 static void
37778 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
37780 rtx addr;
37782 /* Get the address to be used to access the Bounds Table. Special slots
37783 start at the location of the return address of a called function. */
37784 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
37786 /* Load pointer value from a memory if we don't have it. */
37787 if (!ptr)
37789 gcc_assert (MEM_P (slot));
37790 ptr = copy_addr_to_reg (slot);
37793 if (!register_operand (ptr, Pmode))
37794 ptr = ix86_zero_extend_to_Pmode (ptr);
37796 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
37797 if (!register_operand (bounds, BNDmode))
37798 bounds = copy_to_mode_reg (BNDmode, bounds);
37800 emit_insn (BNDmode == BND64mode
37801 ? gen_bnd64_stx (addr, ptr, bounds)
37802 : gen_bnd32_stx (addr, ptr, bounds));
37805 /* Load and return bounds returned by function in SLOT. */
37807 static rtx
37808 ix86_load_returned_bounds (rtx slot)
37810 rtx res;
37812 gcc_assert (REG_P (slot));
37813 res = gen_reg_rtx (BNDmode);
37814 emit_move_insn (res, slot);
37816 return res;
37819 /* Store BOUNDS returned by function into SLOT. */
37821 static void
37822 ix86_store_returned_bounds (rtx slot, rtx bounds)
37824 gcc_assert (REG_P (slot));
37825 emit_move_insn (slot, bounds);
37828 /* Returns a function decl for a vectorized version of the combined function
37829 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
37830 if it is not available. */
37832 static tree
37833 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
37834 tree type_in)
37836 machine_mode in_mode, out_mode;
37837 int in_n, out_n;
37839 if (TREE_CODE (type_out) != VECTOR_TYPE
37840 || TREE_CODE (type_in) != VECTOR_TYPE)
37841 return NULL_TREE;
37843 out_mode = TYPE_MODE (TREE_TYPE (type_out));
37844 out_n = TYPE_VECTOR_SUBPARTS (type_out);
37845 in_mode = TYPE_MODE (TREE_TYPE (type_in));
37846 in_n = TYPE_VECTOR_SUBPARTS (type_in);
37848 switch (fn)
37850 CASE_CFN_EXP2:
37851 if (out_mode == SFmode && in_mode == SFmode)
37853 if (out_n == 16 && in_n == 16)
37854 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
37856 break;
37858 CASE_CFN_IFLOOR:
37859 CASE_CFN_LFLOOR:
37860 CASE_CFN_LLFLOOR:
37861 /* The round insn does not trap on denormals. */
37862 if (flag_trapping_math || !TARGET_SSE4_1)
37863 break;
37865 if (out_mode == SImode && in_mode == DFmode)
37867 if (out_n == 4 && in_n == 2)
37868 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
37869 else if (out_n == 8 && in_n == 4)
37870 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
37871 else if (out_n == 16 && in_n == 8)
37872 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
37874 if (out_mode == SImode && in_mode == SFmode)
37876 if (out_n == 4 && in_n == 4)
37877 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
37878 else if (out_n == 8 && in_n == 8)
37879 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
37880 else if (out_n == 16 && in_n == 16)
37881 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
37883 break;
37885 CASE_CFN_ICEIL:
37886 CASE_CFN_LCEIL:
37887 CASE_CFN_LLCEIL:
37888 /* The round insn does not trap on denormals. */
37889 if (flag_trapping_math || !TARGET_SSE4_1)
37890 break;
37892 if (out_mode == SImode && in_mode == DFmode)
37894 if (out_n == 4 && in_n == 2)
37895 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
37896 else if (out_n == 8 && in_n == 4)
37897 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
37898 else if (out_n == 16 && in_n == 8)
37899 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
37901 if (out_mode == SImode && in_mode == SFmode)
37903 if (out_n == 4 && in_n == 4)
37904 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
37905 else if (out_n == 8 && in_n == 8)
37906 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
37907 else if (out_n == 16 && in_n == 16)
37908 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
37910 break;
37912 CASE_CFN_IRINT:
37913 CASE_CFN_LRINT:
37914 CASE_CFN_LLRINT:
37915 if (out_mode == SImode && in_mode == DFmode)
37917 if (out_n == 4 && in_n == 2)
37918 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
37919 else if (out_n == 8 && in_n == 4)
37920 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
37921 else if (out_n == 16 && in_n == 8)
37922 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
37924 if (out_mode == SImode && in_mode == SFmode)
37926 if (out_n == 4 && in_n == 4)
37927 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
37928 else if (out_n == 8 && in_n == 8)
37929 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
37930 else if (out_n == 16 && in_n == 16)
37931 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
37933 break;
37935 CASE_CFN_IROUND:
37936 CASE_CFN_LROUND:
37937 CASE_CFN_LLROUND:
37938 /* The round insn does not trap on denormals. */
37939 if (flag_trapping_math || !TARGET_SSE4_1)
37940 break;
37942 if (out_mode == SImode && in_mode == DFmode)
37944 if (out_n == 4 && in_n == 2)
37945 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
37946 else if (out_n == 8 && in_n == 4)
37947 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
37948 else if (out_n == 16 && in_n == 8)
37949 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
37951 if (out_mode == SImode && in_mode == SFmode)
37953 if (out_n == 4 && in_n == 4)
37954 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
37955 else if (out_n == 8 && in_n == 8)
37956 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
37957 else if (out_n == 16 && in_n == 16)
37958 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
37960 break;
37962 CASE_CFN_FLOOR:
37963 /* The round insn does not trap on denormals. */
37964 if (flag_trapping_math || !TARGET_SSE4_1)
37965 break;
37967 if (out_mode == DFmode && in_mode == DFmode)
37969 if (out_n == 2 && in_n == 2)
37970 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
37971 else if (out_n == 4 && in_n == 4)
37972 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
37973 else if (out_n == 8 && in_n == 8)
37974 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
37976 if (out_mode == SFmode && in_mode == SFmode)
37978 if (out_n == 4 && in_n == 4)
37979 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
37980 else if (out_n == 8 && in_n == 8)
37981 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
37982 else if (out_n == 16 && in_n == 16)
37983 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
37985 break;
37987 CASE_CFN_CEIL:
37988 /* The round insn does not trap on denormals. */
37989 if (flag_trapping_math || !TARGET_SSE4_1)
37990 break;
37992 if (out_mode == DFmode && in_mode == DFmode)
37994 if (out_n == 2 && in_n == 2)
37995 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
37996 else if (out_n == 4 && in_n == 4)
37997 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
37998 else if (out_n == 8 && in_n == 8)
37999 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
38001 if (out_mode == SFmode && in_mode == SFmode)
38003 if (out_n == 4 && in_n == 4)
38004 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
38005 else if (out_n == 8 && in_n == 8)
38006 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
38007 else if (out_n == 16 && in_n == 16)
38008 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
38010 break;
38012 CASE_CFN_TRUNC:
38013 /* The round insn does not trap on denormals. */
38014 if (flag_trapping_math || !TARGET_SSE4_1)
38015 break;
38017 if (out_mode == DFmode && in_mode == DFmode)
38019 if (out_n == 2 && in_n == 2)
38020 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
38021 else if (out_n == 4 && in_n == 4)
38022 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
38023 else if (out_n == 8 && in_n == 8)
38024 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
38026 if (out_mode == SFmode && in_mode == SFmode)
38028 if (out_n == 4 && in_n == 4)
38029 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
38030 else if (out_n == 8 && in_n == 8)
38031 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
38032 else if (out_n == 16 && in_n == 16)
38033 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
38035 break;
38037 CASE_CFN_RINT:
38038 /* The round insn does not trap on denormals. */
38039 if (flag_trapping_math || !TARGET_SSE4_1)
38040 break;
38042 if (out_mode == DFmode && in_mode == DFmode)
38044 if (out_n == 2 && in_n == 2)
38045 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38046 else if (out_n == 4 && in_n == 4)
38047 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38049 if (out_mode == SFmode && in_mode == SFmode)
38051 if (out_n == 4 && in_n == 4)
38052 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38053 else if (out_n == 8 && in_n == 8)
38054 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38056 break;
38058 CASE_CFN_FMA:
38059 if (out_mode == DFmode && in_mode == DFmode)
38061 if (out_n == 2 && in_n == 2)
38062 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38063 if (out_n == 4 && in_n == 4)
38064 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38066 if (out_mode == SFmode && in_mode == SFmode)
38068 if (out_n == 4 && in_n == 4)
38069 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38070 if (out_n == 8 && in_n == 8)
38071 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38073 break;
38075 default:
38076 break;
38079 /* Dispatch to a handler for a vectorization library. */
38080 if (ix86_veclib_handler)
38081 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38083 return NULL_TREE;
38086 /* Handler for an SVML-style interface to
38087 a library with vectorized intrinsics. */
38089 static tree
38090 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38092 char name[20];
38093 tree fntype, new_fndecl, args;
38094 unsigned arity;
38095 const char *bname;
38096 machine_mode el_mode, in_mode;
38097 int n, in_n;
38099 /* The SVML is suitable for unsafe math only. */
38100 if (!flag_unsafe_math_optimizations)
38101 return NULL_TREE;
38103 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38104 n = TYPE_VECTOR_SUBPARTS (type_out);
38105 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38106 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38107 if (el_mode != in_mode
38108 || n != in_n)
38109 return NULL_TREE;
38111 switch (fn)
38113 CASE_CFN_EXP:
38114 CASE_CFN_LOG:
38115 CASE_CFN_LOG10:
38116 CASE_CFN_POW:
38117 CASE_CFN_TANH:
38118 CASE_CFN_TAN:
38119 CASE_CFN_ATAN:
38120 CASE_CFN_ATAN2:
38121 CASE_CFN_ATANH:
38122 CASE_CFN_CBRT:
38123 CASE_CFN_SINH:
38124 CASE_CFN_SIN:
38125 CASE_CFN_ASINH:
38126 CASE_CFN_ASIN:
38127 CASE_CFN_COSH:
38128 CASE_CFN_COS:
38129 CASE_CFN_ACOSH:
38130 CASE_CFN_ACOS:
38131 if ((el_mode != DFmode || n != 2)
38132 && (el_mode != SFmode || n != 4))
38133 return NULL_TREE;
38134 break;
38136 default:
38137 return NULL_TREE;
38140 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38141 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38143 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38144 strcpy (name, "vmlsLn4");
38145 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38146 strcpy (name, "vmldLn2");
38147 else if (n == 4)
38149 sprintf (name, "vmls%s", bname+10);
38150 name[strlen (name)-1] = '4';
38152 else
38153 sprintf (name, "vmld%s2", bname+10);
38155 /* Convert to uppercase. */
38156 name[4] &= ~0x20;
38158 arity = 0;
38159 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38160 arity++;
38162 if (arity == 1)
38163 fntype = build_function_type_list (type_out, type_in, NULL);
38164 else
38165 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38167 /* Build a function declaration for the vectorized function. */
38168 new_fndecl = build_decl (BUILTINS_LOCATION,
38169 FUNCTION_DECL, get_identifier (name), fntype);
38170 TREE_PUBLIC (new_fndecl) = 1;
38171 DECL_EXTERNAL (new_fndecl) = 1;
38172 DECL_IS_NOVOPS (new_fndecl) = 1;
38173 TREE_READONLY (new_fndecl) = 1;
38175 return new_fndecl;
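/* For illustration (derived from the mangling code above, not an
   exhaustive list): sinf vectorized over V4SFmode yields "vmlsSin4",
   sin over V2DFmode yields "vmldSin2", while the special-cased log
   calls yield "vmlsLn4" and "vmldLn2".  */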
38178 /* Handler for an ACML-style interface to
38179 a library with vectorized intrinsics. */
38181 static tree
38182 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38184 char name[20] = "__vr.._";
38185 tree fntype, new_fndecl, args;
38186 unsigned arity;
38187 const char *bname;
38188 machine_mode el_mode, in_mode;
38189 int n, in_n;
38191 /* The ACML is 64-bit only and suitable for unsafe math only, as
38192 it does not correctly support parts of IEEE with the required
38193 precision such as denormals. */
38194 if (!TARGET_64BIT
38195 || !flag_unsafe_math_optimizations)
38196 return NULL_TREE;
38198 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38199 n = TYPE_VECTOR_SUBPARTS (type_out);
38200 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38201 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38202 if (el_mode != in_mode
38203 || n != in_n)
38204 return NULL_TREE;
38206 switch (fn)
38208 CASE_CFN_SIN:
38209 CASE_CFN_COS:
38210 CASE_CFN_EXP:
38211 CASE_CFN_LOG:
38212 CASE_CFN_LOG2:
38213 CASE_CFN_LOG10:
38214 if (el_mode == DFmode && n == 2)
38216 name[4] = 'd';
38217 name[5] = '2';
38219 else if (el_mode == SFmode && n == 4)
38221 name[4] = 's';
38222 name[5] = '4';
38224 else
38225 return NULL_TREE;
38226 break;
38228 default:
38229 return NULL_TREE;
38232 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38233 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38234 sprintf (name + 7, "%s", bname+10);
38236 arity = 0;
38237 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38238 arity++;
38240 if (arity == 1)
38241 fntype = build_function_type_list (type_out, type_in, NULL);
38242 else
38243 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38245 /* Build a function declaration for the vectorized function. */
38246 new_fndecl = build_decl (BUILTINS_LOCATION,
38247 FUNCTION_DECL, get_identifier (name), fntype);
38248 TREE_PUBLIC (new_fndecl) = 1;
38249 DECL_EXTERNAL (new_fndecl) = 1;
38250 DECL_IS_NOVOPS (new_fndecl) = 1;
38251 TREE_READONLY (new_fndecl) = 1;
38253 return new_fndecl;
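/* For illustration, the "__vr.._" template above expands to names
   such as "__vrd2_sin" for sin over V2DFmode and "__vrs4_sinf" for
   sinf over V4SFmode; the bname+10 offset skips the "__builtin_"
   prefix of the scalar declaration's name.  */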
38256 /* Returns a decl of a function that implements gather load with
38257 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
38258 Return NULL_TREE if it is not available. */
38260 static tree
38261 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38262 const_tree index_type, int scale)
38264 bool si;
38265 enum ix86_builtins code;
38267 if (! TARGET_AVX2 || !TARGET_USE_GATHER)
38268 return NULL_TREE;
38270 if ((TREE_CODE (index_type) != INTEGER_TYPE
38271 && !POINTER_TYPE_P (index_type))
38272 || (TYPE_MODE (index_type) != SImode
38273 && TYPE_MODE (index_type) != DImode))
38274 return NULL_TREE;
38276 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38277 return NULL_TREE;
38279 /* v*gather* insn sign extends index to pointer mode. */
38280 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38281 && TYPE_UNSIGNED (index_type))
38282 return NULL_TREE;
38284 if (scale <= 0
38285 || scale > 8
38286 || (scale & (scale - 1)) != 0)
38287 return NULL_TREE;
38289 si = TYPE_MODE (index_type) == SImode;
38290 switch (TYPE_MODE (mem_vectype))
38292 case E_V2DFmode:
38293 if (TARGET_AVX512VL)
38294 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38295 else
38296 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38297 break;
38298 case E_V4DFmode:
38299 if (TARGET_AVX512VL)
38300 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38301 else
38302 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38303 break;
38304 case E_V2DImode:
38305 if (TARGET_AVX512VL)
38306 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38307 else
38308 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38309 break;
38310 case E_V4DImode:
38311 if (TARGET_AVX512VL)
38312 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38313 else
38314 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38315 break;
38316 case E_V4SFmode:
38317 if (TARGET_AVX512VL)
38318 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38319 else
38320 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38321 break;
38322 case E_V8SFmode:
38323 if (TARGET_AVX512VL)
38324 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38325 else
38326 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38327 break;
38328 case E_V4SImode:
38329 if (TARGET_AVX512VL)
38330 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38331 else
38332 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38333 break;
38334 case E_V8SImode:
38335 if (TARGET_AVX512VL)
38336 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38337 else
38338 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38339 break;
38340 case E_V8DFmode:
38341 if (TARGET_AVX512F)
38342 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38343 else
38344 return NULL_TREE;
38345 break;
38346 case E_V8DImode:
38347 if (TARGET_AVX512F)
38348 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38349 else
38350 return NULL_TREE;
38351 break;
38352 case E_V16SFmode:
38353 if (TARGET_AVX512F)
38354 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38355 else
38356 return NULL_TREE;
38357 break;
38358 case E_V16SImode:
38359 if (TARGET_AVX512F)
38360 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38361 else
38362 return NULL_TREE;
38363 break;
38364 default:
38365 return NULL_TREE;
38368 return ix86_get_builtin (code);
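/* For illustration, a gather of a V2DFmode vector with a 32-bit
   (SImode) index selects IX86_BUILTIN_GATHERSIV2DF above (or the
   GATHER3 variant when AVX512VL is enabled), while a 64-bit (DImode)
   index selects IX86_BUILTIN_GATHERDIV2DF.  */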
38371 /* Returns a decl of a function that implements scatter store with
38372 register type VECTYPE and index type INDEX_TYPE and SCALE.
38373 Return NULL_TREE if it is not available. */
38375 static tree
38376 ix86_vectorize_builtin_scatter (const_tree vectype,
38377 const_tree index_type, int scale)
38379 bool si;
38380 enum ix86_builtins code;
38382 if (!TARGET_AVX512F)
38383 return NULL_TREE;
38385 if ((TREE_CODE (index_type) != INTEGER_TYPE
38386 && !POINTER_TYPE_P (index_type))
38387 || (TYPE_MODE (index_type) != SImode
38388 && TYPE_MODE (index_type) != DImode))
38389 return NULL_TREE;
38391 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38392 return NULL_TREE;
38394 /* v*scatter* insn sign extends index to pointer mode. */
38395 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38396 && TYPE_UNSIGNED (index_type))
38397 return NULL_TREE;
38399 /* Scale can be 1, 2, 4 or 8. */
38400 if (scale <= 0
38401 || scale > 8
38402 || (scale & (scale - 1)) != 0)
38403 return NULL_TREE;
38405 si = TYPE_MODE (index_type) == SImode;
38406 switch (TYPE_MODE (vectype))
38408 case E_V8DFmode:
38409 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38410 break;
38411 case E_V8DImode:
38412 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38413 break;
38414 case E_V16SFmode:
38415 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38416 break;
38417 case E_V16SImode:
38418 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38419 break;
38420 default:
38421 return NULL_TREE;
38424 return ix86_builtins[code];
38427 /* Return true if it is safe to use the rsqrt optabs to optimize
38428 1.0/sqrt. */
38430 static bool
38431 use_rsqrt_p ()
38433 return (TARGET_SSE_MATH
38434 && flag_finite_math_only
38435 && !flag_trapping_math
38436 && flag_unsafe_math_optimizations);
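/* All four conditions above are implied by -ffast-math; when they
   hold, 1.0/sqrt(x) may be expanded with the hardware reciprocal
   square root approximation (rsqrtss/rsqrtps) plus a Newton-Raphson
   refinement step instead of a full-precision sqrt and divide.  */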
38439 /* Returns a code for a target-specific builtin that implements
38440 reciprocal of the function, or NULL_TREE if not available. */
38442 static tree
38443 ix86_builtin_reciprocal (tree fndecl)
38445 switch (DECL_FUNCTION_CODE (fndecl))
38447 /* Vectorized version of sqrt to rsqrt conversion. */
38448 case IX86_BUILTIN_SQRTPS_NR:
38449 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38451 case IX86_BUILTIN_SQRTPS_NR256:
38452 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38454 default:
38455 return NULL_TREE;
38459 /* Helper for avx_vpermilps256_operand et al. This is also used by
38460 the expansion functions to turn the parallel back into a mask.
38461 The return value is 0 for no match and the imm8+1 for a match. */
38464 avx_vpermilp_parallel (rtx par, machine_mode mode)
38466 unsigned i, nelt = GET_MODE_NUNITS (mode);
38467 unsigned mask = 0;
38468 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38470 if (XVECLEN (par, 0) != (int) nelt)
38471 return 0;
38473 /* Validate that all of the elements are constants, and not totally
38474 out of range. Copy the data into an integral array to make the
38475 subsequent checks easier. */
38476 for (i = 0; i < nelt; ++i)
38478 rtx er = XVECEXP (par, 0, i);
38479 unsigned HOST_WIDE_INT ei;
38481 if (!CONST_INT_P (er))
38482 return 0;
38483 ei = INTVAL (er);
38484 if (ei >= nelt)
38485 return 0;
38486 ipar[i] = ei;
38489 switch (mode)
38491 case E_V8DFmode:
38492 /* In the 512-bit DFmode case, we can only move elements within
38493 a 128-bit lane. First fill the second part of the mask,
38494 then fallthru. */
38495 for (i = 4; i < 6; ++i)
38497 if (ipar[i] < 4 || ipar[i] >= 6)
38498 return 0;
38499 mask |= (ipar[i] - 4) << i;
38501 for (i = 6; i < 8; ++i)
38503 if (ipar[i] < 6)
38504 return 0;
38505 mask |= (ipar[i] - 6) << i;
38507 /* FALLTHRU */
38509 case E_V4DFmode:
38510 /* In the 256-bit DFmode case, we can only move elements within
38511 a 128-bit lane. */
38512 for (i = 0; i < 2; ++i)
38514 if (ipar[i] >= 2)
38515 return 0;
38516 mask |= ipar[i] << i;
38518 for (i = 2; i < 4; ++i)
38520 if (ipar[i] < 2)
38521 return 0;
38522 mask |= (ipar[i] - 2) << i;
38524 break;
38526 case E_V16SFmode:
38527 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
38528 must mirror the permutation in the lower 256 bits. */
38529 for (i = 0; i < 8; ++i)
38530 if (ipar[i] + 8 != ipar[i + 8])
38531 return 0;
38532 /* FALLTHRU */
38534 case E_V8SFmode:
38535 /* In the 256-bit SFmode case, we have full freedom of
38536 movement within the low 128-bit lane, but the high 128-bit
38537 lane must mirror the exact same pattern. */
38538 for (i = 0; i < 4; ++i)
38539 if (ipar[i] + 4 != ipar[i + 4])
38540 return 0;
38541 nelt = 4;
38542 /* FALLTHRU */
38544 case E_V2DFmode:
38545 case E_V4SFmode:
38546 /* In the 128-bit case, we've full freedom in the placement of
38547 the elements from the source operand. */
38548 for (i = 0; i < nelt; ++i)
38549 mask |= ipar[i] << (i * (nelt / 2));
38550 break;
38552 default:
38553 gcc_unreachable ();
38556 /* Make sure success has a non-zero value by adding one. */
38557 return mask + 1;
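/* Worked example: for V4SFmode with the parallel (1 0 3 2), the
   128-bit case above computes mask = 1<<0 | 0<<2 | 3<<4 | 2<<6
   = 0xb1 and the function returns 0xb2, i.e. the imm8 plus one.  */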
38560 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38561 the expansion functions to turn the parallel back into a mask.
38562 The return value is 0 for no match and the imm8+1 for a match. */
38565 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38567 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38568 unsigned mask = 0;
38569 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38571 if (XVECLEN (par, 0) != (int) nelt)
38572 return 0;
38574 /* Validate that all of the elements are constants, and not totally
38575 out of range. Copy the data into an integral array to make the
38576 subsequent checks easier. */
38577 for (i = 0; i < nelt; ++i)
38579 rtx er = XVECEXP (par, 0, i);
38580 unsigned HOST_WIDE_INT ei;
38582 if (!CONST_INT_P (er))
38583 return 0;
38584 ei = INTVAL (er);
38585 if (ei >= 2 * nelt)
38586 return 0;
38587 ipar[i] = ei;
38590 /* Validate that the halves of the permute are halves. */
38591 for (i = 0; i < nelt2 - 1; ++i)
38592 if (ipar[i] + 1 != ipar[i + 1])
38593 return 0;
38594 for (i = nelt2; i < nelt - 1; ++i)
38595 if (ipar[i] + 1 != ipar[i + 1])
38596 return 0;
38598 /* Reconstruct the mask. */
38599 for (i = 0; i < 2; ++i)
38601 unsigned e = ipar[i * nelt2];
38602 if (e % nelt2)
38603 return 0;
38604 e /= nelt2;
38605 mask |= e << (i * 4);
38608 /* Make sure success has a non-zero value by adding one. */
38609 return mask + 1;
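/* Worked example: for V4DFmode with the parallel (2 3 4 5), both
   halves are consecutive, the selectors are 2/2 = 1 and 4/2 = 2,
   so mask = 1 | 2<<4 = 0x21 and the function returns 0x22, i.e.
   the imm8 plus one.  */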
38612 /* Return a register priority for hard reg REGNO. */
38613 static int
38614 ix86_register_priority (int hard_regno)
38616 /* ebp and r13 as the base always want a displacement, r12 as the
38617 base always wants an index. So discourage their usage in an
38618 address. */
38619 if (hard_regno == R12_REG || hard_regno == R13_REG)
38620 return 0;
38621 if (hard_regno == BP_REG)
38622 return 1;
38623 /* New x86-64 int registers result in bigger code size. Discourage
38624 them. */
38625 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38626 return 2;
38627 /* New x86-64 SSE registers result in bigger code size. Discourage
38628 them. */
38629 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38630 return 2;
38631 /* Usage of AX register results in smaller code. Prefer it. */
38632 if (hard_regno == AX_REG)
38633 return 4;
38634 return 3;
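/* In summary, the resulting priorities are: eax (4), the remaining
   legacy registers (3), r8-r15 other than r12/r13 and xmm8-xmm15
   (2), ebp (1), and r12/r13 (0).  */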
38637 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38639 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38640 QImode must go into class Q_REGS.
38641 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38642 movdf to do mem-to-mem moves through integer regs. */
38644 static reg_class_t
38645 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38647 machine_mode mode = GET_MODE (x);
38649 /* We're only allowed to return a subclass of CLASS. Many of the
38650 following checks fail for NO_REGS, so eliminate that early. */
38651 if (regclass == NO_REGS)
38652 return NO_REGS;
38654 /* All classes can load zeros. */
38655 if (x == CONST0_RTX (mode))
38656 return regclass;
38658 /* Force constants into memory if we are loading a (nonzero) constant into
38659 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38660 instructions to load from a constant. */
38661 if (CONSTANT_P (x)
38662 && (MAYBE_MMX_CLASS_P (regclass)
38663 || MAYBE_SSE_CLASS_P (regclass)
38664 || MAYBE_MASK_CLASS_P (regclass)))
38665 return NO_REGS;
38667 /* Floating-point constants need more complex checks. */
38668 if (CONST_DOUBLE_P (x))
38670 /* General regs can load everything. */
38671 if (INTEGER_CLASS_P (regclass))
38672 return regclass;
38674 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38675 zero above. We only want to wind up preferring 80387 registers if
38676 we plan on doing computation with them. */
38677 if (IS_STACK_MODE (mode)
38678 && standard_80387_constant_p (x) > 0)
38680 /* Limit class to FP regs. */
38681 if (FLOAT_CLASS_P (regclass))
38682 return FLOAT_REGS;
38683 else if (regclass == FP_TOP_SSE_REGS)
38684 return FP_TOP_REG;
38685 else if (regclass == FP_SECOND_SSE_REGS)
38686 return FP_SECOND_REG;
38689 return NO_REGS;
38692 /* Prefer SSE regs only, if we can use them for math. */
38693 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38694 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
38696 /* Generally when we see PLUS here, it's the function invariant
38697 (plus soft-fp const_int). Which can only be computed into general
38698 regs. */
38699 if (GET_CODE (x) == PLUS)
38700 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
38702 /* QImode constants are easy to load, but non-constant QImode data
38703 must go into Q_REGS. */
38704 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
38706 if (Q_CLASS_P (regclass))
38707 return regclass;
38708 else if (reg_class_subset_p (Q_REGS, regclass))
38709 return Q_REGS;
38710 else
38711 return NO_REGS;
38714 return regclass;
38717 /* Discourage putting floating-point values in SSE registers unless
38718 SSE math is being used, and likewise for the 387 registers. */
38719 static reg_class_t
38720 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
38722 machine_mode mode = GET_MODE (x);
38724 /* Restrict the output reload class to the register bank that we are doing
38725 math on. If we would like not to return a subset of CLASS, reject this
38726 alternative: if reload cannot do this, it will still use its choice. */
38727 mode = GET_MODE (x);
38728 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38729 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
38731 if (IS_STACK_MODE (mode))
38733 if (regclass == FP_TOP_SSE_REGS)
38734 return FP_TOP_REG;
38735 else if (regclass == FP_SECOND_SSE_REGS)
38736 return FP_SECOND_REG;
38737 else
38738 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
38741 return regclass;
38744 static reg_class_t
38745 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
38746 machine_mode mode, secondary_reload_info *sri)
38748 /* Double-word spills from general registers to non-offsettable memory
38749 references (zero-extended addresses) require special handling. */
38750 if (TARGET_64BIT
38751 && MEM_P (x)
38752 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
38753 && INTEGER_CLASS_P (rclass)
38754 && !offsettable_memref_p (x))
38756 sri->icode = (in_p
38757 ? CODE_FOR_reload_noff_load
38758 : CODE_FOR_reload_noff_store);
38759 /* Add the cost of moving address to a temporary. */
38760 sri->extra_cost = 1;
38762 return NO_REGS;
38765 /* QImode spills from non-QI registers require
38766 an intermediate register on 32-bit targets. */
38767 if (mode == QImode
38768 && ((!TARGET_64BIT && !in_p
38769 && INTEGER_CLASS_P (rclass)
38770 && MAYBE_NON_Q_CLASS_P (rclass))
38771 || (!TARGET_AVX512DQ
38772 && MAYBE_MASK_CLASS_P (rclass))))
38774 int regno = true_regnum (x);
38776 /* Return Q_REGS if the operand is in memory. */
38777 if (regno == -1)
38778 return Q_REGS;
38780 return NO_REGS;
38783 /* This condition handles the corner case where an expression involving
38784 pointers gets vectorized. We're trying to use the address of a
38785 stack slot as a vector initializer.
38787 (set (reg:V2DI 74 [ vect_cst_.2 ])
38788 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
38790 Eventually frame gets turned into sp+offset like this:
38792 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38793 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38794 (const_int 392 [0x188]))))
38796 That later gets turned into:
38798 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38799 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38800 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
38802 We'll have the following reload recorded:
38804 Reload 0: reload_in (DI) =
38805 (plus:DI (reg/f:DI 7 sp)
38806 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
38807 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38808 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
38809 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
38810 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38811 reload_reg_rtx: (reg:V2DI 22 xmm1)
38813 Which isn't going to work since SSE instructions can't handle scalar
38814 additions. Returning GENERAL_REGS forces the addition into integer
38815 register and reload can handle subsequent reloads without problems. */
38817 if (in_p && GET_CODE (x) == PLUS
38818 && SSE_CLASS_P (rclass)
38819 && SCALAR_INT_MODE_P (mode))
38820 return GENERAL_REGS;
38822 return NO_REGS;
38825 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
38827 static bool
38828 ix86_class_likely_spilled_p (reg_class_t rclass)
38830 switch (rclass)
38832 case AREG:
38833 case DREG:
38834 case CREG:
38835 case BREG:
38836 case AD_REGS:
38837 case SIREG:
38838 case DIREG:
38839 case SSE_FIRST_REG:
38840 case FP_TOP_REG:
38841 case FP_SECOND_REG:
38842 case BND_REGS:
38843 return true;
38845 default:
38846 break;
38849 return false;
38852 /* If we are copying between registers from different register sets
38853 (e.g. FP and integer), we may need a memory location.
38855 The function can't work reliably when one of the CLASSES is a class
38856 containing registers from multiple sets. We avoid this by never combining
38857 different sets in a single alternative in the machine description.
38858 Ensure that this constraint holds to avoid unexpected surprises.
38860 When STRICT is false, we are being called from REGISTER_MOVE_COST,
38861 so do not enforce these sanity checks.
38863 To optimize register_move_cost performance, define inline variant. */
38865 static inline bool
38866 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38867 reg_class_t class2, int strict)
38869 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
38870 return false;
38872 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
38873 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
38874 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
38875 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
38876 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
38877 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
38878 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
38879 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
38881 gcc_assert (!strict || lra_in_progress);
38882 return true;
38885 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
38886 return true;
38888 /* Between mask and general, we have moves no larger than word size. */
38889 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
38890 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
38891 return true;
38893 /* ??? This is a lie. We do have moves between mmx/general, and for
38894 mmx/sse2. But by saying we need secondary memory we discourage the
38895 register allocator from using the mmx registers unless needed. */
38896 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
38897 return true;
38899 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
38901 /* SSE1 doesn't have any direct moves from other classes. */
38902 if (!TARGET_SSE2)
38903 return true;
38905 /* If the target says that inter-unit moves are more expensive
38906 than moving through memory, then don't generate them. */
38907 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
38908 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
38909 return true;
38911 /* Between SSE and general, we have moves no larger than word size. */
38912 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
38913 return true;
38916 return false;
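/* For example, on a 32-bit target a DImode copy between an SSE
   register and a general register exceeds the word size, so the
   checks above force it through memory, whereas a word-sized copy
   may move directly when the tuning allows inter-unit moves.  */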
38919 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
38921 static bool
38922 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38923 reg_class_t class2)
38925 return inline_secondary_memory_needed (mode, class1, class2, true);
38928 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
38930 get_secondary_mem widens integral modes to BITS_PER_WORD.
38931 There is no need to emit a full 64-bit move on 64-bit targets
38932 for integral modes that can be moved using a 32-bit move. */
38934 static machine_mode
38935 ix86_secondary_memory_needed_mode (machine_mode mode)
38937 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
38938 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
38939 return mode;
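/* For example, a QImode or HImode value spilled between register
   sets is widened to a 32-bit integer mode here, so a single 32-bit
   load/store pair is used instead of a narrower partial access.  */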
38942 /* Implement the TARGET_CLASS_MAX_NREGS hook.
38944 On the 80386, this is the size of MODE in words,
38945 except in the FP regs, where a single reg is always enough. */
38947 static unsigned char
38948 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
38950 if (MAYBE_INTEGER_CLASS_P (rclass))
38952 if (mode == XFmode)
38953 return (TARGET_64BIT ? 2 : 3);
38954 else if (mode == XCmode)
38955 return (TARGET_64BIT ? 4 : 6);
38956 else
38957 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
38959 else
38961 if (COMPLEX_MODE_P (mode))
38962 return 2;
38963 else
38964 return 1;
38968 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
38970 static bool
38971 ix86_can_change_mode_class (machine_mode from, machine_mode to,
38972 reg_class_t regclass)
38974 if (from == to)
38975 return true;
38977 /* x87 registers can't do subreg at all, as all values are reformatted
38978 to extended precision. */
38979 if (MAYBE_FLOAT_CLASS_P (regclass))
38980 return false;
38982 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
38984 /* Vector registers do not support QI or HImode loads. If we don't
38985 disallow a change to these modes, reload will assume it's ok to
38986 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
38987 the vec_dupv4hi pattern. */
38988 if (GET_MODE_SIZE (from) < 4)
38989 return false;
38992 return true;
38995 /* Return index of MODE in the sse load/store tables. */
38997 static inline int
38998 sse_store_index (machine_mode mode)
39000 switch (GET_MODE_SIZE (mode))
39002 case 4:
39003 return 0;
39004 case 8:
39005 return 1;
39006 case 16:
39007 return 2;
39008 case 32:
39009 return 3;
39010 case 64:
39011 return 4;
39012 default:
39013 return -1;
39017 /* Return the cost of moving data of mode M between a
39018 register and memory. A value of 2 is the default; this cost is
39019 relative to those in `REGISTER_MOVE_COST'.
39021 This function is used extensively by register_move_cost that is used to
39022 build tables at startup. Make it inline in this case.
39023 When IN is 2, return maximum of in and out move cost.
39025 If moving between registers and memory is more expensive than
39026 between two registers, you should define this macro to express the
39027 relative cost.
39029 Also model the increased cost of moving QImode registers in
39030 non-Q_REGS classes.  */
39032 static inline int
39033 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39034 int in)
39036 int cost;
39037 if (FLOAT_CLASS_P (regclass))
39039 int index;
39040 switch (mode)
39042 case E_SFmode:
39043 index = 0;
39044 break;
39045 case E_DFmode:
39046 index = 1;
39047 break;
39048 case E_XFmode:
39049 index = 2;
39050 break;
39051 default:
39052 return 100;
39054 if (in == 2)
39055 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39056 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39058 if (SSE_CLASS_P (regclass))
39060 int index = sse_store_index (mode);
39061 if (index == -1)
39062 return 100;
39063 if (in == 2)
39064 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39065 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39067 if (MMX_CLASS_P (regclass))
39069 int index;
39070 switch (GET_MODE_SIZE (mode))
39072 case 4:
39073 index = 0;
39074 break;
39075 case 8:
39076 index = 1;
39077 break;
39078 default:
39079 return 100;
39081 if (in == 2)
39082 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39083 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39085 switch (GET_MODE_SIZE (mode))
39087 case 1:
39088 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39090 if (!in)
39091 return ix86_cost->int_store[0];
39092 if (TARGET_PARTIAL_REG_DEPENDENCY
39093 && optimize_function_for_speed_p (cfun))
39094 cost = ix86_cost->movzbl_load;
39095 else
39096 cost = ix86_cost->int_load[0];
39097 if (in == 2)
39098 return MAX (cost, ix86_cost->int_store[0]);
39099 return cost;
39101 else
39103 if (in == 2)
39104 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39105 if (in)
39106 return ix86_cost->movzbl_load;
39107 else
39108 return ix86_cost->int_store[0] + 4;
39110 break;
39111 case 2:
39112 if (in == 2)
39113 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39114 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39115 default:
39116 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
39117 if (mode == TFmode)
39118 mode = XFmode;
39119 if (in == 2)
39120 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39121 else if (in)
39122 cost = ix86_cost->int_load[2];
39123 else
39124 cost = ix86_cost->int_store[2];
39125 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
39129 static int
39130 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39131 bool in)
39133 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39137 /* Return the cost of moving data from a register in class CLASS1 to
39138 one in class CLASS2.
39140 It is not required that the cost always equal 2 when FROM is the same as TO;
39141 on some machines it is expensive to move between registers if they are not
39142 general registers. */
39144 static int
39145 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39146 reg_class_t class2_i)
39148 enum reg_class class1 = (enum reg_class) class1_i;
39149 enum reg_class class2 = (enum reg_class) class2_i;
39151 /* In case we require secondary memory, compute cost of the store followed
39152 by load. In order to avoid bad register allocation choices, we need
39153 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39155 if (inline_secondary_memory_needed (mode, class1, class2, false))
39157 int cost = 1;
39159 cost += inline_memory_move_cost (mode, class1, 2);
39160 cost += inline_memory_move_cost (mode, class2, 2);
39162 /* In case of copying from general_purpose_register we may emit multiple
39163 stores followed by single load causing memory size mismatch stall.
39164 Count this as arbitrarily high cost of 20. */
39165 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
39166 && TARGET_MEMORY_MISMATCH_STALL
39167 && targetm.class_max_nregs (class1, mode)
39168 > targetm.class_max_nregs (class2, mode))
39169 cost += 20;
39171 /* In the case of FP/MMX moves, the registers actually overlap, and we
39172 have to switch modes in order to treat them differently. */
39173 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39174 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39175 cost += 20;
39177 return cost;
39180 /* Moves between SSE/MMX and integer unit are expensive. */
39181 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39182 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39184 /* ??? By keeping returned value relatively high, we limit the number
39185 of moves between integer and MMX/SSE registers for all targets.
39186 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
39187 where integer modes in MMX/SSE registers are not tieable
39188 because of missing QImode and HImode moves to, from or between
39189 MMX/SSE registers. */
39190 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
39191 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
39193 if (MAYBE_FLOAT_CLASS_P (class1))
39194 return ix86_cost->fp_move;
39195 if (MAYBE_SSE_CLASS_P (class1))
39197 if (GET_MODE_BITSIZE (mode) <= 128)
39198 return ix86_cost->xmm_move;
39199 if (GET_MODE_BITSIZE (mode) <= 256)
39200 return ix86_cost->ymm_move;
39201 return ix86_cost->zmm_move;
39203 if (MAYBE_MMX_CLASS_P (class1))
39204 return ix86_cost->mmx_move;
39205 return 2;
39208 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
39209 words of a value of mode MODE but can be less for certain modes in
39210 special long registers.
39212 Actually there are no two word move instructions for consecutive
39213 registers. And only registers 0-3 may have mov byte instructions
39214 applied to them. */
39216 static unsigned int
39217 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
39219 if (GENERAL_REGNO_P (regno))
39221 if (mode == XFmode)
39222 return TARGET_64BIT ? 2 : 3;
39223 if (mode == XCmode)
39224 return TARGET_64BIT ? 4 : 6;
39225 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39227 if (COMPLEX_MODE_P (mode))
39228 return 2;
39229 if (mode == V64SFmode || mode == V64SImode)
39230 return 4;
39231 return 1;
39234 /* Implement TARGET_HARD_REGNO_MODE_OK. */
39236 static bool
39237 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
39239 /* Flags and only flags can only hold CCmode values. */
39240 if (CC_REGNO_P (regno))
39241 return GET_MODE_CLASS (mode) == MODE_CC;
39242 if (GET_MODE_CLASS (mode) == MODE_CC
39243 || GET_MODE_CLASS (mode) == MODE_RANDOM
39244 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39245 return false;
39246 if (STACK_REGNO_P (regno))
39247 return VALID_FP_MODE_P (mode);
39248 if (MASK_REGNO_P (regno))
39249 return (VALID_MASK_REG_MODE (mode)
39250 || (TARGET_AVX512BW
39251 && VALID_MASK_AVX512BW_MODE (mode)));
39252 if (BND_REGNO_P (regno))
39253 return VALID_BND_REG_MODE (mode);
39254 if (SSE_REGNO_P (regno))
39256 /* We implement the move patterns for all vector modes into and
39257 out of SSE registers, even when no operation instructions
39258 are available. */
39260 /* For AVX-512 we allow, regardless of regno:
39261 - XI mode
39262 - any of 512-bit wide vector mode
39263 - any scalar mode. */
39264 if (TARGET_AVX512F
39265 && (mode == XImode
39266 || VALID_AVX512F_REG_MODE (mode)
39267 || VALID_AVX512F_SCALAR_MODE (mode)))
39268 return true;
39270 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
39271 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39272 && MOD4_SSE_REGNO_P (regno)
39273 && mode == V64SFmode)
39274 return true;
39276 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
39277 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39278 && MOD4_SSE_REGNO_P (regno)
39279 && mode == V64SImode)
39280 return true;
39282 /* TODO check for QI/HI scalars. */
39283 /* AVX512VL allows sse regs16+ for 128/256 bit modes. */
39284 if (TARGET_AVX512VL
39285 && (mode == OImode
39286 || mode == TImode
39287 || VALID_AVX256_REG_MODE (mode)
39288 || VALID_AVX512VL_128_REG_MODE (mode)))
39289 return true;
39291 /* xmm16-xmm31 are only available for AVX-512. */
39292 if (EXT_REX_SSE_REGNO_P (regno))
39293 return false;
39295 /* OImode and AVX modes are available only when AVX is enabled. */
39296 return ((TARGET_AVX
39297 && VALID_AVX256_REG_OR_OI_MODE (mode))
39298 || VALID_SSE_REG_MODE (mode)
39299 || VALID_SSE2_REG_MODE (mode)
39300 || VALID_MMX_REG_MODE (mode)
39301 || VALID_MMX_REG_MODE_3DNOW (mode));
39303 if (MMX_REGNO_P (regno))
39305 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39306 so if the register is available at all, then we can move data of
39307 the given mode into or out of it. */
39308 return (VALID_MMX_REG_MODE (mode)
39309 || VALID_MMX_REG_MODE_3DNOW (mode));
39312 if (mode == QImode)
39314 /* Take care for QImode values - they can be in non-QI regs,
39315 but then they do cause partial register stalls. */
39316 if (ANY_QI_REGNO_P (regno))
39317 return true;
39318 if (!TARGET_PARTIAL_REG_STALL)
39319 return true;
39320 /* LRA checks if the hard register is OK for the given mode.
39321 QImode values can live in non-QI regs, so we allow all
39322 registers here. */
39323 if (lra_in_progress)
39324 return true;
39325 return !can_create_pseudo_p ();
39327 /* We handle both integer and floats in the general purpose registers. */
39328 else if (VALID_INT_MODE_P (mode))
39329 return true;
39330 else if (VALID_FP_MODE_P (mode))
39331 return true;
39332 else if (VALID_DFP_MODE_P (mode))
39333 return true;
39334 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39335 on to use that value in smaller contexts, this can easily force a
39336 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39337 supporting DImode, allow it. */
39338 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39339 return true;
39341 return false;
39344 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
39345 saves SSE registers across calls is Win64 (thus no need to check the
39346 current ABI here), and with AVX enabled Win64 only guarantees that
39347 the low 16 bytes are saved. */
39349 static bool
39350 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
39352 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
39355 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39356 tieable integer mode. */
39358 static bool
39359 ix86_tieable_integer_mode_p (machine_mode mode)
39361 switch (mode)
39363 case E_HImode:
39364 case E_SImode:
39365 return true;
39367 case E_QImode:
39368 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39370 case E_DImode:
39371 return TARGET_64BIT;
39373 default:
39374 return false;
39378 /* Implement TARGET_MODES_TIEABLE_P.
39380 Return true if MODE1 is accessible in a register that can hold MODE2
39381 without copying. That is, all register classes that can hold MODE2
39382 can also hold MODE1. */
39384 static bool
39385 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39387 if (mode1 == mode2)
39388 return true;
39390 if (ix86_tieable_integer_mode_p (mode1)
39391 && ix86_tieable_integer_mode_p (mode2))
39392 return true;
39394 /* MODE2 being XFmode implies fp stack or general regs, which means we
39395 can tie any smaller floating point modes to it. Note that we do not
39396 tie this with TFmode. */
39397 if (mode2 == XFmode)
39398 return mode1 == SFmode || mode1 == DFmode;
39400 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39401 that we can tie it with SFmode. */
39402 if (mode2 == DFmode)
39403 return mode1 == SFmode;
39405 /* If MODE2 is only appropriate for an SSE register, then tie with
39406 any other mode acceptable to SSE registers. */
39407 if (GET_MODE_SIZE (mode2) == 32
39408 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39409 return (GET_MODE_SIZE (mode1) == 32
39410 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39411 if (GET_MODE_SIZE (mode2) == 16
39412 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39413 return (GET_MODE_SIZE (mode1) == 16
39414 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39416 /* If MODE2 is appropriate for an MMX register, then tie
39417 with any other mode acceptable to MMX registers. */
39418 if (GET_MODE_SIZE (mode2) == 8
39419 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39420 return (GET_MODE_SIZE (mode1) == 8
39421 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39423 return false;
39426 /* Return the cost of moving between two registers of mode MODE. */
39428 static int
39429 ix86_set_reg_reg_cost (machine_mode mode)
39431 unsigned int units = UNITS_PER_WORD;
39433 switch (GET_MODE_CLASS (mode))
39435 default:
39436 break;
39438 case MODE_CC:
39439 units = GET_MODE_SIZE (CCmode);
39440 break;
39442 case MODE_FLOAT:
39443 if ((TARGET_SSE && mode == TFmode)
39444 || (TARGET_80387 && mode == XFmode)
39445 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39446 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39447 units = GET_MODE_SIZE (mode);
39448 break;
39450 case MODE_COMPLEX_FLOAT:
39451 if ((TARGET_SSE && mode == TCmode)
39452 || (TARGET_80387 && mode == XCmode)
39453 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39454 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39455 units = GET_MODE_SIZE (mode);
39456 break;
39458 case MODE_VECTOR_INT:
39459 case MODE_VECTOR_FLOAT:
39460 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39461 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39462 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39463 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39464 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39465 units = GET_MODE_SIZE (mode);
39468 /* Return the cost of moving between two registers of mode MODE,
39469 assuming that the move will be in pieces of at most UNITS bytes. */
39470 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
39473 /* Return cost of vector operation in MODE given that scalar version has
39474 COST. If PARALLEL is true assume that CPU has more than one unit
39475 performing the operation. */
39477 static int
39478 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
39480 if (!VECTOR_MODE_P (mode))
39481 return cost;
39483 if (!parallel)
39484 return cost * GET_MODE_NUNITS (mode);
39485 if (GET_MODE_BITSIZE (mode) == 128
39486 && TARGET_SSE_SPLIT_REGS)
39487 return cost * 2;
39488 if (GET_MODE_BITSIZE (mode) > 128
39489 && TARGET_AVX128_OPTIMAL)
39490 return cost * GET_MODE_BITSIZE (mode) / 128;
39491 return cost;
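/* For example, with TARGET_AVX128_OPTIMAL a 256-bit vector operation
   is costed as twice the scalar COST because it is internally split
   into two 128-bit halves; without parallel execution units the cost
   scales with the number of vector elements instead.  */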
39494 /* Return cost of multiplication in MODE. */
39496 static int
39497 ix86_multiplication_cost (const struct processor_costs *cost,
39498 enum machine_mode mode)
39500 machine_mode inner_mode = mode;
39501 if (VECTOR_MODE_P (mode))
39502 inner_mode = GET_MODE_INNER (mode);
39504 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39505 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
39506 else if (X87_FLOAT_MODE_P (mode))
39507 return cost->fmul;
39508 else if (FLOAT_MODE_P (mode))
39509 return ix86_vec_cost (mode,
39510 inner_mode == DFmode
39511 ? cost->mulsd : cost->mulss, true);
39512 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39514 /* V*QImode is emulated with 7-13 insns. */
39515 if (mode == V16QImode || mode == V32QImode)
39517 int extra = 11;
39518 if (TARGET_XOP && mode == V16QImode)
39519 extra = 5;
39520 else if (TARGET_SSSE3)
39521 extra = 6;
39522 return ix86_vec_cost (mode,
39523 cost->mulss * 2 + cost->sse_op * extra,
39524 true);
39526 /* V*DImode is emulated with 5-8 insns. */
39527 else if (mode == V2DImode || mode == V4DImode)
39529 if (TARGET_XOP && mode == V2DImode)
39530 return ix86_vec_cost (mode,
39531 cost->mulss * 2 + cost->sse_op * 3,
39532 true);
39533 else
39534 return ix86_vec_cost (mode,
39535 cost->mulss * 3 + cost->sse_op * 5,
39536 true);
39538 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39539 insns, including two PMULUDQ. */
39540 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39541 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
39542 true);
39543 else
39544 return ix86_vec_cost (mode, cost->mulss, true);
39546 else
39547 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
39550 /* Return cost of division in MODE. */
39552 static int
39553 ix86_division_cost (const struct processor_costs *cost,
39554 enum machine_mode mode)
39556 machine_mode inner_mode = mode;
39557 if (VECTOR_MODE_P (mode))
39558 inner_mode = GET_MODE_INNER (mode);
39560 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39561 return inner_mode == DFmode ? cost->divsd : cost->divss;
39562 else if (X87_FLOAT_MODE_P (mode))
39563 return cost->fdiv;
39564 else if (FLOAT_MODE_P (mode))
39565 return ix86_vec_cost (mode,
39566 inner_mode == DFmode ? cost->divsd : cost->divss,
39567 true);
39568 else
39569 return cost->divide[MODE_INDEX (mode)];
39572 /* Return cost of shift in MODE.
39573 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
39574 AND_IN_OP1 specifies whether op1 is the result of an AND, and
39575 SHIFT_AND_TRUNCATE whether op1 is a result of a subreg.
39577 SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */
39579 static int
39580 ix86_shift_rotate_cost (const struct processor_costs *cost,
39581 enum machine_mode mode, bool constant_op1,
39582 HOST_WIDE_INT op1_val,
39583 bool speed,
39584 bool and_in_op1,
39585 bool shift_and_truncate,
39586 bool *skip_op0, bool *skip_op1)
39588 if (skip_op0)
39589 *skip_op0 = *skip_op1 = false;
39590 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39592 /* V*QImode is emulated with 1-11 insns. */
39593 if (mode == V16QImode || mode == V32QImode)
39595 int count = 11;
39596 if (TARGET_XOP && mode == V16QImode)
39598 /* For XOP we use vpshab, which requires a broadcast of the
39599 value to the variable shift insn. For constants this
39600 means a V16Q const in mem; even when we can perform the
39601 shift with one insn set the cost to prefer paddb. */
39602 if (constant_op1)
39604 if (skip_op1)
39605 *skip_op1 = true;
39606 return ix86_vec_cost (mode,
39607 cost->sse_op
39608 + (speed
39610 : COSTS_N_BYTES
39611 (GET_MODE_UNIT_SIZE (mode))), true);
39613 count = 3;
39615 else if (TARGET_SSSE3)
39616 count = 7;
39617 return ix86_vec_cost (mode, cost->sse_op * count, true);
39619 else
39620 return ix86_vec_cost (mode, cost->sse_op, true);
39622 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39624 if (constant_op1)
39626 if (op1_val > 32)
39627 return cost->shift_const + COSTS_N_INSNS (2);
39628 else
39629 return cost->shift_const * 2;
39631 else
39633 if (and_in_op1)
39634 return cost->shift_var * 2;
39635 else
39636 return cost->shift_var * 6 + COSTS_N_INSNS (2);
39639 else
39641 if (constant_op1)
39642 return cost->shift_const;
39643 else if (shift_and_truncate)
39645 if (skip_op0)
39646 *skip_op0 = *skip_op1 = true;
39647 /* Return the cost after shift-and truncation. */
39648 return cost->shift_var;
39650 else
39651 return cost->shift_var;
39653 return cost->shift_const;
39656 /* Compute a (partial) cost for rtx X. Return true if the complete
39657 cost has been computed, and false if subexpressions should be
39658 scanned. In either case, *TOTAL contains the cost result. */
39660 static bool
39661 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39662 int *total, bool speed)
39664 rtx mask;
39665 enum rtx_code code = GET_CODE (x);
39666 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39667 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39668 int src_cost;
39670 switch (code)
39672 case SET:
39673 if (register_operand (SET_DEST (x), VOIDmode)
39674 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39676 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39677 return true;
39680 if (register_operand (SET_SRC (x), VOIDmode))
39681 /* Avoid potentially incorrect high cost from rtx_costs
39682 for non-tieable SUBREGs. */
39683 src_cost = 0;
39684 else
39686 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
39688 if (CONSTANT_P (SET_SRC (x)))
39689 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
39690 a small value, possibly zero for cheap constants. */
39691 src_cost += COSTS_N_INSNS (1);
39694 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
39695 return true;
39697 case CONST_INT:
39698 case CONST:
39699 case LABEL_REF:
39700 case SYMBOL_REF:
39701 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39702 *total = 3;
39703 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39704 *total = 2;
39705 else if (flag_pic && SYMBOLIC_CONST (x)
39706 && !(TARGET_64BIT
39707 && (GET_CODE (x) == LABEL_REF
39708 || (GET_CODE (x) == SYMBOL_REF
39709 && SYMBOL_REF_LOCAL_P (x))))
39710 /* Use 0 cost for CONST to improve its propagation. */
39711 && (TARGET_64BIT || GET_CODE (x) != CONST))
39712 *total = 1;
39713 else
39714 *total = 0;
39715 return true;
39717 case CONST_DOUBLE:
39718 if (IS_STACK_MODE (mode))
39719 switch (standard_80387_constant_p (x))
39721 case -1:
39722 case 0:
39723 break;
39724 case 1: /* 0.0 */
39725 *total = 1;
39726 return true;
39727 default: /* Other constants */
39728 *total = 2;
39729 return true;
39731 /* FALLTHRU */
39733 case CONST_VECTOR:
39734 switch (standard_sse_constant_p (x, mode))
39736 case 0:
39737 break;
39738 case 1: /* 0: xor eliminates false dependency */
39739 *total = 0;
39740 return true;
39741 default: /* -1: cmp contains false dependency */
39742 *total = 1;
39743 return true;
39745 /* FALLTHRU */
39747 case CONST_WIDE_INT:
39748 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39749 it'll probably end up. Add a penalty for size. */
39750 *total = (COSTS_N_INSNS (1)
39751 + (!TARGET_64BIT && flag_pic)
39752 + (GET_MODE_SIZE (mode) <= 4
39753 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39754 return true;
39756 case ZERO_EXTEND:
39757 /* The zero extension is often completely free on x86_64, so make
39758 it as cheap as possible. */
39759 if (TARGET_64BIT && mode == DImode
39760 && GET_MODE (XEXP (x, 0)) == SImode)
39761 *total = 1;
39762 else if (TARGET_ZERO_EXTEND_WITH_AND)
39763 *total = cost->add;
39764 else
39765 *total = cost->movzx;
39766 return false;
39768 case SIGN_EXTEND:
39769 *total = cost->movsx;
39770 return false;
39772 case ASHIFT:
39773 if (SCALAR_INT_MODE_P (mode)
39774 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39775 && CONST_INT_P (XEXP (x, 1)))
39777 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39778 if (value == 1)
39780 *total = cost->add;
39781 return false;
39783 if ((value == 2 || value == 3)
39784 && cost->lea <= cost->shift_const)
39786 *total = cost->lea;
39787 return false;
39790 /* FALLTHRU */
39792 case ROTATE:
39793 case ASHIFTRT:
39794 case LSHIFTRT:
39795 case ROTATERT:
39796 bool skip_op0, skip_op1;
39797 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
39798 CONST_INT_P (XEXP (x, 1))
39799 ? INTVAL (XEXP (x, 1)) : -1,
39800 speed,
39801 GET_CODE (XEXP (x, 1)) == AND,
39802 SUBREG_P (XEXP (x, 1))
39803 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
39804 &skip_op0, &skip_op1);
39805 if (skip_op0 || skip_op1)
39807 if (!skip_op0)
39808 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
39809 if (!skip_op1)
39810 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
39811 return true;
39813 return false;
39815 case FMA:
39817 rtx sub;
39819 gcc_assert (FLOAT_MODE_P (mode));
39820 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39822 *total = ix86_vec_cost (mode,
39823 mode == SFmode ? cost->fmass : cost->fmasd,
39824 true);
39825 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39827 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39828 sub = XEXP (x, 0);
39829 if (GET_CODE (sub) == NEG)
39830 sub = XEXP (sub, 0);
39831 *total += rtx_cost (sub, mode, FMA, 0, speed);
39833 sub = XEXP (x, 2);
39834 if (GET_CODE (sub) == NEG)
39835 sub = XEXP (sub, 0);
39836 *total += rtx_cost (sub, mode, FMA, 2, speed);
39837 return true;
39840 case MULT:
39841 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
39843 rtx op0 = XEXP (x, 0);
39844 rtx op1 = XEXP (x, 1);
39845 int nbits;
39846 if (CONST_INT_P (XEXP (x, 1)))
39848 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39849 for (nbits = 0; value != 0; value &= value - 1)
39850 nbits++;
39852 else
39853 /* This is arbitrary. */
39854 nbits = 7;
39856 /* Compute costs correctly for widening multiplication. */
39857 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39858 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39859 == GET_MODE_SIZE (mode))
39861 int is_mulwiden = 0;
39862 machine_mode inner_mode = GET_MODE (op0);
39864 if (GET_CODE (op0) == GET_CODE (op1))
39865 is_mulwiden = 1, op1 = XEXP (op1, 0);
39866 else if (CONST_INT_P (op1))
39868 if (GET_CODE (op0) == SIGN_EXTEND)
39869 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39870 == INTVAL (op1);
39871 else
39872 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39875 if (is_mulwiden)
39876 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39879 *total = (cost->mult_init[MODE_INDEX (mode)]
39880 + nbits * cost->mult_bit
39881 + rtx_cost (op0, mode, outer_code, opno, speed)
39882 + rtx_cost (op1, mode, outer_code, opno, speed));
39884 return true;
39886 *total = ix86_multiplication_cost (cost, mode);
39887 return false;
39889 case DIV:
39890 case UDIV:
39891 case MOD:
39892 case UMOD:
39893 *total = ix86_division_cost (cost, mode);
39894 return false;
39896 case PLUS:
39897 if (GET_MODE_CLASS (mode) == MODE_INT
39898 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
39900 if (GET_CODE (XEXP (x, 0)) == PLUS
39901 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
39902 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
39903 && CONSTANT_P (XEXP (x, 1)))
39905 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
39906 if (val == 2 || val == 4 || val == 8)
39908 *total = cost->lea;
39909 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39910 outer_code, opno, speed);
39911 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
39912 outer_code, opno, speed);
39913 *total += rtx_cost (XEXP (x, 1), mode,
39914 outer_code, opno, speed);
39915 return true;
39918 else if (GET_CODE (XEXP (x, 0)) == MULT
39919 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
39921 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
39922 if (val == 2 || val == 4 || val == 8)
39924 *total = cost->lea;
39925 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39926 outer_code, opno, speed);
39927 *total += rtx_cost (XEXP (x, 1), mode,
39928 outer_code, opno, speed);
39929 return true;
39932 else if (GET_CODE (XEXP (x, 0)) == PLUS)
39934 /* Add with carry, ignore the cost of adding a carry flag. */
39935 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
39936 *total = cost->add;
39937 else
39939 *total = cost->lea;
39940 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39941 outer_code, opno, speed);
39944 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39945 outer_code, opno, speed);
39946 *total += rtx_cost (XEXP (x, 1), mode,
39947 outer_code, opno, speed);
39948 return true;
39951 /* FALLTHRU */
39953 case MINUS:
39954 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
39955 if (GET_MODE_CLASS (mode) == MODE_INT
39956 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
39957 && GET_CODE (XEXP (x, 0)) == MINUS
39958 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
39960 *total = cost->add;
39961 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39962 outer_code, opno, speed);
39963 *total += rtx_cost (XEXP (x, 1), mode,
39964 outer_code, opno, speed);
39965 return true;
39968 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39970 *total = cost->addss;
39971 return false;
39973 else if (X87_FLOAT_MODE_P (mode))
39975 *total = cost->fadd;
39976 return false;
39978 else if (FLOAT_MODE_P (mode))
39980 *total = ix86_vec_cost (mode, cost->addss, true);
39981 return false;
39983 /* FALLTHRU */
39985 case AND:
39986 case IOR:
39987 case XOR:
39988 if (GET_MODE_CLASS (mode) == MODE_INT
39989 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39991 *total = (cost->add * 2
39992 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
39993 << (GET_MODE (XEXP (x, 0)) != DImode))
39994 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
39995 << (GET_MODE (XEXP (x, 1)) != DImode)));
39996 return true;
39998 /* FALLTHRU */
40000 case NEG:
40001 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40003 *total = cost->sse_op;
40004 return false;
40006 else if (X87_FLOAT_MODE_P (mode))
40008 *total = cost->fchs;
40009 return false;
40011 else if (FLOAT_MODE_P (mode))
40013 *total = ix86_vec_cost (mode, cost->sse_op, true);
40014 return false;
40016 /* FALLTHRU */
40018 case NOT:
40019 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40020 *total = ix86_vec_cost (mode, cost->sse_op, true);
40021 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40022 *total = cost->add * 2;
40023 else
40024 *total = cost->add;
40025 return false;
40027 case COMPARE:
40028 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
40029 && XEXP (XEXP (x, 0), 1) == const1_rtx
40030 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
40031 && XEXP (x, 1) == const0_rtx)
40033 /* This kind of construct is implemented using test[bwl].
40034 Treat it as if we had an AND. */
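        /* For example, a test of a single bit such as (x >> 3) & 1 against
           zero is typically emitted as "testb $8, ..." or "testl $8, ...",
           i.e. an AND with a one-bit immediate mask whose result is only
           used to set the flags (illustrative).  */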
40035 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40036 *total = (cost->add
40037 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40038 opno, speed)
40039 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40040 return true;
40043 /* The embedded comparison operand is completely free. */
40044 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40045 && XEXP (x, 1) == const0_rtx)
40046 *total = 0;
40048 return false;
40050 case FLOAT_EXTEND:
40051 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40052 *total = 0;
40053 else
40054 *total = ix86_vec_cost (mode, cost->addss, true);
40055 return false;
40057 case FLOAT_TRUNCATE:
40058 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40059 *total = cost->fadd;
40060 else
40061 *total = ix86_vec_cost (mode, cost->addss, true);
40062 return false;
40064 case ABS:
40065     /* SSE requires a memory load for the constant operand.  It may make
40066        sense to account for this.  Of course the constant operand may or
40067        may not be reused. */
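      /* For example, SSE fabs is typically implemented as an AND with a
         constant mask that clears the sign bit, and that mask has to be
         loaded from memory (illustrative).  */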
40068 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40069 *total = cost->sse_op;
40070 else if (X87_FLOAT_MODE_P (mode))
40071 *total = cost->fabs;
40072 else if (FLOAT_MODE_P (mode))
40073 *total = ix86_vec_cost (mode, cost->sse_op, true);
40074 return false;
40076 case SQRT:
40077 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40078 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
40079 else if (X87_FLOAT_MODE_P (mode))
40080 *total = cost->fsqrt;
40081 else if (FLOAT_MODE_P (mode))
40082 *total = ix86_vec_cost (mode,
40083 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
40084 true);
40085 return false;
40087 case UNSPEC:
40088 if (XINT (x, 1) == UNSPEC_TP)
40089 *total = 0;
40090 return false;
40092 case VEC_SELECT:
40093 case VEC_CONCAT:
40094 case VEC_DUPLICATE:
40095       /* ??? Assume all of these vector manipulation patterns are
40096          recognizable, in which case they all have pretty much the
40097          same cost. */
40098 *total = cost->sse_op;
40099 return true;
40100 case VEC_MERGE:
40101 mask = XEXP (x, 2);
40102       /* This is a masked instruction; assume the same cost
40103          as the non-masked variant. */
40104 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40105 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40106 else
40107 *total = cost->sse_op;
40108 return true;
40110 default:
40111 return false;
40115 #if TARGET_MACHO
40117 static int current_machopic_label_num;
40119 /* Given a symbol name and its associated stub, write out the
40120 definition of the stub. */
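/* A rough sketch of the MACHOPIC_PURE output for a hypothetical symbol
   "_foo" with label number N (names and spacing illustrative only, derived
   from the fprintf calls below):

        L_foo$stub:
                .indirect_symbol _foo
                (call to the PC thunk, loading %ecx)
        LPC$N:  movl    LN$lz-LPC$N(%ecx),%ecx
                jmp     *%ecx
        (binder):
                lea     LN$lz-(binder)(%ecx),%ecx
                pushl   %ecx
                jmp     dyld_stub_binding_helper
        LN$lz:
                .indirect_symbol _foo
                .long   (binder)
*/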
40122 void
40123 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40125 unsigned int length;
40126 char *binder_name, *symbol_name, lazy_ptr_name[32];
40127 int label = ++current_machopic_label_num;
40129 /* For 64-bit we shouldn't get here. */
40130 gcc_assert (!TARGET_64BIT);
40132 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40133 symb = targetm.strip_name_encoding (symb);
40135 length = strlen (stub);
40136 binder_name = XALLOCAVEC (char, length + 32);
40137 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40139 length = strlen (symb);
40140 symbol_name = XALLOCAVEC (char, length + 32);
40141 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40143 sprintf (lazy_ptr_name, "L%d$lz", label);
40145 if (MACHOPIC_ATT_STUB)
40146 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40147 else if (MACHOPIC_PURE)
40148 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40149 else
40150 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40152 fprintf (file, "%s:\n", stub);
40153 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40155 if (MACHOPIC_ATT_STUB)
40157 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40159 else if (MACHOPIC_PURE)
40161 /* PIC stub. */
40162 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40163 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40164 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40165 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40166 label, lazy_ptr_name, label);
40167 fprintf (file, "\tjmp\t*%%ecx\n");
40169 else
40170 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40172 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40173 it needs no stub-binding-helper. */
40174 if (MACHOPIC_ATT_STUB)
40175 return;
40177 fprintf (file, "%s:\n", binder_name);
40179 if (MACHOPIC_PURE)
40181 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40182 fprintf (file, "\tpushl\t%%ecx\n");
40184 else
40185 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40187 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40189 /* N.B. Keep the correspondence of these
40190 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40191 old-pic/new-pic/non-pic stubs; altering this will break
40192 compatibility with existing dylibs. */
40193 if (MACHOPIC_PURE)
40195 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40196 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40198 else
40199 /* 16-byte -mdynamic-no-pic stub. */
40200 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
40202 fprintf (file, "%s:\n", lazy_ptr_name);
40203 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40204 fprintf (file, ASM_LONG "%s\n", binder_name);
40206 #endif /* TARGET_MACHO */
40208 /* Order the registers for register allocator. */
40210 void
40211 x86_order_regs_for_local_alloc (void)
40213 int pos = 0;
40214 int i;
40216 /* First allocate the local general purpose registers. */
40217 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40218 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40219 reg_alloc_order [pos++] = i;
40221 /* Global general purpose registers. */
40222 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40223 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40224 reg_alloc_order [pos++] = i;
40226 /* x87 registers come first in case we are doing FP math
40227 using them. */
40228 if (!TARGET_SSE_MATH)
40229 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40230 reg_alloc_order [pos++] = i;
40232 /* SSE registers. */
40233 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40234 reg_alloc_order [pos++] = i;
40235 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40236 reg_alloc_order [pos++] = i;
40238 /* Extended REX SSE registers. */
40239 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40240 reg_alloc_order [pos++] = i;
40242 /* Mask register. */
40243 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40244 reg_alloc_order [pos++] = i;
40246 /* MPX bound registers. */
40247 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40248 reg_alloc_order [pos++] = i;
40250 /* x87 registers. */
40251 if (TARGET_SSE_MATH)
40252 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40253 reg_alloc_order [pos++] = i;
40255 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40256 reg_alloc_order [pos++] = i;
40258 /* Initialize the rest of array as we do not allocate some registers
40259 at all. */
40260 while (pos < FIRST_PSEUDO_REGISTER)
40261 reg_alloc_order [pos++] = 0;
40264 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40265 in struct attribute_spec handler. */
40266 static tree
40267 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
40268 bool *no_add_attrs)
40270 if (TREE_CODE (*node) != FUNCTION_TYPE
40271 && TREE_CODE (*node) != METHOD_TYPE
40272 && TREE_CODE (*node) != FIELD_DECL
40273 && TREE_CODE (*node) != TYPE_DECL)
40275 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40276 name);
40277 *no_add_attrs = true;
40278 return NULL_TREE;
40280 if (TARGET_64BIT)
40282 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40283 name);
40284 *no_add_attrs = true;
40285 return NULL_TREE;
40287 if (is_attribute_p ("callee_pop_aggregate_return", name))
40289 tree cst;
40291 cst = TREE_VALUE (args);
40292 if (TREE_CODE (cst) != INTEGER_CST)
40294 warning (OPT_Wattributes,
40295 "%qE attribute requires an integer constant argument",
40296 name);
40297 *no_add_attrs = true;
40299 else if (compare_tree_int (cst, 0) != 0
40300 && compare_tree_int (cst, 1) != 0)
40302 warning (OPT_Wattributes,
40303 "argument to %qE attribute is neither zero, nor one",
40304 name);
40305 *no_add_attrs = true;
40308 return NULL_TREE;
40311 return NULL_TREE;
40314 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
40315 struct attribute_spec.handler. */
40316 static tree
40317 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40318 bool *no_add_attrs)
40320 if (TREE_CODE (*node) != FUNCTION_TYPE
40321 && TREE_CODE (*node) != METHOD_TYPE
40322 && TREE_CODE (*node) != FIELD_DECL
40323 && TREE_CODE (*node) != TYPE_DECL)
40325 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40326 name);
40327 *no_add_attrs = true;
40328 return NULL_TREE;
40331 /* Can combine regparm with all attributes but fastcall. */
40332 if (is_attribute_p ("ms_abi", name))
40334 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40336 error ("ms_abi and sysv_abi attributes are not compatible");
40339 return NULL_TREE;
40341 else if (is_attribute_p ("sysv_abi", name))
40343 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40345 error ("ms_abi and sysv_abi attributes are not compatible");
40348 return NULL_TREE;
40351 return NULL_TREE;
40354 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40355 struct attribute_spec.handler. */
40356 static tree
40357 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40358 bool *no_add_attrs)
40360 tree *type = NULL;
40361 if (DECL_P (*node))
40363 if (TREE_CODE (*node) == TYPE_DECL)
40364 type = &TREE_TYPE (*node);
40366 else
40367 type = node;
40369 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40371 warning (OPT_Wattributes, "%qE attribute ignored",
40372 name);
40373 *no_add_attrs = true;
40376 else if ((is_attribute_p ("ms_struct", name)
40377 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40378 || ((is_attribute_p ("gcc_struct", name)
40379 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40381 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40382 name);
40383 *no_add_attrs = true;
40386 return NULL_TREE;
40389 static tree
40390 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40391 bool *no_add_attrs)
40393 if (TREE_CODE (*node) != FUNCTION_DECL)
40395 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40396 name);
40397 *no_add_attrs = true;
40399 return NULL_TREE;
40402 static tree
40403 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40404 int, bool *)
40406 return NULL_TREE;
40409 static tree
40410 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40412   /* DECL_RESULT and DECL_ARGUMENTS do not exist at this point yet,
40413      but the function type contains the argument and return type data. */
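  /* Illustratively, the signatures this accepts look like (see the GCC
     manual for the exact contract):
        void handler (struct interrupt_frame *frame);
        void handler (struct interrupt_frame *frame, uword_t error_code);
     where the second, word_mode integer argument is only present for
     exception handlers that receive an error code.  */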
40414 tree func_type = *node;
40415 tree return_type = TREE_TYPE (func_type);
40417 int nargs = 0;
40418 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40419 while (current_arg_type
40420 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40422 if (nargs == 0)
40424 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40425 error ("interrupt service routine should have a pointer "
40426 "as the first argument");
40428 else if (nargs == 1)
40430 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40431 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40432 error ("interrupt service routine should have unsigned %s"
40433 "int as the second argument",
40434 TARGET_64BIT
40435 ? (TARGET_X32 ? "long long " : "long ")
40436 : "");
40438 nargs++;
40439 current_arg_type = TREE_CHAIN (current_arg_type);
40441 if (!nargs || nargs > 2)
40442 error ("interrupt service routine can only have a pointer argument "
40443 "and an optional integer argument");
40444 if (! VOID_TYPE_P (return_type))
40445 error ("interrupt service routine can't have non-void return value");
40447 return NULL_TREE;
40450 static bool
40451 ix86_ms_bitfield_layout_p (const_tree record_type)
40453 return ((TARGET_MS_BITFIELD_LAYOUT
40454 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40455 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40458 /* Returns an expression indicating where the this parameter is
40459 located on entry to the FUNCTION. */
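/* For instance, under the 64-bit SysV ABI this is normally %rdi, or %rsi
   when a hidden aggregate-return pointer occupies the first integer
   register; under the 64-bit MS ABI it is %rcx or %rdx respectively.  */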
40461 static rtx
40462 x86_this_parameter (tree function)
40464 tree type = TREE_TYPE (function);
40465 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40466 int nregs;
40468 if (TARGET_64BIT)
40470 const int *parm_regs;
40472 if (ix86_function_type_abi (type) == MS_ABI)
40473 parm_regs = x86_64_ms_abi_int_parameter_registers;
40474 else
40475 parm_regs = x86_64_int_parameter_registers;
40476 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40479 nregs = ix86_function_regparm (type, function);
40481 if (nregs > 0 && !stdarg_p (type))
40483 int regno;
40484 unsigned int ccvt = ix86_get_callcvt (type);
40486 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40487 regno = aggr ? DX_REG : CX_REG;
40488 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40490 regno = CX_REG;
40491 if (aggr)
40492 return gen_rtx_MEM (SImode,
40493 plus_constant (Pmode, stack_pointer_rtx, 4));
40495 else
40497 regno = AX_REG;
40498 if (aggr)
40500 regno = DX_REG;
40501 if (nregs == 1)
40502 return gen_rtx_MEM (SImode,
40503 plus_constant (Pmode,
40504 stack_pointer_rtx, 4));
40507 return gen_rtx_REG (SImode, regno);
40510 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40511 aggr ? 8 : 4));
40514 /* Determine whether x86_output_mi_thunk can succeed. */
40516 static bool
40517 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40518 const_tree function)
40520 /* 64-bit can handle anything. */
40521 if (TARGET_64BIT)
40522 return true;
40524 /* For 32-bit, everything's fine if we have one free register. */
40525 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40526 return true;
40528 /* Need a free register for vcall_offset. */
40529 if (vcall_offset)
40530 return false;
40532 /* Need a free register for GOT references. */
40533 if (flag_pic && !targetm.binds_local_p (function))
40534 return false;
40536 /* Otherwise ok. */
40537 return true;
40540 /* Output the assembler code for a thunk function. THUNK_DECL is the
40541 declaration for the thunk function itself, FUNCTION is the decl for
40542 the target function. DELTA is an immediate constant offset to be
40543 added to THIS. If VCALL_OFFSET is nonzero, the word at
40544 *(*this + vcall_offset) should be added to THIS. */
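/* In rough C-like pseudo-code (a sketch of the semantics, not of the
   emitted assembly):

     this += DELTA;
     if (VCALL_OFFSET)
       this += *(ptrdiff_t *) (*(char **) this + VCALL_OFFSET);
     goto FUNCTION;    (performed as a sibling call below)
*/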
40546 static void
40547 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40548 HOST_WIDE_INT vcall_offset, tree function)
40550 rtx this_param = x86_this_parameter (function);
40551 rtx this_reg, tmp, fnaddr;
40552 unsigned int tmp_regno;
40553 rtx_insn *insn;
40555 if (TARGET_64BIT)
40556 tmp_regno = R10_REG;
40557 else
40559 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40560 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40561 tmp_regno = AX_REG;
40562 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40563 tmp_regno = DX_REG;
40564 else
40565 tmp_regno = CX_REG;
40568 emit_note (NOTE_INSN_PROLOGUE_END);
40570   /* If CET branch protection is enabled, insert an ENDBR instruction. */
40571 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
40572 emit_insn (gen_nop_endbr ());
40574 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40575 pull it in now and let DELTA benefit. */
40576 if (REG_P (this_param))
40577 this_reg = this_param;
40578 else if (vcall_offset)
40580 /* Put the this parameter into %eax. */
40581 this_reg = gen_rtx_REG (Pmode, AX_REG);
40582 emit_move_insn (this_reg, this_param);
40584 else
40585 this_reg = NULL_RTX;
40587 /* Adjust the this parameter by a fixed constant. */
40588 if (delta)
40590 rtx delta_rtx = GEN_INT (delta);
40591 rtx delta_dst = this_reg ? this_reg : this_param;
40593 if (TARGET_64BIT)
40595 if (!x86_64_general_operand (delta_rtx, Pmode))
40597 tmp = gen_rtx_REG (Pmode, tmp_regno);
40598 emit_move_insn (tmp, delta_rtx);
40599 delta_rtx = tmp;
40603 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40606 /* Adjust the this parameter by a value stored in the vtable. */
40607 if (vcall_offset)
40609 rtx vcall_addr, vcall_mem, this_mem;
40611 tmp = gen_rtx_REG (Pmode, tmp_regno);
40613 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40614 if (Pmode != ptr_mode)
40615 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40616 emit_move_insn (tmp, this_mem);
40618 /* Adjust the this parameter. */
40619 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40620 if (TARGET_64BIT
40621 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40623 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40624 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40625 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40628 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40629 if (Pmode != ptr_mode)
40630 emit_insn (gen_addsi_1_zext (this_reg,
40631 gen_rtx_REG (ptr_mode,
40632 REGNO (this_reg)),
40633 vcall_mem));
40634 else
40635 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40638 /* If necessary, drop THIS back to its stack slot. */
40639 if (this_reg && this_reg != this_param)
40640 emit_move_insn (this_param, this_reg);
40642 fnaddr = XEXP (DECL_RTL (function), 0);
40643 if (TARGET_64BIT)
40645 if (!flag_pic || targetm.binds_local_p (function)
40646 || TARGET_PECOFF)
40648 else
40650 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40651 tmp = gen_rtx_CONST (Pmode, tmp);
40652 fnaddr = gen_const_mem (Pmode, tmp);
40655 else
40657 if (!flag_pic || targetm.binds_local_p (function))
40659 #if TARGET_MACHO
40660 else if (TARGET_MACHO)
40662 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40663 fnaddr = XEXP (fnaddr, 0);
40665 #endif /* TARGET_MACHO */
40666 else
40668 tmp = gen_rtx_REG (Pmode, CX_REG);
40669 output_set_got (tmp, NULL_RTX);
40671 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40672 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40673 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40674 fnaddr = gen_const_mem (Pmode, fnaddr);
40678 /* Our sibling call patterns do not allow memories, because we have no
40679 predicate that can distinguish between frame and non-frame memory.
40680 For our purposes here, we can get away with (ab)using a jump pattern,
40681 because we're going to do no optimization. */
40682 if (MEM_P (fnaddr))
40684 if (sibcall_insn_operand (fnaddr, word_mode))
40686 fnaddr = XEXP (DECL_RTL (function), 0);
40687 tmp = gen_rtx_MEM (QImode, fnaddr);
40688 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40689 tmp = emit_call_insn (tmp);
40690 SIBLING_CALL_P (tmp) = 1;
40692 else
40693 emit_jump_insn (gen_indirect_jump (fnaddr));
40695 else
40697 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40699	  // CM_LARGE_PIC always uses a pseudo PIC register, which is
40700	  // uninitialized.  Since FUNCTION is local and calling it
40701	  // doesn't go through the PLT, we use scratch register %r11 as
40702	  // the PIC register and initialize it here.
40703 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40704 ix86_init_large_pic_reg (tmp_regno);
40705 fnaddr = legitimize_pic_address (fnaddr,
40706 gen_rtx_REG (Pmode, tmp_regno));
40709 if (!sibcall_insn_operand (fnaddr, word_mode))
40711 tmp = gen_rtx_REG (word_mode, tmp_regno);
40712 if (GET_MODE (fnaddr) != word_mode)
40713 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40714 emit_move_insn (tmp, fnaddr);
40715 fnaddr = tmp;
40718 tmp = gen_rtx_MEM (QImode, fnaddr);
40719 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40720 tmp = emit_call_insn (tmp);
40721 SIBLING_CALL_P (tmp) = 1;
40723 emit_barrier ();
40725 /* Emit just enough of rest_of_compilation to get the insns emitted.
40726 Note that use_thunk calls assemble_start_function et al. */
40727 insn = get_insns ();
40728 shorten_branches (insn);
40729 final_start_function (insn, file, 1);
40730 final (insn, file, 1);
40731 final_end_function ();
40734 static void
40735 x86_file_start (void)
40737 default_file_start ();
40738 if (TARGET_16BIT)
40739 fputs ("\t.code16gcc\n", asm_out_file);
40740 #if TARGET_MACHO
40741 darwin_file_start ();
40742 #endif
40743 if (X86_FILE_START_VERSION_DIRECTIVE)
40744 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40745 if (X86_FILE_START_FLTUSED)
40746 fputs ("\t.global\t__fltused\n", asm_out_file);
40747 if (ix86_asm_dialect == ASM_INTEL)
40748 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
40752 x86_field_alignment (tree type, int computed)
40754 machine_mode mode;
40756 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40757 return computed;
40758 if (TARGET_IAMCU)
40759 return iamcu_alignment (type, computed);
40760 mode = TYPE_MODE (strip_array_types (type));
40761 if (mode == DFmode || mode == DCmode
40762 || GET_MODE_CLASS (mode) == MODE_INT
40763 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40764 return MIN (32, computed);
40765 return computed;
40768 /* Print call to TARGET to FILE. */
40770 static void
40771 x86_print_call_or_nop (FILE *file, const char *target)
40773 if (flag_nop_mcount)
40774 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
40775 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
40776 else
40777 fprintf (file, "1:\tcall\t%s\n", target);
40780 /* Output assembler code to FILE to increment profiler label # LABELNO
40781 for profiling a function entry. */
40782 void
40783 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40785 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40786 : MCOUNT_NAME);
40787 if (TARGET_64BIT)
40789 #ifndef NO_PROFILE_COUNTERS
40790 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40791 #endif
40793 if (!TARGET_PECOFF && flag_pic)
40794 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40795 else
40796 x86_print_call_or_nop (file, mcount_name);
40798 else if (flag_pic)
40800 #ifndef NO_PROFILE_COUNTERS
40801 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40802 LPREFIX, labelno);
40803 #endif
40804 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40806 else
40808 #ifndef NO_PROFILE_COUNTERS
40809 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40810 LPREFIX, labelno);
40811 #endif
40812 x86_print_call_or_nop (file, mcount_name);
40815 if (flag_record_mcount)
40817 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40818 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40819 fprintf (file, "\t.previous\n");
40823 /* We don't have exact information about the insn sizes, but we may assume
40824    quite safely that we are informed about all 1-byte insns and memory
40825    address sizes.  This is enough to eliminate unnecessary padding in
40826    99% of cases. */
40829 ix86_min_insn_size (rtx_insn *insn)
40831 int l = 0, len;
40833 if (!INSN_P (insn) || !active_insn_p (insn))
40834 return 0;
40836   /* Discard alignments we've emitted and jump instructions. */
40837 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40838 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40839 return 0;
40841 /* Important case - calls are always 5 bytes.
40842 It is common to have many calls in the row. */
40843 if (CALL_P (insn)
40844 && symbolic_reference_mentioned_p (PATTERN (insn))
40845 && !SIBLING_CALL_P (insn))
40846 return 5;
40847 len = get_attr_length (insn);
40848 if (len <= 1)
40849 return 1;
40851 /* For normal instructions we rely on get_attr_length being exact,
40852 with a few exceptions. */
40853 if (!JUMP_P (insn))
40855 enum attr_type type = get_attr_type (insn);
40857 switch (type)
40859 case TYPE_MULTI:
40860 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40861 || asm_noperands (PATTERN (insn)) >= 0)
40862 return 0;
40863 break;
40864 case TYPE_OTHER:
40865 case TYPE_FCMP:
40866 break;
40867 default:
40868 /* Otherwise trust get_attr_length. */
40869 return len;
40872 l = get_attr_length_address (insn);
40873 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40874 l = 4;
40876 if (l)
40877 return 1+l;
40878 else
40879 return 2;
40882 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40884 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16-byte
40885    window. */
40887 static void
40888 ix86_avoid_jump_mispredicts (void)
40890 rtx_insn *insn, *start = get_insns ();
40891 int nbytes = 0, njumps = 0;
40892 bool isjump = false;
40894   /* Look for all minimal intervals of instructions containing 4 jumps.
40895      The intervals are bounded by START and INSN.  NBYTES is the total
40896      size of instructions in the interval including INSN and not including
40897      START.  When NBYTES is smaller than 16 bytes, it is possible
40898      that START and INSN end up in the same 16-byte page.
40900      The smallest offset in the page at which INSN can start is the case
40901      where START ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
40902      We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
40904      Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
40905      have to, control transfer to its label(s) can be performed through other
40906      means, and we also estimate the minimum length of all asm stmts as 0. */
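  /* Worked example (illustrative): if the previous three jumps and the
     intervening insns add up to NBYTES == 12 and the current jump INSN is
     2 bytes long, the pad emitted below is 15 - 12 + 2 = 5 bytes, which is
     enough to push the fourth jump out of the 16-byte window.  */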
40907 for (insn = start; insn; insn = NEXT_INSN (insn))
40909 int min_size;
40911 if (LABEL_P (insn))
40913 int align = label_to_alignment (insn);
40914 int max_skip = label_to_max_skip (insn);
40916 if (max_skip > 15)
40917 max_skip = 15;
40918 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
40919 already in the current 16 byte page, because otherwise
40920 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
40921 bytes to reach 16 byte boundary. */
40922 if (align <= 0
40923 || (align <= 3 && max_skip != (1 << align) - 1))
40924 max_skip = 0;
40925 if (dump_file)
40926 fprintf (dump_file, "Label %i with max_skip %i\n",
40927 INSN_UID (insn), max_skip);
40928 if (max_skip)
40930 while (nbytes + max_skip >= 16)
40932 start = NEXT_INSN (start);
40933 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40934 || CALL_P (start))
40935 njumps--, isjump = true;
40936 else
40937 isjump = false;
40938 nbytes -= ix86_min_insn_size (start);
40941 continue;
40944 min_size = ix86_min_insn_size (insn);
40945 nbytes += min_size;
40946 if (dump_file)
40947 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
40948 INSN_UID (insn), min_size);
40949 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
40950 || CALL_P (insn))
40951 njumps++;
40952 else
40953 continue;
40955 while (njumps > 3)
40957 start = NEXT_INSN (start);
40958 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40959 || CALL_P (start))
40960 njumps--, isjump = true;
40961 else
40962 isjump = false;
40963 nbytes -= ix86_min_insn_size (start);
40965 gcc_assert (njumps >= 0);
40966 if (dump_file)
40967 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
40968 INSN_UID (start), INSN_UID (insn), nbytes);
40970 if (njumps == 3 && isjump && nbytes < 16)
40972 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
40974 if (dump_file)
40975 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
40976 INSN_UID (insn), padsize);
40977 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
40981 #endif
40983 /* AMD Athlon works faster
40984    when RET is not the destination of a conditional jump or directly preceded
40985    by another jump instruction.  We avoid the penalty by emitting a longer
40986    form of the return in such cases. */
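/* A sketch of the effect: the plain "ret" is replaced below by the longer
   return form (simple_return_internal_long), commonly assembled as the
   two-byte "rep ret", which sidesteps the predictor penalty at no extra
   cost.  */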
40987 static void
40988 ix86_pad_returns (void)
40990 edge e;
40991 edge_iterator ei;
40993 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
40995 basic_block bb = e->src;
40996 rtx_insn *ret = BB_END (bb);
40997 rtx_insn *prev;
40998 bool replace = false;
41000 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
41001 || optimize_bb_for_size_p (bb))
41002 continue;
41003 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
41004 if (active_insn_p (prev) || LABEL_P (prev))
41005 break;
41006 if (prev && LABEL_P (prev))
41008 edge e;
41009 edge_iterator ei;
41011 FOR_EACH_EDGE (e, ei, bb->preds)
41012 if (EDGE_FREQUENCY (e) && e->src->index >= 0
41013 && !(e->flags & EDGE_FALLTHRU))
41015 replace = true;
41016 break;
41019 if (!replace)
41021 prev = prev_active_insn (ret);
41022 if (prev
41023 && ((JUMP_P (prev) && any_condjump_p (prev))
41024 || CALL_P (prev)))
41025 replace = true;
41026 /* Empty functions get branch mispredict even when
41027 the jump destination is not visible to us. */
41028 if (!prev && !optimize_function_for_size_p (cfun))
41029 replace = true;
41031 if (replace)
41033 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41034 delete_insn (ret);
41039 /* Count the minimum number of instructions in BB. Return 4 if the
41040 number of instructions >= 4. */
41042 static int
41043 ix86_count_insn_bb (basic_block bb)
41045 rtx_insn *insn;
41046 int insn_count = 0;
41048 /* Count number of instructions in this block. Return 4 if the number
41049 of instructions >= 4. */
41050 FOR_BB_INSNS (bb, insn)
41052       /* This only happens in exit blocks. */
41053 if (JUMP_P (insn)
41054 && ANY_RETURN_P (PATTERN (insn)))
41055 break;
41057 if (NONDEBUG_INSN_P (insn)
41058 && GET_CODE (PATTERN (insn)) != USE
41059 && GET_CODE (PATTERN (insn)) != CLOBBER)
41061 insn_count++;
41062 if (insn_count >= 4)
41063 return insn_count;
41067 return insn_count;
41071 /* Count the minimum number of instructions in code path in BB.
41072 Return 4 if the number of instructions >= 4. */
41074 static int
41075 ix86_count_insn (basic_block bb)
41077 edge e;
41078 edge_iterator ei;
41079 int min_prev_count;
41081 /* Only bother counting instructions along paths with no
41082 more than 2 basic blocks between entry and exit. Given
41083 that BB has an edge to exit, determine if a predecessor
41084 of BB has an edge from entry. If so, compute the number
41085 of instructions in the predecessor block. If there
41086 happen to be multiple such blocks, compute the minimum. */
41087 min_prev_count = 4;
41088 FOR_EACH_EDGE (e, ei, bb->preds)
41090 edge prev_e;
41091 edge_iterator prev_ei;
41093 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41095 min_prev_count = 0;
41096 break;
41098 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41100 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41102 int count = ix86_count_insn_bb (e->src);
41103 if (count < min_prev_count)
41104 min_prev_count = count;
41105 break;
41110 if (min_prev_count < 4)
41111 min_prev_count += ix86_count_insn_bb (bb);
41113 return min_prev_count;
41116 /* Pad short function to 4 instructions. */
41118 static void
41119 ix86_pad_short_function (void)
41121 edge e;
41122 edge_iterator ei;
41124 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41126 rtx_insn *ret = BB_END (e->src);
41127 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41129 int insn_count = ix86_count_insn (e->src);
41131 /* Pad short function. */
41132 if (insn_count < 4)
41134 rtx_insn *insn = ret;
41136 /* Find epilogue. */
41137 while (insn
41138 && (!NOTE_P (insn)
41139 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41140 insn = PREV_INSN (insn);
41142 if (!insn)
41143 insn = ret;
41145 /* Two NOPs count as one instruction. */
41146 insn_count = 2 * (4 - insn_count);
41147 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41153 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41154 the epilogue, the Windows system unwinder will apply epilogue logic and
41155 produce incorrect offsets. This can be avoided by adding a nop between
41156 the last insn that can throw and the first insn of the epilogue. */
41158 static void
41159 ix86_seh_fixup_eh_fallthru (void)
41161 edge e;
41162 edge_iterator ei;
41164 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41166 rtx_insn *insn, *next;
41168 /* Find the beginning of the epilogue. */
41169 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41170 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41171 break;
41172 if (insn == NULL)
41173 continue;
41175 /* We only care about preceding insns that can throw. */
41176 insn = prev_active_insn (insn);
41177 if (insn == NULL || !can_throw_internal (insn))
41178 continue;
41180 /* Do not separate calls from their debug information. */
41181 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41182 if (NOTE_P (next)
41183 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41184 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41185 insn = next;
41186 else
41187 break;
41189 emit_insn_after (gen_nops (const1_rtx), insn);
41193 /* Given a register number BASE, the lowest of a group of registers, update
41194    regsets IN and OUT with the registers that should be avoided in input
41195    and output operands respectively when trying to avoid certain modr/m
41196    byte encodings for -mmitigate-rop. */
41198 static void
41199 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41201 SET_HARD_REG_BIT (out, base);
41202 SET_HARD_REG_BIT (out, base + 1);
41203 SET_HARD_REG_BIT (in, base + 2);
41204 SET_HARD_REG_BIT (in, base + 3);
41207 /* Called if -mmitigate-rop is in effect. Try to rewrite instructions so
41208 that certain encodings of modr/m bytes do not occur. */
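/* The idea, roughly: a byte such as 0xc3 (the encoding of RET) can show up
   as the modr/m byte of a longer instruction, handing an attacker who jumps
   into the middle of it an unintended return gadget.  Renaming one of the
   registers involved changes the modr/m byte and removes the gadget
   (illustrative motivation).  */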
41209 static void
41210 ix86_mitigate_rop (void)
41212 HARD_REG_SET input_risky;
41213 HARD_REG_SET output_risky;
41214 HARD_REG_SET inout_risky;
41216 CLEAR_HARD_REG_SET (output_risky);
41217 CLEAR_HARD_REG_SET (input_risky);
41218 SET_HARD_REG_BIT (output_risky, AX_REG);
41219 SET_HARD_REG_BIT (output_risky, CX_REG);
41220 SET_HARD_REG_BIT (input_risky, BX_REG);
41221 SET_HARD_REG_BIT (input_risky, DX_REG);
41222 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41223 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41224 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41225 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41226 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41227 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41228 COPY_HARD_REG_SET (inout_risky, input_risky);
41229 IOR_HARD_REG_SET (inout_risky, output_risky);
41231 df_note_add_problem ();
41232 /* Fix up what stack-regs did. */
41233 df_insn_rescan_all ();
41234 df_analyze ();
41236 regrename_init (true);
41237 regrename_analyze (NULL);
41239 auto_vec<du_head_p> cands;
41241 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41243 if (!NONDEBUG_INSN_P (insn))
41244 continue;
41246 if (GET_CODE (PATTERN (insn)) == USE
41247 || GET_CODE (PATTERN (insn)) == CLOBBER)
41248 continue;
41250 extract_insn (insn);
41252 int opno0, opno1;
41253 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41254 recog_data.n_operands, &opno0,
41255 &opno1);
41257 if (!ix86_rop_should_change_byte_p (modrm))
41258 continue;
41260 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41262 /* This happens when regrename has to fail a block. */
41263 if (!info->op_info)
41264 continue;
41266 if (info->op_info[opno0].n_chains != 0)
41268 gcc_assert (info->op_info[opno0].n_chains == 1);
41269 du_head_p op0c;
41270 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41271 if (op0c->target_data_1 + op0c->target_data_2 == 0
41272 && !op0c->cannot_rename)
41273 cands.safe_push (op0c);
41275 op0c->target_data_1++;
41277 if (info->op_info[opno1].n_chains != 0)
41279 gcc_assert (info->op_info[opno1].n_chains == 1);
41280 du_head_p op1c;
41281 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41282 if (op1c->target_data_1 + op1c->target_data_2 == 0
41283 && !op1c->cannot_rename)
41284 cands.safe_push (op1c);
41286 op1c->target_data_2++;
41290 int i;
41291 du_head_p head;
41292 FOR_EACH_VEC_ELT (cands, i, head)
41294 int old_reg, best_reg;
41295 HARD_REG_SET unavailable;
41297 CLEAR_HARD_REG_SET (unavailable);
41298 if (head->target_data_1)
41299 IOR_HARD_REG_SET (unavailable, output_risky);
41300 if (head->target_data_2)
41301 IOR_HARD_REG_SET (unavailable, input_risky);
41303 int n_uses;
41304 reg_class superclass = regrename_find_superclass (head, &n_uses,
41305 &unavailable);
41306 old_reg = head->regno;
41307 best_reg = find_rename_reg (head, superclass, &unavailable,
41308 old_reg, false);
41309 bool ok = regrename_do_replace (head, best_reg);
41310 gcc_assert (ok);
41311 if (dump_file)
41312 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41313 reg_names[best_reg], reg_class_names[superclass]);
41317 regrename_finish ();
41319 df_analyze ();
41321 basic_block bb;
41322 regset_head live;
41324 INIT_REG_SET (&live);
41326 FOR_EACH_BB_FN (bb, cfun)
41328 rtx_insn *insn;
41330 COPY_REG_SET (&live, DF_LR_OUT (bb));
41331 df_simulate_initialize_backwards (bb, &live);
41333 FOR_BB_INSNS_REVERSE (bb, insn)
41335 if (!NONDEBUG_INSN_P (insn))
41336 continue;
41338 df_simulate_one_insn_backwards (bb, insn, &live);
41340 if (GET_CODE (PATTERN (insn)) == USE
41341 || GET_CODE (PATTERN (insn)) == CLOBBER)
41342 continue;
41344 extract_insn (insn);
41345 constrain_operands_cached (insn, reload_completed);
41346 int opno0, opno1;
41347 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41348 recog_data.n_operands, &opno0,
41349 &opno1);
41350 if (modrm < 0
41351 || !ix86_rop_should_change_byte_p (modrm)
41352 || opno0 == opno1)
41353 continue;
41355 rtx oldreg = recog_data.operand[opno1];
41356 preprocess_constraints (insn);
41357 const operand_alternative *alt = which_op_alt ();
41359 int i;
41360 for (i = 0; i < recog_data.n_operands; i++)
41361 if (i != opno1
41362 && alt[i].earlyclobber
41363 && reg_overlap_mentioned_p (recog_data.operand[i],
41364 oldreg))
41365 break;
41367 if (i < recog_data.n_operands)
41368 continue;
41370 if (dump_file)
41371 fprintf (dump_file,
41372 "attempting to fix modrm byte in insn %d:"
41373 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41374 reg_class_names[alt[opno1].cl]);
41376 HARD_REG_SET unavailable;
41377 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41378 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41379 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41380 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41381 IOR_HARD_REG_SET (unavailable, output_risky);
41382 IOR_COMPL_HARD_REG_SET (unavailable,
41383 reg_class_contents[alt[opno1].cl]);
41385 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41386 if (!TEST_HARD_REG_BIT (unavailable, i))
41387 break;
41388 if (i == FIRST_PSEUDO_REGISTER)
41390 if (dump_file)
41391 fprintf (dump_file, ", none available\n");
41392 continue;
41394 if (dump_file)
41395 fprintf (dump_file, " -> %d\n", i);
41396 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41397 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41398 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41403 /* Implement machine specific optimizations. We implement padding of returns
41404 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
41405 static void
41406 ix86_reorg (void)
41408 /* We are freeing block_for_insn in the toplev to keep compatibility
41409 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41410 compute_bb_for_insn ();
41412 if (flag_mitigate_rop)
41413 ix86_mitigate_rop ();
41415 if (TARGET_SEH && current_function_has_exception_handlers ())
41416 ix86_seh_fixup_eh_fallthru ();
41418 if (optimize && optimize_function_for_speed_p (cfun))
41420 if (TARGET_PAD_SHORT_FUNCTION)
41421 ix86_pad_short_function ();
41422 else if (TARGET_PAD_RETURNS)
41423 ix86_pad_returns ();
41424 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41425 if (TARGET_FOUR_JUMP_LIMIT)
41426 ix86_avoid_jump_mispredicts ();
41427 #endif
41431 /* Return nonzero when QImode register that must be represented via REX prefix
41432 is used. */
41433 bool
41434 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41436 int i;
41437 extract_insn_cached (insn);
41438 for (i = 0; i < recog_data.n_operands; i++)
41439 if (GENERAL_REG_P (recog_data.operand[i])
41440 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41441 return true;
41442 return false;
41445 /* Return true when INSN mentions register that must be encoded using REX
41446 prefix. */
41447 bool
41448 x86_extended_reg_mentioned_p (rtx insn)
41450 subrtx_iterator::array_type array;
41451 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41453 const_rtx x = *iter;
41454 if (REG_P (x)
41455 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41456 return true;
41458 return false;
41461 /* If profitable, negate (without causing overflow) integer constant
41462 of mode MODE at location LOC. Return true in this case. */
41463 bool
41464 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41466 HOST_WIDE_INT val;
41468 if (!CONST_INT_P (*loc))
41469 return false;
41471 switch (mode)
41473 case E_DImode:
41474 /* DImode x86_64 constants must fit in 32 bits. */
41475 gcc_assert (x86_64_immediate_operand (*loc, mode));
41477 mode = SImode;
41478 break;
41480 case E_SImode:
41481 case E_HImode:
41482 case E_QImode:
41483 break;
41485 default:
41486 gcc_unreachable ();
41489 /* Avoid overflows. */
41490 if (mode_signbit_p (mode, *loc))
41491 return false;
41493 val = INTVAL (*loc);
41495 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
41496 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
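     /* Size illustration (assuming the usual encodings): "addl $-128,%ebx"
        fits in a sign-extended 8-bit immediate (3 bytes), while the negated
        "subl $128,%ebx" would need a 32-bit immediate (6 bytes); the reverse
        holds for +128, hence the special cases below.  */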
41497 if ((val < 0 && val != -128)
41498 || val == 128)
41500 *loc = GEN_INT (-val);
41501 return true;
41504 return false;
41507 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41508 optabs would emit if we didn't have TFmode patterns. */
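/* In C-like pseudo-code, this expands roughly to (a sketch):

     if ((signed) in >= 0)
       out = (FP) in;
     else
       {
         i0 = (in >> 1) | (in & 1);    (halve, rounding to odd)
         out = (FP) i0;
         out = out + out;
       }
*/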
41510 void
41511 x86_emit_floatuns (rtx operands[2])
41513 rtx_code_label *neglab, *donelab;
41514 rtx i0, i1, f0, in, out;
41515 machine_mode mode, inmode;
41517 inmode = GET_MODE (operands[1]);
41518 gcc_assert (inmode == SImode || inmode == DImode);
41520 out = operands[0];
41521 in = force_reg (inmode, operands[1]);
41522 mode = GET_MODE (out);
41523 neglab = gen_label_rtx ();
41524 donelab = gen_label_rtx ();
41525 f0 = gen_reg_rtx (mode);
41527 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41529 expand_float (out, in, 0);
41531 emit_jump_insn (gen_jump (donelab));
41532 emit_barrier ();
41534 emit_label (neglab);
41536 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41537 1, OPTAB_DIRECT);
41538 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41539 1, OPTAB_DIRECT);
41540 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41542 expand_float (f0, i0, 0);
41544 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41546 emit_label (donelab);
41549 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41550 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41551 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41552 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41554 /* Get a vector mode of the same size as the original but with elements
41555 twice as wide. This is only guaranteed to apply to integral vectors. */
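/* For example, assuming the usual genmodes.c ordering, V16QImode yields
   V8HImode and V8HImode yields V4SImode.  */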
41557 static inline machine_mode
41558 get_mode_wider_vector (machine_mode o)
41560 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41561 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
41562 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41563 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41564 return n;
41567 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41568 fill target with val via vec_duplicate. */
41570 static bool
41571 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41573 bool ok;
41574 rtx_insn *insn;
41575 rtx dup;
41577 /* First attempt to recognize VAL as-is. */
41578 dup = gen_vec_duplicate (mode, val);
41579 insn = emit_insn (gen_rtx_SET (target, dup));
41580 if (recog_memoized (insn) < 0)
41582 rtx_insn *seq;
41583 machine_mode innermode = GET_MODE_INNER (mode);
41584 rtx reg;
41586 /* If that fails, force VAL into a register. */
41588 start_sequence ();
41589 reg = force_reg (innermode, val);
41590 if (GET_MODE (reg) != innermode)
41591 reg = gen_lowpart (innermode, reg);
41592 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
41593 seq = get_insns ();
41594 end_sequence ();
41595 if (seq)
41596 emit_insn_before (seq, insn);
41598 ok = recog_memoized (insn) >= 0;
41599 gcc_assert (ok);
41601 return true;
41604 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41605 with all elements equal to VAR. Return true if successful. */
41607 static bool
41608 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41609 rtx target, rtx val)
41611 bool ok;
41613 switch (mode)
41615 case E_V2SImode:
41616 case E_V2SFmode:
41617 if (!mmx_ok)
41618 return false;
41619 /* FALLTHRU */
41621 case E_V4DFmode:
41622 case E_V4DImode:
41623 case E_V8SFmode:
41624 case E_V8SImode:
41625 case E_V2DFmode:
41626 case E_V2DImode:
41627 case E_V4SFmode:
41628 case E_V4SImode:
41629 case E_V16SImode:
41630 case E_V8DImode:
41631 case E_V16SFmode:
41632 case E_V8DFmode:
41633 return ix86_vector_duplicate_value (mode, target, val);
41635 case E_V4HImode:
41636 if (!mmx_ok)
41637 return false;
41638 if (TARGET_SSE || TARGET_3DNOW_A)
41640 rtx x;
41642 val = gen_lowpart (SImode, val);
41643 x = gen_rtx_TRUNCATE (HImode, val);
41644 x = gen_rtx_VEC_DUPLICATE (mode, x);
41645 emit_insn (gen_rtx_SET (target, x));
41646 return true;
41648 goto widen;
41650 case E_V8QImode:
41651 if (!mmx_ok)
41652 return false;
41653 goto widen;
41655 case E_V8HImode:
41656 if (TARGET_AVX2)
41657 return ix86_vector_duplicate_value (mode, target, val);
41659 if (TARGET_SSE2)
41661 struct expand_vec_perm_d dperm;
41662 rtx tmp1, tmp2;
41664 permute:
41665 memset (&dperm, 0, sizeof (dperm));
41666 dperm.target = target;
41667 dperm.vmode = mode;
41668 dperm.nelt = GET_MODE_NUNITS (mode);
41669 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41670 dperm.one_operand_p = true;
41672 /* Extend to SImode using a paradoxical SUBREG. */
41673 tmp1 = gen_reg_rtx (SImode);
41674 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41676 /* Insert the SImode value as low element of a V4SImode vector. */
41677 tmp2 = gen_reg_rtx (V4SImode);
41678 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41679 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41681 ok = (expand_vec_perm_1 (&dperm)
41682 || expand_vec_perm_broadcast_1 (&dperm));
41683 gcc_assert (ok);
41684 return ok;
41686 goto widen;
41688 case E_V16QImode:
41689 if (TARGET_AVX2)
41690 return ix86_vector_duplicate_value (mode, target, val);
41692 if (TARGET_SSE2)
41693 goto permute;
41694 goto widen;
41696 widen:
41697 /* Replicate the value once into the next wider mode and recurse. */
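      /* E.g. to broadcast a QImode value V into V8QImode, form the HImode
         value (V << 8) | V, broadcast that into V4HImode, and reinterpret
         the result in the original mode (a sketch of the code below).  */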
41699 machine_mode smode, wsmode, wvmode;
41700 rtx x;
41702 smode = GET_MODE_INNER (mode);
41703 wvmode = get_mode_wider_vector (mode);
41704 wsmode = GET_MODE_INNER (wvmode);
41706 val = convert_modes (wsmode, smode, val, true);
41707 x = expand_simple_binop (wsmode, ASHIFT, val,
41708 GEN_INT (GET_MODE_BITSIZE (smode)),
41709 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41710 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41712 x = gen_reg_rtx (wvmode);
41713 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41714 gcc_assert (ok);
41715 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41716 return ok;
41719 case E_V16HImode:
41720 case E_V32QImode:
41721 if (TARGET_AVX2)
41722 return ix86_vector_duplicate_value (mode, target, val);
41723 else
41725 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41726 rtx x = gen_reg_rtx (hvmode);
41728 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41729 gcc_assert (ok);
41731 x = gen_rtx_VEC_CONCAT (mode, x, x);
41732 emit_insn (gen_rtx_SET (target, x));
41734 return true;
41736 case E_V64QImode:
41737 case E_V32HImode:
41738 if (TARGET_AVX512BW)
41739 return ix86_vector_duplicate_value (mode, target, val);
41740 else
41742 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41743 rtx x = gen_reg_rtx (hvmode);
41745 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41746 gcc_assert (ok);
41748 x = gen_rtx_VEC_CONCAT (mode, x, x);
41749 emit_insn (gen_rtx_SET (target, x));
41751 return true;
41753 default:
41754 return false;
41758 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41759 whose ONE_VAR element is VAR, and other elements are zero. Return true
41760 if successful. */
41762 static bool
41763 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41764 rtx target, rtx var, int one_var)
41766 machine_mode vsimode;
41767 rtx new_target;
41768 rtx x, tmp;
41769 bool use_vector_set = false;
41770 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
41772 switch (mode)
41774 case E_V2DImode:
41775 /* For SSE4.1, we normally use vector set. But if the second
41776 element is zero and inter-unit moves are OK, we use movq
41777 instead. */
41778 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41779 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41780 && one_var == 0));
41781 break;
41782 case E_V16QImode:
41783 case E_V4SImode:
41784 case E_V4SFmode:
41785 use_vector_set = TARGET_SSE4_1;
41786 break;
41787 case E_V8HImode:
41788 use_vector_set = TARGET_SSE2;
41789 break;
41790 case E_V4HImode:
41791 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41792 break;
41793 case E_V32QImode:
41794 case E_V16HImode:
41795 use_vector_set = TARGET_AVX;
41796 break;
41797 case E_V8SImode:
41798 use_vector_set = TARGET_AVX;
41799 gen_vec_set_0 = gen_vec_setv8si_0;
41800 break;
41801 case E_V8SFmode:
41802 use_vector_set = TARGET_AVX;
41803 gen_vec_set_0 = gen_vec_setv8sf_0;
41804 break;
41805 case E_V4DFmode:
41806 use_vector_set = TARGET_AVX;
41807 gen_vec_set_0 = gen_vec_setv4df_0;
41808 break;
41809 case E_V4DImode:
41810 /* Use ix86_expand_vector_set in 64bit mode only. */
41811 use_vector_set = TARGET_AVX && TARGET_64BIT;
41812 gen_vec_set_0 = gen_vec_setv4di_0;
41813 break;
41814 case E_V16SImode:
41815 use_vector_set = TARGET_AVX512F && one_var == 0;
41816 gen_vec_set_0 = gen_vec_setv16si_0;
41817 break;
41818 case E_V16SFmode:
41819 use_vector_set = TARGET_AVX512F && one_var == 0;
41820 gen_vec_set_0 = gen_vec_setv16sf_0;
41821 break;
41822 case E_V8DFmode:
41823 use_vector_set = TARGET_AVX512F && one_var == 0;
41824 gen_vec_set_0 = gen_vec_setv8df_0;
41825 break;
41826 case E_V8DImode:
41827 /* Use ix86_expand_vector_set in 64bit mode only. */
41828 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
41829 gen_vec_set_0 = gen_vec_setv8di_0;
41830 break;
41831 default:
41832 break;
41835 if (use_vector_set)
41837 if (gen_vec_set_0 && one_var == 0)
41839 var = force_reg (GET_MODE_INNER (mode), var);
41840 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
41841 return true;
41843 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41844 var = force_reg (GET_MODE_INNER (mode), var);
41845 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41846 return true;
41849 switch (mode)
41851 case E_V2SFmode:
41852 case E_V2SImode:
41853 if (!mmx_ok)
41854 return false;
41855 /* FALLTHRU */
41857 case E_V2DFmode:
41858 case E_V2DImode:
41859 if (one_var != 0)
41860 return false;
41861 var = force_reg (GET_MODE_INNER (mode), var);
41862 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41863 emit_insn (gen_rtx_SET (target, x));
41864 return true;
41866 case E_V4SFmode:
41867 case E_V4SImode:
41868 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41869 new_target = gen_reg_rtx (mode);
41870 else
41871 new_target = target;
41872 var = force_reg (GET_MODE_INNER (mode), var);
41873 x = gen_rtx_VEC_DUPLICATE (mode, var);
41874 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41875 emit_insn (gen_rtx_SET (new_target, x));
41876 if (one_var != 0)
41878 /* We need to shuffle the value to the correct position, so
41879 create a new pseudo to store the intermediate result. */
41881 /* With SSE2, we can use the integer shuffle insns. */
41882 if (mode != V4SFmode && TARGET_SSE2)
41884 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41885 const1_rtx,
41886 GEN_INT (one_var == 1 ? 0 : 1),
41887 GEN_INT (one_var == 2 ? 0 : 1),
41888 GEN_INT (one_var == 3 ? 0 : 1)));
41889 if (target != new_target)
41890 emit_move_insn (target, new_target);
41891 return true;
41894 /* Otherwise convert the intermediate result to V4SFmode and
41895 use the SSE1 shuffle instructions. */
41896 if (mode != V4SFmode)
41898 tmp = gen_reg_rtx (V4SFmode);
41899 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41901 else
41902 tmp = new_target;
41904 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41905 const1_rtx,
41906 GEN_INT (one_var == 1 ? 0 : 1),
41907 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41908 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41910 if (mode != V4SFmode)
41911 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41912 else if (tmp != target)
41913 emit_move_insn (target, tmp);
41915 else if (target != new_target)
41916 emit_move_insn (target, new_target);
41917 return true;
41919 case E_V8HImode:
41920 case E_V16QImode:
41921 vsimode = V4SImode;
41922 goto widen;
41923 case E_V4HImode:
41924 case E_V8QImode:
41925 if (!mmx_ok)
41926 return false;
41927 vsimode = V2SImode;
41928 goto widen;
41929 widen:
41930 if (one_var != 0)
41931 return false;
41933 /* Zero extend the variable element to SImode and recurse. */
41934 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41936 x = gen_reg_rtx (vsimode);
41937 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41938 var, one_var))
41939 gcc_unreachable ();
41941 emit_move_insn (target, gen_lowpart (mode, x));
41942 return true;
41944 default:
41945 return false;
41949 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41950 consisting of the values in VALS. It is known that all elements
41951 except ONE_VAR are constants. Return true if successful. */
41953 static bool
41954 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
41955 rtx target, rtx vals, int one_var)
41957 rtx var = XVECEXP (vals, 0, one_var);
41958 machine_mode wmode;
41959 rtx const_vec, x;
41961 const_vec = copy_rtx (vals);
41962 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
41963 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
41965 switch (mode)
41967 case E_V2DFmode:
41968 case E_V2DImode:
41969 case E_V2SFmode:
41970 case E_V2SImode:
41971 /* For the two element vectors, it's just as easy to use
41972 the general case. */
41973 return false;
41975 case E_V4DImode:
41976 /* Use ix86_expand_vector_set in 64bit mode only. */
41977 if (!TARGET_64BIT)
41978 return false;
41979 /* FALLTHRU */
41980 case E_V4DFmode:
41981 case E_V8SFmode:
41982 case E_V8SImode:
41983 case E_V16HImode:
41984 case E_V32QImode:
41985 case E_V4SFmode:
41986 case E_V4SImode:
41987 case E_V8HImode:
41988 case E_V4HImode:
41989 break;
41991 case E_V16QImode:
41992 if (TARGET_SSE4_1)
41993 break;
41994 wmode = V8HImode;
41995 goto widen;
41996 case E_V8QImode:
41997 wmode = V4HImode;
41998 goto widen;
41999 widen:
42000 /* There's no way to set one QImode entry easily. Combine
42001 the variable value with its adjacent constant value, and
42002 promote to an HImode set. */
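      /* For instance (illustrative), to place the variable in element 5 of
         a V16QImode vector we merge it with the constant from element 4
         into one HImode value and set element 2 of the corresponding
         V8HImode vector.  */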
42003 x = XVECEXP (vals, 0, one_var ^ 1);
42004 if (one_var & 1)
42006 var = convert_modes (HImode, QImode, var, true);
42007 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
42008 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42009 x = GEN_INT (INTVAL (x) & 0xff);
42011 else
42013 var = convert_modes (HImode, QImode, var, true);
42014 x = gen_int_mode (INTVAL (x) << 8, HImode);
42016 if (x != const0_rtx)
42017 var = expand_simple_binop (HImode, IOR, var, x, var,
42018 1, OPTAB_LIB_WIDEN);
42020 x = gen_reg_rtx (wmode);
42021 emit_move_insn (x, gen_lowpart (wmode, const_vec));
42022 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
42024 emit_move_insn (target, gen_lowpart (mode, x));
42025 return true;
42027 default:
42028 return false;
42031 emit_move_insn (target, const_vec);
42032 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42033 return true;
42036 /* A subroutine of ix86_expand_vector_init_general. Use vector
42037 concatenate to handle the most general case: all values variable,
42038 and none identical. */
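/* An illustrative walk-through: for V8SFmode and n == 8 the eight scalars
   are combined pairwise into four V2SFmode registers, those into two
   V4SFmode registers, and a final recursive call concatenates the two
   halves into the V8SFmode target.  */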
42040 static void
42041 ix86_expand_vector_init_concat (machine_mode mode,
42042 rtx target, rtx *ops, int n)
42044 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
42045 rtx first[16], second[8], third[4];
42046 rtvec v;
42047 int i, j;
42049 switch (n)
42051 case 2:
42052 switch (mode)
42054 case E_V16SImode:
42055 cmode = V8SImode;
42056 break;
42057 case E_V16SFmode:
42058 cmode = V8SFmode;
42059 break;
42060 case E_V8DImode:
42061 cmode = V4DImode;
42062 break;
42063 case E_V8DFmode:
42064 cmode = V4DFmode;
42065 break;
42066 case E_V8SImode:
42067 cmode = V4SImode;
42068 break;
42069 case E_V8SFmode:
42070 cmode = V4SFmode;
42071 break;
42072 case E_V4DImode:
42073 cmode = V2DImode;
42074 break;
42075 case E_V4DFmode:
42076 cmode = V2DFmode;
42077 break;
42078 case E_V4SImode:
42079 cmode = V2SImode;
42080 break;
42081 case E_V4SFmode:
42082 cmode = V2SFmode;
42083 break;
42084 case E_V2DImode:
42085 cmode = DImode;
42086 break;
42087 case E_V2SImode:
42088 cmode = SImode;
42089 break;
42090 case E_V2DFmode:
42091 cmode = DFmode;
42092 break;
42093 case E_V2SFmode:
42094 cmode = SFmode;
42095 break;
42096 default:
42097 gcc_unreachable ();
42100 if (!register_operand (ops[1], cmode))
42101 ops[1] = force_reg (cmode, ops[1]);
42102 if (!register_operand (ops[0], cmode))
42103 ops[0] = force_reg (cmode, ops[0]);
42104 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42105 ops[1])));
42106 break;
42108 case 4:
42109 switch (mode)
42111 case E_V4DImode:
42112 cmode = V2DImode;
42113 break;
42114 case E_V4DFmode:
42115 cmode = V2DFmode;
42116 break;
42117 case E_V4SImode:
42118 cmode = V2SImode;
42119 break;
42120 case E_V4SFmode:
42121 cmode = V2SFmode;
42122 break;
42123 default:
42124 gcc_unreachable ();
42126 goto half;
42128 case 8:
42129 switch (mode)
42131 case E_V8DImode:
42132 cmode = V2DImode;
42133 hmode = V4DImode;
42134 break;
42135 case E_V8DFmode:
42136 cmode = V2DFmode;
42137 hmode = V4DFmode;
42138 break;
42139 case E_V8SImode:
42140 cmode = V2SImode;
42141 hmode = V4SImode;
42142 break;
42143 case E_V8SFmode:
42144 cmode = V2SFmode;
42145 hmode = V4SFmode;
42146 break;
42147 default:
42148 gcc_unreachable ();
42150 goto half;
42152 case 16:
42153 switch (mode)
42155 case E_V16SImode:
42156 cmode = V2SImode;
42157 hmode = V4SImode;
42158 gmode = V8SImode;
42159 break;
42160 case E_V16SFmode:
42161 cmode = V2SFmode;
42162 hmode = V4SFmode;
42163 gmode = V8SFmode;
42164 break;
42165 default:
42166 gcc_unreachable ();
42168 goto half;
42170 half:
42171 /* FIXME: We process inputs backward to help RA. PR 36222. */
42172 i = n - 1;
42173 j = (n >> 1) - 1;
42174 for (; i > 0; i -= 2, j--)
42176 first[j] = gen_reg_rtx (cmode);
42177 v = gen_rtvec (2, ops[i - 1], ops[i]);
42178 ix86_expand_vector_init (false, first[j],
42179 gen_rtx_PARALLEL (cmode, v));
42182 n >>= 1;
42183 if (n > 4)
42185 gcc_assert (hmode != VOIDmode);
42186 gcc_assert (gmode != VOIDmode);
42187 for (i = j = 0; i < n; i += 2, j++)
42189 second[j] = gen_reg_rtx (hmode);
42190 ix86_expand_vector_init_concat (hmode, second [j],
42191 &first [i], 2);
42193 n >>= 1;
42194 for (i = j = 0; i < n; i += 2, j++)
42196 third[j] = gen_reg_rtx (gmode);
42197 ix86_expand_vector_init_concat (gmode, third[j],
42198 &second[i], 2);
42200 n >>= 1;
42201 ix86_expand_vector_init_concat (mode, target, third, n);
42203 else if (n > 2)
42205 gcc_assert (hmode != VOIDmode);
42206 for (i = j = 0; i < n; i += 2, j++)
42208 second[j] = gen_reg_rtx (hmode);
42209 ix86_expand_vector_init_concat (hmode, second [j],
42210 &first [i], 2);
42212 n >>= 1;
42213 ix86_expand_vector_init_concat (mode, target, second, n);
42215 else
42216 ix86_expand_vector_init_concat (mode, target, first, n);
42217 break;
42219 default:
42220 gcc_unreachable ();
42224 /* A subroutine of ix86_expand_vector_init_general. Use vector
42225 interleave to handle the most general case: all values variable,
42226 and none identical. */
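/* Roughly: for V8HImode, pairs of scalars are loaded into the low two
   HImode lanes of fresh vectors, those vectors are interleaved as
   V4SImode values, and the results are interleaved once more as V2DImode
   values to form the final eight-element vector; V16QImode needs one
   additional interleaving level.  */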
42228 static void
42229 ix86_expand_vector_init_interleave (machine_mode mode,
42230 rtx target, rtx *ops, int n)
42232 machine_mode first_imode, second_imode, third_imode, inner_mode;
42233 int i, j;
42234 rtx op0, op1;
42235 rtx (*gen_load_even) (rtx, rtx, rtx);
42236 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42237 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42239 switch (mode)
42241 case E_V8HImode:
42242 gen_load_even = gen_vec_setv8hi;
42243 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42244 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42245 inner_mode = HImode;
42246 first_imode = V4SImode;
42247 second_imode = V2DImode;
42248 third_imode = VOIDmode;
42249 break;
42250 case E_V16QImode:
42251 gen_load_even = gen_vec_setv16qi;
42252 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42253 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42254 inner_mode = QImode;
42255 first_imode = V8HImode;
42256 second_imode = V4SImode;
42257 third_imode = V2DImode;
42258 break;
42259 default:
42260 gcc_unreachable ();
42263 for (i = 0; i < n; i++)
42265 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42266 op0 = gen_reg_rtx (SImode);
42267 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42269 /* Insert the SImode value as low element of V4SImode vector. */
42270 op1 = gen_reg_rtx (V4SImode);
42271 op0 = gen_rtx_VEC_MERGE (V4SImode,
42272 gen_rtx_VEC_DUPLICATE (V4SImode,
42273 op0),
42274 CONST0_RTX (V4SImode),
42275 const1_rtx);
42276 emit_insn (gen_rtx_SET (op1, op0));
42278 /* Cast the V4SImode vector back to a vector in original mode. */
42279 op0 = gen_reg_rtx (mode);
42280 emit_move_insn (op0, gen_lowpart (mode, op1));
42282 /* Load even elements into the second position. */
42283 emit_insn (gen_load_even (op0,
42284 force_reg (inner_mode,
42285 ops [i + i + 1]),
42286 const1_rtx));
42288 /* Cast vector to FIRST_IMODE vector. */
42289 ops[i] = gen_reg_rtx (first_imode);
42290 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42293 /* Interleave low FIRST_IMODE vectors. */
42294 for (i = j = 0; i < n; i += 2, j++)
42296 op0 = gen_reg_rtx (first_imode);
42297 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42299 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42300 ops[j] = gen_reg_rtx (second_imode);
42301 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42304 /* Interleave low SECOND_IMODE vectors. */
42305 switch (second_imode)
42307 case E_V4SImode:
42308 for (i = j = 0; i < n / 2; i += 2, j++)
42310 op0 = gen_reg_rtx (second_imode);
42311 emit_insn (gen_interleave_second_low (op0, ops[i],
42312 ops[i + 1]));
42314 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42315 vector. */
42316 ops[j] = gen_reg_rtx (third_imode);
42317 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42319 second_imode = V2DImode;
42320 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42321 /* FALLTHRU */
42323 case E_V2DImode:
42324 op0 = gen_reg_rtx (second_imode);
42325 emit_insn (gen_interleave_second_low (op0, ops[0],
42326 ops[1]));
42328 /* Cast the SECOND_IMODE vector back to a vector in the original
42329 mode. */
42330 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42331 break;
42333 default:
42334 gcc_unreachable ();
42338 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42339 all values variable, and none identical. */
42341 static void
42342 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42343 rtx target, rtx vals)
42345 rtx ops[64], op0, op1, op2, op3, op4, op5;
42346 machine_mode half_mode = VOIDmode;
42347 machine_mode quarter_mode = VOIDmode;
42348 int n, i;
42350 switch (mode)
42352 case E_V2SFmode:
42353 case E_V2SImode:
42354 if (!mmx_ok && !TARGET_SSE)
42355 break;
42356 /* FALLTHRU */
42358 case E_V16SImode:
42359 case E_V16SFmode:
42360 case E_V8DFmode:
42361 case E_V8DImode:
42362 case E_V8SFmode:
42363 case E_V8SImode:
42364 case E_V4DFmode:
42365 case E_V4DImode:
42366 case E_V4SFmode:
42367 case E_V4SImode:
42368 case E_V2DFmode:
42369 case E_V2DImode:
42370 n = GET_MODE_NUNITS (mode);
42371 for (i = 0; i < n; i++)
42372 ops[i] = XVECEXP (vals, 0, i);
42373 ix86_expand_vector_init_concat (mode, target, ops, n);
42374 return;
42376 case E_V2TImode:
42377 for (i = 0; i < 2; i++)
42378 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42379 op0 = gen_reg_rtx (V4DImode);
42380 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
42381 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42382 return;
42384 case E_V4TImode:
42385 for (i = 0; i < 4; i++)
42386 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42387 ops[4] = gen_reg_rtx (V4DImode);
42388 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
42389 ops[5] = gen_reg_rtx (V4DImode);
42390 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
42391 op0 = gen_reg_rtx (V8DImode);
42392 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
42393 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42394 return;
42396 case E_V32QImode:
42397 half_mode = V16QImode;
42398 goto half;
42400 case E_V16HImode:
42401 half_mode = V8HImode;
42402 goto half;
42404 half:
42405 n = GET_MODE_NUNITS (mode);
42406 for (i = 0; i < n; i++)
42407 ops[i] = XVECEXP (vals, 0, i);
42408 op0 = gen_reg_rtx (half_mode);
42409 op1 = gen_reg_rtx (half_mode);
42410 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42411 n >> 2);
42412 ix86_expand_vector_init_interleave (half_mode, op1,
42413 &ops [n >> 1], n >> 2);
42414 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42415 return;
42417 case E_V64QImode:
42418 quarter_mode = V16QImode;
42419 half_mode = V32QImode;
42420 goto quarter;
42422 case E_V32HImode:
42423 quarter_mode = V8HImode;
42424 half_mode = V16HImode;
42425 goto quarter;
42427 quarter:
42428 n = GET_MODE_NUNITS (mode);
42429 for (i = 0; i < n; i++)
42430 ops[i] = XVECEXP (vals, 0, i);
42431 op0 = gen_reg_rtx (quarter_mode);
42432 op1 = gen_reg_rtx (quarter_mode);
42433 op2 = gen_reg_rtx (quarter_mode);
42434 op3 = gen_reg_rtx (quarter_mode);
42435 op4 = gen_reg_rtx (half_mode);
42436 op5 = gen_reg_rtx (half_mode);
42437 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42438 n >> 3);
42439 ix86_expand_vector_init_interleave (quarter_mode, op1,
42440 &ops [n >> 2], n >> 3);
42441 ix86_expand_vector_init_interleave (quarter_mode, op2,
42442 &ops [n >> 1], n >> 3);
42443 ix86_expand_vector_init_interleave (quarter_mode, op3,
42444 &ops [(n >> 1) | (n >> 2)], n >> 3);
42445 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42446 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42447 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42448 return;
42450 case E_V16QImode:
42451 if (!TARGET_SSE4_1)
42452 break;
42453 /* FALLTHRU */
42455 case E_V8HImode:
42456 if (!TARGET_SSE2)
42457 break;
42459 /* Don't use ix86_expand_vector_init_interleave if we can't
42460 move from GPR to SSE register directly. */
42461 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42462 break;
42464 n = GET_MODE_NUNITS (mode);
42465 for (i = 0; i < n; i++)
42466 ops[i] = XVECEXP (vals, 0, i);
42467 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42468 return;
42470 case E_V4HImode:
42471 case E_V8QImode:
42472 break;
42474 default:
42475 gcc_unreachable ();
42479 int i, j, n_elts, n_words, n_elt_per_word;
42480 machine_mode inner_mode;
42481 rtx words[4], shift;
42483 inner_mode = GET_MODE_INNER (mode);
42484 n_elts = GET_MODE_NUNITS (mode);
42485 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42486 n_elt_per_word = n_elts / n_words;
42487 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42489 for (i = 0; i < n_words; ++i)
42491 rtx word = NULL_RTX;
42493 for (j = 0; j < n_elt_per_word; ++j)
42495 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42496 elt = convert_modes (word_mode, inner_mode, elt, true);
42498 if (j == 0)
42499 word = elt;
42500 else
42502 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42503 word, 1, OPTAB_LIB_WIDEN);
42504 word = expand_simple_binop (word_mode, IOR, word, elt,
42505 word, 1, OPTAB_LIB_WIDEN);
42509 words[i] = word;
42512 if (n_words == 1)
42513 emit_move_insn (target, gen_lowpart (mode, words[0]));
42514 else if (n_words == 2)
42516 rtx tmp = gen_reg_rtx (mode);
42517 emit_clobber (tmp);
42518 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42519 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42520 emit_move_insn (target, tmp);
42522 else if (n_words == 4)
42524 rtx tmp = gen_reg_rtx (V4SImode);
42525 gcc_assert (word_mode == SImode);
42526 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42527 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42528 emit_move_insn (target, gen_lowpart (mode, tmp));
42530 else
42531 gcc_unreachable ();
42535 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42536 instructions unless MMX_OK is true. */
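/* Dispatch summary for the code below: all-constant vectors are loaded
   as a CONST_VECTOR (typically from the constant pool), all-identical
   values are broadcast, a single variable element goes through the
   one_nonzero/one_var helpers above, and everything else falls through
   to ix86_expand_vector_init_general.  */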
42538 void
42539 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42541 machine_mode mode = GET_MODE (target);
42542 machine_mode inner_mode = GET_MODE_INNER (mode);
42543 int n_elts = GET_MODE_NUNITS (mode);
42544 int n_var = 0, one_var = -1;
42545 bool all_same = true, all_const_zero = true;
42546 int i;
42547 rtx x;
42549 /* First, handle initialization from vector elts. */
42550 if (n_elts != XVECLEN (vals, 0))
42552 rtx subtarget = target;
42553 x = XVECEXP (vals, 0, 0);
42554 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
42555 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
42557 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
42558 if (inner_mode == QImode || inner_mode == HImode)
42560 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
42561 mode = mode_for_vector (SImode, n_bits / 4).require ();
42562 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
42563 ops[0] = gen_lowpart (inner_mode, ops[0]);
42564 ops[1] = gen_lowpart (inner_mode, ops[1]);
42565 subtarget = gen_reg_rtx (mode);
42567 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
42568 if (subtarget != target)
42569 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
42570 return;
42572 gcc_unreachable ();
42575 for (i = 0; i < n_elts; ++i)
42577 x = XVECEXP (vals, 0, i);
42578 if (!(CONST_SCALAR_INT_P (x)
42579 || CONST_DOUBLE_P (x)
42580 || CONST_FIXED_P (x)))
42581 n_var++, one_var = i;
42582 else if (x != CONST0_RTX (inner_mode))
42583 all_const_zero = false;
42584 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42585 all_same = false;
42588 /* Constants are best loaded from the constant pool. */
42589 if (n_var == 0)
42591 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42592 return;
42595 /* If all values are identical, broadcast the value. */
42596 if (all_same
42597 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42598 XVECEXP (vals, 0, 0)))
42599 return;
42601 /* Values where only one field is non-constant are best loaded from
42602 the pool and overwritten via move later. */
42603 if (n_var == 1)
42605 if (all_const_zero
42606 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42607 XVECEXP (vals, 0, one_var),
42608 one_var))
42609 return;
42611 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42612 return;
42615 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
42618 void
42619 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42621 machine_mode mode = GET_MODE (target);
42622 machine_mode inner_mode = GET_MODE_INNER (mode);
42623 machine_mode half_mode;
42624 bool use_vec_merge = false;
42625 rtx tmp;
42626 static rtx (*gen_extract[6][2]) (rtx, rtx)
42628 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42629 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42630 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42631 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42632 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42633 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42635 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42637 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42638 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42639 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42640 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42641 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42642 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42644 int i, j, n;
42645 machine_mode mmode = VOIDmode;
42646 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42648 switch (mode)
42650 case E_V2SFmode:
42651 case E_V2SImode:
42652 if (mmx_ok)
42654 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42655 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42656 if (elt == 0)
42657 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42658 else
42659 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42660 emit_insn (gen_rtx_SET (target, tmp));
42661 return;
42663 break;
42665 case E_V2DImode:
42666 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42667 if (use_vec_merge)
42668 break;
42670 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42671 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42672 if (elt == 0)
42673 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42674 else
42675 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42676 emit_insn (gen_rtx_SET (target, tmp));
42677 return;
42679 case E_V2DFmode:
42681 rtx op0, op1;
42683 /* For the two element vectors, we implement a VEC_CONCAT with
42684 the extraction of the other element. */
42686 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42687 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42689 if (elt == 0)
42690 op0 = val, op1 = tmp;
42691 else
42692 op0 = tmp, op1 = val;
42694 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42695 emit_insn (gen_rtx_SET (target, tmp));
42697 return;
42699 case E_V4SFmode:
42700 use_vec_merge = TARGET_SSE4_1;
42701 if (use_vec_merge)
42702 break;
42704 switch (elt)
42706 case 0:
42707 use_vec_merge = true;
42708 break;
42710 case 1:
42711 /* tmp = target = A B C D */
42712 tmp = copy_to_reg (target);
42713 /* target = A A B B */
42714 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42715 /* target = X A B B */
42716 ix86_expand_vector_set (false, target, val, 0);
42717 /* target = A X C D */
42718 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42719 const1_rtx, const0_rtx,
42720 GEN_INT (2+4), GEN_INT (3+4)));
42721 return;
42723 case 2:
42724 /* tmp = target = A B C D */
42725 tmp = copy_to_reg (target);
42726 /* tmp = X B C D */
42727 ix86_expand_vector_set (false, tmp, val, 0);
42728 /* target = A B X D */
42729 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42730 const0_rtx, const1_rtx,
42731 GEN_INT (0+4), GEN_INT (3+4)));
42732 return;
42734 case 3:
42735 /* tmp = target = A B C D */
42736 tmp = copy_to_reg (target);
42737 /* tmp = X B C D */
42738 ix86_expand_vector_set (false, tmp, val, 0);
42739 /* target = A B C X */
42740 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42741 const0_rtx, const1_rtx,
42742 GEN_INT (2+4), GEN_INT (0+4)));
42743 return;
42745 default:
42746 gcc_unreachable ();
42748 break;
42750 case E_V4SImode:
42751 use_vec_merge = TARGET_SSE4_1;
42752 if (use_vec_merge)
42753 break;
42755 /* Element 0 handled by vec_merge below. */
42756 if (elt == 0)
42758 use_vec_merge = true;
42759 break;
42762 if (TARGET_SSE2)
42764 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42765 store into element 0, then shuffle them back. */
42767 rtx order[4];
42769 order[0] = GEN_INT (elt);
42770 order[1] = const1_rtx;
42771 order[2] = const2_rtx;
42772 order[3] = GEN_INT (3);
42773 order[elt] = const0_rtx;
42775 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42776 order[1], order[2], order[3]));
42778 ix86_expand_vector_set (false, target, val, 0);
42780 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42781 order[1], order[2], order[3]));
42783 else
42785 /* For SSE1, we have to reuse the V4SF code. */
42786 rtx t = gen_reg_rtx (V4SFmode);
42787 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42788 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42789 emit_move_insn (target, gen_lowpart (mode, t));
42791 return;
42793 case E_V8HImode:
42794 use_vec_merge = TARGET_SSE2;
42795 break;
42796 case E_V4HImode:
42797 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42798 break;
42800 case E_V16QImode:
42801 use_vec_merge = TARGET_SSE4_1;
42802 break;
42804 case E_V8QImode:
42805 break;
42807 case E_V32QImode:
42808 half_mode = V16QImode;
42809 j = 0;
42810 n = 16;
42811 goto half;
42813 case E_V16HImode:
42814 half_mode = V8HImode;
42815 j = 1;
42816 n = 8;
42817 goto half;
42819 case E_V8SImode:
42820 half_mode = V4SImode;
42821 j = 2;
42822 n = 4;
42823 goto half;
42825 case E_V4DImode:
42826 half_mode = V2DImode;
42827 j = 3;
42828 n = 2;
42829 goto half;
42831 case E_V8SFmode:
42832 half_mode = V4SFmode;
42833 j = 4;
42834 n = 4;
42835 goto half;
42837 case E_V4DFmode:
42838 half_mode = V2DFmode;
42839 j = 5;
42840 n = 2;
42841 goto half;
42843 half:
42844 /* Compute offset. */
42845 i = elt / n;
42846 elt %= n;
42848 gcc_assert (i <= 1);
42850 /* Extract the half. */
42851 tmp = gen_reg_rtx (half_mode);
42852 emit_insn (gen_extract[j][i] (tmp, target));
42854 /* Put val in tmp at elt. */
42855 ix86_expand_vector_set (false, tmp, val, elt);
42857 /* Put it back. */
42858 emit_insn (gen_insert[j][i] (target, target, tmp));
42859 return;
42861 case E_V8DFmode:
42862 if (TARGET_AVX512F)
42864 mmode = QImode;
42865 gen_blendm = gen_avx512f_blendmv8df;
42867 break;
42869 case E_V8DImode:
42870 if (TARGET_AVX512F)
42872 mmode = QImode;
42873 gen_blendm = gen_avx512f_blendmv8di;
42875 break;
42877 case E_V16SFmode:
42878 if (TARGET_AVX512F)
42880 mmode = HImode;
42881 gen_blendm = gen_avx512f_blendmv16sf;
42883 break;
42885 case E_V16SImode:
42886 if (TARGET_AVX512F)
42888 mmode = HImode;
42889 gen_blendm = gen_avx512f_blendmv16si;
42891 break;
42893 case E_V32HImode:
42894 if (TARGET_AVX512F && TARGET_AVX512BW)
42896 mmode = SImode;
42897 gen_blendm = gen_avx512bw_blendmv32hi;
42899 break;
42901 case E_V64QImode:
42902 if (TARGET_AVX512F && TARGET_AVX512BW)
42904 mmode = DImode;
42905 gen_blendm = gen_avx512bw_blendmv64qi;
42907 break;
42909 default:
42910 break;
42913 if (mmode != VOIDmode)
42915 tmp = gen_reg_rtx (mode);
42916 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42917 /* The avx512*_blendm<mode> expanders have different operand order
42918 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42919 elements where the mask is set and second input operand otherwise,
42920 in {sse,avx}*_*blend* the first input operand is used for elements
42921 where the mask is clear and second input operand otherwise. */
42922 emit_insn (gen_blendm (target, target, tmp,
42923 force_reg (mmode,
42924 gen_int_mode (1 << elt, mmode))));
42926 else if (use_vec_merge)
42928 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42929 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42930 emit_insn (gen_rtx_SET (target, tmp));
42932 else
42934 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42936 emit_move_insn (mem, target);
42938 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42939 emit_move_insn (tmp, val);
42941 emit_move_insn (target, mem);
42945 void
42946 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42948 machine_mode mode = GET_MODE (vec);
42949 machine_mode inner_mode = GET_MODE_INNER (mode);
42950 bool use_vec_extr = false;
42951 rtx tmp;
42953 switch (mode)
42955 case E_V2SImode:
42956 case E_V2SFmode:
42957 if (!mmx_ok)
42958 break;
42959 /* FALLTHRU */
42961 case E_V2DFmode:
42962 case E_V2DImode:
42963 case E_V2TImode:
42964 case E_V4TImode:
42965 use_vec_extr = true;
42966 break;
42968 case E_V4SFmode:
42969 use_vec_extr = TARGET_SSE4_1;
42970 if (use_vec_extr)
42971 break;
42973 switch (elt)
42975 case 0:
42976 tmp = vec;
42977 break;
42979 case 1:
42980 case 3:
42981 tmp = gen_reg_rtx (mode);
42982 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42983 GEN_INT (elt), GEN_INT (elt),
42984 GEN_INT (elt+4), GEN_INT (elt+4)));
42985 break;
42987 case 2:
42988 tmp = gen_reg_rtx (mode);
42989 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42990 break;
42992 default:
42993 gcc_unreachable ();
42995 vec = tmp;
42996 use_vec_extr = true;
42997 elt = 0;
42998 break;
43000 case E_V4SImode:
43001 use_vec_extr = TARGET_SSE4_1;
43002 if (use_vec_extr)
43003 break;
43005 if (TARGET_SSE2)
43007 switch (elt)
43009 case 0:
43010 tmp = vec;
43011 break;
43013 case 1:
43014 case 3:
43015 tmp = gen_reg_rtx (mode);
43016 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43017 GEN_INT (elt), GEN_INT (elt),
43018 GEN_INT (elt), GEN_INT (elt)));
43019 break;
43021 case 2:
43022 tmp = gen_reg_rtx (mode);
43023 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43024 break;
43026 default:
43027 gcc_unreachable ();
43029 vec = tmp;
43030 use_vec_extr = true;
43031 elt = 0;
43033 else
43035 /* For SSE1, we have to reuse the V4SF code. */
43036 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43037 gen_lowpart (V4SFmode, vec), elt);
43038 return;
43040 break;
43042 case E_V8HImode:
43043 use_vec_extr = TARGET_SSE2;
43044 break;
43045 case E_V4HImode:
43046 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43047 break;
43049 case E_V16QImode:
43050 use_vec_extr = TARGET_SSE4_1;
43051 break;
43053 case E_V8SFmode:
43054 if (TARGET_AVX)
43056 tmp = gen_reg_rtx (V4SFmode);
43057 if (elt < 4)
43058 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
43059 else
43060 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
43061 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43062 return;
43064 break;
43066 case E_V4DFmode:
43067 if (TARGET_AVX)
43069 tmp = gen_reg_rtx (V2DFmode);
43070 if (elt < 2)
43071 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43072 else
43073 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43074 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43075 return;
43077 break;
43079 case E_V32QImode:
43080 if (TARGET_AVX)
43082 tmp = gen_reg_rtx (V16QImode);
43083 if (elt < 16)
43084 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43085 else
43086 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43087 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43088 return;
43090 break;
43092 case E_V16HImode:
43093 if (TARGET_AVX)
43095 tmp = gen_reg_rtx (V8HImode);
43096 if (elt < 8)
43097 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43098 else
43099 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43100 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43101 return;
43103 break;
43105 case E_V8SImode:
43106 if (TARGET_AVX)
43108 tmp = gen_reg_rtx (V4SImode);
43109 if (elt < 4)
43110 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43111 else
43112 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43113 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43114 return;
43116 break;
43118 case E_V4DImode:
43119 if (TARGET_AVX)
43121 tmp = gen_reg_rtx (V2DImode);
43122 if (elt < 2)
43123 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43124 else
43125 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43126 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43127 return;
43129 break;
43131 case E_V32HImode:
43132 if (TARGET_AVX512BW)
43134 tmp = gen_reg_rtx (V16HImode);
43135 if (elt < 16)
43136 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43137 else
43138 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43139 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43140 return;
43142 break;
43144 case E_V64QImode:
43145 if (TARGET_AVX512BW)
43147 tmp = gen_reg_rtx (V32QImode);
43148 if (elt < 32)
43149 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43150 else
43151 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43152 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43153 return;
43155 break;
43157 case E_V16SFmode:
43158 tmp = gen_reg_rtx (V8SFmode);
43159 if (elt < 8)
43160 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43161 else
43162 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43163 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43164 return;
43166 case E_V8DFmode:
43167 tmp = gen_reg_rtx (V4DFmode);
43168 if (elt < 4)
43169 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43170 else
43171 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43172 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43173 return;
43175 case E_V16SImode:
43176 tmp = gen_reg_rtx (V8SImode);
43177 if (elt < 8)
43178 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43179 else
43180 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43181 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43182 return;
43184 case E_V8DImode:
43185 tmp = gen_reg_rtx (V4DImode);
43186 if (elt < 4)
43187 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43188 else
43189 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43190 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43191 return;
43193 case E_V8QImode:
43194 /* ??? Could extract the appropriate HImode element and shift. */
43195 default:
43196 break;
43199 if (use_vec_extr)
43201 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43202 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43204 /* Let the rtl optimizers know about the zero extension performed. */
43205 if (inner_mode == QImode || inner_mode == HImode)
43207 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43208 target = gen_lowpart (SImode, target);
43211 emit_insn (gen_rtx_SET (target, tmp));
43213 else
43215 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43217 emit_move_insn (mem, vec);
43219 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43220 emit_move_insn (target, tmp);
43224 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43225 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43226 The upper bits of DEST are undefined, though they shouldn't cause
43227 exceptions (some bits from src or all zeros are ok). */
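/* For example, with a V4SImode SRC and i == 128 the whole 128-bit value
   is shifted right by 64 bits, so elements 2 and 3 of SRC land in
   elements 0 and 1 of DEST; with i == 64 element 1 is moved down into
   element 0.  */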
43229 static void
43230 emit_reduc_half (rtx dest, rtx src, int i)
43232 rtx tem, d = dest;
43233 switch (GET_MODE (src))
43235 case E_V4SFmode:
43236 if (i == 128)
43237 tem = gen_sse_movhlps (dest, src, src);
43238 else
43239 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43240 GEN_INT (1 + 4), GEN_INT (1 + 4));
43241 break;
43242 case E_V2DFmode:
43243 tem = gen_vec_interleave_highv2df (dest, src, src);
43244 break;
43245 case E_V16QImode:
43246 case E_V8HImode:
43247 case E_V4SImode:
43248 case E_V2DImode:
43249 d = gen_reg_rtx (V1TImode);
43250 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43251 GEN_INT (i / 2));
43252 break;
43253 case E_V8SFmode:
43254 if (i == 256)
43255 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43256 else
43257 tem = gen_avx_shufps256 (dest, src, src,
43258 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43259 break;
43260 case E_V4DFmode:
43261 if (i == 256)
43262 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43263 else
43264 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43265 break;
43266 case E_V32QImode:
43267 case E_V16HImode:
43268 case E_V8SImode:
43269 case E_V4DImode:
43270 if (i == 256)
43272 if (GET_MODE (dest) != V4DImode)
43273 d = gen_reg_rtx (V4DImode);
43274 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43275 gen_lowpart (V4DImode, src),
43276 const1_rtx);
43278 else
43280 d = gen_reg_rtx (V2TImode);
43281 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43282 GEN_INT (i / 2));
43284 break;
43285 case E_V64QImode:
43286 case E_V32HImode:
43287 case E_V16SImode:
43288 case E_V16SFmode:
43289 case E_V8DImode:
43290 case E_V8DFmode:
43291 if (i > 128)
43292 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43293 gen_lowpart (V16SImode, src),
43294 gen_lowpart (V16SImode, src),
43295 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43296 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43297 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43298 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43299 GEN_INT (0xC), GEN_INT (0xD),
43300 GEN_INT (0xE), GEN_INT (0xF),
43301 GEN_INT (0x10), GEN_INT (0x11),
43302 GEN_INT (0x12), GEN_INT (0x13),
43303 GEN_INT (0x14), GEN_INT (0x15),
43304 GEN_INT (0x16), GEN_INT (0x17));
43305 else
43306 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43307 gen_lowpart (V16SImode, src),
43308 GEN_INT (i == 128 ? 0x2 : 0x1),
43309 GEN_INT (0x3),
43310 GEN_INT (0x3),
43311 GEN_INT (0x3),
43312 GEN_INT (i == 128 ? 0x6 : 0x5),
43313 GEN_INT (0x7),
43314 GEN_INT (0x7),
43315 GEN_INT (0x7),
43316 GEN_INT (i == 128 ? 0xA : 0x9),
43317 GEN_INT (0xB),
43318 GEN_INT (0xB),
43319 GEN_INT (0xB),
43320 GEN_INT (i == 128 ? 0xE : 0xD),
43321 GEN_INT (0xF),
43322 GEN_INT (0xF),
43323 GEN_INT (0xF));
43324 break;
43325 default:
43326 gcc_unreachable ();
43328 emit_insn (tem);
43329 if (d != dest)
43330 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43333 /* Expand a vector reduction. FN is the binary pattern to reduce;
43334 DEST is the destination; IN is the input vector. */
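/* For instance, reducing a V8HImode vector with a PLUS pattern takes
   three halving steps (i = 128, 64, 32): each step folds the upper half
   of the still-live elements onto the lower half, so after the last step
   element 0 of DEST holds the sum of all eight lanes.  */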
43336 void
43337 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43339 rtx half, dst, vec = in;
43340 machine_mode mode = GET_MODE (in);
43341 int i;
43343 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
43344 if (TARGET_SSE4_1
43345 && mode == V8HImode
43346 && fn == gen_uminv8hi3)
43348 emit_insn (gen_sse4_1_phminposuw (dest, in));
43349 return;
43352 for (i = GET_MODE_BITSIZE (mode);
43353 i > GET_MODE_UNIT_BITSIZE (mode);
43354 i >>= 1)
43356 half = gen_reg_rtx (mode);
43357 emit_reduc_half (half, vec, i);
43358 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43359 dst = dest;
43360 else
43361 dst = gen_reg_rtx (mode);
43362 emit_insn (fn (dst, half, vec));
43363 vec = dst;
43367 /* Target hook for scalar_mode_supported_p. */
43368 static bool
43369 ix86_scalar_mode_supported_p (scalar_mode mode)
43371 if (DECIMAL_FLOAT_MODE_P (mode))
43372 return default_decimal_float_supported_p ();
43373 else if (mode == TFmode)
43374 return true;
43375 else
43376 return default_scalar_mode_supported_p (mode);
43379 /* Implements target hook vector_mode_supported_p. */
43380 static bool
43381 ix86_vector_mode_supported_p (machine_mode mode)
43383 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43384 return true;
43385 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43386 return true;
43387 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43388 return true;
43389 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43390 return true;
43391 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43392 return true;
43393 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43394 return true;
43395 return false;
43398 /* Target hook for c_mode_for_suffix. */
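/* For instance, the literal suffixes q (__float128) and w (__float80)
   accepted in C sources map onto these modes.  */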
43399 static machine_mode
43400 ix86_c_mode_for_suffix (char suffix)
43402 if (suffix == 'q')
43403 return TFmode;
43404 if (suffix == 'w')
43405 return XFmode;
43407 return VOIDmode;
43410 /* Worker function for TARGET_MD_ASM_ADJUST.
43412 We implement asm flag outputs, and maintain source compatibility
43413 with the old cc0-based compiler. */
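/* A sketch of the source-level syntax handled here (illustrative only;
   the operand names are made up):

     int is_equal;
     asm ("cmpl %2, %1" : "=@ccz" (is_equal) : "r" (a), "r" (b));

   The "z" suffix selects CCZmode/EQ in the table below, so IS_EQUAL
   receives 1 when ZF is set and 0 otherwise; an "n" prefix such as
   "=@ccnz" inverts the condition.  */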
43415 static rtx_insn *
43416 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43417 vec<const char *> &constraints,
43418 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43420 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43421 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43423 bool saw_asm_flag = false;
43425 start_sequence ();
43426 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43428 const char *con = constraints[i];
43429 if (strncmp (con, "=@cc", 4) != 0)
43430 continue;
43431 con += 4;
43432 if (strchr (con, ',') != NULL)
43434 error ("alternatives not allowed in asm flag output");
43435 continue;
43438 bool invert = false;
43439 if (con[0] == 'n')
43440 invert = true, con++;
43442 machine_mode mode = CCmode;
43443 rtx_code code = UNKNOWN;
43445 switch (con[0])
43447 case 'a':
43448 if (con[1] == 0)
43449 mode = CCAmode, code = EQ;
43450 else if (con[1] == 'e' && con[2] == 0)
43451 mode = CCCmode, code = NE;
43452 break;
43453 case 'b':
43454 if (con[1] == 0)
43455 mode = CCCmode, code = EQ;
43456 else if (con[1] == 'e' && con[2] == 0)
43457 mode = CCAmode, code = NE;
43458 break;
43459 case 'c':
43460 if (con[1] == 0)
43461 mode = CCCmode, code = EQ;
43462 break;
43463 case 'e':
43464 if (con[1] == 0)
43465 mode = CCZmode, code = EQ;
43466 break;
43467 case 'g':
43468 if (con[1] == 0)
43469 mode = CCGCmode, code = GT;
43470 else if (con[1] == 'e' && con[2] == 0)
43471 mode = CCGCmode, code = GE;
43472 break;
43473 case 'l':
43474 if (con[1] == 0)
43475 mode = CCGCmode, code = LT;
43476 else if (con[1] == 'e' && con[2] == 0)
43477 mode = CCGCmode, code = LE;
43478 break;
43479 case 'o':
43480 if (con[1] == 0)
43481 mode = CCOmode, code = EQ;
43482 break;
43483 case 'p':
43484 if (con[1] == 0)
43485 mode = CCPmode, code = EQ;
43486 break;
43487 case 's':
43488 if (con[1] == 0)
43489 mode = CCSmode, code = EQ;
43490 break;
43491 case 'z':
43492 if (con[1] == 0)
43493 mode = CCZmode, code = EQ;
43494 break;
43496 if (code == UNKNOWN)
43498 error ("unknown asm flag output %qs", constraints[i]);
43499 continue;
43501 if (invert)
43502 code = reverse_condition (code);
43504 rtx dest = outputs[i];
43505 if (!saw_asm_flag)
43507 /* This is the first asm flag output. Here we put the flags
43508 register in as the real output and adjust the condition to
43509 allow it. */
43510 constraints[i] = "=Bf";
43511 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43512 saw_asm_flag = true;
43514 else
43516 /* We don't need the flags register as output twice. */
43517 constraints[i] = "=X";
43518 outputs[i] = gen_rtx_SCRATCH (SImode);
43521 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43522 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43524 machine_mode dest_mode = GET_MODE (dest);
43525 if (!SCALAR_INT_MODE_P (dest_mode))
43527 error ("invalid type for asm flag output");
43528 continue;
43531 if (dest_mode == DImode && !TARGET_64BIT)
43532 dest_mode = SImode;
43534 if (dest_mode != QImode)
43536 rtx destqi = gen_reg_rtx (QImode);
43537 emit_insn (gen_rtx_SET (destqi, x));
43539 if (TARGET_ZERO_EXTEND_WITH_AND
43540 && optimize_function_for_speed_p (cfun))
43542 x = force_reg (dest_mode, const0_rtx);
43544 emit_insn (gen_movstrictqi
43545 (gen_lowpart (QImode, x), destqi));
43547 else
43548 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43551 if (dest_mode != GET_MODE (dest))
43553 rtx tmp = gen_reg_rtx (SImode);
43555 emit_insn (gen_rtx_SET (tmp, x));
43556 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43558 else
43559 emit_insn (gen_rtx_SET (dest, x));
43561 rtx_insn *seq = get_insns ();
43562 end_sequence ();
43564 if (saw_asm_flag)
43565 return seq;
43566 else
43568 /* If we had no asm flag outputs, clobber the flags. */
43569 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43570 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43571 return NULL;
43575 /* Implements target vector targetm.asm.encode_section_info. */
43577 static void ATTRIBUTE_UNUSED
43578 ix86_encode_section_info (tree decl, rtx rtl, int first)
43580 default_encode_section_info (decl, rtl, first);
43582 if (ix86_in_large_data_p (decl))
43583 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43586 /* Worker function for REVERSE_CONDITION. */
43588 enum rtx_code
43589 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43591 return (mode == CCFPmode
43592 ? reverse_condition_maybe_unordered (code)
43593 : reverse_condition (code));
43596 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43597 to OPERANDS[0]. */
43599 const char *
43600 output_387_reg_move (rtx_insn *insn, rtx *operands)
43602 if (REG_P (operands[0]))
43604 if (REG_P (operands[1])
43605 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43607 if (REGNO (operands[0]) == FIRST_STACK_REG)
43608 return output_387_ffreep (operands, 0);
43609 return "fstp\t%y0";
43611 if (STACK_TOP_P (operands[0]))
43612 return "fld%Z1\t%y1";
43613 return "fst\t%y0";
43615 else if (MEM_P (operands[0]))
43617 gcc_assert (REG_P (operands[1]));
43618 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43619 return "fstp%Z0\t%y0";
43620 else
43622 /* There is no non-popping store to memory for XFmode.
43623 So if we need one, follow the store with a load. */
43624 if (GET_MODE (operands[0]) == XFmode)
43625 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43626 else
43627 return "fst%Z0\t%y0";
43630 else
43631 gcc_unreachable();
43634 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
43635 the FP status register is set. */
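/* C2 is bit 10 of the FP status word, i.e. bit 0x04 of the high byte
   that fnstsw leaves in %ah.  With SAHF that bit becomes PF, which is
   why the UNORDERED test below works; without SAHF the bit is tested
   directly with mask 0x04.  */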
43637 void
43638 ix86_emit_fp_unordered_jump (rtx label)
43640 rtx reg = gen_reg_rtx (HImode);
43641 rtx temp;
43643 emit_insn (gen_x86_fnstsw_1 (reg));
43645 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43647 emit_insn (gen_x86_sahf_1 (reg));
43649 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43650 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43652 else
43654 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
43656 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43657 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43660 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43661 gen_rtx_LABEL_REF (VOIDmode, label),
43662 pc_rtx);
43663 temp = gen_rtx_SET (pc_rtx, temp);
43665 emit_jump_insn (temp);
43666 predict_jump (REG_BR_PROB_BASE * 10 / 100);
43669 /* Output code to perform a log1p XFmode calculation. */
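/* The expansion relies on fyl2xp1, which computes y * log2(x + 1) but is
   only specified for |x| < 1 - sqrt(2)/2 (~0.2928932, the constant tested
   below).  Within that range op0 = ln(2) * log2(1 + op1) is computed
   directly; otherwise the argument is first bumped to 1 + op1 and fed to
   plain fyl2x.  In both cases fldln2 supplies the ln(2) factor.  */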
43671 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43673 rtx_code_label *label1 = gen_label_rtx ();
43674 rtx_code_label *label2 = gen_label_rtx ();
43676 rtx tmp = gen_reg_rtx (XFmode);
43677 rtx tmp2 = gen_reg_rtx (XFmode);
43678 rtx test;
43680 emit_insn (gen_absxf2 (tmp, op1));
43681 test = gen_rtx_GE (VOIDmode, tmp,
43682 const_double_from_real_value (
43683 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43684 XFmode));
43685 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43687 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43688 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43689 emit_jump (label2);
43691 emit_label (label1);
43692 emit_move_insn (tmp, CONST1_RTX (XFmode));
43693 emit_insn (gen_addxf3 (tmp, op1, tmp));
43694 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43695 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43697 emit_label (label2);
43700 /* Emit code for round calculation. */
43701 void ix86_emit_i387_round (rtx op0, rtx op1)
43703 machine_mode inmode = GET_MODE (op1);
43704 machine_mode outmode = GET_MODE (op0);
43705 rtx e1, e2, res, tmp, tmp1, half;
43706 rtx scratch = gen_reg_rtx (HImode);
43707 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43708 rtx_code_label *jump_label = gen_label_rtx ();
43709 rtx insn;
43710 rtx (*gen_abs) (rtx, rtx);
43711 rtx (*gen_neg) (rtx, rtx);
43713 switch (inmode)
43715 case E_SFmode:
43716 gen_abs = gen_abssf2;
43717 break;
43718 case E_DFmode:
43719 gen_abs = gen_absdf2;
43720 break;
43721 case E_XFmode:
43722 gen_abs = gen_absxf2;
43723 break;
43724 default:
43725 gcc_unreachable ();
43728 switch (outmode)
43730 case E_SFmode:
43731 gen_neg = gen_negsf2;
43732 break;
43733 case E_DFmode:
43734 gen_neg = gen_negdf2;
43735 break;
43736 case E_XFmode:
43737 gen_neg = gen_negxf2;
43738 break;
43739 case E_HImode:
43740 gen_neg = gen_neghi2;
43741 break;
43742 case E_SImode:
43743 gen_neg = gen_negsi2;
43744 break;
43745 case E_DImode:
43746 gen_neg = gen_negdi2;
43747 break;
43748 default:
43749 gcc_unreachable ();
43752 e1 = gen_reg_rtx (inmode);
43753 e2 = gen_reg_rtx (inmode);
43754 res = gen_reg_rtx (outmode);
43756 half = const_double_from_real_value (dconsthalf, inmode);
43758 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
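/* E.g. round(2.5) = floor(2.5 + 0.5) = 3 and round(-2.5)
   = -floor(2.5 + 0.5) = -3, i.e. halfway cases are rounded away
   from zero.  */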
43760 /* scratch = fxam(op1) */
43761 emit_insn (gen_rtx_SET (scratch,
43762 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43763 UNSPEC_FXAM)));
43764 /* e1 = fabs(op1) */
43765 emit_insn (gen_abs (e1, op1));
43767 /* e2 = e1 + 0.5 */
43768 half = force_reg (inmode, half);
43769 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43771 /* res = floor(e2) */
43772 if (inmode != XFmode)
43774 tmp1 = gen_reg_rtx (XFmode);
43776 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43778 else
43779 tmp1 = e2;
43781 switch (outmode)
43783 case E_SFmode:
43784 case E_DFmode:
43786 rtx tmp0 = gen_reg_rtx (XFmode);
43788 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43790 emit_insn (gen_rtx_SET (res,
43791 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43792 UNSPEC_TRUNC_NOOP)));
43794 break;
43795 case E_XFmode:
43796 emit_insn (gen_frndintxf2_floor (res, tmp1));
43797 break;
43798 case E_HImode:
43799 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43800 break;
43801 case E_SImode:
43802 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43803 break;
43804 case E_DImode:
43805 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43806 break;
43807 default:
43808 gcc_unreachable ();
43811 /* flags = signbit(a) */
43812 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
43814 /* if (flags) then res = -res */
43815 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43816 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43817 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43818 pc_rtx);
43819 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43820 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43821 JUMP_LABEL (insn) = jump_label;
43823 emit_insn (gen_neg (res, res));
43825 emit_label (jump_label);
43826 LABEL_NUSES (jump_label) = 1;
43828 emit_move_insn (op0, res);
43831 /* Output code to perform a Newton-Raphson approximation of a single precision
43832 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43834 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43836 rtx x0, x1, e0, e1;
43838 x0 = gen_reg_rtx (mode);
43839 e0 = gen_reg_rtx (mode);
43840 e1 = gen_reg_rtx (mode);
43841 x1 = gen_reg_rtx (mode);
43843 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp(b))) */
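/* This is one Newton-Raphson step applied to the rcp estimate x0:
   x1 = 2*x0 - b*x0*x0 = x0 * (2 - b*x0).  If x0 = (1 - e)/b then
   x1 = (1 - e*e)/b, so the relative error is squared; the roughly
   12-bit hardware estimate becomes roughly 24-bit accurate after one
   step (still not a correctly rounded quotient).  */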
43845 b = force_reg (mode, b);
43847 /* x0 = rcp(b) estimate */
43848 if (mode == V16SFmode || mode == V8DFmode)
43850 if (TARGET_AVX512ER)
43852 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43853 UNSPEC_RCP28)));
43854 /* res = a * x0 */
43855 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43856 return;
43858 else
43859 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43860 UNSPEC_RCP14)));
43862 else
43863 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43864 UNSPEC_RCP)));
43866 /* e0 = x0 * b */
43867 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43869 /* e0 = x0 * e0 */
43870 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43872 /* e1 = x0 + x0 */
43873 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43875 /* x1 = e1 - e0 */
43876 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43878 /* res = a * x1 */
43879 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
43882 /* Output code to perform a Newton-Raphson approximation of a
43883 single precision floating point [reciprocal] square root. */
43885 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43887 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43888 REAL_VALUE_TYPE r;
43889 int unspec;
43891 x0 = gen_reg_rtx (mode);
43892 e0 = gen_reg_rtx (mode);
43893 e1 = gen_reg_rtx (mode);
43894 e2 = gen_reg_rtx (mode);
43895 e3 = gen_reg_rtx (mode);
43897 if (TARGET_AVX512ER && mode == V16SFmode)
43899 if (recip)
43900 /* res = rsqrt28(a) estimate */
43901 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43902 UNSPEC_RSQRT28)));
43903 else
43905 /* x0 = rsqrt28(a) estimate */
43906 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43907 UNSPEC_RSQRT28)));
43908 /* res = rcp28(x0) estimate */
43909 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43910 UNSPEC_RCP28)));
43912 return;
43915 real_from_integer (&r, VOIDmode, -3, SIGNED);
43916 mthree = const_double_from_real_value (r, SFmode);
43918 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43919 mhalf = const_double_from_real_value (r, SFmode);
43920 unspec = UNSPEC_RSQRT;
43922 if (VECTOR_MODE_P (mode))
43924 mthree = ix86_build_const_vector (mode, true, mthree);
43925 mhalf = ix86_build_const_vector (mode, true, mhalf);
43926 /* There is no 512-bit rsqrt. There is, however, rsqrt14. */
43927 if (GET_MODE_SIZE (mode) == 64)
43928 unspec = UNSPEC_RSQRT14;
43931 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43932 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
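/* This is the Newton-Raphson step for f(x) = 1/(x*x) - a, namely
   x1 = x0 * (3 - a*x0*x0) / 2, written as -0.5 * x0 * (a*x0*x0 - 3);
   multiplying by e0 = a*x0 instead of x0 at the end turns the refined
   rsqrt into an approximation of sqrt(a).  */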
43934 a = force_reg (mode, a);
43936 /* x0 = rsqrt(a) estimate */
43937 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43938 unspec)));
43940 /* If a == 0.0, filter out the infinite rsqrt result to prevent NaN for sqrt(0.0). */
43941 if (!recip)
43943 rtx zero = force_reg (mode, CONST0_RTX(mode));
43944 rtx mask;
43946 /* Handle masked compare. */
43947 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43949 mask = gen_reg_rtx (HImode);
43950 /* Imm value 0x4 corresponds to not-equal comparison. */
43951 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43952 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43954 else
43956 mask = gen_reg_rtx (mode);
43957 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43958 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43962 /* e0 = x0 * a */
43963 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43964 /* e1 = e0 * x0 */
43965 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43967 /* e2 = e1 - 3. */
43968 mthree = force_reg (mode, mthree);
43969 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43971 mhalf = force_reg (mode, mhalf);
43972 if (recip)
43973 /* e3 = -.5 * x0 */
43974 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43975 else
43976 /* e3 = -.5 * e0 */
43977 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43978 /* ret = e2 * e3 */
43979 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
43982 #ifdef TARGET_SOLARIS
43983 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43985 static void
43986 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43987 tree decl)
43989 /* With Binutils 2.15, the "@unwind" marker must be specified on
43990 every occurrence of the ".eh_frame" section, not just the first
43991 one. */
43992 if (TARGET_64BIT
43993 && strcmp (name, ".eh_frame") == 0)
43995 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
43996 flags & SECTION_WRITE ? "aw" : "a");
43997 return;
44000 #ifndef USE_GAS
44001 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
44003 solaris_elf_asm_comdat_section (name, flags, decl);
44004 return;
44006 #endif
44008 default_elf_asm_named_section (name, flags, decl);
44010 #endif /* TARGET_SOLARIS */
44012 /* Return the mangling of TYPE if it is an extended fundamental type. */
44014 static const char *
44015 ix86_mangle_type (const_tree type)
44017 type = TYPE_MAIN_VARIANT (type);
44019 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44020 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44021 return NULL;
44023 switch (TYPE_MODE (type))
44025 case E_TFmode:
44026 /* __float128 is "g". */
44027 return "g";
44028 case E_XFmode:
44029 /* "long double" or __float80 is "e". */
44030 return "e";
44031 default:
44032 return NULL;
44036 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
44038 static tree
44039 ix86_stack_protect_guard (void)
44041 if (TARGET_SSP_TLS_GUARD)
44043 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
44044 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
44045 tree type = build_qualified_type (type_node, qual);
44046 tree t;
44048 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
44050 t = ix86_tls_stack_chk_guard_decl;
44052 if (t == NULL)
44054 rtx x;
44056 t = build_decl
44057 (UNKNOWN_LOCATION, VAR_DECL,
44058 get_identifier (ix86_stack_protector_guard_symbol_str),
44059 type);
44060 TREE_STATIC (t) = 1;
44061 TREE_PUBLIC (t) = 1;
44062 DECL_EXTERNAL (t) = 1;
44063 TREE_USED (t) = 1;
44064 TREE_THIS_VOLATILE (t) = 1;
44065 DECL_ARTIFICIAL (t) = 1;
44066 DECL_IGNORED_P (t) = 1;
44068 /* Do not share RTL as the declaration is visible outside of
44069 the current function. */
44070 x = DECL_RTL (t);
44071 RTX_FLAG (x, used) = 1;
44073 ix86_tls_stack_chk_guard_decl = t;
44076 else
44078 tree asptrtype = build_pointer_type (type);
44080 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
44081 t = build2 (MEM_REF, asptrtype, t,
44082 build_int_cst (asptrtype, 0));
44085 return t;
44088 return default_stack_protect_guard ();
44091 /* For 32-bit code we can save PIC register setup by using
44092 __stack_chk_fail_local hidden function instead of calling
44093 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
44094 register, so it is better to call __stack_chk_fail directly. */
44096 static tree ATTRIBUTE_UNUSED
44097 ix86_stack_protect_fail (void)
44099 return TARGET_64BIT
44100 ? default_external_stack_protect_fail ()
44101 : default_hidden_stack_protect_fail ();
44104 /* Select a format to encode pointers in exception handling data. CODE
44105 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44106 true if the symbol may be affected by dynamic relocations.
44108 ??? All x86 object file formats are capable of representing this.
44109 After all, the relocation needed is the same as for the call insn.
44110 Whether or not a particular assembler allows us to enter such, I
44111 guess we'll have to see. */
44113 asm_preferred_eh_data_format (int code, int global)
44115 if (flag_pic)
44117 int type = DW_EH_PE_sdata8;
44118 if (!TARGET_64BIT
44119 || ix86_cmodel == CM_SMALL_PIC
44120 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44121 type = DW_EH_PE_sdata4;
44122 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44124 if (ix86_cmodel == CM_SMALL
44125 || (ix86_cmodel == CM_MEDIUM && code))
44126 return DW_EH_PE_udata4;
44127 return DW_EH_PE_absptr;
44130 /* Expand copysign from SIGN to the positive value ABS_VALUE
44131 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44132 the sign-bit. */
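/* In essence RESULT = ABS_VALUE | (SIGN & sign-bit mask): the mask
   isolates the sign bit of SIGN and the IOR transplants it onto the
   already-nonnegative ABS_VALUE.  */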
44133 static void
44134 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44136 machine_mode mode = GET_MODE (sign);
44137 rtx sgn = gen_reg_rtx (mode);
44138 if (mask == NULL_RTX)
44140 machine_mode vmode;
44142 if (mode == SFmode)
44143 vmode = V4SFmode;
44144 else if (mode == DFmode)
44145 vmode = V2DFmode;
44146 else
44147 vmode = mode;
44149 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44150 if (!VECTOR_MODE_P (mode))
44152 /* We need to generate a scalar mode mask in this case. */
44153 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44154 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44155 mask = gen_reg_rtx (mode);
44156 emit_insn (gen_rtx_SET (mask, tmp));
44159 else
44160 mask = gen_rtx_NOT (mode, mask);
44161 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44162 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44165 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44166 mask for masking out the sign-bit is stored in *SMASK, if that is
44167 non-null. */
44168 static rtx
44169 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44171 machine_mode vmode, mode = GET_MODE (op0);
44172 rtx xa, mask;
44174 xa = gen_reg_rtx (mode);
44175 if (mode == SFmode)
44176 vmode = V4SFmode;
44177 else if (mode == DFmode)
44178 vmode = V2DFmode;
44179 else
44180 vmode = mode;
44181 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44182 if (!VECTOR_MODE_P (mode))
44184 /* We need to generate a scalar mode mask in this case. */
44185 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44186 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44187 mask = gen_reg_rtx (mode);
44188 emit_insn (gen_rtx_SET (mask, tmp));
44190 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44192 if (smask)
44193 *smask = mask;
44195 return xa;
44198 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44199 swapping the operands if SWAP_OPERANDS is true. The expanded
44200 code is a forward jump to a newly created label in case the
44201 comparison is true. The generated label rtx is returned. */
44202 static rtx_code_label *
44203 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44204 bool swap_operands)
44206 bool unordered_compare = ix86_unordered_fp_compare (code);
44207 rtx_code_label *label;
44208 rtx tmp, reg;
44210 if (swap_operands)
44211 std::swap (op0, op1);
44213 label = gen_label_rtx ();
44214 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
44215 if (unordered_compare)
44216 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
44217 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
44218 emit_insn (gen_rtx_SET (reg, tmp));
44219 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
44220 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44221 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44222 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44223 JUMP_LABEL (tmp) = label;
44225 return label;
44228 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44229 using comparison code CODE. Operands are swapped for the comparison if
44230 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44231 static rtx
44232 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44233 bool swap_operands)
44235 rtx (*insn)(rtx, rtx, rtx, rtx);
44236 machine_mode mode = GET_MODE (op0);
44237 rtx mask = gen_reg_rtx (mode);
44239 if (swap_operands)
44240 std::swap (op0, op1);
44242 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44244 emit_insn (insn (mask, op0, op1,
44245 gen_rtx_fmt_ee (code, mode, op0, op1)));
44246 return mask;
44249 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44250 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
44251 static rtx
44252 ix86_gen_TWO52 (machine_mode mode)
44254 REAL_VALUE_TYPE TWO52r;
44255 rtx TWO52;
44257 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44258 TWO52 = const_double_from_real_value (TWO52r, mode);
44259 TWO52 = force_reg (mode, TWO52);
44261 return TWO52;
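/* Illustrative note, not in the original source: 2**52 (DFmode) and 2**23
   (SFmode) are the magnitudes at which the mantissa has no fractional bits
   left, so for 0 <= x < 2**52 the sequence x + TWO52 - TWO52 rounds x to an
   integer in the current rounding mode; e.g. 3.3 + 2**52 rounds to
   2**52 + 3.0, and subtracting 2**52 again leaves 3.0.  */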
44264 /* Expand SSE sequence for computing lround from OP1 storing
44265 into OP0. */
44266 void
44267 ix86_expand_lround (rtx op0, rtx op1)
44269 /* C code for the stuff we're doing below:
44270 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44271 return (long)tmp;
44273 machine_mode mode = GET_MODE (op1);
44274 const struct real_format *fmt;
44275 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44276 rtx adj;
44278 /* load nextafter (0.5, 0.0) */
44279 fmt = REAL_MODE_FORMAT (mode);
44280 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44281 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44283 /* adj = copysign (0.5, op1) */
44284 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44285 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44287 /* adj = op1 + adj */
44288 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44290 /* op0 = (imode)adj */
44291 expand_fix (op0, adj, 0);
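/* Why nextafter (0.5, 0.0) rather than 0.5 (illustrative): for
   x = 0.49999999999999994, the largest double below 0.5, the sum x + 0.5
   rounds up to 1.0 and lround would wrongly return 1, whereas
   x + nextafter (0.5, 0.0) is exactly the largest double below 1.0 and
   truncates to 0.  */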
44294 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
44295 into OPERAND0. */
44296 void
44297 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44299 /* C code for the stuff we're doing below (for do_floor):
44300 xi = (long)op1;
44301 xi -= (double)xi > op1 ? 1 : 0;
44302 return xi;
44304 machine_mode fmode = GET_MODE (op1);
44305 machine_mode imode = GET_MODE (op0);
44306 rtx ireg, freg, tmp;
44307 rtx_code_label *label;
44309 /* reg = (long)op1 */
44310 ireg = gen_reg_rtx (imode);
44311 expand_fix (ireg, op1, 0);
44313 /* freg = (double)reg */
44314 freg = gen_reg_rtx (fmode);
44315 expand_float (freg, ireg, 0);
44317 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44318 label = ix86_expand_sse_compare_and_jump (UNLE,
44319 freg, op1, !do_floor);
44320 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44321 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44322 emit_move_insn (ireg, tmp);
44324 emit_label (label);
44325 LABEL_NUSES (label) = 1;
44327 emit_move_insn (op0, ireg);
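/* Worked example for the floor path (illustrative): for op1 = -1.5 the
   fix conversion truncates toward zero, so ireg = -1 and freg = -1.0;
   -1.0 UNLE -1.5 is false, so the branch falls through, 1 is subtracted
   and the result is -2, i.e. floor (-1.5).  */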
44330 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
44331 void
44332 ix86_expand_rint (rtx operand0, rtx operand1)
44334 /* C code for the stuff we're doing below:
44335 xa = fabs (operand1);
44336 if (!isless (xa, 2**52))
44337 return operand1;
44338 two52 = 2**52;
44339 if (flag_rounding_math)
44341 two52 = copysign (two52, operand1);
44342 xa = operand1;
44344 xa = xa + two52 - two52;
44345 return copysign (xa, operand1);
44347 machine_mode mode = GET_MODE (operand0);
44348 rtx res, xa, TWO52, two52, mask;
44349 rtx_code_label *label;
44351 res = gen_reg_rtx (mode);
44352 emit_move_insn (res, operand1);
44354 /* xa = abs (operand1) */
44355 xa = ix86_expand_sse_fabs (res, &mask);
44357 /* if (!isless (xa, TWO52)) goto label; */
44358 TWO52 = ix86_gen_TWO52 (mode);
44359 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44361 two52 = TWO52;
44362 if (flag_rounding_math)
44364 two52 = gen_reg_rtx (mode);
44365 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
44366 xa = res;
44369 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
44370 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
44372 ix86_sse_copysign_to_positive (res, xa, res, mask);
44374 emit_label (label);
44375 LABEL_NUSES (label) = 1;
44377 emit_move_insn (operand0, res);
44380 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44381 into OPERAND0. */
44382 void
44383 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44385 /* C code for the stuff we expand below.
44386 double xa = fabs (x), x2;
44387 if (!isless (xa, TWO52))
44388 return x;
44389 xa = xa + TWO52 - TWO52;
44390 x2 = copysign (xa, x);
44391 Compensate. Floor:
44392 if (x2 > x)
44393 x2 -= 1;
44394 Compensate. Ceil:
44395 if (x2 < x)
44396 x2 -= -1;
44397 return x2;
44399 machine_mode mode = GET_MODE (operand0);
44400 rtx xa, TWO52, tmp, one, res, mask;
44401 rtx_code_label *label;
44403 TWO52 = ix86_gen_TWO52 (mode);
44405 /* Temporary for holding the result, initialized to the input
44406 operand to ease control flow. */
44407 res = gen_reg_rtx (mode);
44408 emit_move_insn (res, operand1);
44410 /* xa = abs (operand1) */
44411 xa = ix86_expand_sse_fabs (res, &mask);
44413 /* if (!isless (xa, TWO52)) goto label; */
44414 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44416 /* xa = xa + TWO52 - TWO52; */
44417 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44418 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44420 /* xa = copysign (xa, operand1) */
44421 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44423 /* generate 1.0 or -1.0 */
44424 one = force_reg (mode,
44425 const_double_from_real_value (do_floor
44426 ? dconst1 : dconstm1, mode));
44428 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44429 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44430 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44431 /* We always need to subtract here to preserve signed zero. */
44432 tmp = expand_simple_binop (mode, MINUS,
44433 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44434 emit_move_insn (res, tmp);
44436 emit_label (label);
44437 LABEL_NUSES (label) = 1;
44439 emit_move_insn (operand0, res);
44442 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44443 into OPERAND0. */
44444 void
44445 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44447 /* C code for the stuff we expand below.
44448 double xa = fabs (x), x2;
44449 if (!isless (xa, TWO52))
44450 return x;
44451 x2 = (double)(long)x;
44452 Compensate. Floor:
44453 if (x2 > x)
44454 x2 -= 1;
44455 Compensate. Ceil:
44456 if (x2 < x)
44457 x2 += 1;
44458 if (HONOR_SIGNED_ZEROS (mode))
44459 return copysign (x2, x);
44460 return x2;
44462 machine_mode mode = GET_MODE (operand0);
44463 rtx xa, xi, TWO52, tmp, one, res, mask;
44464 rtx_code_label *label;
44466 TWO52 = ix86_gen_TWO52 (mode);
44468 /* Temporary for holding the result, initialized to the input
44469 operand to ease control flow. */
44470 res = gen_reg_rtx (mode);
44471 emit_move_insn (res, operand1);
44473 /* xa = abs (operand1) */
44474 xa = ix86_expand_sse_fabs (res, &mask);
44476 /* if (!isless (xa, TWO52)) goto label; */
44477 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44479 /* xa = (double)(long)x */
44480 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44481 expand_fix (xi, res, 0);
44482 expand_float (xa, xi, 0);
44484 /* generate 1.0 */
44485 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44487 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44488 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44489 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44490 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44491 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44492 emit_move_insn (res, tmp);
44494 if (HONOR_SIGNED_ZEROS (mode))
44495 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44497 emit_label (label);
44498 LABEL_NUSES (label) = 1;
44500 emit_move_insn (operand0, res);
44503 /* Expand SSE sequence for computing round from OPERAND1 storing
44504 into OPERAND0. Sequence that works without relying on DImode truncation
44505 via cvttsd2siq, which is only available on 64-bit targets. */
44506 void
44507 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44509 /* C code for the stuff we expand below.
44510 double xa = fabs (x), xa2, x2;
44511 if (!isless (xa, TWO52))
44512 return x;
44513 Using the absolute value and copying back sign makes
44514 -0.0 -> -0.0 correct.
44515 xa2 = xa + TWO52 - TWO52;
44516 Compensate.
44517 dxa = xa2 - xa;
44518 if (dxa <= -0.5)
44519 xa2 += 1;
44520 else if (dxa > 0.5)
44521 xa2 -= 1;
44522 x2 = copysign (xa2, x);
44523 return x2;
44525 machine_mode mode = GET_MODE (operand0);
44526 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44527 rtx_code_label *label;
44529 TWO52 = ix86_gen_TWO52 (mode);
44531 /* Temporary for holding the result, initialized to the input
44532 operand to ease control flow. */
44533 res = gen_reg_rtx (mode);
44534 emit_move_insn (res, operand1);
44536 /* xa = abs (operand1) */
44537 xa = ix86_expand_sse_fabs (res, &mask);
44539 /* if (!isless (xa, TWO52)) goto label; */
44540 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44542 /* xa2 = xa + TWO52 - TWO52; */
44543 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44544 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44546 /* dxa = xa2 - xa; */
44547 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44549 /* generate 0.5, 1.0 and -0.5 */
44550 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44551 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44552 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44553 0, OPTAB_DIRECT);
44555 /* Compensate. */
44556 tmp = gen_reg_rtx (mode);
44557 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44558 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44559 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44560 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44561 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44562 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44563 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44564 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44566 /* res = copysign (xa2, operand1) */
44567 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44569 emit_label (label);
44570 LABEL_NUSES (label) = 1;
44572 emit_move_insn (operand0, res);
44575 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44576 into OPERAND0. */
44577 void
44578 ix86_expand_trunc (rtx operand0, rtx operand1)
44580 /* C code for SSE variant we expand below.
44581 double xa = fabs (x), x2;
44582 if (!isless (xa, TWO52))
44583 return x;
44584 x2 = (double)(long)x;
44585 if (HONOR_SIGNED_ZEROS (mode))
44586 return copysign (x2, x);
44587 return x2;
44589 machine_mode mode = GET_MODE (operand0);
44590 rtx xa, xi, TWO52, res, mask;
44591 rtx_code_label *label;
44593 TWO52 = ix86_gen_TWO52 (mode);
44595 /* Temporary for holding the result, initialized to the input
44596 operand to ease control flow. */
44597 res = gen_reg_rtx (mode);
44598 emit_move_insn (res, operand1);
44600 /* xa = abs (operand1) */
44601 xa = ix86_expand_sse_fabs (res, &mask);
44603 /* if (!isless (xa, TWO52)) goto label; */
44604 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44606 /* x = (double)(long)x */
44607 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44608 expand_fix (xi, res, 0);
44609 expand_float (res, xi, 0);
44611 if (HONOR_SIGNED_ZEROS (mode))
44612 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44614 emit_label (label);
44615 LABEL_NUSES (label) = 1;
44617 emit_move_insn (operand0, res);
44620 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44621 into OPERAND0. */
44622 void
44623 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44625 machine_mode mode = GET_MODE (operand0);
44626 rtx xa, mask, TWO52, one, res, smask, tmp;
44627 rtx_code_label *label;
44629 /* C code for SSE variant we expand below.
44630 double xa = fabs (x), x2;
44631 if (!isless (xa, TWO52))
44632 return x;
44633 xa2 = xa + TWO52 - TWO52;
44634 Compensate:
44635 if (xa2 > xa)
44636 xa2 -= 1.0;
44637 x2 = copysign (xa2, x);
44638 return x2;
44641 TWO52 = ix86_gen_TWO52 (mode);
44643 /* Temporary for holding the result, initialized to the input
44644 operand to ease control flow. */
44645 res = gen_reg_rtx (mode);
44646 emit_move_insn (res, operand1);
44648 /* xa = abs (operand1) */
44649 xa = ix86_expand_sse_fabs (res, &smask);
44651 /* if (!isless (xa, TWO52)) goto label; */
44652 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44654 /* res = xa + TWO52 - TWO52; */
44655 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44656 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44657 emit_move_insn (res, tmp);
44659 /* generate 1.0 */
44660 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44662 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44663 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44664 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44665 tmp = expand_simple_binop (mode, MINUS,
44666 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44667 emit_move_insn (res, tmp);
44669 /* res = copysign (res, operand1) */
44670 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44672 emit_label (label);
44673 LABEL_NUSES (label) = 1;
44675 emit_move_insn (operand0, res);
44678 /* Expand SSE sequence for computing round from OPERAND1 storing
44679 into OPERAND0. */
44680 void
44681 ix86_expand_round (rtx operand0, rtx operand1)
44683 /* C code for the stuff we're doing below:
44684 double xa = fabs (x);
44685 if (!isless (xa, TWO52))
44686 return x;
44687 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44688 return copysign (xa, x);
44690 machine_mode mode = GET_MODE (operand0);
44691 rtx res, TWO52, xa, xi, half, mask;
44692 rtx_code_label *label;
44693 const struct real_format *fmt;
44694 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44696 /* Temporary for holding the result, initialized to the input
44697 operand to ease control flow. */
44698 res = gen_reg_rtx (mode);
44699 emit_move_insn (res, operand1);
44701 TWO52 = ix86_gen_TWO52 (mode);
44702 xa = ix86_expand_sse_fabs (res, &mask);
44703 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44705 /* load nextafter (0.5, 0.0) */
44706 fmt = REAL_MODE_FORMAT (mode);
44707 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44708 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44710 /* xa = xa + 0.5 */
44711 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44712 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44714 /* xa = (double)(int64_t)xa */
44715 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44716 expand_fix (xi, xa, 0);
44717 expand_float (xa, xi, 0);
44719 /* res = copysign (xa, operand1) */
44720 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44722 emit_label (label);
44723 LABEL_NUSES (label) = 1;
44725 emit_move_insn (operand0, res);
44728 /* Expand SSE sequence for computing round
44729 from OP1 storing into OP0 using sse4 round insn. */
44730 void
44731 ix86_expand_round_sse4 (rtx op0, rtx op1)
44733 machine_mode mode = GET_MODE (op0);
44734 rtx e1, e2, res, half;
44735 const struct real_format *fmt;
44736 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44737 rtx (*gen_copysign) (rtx, rtx, rtx);
44738 rtx (*gen_round) (rtx, rtx, rtx);
44740 switch (mode)
44742 case E_SFmode:
44743 gen_copysign = gen_copysignsf3;
44744 gen_round = gen_sse4_1_roundsf2;
44745 break;
44746 case E_DFmode:
44747 gen_copysign = gen_copysigndf3;
44748 gen_round = gen_sse4_1_rounddf2;
44749 break;
44750 default:
44751 gcc_unreachable ();
44754 /* round (a) = trunc (a + copysign (0.5, a)) */
44756 /* load nextafter (0.5, 0.0) */
44757 fmt = REAL_MODE_FORMAT (mode);
44758 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44759 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44760 half = const_double_from_real_value (pred_half, mode);
44762 /* e1 = copysign (0.5, op1) */
44763 e1 = gen_reg_rtx (mode);
44764 emit_insn (gen_copysign (e1, half, op1));
44766 /* e2 = op1 + e1 */
44767 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44769 /* res = trunc (e2) */
44770 res = gen_reg_rtx (mode);
44771 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44773 emit_move_insn (op0, res);
44777 /* Table of valid machine attributes. */
44778 static const struct attribute_spec ix86_attribute_table[] =
44780 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
44781 affects_type_identity, handler, exclude } */
44782 /* Stdcall attribute says callee is responsible for popping arguments
44783 if they are not variable. */
44784 { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44785 NULL },
44786 /* Fastcall attribute says callee is responsible for popping arguments
44787 if they are not variable. */
44788 { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44789 NULL },
44790 /* Thiscall attribute says callee is responsible for popping arguments
44791 if they are not variable. */
44792 { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44793 NULL },
44794 /* Cdecl attribute says the callee is a normal C declaration */
44795 { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44796 NULL },
44797 /* Regparm attribute specifies how many integer arguments are to be
44798 passed in registers. */
44799 { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
44800 NULL },
44801 /* Sseregparm attribute says we are using x86_64 calling conventions
44802 for FP arguments. */
44803 { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
44804 NULL },
44805 /* The transactional memory builtins are implicitly regparm or fastcall
44806 depending on the ABI. Override the generic do-nothing attribute that
44807 these builtins were declared with. */
44808 { "*tm regparm", 0, 0, false, true, true, true,
44809 ix86_handle_tm_regparm_attribute, NULL },
44810 /* force_align_arg_pointer says this function realigns the stack at entry. */
44811 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44812 false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
44813 NULL },
44814 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44815 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
44816 NULL },
44817 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
44818 NULL },
44819 { "shared", 0, 0, true, false, false, false,
44820 ix86_handle_shared_attribute, NULL },
44821 #endif
44822 { "ms_struct", 0, 0, false, false, false, false,
44823 ix86_handle_struct_attribute, NULL },
44824 { "gcc_struct", 0, 0, false, false, false, false,
44825 ix86_handle_struct_attribute, NULL },
44826 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44827 SUBTARGET_ATTRIBUTE_TABLE,
44828 #endif
44829 /* ms_abi and sysv_abi calling convention function attributes. */
44830 { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
44831 { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
44832 NULL },
44833 { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
44834 { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
44835 { "ms_hook_prologue", 0, 0, true, false, false, false,
44836 ix86_handle_fndecl_attribute, NULL },
44837 { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
44838 ix86_handle_callee_pop_aggregate_return, NULL },
44839 { "interrupt", 0, 0, false, true, true, false,
44840 ix86_handle_interrupt_attribute, NULL },
44841 { "no_caller_saved_registers", 0, 0, false, true, true, false,
44842 ix86_handle_no_caller_saved_registers_attribute, NULL },
44843 { "naked", 0, 0, true, false, false, false,
44844 ix86_handle_fndecl_attribute, NULL },
44846 /* End element. */
44847 { NULL, 0, 0, false, false, false, false, NULL, NULL }
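/* Usage sketch, not part of the table above: these attributes appear on
   declarations, e.g.

     int  __attribute__ ((regparm (3))) f (int a, int b, int c);
     int  __attribute__ ((fastcall))    g (int a, int b);
     void __attribute__ ((ms_abi))      h (void);

   and the handler named in each entry (ix86_handle_cconv_attribute,
   ix86_handle_abi_attribute, ...) validates the arguments and rejects
   conflicting combinations.  */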
44850 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44851 static int
44852 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44853 tree vectype, int)
44855 bool fp = false;
44856 machine_mode mode = TImode;
44857 int index;
44858 if (vectype != NULL)
44860 fp = FLOAT_TYPE_P (vectype);
44861 mode = TYPE_MODE (vectype);
44864 switch (type_of_cost)
44866 case scalar_stmt:
44867 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
44869 case scalar_load:
44870 /* Load/store costs are relative to a register move, which costs 2. Recompute
44871 them as COSTS_N_INSNS so everything has the same base. */
44872 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
44873 : ix86_cost->int_load [2]) / 2;
44875 case scalar_store:
44876 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
44877 : ix86_cost->int_store [2]) / 2;
44879 case vector_stmt:
44880 return ix86_vec_cost (mode,
44881 fp ? ix86_cost->addss : ix86_cost->sse_op,
44882 true);
44884 case vector_load:
44885 index = sse_store_index (mode);
44886 /* See PR82713 - we may end up being called on a non-vector type. */
44887 if (index < 0)
44888 index = 2;
44889 return ix86_vec_cost (mode,
44890 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
44891 true);
44893 case vector_store:
44894 index = sse_store_index (mode);
44895 /* See PR82713 - we may end up being called on a non-vector type. */
44896 if (index < 0)
44897 index = 2;
44898 return ix86_vec_cost (mode,
44899 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
44900 true);
44902 case vec_to_scalar:
44903 case scalar_to_vec:
44904 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
44906 /* We should have separate costs for unaligned loads and gather/scatter.
44907 Do that incrementally. */
44908 case unaligned_load:
44909 index = sse_store_index (mode);
44910 /* See PR82713 - we may end up being called on a non-vector type. */
44911 if (index < 0)
44912 index = 2;
44913 return ix86_vec_cost (mode,
44914 COSTS_N_INSNS
44915 (ix86_cost->sse_unaligned_load[index]) / 2,
44916 true);
44918 case unaligned_store:
44919 index = sse_store_index (mode);
44920 /* See PR82713 - we may end up being called on a non-vector type. */
44921 if (index < 0)
44922 index = 2;
44923 return ix86_vec_cost (mode,
44924 COSTS_N_INSNS
44925 (ix86_cost->sse_unaligned_store[index]) / 2,
44926 true);
44928 case vector_gather_load:
44929 return ix86_vec_cost (mode,
44930 COSTS_N_INSNS
44931 (ix86_cost->gather_static
44932 + ix86_cost->gather_per_elt
44933 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44934 true);
44936 case vector_scatter_store:
44937 return ix86_vec_cost (mode,
44938 COSTS_N_INSNS
44939 (ix86_cost->scatter_static
44940 + ix86_cost->scatter_per_elt
44941 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44942 true);
44944 case cond_branch_taken:
44945 return ix86_cost->cond_taken_branch_cost;
44947 case cond_branch_not_taken:
44948 return ix86_cost->cond_not_taken_branch_cost;
44950 case vec_perm:
44951 case vec_promote_demote:
44952 return ix86_vec_cost (mode,
44953 ix86_cost->sse_op, true);
44955 case vec_construct:
44956 return ix86_vec_cost (mode, ix86_cost->sse_op, false);
44958 default:
44959 gcc_unreachable ();
44963 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44964 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44965 insn every time. */
44967 static GTY(()) rtx_insn *vselect_insn;
44969 /* Initialize vselect_insn. */
44971 static void
44972 init_vselect_insn (void)
44974 unsigned i;
44975 rtx x;
44977 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44978 for (i = 0; i < MAX_VECT_LEN; ++i)
44979 XVECEXP (x, 0, i) = const0_rtx;
44980 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44981 const0_rtx), x);
44982 x = gen_rtx_SET (const0_rtx, x);
44983 start_sequence ();
44984 vselect_insn = emit_insn (x);
44985 end_sequence ();
44988 /* Construct (set target (vec_select op0 (parallel perm))) and
44989 return true if that's a valid instruction in the active ISA. */
44991 static bool
44992 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44993 unsigned nelt, bool testing_p)
44995 unsigned int i;
44996 rtx x, save_vconcat;
44997 int icode;
44999 if (vselect_insn == NULL_RTX)
45000 init_vselect_insn ();
45002 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
45003 PUT_NUM_ELEM (XVEC (x, 0), nelt);
45004 for (i = 0; i < nelt; ++i)
45005 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
45006 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45007 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
45008 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
45009 SET_DEST (PATTERN (vselect_insn)) = target;
45010 icode = recog_memoized (vselect_insn);
45012 if (icode >= 0 && !testing_p)
45013 emit_insn (copy_rtx (PATTERN (vselect_insn)));
45015 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
45016 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
45017 INSN_CODE (vselect_insn) = -1;
45019 return icode >= 0;
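/* Illustrative sketch: for a V4SImode target and perm = {2, 3, 0, 1} the
   insn rewritten above is

     (set (reg:V4SI target)
          (vec_select:V4SI (reg:V4SI op0)
                           (parallel [(const_int 2) (const_int 3)
                                      (const_int 0) (const_int 1)])))

   and recog_memoized decides whether some existing pattern (here e.g. the
   one emitting pshufd) accepts it.  */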
45022 /* Similar, but generate a vec_concat from op0 and op1 as well. */
45024 static bool
45025 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
45026 const unsigned char *perm, unsigned nelt,
45027 bool testing_p)
45029 machine_mode v2mode;
45030 rtx x;
45031 bool ok;
45033 if (vselect_insn == NULL_RTX)
45034 init_vselect_insn ();
45036 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
45037 return false;
45038 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45039 PUT_MODE (x, v2mode);
45040 XEXP (x, 0) = op0;
45041 XEXP (x, 1) = op1;
45042 ok = expand_vselect (target, x, perm, nelt, testing_p);
45043 XEXP (x, 0) = const0_rtx;
45044 XEXP (x, 1) = const0_rtx;
45045 return ok;
45048 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45049 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
45051 static bool
45052 expand_vec_perm_blend (struct expand_vec_perm_d *d)
45054 machine_mode mmode, vmode = d->vmode;
45055 unsigned i, mask, nelt = d->nelt;
45056 rtx target, op0, op1, maskop, x;
45057 rtx rperm[32], vperm;
45059 if (d->one_operand_p)
45060 return false;
45061 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
45062 && (TARGET_AVX512BW
45063 || GET_MODE_UNIT_SIZE (vmode) >= 4))
45065 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45067 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45069 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45071 else
45072 return false;
45074 /* This is a blend, not a permute. Elements must stay in their
45075 respective lanes. */
45076 for (i = 0; i < nelt; ++i)
45078 unsigned e = d->perm[i];
45079 if (!(e == i || e == i + nelt))
45080 return false;
45083 if (d->testing_p)
45084 return true;
45086 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45087 decision should be extracted elsewhere, so that we only try that
45088 sequence once all budget==3 options have been tried. */
45089 target = d->target;
45090 op0 = d->op0;
45091 op1 = d->op1;
45092 mask = 0;
45094 switch (vmode)
45096 case E_V8DFmode:
45097 case E_V16SFmode:
45098 case E_V4DFmode:
45099 case E_V8SFmode:
45100 case E_V2DFmode:
45101 case E_V4SFmode:
45102 case E_V8HImode:
45103 case E_V8SImode:
45104 case E_V32HImode:
45105 case E_V64QImode:
45106 case E_V16SImode:
45107 case E_V8DImode:
45108 for (i = 0; i < nelt; ++i)
45109 mask |= (d->perm[i] >= nelt) << i;
45110 break;
45112 case E_V2DImode:
45113 for (i = 0; i < 2; ++i)
45114 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45115 vmode = V8HImode;
45116 goto do_subreg;
45118 case E_V4SImode:
45119 for (i = 0; i < 4; ++i)
45120 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45121 vmode = V8HImode;
45122 goto do_subreg;
45124 case E_V16QImode:
45125 /* See if bytes move in pairs so we can use pblendw with
45126 an immediate argument, rather than pblendvb with a vector
45127 argument. */
45128 for (i = 0; i < 16; i += 2)
45129 if (d->perm[i] + 1 != d->perm[i + 1])
45131 use_pblendvb:
45132 for (i = 0; i < nelt; ++i)
45133 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45135 finish_pblendvb:
45136 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45137 vperm = force_reg (vmode, vperm);
45139 if (GET_MODE_SIZE (vmode) == 16)
45140 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45141 else
45142 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45143 if (target != d->target)
45144 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45145 return true;
45148 for (i = 0; i < 8; ++i)
45149 mask |= (d->perm[i * 2] >= 16) << i;
45150 vmode = V8HImode;
45151 /* FALLTHRU */
45153 do_subreg:
45154 target = gen_reg_rtx (vmode);
45155 op0 = gen_lowpart (vmode, op0);
45156 op1 = gen_lowpart (vmode, op1);
45157 break;
45159 case E_V32QImode:
45160 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45161 for (i = 0; i < 32; i += 2)
45162 if (d->perm[i] + 1 != d->perm[i + 1])
45163 goto use_pblendvb;
45164 /* See if bytes move in quadruplets. If yes, vpblendd
45165 with immediate can be used. */
45166 for (i = 0; i < 32; i += 4)
45167 if (d->perm[i] + 2 != d->perm[i + 2])
45168 break;
45169 if (i < 32)
45171 /* See if bytes move the same in both lanes. If yes,
45172 vpblendw with immediate can be used. */
45173 for (i = 0; i < 16; i += 2)
45174 if (d->perm[i] + 16 != d->perm[i + 16])
45175 goto use_pblendvb;
45177 /* Use vpblendw. */
45178 for (i = 0; i < 16; ++i)
45179 mask |= (d->perm[i * 2] >= 32) << i;
45180 vmode = V16HImode;
45181 goto do_subreg;
45184 /* Use vpblendd. */
45185 for (i = 0; i < 8; ++i)
45186 mask |= (d->perm[i * 4] >= 32) << i;
45187 vmode = V8SImode;
45188 goto do_subreg;
45190 case E_V16HImode:
45191 /* See if words move in pairs. If yes, vpblendd can be used. */
45192 for (i = 0; i < 16; i += 2)
45193 if (d->perm[i] + 1 != d->perm[i + 1])
45194 break;
45195 if (i < 16)
45197 /* See if words move the same in both lanes. If not,
45198 vpblendvb must be used. */
45199 for (i = 0; i < 8; i++)
45200 if (d->perm[i] + 8 != d->perm[i + 8])
45202 /* Use vpblendvb. */
45203 for (i = 0; i < 32; ++i)
45204 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45206 vmode = V32QImode;
45207 nelt = 32;
45208 target = gen_reg_rtx (vmode);
45209 op0 = gen_lowpart (vmode, op0);
45210 op1 = gen_lowpart (vmode, op1);
45211 goto finish_pblendvb;
45214 /* Use vpblendw. */
45215 for (i = 0; i < 16; ++i)
45216 mask |= (d->perm[i] >= 16) << i;
45217 break;
45220 /* Use vpblendd. */
45221 for (i = 0; i < 8; ++i)
45222 mask |= (d->perm[i * 2] >= 16) << i;
45223 vmode = V8SImode;
45224 goto do_subreg;
45226 case E_V4DImode:
45227 /* Use vpblendd. */
45228 for (i = 0; i < 4; ++i)
45229 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45230 vmode = V8SImode;
45231 goto do_subreg;
45233 default:
45234 gcc_unreachable ();
45237 switch (vmode)
45239 case E_V8DFmode:
45240 case E_V8DImode:
45241 mmode = QImode;
45242 break;
45243 case E_V16SFmode:
45244 case E_V16SImode:
45245 mmode = HImode;
45246 break;
45247 case E_V32HImode:
45248 mmode = SImode;
45249 break;
45250 case E_V64QImode:
45251 mmode = DImode;
45252 break;
45253 default:
45254 mmode = VOIDmode;
45257 if (mmode != VOIDmode)
45258 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45259 else
45260 maskop = GEN_INT (mask);
45262 /* This matches five different patterns with the different modes. */
45263 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45264 x = gen_rtx_SET (target, x);
45265 emit_insn (x);
45266 if (target != d->target)
45267 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45269 return true;
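/* Worked mask example (illustrative): for V4SFmode with perm = {0, 5, 2, 7},
   elements 1 and 3 come from op1 (indices >= nelt == 4), so the loop above
   builds mask = 0b1010; the emitted vec_merge selects lanes 1 and 3 from op1
   and corresponds to an SSE4.1 blendps with immediate 10.  */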
45272 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45273 in terms of the variable form of vpermilps.
45275 Note that we will have already failed the immediate input vpermilps,
45276 which requires that the high and low part shuffle be identical; the
45277 variable form doesn't require that. */
45279 static bool
45280 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45282 rtx rperm[8], vperm;
45283 unsigned i;
45285 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45286 return false;
45288 /* We can only permute within the 128-bit lane. */
45289 for (i = 0; i < 8; ++i)
45291 unsigned e = d->perm[i];
45292 if (i < 4 ? e >= 4 : e < 4)
45293 return false;
45296 if (d->testing_p)
45297 return true;
45299 for (i = 0; i < 8; ++i)
45301 unsigned e = d->perm[i];
45303 /* Within each 128-bit lane, the elements of op0 are numbered
45304 from 0 and the elements of op1 are numbered from 4. */
45305 if (e >= 8 + 4)
45306 e -= 8;
45307 else if (e >= 4)
45308 e -= 4;
45310 rperm[i] = GEN_INT (e);
45313 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45314 vperm = force_reg (V8SImode, vperm);
45315 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45317 return true;
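/* Illustrative example: a one-operand V8SFmode permutation that reverses
   each 128-bit lane, perm = {3, 2, 1, 0, 7, 6, 5, 4}, passes the lane check
   and the loop above builds the control vector {3, 2, 1, 0, 3, 2, 1, 0}
   for vpermilps.  */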
45320 /* Return true if permutation D can be performed as VMODE permutation
45321 instead. */
45323 static bool
45324 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45326 unsigned int i, j, chunk;
45328 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45329 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45330 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45331 return false;
45333 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45334 return true;
45336 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45337 for (i = 0; i < d->nelt; i += chunk)
45338 if (d->perm[i] & (chunk - 1))
45339 return false;
45340 else
45341 for (j = 1; j < chunk; ++j)
45342 if (d->perm[i] + j != d->perm[i + j])
45343 return false;
45345 return true;
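/* Illustrative example: a V16QImode permutation whose bytes move in aligned
   pairs, say {2, 3, 0, 1, 6, 7, 4, 5, ...}, passes the chunk test above with
   chunk == 2 and can therefore be carried out as the V8HImode permutation
   {1, 0, 3, 2, ...}.  */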
45348 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45349 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45351 static bool
45352 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45354 unsigned i, nelt, eltsz, mask;
45355 unsigned char perm[64];
45356 machine_mode vmode = V16QImode;
45357 rtx rperm[64], vperm, target, op0, op1;
45359 nelt = d->nelt;
45361 if (!d->one_operand_p)
45363 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45365 if (TARGET_AVX2
45366 && valid_perm_using_mode_p (V2TImode, d))
45368 if (d->testing_p)
45369 return true;
45371 /* Use vperm2i128 insn. The pattern uses
45372 V4DImode instead of V2TImode. */
45373 target = d->target;
45374 if (d->vmode != V4DImode)
45375 target = gen_reg_rtx (V4DImode);
45376 op0 = gen_lowpart (V4DImode, d->op0);
45377 op1 = gen_lowpart (V4DImode, d->op1);
45378 rperm[0]
45379 = GEN_INT ((d->perm[0] / (nelt / 2))
45380 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45381 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45382 if (target != d->target)
45383 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45384 return true;
45386 return false;
45389 else
45391 if (GET_MODE_SIZE (d->vmode) == 16)
45393 if (!TARGET_SSSE3)
45394 return false;
45396 else if (GET_MODE_SIZE (d->vmode) == 32)
45398 if (!TARGET_AVX2)
45399 return false;
45401 /* V4DImode should already be handled through
45402 expand_vselect by the vpermq instruction. */
45403 gcc_assert (d->vmode != V4DImode);
45405 vmode = V32QImode;
45406 if (d->vmode == V8SImode
45407 || d->vmode == V16HImode
45408 || d->vmode == V32QImode)
45410 /* First see if vpermq can be used for
45411 V8SImode/V16HImode/V32QImode. */
45412 if (valid_perm_using_mode_p (V4DImode, d))
45414 for (i = 0; i < 4; i++)
45415 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45416 if (d->testing_p)
45417 return true;
45418 target = gen_reg_rtx (V4DImode);
45419 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45420 perm, 4, false))
45422 emit_move_insn (d->target,
45423 gen_lowpart (d->vmode, target));
45424 return true;
45426 return false;
45429 /* Next see if vpermd can be used. */
45430 if (valid_perm_using_mode_p (V8SImode, d))
45431 vmode = V8SImode;
45433 /* Or if vpermps can be used. */
45434 else if (d->vmode == V8SFmode)
45435 vmode = V8SImode;
45437 if (vmode == V32QImode)
45439 /* vpshufb only works within lanes; it is not
45440 possible to shuffle bytes between the lanes. */
45441 for (i = 0; i < nelt; ++i)
45442 if ((d->perm[i] ^ i) & (nelt / 2))
45443 return false;
45446 else if (GET_MODE_SIZE (d->vmode) == 64)
45448 if (!TARGET_AVX512BW)
45449 return false;
45451 /* If vpermq didn't work, vpshufb won't work either. */
45452 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45453 return false;
45455 vmode = V64QImode;
45456 if (d->vmode == V16SImode
45457 || d->vmode == V32HImode
45458 || d->vmode == V64QImode)
45460 /* First see if vpermq can be used for
45461 V16SImode/V32HImode/V64QImode. */
45462 if (valid_perm_using_mode_p (V8DImode, d))
45464 for (i = 0; i < 8; i++)
45465 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45466 if (d->testing_p)
45467 return true;
45468 target = gen_reg_rtx (V8DImode);
45469 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45470 perm, 8, false))
45472 emit_move_insn (d->target,
45473 gen_lowpart (d->vmode, target));
45474 return true;
45476 return false;
45479 /* Next see if vpermd can be used. */
45480 if (valid_perm_using_mode_p (V16SImode, d))
45481 vmode = V16SImode;
45483 /* Or if vpermps can be used. */
45484 else if (d->vmode == V16SFmode)
45485 vmode = V16SImode;
45486 if (vmode == V64QImode)
45488 /* vpshufb only works within lanes; it is not
45489 possible to shuffle bytes between the lanes. */
45490 for (i = 0; i < nelt; ++i)
45491 if ((d->perm[i] ^ i) & (nelt / 4))
45492 return false;
45495 else
45496 return false;
45499 if (d->testing_p)
45500 return true;
45502 if (vmode == V8SImode)
45503 for (i = 0; i < 8; ++i)
45504 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45505 else if (vmode == V16SImode)
45506 for (i = 0; i < 16; ++i)
45507 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45508 else
45510 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
45511 if (!d->one_operand_p)
45512 mask = 2 * nelt - 1;
45513 else if (vmode == V16QImode)
45514 mask = nelt - 1;
45515 else if (vmode == V64QImode)
45516 mask = nelt / 4 - 1;
45517 else
45518 mask = nelt / 2 - 1;
45520 for (i = 0; i < nelt; ++i)
45522 unsigned j, e = d->perm[i] & mask;
45523 for (j = 0; j < eltsz; ++j)
45524 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45528 vperm = gen_rtx_CONST_VECTOR (vmode,
45529 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45530 vperm = force_reg (vmode, vperm);
45532 target = d->target;
45533 if (d->vmode != vmode)
45534 target = gen_reg_rtx (vmode);
45535 op0 = gen_lowpart (vmode, d->op0);
45536 if (d->one_operand_p)
45538 if (vmode == V16QImode)
45539 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45540 else if (vmode == V32QImode)
45541 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45542 else if (vmode == V64QImode)
45543 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45544 else if (vmode == V8SFmode)
45545 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45546 else if (vmode == V8SImode)
45547 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45548 else if (vmode == V16SFmode)
45549 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45550 else if (vmode == V16SImode)
45551 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45552 else
45553 gcc_unreachable ();
45555 else
45557 op1 = gen_lowpart (vmode, d->op1);
45558 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45560 if (target != d->target)
45561 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45563 return true;
45566 /* For V*[QHS]Imode permutations, check whether the same permutation
45567 can be performed in a 2x, 4x or 8x wider inner mode. */
45569 static bool
45570 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45571 struct expand_vec_perm_d *nd)
45573 int i;
45574 machine_mode mode = VOIDmode;
45576 switch (d->vmode)
45578 case E_V16QImode: mode = V8HImode; break;
45579 case E_V32QImode: mode = V16HImode; break;
45580 case E_V64QImode: mode = V32HImode; break;
45581 case E_V8HImode: mode = V4SImode; break;
45582 case E_V16HImode: mode = V8SImode; break;
45583 case E_V32HImode: mode = V16SImode; break;
45584 case E_V4SImode: mode = V2DImode; break;
45585 case E_V8SImode: mode = V4DImode; break;
45586 case E_V16SImode: mode = V8DImode; break;
45587 default: return false;
45589 for (i = 0; i < d->nelt; i += 2)
45590 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45591 return false;
45592 nd->vmode = mode;
45593 nd->nelt = d->nelt / 2;
45594 for (i = 0; i < nd->nelt; i++)
45595 nd->perm[i] = d->perm[2 * i] / 2;
45596 if (GET_MODE_INNER (mode) != DImode)
45597 canonicalize_vector_int_perm (nd, nd);
45598 if (nd != d)
45600 nd->one_operand_p = d->one_operand_p;
45601 nd->testing_p = d->testing_p;
45602 if (d->op0 == d->op1)
45603 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45604 else
45606 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45607 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45609 if (d->testing_p)
45610 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45611 else
45612 nd->target = gen_reg_rtx (nd->vmode);
45614 return true;
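/* Illustrative example: the V16QImode permutation
   {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} moves bytes in
   aligned pairs, so ND becomes the V8HImode permutation
   {1, 0, 3, 2, 5, 4, 7, 6}; the recursive call then sees that the halfwords
   do not move in pairs and canonicalization stops at V8HImode.  */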
45617 /* Try to expand one-operand permutation with constant mask. */
45619 static bool
45620 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45622 machine_mode mode = GET_MODE (d->op0);
45623 machine_mode maskmode = mode;
45624 rtx (*gen) (rtx, rtx, rtx) = NULL;
45625 rtx target, op0, mask;
45626 rtx vec[64];
45628 if (!rtx_equal_p (d->op0, d->op1))
45629 return false;
45631 if (!TARGET_AVX512F)
45632 return false;
45634 switch (mode)
45636 case E_V16SImode:
45637 gen = gen_avx512f_permvarv16si;
45638 break;
45639 case E_V16SFmode:
45640 gen = gen_avx512f_permvarv16sf;
45641 maskmode = V16SImode;
45642 break;
45643 case E_V8DImode:
45644 gen = gen_avx512f_permvarv8di;
45645 break;
45646 case E_V8DFmode:
45647 gen = gen_avx512f_permvarv8df;
45648 maskmode = V8DImode;
45649 break;
45650 default:
45651 return false;
45654 target = d->target;
45655 op0 = d->op0;
45656 for (int i = 0; i < d->nelt; ++i)
45657 vec[i] = GEN_INT (d->perm[i]);
45658 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45659 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45660 return true;
45663 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45664 in a single instruction. */
45666 static bool
45667 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45669 unsigned i, nelt = d->nelt;
45670 struct expand_vec_perm_d nd;
45672 /* Check plain VEC_SELECT first, because AVX has instructions that could
45673 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45674 input where SEL+CONCAT may not. */
45675 if (d->one_operand_p)
45677 int mask = nelt - 1;
45678 bool identity_perm = true;
45679 bool broadcast_perm = true;
45681 for (i = 0; i < nelt; i++)
45683 nd.perm[i] = d->perm[i] & mask;
45684 if (nd.perm[i] != i)
45685 identity_perm = false;
45686 if (nd.perm[i])
45687 broadcast_perm = false;
45690 if (identity_perm)
45692 if (!d->testing_p)
45693 emit_move_insn (d->target, d->op0);
45694 return true;
45696 else if (broadcast_perm && TARGET_AVX2)
45698 /* Use vpbroadcast{b,w,d}. */
45699 rtx (*gen) (rtx, rtx) = NULL;
45700 switch (d->vmode)
45702 case E_V64QImode:
45703 if (TARGET_AVX512BW)
45704 gen = gen_avx512bw_vec_dupv64qi_1;
45705 break;
45706 case E_V32QImode:
45707 gen = gen_avx2_pbroadcastv32qi_1;
45708 break;
45709 case E_V32HImode:
45710 if (TARGET_AVX512BW)
45711 gen = gen_avx512bw_vec_dupv32hi_1;
45712 break;
45713 case E_V16HImode:
45714 gen = gen_avx2_pbroadcastv16hi_1;
45715 break;
45716 case E_V16SImode:
45717 if (TARGET_AVX512F)
45718 gen = gen_avx512f_vec_dupv16si_1;
45719 break;
45720 case E_V8SImode:
45721 gen = gen_avx2_pbroadcastv8si_1;
45722 break;
45723 case E_V16QImode:
45724 gen = gen_avx2_pbroadcastv16qi;
45725 break;
45726 case E_V8HImode:
45727 gen = gen_avx2_pbroadcastv8hi;
45728 break;
45729 case E_V16SFmode:
45730 if (TARGET_AVX512F)
45731 gen = gen_avx512f_vec_dupv16sf_1;
45732 break;
45733 case E_V8SFmode:
45734 gen = gen_avx2_vec_dupv8sf_1;
45735 break;
45736 case E_V8DFmode:
45737 if (TARGET_AVX512F)
45738 gen = gen_avx512f_vec_dupv8df_1;
45739 break;
45740 case E_V8DImode:
45741 if (TARGET_AVX512F)
45742 gen = gen_avx512f_vec_dupv8di_1;
45743 break;
45744 /* For other modes prefer other shuffles this function creates. */
45745 default: break;
45747 if (gen != NULL)
45749 if (!d->testing_p)
45750 emit_insn (gen (d->target, d->op0));
45751 return true;
45755 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45756 return true;
45758 /* There are plenty of patterns in sse.md that are written for
45759 SEL+CONCAT and are not replicated for a single op. Perhaps
45760 that should be changed, to avoid the nastiness here. */
45762 /* Recognize interleave style patterns, which means incrementing
45763 every other permutation operand. */
45764 for (i = 0; i < nelt; i += 2)
45766 nd.perm[i] = d->perm[i] & mask;
45767 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45769 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45770 d->testing_p))
45771 return true;
45773 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45774 if (nelt >= 4)
45776 for (i = 0; i < nelt; i += 4)
45778 nd.perm[i + 0] = d->perm[i + 0] & mask;
45779 nd.perm[i + 1] = d->perm[i + 1] & mask;
45780 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45781 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45784 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45785 d->testing_p))
45786 return true;
45790 /* Finally, try the fully general two operand permute. */
45791 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45792 d->testing_p))
45793 return true;
45795 /* Recognize interleave style patterns with reversed operands. */
45796 if (!d->one_operand_p)
45798 for (i = 0; i < nelt; ++i)
45800 unsigned e = d->perm[i];
45801 if (e >= nelt)
45802 e -= nelt;
45803 else
45804 e += nelt;
45805 nd.perm[i] = e;
45808 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45809 d->testing_p))
45810 return true;
45813 /* Try the SSE4.1 blend variable merge instructions. */
45814 if (expand_vec_perm_blend (d))
45815 return true;
45817 /* Try one of the AVX vpermil variable permutations. */
45818 if (expand_vec_perm_vpermil (d))
45819 return true;
45821 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45822 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45823 if (expand_vec_perm_pshufb (d))
45824 return true;
45826 /* Try the AVX2 vpalignr instruction. */
45827 if (expand_vec_perm_palignr (d, true))
45828 return true;
45830 /* Try the AVX512F vperm{s,d} instructions. */
45831 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45832 return true;
45834 /* Try the AVX512F vpermt2/vpermi2 instructions. */
45835 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45836 return true;
45838 /* See if we can get the same permutation in different vector integer
45839 mode. */
45840 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45842 if (!d->testing_p)
45843 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45844 return true;
45846 return false;
45849 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45850 in terms of a pair of pshuflw + pshufhw instructions. */
45852 static bool
45853 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45855 unsigned char perm2[MAX_VECT_LEN];
45856 unsigned i;
45857 bool ok;
45859 if (d->vmode != V8HImode || !d->one_operand_p)
45860 return false;
45862 /* The two permutations only operate in 64-bit lanes. */
45863 for (i = 0; i < 4; ++i)
45864 if (d->perm[i] >= 4)
45865 return false;
45866 for (i = 4; i < 8; ++i)
45867 if (d->perm[i] < 4)
45868 return false;
45870 if (d->testing_p)
45871 return true;
45873 /* Emit the pshuflw. */
45874 memcpy (perm2, d->perm, 4);
45875 for (i = 4; i < 8; ++i)
45876 perm2[i] = i;
45877 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45878 gcc_assert (ok);
45880 /* Emit the pshufhw. */
45881 memcpy (perm2 + 4, d->perm + 4, 4);
45882 for (i = 0; i < 4; ++i)
45883 perm2[i] = i;
45884 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45885 gcc_assert (ok);
45887 return true;
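/* Worked example (illustrative): for the V8HImode permutation
   {2, 3, 0, 1, 5, 4, 7, 6} the code above first emits the pshuflw step with
   perm2 = {2, 3, 0, 1, 4, 5, 6, 7} and then the pshufhw step with
   perm2 = {0, 1, 2, 3, 5, 4, 7, 6}, leaving the combined result in
   d->target.  */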
45890 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45891 the permutation using the SSSE3 palignr instruction. This succeeds
45892 when all of the elements in PERM fit within one vector and we merely
45893 need to shift them down so that a single vector permutation has a
45894 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
45895 the vpalignr instruction itself can perform the requested permutation. */
45897 static bool
45898 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45900 unsigned i, nelt = d->nelt;
45901 unsigned min, max, minswap, maxswap;
45902 bool in_order, ok, swap = false;
45903 rtx shift, target;
45904 struct expand_vec_perm_d dcopy;
45906 /* Even with AVX, palignr only operates on 128-bit vectors;
45907 with AVX2, palignr operates on both 128-bit lanes. */
45908 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45909 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45910 return false;
45912 min = 2 * nelt;
45913 max = 0;
45914 minswap = 2 * nelt;
45915 maxswap = 0;
45916 for (i = 0; i < nelt; ++i)
45918 unsigned e = d->perm[i];
45919 unsigned eswap = d->perm[i] ^ nelt;
45920 if (GET_MODE_SIZE (d->vmode) == 32)
45922 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45923 eswap = e ^ (nelt / 2);
45925 if (e < min)
45926 min = e;
45927 if (e > max)
45928 max = e;
45929 if (eswap < minswap)
45930 minswap = eswap;
45931 if (eswap > maxswap)
45932 maxswap = eswap;
45934 if (min == 0
45935 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45937 if (d->one_operand_p
45938 || minswap == 0
45939 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45940 ? nelt / 2 : nelt))
45941 return false;
45942 swap = true;
45943 min = minswap;
45944 max = maxswap;
45947 /* Given that we have SSSE3, we know we'll be able to implement the
45948 single operand permutation after the palignr with pshufb for
45949 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45950 first. */
45951 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45952 return true;
45954 dcopy = *d;
45955 if (swap)
45957 dcopy.op0 = d->op1;
45958 dcopy.op1 = d->op0;
45959 for (i = 0; i < nelt; ++i)
45960 dcopy.perm[i] ^= nelt;
45963 in_order = true;
45964 for (i = 0; i < nelt; ++i)
45966 unsigned e = dcopy.perm[i];
45967 if (GET_MODE_SIZE (d->vmode) == 32
45968 && e >= nelt
45969 && (e & (nelt / 2 - 1)) < min)
45970 e = e - min - (nelt / 2);
45971 else
45972 e = e - min;
45973 if (e != i)
45974 in_order = false;
45975 dcopy.perm[i] = e;
45977 dcopy.one_operand_p = true;
45979 if (single_insn_only_p && !in_order)
45980 return false;
45982 /* For AVX2, test whether we can permute the result in one instruction. */
45983 if (d->testing_p)
45985 if (in_order)
45986 return true;
45987 dcopy.op1 = dcopy.op0;
45988 return expand_vec_perm_1 (&dcopy);
45991 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45992 if (GET_MODE_SIZE (d->vmode) == 16)
45994 target = gen_reg_rtx (TImode);
45995 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45996 gen_lowpart (TImode, dcopy.op0), shift));
45998 else
46000 target = gen_reg_rtx (V2TImode);
46001 emit_insn (gen_avx2_palignrv2ti (target,
46002 gen_lowpart (V2TImode, dcopy.op1),
46003 gen_lowpart (V2TImode, dcopy.op0),
46004 shift));
46007 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
46009 /* Test for the degenerate case where the alignment by itself
46010 produces the desired permutation. */
46011 if (in_order)
46013 emit_move_insn (d->target, dcopy.op0);
46014 return true;
46017 ok = expand_vec_perm_1 (&dcopy);
46018 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
46020 return ok;
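/* Worked example (illustrative): a two-operand V16QImode permutation that
   selects elements {4, 5, ..., 19} of the concatenation (op0 elements are
   0-15, op1 elements are 16-31) has min == 4 and max == 19, so
   max - min < nelt; palignr shifts the pair down by 4 bytes, the remapped
   permutation becomes the identity {0, 1, ..., 15}, in_order is true and
   the alignment alone produces the result.  */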
46023 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46024 the permutation using the SSE4_1 pblendv instruction. Potentially
46025 reduces the permutation from 2 pshufb insns and an or to 1 pshufb and a pblendv. */
46027 static bool
46028 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
46030 unsigned i, which, nelt = d->nelt;
46031 struct expand_vec_perm_d dcopy, dcopy1;
46032 machine_mode vmode = d->vmode;
46033 bool ok;
46035 /* Use the same checks as in expand_vec_perm_blend. */
46036 if (d->one_operand_p)
46037 return false;
46038 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46040 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46042 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46044 else
46045 return false;
46047 /* Figure out which permutation elements do not stay in their
46048 respective lanes. */
46049 for (i = 0, which = 0; i < nelt; ++i)
46051 unsigned e = d->perm[i];
46052 if (e != i)
46053 which |= (e < nelt ? 1 : 2);
46055 /* We can pblend the part where elements do not stay in their
46056 respective lanes only when these elements all come from one
46057 half of the permutation.
46058 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
46059 lanes, but both 8 and 9 are >= 8.
46060 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
46061 respective lanes, and 8 >= 8 but 2 is not. */
46062 if (which != 1 && which != 2)
46063 return false;
46064 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
46065 return true;
46067 /* First we apply one operand permutation to the part where
46068 elements stay not in their respective lanes. */
46069 dcopy = *d;
46070 if (which == 2)
46071 dcopy.op0 = dcopy.op1 = d->op1;
46072 else
46073 dcopy.op0 = dcopy.op1 = d->op0;
46074 if (!d->testing_p)
46075 dcopy.target = gen_reg_rtx (vmode);
46076 dcopy.one_operand_p = true;
46078 for (i = 0; i < nelt; ++i)
46079 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46081 ok = expand_vec_perm_1 (&dcopy);
46082 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46083 return false;
46084 else
46085 gcc_assert (ok);
46086 if (d->testing_p)
46087 return true;
46089 /* Next we put permuted elements into their positions. */
46090 dcopy1 = *d;
46091 if (which == 2)
46092 dcopy1.op1 = dcopy.target;
46093 else
46094 dcopy1.op0 = dcopy.target;
46096 for (i = 0; i < nelt; ++i)
46097 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46099 ok = expand_vec_perm_blend (&dcopy1);
46100 gcc_assert (ok);
46102 return true;
46105 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46107 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46108 a two vector permutation into a single vector permutation by using
46109 an interleave operation to merge the vectors. */
46111 static bool
46112 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46114 struct expand_vec_perm_d dremap, dfinal;
46115 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46116 unsigned HOST_WIDE_INT contents;
46117 unsigned char remap[2 * MAX_VECT_LEN];
46118 rtx_insn *seq;
46119 bool ok, same_halves = false;
46121 if (GET_MODE_SIZE (d->vmode) == 16)
46123 if (d->one_operand_p)
46124 return false;
46126 else if (GET_MODE_SIZE (d->vmode) == 32)
46128 if (!TARGET_AVX)
46129 return false;
46130 /* For 32-byte modes allow even d->one_operand_p.
46131 The lack of cross-lane shuffling in some instructions
46132 might prevent a single insn shuffle. */
46133 dfinal = *d;
46134 dfinal.testing_p = true;
46135 /* If expand_vec_perm_interleave3 can expand this into
46136 a 3 insn sequence, give up and let it be expanded as
46137 a 3 insn sequence. While that is one insn longer,
46138 it doesn't need a memory operand, and in the common
46139 case where both the interleave low and high permutations
46140 with the same operands are adjacent, the two need only
46141 4 insns in total after CSE. */
46142 if (expand_vec_perm_interleave3 (&dfinal))
46143 return false;
46145 else
46146 return false;
46148 /* Examine from whence the elements come. */
46149 contents = 0;
46150 for (i = 0; i < nelt; ++i)
46151 contents |= HOST_WIDE_INT_1U << d->perm[i];
46153 memset (remap, 0xff, sizeof (remap));
46154 dremap = *d;
46156 if (GET_MODE_SIZE (d->vmode) == 16)
46158 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46160 /* Split the two input vectors into 4 halves. */
46161 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46162 h2 = h1 << nelt2;
46163 h3 = h2 << nelt2;
46164 h4 = h3 << nelt2;
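/* For illustration: with V4SImode, nelt == 4 and nelt2 == 2, so
   h1 == 0x03 covers the low half of op0, h2 == 0x0c the high half
   of op0, h3 == 0x30 the low half of op1 and h4 == 0xc0 the high
   half of op1.  */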
46166 /* If the elements are all from the low halves, use interleave low;
46167 similarly, use interleave high for the high halves. If the elements
46168 are from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
46169 if ((contents & (h1 | h3)) == contents)
46171 /* punpckl* */
46172 for (i = 0; i < nelt2; ++i)
46174 remap[i] = i * 2;
46175 remap[i + nelt] = i * 2 + 1;
46176 dremap.perm[i * 2] = i;
46177 dremap.perm[i * 2 + 1] = i + nelt;
46179 if (!TARGET_SSE2 && d->vmode == V4SImode)
46180 dremap.vmode = V4SFmode;
46182 else if ((contents & (h2 | h4)) == contents)
46184 /* punpckh* */
46185 for (i = 0; i < nelt2; ++i)
46187 remap[i + nelt2] = i * 2;
46188 remap[i + nelt + nelt2] = i * 2 + 1;
46189 dremap.perm[i * 2] = i + nelt2;
46190 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46192 if (!TARGET_SSE2 && d->vmode == V4SImode)
46193 dremap.vmode = V4SFmode;
46195 else if ((contents & (h1 | h4)) == contents)
46197 /* shufps */
46198 for (i = 0; i < nelt2; ++i)
46200 remap[i] = i;
46201 remap[i + nelt + nelt2] = i + nelt2;
46202 dremap.perm[i] = i;
46203 dremap.perm[i + nelt2] = i + nelt + nelt2;
46205 if (nelt != 4)
46207 /* shufpd */
46208 dremap.vmode = V2DImode;
46209 dremap.nelt = 2;
46210 dremap.perm[0] = 0;
46211 dremap.perm[1] = 3;
46214 else if ((contents & (h2 | h3)) == contents)
46216 /* shufps */
46217 for (i = 0; i < nelt2; ++i)
46219 remap[i + nelt2] = i;
46220 remap[i + nelt] = i + nelt2;
46221 dremap.perm[i] = i + nelt2;
46222 dremap.perm[i + nelt2] = i + nelt;
46224 if (nelt != 4)
46226 /* shufpd */
46227 dremap.vmode = V2DImode;
46228 dremap.nelt = 2;
46229 dremap.perm[0] = 1;
46230 dremap.perm[1] = 2;
46233 else
46234 return false;
46236 else
46238 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46239 unsigned HOST_WIDE_INT q[8];
46240 unsigned int nonzero_halves[4];
46242 /* Split the two input vectors into 8 quarters. */
46243 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46244 for (i = 1; i < 8; ++i)
46245 q[i] = q[0] << (nelt4 * i);
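/* For illustration: with V8SFmode, nelt == 8 and nelt4 == 2, so
   q[0] == 0x3, q[1] == 0xc, ..., q[7] == 0xc000; q[0..3] cover the
   four quarters of op0 and q[4..7] those of op1.  */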
46246 for (i = 0; i < 4; ++i)
46247 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46249 nonzero_halves[nzcnt] = i;
46250 ++nzcnt;
46253 if (nzcnt == 1)
46255 gcc_assert (d->one_operand_p);
46256 nonzero_halves[1] = nonzero_halves[0];
46257 same_halves = true;
46259 else if (d->one_operand_p)
46261 gcc_assert (nonzero_halves[0] == 0);
46262 gcc_assert (nonzero_halves[1] == 1);
46265 if (nzcnt <= 2)
46267 if (d->perm[0] / nelt2 == nonzero_halves[1])
46269 /* Attempt to increase the likelihood that dfinal
46270 shuffle will be intra-lane. */
46271 std::swap (nonzero_halves[0], nonzero_halves[1]);
46274 /* vperm2f128 or vperm2i128. */
46275 for (i = 0; i < nelt2; ++i)
46277 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46278 remap[i + nonzero_halves[0] * nelt2] = i;
46279 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46280 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46283 if (d->vmode != V8SFmode
46284 && d->vmode != V4DFmode
46285 && d->vmode != V8SImode)
46287 dremap.vmode = V8SImode;
46288 dremap.nelt = 8;
46289 for (i = 0; i < 4; ++i)
46291 dremap.perm[i] = i + nonzero_halves[0] * 4;
46292 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46296 else if (d->one_operand_p)
46297 return false;
46298 else if (TARGET_AVX2
46299 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46301 /* vpunpckl* */
46302 for (i = 0; i < nelt4; ++i)
46304 remap[i] = i * 2;
46305 remap[i + nelt] = i * 2 + 1;
46306 remap[i + nelt2] = i * 2 + nelt2;
46307 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46308 dremap.perm[i * 2] = i;
46309 dremap.perm[i * 2 + 1] = i + nelt;
46310 dremap.perm[i * 2 + nelt2] = i + nelt2;
46311 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46314 else if (TARGET_AVX2
46315 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46317 /* vpunpckh* */
46318 for (i = 0; i < nelt4; ++i)
46320 remap[i + nelt4] = i * 2;
46321 remap[i + nelt + nelt4] = i * 2 + 1;
46322 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46323 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46324 dremap.perm[i * 2] = i + nelt4;
46325 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46326 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46327 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46330 else
46331 return false;
46334 /* Use the remapping array set up above to move the elements from their
46335 swizzled locations into their final destinations. */
46336 dfinal = *d;
46337 for (i = 0; i < nelt; ++i)
46339 unsigned e = remap[d->perm[i]];
46340 gcc_assert (e < nelt);
46341 /* If same_halves is true, both halves of the remapped vector are the
46342 same. Avoid cross-lane accesses if possible. */
46343 if (same_halves && i >= nelt2)
46345 gcc_assert (e < nelt2);
46346 dfinal.perm[i] = e + nelt2;
46348 else
46349 dfinal.perm[i] = e;
46351 if (!d->testing_p)
46353 dremap.target = gen_reg_rtx (dremap.vmode);
46354 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46356 dfinal.op1 = dfinal.op0;
46357 dfinal.one_operand_p = true;
46359 /* Test if the final remap can be done with a single insn. For V4SFmode or
46360 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46361 start_sequence ();
46362 ok = expand_vec_perm_1 (&dfinal);
46363 seq = get_insns ();
46364 end_sequence ();
46366 if (!ok)
46367 return false;
46369 if (d->testing_p)
46370 return true;
46372 if (dremap.vmode != dfinal.vmode)
46374 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46375 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46378 ok = expand_vec_perm_1 (&dremap);
46379 gcc_assert (ok);
46381 emit_insn (seq);
46382 return true;
46385 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46386 a single vector cross-lane permutation into vpermq followed
46387 by any of the single insn permutations. */
46389 static bool
46390 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46392 struct expand_vec_perm_d dremap, dfinal;
46393 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46394 unsigned contents[2];
46395 bool ok;
46397 if (!(TARGET_AVX2
46398 && (d->vmode == V32QImode || d->vmode == V16HImode)
46399 && d->one_operand_p))
46400 return false;
46402 contents[0] = 0;
46403 contents[1] = 0;
46404 for (i = 0; i < nelt2; ++i)
46406 contents[0] |= 1u << (d->perm[i] / nelt4);
46407 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
46410 for (i = 0; i < 2; ++i)
46412 unsigned int cnt = 0;
46413 for (j = 0; j < 4; ++j)
46414 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46415 return false;
46418 if (d->testing_p)
46419 return true;
46421 dremap = *d;
46422 dremap.vmode = V4DImode;
46423 dremap.nelt = 4;
46424 dremap.target = gen_reg_rtx (V4DImode);
46425 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46426 dremap.op1 = dremap.op0;
46427 dremap.one_operand_p = true;
46428 for (i = 0; i < 2; ++i)
46430 unsigned int cnt = 0;
46431 for (j = 0; j < 4; ++j)
46432 if ((contents[i] & (1u << j)) != 0)
46433 dremap.perm[2 * i + cnt++] = j;
46434 for (; cnt < 2; ++cnt)
46435 dremap.perm[2 * i + cnt] = 0;
46438 dfinal = *d;
46439 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46440 dfinal.op1 = dfinal.op0;
46441 dfinal.one_operand_p = true;
46442 for (i = 0, j = 0; i < nelt; ++i)
46444 if (i == nelt2)
46445 j = 2;
46446 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46447 if ((d->perm[i] / nelt4) == dremap.perm[j])
46449 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46450 dfinal.perm[i] |= nelt4;
46451 else
46452 gcc_unreachable ();
46455 ok = expand_vec_perm_1 (&dremap);
46456 gcc_assert (ok);
46458 ok = expand_vec_perm_1 (&dfinal);
46459 gcc_assert (ok);
46461 return true;
46464 /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
46465 a vector permutation using two instructions, vperm2f128 resp.
46466 vperm2i128 followed by any single in-lane permutation. */
46468 static bool
46469 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46471 struct expand_vec_perm_d dfirst, dsecond;
46472 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46473 bool ok;
46475 if (!TARGET_AVX
46476 || GET_MODE_SIZE (d->vmode) != 32
46477 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46478 return false;
46480 dsecond = *d;
46481 dsecond.one_operand_p = false;
46482 dsecond.testing_p = true;
46484 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46485 immediate. For perm < 16 the second permutation uses
46486 d->op0 as first operand, for perm >= 16 it uses d->op1
46487 as first operand. The second operand is the result of
46488 vperm2[fi]128. */
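/* As a worked example (for perm < 16, where the mapping is direct):
   perm == 6 gives the immediate ((6 << 2) | 6) & 0x33 == 0x12, i.e.
   the low lane of the vperm2[fi]128 result comes from lane selector 2
   and the high lane from lane selector 1.  */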
46489 for (perm = 0; perm < 32; perm++)
46491 /* Ignore permutations which do not move anything cross-lane. */
46492 if (perm < 16)
46494 /* The second shuffle for e.g. V4DFmode has
46495 0123 and ABCD operands.
46496 Ignore AB23, as 23 is already in the second lane
46497 of the first operand. */
46498 if ((perm & 0xc) == (1 << 2)) continue;
46499 /* And 01CD, as 01 is in the first lane of the first
46500 operand. */
46501 if ((perm & 3) == 0) continue;
46502 /* And 4567, as then the vperm2[fi]128 doesn't change
46503 anything on the original 4567 second operand. */
46504 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46506 else
46508 /* The second shuffle for e.g. V4DFmode has
46509 4567 and ABCD operands.
46510 Ignore AB67, as 67 is already in the second lane
46511 of the first operand. */
46512 if ((perm & 0xc) == (3 << 2)) continue;
46513 /* And 45CD, as 45 is in the first lane of the first
46514 operand. */
46515 if ((perm & 3) == 2) continue;
46516 /* And 0123, as then the vperm2[fi]128 doesn't change
46517 anything on the original 0123 first operand. */
46518 if ((perm & 0xf) == (1 << 2)) continue;
46521 for (i = 0; i < nelt; i++)
46523 j = d->perm[i] / nelt2;
46524 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46525 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46526 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46527 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46528 else
46529 break;
46532 if (i == nelt)
46534 start_sequence ();
46535 ok = expand_vec_perm_1 (&dsecond);
46536 end_sequence ();
46538 else
46539 ok = false;
46541 if (ok)
46543 if (d->testing_p)
46544 return true;
46546 /* Found a usable second shuffle. dfirst will be
46547 vperm2f128 on d->op0 and d->op1. */
46548 dsecond.testing_p = false;
46549 dfirst = *d;
46550 dfirst.target = gen_reg_rtx (d->vmode);
46551 for (i = 0; i < nelt; i++)
46552 dfirst.perm[i] = (i & (nelt2 - 1))
46553 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46555 canonicalize_perm (&dfirst);
46556 ok = expand_vec_perm_1 (&dfirst);
46557 gcc_assert (ok);
46559 /* And dsecond is some single insn shuffle, taking
46560 d->op0 and result of vperm2f128 (if perm < 16) or
46561 d->op1 and result of vperm2f128 (otherwise). */
46562 if (perm >= 16)
46563 dsecond.op0 = dsecond.op1;
46564 dsecond.op1 = dfirst.target;
46566 ok = expand_vec_perm_1 (&dsecond);
46567 gcc_assert (ok);
46569 return true;
46572 /* For one operand, the only useful vperm2f128 permutation is 0x01
46573 aka lanes swap. */
46574 if (d->one_operand_p)
46575 return false;
46578 return false;
46581 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46582 a two vector permutation using 2 intra-lane interleave insns
46583 and cross-lane shuffle for 32-byte vectors. */
46585 static bool
46586 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46588 unsigned i, nelt;
46589 rtx (*gen) (rtx, rtx, rtx);
46591 if (d->one_operand_p)
46592 return false;
46593 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46595 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46597 else
46598 return false;
46600 nelt = d->nelt;
46601 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46602 return false;
46603 for (i = 0; i < nelt; i += 2)
46604 if (d->perm[i] != d->perm[0] + i / 2
46605 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46606 return false;
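/* For example, for V8SImode this accepts only the full interleave
   patterns { 0, 8, 1, 9, 2, 10, 3, 11 } (d->perm[0] == 0) and
   { 4, 12, 5, 13, 6, 14, 7, 15 } (d->perm[0] == nelt / 2).  */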
46608 if (d->testing_p)
46609 return true;
46611 switch (d->vmode)
46613 case E_V32QImode:
46614 if (d->perm[0])
46615 gen = gen_vec_interleave_highv32qi;
46616 else
46617 gen = gen_vec_interleave_lowv32qi;
46618 break;
46619 case E_V16HImode:
46620 if (d->perm[0])
46621 gen = gen_vec_interleave_highv16hi;
46622 else
46623 gen = gen_vec_interleave_lowv16hi;
46624 break;
46625 case E_V8SImode:
46626 if (d->perm[0])
46627 gen = gen_vec_interleave_highv8si;
46628 else
46629 gen = gen_vec_interleave_lowv8si;
46630 break;
46631 case E_V4DImode:
46632 if (d->perm[0])
46633 gen = gen_vec_interleave_highv4di;
46634 else
46635 gen = gen_vec_interleave_lowv4di;
46636 break;
46637 case E_V8SFmode:
46638 if (d->perm[0])
46639 gen = gen_vec_interleave_highv8sf;
46640 else
46641 gen = gen_vec_interleave_lowv8sf;
46642 break;
46643 case E_V4DFmode:
46644 if (d->perm[0])
46645 gen = gen_vec_interleave_highv4df;
46646 else
46647 gen = gen_vec_interleave_lowv4df;
46648 break;
46649 default:
46650 gcc_unreachable ();
46653 emit_insn (gen (d->target, d->op0, d->op1));
46654 return true;
46657 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
46658 a single vector permutation using a single intra-lane vector
46659 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46660 the non-swapped and swapped vectors together. */
46662 static bool
46663 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46665 struct expand_vec_perm_d dfirst, dsecond;
46666 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46667 rtx_insn *seq;
46668 bool ok;
46669 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46671 if (!TARGET_AVX
46672 || TARGET_AVX2
46673 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46674 || !d->one_operand_p)
46675 return false;
46677 dfirst = *d;
46678 for (i = 0; i < nelt; i++)
46679 dfirst.perm[i] = 0xff;
46680 for (i = 0, msk = 0; i < nelt; i++)
46682 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46683 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46684 return false;
46685 dfirst.perm[j] = d->perm[i];
46686 if (j != i)
46687 msk |= (1 << i);
46689 for (i = 0; i < nelt; i++)
46690 if (dfirst.perm[i] == 0xff)
46691 dfirst.perm[i] = i;
46693 if (!d->testing_p)
46694 dfirst.target = gen_reg_rtx (dfirst.vmode);
46696 start_sequence ();
46697 ok = expand_vec_perm_1 (&dfirst);
46698 seq = get_insns ();
46699 end_sequence ();
46701 if (!ok)
46702 return false;
46704 if (d->testing_p)
46705 return true;
46707 emit_insn (seq);
46709 dsecond = *d;
46710 dsecond.op0 = dfirst.target;
46711 dsecond.op1 = dfirst.target;
46712 dsecond.one_operand_p = true;
46713 dsecond.target = gen_reg_rtx (dsecond.vmode);
46714 for (i = 0; i < nelt; i++)
46715 dsecond.perm[i] = i ^ nelt2;
46717 ok = expand_vec_perm_1 (&dsecond);
46718 gcc_assert (ok);
46720 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46721 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46722 return true;
46725 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
46726 permutation using two vperm2f128, followed by a vshufpd insn blending
46727 the two vectors together. */
46729 static bool
46730 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46732 struct expand_vec_perm_d dfirst, dsecond, dthird;
46733 bool ok;
46735 if (!TARGET_AVX || (d->vmode != V4DFmode))
46736 return false;
46738 if (d->testing_p)
46739 return true;
46741 dfirst = *d;
46742 dsecond = *d;
46743 dthird = *d;
46745 dfirst.perm[0] = (d->perm[0] & ~1);
46746 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46747 dfirst.perm[2] = (d->perm[2] & ~1);
46748 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46749 dsecond.perm[0] = (d->perm[1] & ~1);
46750 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46751 dsecond.perm[2] = (d->perm[3] & ~1);
46752 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46753 dthird.perm[0] = (d->perm[0] % 2);
46754 dthird.perm[1] = (d->perm[1] % 2) + 4;
46755 dthird.perm[2] = (d->perm[2] % 2) + 2;
46756 dthird.perm[3] = (d->perm[3] % 2) + 6;
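/* For illustration, with d->perm == { 2, 5, 1, 6 } this builds
   dfirst.perm == { 2, 3, 0, 1 }, dsecond.perm == { 4, 5, 6, 7 } and
   dthird.perm == { 0, 5, 3, 6 }, so the final vshufpd recombines the
   two vperm2f128 results into the requested order.  */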
46758 dfirst.target = gen_reg_rtx (dfirst.vmode);
46759 dsecond.target = gen_reg_rtx (dsecond.vmode);
46760 dthird.op0 = dfirst.target;
46761 dthird.op1 = dsecond.target;
46762 dthird.one_operand_p = false;
46764 canonicalize_perm (&dfirst);
46765 canonicalize_perm (&dsecond);
46767 ok = expand_vec_perm_1 (&dfirst)
46768 && expand_vec_perm_1 (&dsecond)
46769 && expand_vec_perm_1 (&dthird);
46771 gcc_assert (ok);
46773 return true;
46776 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46777 permutation with two pshufb insns and an ior. We should have already
46778 failed all two instruction sequences. */
46780 static bool
46781 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46783 rtx rperm[2][16], vperm, l, h, op, m128;
46784 unsigned int i, nelt, eltsz;
46786 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46787 return false;
46788 gcc_assert (!d->one_operand_p);
46790 if (d->testing_p)
46791 return true;
46793 nelt = d->nelt;
46794 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46796 /* Generate two permutation masks. If the required element is within
46797 the given vector it is shuffled into the proper lane. If the required
46798 element is in the other vector, force a zero into the lane by setting
46799 bit 7 in the permutation mask. */
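/* E.g. for V16QImode, if d->perm[i] == 20 (element 4 of the second
   vector), position i of the op1 mask gets index 4 while the same
   position of the op0 mask gets -128 (bit 7 set), so pshufb zeroes
   that byte in the first result and the ior below keeps only the
   op1 byte.  */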
46800 m128 = GEN_INT (-128);
46801 for (i = 0; i < nelt; ++i)
46803 unsigned j, e = d->perm[i];
46804 unsigned which = (e >= nelt);
46805 if (e >= nelt)
46806 e -= nelt;
46808 for (j = 0; j < eltsz; ++j)
46810 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46811 rperm[1-which][i*eltsz + j] = m128;
46815 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46816 vperm = force_reg (V16QImode, vperm);
46818 l = gen_reg_rtx (V16QImode);
46819 op = gen_lowpart (V16QImode, d->op0);
46820 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46822 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46823 vperm = force_reg (V16QImode, vperm);
46825 h = gen_reg_rtx (V16QImode);
46826 op = gen_lowpart (V16QImode, d->op1);
46827 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46829 op = d->target;
46830 if (d->vmode != V16QImode)
46831 op = gen_reg_rtx (V16QImode);
46832 emit_insn (gen_iorv16qi3 (op, l, h));
46833 if (op != d->target)
46834 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46836 return true;
46839 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
46840 with two vpshufb insns, vpermq and vpor. We should have already failed
46841 all two or three instruction sequences. */
46843 static bool
46844 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46846 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46847 unsigned int i, nelt, eltsz;
46849 if (!TARGET_AVX2
46850 || !d->one_operand_p
46851 || (d->vmode != V32QImode && d->vmode != V16HImode))
46852 return false;
46854 if (d->testing_p)
46855 return true;
46857 nelt = d->nelt;
46858 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46860 /* Generate two permutation masks. If the required element is within
46861 the same lane, it is shuffled in. If the required element is from the
46862 other lane, force a zero by setting bit 7 in the permutation mask.
46863 The other mask has a non-negative element if the element is requested
46864 from the other lane, but that element is also moved to the other lane,
46865 so that the result of vpshufb can have its two V2TImode halves
46866 swapped. */
46867 m128 = GEN_INT (-128);
46868 for (i = 0; i < nelt; ++i)
46870 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46871 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46873 for (j = 0; j < eltsz; ++j)
46875 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46876 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46880 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46881 vperm = force_reg (V32QImode, vperm);
46883 h = gen_reg_rtx (V32QImode);
46884 op = gen_lowpart (V32QImode, d->op0);
46885 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46887 /* Swap the 128-bit lanes of h into hp. */
46888 hp = gen_reg_rtx (V4DImode);
46889 op = gen_lowpart (V4DImode, h);
46890 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46891 const1_rtx));
46893 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46894 vperm = force_reg (V32QImode, vperm);
46896 l = gen_reg_rtx (V32QImode);
46897 op = gen_lowpart (V32QImode, d->op0);
46898 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46900 op = d->target;
46901 if (d->vmode != V32QImode)
46902 op = gen_reg_rtx (V32QImode);
46903 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46904 if (op != d->target)
46905 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46907 return true;
46910 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46911 and extract-odd permutations of two V32QImode and V16QImode operand
46912 with two vpshufb insns, vpor and vpermq. We should have already
46913 failed all two or three instruction sequences. */
46915 static bool
46916 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46918 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46919 unsigned int i, nelt, eltsz;
46921 if (!TARGET_AVX2
46922 || d->one_operand_p
46923 || (d->vmode != V32QImode && d->vmode != V16HImode))
46924 return false;
46926 for (i = 0; i < d->nelt; ++i)
46927 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46928 return false;
46930 if (d->testing_p)
46931 return true;
46933 nelt = d->nelt;
46934 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46936 /* Generate two permutation masks. In the first permutation mask
46937 the first quarter will contain indexes for the first half
46938 of the op0, the second quarter will contain bit 7 set, third quarter
46939 will contain indexes for the second half of the op0 and the
46940 last quarter bit 7 set. In the second permutation mask
46941 the first quarter will contain bit 7 set, the second quarter
46942 indexes for the first half of the op1, the third quarter bit 7 set
46943 and last quarter indexes for the second half of the op1.
46944 I.e. the first mask e.g. for V32QImode extract even will be:
46945 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46946 (all values masked with 0xf except for -128) and second mask
46947 for extract even will be
46948 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46949 m128 = GEN_INT (-128);
46950 for (i = 0; i < nelt; ++i)
46952 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46953 unsigned which = d->perm[i] >= nelt;
46954 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
46956 for (j = 0; j < eltsz; ++j)
46958 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46959 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46963 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46964 vperm = force_reg (V32QImode, vperm);
46966 l = gen_reg_rtx (V32QImode);
46967 op = gen_lowpart (V32QImode, d->op0);
46968 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46970 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46971 vperm = force_reg (V32QImode, vperm);
46973 h = gen_reg_rtx (V32QImode);
46974 op = gen_lowpart (V32QImode, d->op1);
46975 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46977 ior = gen_reg_rtx (V32QImode);
46978 emit_insn (gen_iorv32qi3 (ior, l, h));
46980 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46981 op = gen_reg_rtx (V4DImode);
46982 ior = gen_lowpart (V4DImode, ior);
46983 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46984 const1_rtx, GEN_INT (3)));
46985 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46987 return true;
46990 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46991 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46992 with two "and" and "pack" or two "shift" and "pack" insns. We should
46993 have already failed all two instruction sequences. */
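/* For instance, for a V16QImode extract-even permutation both operands
   are viewed as V8HImode, each word is masked with 0x00ff and the two
   results are combined with packuswb; for extract-odd the mask is
   replaced by a logical right shift of each word by 8.  */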
46995 static bool
46996 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46998 rtx op, dop0, dop1, t;
46999 unsigned i, odd, c, s, nelt = d->nelt;
47000 bool end_perm = false;
47001 machine_mode half_mode;
47002 rtx (*gen_and) (rtx, rtx, rtx);
47003 rtx (*gen_pack) (rtx, rtx, rtx);
47004 rtx (*gen_shift) (rtx, rtx, rtx);
47006 if (d->one_operand_p)
47007 return false;
47009 switch (d->vmode)
47011 case E_V8HImode:
47012 /* Required for "pack". */
47013 if (!TARGET_SSE4_1)
47014 return false;
47015 c = 0xffff;
47016 s = 16;
47017 half_mode = V4SImode;
47018 gen_and = gen_andv4si3;
47019 gen_pack = gen_sse4_1_packusdw;
47020 gen_shift = gen_lshrv4si3;
47021 break;
47022 case E_V16QImode:
47023 /* No check as all instructions are SSE2. */
47024 c = 0xff;
47025 s = 8;
47026 half_mode = V8HImode;
47027 gen_and = gen_andv8hi3;
47028 gen_pack = gen_sse2_packuswb;
47029 gen_shift = gen_lshrv8hi3;
47030 break;
47031 case E_V16HImode:
47032 if (!TARGET_AVX2)
47033 return false;
47034 c = 0xffff;
47035 s = 16;
47036 half_mode = V8SImode;
47037 gen_and = gen_andv8si3;
47038 gen_pack = gen_avx2_packusdw;
47039 gen_shift = gen_lshrv8si3;
47040 end_perm = true;
47041 break;
47042 case E_V32QImode:
47043 if (!TARGET_AVX2)
47044 return false;
47045 c = 0xff;
47046 s = 8;
47047 half_mode = V16HImode;
47048 gen_and = gen_andv16hi3;
47049 gen_pack = gen_avx2_packuswb;
47050 gen_shift = gen_lshrv16hi3;
47051 end_perm = true;
47052 break;
47053 default:
47054 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
47055 general shuffles. */
47056 return false;
47059 /* Check that permutation is even or odd. */
47060 odd = d->perm[0];
47061 if (odd > 1)
47062 return false;
47064 for (i = 1; i < nelt; ++i)
47065 if (d->perm[i] != 2 * i + odd)
47066 return false;
47068 if (d->testing_p)
47069 return true;
47071 dop0 = gen_reg_rtx (half_mode);
47072 dop1 = gen_reg_rtx (half_mode);
47073 if (odd == 0)
47075 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
47076 t = force_reg (half_mode, t);
47077 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47078 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47080 else
47082 emit_insn (gen_shift (dop0,
47083 gen_lowpart (half_mode, d->op0),
47084 GEN_INT (s)));
47085 emit_insn (gen_shift (dop1,
47086 gen_lowpart (half_mode, d->op1),
47087 GEN_INT (s)));
47089 /* For the AVX2 256-bit case we need to permute the pack result. */
47090 if (TARGET_AVX2 && end_perm)
47092 op = gen_reg_rtx (d->vmode);
47093 t = gen_reg_rtx (V4DImode);
47094 emit_insn (gen_pack (op, dop0, dop1));
47095 emit_insn (gen_avx2_permv4di_1 (t,
47096 gen_lowpart (V4DImode, op),
47097 const0_rtx,
47098 const2_rtx,
47099 const1_rtx,
47100 GEN_INT (3)));
47101 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47103 else
47104 emit_insn (gen_pack (d->target, dop0, dop1));
47106 return true;
47109 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47110 and extract-odd permutations of two V64QI operands
47111 with two "shifts", two "truncs" and one "concat" insns for "odd"
47112 and two "truncs" and one concat insn for "even."
47113 Have already failed all two instruction sequences. */
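/* E.g. for the "odd" case each V64QImode operand is viewed as
   V32HImode and shifted right by 8 so the odd bytes move into the low
   byte of each word, then both are truncated back to V32QImode
   (vpmovwb) and concatenated; the "even" case skips the shifts.  */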
47115 static bool
47116 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47118 rtx t1, t2, t3, t4;
47119 unsigned i, odd, nelt = d->nelt;
47121 if (!TARGET_AVX512BW
47122 || d->one_operand_p
47123 || d->vmode != V64QImode)
47124 return false;
47126 /* Check that permutation is even or odd. */
47127 odd = d->perm[0];
47128 if (odd > 1)
47129 return false;
47131 for (i = 1; i < nelt; ++i)
47132 if (d->perm[i] != 2 * i + odd)
47133 return false;
47135 if (d->testing_p)
47136 return true;
47139 if (odd)
47141 t1 = gen_reg_rtx (V32HImode);
47142 t2 = gen_reg_rtx (V32HImode);
47143 emit_insn (gen_lshrv32hi3 (t1,
47144 gen_lowpart (V32HImode, d->op0),
47145 GEN_INT (8)));
47146 emit_insn (gen_lshrv32hi3 (t2,
47147 gen_lowpart (V32HImode, d->op1),
47148 GEN_INT (8)));
47150 else
47152 t1 = gen_lowpart (V32HImode, d->op0);
47153 t2 = gen_lowpart (V32HImode, d->op1);
47156 t3 = gen_reg_rtx (V32QImode);
47157 t4 = gen_reg_rtx (V32QImode);
47158 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47159 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47160 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47162 return true;
47165 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
47166 and extract-odd permutations. */
47168 static bool
47169 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47171 rtx t1, t2, t3, t4, t5;
47173 switch (d->vmode)
47175 case E_V4DFmode:
47176 if (d->testing_p)
47177 break;
47178 t1 = gen_reg_rtx (V4DFmode);
47179 t2 = gen_reg_rtx (V4DFmode);
47181 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47182 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47183 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47185 /* Now an unpck[lh]pd will produce the result required. */
47186 if (odd)
47187 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47188 else
47189 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47190 emit_insn (t3);
47191 break;
47193 case E_V8SFmode:
47195 int mask = odd ? 0xdd : 0x88;
47197 if (d->testing_p)
47198 break;
47199 t1 = gen_reg_rtx (V8SFmode);
47200 t2 = gen_reg_rtx (V8SFmode);
47201 t3 = gen_reg_rtx (V8SFmode);
47203 /* Shuffle within the 128-bit lanes to produce:
47204 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47205 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47206 GEN_INT (mask)));
47208 /* Shuffle the lanes around to produce:
47209 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47210 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47211 GEN_INT (0x3)));
47213 /* Shuffle within the 128-bit lanes to produce:
47214 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47215 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47217 /* Shuffle within the 128-bit lanes to produce:
47218 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47219 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47221 /* Shuffle the lanes around to produce:
47222 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47223 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47224 GEN_INT (0x20)));
47226 break;
47228 case E_V2DFmode:
47229 case E_V4SFmode:
47230 case E_V2DImode:
47231 case E_V4SImode:
47232 /* These are always directly implementable by expand_vec_perm_1. */
47233 gcc_unreachable ();
47235 case E_V8HImode:
47236 if (TARGET_SSE4_1)
47237 return expand_vec_perm_even_odd_pack (d);
47238 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47239 return expand_vec_perm_pshufb2 (d);
47240 else
47242 if (d->testing_p)
47243 break;
47244 /* We need 2*log2(N)-1 operations to achieve odd/even
47245 with interleave. */
47246 t1 = gen_reg_rtx (V8HImode);
47247 t2 = gen_reg_rtx (V8HImode);
47248 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47249 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47250 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47251 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47252 if (odd)
47253 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47254 else
47255 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47256 emit_insn (t3);
47258 break;
47260 case E_V16QImode:
47261 return expand_vec_perm_even_odd_pack (d);
47263 case E_V16HImode:
47264 case E_V32QImode:
47265 return expand_vec_perm_even_odd_pack (d);
47267 case E_V64QImode:
47268 return expand_vec_perm_even_odd_trunc (d);
47270 case E_V4DImode:
47271 if (!TARGET_AVX2)
47273 struct expand_vec_perm_d d_copy = *d;
47274 d_copy.vmode = V4DFmode;
47275 if (d->testing_p)
47276 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47277 else
47278 d_copy.target = gen_reg_rtx (V4DFmode);
47279 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47280 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47281 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47283 if (!d->testing_p)
47284 emit_move_insn (d->target,
47285 gen_lowpart (V4DImode, d_copy.target));
47286 return true;
47288 return false;
47291 if (d->testing_p)
47292 break;
47294 t1 = gen_reg_rtx (V4DImode);
47295 t2 = gen_reg_rtx (V4DImode);
47297 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47298 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47299 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47301 /* Now a vpunpck[lh]qdq will produce the required result. */
47302 if (odd)
47303 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47304 else
47305 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47306 emit_insn (t3);
47307 break;
47309 case E_V8SImode:
47310 if (!TARGET_AVX2)
47312 struct expand_vec_perm_d d_copy = *d;
47313 d_copy.vmode = V8SFmode;
47314 if (d->testing_p)
47315 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47316 else
47317 d_copy.target = gen_reg_rtx (V8SFmode);
47318 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47319 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47320 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47322 if (!d->testing_p)
47323 emit_move_insn (d->target,
47324 gen_lowpart (V8SImode, d_copy.target));
47325 return true;
47327 return false;
47330 if (d->testing_p)
47331 break;
47333 t1 = gen_reg_rtx (V8SImode);
47334 t2 = gen_reg_rtx (V8SImode);
47335 t3 = gen_reg_rtx (V4DImode);
47336 t4 = gen_reg_rtx (V4DImode);
47337 t5 = gen_reg_rtx (V4DImode);
47339 /* Shuffle the lanes around into
47340 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47341 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47342 gen_lowpart (V4DImode, d->op1),
47343 GEN_INT (0x20)));
47344 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47345 gen_lowpart (V4DImode, d->op1),
47346 GEN_INT (0x31)));
47348 /* Swap the 2nd and 3rd position in each lane into
47349 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
47350 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47351 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47352 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47353 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47355 /* Now a vpunpck[lh]qdq will produce
47356 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47357 if (odd)
47358 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47359 gen_lowpart (V4DImode, t2));
47360 else
47361 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47362 gen_lowpart (V4DImode, t2));
47363 emit_insn (t3);
47364 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47365 break;
47367 default:
47368 gcc_unreachable ();
47371 return true;
47374 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
47375 extract-even and extract-odd permutations. */
47377 static bool
47378 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47380 unsigned i, odd, nelt = d->nelt;
47382 odd = d->perm[0];
47383 if (odd != 0 && odd != 1)
47384 return false;
47386 for (i = 1; i < nelt; ++i)
47387 if (d->perm[i] != 2 * i + odd)
47388 return false;
47390 return expand_vec_perm_even_odd_1 (d, odd);
47393 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
47394 permutations. We assume that expand_vec_perm_1 has already failed. */
47396 static bool
47397 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47399 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47400 machine_mode vmode = d->vmode;
47401 unsigned char perm2[4];
47402 rtx op0 = d->op0, dest;
47403 bool ok;
47405 switch (vmode)
47407 case E_V4DFmode:
47408 case E_V8SFmode:
47409 /* These are special-cased in sse.md so that we can optionally
47410 use the vbroadcast instruction. They expand to two insns
47411 if the input happens to be in a register. */
47412 gcc_unreachable ();
47414 case E_V2DFmode:
47415 case E_V2DImode:
47416 case E_V4SFmode:
47417 case E_V4SImode:
47418 /* These are always implementable using standard shuffle patterns. */
47419 gcc_unreachable ();
47421 case E_V8HImode:
47422 case E_V16QImode:
47423 /* These can be implemented via interleave. We save one insn by
47424 stopping once we have promoted to V4SImode and then use pshufd. */
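/* For instance, broadcasting element 5 of a V16QImode vector:
   punpcklbw duplicates byte 5 into word 5 of the result, punpckhwd
   then moves that word pair into dword 1, and the final pshufd
   replicates dword 1 across all four dwords.  */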
47425 if (d->testing_p)
47426 return true;
47429 rtx dest;
47430 rtx (*gen) (rtx, rtx, rtx)
47431 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47432 : gen_vec_interleave_lowv8hi;
47434 if (elt >= nelt2)
47436 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47437 : gen_vec_interleave_highv8hi;
47438 elt -= nelt2;
47440 nelt2 /= 2;
47442 dest = gen_reg_rtx (vmode);
47443 emit_insn (gen (dest, op0, op0));
47444 vmode = get_mode_wider_vector (vmode);
47445 op0 = gen_lowpart (vmode, dest);
47447 while (vmode != V4SImode);
47449 memset (perm2, elt, 4);
47450 dest = gen_reg_rtx (V4SImode);
47451 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47452 gcc_assert (ok);
47453 if (!d->testing_p)
47454 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47455 return true;
47457 case E_V64QImode:
47458 case E_V32QImode:
47459 case E_V16HImode:
47460 case E_V8SImode:
47461 case E_V4DImode:
47462 /* For AVX2 broadcasts of the first element vpbroadcast* or
47463 vpermq should be used by expand_vec_perm_1. */
47464 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47465 return false;
47467 default:
47468 gcc_unreachable ();
47472 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
47473 broadcast permutations. */
47475 static bool
47476 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47478 unsigned i, elt, nelt = d->nelt;
47480 if (!d->one_operand_p)
47481 return false;
47483 elt = d->perm[0];
47484 for (i = 1; i < nelt; ++i)
47485 if (d->perm[i] != elt)
47486 return false;
47488 return expand_vec_perm_broadcast_1 (d);
47491 /* Implement arbitrary permutations of two V64QImode operands
47492 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
47493 static bool
47494 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
47496 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47497 return false;
47499 if (d->testing_p)
47500 return true;
47502 struct expand_vec_perm_d ds[2];
47503 rtx rperm[128], vperm, target0, target1;
47504 unsigned int i, nelt;
47505 machine_mode vmode;
47507 nelt = d->nelt;
47508 vmode = V64QImode;
47510 for (i = 0; i < 2; i++)
47512 ds[i] = *d;
47513 ds[i].vmode = V32HImode;
47514 ds[i].nelt = 32;
47515 ds[i].target = gen_reg_rtx (V32HImode);
47516 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47517 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47520 /* Prepare the permutations such that the first one takes care of
47521 putting the even bytes into the right positions or one position
47522 higher (ds[0]) and the second one takes care of putting the odd
47523 bytes into the right positions or one position lower (ds[1]). */
47526 for (i = 0; i < nelt; i++)
47528 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47529 if (i & 1)
47531 rperm[i] = constm1_rtx;
47532 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47534 else
47536 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47537 rperm[i + 64] = constm1_rtx;
47541 bool ok = expand_vec_perm_1 (&ds[0]);
47542 gcc_assert (ok);
47543 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47545 ok = expand_vec_perm_1 (&ds[1]);
47546 gcc_assert (ok);
47547 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47549 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47550 vperm = force_reg (vmode, vperm);
47551 target0 = gen_reg_rtx (V64QImode);
47552 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47554 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47555 vperm = force_reg (vmode, vperm);
47556 target1 = gen_reg_rtx (V64QImode);
47557 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47559 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47560 return true;
47563 /* Implement an arbitrary permutation of two V32QImode or V16HImode operands
47564 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47565 all the shorter instruction sequences. */
47567 static bool
47568 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47570 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47571 unsigned int i, nelt, eltsz;
47572 bool used[4];
47574 if (!TARGET_AVX2
47575 || d->one_operand_p
47576 || (d->vmode != V32QImode && d->vmode != V16HImode))
47577 return false;
47579 if (d->testing_p)
47580 return true;
47582 nelt = d->nelt;
47583 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47585 /* Generate 4 permutation masks. If the required element is within
47586 the same lane, it is shuffled in. If the required element is from the
47587 other lane, force a zero by setting bit 7 in the permutation mask.
47588 The corresponding cross-lane mask has a non-negative element if the
47589 element is requested from the other lane, but that element is also
47590 moved to the other lane, so that the result of vpshufb can have its
47591 two V2TImode halves swapped. */
47592 m128 = GEN_INT (-128);
47593 for (i = 0; i < 32; ++i)
47595 rperm[0][i] = m128;
47596 rperm[1][i] = m128;
47597 rperm[2][i] = m128;
47598 rperm[3][i] = m128;
47600 used[0] = false;
47601 used[1] = false;
47602 used[2] = false;
47603 used[3] = false;
47604 for (i = 0; i < nelt; ++i)
47606 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47607 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47608 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47610 for (j = 0; j < eltsz; ++j)
47611 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47612 used[which] = true;
47615 for (i = 0; i < 2; ++i)
47617 if (!used[2 * i + 1])
47619 h[i] = NULL_RTX;
47620 continue;
47622 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47623 gen_rtvec_v (32, rperm[2 * i + 1]));
47624 vperm = force_reg (V32QImode, vperm);
47625 h[i] = gen_reg_rtx (V32QImode);
47626 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47627 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47630 /* Swap the 128-bit lanes of h[X]. */
47631 for (i = 0; i < 2; ++i)
47633 if (h[i] == NULL_RTX)
47634 continue;
47635 op = gen_reg_rtx (V4DImode);
47636 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47637 const2_rtx, GEN_INT (3), const0_rtx,
47638 const1_rtx));
47639 h[i] = gen_lowpart (V32QImode, op);
47642 for (i = 0; i < 2; ++i)
47644 if (!used[2 * i])
47646 l[i] = NULL_RTX;
47647 continue;
47649 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47650 vperm = force_reg (V32QImode, vperm);
47651 l[i] = gen_reg_rtx (V32QImode);
47652 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47653 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47656 for (i = 0; i < 2; ++i)
47658 if (h[i] && l[i])
47660 op = gen_reg_rtx (V32QImode);
47661 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47662 l[i] = op;
47664 else if (h[i])
47665 l[i] = h[i];
47668 gcc_assert (l[0] && l[1]);
47669 op = d->target;
47670 if (d->vmode != V32QImode)
47671 op = gen_reg_rtx (V32QImode);
47672 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47673 if (op != d->target)
47674 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47675 return true;
47678 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
47679 taken care of, perform the expansion in D and return true on success. */
47681 static bool
47682 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47684 /* Try a single instruction expansion. */
47685 if (expand_vec_perm_1 (d))
47686 return true;
47688 /* Try sequences of two instructions. */
47690 if (expand_vec_perm_pshuflw_pshufhw (d))
47691 return true;
47693 if (expand_vec_perm_palignr (d, false))
47694 return true;
47696 if (expand_vec_perm_interleave2 (d))
47697 return true;
47699 if (expand_vec_perm_broadcast (d))
47700 return true;
47702 if (expand_vec_perm_vpermq_perm_1 (d))
47703 return true;
47705 if (expand_vec_perm_vperm2f128 (d))
47706 return true;
47708 if (expand_vec_perm_pblendv (d))
47709 return true;
47711 /* Try sequences of three instructions. */
47713 if (expand_vec_perm_even_odd_pack (d))
47714 return true;
47716 if (expand_vec_perm_2vperm2f128_vshuf (d))
47717 return true;
47719 if (expand_vec_perm_pshufb2 (d))
47720 return true;
47722 if (expand_vec_perm_interleave3 (d))
47723 return true;
47725 if (expand_vec_perm_vperm2f128_vblend (d))
47726 return true;
47728 /* Try sequences of four instructions. */
47730 if (expand_vec_perm_even_odd_trunc (d))
47731 return true;
47732 if (expand_vec_perm_vpshufb2_vpermq (d))
47733 return true;
47735 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47736 return true;
47738 if (expand_vec_perm_vpermt2_vpshub2 (d))
47739 return true;
47741 /* ??? Look for narrow permutations whose element orderings would
47742 allow the promotion to a wider mode. */
47744 /* ??? Look for sequences of interleave or a wider permute that place
47745 the data into the correct lanes for a half-vector shuffle like
47746 pshuf[lh]w or vpermilps. */
47748 /* ??? Look for sequences of interleave that produce the desired results.
47749 The combinatorics of punpck[lh] get pretty ugly... */
47751 if (expand_vec_perm_even_odd (d))
47752 return true;
47754 /* Even longer sequences. */
47755 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47756 return true;
47758 /* See if we can get the same permutation in a different vector integer
47759 mode. */
47760 struct expand_vec_perm_d nd;
47761 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47763 if (!d->testing_p)
47764 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47765 return true;
47768 return false;
47771 /* If a permutation only uses one operand, make it clear. Returns true
47772 if the permutation references both operands. */
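/* E.g. with nelt == 4, the selector { 4, 5, 6, 7 } references only the
   second operand; it is folded to { 0, 1, 2, 3 }, op0 is replaced by
   op1, one_operand_p is set and false is returned.  */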
47774 static bool
47775 canonicalize_perm (struct expand_vec_perm_d *d)
47777 int i, which, nelt = d->nelt;
47779 for (i = which = 0; i < nelt; ++i)
47780 which |= (d->perm[i] < nelt ? 1 : 2);
47782 d->one_operand_p = true;
47783 switch (which)
47785 default:
47786 gcc_unreachable();
47788 case 3:
47789 if (!rtx_equal_p (d->op0, d->op1))
47791 d->one_operand_p = false;
47792 break;
47794 /* The elements of PERM do not suggest that only the first operand
47795 is used, but both operands are identical. Allow easier matching
47796 of the permutation by folding the permutation into the single
47797 input vector. */
47798 /* FALLTHRU */
47800 case 2:
47801 for (i = 0; i < nelt; ++i)
47802 d->perm[i] &= nelt - 1;
47803 d->op0 = d->op1;
47804 break;
47806 case 1:
47807 d->op1 = d->op0;
47808 break;
47811 return (which == 3);
47814 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
47816 static bool
47817 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
47818 rtx op1, const vec_perm_indices &sel)
47820 struct expand_vec_perm_d d;
47821 unsigned char perm[MAX_VECT_LEN];
47822 unsigned int i, nelt, which;
47823 bool two_args;
47825 d.target = target;
47826 d.op0 = op0;
47827 d.op1 = op1;
47829 d.vmode = vmode;
47830 gcc_assert (VECTOR_MODE_P (d.vmode));
47831 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47832 d.testing_p = !target;
47834 gcc_assert (sel.length () == nelt);
47835 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47837 /* Given sufficient ISA support we can just return true here
47838 for selected vector modes. */
47839 switch (d.vmode)
47841 case E_V16SFmode:
47842 case E_V16SImode:
47843 case E_V8DImode:
47844 case E_V8DFmode:
47845 if (!TARGET_AVX512F)
47846 return false;
47847 /* All implementable with a single vperm[it]2 insn. */
47848 if (d.testing_p)
47849 return true;
47850 break;
47851 case E_V32HImode:
47852 if (!TARGET_AVX512BW)
47853 return false;
47854 if (d.testing_p)
47855 /* All implementable with a single vperm[it]2 insn. */
47856 return true;
47857 break;
47858 case E_V64QImode:
47859 if (!TARGET_AVX512BW)
47860 return false;
47861 if (d.testing_p)
47862 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
47863 return true;
47864 break;
47865 case E_V8SImode:
47866 case E_V8SFmode:
47867 case E_V4DFmode:
47868 case E_V4DImode:
47869 if (!TARGET_AVX)
47870 return false;
47871 if (d.testing_p && TARGET_AVX512VL)
47872 /* All implementable with a single vperm[it]2 insn. */
47873 return true;
47874 break;
47875 case E_V16HImode:
47876 if (!TARGET_SSE2)
47877 return false;
47878 if (d.testing_p && TARGET_AVX2)
47879 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47880 return true;
47881 break;
47882 case E_V32QImode:
47883 if (!TARGET_SSE2)
47884 return false;
47885 if (d.testing_p && TARGET_AVX2)
47886 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47887 return true;
47888 break;
47889 case E_V8HImode:
47890 case E_V16QImode:
47891 if (!TARGET_SSE2)
47892 return false;
47893 /* Fall through. */
47894 case E_V4SImode:
47895 case E_V4SFmode:
47896 if (!TARGET_SSE)
47897 return false;
47898 /* All implementable with a single vpperm insn. */
47899 if (d.testing_p && TARGET_XOP)
47900 return true;
47901 /* All implementable with 2 pshufb + 1 ior. */
47902 if (d.testing_p && TARGET_SSSE3)
47903 return true;
47904 break;
47905 case E_V2DImode:
47906 case E_V2DFmode:
47907 if (!TARGET_SSE)
47908 return false;
47909 /* All implementable with shufpd or unpck[lh]pd. */
47910 if (d.testing_p)
47911 return true;
47912 break;
47913 default:
47914 return false;
47917 for (i = which = 0; i < nelt; ++i)
47919 unsigned char e = sel[i];
47920 gcc_assert (e < 2 * nelt);
47921 d.perm[i] = e;
47922 perm[i] = e;
47923 which |= (e < nelt ? 1 : 2);
47926 if (d.testing_p)
47928 /* For all elements from the second vector, fold the elements to the first. */
47929 if (which == 2)
47930 for (i = 0; i < nelt; ++i)
47931 d.perm[i] -= nelt;
47933 /* Check whether the mask can be applied to the vector type. */
47934 d.one_operand_p = (which != 3);
47936 /* Implementable with shufps or pshufd. */
47937 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47938 return true;
47940 /* Otherwise we have to go through the motions and see if we can
47941 figure out how to generate the requested permutation. */
47942 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47943 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47944 if (!d.one_operand_p)
47945 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
47947 start_sequence ();
47948 bool ret = ix86_expand_vec_perm_const_1 (&d);
47949 end_sequence ();
47951 return ret;
47954 two_args = canonicalize_perm (&d);
47956 if (ix86_expand_vec_perm_const_1 (&d))
47957 return true;
47959 /* If the selector says both arguments are needed, but the operands are the
47960 same, the above tried to expand with one_operand_p and flattened selector.
47961 If that didn't work, retry without one_operand_p; we succeeded with that
47962 during testing. */
47963 if (two_args && d.one_operand_p)
47965 d.one_operand_p = false;
47966 memcpy (d.perm, perm, sizeof (perm));
47967 return ix86_expand_vec_perm_const_1 (&d);
47970 return false;
47973 void
47974 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47976 struct expand_vec_perm_d d;
47977 unsigned i, nelt;
47979 d.target = targ;
47980 d.op0 = op0;
47981 d.op1 = op1;
47982 d.vmode = GET_MODE (targ);
47983 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47984 d.one_operand_p = false;
47985 d.testing_p = false;
47987 for (i = 0; i < nelt; ++i)
47988 d.perm[i] = i * 2 + odd;
47990 /* We'll either be able to implement the permutation directly... */
47991 if (expand_vec_perm_1 (&d))
47992 return;
47994 /* ... or we use the special-case patterns. */
47995 expand_vec_perm_even_odd_1 (&d, odd);
47998 static void
47999 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
48001 struct expand_vec_perm_d d;
48002 unsigned i, nelt, base;
48003 bool ok;
48005 d.target = targ;
48006 d.op0 = op0;
48007 d.op1 = op1;
48008 d.vmode = GET_MODE (targ);
48009 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48010 d.one_operand_p = false;
48011 d.testing_p = false;
48013 base = high_p ? nelt / 2 : 0;
48014 for (i = 0; i < nelt / 2; ++i)
48016 d.perm[i * 2] = i + base;
48017 d.perm[i * 2 + 1] = i + base + nelt;
48020 /* Note that for AVX this isn't one instruction. */
48021 ok = ix86_expand_vec_perm_const_1 (&d);
48022 gcc_assert (ok);
48026 /* Expand a vector operation CODE for a V*QImode in terms of the
48027 same operation on V*HImode. */
48029 void
48030 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
48032 machine_mode qimode = GET_MODE (dest);
48033 machine_mode himode;
48034 rtx (*gen_il) (rtx, rtx, rtx);
48035 rtx (*gen_ih) (rtx, rtx, rtx);
48036 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
48037 struct expand_vec_perm_d d;
48038 bool ok, full_interleave;
48039 bool uns_p = false;
48040 int i;
48042 switch (qimode)
48044 case E_V16QImode:
48045 himode = V8HImode;
48046 gen_il = gen_vec_interleave_lowv16qi;
48047 gen_ih = gen_vec_interleave_highv16qi;
48048 break;
48049 case E_V32QImode:
48050 himode = V16HImode;
48051 gen_il = gen_avx2_interleave_lowv32qi;
48052 gen_ih = gen_avx2_interleave_highv32qi;
48053 break;
48054 case E_V64QImode:
48055 himode = V32HImode;
48056 gen_il = gen_avx512bw_interleave_lowv64qi;
48057 gen_ih = gen_avx512bw_interleave_highv64qi;
48058 break;
48059 default:
48060 gcc_unreachable ();
48063 op2_l = op2_h = op2;
48064 switch (code)
48066 case MULT:
48067 /* Unpack data such that we've got a source byte in each low byte of
48068 each word. We don't care what goes into the high byte of each word.
48069 Rather than trying to get zero in there, most convenient is to let
48070 it be a copy of the low byte. */
48071 op2_l = gen_reg_rtx (qimode);
48072 op2_h = gen_reg_rtx (qimode);
48073 emit_insn (gen_il (op2_l, op2, op2));
48074 emit_insn (gen_ih (op2_h, op2, op2));
48076 op1_l = gen_reg_rtx (qimode);
48077 op1_h = gen_reg_rtx (qimode);
48078 emit_insn (gen_il (op1_l, op1, op1));
48079 emit_insn (gen_ih (op1_h, op1, op1));
48080 full_interleave = qimode == V16QImode;
48081 break;
48083 case ASHIFT:
48084 case LSHIFTRT:
48085 uns_p = true;
48086 /* FALLTHRU */
48087 case ASHIFTRT:
48088 op1_l = gen_reg_rtx (himode);
48089 op1_h = gen_reg_rtx (himode);
48090 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48091 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48092 full_interleave = true;
48093 break;
48094 default:
48095 gcc_unreachable ();
48098 /* Perform the operation. */
48099 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48100 1, OPTAB_DIRECT);
48101 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48102 1, OPTAB_DIRECT);
48103 gcc_assert (res_l && res_h);
48105 /* Merge the data back into the right place. */
48106 d.target = dest;
48107 d.op0 = gen_lowpart (qimode, res_l);
48108 d.op1 = gen_lowpart (qimode, res_h);
48109 d.vmode = qimode;
48110 d.nelt = GET_MODE_NUNITS (qimode);
48111 d.one_operand_p = false;
48112 d.testing_p = false;
48114 if (full_interleave)
48116 /* For SSE2, we used a full interleave, so the desired
48117 results are in the even elements. */
48118 for (i = 0; i < d.nelt; ++i)
48119 d.perm[i] = i * 2;
48121 else
48123 /* For AVX, the interleave used above was not cross-lane. So the
48124 extraction is evens but with the second and third quarter swapped.
48125 Happily, that is even one insn shorter than even extraction.
48126 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48127 always first from the first and then from the second source operand,
48128 the index bits above the low 4 bits remain the same.
48129 Thus, for d.nelt == 32 we want permutation
48130 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48131 and for d.nelt == 64 we want permutation
48132 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48133 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
48134 for (i = 0; i < d.nelt; ++i)
48135 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48138 ok = ix86_expand_vec_perm_const_1 (&d);
48139 gcc_assert (ok);
48141 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48142 gen_rtx_fmt_ee (code, qimode, op1, op2));
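/* Editorial sketch (not in the original source): the lane-local even
   extraction described in the comment above, written as a scalar index
   helper.  For NELT == 32 it yields 0,2,...,14, 32,34,...,46,
   16,18,...,30, 48,50,...,62, i.e. the AVX2 permutation quoted above.  */

static inline unsigned
qihi_even_extract_index_sketch (unsigned i, unsigned nelt)
{
  /* The low bits pick an even element within a 16-byte lane, bit 3 picks
     the first or second source operand, and the remaining bits keep the
     lane number unchanged.  */
  return ((i * 2) & 14) + ((i & 8) ? nelt : 0) + (i & ~15U);
}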
48145 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48146 if op is CONST_VECTOR with all odd elements equal to their
48147 preceding element. */
48149 static bool
48150 const_vector_equal_evenodd_p (rtx op)
48152 machine_mode mode = GET_MODE (op);
48153 int i, nunits = GET_MODE_NUNITS (mode);
48154 if (GET_CODE (op) != CONST_VECTOR
48155 || nunits != CONST_VECTOR_NUNITS (op))
48156 return false;
48157 for (i = 0; i < nunits; i += 2)
48158 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48159 return false;
48160 return true;
48163 void
48164 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48165 bool uns_p, bool odd_p)
48167 machine_mode mode = GET_MODE (op1);
48168 machine_mode wmode = GET_MODE (dest);
48169 rtx x;
48170 rtx orig_op1 = op1, orig_op2 = op2;
48172 if (!nonimmediate_operand (op1, mode))
48173 op1 = force_reg (mode, op1);
48174 if (!nonimmediate_operand (op2, mode))
48175 op2 = force_reg (mode, op2);
48177 /* We only play even/odd games with vectors of SImode. */
48178 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48180 /* If we're looking for the odd results, shift those members down to
48181 the even slots. For some CPUs this is faster than a PSHUFD. */
48182 if (odd_p)
48184 /* For XOP use vpmacsdqh, but only for smult, as it is only
48185 signed. */
48186 if (TARGET_XOP && mode == V4SImode && !uns_p)
48188 x = force_reg (wmode, CONST0_RTX (wmode));
48189 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48190 return;
48193 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48194 if (!const_vector_equal_evenodd_p (orig_op1))
48195 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48196 x, NULL, 1, OPTAB_DIRECT);
48197 if (!const_vector_equal_evenodd_p (orig_op2))
48198 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48199 x, NULL, 1, OPTAB_DIRECT);
48200 op1 = gen_lowpart (mode, op1);
48201 op2 = gen_lowpart (mode, op2);
48204 if (mode == V16SImode)
48206 if (uns_p)
48207 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48208 else
48209 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48211 else if (mode == V8SImode)
48213 if (uns_p)
48214 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48215 else
48216 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48218 else if (uns_p)
48219 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48220 else if (TARGET_SSE4_1)
48221 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48222 else
48224 rtx s1, s2, t0, t1, t2;
48226 /* The easiest way to implement this without PMULDQ is to go through
48227 the motions as if we are performing a full 64-bit multiply, with
48228 the exception that we need to do less shuffling of the elements. */
48230 /* Compute the sign-extension, aka highparts, of the two operands. */
48231 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48232 op1, pc_rtx, pc_rtx);
48233 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48234 op2, pc_rtx, pc_rtx);
48236 /* Multiply LO(A) * HI(B), and vice-versa. */
48237 t1 = gen_reg_rtx (wmode);
48238 t2 = gen_reg_rtx (wmode);
48239 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48240 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48242 /* Multiply LO(A) * LO(B). */
48243 t0 = gen_reg_rtx (wmode);
48244 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48246 /* Combine and shift the highparts into place. */
48247 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48248 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48249 1, OPTAB_DIRECT);
48251 /* Combine high and low parts. */
48252 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48253 return;
48255 emit_insn (x);
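/* Editorial sketch (not in the original source): the PMULDQ-less fallback
   above obtains a signed 32x32->64 multiply from the unsigned even
   multiply plus two sign corrections.  A scalar equivalent, with S1/S2
   modelling the all-ones masks produced by the GT comparisons:  */

static inline long long
signed_widen_mul_sketch (int a, int b)
{
  unsigned long long ua = (unsigned int) a, ub = (unsigned int) b;
  unsigned long long s1 = a < 0 ? 0xffffffffULL : 0;
  unsigned long long s2 = b < 0 ? 0xffffffffULL : 0;
  unsigned long long t0 = ua * ub;			/* LO(A) * LO(B) */
  unsigned long long t1 = (s1 * ub + s2 * ua) << 32;	/* highpart fixups */
  return (long long) (t0 + t1);
}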
48258 void
48259 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48260 bool uns_p, bool high_p)
48262 machine_mode wmode = GET_MODE (dest);
48263 machine_mode mode = GET_MODE (op1);
48264 rtx t1, t2, t3, t4, mask;
48266 switch (mode)
48268 case E_V4SImode:
48269 t1 = gen_reg_rtx (mode);
48270 t2 = gen_reg_rtx (mode);
48271 if (TARGET_XOP && !uns_p)
48273 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48274 shuffle the elements once so that all elements are in the right
48275 place for immediate use: { A C B D }. */
48276 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48277 const1_rtx, GEN_INT (3)));
48278 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48279 const1_rtx, GEN_INT (3)));
48281 else
48283 /* Put the elements into place for the multiply. */
48284 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48285 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48286 high_p = false;
48288 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48289 break;
48291 case E_V8SImode:
48292 /* Shuffle the elements between the lanes. After this we
48293 have { A B E F | C D G H } for each operand. */
48294 t1 = gen_reg_rtx (V4DImode);
48295 t2 = gen_reg_rtx (V4DImode);
48296 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48297 const0_rtx, const2_rtx,
48298 const1_rtx, GEN_INT (3)));
48299 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48300 const0_rtx, const2_rtx,
48301 const1_rtx, GEN_INT (3)));
48303 /* Shuffle the elements within the lanes. After this we
48304 have { A A B B | C C D D } or { E E F F | G G H H }. */
48305 t3 = gen_reg_rtx (V8SImode);
48306 t4 = gen_reg_rtx (V8SImode);
48307 mask = GEN_INT (high_p
48308 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48309 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48310 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48311 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48313 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48314 break;
48316 case E_V8HImode:
48317 case E_V16HImode:
48318 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48319 uns_p, OPTAB_DIRECT);
48320 t2 = expand_binop (mode,
48321 uns_p ? umul_highpart_optab : smul_highpart_optab,
48322 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48323 gcc_assert (t1 && t2);
48325 t3 = gen_reg_rtx (mode);
48326 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48327 emit_move_insn (dest, gen_lowpart (wmode, t3));
48328 break;
48330 case E_V16QImode:
48331 case E_V32QImode:
48332 case E_V32HImode:
48333 case E_V16SImode:
48334 case E_V64QImode:
48335 t1 = gen_reg_rtx (wmode);
48336 t2 = gen_reg_rtx (wmode);
48337 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48338 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48340 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48341 break;
48343 default:
48344 gcc_unreachable ();
48348 void
48349 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48351 rtx res_1, res_2, res_3, res_4;
48353 res_1 = gen_reg_rtx (V4SImode);
48354 res_2 = gen_reg_rtx (V4SImode);
48355 res_3 = gen_reg_rtx (V2DImode);
48356 res_4 = gen_reg_rtx (V2DImode);
48357 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48358 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48360 /* Move the results in element 2 down to element 1; we don't care
48361 what goes in elements 2 and 3. Then we can merge the parts
48362 back together with an interleave.
48364 Note that two other sequences were tried:
48365 (1) Use interleaves at the start instead of psrldq, which allows
48366 us to use a single shufps to merge things back at the end.
48367 (2) Use shufps here to combine the two vectors, then pshufd to
48368 put the elements in the correct order.
48369 In both cases the cost of the reformatting stall was too high
48370 and the overall sequence slower. */
48372 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48373 const0_rtx, const2_rtx,
48374 const0_rtx, const0_rtx));
48375 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48376 const0_rtx, const2_rtx,
48377 const0_rtx, const0_rtx));
48378 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48380 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48383 void
48384 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48386 machine_mode mode = GET_MODE (op0);
48387 rtx t1, t2, t3, t4, t5, t6;
48389 if (TARGET_AVX512DQ && mode == V8DImode)
48390 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48391 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48392 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48393 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48394 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48395 else if (TARGET_XOP && mode == V2DImode)
48397 /* op1: A,B,C,D, op2: E,F,G,H */
48398 op1 = gen_lowpart (V4SImode, op1);
48399 op2 = gen_lowpart (V4SImode, op2);
48401 t1 = gen_reg_rtx (V4SImode);
48402 t2 = gen_reg_rtx (V4SImode);
48403 t3 = gen_reg_rtx (V2DImode);
48404 t4 = gen_reg_rtx (V2DImode);
48406 /* t1: B,A,D,C */
48407 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48408 GEN_INT (1),
48409 GEN_INT (0),
48410 GEN_INT (3),
48411 GEN_INT (2)));
48413 /* t2: (B*E),(A*F),(D*G),(C*H) */
48414 emit_insn (gen_mulv4si3 (t2, t1, op2));
48416 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48417 emit_insn (gen_xop_phadddq (t3, t2));
48419 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48420 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48422 /* Multiply lower parts and add them all. */
48423 t5 = gen_reg_rtx (V2DImode);
48424 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48425 gen_lowpart (V4SImode, op1),
48426 gen_lowpart (V4SImode, op2)));
48427 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48430 else
48432 machine_mode nmode;
48433 rtx (*umul) (rtx, rtx, rtx);
48435 if (mode == V2DImode)
48437 umul = gen_vec_widen_umult_even_v4si;
48438 nmode = V4SImode;
48440 else if (mode == V4DImode)
48442 umul = gen_vec_widen_umult_even_v8si;
48443 nmode = V8SImode;
48445 else if (mode == V8DImode)
48447 umul = gen_vec_widen_umult_even_v16si;
48448 nmode = V16SImode;
48450 else
48451 gcc_unreachable ();
48454 /* Multiply low parts. */
48455 t1 = gen_reg_rtx (mode);
48456 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48458 /* Shift input vectors right 32 bits so we can multiply high parts. */
48459 t6 = GEN_INT (32);
48460 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48461 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48463 /* Multiply high parts by low parts. */
48464 t4 = gen_reg_rtx (mode);
48465 t5 = gen_reg_rtx (mode);
48466 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48467 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48469 /* Combine and shift the highparts back. */
48470 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48471 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48473 /* Combine high and low parts. */
48474 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48477 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48478 gen_rtx_MULT (mode, op1, op2));
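/* Editorial sketch (not in the original source): the generic path above
   synthesizes each 64x64->64 element multiply from 32-bit widening
   multiplies, as in this scalar equivalent:  */

static inline unsigned long long
mul64_from_mul32_sketch (unsigned long long a, unsigned long long b)
{
  unsigned long long lo_lo = (a & 0xffffffffULL) * (b & 0xffffffffULL);
  unsigned long long hi_lo = (a >> 32) * (b & 0xffffffffULL);
  unsigned long long lo_hi = (a & 0xffffffffULL) * (b >> 32);
  /* The hi*hi term only affects bits 64 and above and is dropped.  */
  return lo_lo + ((hi_lo + lo_hi) << 32);
}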
48481 /* Return 1 if control transfer instruction INSN
48482 should be encoded with the bnd prefix.
48483 If INSN is NULL then return 1 when control
48484 transfer instructions should be prefixed with
48485 bnd by default for the current function. */
48487 bool
48488 ix86_bnd_prefixed_insn_p (rtx insn)
48490 /* For call insns check special flag. */
48491 if (insn && CALL_P (insn))
48493 rtx call = get_call_rtx_from (insn);
48494 if (call)
48495 return CALL_EXPR_WITH_BOUNDS_P (call);
48498 /* All other insns are prefixed only if the function is instrumented. */
48499 return chkp_function_instrumented_p (current_function_decl);
48502 /* Return 1 if control transfer instruction INSN
48503 should be encoded with the notrack prefix. */
48505 static bool
48506 ix86_notrack_prefixed_insn_p (rtx insn)
48508 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
48509 return false;
48511 if (CALL_P (insn))
48513 rtx call = get_call_rtx_from (insn);
48514 gcc_assert (call != NULL_RTX);
48515 rtx addr = XEXP (call, 0);
48517 /* Do not emit 'notrack' if it's not an indirect call. */
48518 if (MEM_P (addr)
48519 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
48520 return false;
48521 else
48522 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
48525 if (JUMP_P (insn) && !flag_cet_switch)
48527 rtx target = JUMP_LABEL (insn);
48528 if (target == NULL_RTX || ANY_RETURN_P (target))
48529 return false;
48531 /* Check whether the jump is a switch-table jump. */
48532 rtx_insn *label = as_a<rtx_insn *> (target);
48533 rtx_insn *table = next_insn (label);
48534 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
48535 return false;
48536 else
48537 return true;
48539 return false;
48542 /* Calculate integer abs() using only SSE2 instructions. */
48544 void
48545 ix86_expand_sse2_abs (rtx target, rtx input)
48547 machine_mode mode = GET_MODE (target);
48548 rtx tmp0, tmp1, x;
48550 switch (mode)
48552 /* For 32-bit signed integer X, the best way to calculate the absolute
48553 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
48554 case E_V4SImode:
48555 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48556 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48557 NULL, 0, OPTAB_DIRECT);
48558 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48559 NULL, 0, OPTAB_DIRECT);
48560 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48561 target, 0, OPTAB_DIRECT);
48562 break;
48564 /* For 16-bit signed integer X, the best way to calculate the absolute
48565 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48566 case E_V8HImode:
48567 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48569 x = expand_simple_binop (mode, SMAX, tmp0, input,
48570 target, 0, OPTAB_DIRECT);
48571 break;
48573 /* For 8-bit signed integer X, the best way to calculate the absolute
48574 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48575 as SSE2 provides the PMINUB insn. */
48576 case E_V16QImode:
48577 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48579 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48580 target, 0, OPTAB_DIRECT);
48581 break;
48583 default:
48584 gcc_unreachable ();
48587 if (x != target)
48588 emit_move_insn (target, x);
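/* Editorial sketch (not in the original source): scalar models of the
   three absolute-value idioms used above (results wrap for the most
   negative value, just like the vector instructions).  */

static inline int
abs_v4si_elem_sketch (int x)
{
  int sign = x >> 31;		/* arithmetic shift: 0 or -1 */
  return (int) (((unsigned int) (sign ^ x)) - (unsigned int) sign);
}

static inline short
abs_v8hi_elem_sketch (short x)
{
  short neg = -x;
  return x > neg ? x : neg;	/* max (x, -x), cf. PMAXSW */
}

static inline unsigned char
abs_v16qi_elem_sketch (signed char x)
{
  unsigned char ux = (unsigned char) x, uneg = (unsigned char) -x;
  return ux < uneg ? ux : uneg;	/* unsigned min (x, -x), cf. PMINUB */
}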
48591 /* Expand an extract from a vector register through pextr insn.
48592 Return true if successful. */
48594 bool
48595 ix86_expand_pextr (rtx *operands)
48597 rtx dst = operands[0];
48598 rtx src = operands[1];
48600 unsigned int size = INTVAL (operands[2]);
48601 unsigned int pos = INTVAL (operands[3]);
48603 if (SUBREG_P (dst))
48605 /* Reject non-lowpart subregs. */
48606 if (SUBREG_BYTE (dst) > 0)
48607 return false;
48608 dst = SUBREG_REG (dst);
48611 if (SUBREG_P (src))
48613 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48614 src = SUBREG_REG (src);
48617 switch (GET_MODE (src))
48619 case E_V16QImode:
48620 case E_V8HImode:
48621 case E_V4SImode:
48622 case E_V2DImode:
48623 case E_V1TImode:
48624 case E_TImode:
48626 machine_mode srcmode, dstmode;
48627 rtx d, pat;
48629 if (!int_mode_for_size (size, 0).exists (&dstmode))
48630 return false;
48632 switch (dstmode)
48634 case E_QImode:
48635 if (!TARGET_SSE4_1)
48636 return false;
48637 srcmode = V16QImode;
48638 break;
48640 case E_HImode:
48641 if (!TARGET_SSE2)
48642 return false;
48643 srcmode = V8HImode;
48644 break;
48646 case E_SImode:
48647 if (!TARGET_SSE4_1)
48648 return false;
48649 srcmode = V4SImode;
48650 break;
48652 case E_DImode:
48653 gcc_assert (TARGET_64BIT);
48654 if (!TARGET_SSE4_1)
48655 return false;
48656 srcmode = V2DImode;
48657 break;
48659 default:
48660 return false;
48663 /* Reject extractions from misaligned positions. */
48664 if (pos & (size-1))
48665 return false;
48667 if (GET_MODE (dst) == dstmode)
48668 d = dst;
48669 else
48670 d = gen_reg_rtx (dstmode);
48672 /* Construct insn pattern. */
48673 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48674 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48676 /* Let the rtl optimizers know about the zero extension performed. */
48677 if (dstmode == QImode || dstmode == HImode)
48679 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48680 d = gen_lowpart (SImode, d);
48683 emit_insn (gen_rtx_SET (d, pat));
48685 if (d != dst)
48686 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48687 return true;
48690 default:
48691 return false;
48695 /* Expand an insert into a vector register through pinsr insn.
48696 Return true if successful. */
48698 bool
48699 ix86_expand_pinsr (rtx *operands)
48701 rtx dst = operands[0];
48702 rtx src = operands[3];
48704 unsigned int size = INTVAL (operands[1]);
48705 unsigned int pos = INTVAL (operands[2]);
48707 if (SUBREG_P (dst))
48709 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48710 dst = SUBREG_REG (dst);
48713 switch (GET_MODE (dst))
48715 case E_V16QImode:
48716 case E_V8HImode:
48717 case E_V4SImode:
48718 case E_V2DImode:
48719 case E_V1TImode:
48720 case E_TImode:
48722 machine_mode srcmode, dstmode;
48723 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48724 rtx d;
48726 if (!int_mode_for_size (size, 0).exists (&srcmode))
48727 return false;
48729 switch (srcmode)
48731 case E_QImode:
48732 if (!TARGET_SSE4_1)
48733 return false;
48734 dstmode = V16QImode;
48735 pinsr = gen_sse4_1_pinsrb;
48736 break;
48738 case E_HImode:
48739 if (!TARGET_SSE2)
48740 return false;
48741 dstmode = V8HImode;
48742 pinsr = gen_sse2_pinsrw;
48743 break;
48745 case E_SImode:
48746 if (!TARGET_SSE4_1)
48747 return false;
48748 dstmode = V4SImode;
48749 pinsr = gen_sse4_1_pinsrd;
48750 break;
48752 case E_DImode:
48753 gcc_assert (TARGET_64BIT);
48754 if (!TARGET_SSE4_1)
48755 return false;
48756 dstmode = V2DImode;
48757 pinsr = gen_sse4_1_pinsrq;
48758 break;
48760 default:
48761 return false;
48764 /* Reject insertions to misaligned positions. */
48765 if (pos & (size-1))
48766 return false;
48768 if (SUBREG_P (src))
48770 unsigned int srcpos = SUBREG_BYTE (src);
48772 if (srcpos > 0)
48774 rtx extr_ops[4];
48776 extr_ops[0] = gen_reg_rtx (srcmode);
48777 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48778 extr_ops[2] = GEN_INT (size);
48779 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48781 if (!ix86_expand_pextr (extr_ops))
48782 return false;
48784 src = extr_ops[0];
48786 else
48787 src = gen_lowpart (srcmode, SUBREG_REG (src));
48790 if (GET_MODE (dst) == dstmode)
48791 d = dst;
48792 else
48793 d = gen_reg_rtx (dstmode);
48795 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48796 gen_lowpart (srcmode, src),
48797 GEN_INT (1 << (pos / size))));
48798 if (d != dst)
48799 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48800 return true;
48803 default:
48804 return false;
48808 /* This function returns the calling abi specific va_list type node.
48809 It returns the FNDECL specific va_list type. */
48811 static tree
48812 ix86_fn_abi_va_list (tree fndecl)
48814 if (!TARGET_64BIT)
48815 return va_list_type_node;
48816 gcc_assert (fndecl != NULL_TREE);
48818 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48819 return ms_va_list_type_node;
48820 else
48821 return sysv_va_list_type_node;
48824 /* Returns the canonical va_list type specified by TYPE. If there
48825 is no valid TYPE provided, it returns NULL_TREE. */
48827 static tree
48828 ix86_canonical_va_list_type (tree type)
48830 if (TARGET_64BIT)
48832 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48833 return ms_va_list_type_node;
48835 if ((TREE_CODE (type) == ARRAY_TYPE
48836 && integer_zerop (array_type_nelts (type)))
48837 || POINTER_TYPE_P (type))
48839 tree elem_type = TREE_TYPE (type);
48840 if (TREE_CODE (elem_type) == RECORD_TYPE
48841 && lookup_attribute ("sysv_abi va_list",
48842 TYPE_ATTRIBUTES (elem_type)))
48843 return sysv_va_list_type_node;
48846 return NULL_TREE;
48849 return std_canonical_va_list_type (type);
48852 /* Iterate through the target-specific builtin types for va_list.
48853 IDX denotes the iterator, *PTREE is set to the result type of
48854 the va_list builtin, and *PNAME to its internal type.
48855 Returns zero if there is no element for this index, otherwise
48856 IDX should be increased upon the next call.
48857 Note, do not iterate a base builtin's name like __builtin_va_list.
48858 Used from c_common_nodes_and_builtins. */
48860 static int
48861 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48863 if (TARGET_64BIT)
48865 switch (idx)
48867 default:
48868 break;
48870 case 0:
48871 *ptree = ms_va_list_type_node;
48872 *pname = "__builtin_ms_va_list";
48873 return 1;
48875 case 1:
48876 *ptree = sysv_va_list_type_node;
48877 *pname = "__builtin_sysv_va_list";
48878 return 1;
48882 return 0;
48885 #undef TARGET_SCHED_DISPATCH
48886 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
48887 #undef TARGET_SCHED_DISPATCH_DO
48888 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
48889 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48890 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48891 #undef TARGET_SCHED_REORDER
48892 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
48893 #undef TARGET_SCHED_ADJUST_PRIORITY
48894 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48895 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48896 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48897 ix86_dependencies_evaluation_hook
48900 /* Implementation of the reassociation_width target hook used by
48901 the reassoc phase to identify the parallelism level in a reassociated
48902 tree. The statement's tree_code is passed in OP. The argument's type
48903 is passed in MODE. */
48905 static int
48906 ix86_reassociation_width (unsigned int op, machine_mode mode)
48908 int width = 1;
48909 /* Vector part. */
48910 if (VECTOR_MODE_P (mode))
48912 int div = 1;
48913 if (INTEGRAL_MODE_P (mode))
48914 width = ix86_cost->reassoc_vec_int;
48915 else if (FLOAT_MODE_P (mode))
48916 width = ix86_cost->reassoc_vec_fp;
48918 if (width == 1)
48919 return 1;
48921 /* Integer vector instructions execute in the FP unit
48922 and can execute 3 additions and one multiplication per cycle. */
48923 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
48924 && op != PLUS && op != MINUS)
48925 return 1;
48927 /* Account for targets that split wide vectors into multiple parts. */
48928 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
48929 div = GET_MODE_BITSIZE (mode) / 128;
48930 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
48931 div = GET_MODE_BITSIZE (mode) / 64;
48932 width = (width + div - 1) / div;
48934 /* Scalar part. */
48935 else if (INTEGRAL_MODE_P (mode))
48936 width = ix86_cost->reassoc_int;
48937 else if (FLOAT_MODE_P (mode))
48938 width = ix86_cost->reassoc_fp;
48940 /* Avoid using too many registers in 32bit mode. */
48941 if (!TARGET_64BIT && width > 2)
48942 width = 2;
48943 return width;
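/* Editorial sketch (not in the original source): how the splitting
   adjustment above scales the width.  E.g. with a hypothetical
   reassoc_vec_fp of 4 on a TARGET_AVX128_OPTIMAL tuning, a 256-bit FP
   vector mode gets div == 2 and a reported width of (4 + 2 - 1) / 2 == 2.  */

static inline int
reassoc_width_split_sketch (int width, int mode_bits, int native_bits)
{
  int div = mode_bits > native_bits ? mode_bits / native_bits : 1;
  return (width + div - 1) / div;	/* divide, rounding up */
}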
48946 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
48947 place emms and femms instructions. */
48949 static machine_mode
48950 ix86_preferred_simd_mode (scalar_mode mode)
48952 if (!TARGET_SSE)
48953 return word_mode;
48955 switch (mode)
48957 case E_QImode:
48958 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48959 return V64QImode;
48960 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48961 return V32QImode;
48962 else
48963 return V16QImode;
48965 case E_HImode:
48966 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48967 return V32HImode;
48968 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48969 return V16HImode;
48970 else
48971 return V8HImode;
48973 case E_SImode:
48974 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48975 return V16SImode;
48976 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48977 return V8SImode;
48978 else
48979 return V4SImode;
48981 case E_DImode:
48982 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48983 return V8DImode;
48984 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48985 return V4DImode;
48986 else
48987 return V2DImode;
48989 case E_SFmode:
48990 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48991 return V16SFmode;
48992 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48993 return V8SFmode;
48994 else
48995 return V4SFmode;
48997 case E_DFmode:
48998 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48999 return V8DFmode;
49000 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49001 return V4DFmode;
49002 else if (TARGET_SSE2)
49003 return V2DFmode;
49004 /* FALLTHRU */
49006 default:
49007 return word_mode;
49011 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
49012 vectors. If AVX512F is enabled then try vectorizing with 512bit,
49013 256bit and 128bit vectors. */
49015 static void
49016 ix86_autovectorize_vector_sizes (vector_sizes *sizes)
49018 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49020 sizes->safe_push (64);
49021 sizes->safe_push (32);
49022 sizes->safe_push (16);
49024 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49026 sizes->safe_push (32);
49027 sizes->safe_push (16);
49031 /* Implementation of targetm.vectorize.get_mask_mode. */
49033 static opt_machine_mode
49034 ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size)
49036 unsigned elem_size = vector_size / nunits;
49038 /* Scalar mask case. */
49039 if ((TARGET_AVX512F && vector_size == 64)
49040 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
49042 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
49043 return smallest_int_mode_for_size (nunits);
49046 scalar_int_mode elem_mode
49047 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
49049 gcc_assert (elem_size * nunits == vector_size);
49051 return mode_for_vector (elem_mode, nunits);
49056 /* Return class of registers which could be used for pseudo of MODE
49057 and of class RCLASS for spilling instead of memory. Return NO_REGS
49058 if it is not possible or non-profitable. */
49060 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
49062 static reg_class_t
49063 ix86_spill_class (reg_class_t rclass, machine_mode mode)
49065 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
49066 && TARGET_SSE2
49067 && TARGET_INTER_UNIT_MOVES_TO_VEC
49068 && TARGET_INTER_UNIT_MOVES_FROM_VEC
49069 && (mode == SImode || (TARGET_64BIT && mode == DImode))
49070 && INTEGER_CLASS_P (rclass))
49071 return ALL_SSE_REGS;
49072 return NO_REGS;
49075 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
49076 but returns a lower bound. */
49078 static unsigned int
49079 ix86_max_noce_ifcvt_seq_cost (edge e)
49081 bool predictable_p = predictable_edge_p (e);
49083 enum compiler_param param
49084 = (predictable_p
49085 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
49086 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
49088 /* If we have a parameter set, use that, otherwise take a guess using
49089 BRANCH_COST. */
49090 if (global_options_set.x_param_values[param])
49091 return PARAM_VALUE (param);
49092 else
49093 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
49096 /* Return true if SEQ is a good candidate as a replacement for the
49097 if-convertible sequence described in IF_INFO. */
49099 static bool
49100 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
49102 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
49104 int cmov_cnt = 0;
49105 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
49106 Maybe we should allow even more conditional moves as long as they
49107 are used far enough not to stall the CPU, or also consider
49108 IF_INFO->TEST_BB succ edge probabilities. */
49109 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
49111 rtx set = single_set (insn);
49112 if (!set)
49113 continue;
49114 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
49115 continue;
49116 rtx src = SET_SRC (set);
49117 machine_mode mode = GET_MODE (src);
49118 if (GET_MODE_CLASS (mode) != MODE_INT
49119 && GET_MODE_CLASS (mode) != MODE_FLOAT)
49120 continue;
49121 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
49122 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
49123 continue;
49124 /* insn is CMOV or FCMOV. */
49125 if (++cmov_cnt > 1)
49126 return false;
49129 return default_noce_conversion_profitable_p (seq, if_info);
49132 /* Implement targetm.vectorize.init_cost. */
49134 static void *
49135 ix86_init_cost (struct loop *)
49137 unsigned *cost = XNEWVEC (unsigned, 3);
49138 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49139 return cost;
49142 /* Implement targetm.vectorize.add_stmt_cost. */
49144 static unsigned
49145 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49146 struct _stmt_vec_info *stmt_info, int misalign,
49147 enum vect_cost_model_location where)
49149 unsigned *cost = (unsigned *) data;
49150 unsigned retval = 0;
49152 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49153 int stmt_cost = -1;
49155 if ((kind == vector_stmt || kind == scalar_stmt)
49156 && stmt_info
49157 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
49159 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
49160 bool fp = false;
49161 machine_mode mode = TImode;
49163 if (vectype != NULL)
49165 fp = FLOAT_TYPE_P (vectype);
49166 mode = TYPE_MODE (vectype);
49168 /*machine_mode inner_mode = mode;
49169 if (VECTOR_MODE_P (mode))
49170 inner_mode = GET_MODE_INNER (mode);*/
49172 switch (subcode)
49174 case PLUS_EXPR:
49175 case POINTER_PLUS_EXPR:
49176 case MINUS_EXPR:
49177 if (kind == scalar_stmt)
49179 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49180 stmt_cost = ix86_cost->addss;
49181 else if (X87_FLOAT_MODE_P (mode))
49182 stmt_cost = ix86_cost->fadd;
49183 else
49184 stmt_cost = ix86_cost->add;
49186 else
49187 stmt_cost = ix86_vec_cost (mode,
49188 fp ? ix86_cost->addss
49189 : ix86_cost->sse_op,
49190 true);
49191 break;
49193 case MULT_EXPR:
49194 case WIDEN_MULT_EXPR:
49195 case MULT_HIGHPART_EXPR:
49196 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
49197 break;
49198 case FMA_EXPR:
49199 stmt_cost = ix86_vec_cost (mode,
49200 mode == SFmode ? ix86_cost->fmass
49201 : ix86_cost->fmasd,
49202 true);
49203 break;
49204 case NEGATE_EXPR:
49205 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49206 stmt_cost = ix86_cost->sse_op;
49207 else if (X87_FLOAT_MODE_P (mode))
49208 stmt_cost = ix86_cost->fchs;
49209 else if (VECTOR_MODE_P (mode))
49210 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49211 else
49212 stmt_cost = ix86_cost->add;
49213 break;
49214 case TRUNC_DIV_EXPR:
49215 case CEIL_DIV_EXPR:
49216 case FLOOR_DIV_EXPR:
49217 case ROUND_DIV_EXPR:
49218 case TRUNC_MOD_EXPR:
49219 case CEIL_MOD_EXPR:
49220 case FLOOR_MOD_EXPR:
49221 case RDIV_EXPR:
49222 case ROUND_MOD_EXPR:
49223 case EXACT_DIV_EXPR:
49224 stmt_cost = ix86_division_cost (ix86_cost, mode);
49225 break;
49227 case RSHIFT_EXPR:
49228 case LSHIFT_EXPR:
49229 case LROTATE_EXPR:
49230 case RROTATE_EXPR:
49232 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
49233 stmt_cost = ix86_shift_rotate_cost
49234 (ix86_cost, mode,
49235 TREE_CODE (op2) == INTEGER_CST,
49236 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
49237 true, false, false, NULL, NULL);
49239 break;
49240 case NOP_EXPR:
49241 stmt_cost = 0;
49242 break;
49244 case BIT_IOR_EXPR:
49245 case ABS_EXPR:
49246 case MIN_EXPR:
49247 case MAX_EXPR:
49248 case BIT_XOR_EXPR:
49249 case BIT_AND_EXPR:
49250 case BIT_NOT_EXPR:
49251 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49252 stmt_cost = ix86_cost->sse_op;
49253 else if (VECTOR_MODE_P (mode))
49254 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49255 else
49256 stmt_cost = ix86_cost->add;
49257 break;
49258 default:
49259 break;
49262 if (stmt_cost == -1)
49263 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49265 /* Penalize DFmode vector operations for Bonnell. */
49266 if (TARGET_BONNELL && kind == vector_stmt
49267 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49268 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49270 /* Statements in an inner loop relative to the loop being
49271 vectorized are weighted more heavily. The value here is
49272 arbitrary and could potentially be improved with analysis. */
49273 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49274 count *= 50; /* FIXME. */
49276 retval = (unsigned) (count * stmt_cost);
49278 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
49279 for Silvermont as it has an out-of-order integer pipeline and can execute
49280 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
49281 if ((TARGET_SILVERMONT || TARGET_INTEL)
49282 && stmt_info && stmt_info->stmt)
49284 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49285 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49286 retval = (retval * 17) / 10;
49289 cost[where] += retval;
49291 return retval;
49294 /* Implement targetm.vectorize.finish_cost. */
49296 static void
49297 ix86_finish_cost (void *data, unsigned *prologue_cost,
49298 unsigned *body_cost, unsigned *epilogue_cost)
49300 unsigned *cost = (unsigned *) data;
49301 *prologue_cost = cost[vect_prologue];
49302 *body_cost = cost[vect_body];
49303 *epilogue_cost = cost[vect_epilogue];
49306 /* Implement targetm.vectorize.destroy_cost_data. */
49308 static void
49309 ix86_destroy_cost_data (void *data)
49311 free (data);
49314 /* Validate target specific memory model bits in VAL. */
49316 static unsigned HOST_WIDE_INT
49317 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49319 enum memmodel model = memmodel_from_int (val);
49320 bool strong;
49322 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49323 |MEMMODEL_MASK)
49324 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49326 warning (OPT_Winvalid_memory_model,
49327 "unknown architecture specific memory model");
49328 return MEMMODEL_SEQ_CST;
49330 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49331 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49333 warning (OPT_Winvalid_memory_model,
49334 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49335 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49337 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49339 warning (OPT_Winvalid_memory_model,
49340 "HLE_RELEASE not used with RELEASE or stronger memory model");
49341 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49343 return val;
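/* Editorial note (not in the original source): the HLE bits validated
   above are the ones user code ORs into the memory model argument of the
   __atomic builtins, e.g. (assuming the documented __ATOMIC_HLE_* macros):

     while (__atomic_exchange_n (&lock, 1,
				 __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
       ;
     __atomic_store_n (&lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);

   Combining HLE_ACQUIRE with a weaker-than-acquire model (or HLE_RELEASE
   with a weaker-than-release model) is diagnosed and falls back to
   MEMMODEL_SEQ_CST plus the requested HLE bit.  */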
49346 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49347 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
49348 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49349 or the number of vecsize_mangle variants that should be emitted. */
49351 static int
49352 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49353 struct cgraph_simd_clone *clonei,
49354 tree base_type, int num)
49356 int ret = 1;
49358 if (clonei->simdlen
49359 && (clonei->simdlen < 2
49360 || clonei->simdlen > 1024
49361 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
49363 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49364 "unsupported simdlen %d", clonei->simdlen);
49365 return 0;
49368 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
49369 if (TREE_CODE (ret_type) != VOID_TYPE)
49370 switch (TYPE_MODE (ret_type))
49372 case E_QImode:
49373 case E_HImode:
49374 case E_SImode:
49375 case E_DImode:
49376 case E_SFmode:
49377 case E_DFmode:
49378 /* case E_SCmode: */
49379 /* case E_DCmode: */
49380 break;
49381 default:
49382 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49383 "unsupported return type %qT for simd\n", ret_type);
49384 return 0;
49387 tree t;
49388 int i;
49390 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
49391 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
49392 switch (TYPE_MODE (TREE_TYPE (t)))
49394 case E_QImode:
49395 case E_HImode:
49396 case E_SImode:
49397 case E_DImode:
49398 case E_SFmode:
49399 case E_DFmode:
49400 /* case E_SCmode: */
49401 /* case E_DCmode: */
49402 break;
49403 default:
49404 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49405 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
49406 return 0;
49409 if (!TREE_PUBLIC (node->decl))
49411 /* If the function isn't exported, we can pick up just one ISA
49412 for the clones. */
49413 if (TARGET_AVX512F)
49414 clonei->vecsize_mangle = 'e';
49415 else if (TARGET_AVX2)
49416 clonei->vecsize_mangle = 'd';
49417 else if (TARGET_AVX)
49418 clonei->vecsize_mangle = 'c';
49419 else
49420 clonei->vecsize_mangle = 'b';
49421 ret = 1;
49423 else
49425 clonei->vecsize_mangle = "bcde"[num];
49426 ret = 4;
49428 clonei->mask_mode = VOIDmode;
49429 switch (clonei->vecsize_mangle)
49431 case 'b':
49432 clonei->vecsize_int = 128;
49433 clonei->vecsize_float = 128;
49434 break;
49435 case 'c':
49436 clonei->vecsize_int = 128;
49437 clonei->vecsize_float = 256;
49438 break;
49439 case 'd':
49440 clonei->vecsize_int = 256;
49441 clonei->vecsize_float = 256;
49442 break;
49443 case 'e':
49444 clonei->vecsize_int = 512;
49445 clonei->vecsize_float = 512;
49446 if (TYPE_MODE (base_type) == QImode)
49447 clonei->mask_mode = DImode;
49448 else
49449 clonei->mask_mode = SImode;
49450 break;
49452 if (clonei->simdlen == 0)
49454 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
49455 clonei->simdlen = clonei->vecsize_int;
49456 else
49457 clonei->simdlen = clonei->vecsize_float;
49458 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
49460 else if (clonei->simdlen > 16)
49462 /* For compatibility with ICC, use the same upper bounds
49463 for simdlen. In particular, for CTYPE below, use the return type,
49464 unless the function returns void, in which case use the characteristic
49465 type. If it is possible for the given SIMDLEN to pass CTYPE values
49466 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
49467 for 64-bit code), accept that SIMDLEN; otherwise warn and don't
49468 emit the corresponding clone. */
49469 tree ctype = ret_type;
49470 if (TREE_CODE (ret_type) == VOID_TYPE)
49471 ctype = base_type;
49472 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
49473 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
49474 cnt /= clonei->vecsize_int;
49475 else
49476 cnt /= clonei->vecsize_float;
49477 if (cnt > (TARGET_64BIT ? 16 : 8))
49479 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49480 "unsupported simdlen %d", clonei->simdlen);
49481 return 0;
49484 return ret;
49487 /* Add target attribute to SIMD clone NODE if needed. */
49489 static void
49490 ix86_simd_clone_adjust (struct cgraph_node *node)
49492 const char *str = NULL;
49493 gcc_assert (node->decl == cfun->decl);
49494 switch (node->simdclone->vecsize_mangle)
49496 case 'b':
49497 if (!TARGET_SSE2)
49498 str = "sse2";
49499 break;
49500 case 'c':
49501 if (!TARGET_AVX)
49502 str = "avx";
49503 break;
49504 case 'd':
49505 if (!TARGET_AVX2)
49506 str = "avx2";
49507 break;
49508 case 'e':
49509 if (!TARGET_AVX512F)
49510 str = "avx512f";
49511 break;
49512 default:
49513 gcc_unreachable ();
49515 if (str == NULL)
49516 return;
49517 push_cfun (NULL);
49518 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
49519 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
49520 gcc_assert (ok);
49521 pop_cfun ();
49522 ix86_reset_previous_fndecl ();
49523 ix86_set_current_function (node->decl);
49526 /* If SIMD clone NODE can't be used in a vectorized loop
49527 in current function, return -1, otherwise return a badness of using it
49528 (0 if it is most desirable from vecsize_mangle point of view, 1
49529 slightly less desirable, etc.). */
49531 static int
49532 ix86_simd_clone_usable (struct cgraph_node *node)
49534 switch (node->simdclone->vecsize_mangle)
49536 case 'b':
49537 if (!TARGET_SSE2)
49538 return -1;
49539 if (!TARGET_AVX)
49540 return 0;
49541 return TARGET_AVX2 ? 2 : 1;
49542 case 'c':
49543 if (!TARGET_AVX)
49544 return -1;
49545 return TARGET_AVX2 ? 1 : 0;
49546 case 'd':
49547 if (!TARGET_AVX2)
49548 return -1;
49549 return 0;
49550 case 'e':
49551 if (!TARGET_AVX512F)
49552 return -1;
49553 return 0;
49554 default:
49555 gcc_unreachable ();
49559 /* This function adjusts the unroll factor based on
49560 the hardware capabilities. For example, bdver3 has
49561 a loop buffer which makes unrolling of smaller
49562 loops less important. This function decides the
49563 unroll factor using the number of memory references
49564 (the value 32 is used) as a heuristic. */
49566 static unsigned
49567 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
49569 basic_block *bbs;
49570 rtx_insn *insn;
49571 unsigned i;
49572 unsigned mem_count = 0;
49574 if (!TARGET_ADJUST_UNROLL)
49575 return nunroll;
49577 /* Count the number of memory references within the loop body.
49578 This value determines the unrolling factor for bdver3 and bdver4
49579 architectures. */
49580 subrtx_iterator::array_type array;
49581 bbs = get_loop_body (loop);
49582 for (i = 0; i < loop->num_nodes; i++)
49583 FOR_BB_INSNS (bbs[i], insn)
49584 if (NONDEBUG_INSN_P (insn))
49585 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
49586 if (const_rtx x = *iter)
49587 if (MEM_P (x))
49589 machine_mode mode = GET_MODE (x);
49590 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
49591 if (n_words > 4)
49592 mem_count += 2;
49593 else
49594 mem_count += 1;
49596 free (bbs);
49598 if (mem_count && mem_count <= 32)
49599 return 32 / mem_count;
49601 return nunroll;
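/* Editorial sketch (not in the original source): the heuristic above in
   isolation.  With N counted memory references (wide accesses counting
   double) the unroll factor becomes 32 / N, e.g. 4 references give a
   factor of 8; more than 32 references leave NUNROLL unchanged.  */

static inline unsigned
unroll_heuristic_sketch (unsigned nunroll, unsigned mem_count)
{
  if (mem_count && mem_count <= 32)
    return 32 / mem_count;
  return nunroll;
}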
49605 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
49607 static bool
49608 ix86_float_exceptions_rounding_supported_p (void)
49610 /* For x87 floating point with standard excess precision handling,
49611 there is no adddf3 pattern (since x87 floating point only has
49612 XFmode operations) so the default hook implementation gets this
49613 wrong. */
49614 return TARGET_80387 || TARGET_SSE_MATH;
49617 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
49619 static void
49620 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
49622 if (!TARGET_80387 && !TARGET_SSE_MATH)
49623 return;
49624 tree exceptions_var = create_tmp_var_raw (integer_type_node);
49625 if (TARGET_80387)
49627 tree fenv_index_type = build_index_type (size_int (6));
49628 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
49629 tree fenv_var = create_tmp_var_raw (fenv_type);
49630 TREE_ADDRESSABLE (fenv_var) = 1;
49631 tree fenv_ptr = build_pointer_type (fenv_type);
49632 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
49633 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
49634 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
49635 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
49636 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
49637 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
49638 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
49639 tree hold_fnclex = build_call_expr (fnclex, 0);
49640 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
49641 NULL_TREE, NULL_TREE);
49642 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
49643 hold_fnclex);
49644 *clear = build_call_expr (fnclex, 0);
49645 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
49646 tree fnstsw_call = build_call_expr (fnstsw, 0);
49647 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
49648 sw_var, fnstsw_call);
49649 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
49650 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
49651 exceptions_var, exceptions_x87);
49652 *update = build2 (COMPOUND_EXPR, integer_type_node,
49653 sw_mod, update_mod);
49654 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
49655 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
49657 if (TARGET_SSE_MATH)
49659 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
49660 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
49661 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
49662 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
49663 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
49664 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
49665 mxcsr_orig_var, stmxcsr_hold_call);
49666 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
49667 mxcsr_orig_var,
49668 build_int_cst (unsigned_type_node, 0x1f80));
49669 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
49670 build_int_cst (unsigned_type_node, 0xffffffc0));
49671 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
49672 mxcsr_mod_var, hold_mod_val);
49673 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49674 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
49675 hold_assign_orig, hold_assign_mod);
49676 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
49677 ldmxcsr_hold_call);
49678 if (*hold)
49679 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
49680 else
49681 *hold = hold_all;
49682 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49683 if (*clear)
49684 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
49685 ldmxcsr_clear_call);
49686 else
49687 *clear = ldmxcsr_clear_call;
49688 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
49689 tree exceptions_sse = fold_convert (integer_type_node,
49690 stxmcsr_update_call);
49691 if (*update)
49693 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
49694 exceptions_var, exceptions_sse);
49695 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
49696 exceptions_var, exceptions_mod);
49697 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
49698 exceptions_assign);
49700 else
49701 *update = build2 (MODIFY_EXPR, integer_type_node,
49702 exceptions_var, exceptions_sse);
49703 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
49704 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49705 ldmxcsr_update_call);
49707 tree atomic_feraiseexcept
49708 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
49709 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
49710 1, exceptions_var);
49711 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49712 atomic_feraiseexcept_call);
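/* Editorial sketch (not in the original source): the MXCSR value that the
   *hold sequence above installs, in plain C.  Bits 7-12 (0x1f80) are the
   exception mask bits and bits 0-5 (0x3f) are the sticky exception flags.  */

static inline unsigned int
mxcsr_hold_value_sketch (unsigned int mxcsr)
{
  /* Mask all exceptions and clear any flags already raised, matching the
     BIT_IOR_EXPR/BIT_AND_EXPR pair built above.  */
  return (mxcsr | 0x1f80) & 0xffffffc0;
}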
49715 /* Return mode to be used for bounds or VOIDmode
49716 if bounds are not supported. */
49718 static machine_mode
49719 ix86_mpx_bound_mode ()
49721 /* Do not support pointer checker if MPX
49722 is not enabled. */
49723 if (!TARGET_MPX)
49725 if (flag_check_pointer_bounds)
49726 warning (0, "Pointer Checker requires MPX support on this target."
49727 " Use the -mmpx option to enable MPX.");
49728 return VOIDmode;
49731 return BNDmode;
49734 /* Return constant used to statically initialize constant bounds.
49736 This function is used to create special bound values. For now
49737 only INIT bounds and NONE bounds are expected. More special
49738 values may be added later. */
49740 static tree
49741 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
49743 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
49744 : build_zero_cst (pointer_sized_int_node);
49745 tree high = ub ? build_zero_cst (pointer_sized_int_node)
49746 : build_minus_one_cst (pointer_sized_int_node);
49748 /* This function is supposed to be used to create INIT and
49749 NONE bounds only. */
49750 gcc_assert ((lb == 0 && ub == -1)
49751 || (lb == -1 && ub == 0));
49753 return build_complex (NULL, low, high);
49756 /* Generate a list of statements STMTS to initialize pointer bounds
49757 variable VAR with bounds LB and UB. Return the number of generated
49758 statements. */
49760 static int
49761 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
49763 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
49764 tree lhs, modify, var_p;
49766 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
49767 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
49769 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
49770 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
49771 append_to_statement_list (modify, stmts);
49773 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
49774 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
49775 TYPE_SIZE_UNIT (pointer_sized_int_node)));
49776 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
49777 append_to_statement_list (modify, stmts);
49779 return 2;
49782 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
49783 /* For i386, a common symbol is local only for non-PIE binaries. For
49784 x86-64, a common symbol is local only for non-PIE binaries or if the
49785 linker supports copy relocations in PIE binaries. */
49787 static bool
49788 ix86_binds_local_p (const_tree exp)
49790 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
49791 (!flag_pic
49792 || (TARGET_64BIT
49793 && HAVE_LD_PIE_COPYRELOC != 0)));
49795 #endif
49797 /* If MEM is in the form of [base+offset], extract the two parts
49798 of address and set to BASE and OFFSET, otherwise return false. */
49800 static bool
49801 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
49803 rtx addr;
49805 gcc_assert (MEM_P (mem));
49807 addr = XEXP (mem, 0);
49809 if (GET_CODE (addr) == CONST)
49810 addr = XEXP (addr, 0);
49812 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
49814 *base = addr;
49815 *offset = const0_rtx;
49816 return true;
49819 if (GET_CODE (addr) == PLUS
49820 && (REG_P (XEXP (addr, 0))
49821 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49822 && CONST_INT_P (XEXP (addr, 1)))
49824 *base = XEXP (addr, 0);
49825 *offset = XEXP (addr, 1);
49826 return true;
49829 return false;
49832 /* Given OPERANDS of consecutive load/store, check if we can merge
49833 them into move multiple. LOAD is true if they are load instructions.
49834 MODE is the mode of memory operands. */
49836 bool
49837 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
49838 machine_mode mode)
49840 HOST_WIDE_INT offval_1, offval_2, msize;
49841 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
49843 if (load)
49845 mem_1 = operands[1];
49846 mem_2 = operands[3];
49847 reg_1 = operands[0];
49848 reg_2 = operands[2];
49850 else
49852 mem_1 = operands[0];
49853 mem_2 = operands[2];
49854 reg_1 = operands[1];
49855 reg_2 = operands[3];
49858 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
49860 if (REGNO (reg_1) != REGNO (reg_2))
49861 return false;
49863 /* Check if the addresses are in the form of [base+offset]. */
49864 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
49865 return false;
49866 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
49867 return false;
49869 /* Check if the bases are the same. */
49870 if (!rtx_equal_p (base_1, base_2))
49871 return false;
49873 offval_1 = INTVAL (offset_1);
49874 offval_2 = INTVAL (offset_2);
49875 msize = GET_MODE_SIZE (mode);
49876 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
49877 if (offval_1 + msize != offval_2)
49878 return false;
49880 return true;
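/* Editorial sketch (not in the original source): the adjacency condition
   checked above, in scalar form.  Two same-base accesses of size MSIZE at
   offsets OFF1 and OFF2 are mergeable only if the first one ends exactly
   where the second one starts.  */

static inline bool
adjacent_mem_offsets_sketch (long long off1, long long off2, long long msize)
{
  return off1 + msize == off2;
}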
49883 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
49885 static bool
49886 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
49887 optimization_type opt_type)
49889 switch (op)
49891 case asin_optab:
49892 case acos_optab:
49893 case log1p_optab:
49894 case exp_optab:
49895 case exp10_optab:
49896 case exp2_optab:
49897 case expm1_optab:
49898 case ldexp_optab:
49899 case scalb_optab:
49900 case round_optab:
49901 return opt_type == OPTIMIZE_FOR_SPEED;
49903 case rint_optab:
49904 if (SSE_FLOAT_MODE_P (mode1)
49905 && TARGET_SSE_MATH
49906 && !flag_trapping_math
49907 && !TARGET_SSE4_1)
49908 return opt_type == OPTIMIZE_FOR_SPEED;
49909 return true;
49911 case floor_optab:
49912 case ceil_optab:
49913 case btrunc_optab:
49914 if (SSE_FLOAT_MODE_P (mode1)
49915 && TARGET_SSE_MATH
49916 && !flag_trapping_math
49917 && TARGET_SSE4_1)
49918 return true;
49919 return opt_type == OPTIMIZE_FOR_SPEED;
49921 case rsqrt_optab:
49922 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
49924 default:
49925 return true;
49929 /* Address space support.
49931 This is not "far pointers" in the 16-bit sense, but an easy way
49932 to use %fs and %gs segment prefixes. Therefore:
49934 (a) All address spaces have the same modes,
49935 (b) All address spaces have the same address forms,
49936 (c) While %fs and %gs are technically subsets of the generic
49937 address space, they are probably not subsets of each other.
49938 (d) Since we have no access to the segment base register values
49939 without resorting to a system call, we cannot convert a
49940 non-default address space to a default address space.
49941 Therefore we do not claim %fs or %gs are subsets of generic.
49943 Therefore we can (mostly) use the default hooks. */
49945 /* All use of segmentation is assumed to make address 0 valid. */
49947 static bool
49948 ix86_addr_space_zero_address_valid (addr_space_t as)
49950 return as != ADDR_SPACE_GENERIC;
49953 static void
49954 ix86_init_libfuncs (void)
49956 if (TARGET_64BIT)
49958 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
49959 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
49961 else
49963 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
49964 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
49967 #if TARGET_MACHO
49968 darwin_rename_builtins ();
49969 #endif
49972 /* Generate call to __divmoddi4. */
49974 static void
49975 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
49976 rtx op0, rtx op1,
49977 rtx *quot_p, rtx *rem_p)
49979 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
49981 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
49982 mode,
49983 op0, GET_MODE (op0),
49984 op1, GET_MODE (op1),
49985 XEXP (rem, 0), Pmode);
49986 *quot_p = quot;
49987 *rem_p = rem;
49990 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
49991 FPU, assume that the fpcw is set to extended precision; when using
49992 only SSE, rounding is correct; when using both SSE and the FPU,
49993 the rounding precision is indeterminate, since either may be chosen
49994 apparently at random. */
49996 static enum flt_eval_method
49997 ix86_excess_precision (enum excess_precision_type type)
49999 switch (type)
50001 case EXCESS_PRECISION_TYPE_FAST:
50002 /* The fastest type to promote to will always be the native type,
50003 whether that occurs with implicit excess precision or
50004 otherwise. */
50005 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
50006 case EXCESS_PRECISION_TYPE_STANDARD:
50007 case EXCESS_PRECISION_TYPE_IMPLICIT:
50008 /* Otherwise, the excess precision we want when we are
50009 in a standards compliant mode, and the implicit precision we
50010 provide would be identical were it not for the unpredictable
50011 cases. */
50012 if (!TARGET_80387)
50013 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
50014 else if (!TARGET_MIX_SSE_I387)
50016 if (!TARGET_SSE_MATH)
50017 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
50018 else if (TARGET_SSE2)
50019 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
50022 /* If we are in standards compliant mode, but we know we will
50023 calculate in unpredictable precision, return
50024 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
50025 excess precision if the target can't guarantee it will honor
50026 it. */
50027 return (type == EXCESS_PRECISION_TYPE_STANDARD
50028 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
50029 : FLT_EVAL_METHOD_UNPREDICTABLE);
50030 default:
50031 gcc_unreachable ();
50034 return FLT_EVAL_METHOD_UNPREDICTABLE;
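/* Illustrative sketch, not part of the original file: what the returned
   FLT_EVAL_METHOD values mean for user code (the function name is
   hypothetical).  With 387 math (FLT_EVAL_METHOD == 2) the product below
   is kept in long double precision and rounded to float only at the
   return; with SSE2 math (FLT_EVAL_METHOD == 0) each operation is rounded
   to float immediately, which can give a different result.  */

static float
example_excess_precision (float a, float b, float c)
{
  return a * b + c;
}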
50037 /* Implement PUSH_ROUNDING. On the 386 we have a pushw instruction that
50038 decrements the stack pointer by exactly 2 regardless of alignment; there is no pushb.
50040 But since the CIE data alignment factor on this architecture is -4 for 32-bit targets
50041 and -8 for 64-bit targets, we need to make sure all stack pointer adjustments
50042 are multiples of 4 for 32-bit targets and 8 for 64-bit targets. */
50044 poly_int64
50045 ix86_push_rounding (poly_int64 bytes)
50047 return ROUND_UP (bytes, UNITS_PER_WORD);
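/* Illustrative sketch, not part of the original file: a minimal model of
   rounding up to a power-of-two boundary, as GCC's ROUND_UP macro does
   above (the function name is hypothetical).  With UNITS_PER_WORD == 8
   (64-bit) a push of 1..8 bytes adjusts the stack pointer by 8 and a push
   of 9..16 bytes by 16; with UNITS_PER_WORD == 4 (32-bit) adjustments are
   multiples of 4, matching the CIE data alignment factors noted above.  */

static inline unsigned HOST_WIDE_INT
example_round_up (unsigned HOST_WIDE_INT bytes, unsigned HOST_WIDE_INT align)
{
  /* Valid for power-of-two ALIGN: -align equals ~(align - 1) in two's
     complement arithmetic.  */
  return (bytes + align - 1) & -align;
}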
50050 /* Target-specific selftests. */
50052 #if CHECKING_P
50054 namespace selftest {
50056 /* Verify that hard regs are dumped as expected (in compact mode). */
50058 static void
50059 ix86_test_dumping_hard_regs ()
50061 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
50062 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
50065 /* Test dumping an insn with repeated references to the same SCRATCH,
50066 to verify the rtx_reuse code. */
50068 static void
50069 ix86_test_dumping_memory_blockage ()
50071 set_new_first_and_last_insn (NULL, NULL);
50073 rtx pat = gen_memory_blockage ();
50074 rtx_reuse_manager r;
50075 r.preprocess (pat);
50077 /* Verify that the repeated references to the SCRATCH are shown using
50078 reuse IDs. The first should be prefixed with a reuse ID,
50079 and the second should be dumped as a "reuse_rtx" of that ID.
50080 The expected string assumes Pmode == DImode. */
50081 if (Pmode == DImode)
50082 ASSERT_RTL_DUMP_EQ_WITH_REUSE
50083 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
50084 " (unspec:BLK [\n"
50085 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
50086 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
50089 /* Verify loading an RTL dump; specifically a dump of copying
50090 a param on x86_64 from a hard reg into the frame.
50091 This test is target-specific since the dump contains target-specific
50092 hard reg names. */
50094 static void
50095 ix86_test_loading_dump_fragment_1 ()
50097 rtl_dump_test t (SELFTEST_LOCATION,
50098 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
50100 rtx_insn *insn = get_insn_by_uid (1);
50102 /* The block structure and indentation here are purely for
50103 readability; it mirrors the structure of the rtx. */
50104 tree mem_expr;
50106 rtx pat = PATTERN (insn);
50107 ASSERT_EQ (SET, GET_CODE (pat));
50109 rtx dest = SET_DEST (pat);
50110 ASSERT_EQ (MEM, GET_CODE (dest));
50111 /* Verify the "/c" was parsed. */
50112 ASSERT_TRUE (RTX_FLAG (dest, call));
50113 ASSERT_EQ (SImode, GET_MODE (dest));
50115 rtx addr = XEXP (dest, 0);
50116 ASSERT_EQ (PLUS, GET_CODE (addr));
50117 ASSERT_EQ (DImode, GET_MODE (addr));
50119 rtx lhs = XEXP (addr, 0);
50120 /* Verify that the "frame" REG was consolidated. */
50121 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
50124 rtx rhs = XEXP (addr, 1);
50125 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
50126 ASSERT_EQ (-4, INTVAL (rhs));
50129 /* Verify the "[1 i+0 S4 A32]" was parsed. */
50130 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
50131 /* "i" should have been handled by synthesizing a global int
50132 variable named "i". */
50133 mem_expr = MEM_EXPR (dest);
50134 ASSERT_NE (mem_expr, NULL);
50135 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
50136 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
50137 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
50138 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
50139 /* "+0". */
50140 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
50141 ASSERT_EQ (0, MEM_OFFSET (dest));
50142 /* "S4". */
50143 ASSERT_EQ (4, MEM_SIZE (dest));
50144 /* "A32. */
50145 ASSERT_EQ (32, MEM_ALIGN (dest));
50148 rtx src = SET_SRC (pat);
50149 ASSERT_EQ (REG, GET_CODE (src));
50150 ASSERT_EQ (SImode, GET_MODE (src));
50151 ASSERT_EQ (5, REGNO (src));
50152 tree reg_expr = REG_EXPR (src);
50153 /* "i" here should point to the same var as for the MEM_EXPR. */
50154 ASSERT_EQ (reg_expr, mem_expr);
50159 /* Verify that the RTL loader copes with a call_insn dump.
50160 This test is target-specific since the dump contains a target-specific
50161 hard reg name. */
50163 static void
50164 ix86_test_loading_call_insn ()
50166 /* The test dump includes register "xmm0", which requires TARGET_SSE
50167 to exist. */
50168 if (!TARGET_SSE)
50169 return;
50171 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
50173 rtx_insn *insn = get_insns ();
50174 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
50176 /* "/j". */
50177 ASSERT_TRUE (RTX_FLAG (insn, jump));
50179 rtx pat = PATTERN (insn);
50180 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
50182 /* Verify REG_NOTES. */
50184 /* "(expr_list:REG_CALL_DECL". */
50185 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
50186 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
50187 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
50189 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
50190 rtx_expr_list *note1 = note0->next ();
50191 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
50193 ASSERT_EQ (NULL, note1->next ());
50196 /* Verify CALL_INSN_FUNCTION_USAGE. */
50198 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
50199 rtx_expr_list *usage
50200 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
50201 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
50202 ASSERT_EQ (DFmode, GET_MODE (usage));
50203 ASSERT_EQ (USE, GET_CODE (usage->element ()));
50204 ASSERT_EQ (NULL, usage->next ());
50208 /* Verify that the RTL loader copes with a dump from print_rtx_function.
50209 This test is target-specific since the dump contains target-specific
50210 hard reg names. */
50212 static void
50213 ix86_test_loading_full_dump ()
50215 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
50217 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50219 rtx_insn *insn_1 = get_insn_by_uid (1);
50220 ASSERT_EQ (NOTE, GET_CODE (insn_1));
50222 rtx_insn *insn_7 = get_insn_by_uid (7);
50223 ASSERT_EQ (INSN, GET_CODE (insn_7));
50224 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
50226 rtx_insn *insn_15 = get_insn_by_uid (15);
50227 ASSERT_EQ (INSN, GET_CODE (insn_15));
50228 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
50230 /* Verify crtl->return_rtx. */
50231 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
50232 ASSERT_EQ (0, REGNO (crtl->return_rtx));
50233 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
50236 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
50237 In particular, verify that it correctly loads the 2nd operand.
50238 This test is target-specific since these are machine-specific
50239 operands (and enums). */
50241 static void
50242 ix86_test_loading_unspec ()
50244 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
50246 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50248 ASSERT_TRUE (cfun);
50250 /* Test of an UNSPEC. */
50251 rtx_insn *insn = get_insns ();
50252 ASSERT_EQ (INSN, GET_CODE (insn));
50253 rtx set = single_set (insn);
50254 ASSERT_NE (NULL, set);
50255 rtx dst = SET_DEST (set);
50256 ASSERT_EQ (MEM, GET_CODE (dst));
50257 rtx src = SET_SRC (set);
50258 ASSERT_EQ (UNSPEC, GET_CODE (src));
50259 ASSERT_EQ (BLKmode, GET_MODE (src));
50260 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
50262 rtx v0 = XVECEXP (src, 0, 0);
50264 /* Verify that the two uses of the first SCRATCH have pointer
50265 equality. */
50266 rtx scratch_a = XEXP (dst, 0);
50267 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
50269 rtx scratch_b = XEXP (v0, 0);
50270 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
50272 ASSERT_EQ (scratch_a, scratch_b);
50274 /* Verify that the two mems are thus treated as equal. */
50275 ASSERT_TRUE (rtx_equal_p (dst, v0));
50277 /* Verify that the insn is recognized. */
50278 ASSERT_NE (-1, recog_memoized (insn));
50280 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
50281 insn = NEXT_INSN (insn);
50282 ASSERT_EQ (INSN, GET_CODE (insn));
50284 set = single_set (insn);
50285 ASSERT_NE (NULL, set);
50287 src = SET_SRC (set);
50288 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
50289 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
50292 /* Run all target-specific selftests. */
50294 static void
50295 ix86_run_selftests (void)
50297 ix86_test_dumping_hard_regs ();
50298 ix86_test_dumping_memory_blockage ();
50300 /* Various tests of loading RTL dumps, here because they contain
50301 ix86-isms (e.g. names of hard regs). */
50302 ix86_test_loading_dump_fragment_1 ();
50303 ix86_test_loading_call_insn ();
50304 ix86_test_loading_full_dump ();
50305 ix86_test_loading_unspec ();
50308 } // namespace selftest
50310 #endif /* CHECKING_P */
50312 /* Initialize the GCC target structure. */
50313 #undef TARGET_RETURN_IN_MEMORY
50314 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
50316 #undef TARGET_LEGITIMIZE_ADDRESS
50317 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
50319 #undef TARGET_ATTRIBUTE_TABLE
50320 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
50321 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
50322 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
50323 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50324 # undef TARGET_MERGE_DECL_ATTRIBUTES
50325 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
50326 #endif
50328 #undef TARGET_COMP_TYPE_ATTRIBUTES
50329 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
50331 #undef TARGET_INIT_BUILTINS
50332 #define TARGET_INIT_BUILTINS ix86_init_builtins
50333 #undef TARGET_BUILTIN_DECL
50334 #define TARGET_BUILTIN_DECL ix86_builtin_decl
50335 #undef TARGET_EXPAND_BUILTIN
50336 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
50338 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
50339 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
50340 ix86_builtin_vectorized_function
50342 #undef TARGET_VECTORIZE_BUILTIN_GATHER
50343 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
50345 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
50346 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
50348 #undef TARGET_BUILTIN_RECIPROCAL
50349 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
50351 #undef TARGET_ASM_FUNCTION_EPILOGUE
50352 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
50354 #undef TARGET_ENCODE_SECTION_INFO
50355 #ifndef SUBTARGET_ENCODE_SECTION_INFO
50356 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
50357 #else
50358 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
50359 #endif
50361 #undef TARGET_ASM_OPEN_PAREN
50362 #define TARGET_ASM_OPEN_PAREN ""
50363 #undef TARGET_ASM_CLOSE_PAREN
50364 #define TARGET_ASM_CLOSE_PAREN ""
50366 #undef TARGET_ASM_BYTE_OP
50367 #define TARGET_ASM_BYTE_OP ASM_BYTE
50369 #undef TARGET_ASM_ALIGNED_HI_OP
50370 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
50371 #undef TARGET_ASM_ALIGNED_SI_OP
50372 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
50373 #ifdef ASM_QUAD
50374 #undef TARGET_ASM_ALIGNED_DI_OP
50375 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
50376 #endif
50378 #undef TARGET_PROFILE_BEFORE_PROLOGUE
50379 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
50381 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
50382 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
50384 #undef TARGET_ASM_UNALIGNED_HI_OP
50385 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
50386 #undef TARGET_ASM_UNALIGNED_SI_OP
50387 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
50388 #undef TARGET_ASM_UNALIGNED_DI_OP
50389 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
50391 #undef TARGET_PRINT_OPERAND
50392 #define TARGET_PRINT_OPERAND ix86_print_operand
50393 #undef TARGET_PRINT_OPERAND_ADDRESS
50394 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
50395 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
50396 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
50397 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
50398 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
50400 #undef TARGET_SCHED_INIT_GLOBAL
50401 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
50402 #undef TARGET_SCHED_ADJUST_COST
50403 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
50404 #undef TARGET_SCHED_ISSUE_RATE
50405 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
50406 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
50407 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
50408 ia32_multipass_dfa_lookahead
50409 #undef TARGET_SCHED_MACRO_FUSION_P
50410 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
50411 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
50412 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
50414 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
50415 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
50417 #undef TARGET_MEMMODEL_CHECK
50418 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
50420 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
50421 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
50423 #ifdef HAVE_AS_TLS
50424 #undef TARGET_HAVE_TLS
50425 #define TARGET_HAVE_TLS true
50426 #endif
50427 #undef TARGET_CANNOT_FORCE_CONST_MEM
50428 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
50429 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
50430 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
50432 #undef TARGET_DELEGITIMIZE_ADDRESS
50433 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
50435 #undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
50436 #define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
50438 #undef TARGET_MS_BITFIELD_LAYOUT_P
50439 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
50441 #if TARGET_MACHO
50442 #undef TARGET_BINDS_LOCAL_P
50443 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
50444 #else
50445 #undef TARGET_BINDS_LOCAL_P
50446 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
50447 #endif
50448 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50449 #undef TARGET_BINDS_LOCAL_P
50450 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
50451 #endif
50453 #undef TARGET_ASM_OUTPUT_MI_THUNK
50454 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
50455 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
50456 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
50458 #undef TARGET_ASM_FILE_START
50459 #define TARGET_ASM_FILE_START x86_file_start
50461 #undef TARGET_OPTION_OVERRIDE
50462 #define TARGET_OPTION_OVERRIDE ix86_option_override
50464 #undef TARGET_REGISTER_MOVE_COST
50465 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
50466 #undef TARGET_MEMORY_MOVE_COST
50467 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
50468 #undef TARGET_RTX_COSTS
50469 #define TARGET_RTX_COSTS ix86_rtx_costs
50470 #undef TARGET_ADDRESS_COST
50471 #define TARGET_ADDRESS_COST ix86_address_cost
50473 #undef TARGET_FLAGS_REGNUM
50474 #define TARGET_FLAGS_REGNUM FLAGS_REG
50475 #undef TARGET_FIXED_CONDITION_CODE_REGS
50476 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
50477 #undef TARGET_CC_MODES_COMPATIBLE
50478 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
50480 #undef TARGET_MACHINE_DEPENDENT_REORG
50481 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
50483 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
50484 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
50486 #undef TARGET_BUILD_BUILTIN_VA_LIST
50487 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
50489 #undef TARGET_FOLD_BUILTIN
50490 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
50492 #undef TARGET_GIMPLE_FOLD_BUILTIN
50493 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
50495 #undef TARGET_COMPARE_VERSION_PRIORITY
50496 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
50498 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
50499 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
50500 ix86_generate_version_dispatcher_body
50502 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
50503 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
50504 ix86_get_function_versions_dispatcher
50506 #undef TARGET_ENUM_VA_LIST_P
50507 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
50509 #undef TARGET_FN_ABI_VA_LIST
50510 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
50512 #undef TARGET_CANONICAL_VA_LIST_TYPE
50513 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
50515 #undef TARGET_EXPAND_BUILTIN_VA_START
50516 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
50518 #undef TARGET_MD_ASM_ADJUST
50519 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
50521 #undef TARGET_C_EXCESS_PRECISION
50522 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
50523 #undef TARGET_PROMOTE_PROTOTYPES
50524 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
50525 #undef TARGET_SETUP_INCOMING_VARARGS
50526 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
50527 #undef TARGET_MUST_PASS_IN_STACK
50528 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
50529 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
50530 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
50531 #undef TARGET_FUNCTION_ARG_ADVANCE
50532 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
50533 #undef TARGET_FUNCTION_ARG
50534 #define TARGET_FUNCTION_ARG ix86_function_arg
50535 #undef TARGET_INIT_PIC_REG
50536 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
50537 #undef TARGET_USE_PSEUDO_PIC_REG
50538 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
50539 #undef TARGET_FUNCTION_ARG_BOUNDARY
50540 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
50541 #undef TARGET_PASS_BY_REFERENCE
50542 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
50543 #undef TARGET_INTERNAL_ARG_POINTER
50544 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
50545 #undef TARGET_UPDATE_STACK_BOUNDARY
50546 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
50547 #undef TARGET_GET_DRAP_RTX
50548 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
50549 #undef TARGET_STRICT_ARGUMENT_NAMING
50550 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
50551 #undef TARGET_STATIC_CHAIN
50552 #define TARGET_STATIC_CHAIN ix86_static_chain
50553 #undef TARGET_TRAMPOLINE_INIT
50554 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
50555 #undef TARGET_RETURN_POPS_ARGS
50556 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
50558 #undef TARGET_WARN_FUNC_RETURN
50559 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
50561 #undef TARGET_LEGITIMATE_COMBINED_INSN
50562 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
50564 #undef TARGET_ASAN_SHADOW_OFFSET
50565 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
50567 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
50568 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
50570 #undef TARGET_SCALAR_MODE_SUPPORTED_P
50571 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
50573 #undef TARGET_VECTOR_MODE_SUPPORTED_P
50574 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
50576 #undef TARGET_C_MODE_FOR_SUFFIX
50577 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
50579 #ifdef HAVE_AS_TLS
50580 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
50581 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
50582 #endif
50584 #ifdef SUBTARGET_INSERT_ATTRIBUTES
50585 #undef TARGET_INSERT_ATTRIBUTES
50586 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
50587 #endif
50589 #undef TARGET_MANGLE_TYPE
50590 #define TARGET_MANGLE_TYPE ix86_mangle_type
50592 #undef TARGET_STACK_PROTECT_GUARD
50593 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
50595 #if !TARGET_MACHO
50596 #undef TARGET_STACK_PROTECT_FAIL
50597 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
50598 #endif
50600 #undef TARGET_FUNCTION_VALUE
50601 #define TARGET_FUNCTION_VALUE ix86_function_value
50603 #undef TARGET_FUNCTION_VALUE_REGNO_P
50604 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
50606 #undef TARGET_PROMOTE_FUNCTION_MODE
50607 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50609 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50610 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50612 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50613 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50615 #undef TARGET_INSTANTIATE_DECLS
50616 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50618 #undef TARGET_SECONDARY_RELOAD
50619 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50620 #undef TARGET_SECONDARY_MEMORY_NEEDED
50621 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
50622 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
50623 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
50625 #undef TARGET_CLASS_MAX_NREGS
50626 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50628 #undef TARGET_PREFERRED_RELOAD_CLASS
50629 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50630 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50631 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50632 #undef TARGET_CLASS_LIKELY_SPILLED_P
50633 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
50635 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50636 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50637 ix86_builtin_vectorization_cost
50638 #undef TARGET_VECTORIZE_VEC_PERM_CONST
50639 #define TARGET_VECTORIZE_VEC_PERM_CONST ix86_vectorize_vec_perm_const
50640 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50641 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50642 ix86_preferred_simd_mode
50643 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50644 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50645 ix86_autovectorize_vector_sizes
50646 #undef TARGET_VECTORIZE_GET_MASK_MODE
50647 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50648 #undef TARGET_VECTORIZE_INIT_COST
50649 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50650 #undef TARGET_VECTORIZE_ADD_STMT_COST
50651 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50652 #undef TARGET_VECTORIZE_FINISH_COST
50653 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50654 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50655 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50657 #undef TARGET_SET_CURRENT_FUNCTION
50658 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
50660 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
50661 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
50663 #undef TARGET_OPTION_SAVE
50664 #define TARGET_OPTION_SAVE ix86_function_specific_save
50666 #undef TARGET_OPTION_RESTORE
50667 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
50669 #undef TARGET_OPTION_POST_STREAM_IN
50670 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
50672 #undef TARGET_OPTION_PRINT
50673 #define TARGET_OPTION_PRINT ix86_function_specific_print
50675 #undef TARGET_OPTION_FUNCTION_VERSIONS
50676 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
50678 #undef TARGET_CAN_INLINE_P
50679 #define TARGET_CAN_INLINE_P ix86_can_inline_p
50681 #undef TARGET_LEGITIMATE_ADDRESS_P
50682 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
50684 #undef TARGET_REGISTER_PRIORITY
50685 #define TARGET_REGISTER_PRIORITY ix86_register_priority
50687 #undef TARGET_REGISTER_USAGE_LEVELING_P
50688 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
50690 #undef TARGET_LEGITIMATE_CONSTANT_P
50691 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
50693 #undef TARGET_COMPUTE_FRAME_LAYOUT
50694 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
50696 #undef TARGET_FRAME_POINTER_REQUIRED
50697 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
50699 #undef TARGET_CAN_ELIMINATE
50700 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
50702 #undef TARGET_EXTRA_LIVE_ON_ENTRY
50703 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
50705 #undef TARGET_ASM_CODE_END
50706 #define TARGET_ASM_CODE_END ix86_code_end
50708 #undef TARGET_CONDITIONAL_REGISTER_USAGE
50709 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
50711 #undef TARGET_CANONICALIZE_COMPARISON
50712 #define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison
50714 #undef TARGET_LOOP_UNROLL_ADJUST
50715 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
50717 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50718 #undef TARGET_SPILL_CLASS
50719 #define TARGET_SPILL_CLASS ix86_spill_class
50721 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
50722 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
50723 ix86_simd_clone_compute_vecsize_and_simdlen
50725 #undef TARGET_SIMD_CLONE_ADJUST
50726 #define TARGET_SIMD_CLONE_ADJUST \
50727 ix86_simd_clone_adjust
50729 #undef TARGET_SIMD_CLONE_USABLE
50730 #define TARGET_SIMD_CLONE_USABLE \
50731 ix86_simd_clone_usable
50733 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
50734 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
50735 ix86_float_exceptions_rounding_supported_p
50737 #undef TARGET_MODE_EMIT
50738 #define TARGET_MODE_EMIT ix86_emit_mode_set
50740 #undef TARGET_MODE_NEEDED
50741 #define TARGET_MODE_NEEDED ix86_mode_needed
50743 #undef TARGET_MODE_AFTER
50744 #define TARGET_MODE_AFTER ix86_mode_after
50746 #undef TARGET_MODE_ENTRY
50747 #define TARGET_MODE_ENTRY ix86_mode_entry
50749 #undef TARGET_MODE_EXIT
50750 #define TARGET_MODE_EXIT ix86_mode_exit
50752 #undef TARGET_MODE_PRIORITY
50753 #define TARGET_MODE_PRIORITY ix86_mode_priority
50755 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
50756 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
50758 #undef TARGET_LOAD_BOUNDS_FOR_ARG
50759 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
50761 #undef TARGET_STORE_BOUNDS_FOR_ARG
50762 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
50764 #undef TARGET_LOAD_RETURNED_BOUNDS
50765 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
50767 #undef TARGET_STORE_RETURNED_BOUNDS
50768 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
50770 #undef TARGET_CHKP_BOUND_MODE
50771 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
50773 #undef TARGET_BUILTIN_CHKP_FUNCTION
50774 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
50776 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
50777 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
50779 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
50780 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
50782 #undef TARGET_CHKP_INITIALIZE_BOUNDS
50783 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
50785 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
50786 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
50788 #undef TARGET_OFFLOAD_OPTIONS
50789 #define TARGET_OFFLOAD_OPTIONS \
50790 ix86_offload_options
50792 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
50793 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
50795 #undef TARGET_OPTAB_SUPPORTED_P
50796 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
50798 #undef TARGET_HARD_REGNO_SCRATCH_OK
50799 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
50801 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
50802 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
50804 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
50805 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
50807 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
50808 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
50810 #undef TARGET_INIT_LIBFUNCS
50811 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
50813 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
50814 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
50816 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
50817 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
50819 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
50820 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
50822 #undef TARGET_HARD_REGNO_NREGS
50823 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
50824 #undef TARGET_HARD_REGNO_MODE_OK
50825 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
50827 #undef TARGET_MODES_TIEABLE_P
50828 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
50830 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
50831 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
50832 ix86_hard_regno_call_part_clobbered
50834 #undef TARGET_CAN_CHANGE_MODE_CLASS
50835 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
50837 #undef TARGET_STATIC_RTX_ALIGNMENT
50838 #define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
50839 #undef TARGET_CONSTANT_ALIGNMENT
50840 #define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment
50842 #undef TARGET_EMPTY_RECORD_P
50843 #define TARGET_EMPTY_RECORD_P ix86_is_empty_record
50845 #undef TARGET_WARN_PARAMETER_PASSING_ABI
50846 #define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi
50848 #if CHECKING_P
50849 #undef TARGET_RUN_TARGET_SELFTESTS
50850 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
50851 #endif /* #if CHECKING_P */
50853 struct gcc_target targetm = TARGET_INITIALIZER;
50855 #include "gt-i386.h"