[official-gcc.git] / gcc / config / i386 / i386.c
blob: fef34a1ef64005b5a681f7488e4ad699e4bda7bf
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2018 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "memmodel.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "cgraph.h"
41 #include "diagnostic.h"
42 #include "cfgbuild.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "attribs.h"
46 #include "calls.h"
47 #include "stor-layout.h"
48 #include "varasm.h"
49 #include "output.h"
50 #include "insn-attr.h"
51 #include "flags.h"
52 #include "except.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "cfgrtl.h"
56 #include "common/common-target.h"
57 #include "langhooks.h"
58 #include "reload.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "tm-constrs.h"
62 #include "params.h"
63 #include "cselib.h"
64 #include "sched-int.h"
65 #include "opts.h"
66 #include "tree-pass.h"
67 #include "context.h"
68 #include "pass_manager.h"
69 #include "target-globals.h"
70 #include "gimple-iterator.h"
71 #include "tree-vectorizer.h"
72 #include "shrink-wrap.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tree-iterator.h"
76 #include "tree-chkp.h"
77 #include "rtl-chkp.h"
78 #include "dbgcnt.h"
79 #include "case-cfn-macros.h"
80 #include "regrename.h"
81 #include "dojump.h"
82 #include "fold-const-call.h"
83 #include "tree-vrp.h"
84 #include "tree-ssanames.h"
85 #include "selftest.h"
86 #include "selftest-rtl.h"
87 #include "print-rtl.h"
88 #include "intl.h"
89 #include "ifcvt.h"
90 #include "symbol-summary.h"
91 #include "ipa-prop.h"
92 #include "ipa-fnsummary.h"
94 /* This file should be included last. */
95 #include "target-def.h"
97 #include "x86-tune-costs.h"
99 static rtx legitimize_dllimport_symbol (rtx, bool);
100 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
101 static rtx legitimize_pe_coff_symbol (rtx, bool);
102 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
103 static bool ix86_save_reg (unsigned int, bool, bool);
104 static bool ix86_function_naked (const_tree);
105 static bool ix86_notrack_prefixed_insn_p (rtx);
106 static void ix86_emit_restore_reg_using_pop (rtx);
109 #ifndef CHECK_STACK_LIMIT
110 #define CHECK_STACK_LIMIT (-1)
111 #endif
113 /* Return the index of the given mode in the mult and division cost tables. */
114 #define MODE_INDEX(mode) \
115 ((mode) == QImode ? 0 \
116 : (mode) == HImode ? 1 \
117 : (mode) == SImode ? 2 \
118 : (mode) == DImode ? 3 \
119 : 4)
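/* For example, MODE_INDEX (SImode) is 2 and MODE_INDEX (DImode) is 3,
   so a per-mode cost table is read as cost_table[MODE_INDEX (mode)];
   any mode other than QImode/HImode/SImode/DImode falls through to the
   catch-all index 4.  (cost_table is only a placeholder name for
   whichever multiply or division cost array is being indexed.) */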
122 /* Set by -mtune. */
123 const struct processor_costs *ix86_tune_cost = NULL;
125 /* Set by -mtune or -Os. */
126 const struct processor_costs *ix86_cost = NULL;
128 /* Processor feature/optimization bitmasks. */
129 #define m_386 (1U<<PROCESSOR_I386)
130 #define m_486 (1U<<PROCESSOR_I486)
131 #define m_PENT (1U<<PROCESSOR_PENTIUM)
132 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
133 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
134 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
135 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
136 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
137 #define m_CORE2 (1U<<PROCESSOR_CORE2)
138 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
139 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
140 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
141 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
142 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
143 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
144 #define m_KNL (1U<<PROCESSOR_KNL)
145 #define m_KNM (1U<<PROCESSOR_KNM)
146 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
147 #define m_CANNONLAKE (1U<<PROCESSOR_CANNONLAKE)
148 #define m_INTEL (1U<<PROCESSOR_INTEL)
150 #define m_GEODE (1U<<PROCESSOR_GEODE)
151 #define m_K6 (1U<<PROCESSOR_K6)
152 #define m_K6_GEODE (m_K6 | m_GEODE)
153 #define m_K8 (1U<<PROCESSOR_K8)
154 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
155 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
156 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
157 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
158 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
159 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
160 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
161 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
162 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
163 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
164 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
165 #define m_BTVER (m_BTVER1 | m_BTVER2)
166 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
167 | m_ZNVER1)
169 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
171 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
172 #undef DEF_TUNE
173 #define DEF_TUNE(tune, name, selector) name,
174 #include "x86-tune.def"
175 #undef DEF_TUNE
178 /* Feature tests against the various tunings. */
179 unsigned char ix86_tune_features[X86_TUNE_LAST];
181 /* Feature tests against the various tunings used to create ix86_tune_features
182 based on the processor mask. */
183 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
184 #undef DEF_TUNE
185 #define DEF_TUNE(tune, name, selector) selector,
186 #include "x86-tune.def"
187 #undef DEF_TUNE
190 /* Feature tests against the various architecture variations. */
191 unsigned char ix86_arch_features[X86_ARCH_LAST];
193 /* Feature tests against the various architecture variations, used to create
194 ix86_arch_features based on the processor mask. */
195 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
196 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
197 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
199 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
200 ~m_386,
202 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
203 ~(m_386 | m_486),
205 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
206 ~m_386,
208 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
209 ~m_386,
212 /* If the average insn count for a single function invocation is
213 lower than this constant, emit fast (but longer) prologue and
214 epilogue code. */
215 #define FAST_PROLOGUE_INSN_COUNT 20
217 /* Names for 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
218 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
219 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
220 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
222 /* Array of the smallest class containing reg number REGNO, indexed by
223 REGNO. Used by REGNO_REG_CLASS in i386.h. */
225 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
227 /* ax, dx, cx, bx */
228 AREG, DREG, CREG, BREG,
229 /* si, di, bp, sp */
230 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
231 /* FP registers */
232 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
233 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
234 /* arg pointer */
235 NON_Q_REGS,
236 /* flags, fpsr, fpcr, frame */
237 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
238 /* SSE registers */
239 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
240 SSE_REGS, SSE_REGS,
241 /* MMX registers */
242 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
243 MMX_REGS, MMX_REGS,
244 /* REX registers */
245 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
246 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
247 /* SSE REX registers */
248 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
249 SSE_REGS, SSE_REGS,
250 /* AVX-512 SSE registers */
251 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
252 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
253 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
254 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
255 /* Mask registers. */
256 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
257 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
258 /* MPX bound registers */
259 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
262 /* The "default" register map used in 32bit mode. */
264 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
266 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
267 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
268 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
269 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
270 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
271 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
272 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
273 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
274 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
275 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
276 101, 102, 103, 104, /* bound registers */
279 /* The "default" register map used in 64bit mode. */
281 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
283 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
284 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
285 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
286 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
287 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
288 8,9,10,11,12,13,14,15, /* extended integer registers */
289 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
290 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
291 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
292 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
293 126, 127, 128, 129, /* bound registers */
296 /* Define the register numbers to be used in Dwarf debugging information.
297 The SVR4 reference port C compiler uses the following register numbers
298 in its Dwarf output code:
299 0 for %eax (gcc regno = 0)
300 1 for %ecx (gcc regno = 2)
301 2 for %edx (gcc regno = 1)
302 3 for %ebx (gcc regno = 3)
303 4 for %esp (gcc regno = 7)
304 5 for %ebp (gcc regno = 6)
305 6 for %esi (gcc regno = 4)
306 7 for %edi (gcc regno = 5)
307 The following three DWARF register numbers are never generated by
308 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
309 believed these numbers have these meanings.
310 8 for %eip (no gcc equivalent)
311 9 for %eflags (gcc regno = 17)
312 10 for %trapno (no gcc equivalent)
313 It is not at all clear how we should number the FP stack registers
314 for the x86 architecture. If the version of SDB on x86/svr4 were
315 a bit less brain dead with respect to floating-point then we would
316 have a precedent to follow with respect to DWARF register numbers
317 for x86 FP registers, but the SDB on x86/svr4 was so completely
318 broken with respect to FP registers that it is hardly worth thinking
319 of it as something to strive for compatibility with.
320 The version of x86/svr4 SDB I had does (partially)
321 seem to believe that DWARF register number 11 is associated with
322 the x86 register %st(0), but that's about all. Higher DWARF
323 register numbers don't seem to be associated with anything in
324 particular, and even for DWARF regno 11, SDB only seemed to under-
325 stand that it should say that a variable lives in %st(0) (when
326 asked via an `=' command) if we said it was in DWARF regno 11,
327 but SDB still printed garbage when asked for the value of the
328 variable in question (via a `/' command).
329 (Also note that the labels SDB printed for various FP stack regs
330 when doing an `x' command were all wrong.)
331 Note that these problems generally don't affect the native SVR4
332 C compiler because it doesn't allow the use of -O with -g and
333 because when it is *not* optimizing, it allocates a memory
334 location for each floating-point variable, and the memory
335 location is what gets described in the DWARF AT_location
336 attribute for the variable in question.
337 Regardless of the severe mental illness of the x86/svr4 SDB, we
338 do something sensible here and we use the following DWARF
339 register numbers. Note that these are all stack-top-relative
340 numbers.
341 11 for %st(0) (gcc regno = 8)
342 12 for %st(1) (gcc regno = 9)
343 13 for %st(2) (gcc regno = 10)
344 14 for %st(3) (gcc regno = 11)
345 15 for %st(4) (gcc regno = 12)
346 16 for %st(5) (gcc regno = 13)
347 17 for %st(6) (gcc regno = 14)
348 18 for %st(7) (gcc regno = 15)
350 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
352 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
353 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
354 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
355 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
356 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
357 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
358 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
359 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
360 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
361 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
362 101, 102, 103, 104, /* bound registers */
365 /* Define parameter passing and return registers. */
367 static int const x86_64_int_parameter_registers[6] =
369 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
372 static int const x86_64_ms_abi_int_parameter_registers[4] =
374 CX_REG, DX_REG, R8_REG, R9_REG
377 static int const x86_64_int_return_registers[4] =
379 AX_REG, DX_REG, DI_REG, SI_REG
382 /* Additional registers that are clobbered by SYSV calls. */
384 #define NUM_X86_64_MS_CLOBBERED_REGS 12
385 static int const x86_64_ms_sysv_extra_clobbered_registers
386 [NUM_X86_64_MS_CLOBBERED_REGS] =
388 SI_REG, DI_REG,
389 XMM6_REG, XMM7_REG,
390 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
391 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
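/* Illustration (not part of the original sources): these are exactly the
   registers that the Microsoft x64 ABI treats as call-saved but the
   System V ABI treats as call-clobbered.  When an ms_abi function such as

     __attribute__((ms_abi)) void caller (void) { sysv_callee (); }

   calls a native System V function, rsi, rdi and xmm6-xmm15 may be
   clobbered by the callee and must be preserved around the call; the
   out-of-line save/restore stubs described below handle at least this
   set. */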
394 enum xlogue_stub {
395 XLOGUE_STUB_SAVE,
396 XLOGUE_STUB_RESTORE,
397 XLOGUE_STUB_RESTORE_TAIL,
398 XLOGUE_STUB_SAVE_HFP,
399 XLOGUE_STUB_RESTORE_HFP,
400 XLOGUE_STUB_RESTORE_HFP_TAIL,
402 XLOGUE_STUB_COUNT
405 enum xlogue_stub_sets {
406 XLOGUE_SET_ALIGNED,
407 XLOGUE_SET_ALIGNED_PLUS_8,
408 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
409 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
411 XLOGUE_SET_COUNT
414 /* Register save/restore layout used by out-of-line stubs. */
415 class xlogue_layout {
416 public:
417 struct reginfo
419 unsigned regno;
420 HOST_WIDE_INT offset; /* Offset from the stub's base pointer (rax or
421 rsi) to where each register is stored. */
424 unsigned get_nregs () const {return m_nregs;}
425 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
427 const reginfo &get_reginfo (unsigned reg) const
429 gcc_assert (reg < m_nregs);
430 return m_regs[reg];
433 static const char *get_stub_name (enum xlogue_stub stub,
434 unsigned n_extra_args);
436 /* Returns an rtx for the stub's symbol based upon
437 1.) the specified stub (save, restore or restore_ret) and
438 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
439 3.) whether or not stack alignment is being performed. */
440 static rtx get_stub_rtx (enum xlogue_stub stub);
442 /* Returns the amount of stack space (including padding) that the stub
443 needs to store registers based upon data in the machine_function. */
444 HOST_WIDE_INT get_stack_space_used () const
446 const struct machine_function *m = cfun->machine;
447 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
449 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
450 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
453 /* Returns the offset for the base pointer used by the stub. */
454 HOST_WIDE_INT get_stub_ptr_offset () const
456 return STUB_INDEX_OFFSET + m_stack_align_off_in;
459 static const struct xlogue_layout &get_instance ();
460 static unsigned count_stub_managed_regs ();
461 static bool is_stub_managed_reg (unsigned regno, unsigned count);
463 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
464 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
465 static const unsigned MAX_REGS = 18;
466 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
467 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
468 static const unsigned STUB_NAME_MAX_LEN = 20;
469 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
470 static const unsigned REG_ORDER[MAX_REGS];
471 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
473 private:
474 xlogue_layout ();
475 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
476 xlogue_layout (const xlogue_layout &);
478 /* True if hard frame pointer is used. */
479 bool m_hfp;
481 /* Max number of registers this layout manages. */
482 unsigned m_nregs;
484 /* Incoming offset from 16-byte alignment. */
485 HOST_WIDE_INT m_stack_align_off_in;
487 /* Register order and offsets. */
488 struct reginfo m_regs[MAX_REGS];
490 /* Lazy-inited cache of symbol names for stubs. */
491 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
492 [STUB_NAME_MAX_LEN];
494 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
497 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
498 "savms64",
499 "resms64",
500 "resms64x",
501 "savms64f",
502 "resms64f",
503 "resms64fx"
506 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
507 /* The below offset values are where each register is stored for the layout
508 relative to incoming stack pointer. The value of each m_regs[].offset will
509 be relative to the incoming base pointer (rax or rsi) used by the stub.
511 s_instances: 0 1 2 3
512 Offset: realigned or aligned + 8
513 Register aligned aligned + 8 aligned w/HFP w/HFP */
514 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
515 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
516 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
517 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
518 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
519 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
520 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
521 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
522 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
523 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
524 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
525 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
526 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
527 BP_REG, /* 0xc0 0xc8 N/A N/A */
528 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
529 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
530 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
531 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
534 /* Instantiate static const values. */
535 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
536 const unsigned xlogue_layout::MIN_REGS;
537 const unsigned xlogue_layout::MAX_REGS;
538 const unsigned xlogue_layout::MAX_EXTRA_REGS;
539 const unsigned xlogue_layout::VARIANT_COUNT;
540 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
542 /* Initialize xlogue_layout::s_stub_names to zero. */
543 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
544 [STUB_NAME_MAX_LEN];
546 /* Instantiates all xlogue_layout instances. */
547 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
548 xlogue_layout (0, false),
549 xlogue_layout (8, false),
550 xlogue_layout (0, true),
551 xlogue_layout (8, true)
554 /* Return an appropriate const instance of xlogue_layout based upon values
555 in cfun->machine and crtl. */
556 const struct xlogue_layout &
557 xlogue_layout::get_instance ()
559 enum xlogue_stub_sets stub_set;
560 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
562 if (stack_realign_fp)
563 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
564 else if (frame_pointer_needed)
565 stub_set = aligned_plus_8
566 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
567 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
568 else
569 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
571 return s_instances[stub_set];
574 /* Determine how many clobbered registers can be saved by the stub.
575 Returns the count of registers the stub will save and restore. */
576 unsigned
577 xlogue_layout::count_stub_managed_regs ()
579 bool hfp = frame_pointer_needed || stack_realign_fp;
580 unsigned i, count;
581 unsigned regno;
583 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
585 regno = REG_ORDER[i];
586 if (regno == BP_REG && hfp)
587 continue;
588 if (!ix86_save_reg (regno, false, false))
589 break;
590 ++count;
592 return count;
595 /* Determine if register REGNO is a stub managed register given the
596 total COUNT of stub managed registers. */
597 bool
598 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
600 bool hfp = frame_pointer_needed || stack_realign_fp;
601 unsigned i;
603 for (i = 0; i < count; ++i)
605 gcc_assert (i < MAX_REGS);
606 if (REG_ORDER[i] == BP_REG && hfp)
607 ++count;
608 else if (REG_ORDER[i] == regno)
609 return true;
611 return false;
614 /* Constructor for xlogue_layout. */
615 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
616 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
617 m_stack_align_off_in (stack_align_off_in)
619 HOST_WIDE_INT offset = stack_align_off_in;
620 unsigned i, j;
622 for (i = j = 0; i < MAX_REGS; ++i)
624 unsigned regno = REG_ORDER[i];
626 if (regno == BP_REG && hfp)
627 continue;
628 if (SSE_REGNO_P (regno))
630 offset += 16;
631 /* Verify that SSE regs are always aligned. */
632 gcc_assert (!((stack_align_off_in + offset) & 15));
634 else
635 offset += 8;
637 m_regs[j].regno = regno;
638 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
640 gcc_assert (j == m_nregs);
643 const char *
644 xlogue_layout::get_stub_name (enum xlogue_stub stub,
645 unsigned n_extra_regs)
647 const int have_avx = TARGET_AVX;
648 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
650 /* Lazy init */
651 if (!*name)
653 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
654 (have_avx ? "avx" : "sse"),
655 STUB_BASE_NAMES[stub],
656 MIN_REGS + n_extra_regs);
657 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
660 return name;
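/* Example of the names produced by the format string above:
   XLOGUE_STUB_SAVE with no extra registers on a non-AVX target yields
   "__sse_savms64_12", while XLOGUE_STUB_RESTORE_TAIL with all six extra
   registers on an AVX target yields "__avx_resms64x_18". */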
663 /* Return rtx of a symbol ref for the entry point (based upon
664 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
665 rtx
666 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
668 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
669 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
670 gcc_assert (stub < XLOGUE_STUB_COUNT);
671 gcc_assert (crtl->stack_realign_finalized);
673 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
676 /* Define the structure for the machine field in struct function. */
678 struct GTY(()) stack_local_entry {
679 unsigned short mode;
680 unsigned short n;
681 rtx rtl;
682 struct stack_local_entry *next;
685 /* Which cpu are we scheduling for. */
686 enum attr_cpu ix86_schedule;
688 /* Which cpu are we optimizing for. */
689 enum processor_type ix86_tune;
691 /* Which instruction set architecture to use. */
692 enum processor_type ix86_arch;
694 /* True if processor has SSE prefetch instruction. */
695 unsigned char x86_prefetch_sse;
697 /* -mstackrealign option */
698 static const char ix86_force_align_arg_pointer_string[]
699 = "force_align_arg_pointer";
701 static rtx (*ix86_gen_leave) (void);
702 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
703 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
704 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
705 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
706 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
707 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
708 static rtx (*ix86_gen_clzero) (rtx);
709 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
711 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
713 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
714 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
716 /* Preferred alignment for stack boundary in bits. */
717 unsigned int ix86_preferred_stack_boundary;
719 /* Alignment for incoming stack boundary in bits specified at
720 command line. */
721 static unsigned int ix86_user_incoming_stack_boundary;
723 /* Default alignment for incoming stack boundary in bits. */
724 static unsigned int ix86_default_incoming_stack_boundary;
726 /* Alignment for incoming stack boundary in bits. */
727 unsigned int ix86_incoming_stack_boundary;
729 /* Calling abi specific va_list type nodes. */
730 static GTY(()) tree sysv_va_list_type_node;
731 static GTY(()) tree ms_va_list_type_node;
733 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
734 char internal_label_prefix[16];
735 int internal_label_prefix_len;
737 /* Fence to use after loop using movnt. */
738 tree x86_mfence;
740 /* Register class used for passing a given 64bit part of the argument.
741 These represent classes as documented by the PS ABI, with the exception
742 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
743 use SF or DFmode move instead of DImode to avoid reformatting penalties.
745 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
746 whenever possible (upper half does contain padding). */
747 enum x86_64_reg_class
749 X86_64_NO_CLASS,
750 X86_64_INTEGER_CLASS,
751 X86_64_INTEGERSI_CLASS,
752 X86_64_SSE_CLASS,
753 X86_64_SSESF_CLASS,
754 X86_64_SSEDF_CLASS,
755 X86_64_SSEUP_CLASS,
756 X86_64_X87_CLASS,
757 X86_64_X87UP_CLASS,
758 X86_64_COMPLEX_X87_CLASS,
759 X86_64_MEMORY_CLASS
762 #define MAX_CLASSES 8
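/* A few illustrative classifications (not exhaustive): a DImode integer
   argument is X86_64_INTEGER_CLASS and an SImode one is
   X86_64_INTEGERSI_CLASS (allowing the cheaper 32-bit move), a DFmode
   argument is X86_64_SSEDF_CLASS and an SFmode one is X86_64_SSESF_CLASS,
   while an argument that cannot be passed in registers is
   X86_64_MEMORY_CLASS and is passed on the stack. */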
764 /* Table of constants used by fldpi, fldln2, etc.... */
765 static REAL_VALUE_TYPE ext_80387_constants_table [5];
766 static bool ext_80387_constants_init;
769 static struct machine_function * ix86_init_machine_status (void);
770 static rtx ix86_function_value (const_tree, const_tree, bool);
771 static bool ix86_function_value_regno_p (const unsigned int);
772 static unsigned int ix86_function_arg_boundary (machine_mode,
773 const_tree);
774 static rtx ix86_static_chain (const_tree, bool);
775 static int ix86_function_regparm (const_tree, const_tree);
776 static void ix86_compute_frame_layout (void);
777 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
778 rtx, rtx, int);
779 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
780 static tree ix86_canonical_va_list_type (tree);
781 static void predict_jump (int);
782 static unsigned int split_stack_prologue_scratch_regno (void);
783 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
785 enum ix86_function_specific_strings
787 IX86_FUNCTION_SPECIFIC_ARCH,
788 IX86_FUNCTION_SPECIFIC_TUNE,
789 IX86_FUNCTION_SPECIFIC_MAX
792 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
793 const char *, const char *, enum fpmath_unit,
794 bool);
795 static void ix86_function_specific_save (struct cl_target_option *,
796 struct gcc_options *opts);
797 static void ix86_function_specific_restore (struct gcc_options *opts,
798 struct cl_target_option *);
799 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
800 static void ix86_function_specific_print (FILE *, int,
801 struct cl_target_option *);
802 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
803 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
804 struct gcc_options *,
805 struct gcc_options *,
806 struct gcc_options *);
807 static bool ix86_can_inline_p (tree, tree);
808 static void ix86_set_current_function (tree);
809 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
811 static enum calling_abi ix86_function_abi (const_tree);
814 #ifndef SUBTARGET32_DEFAULT_CPU
815 #define SUBTARGET32_DEFAULT_CPU "i386"
816 #endif
818 /* Whether -mtune= or -march= were specified */
819 static int ix86_tune_defaulted;
820 static int ix86_arch_specified;
822 /* Vectorization library interface and handlers. */
823 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
825 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
826 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
828 /* Processor target table, indexed by processor number */
829 struct ptt
831 const char *const name; /* processor name */
832 const struct processor_costs *cost; /* Processor costs */
833 const int align_loop; /* Default alignments. */
834 const int align_loop_max_skip;
835 const int align_jump;
836 const int align_jump_max_skip;
837 const int align_func;
840 /* This table must be in sync with enum processor_type in i386.h. */
841 static const struct ptt processor_target_table[PROCESSOR_max] =
843 {"generic", &generic_cost, 16, 10, 16, 10, 16},
844 {"i386", &i386_cost, 4, 3, 4, 3, 4},
845 {"i486", &i486_cost, 16, 15, 16, 15, 16},
846 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
847 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
848 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
849 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
850 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
851 {"core2", &core_cost, 16, 10, 16, 10, 16},
852 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
853 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
854 {"haswell", &core_cost, 16, 10, 16, 10, 16},
855 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
856 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
857 {"knl", &slm_cost, 16, 15, 16, 7, 16},
858 {"knm", &slm_cost, 16, 15, 16, 7, 16},
859 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
860 {"cannonlake", &core_cost, 16, 10, 16, 10, 16},
861 {"intel", &intel_cost, 16, 15, 16, 7, 16},
862 {"geode", &geode_cost, 0, 0, 0, 0, 0},
863 {"k6", &k6_cost, 32, 7, 32, 7, 32},
864 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
865 {"k8", &k8_cost, 16, 7, 16, 7, 16},
866 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
867 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
868 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
869 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
870 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
871 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
872 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
873 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
876 static unsigned int
877 rest_of_handle_insert_vzeroupper (void)
879 int i;
881 /* vzeroupper instructions are inserted immediately after reload to
882 account for possible spills from 256bit or 512bit registers. The pass
883 reuses the mode switching infrastructure by re-running the mode insertion
884 pass, so disable entities that have already been processed. */
885 for (i = 0; i < MAX_386_ENTITIES; i++)
886 ix86_optimize_mode_switching[i] = 0;
888 ix86_optimize_mode_switching[AVX_U128] = 1;
890 /* Call optimize_mode_switching. */
891 g->get_passes ()->execute_pass_mode_switching ();
892 return 0;
895 /* Return 1 if INSN uses or defines a hard register.
896 Hard register uses in a memory address are ignored.
897 Clobbers and flags definitions are ignored. */
899 static bool
900 has_non_address_hard_reg (rtx_insn *insn)
902 df_ref ref;
903 FOR_EACH_INSN_DEF (ref, insn)
904 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
905 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
906 && DF_REF_REGNO (ref) != FLAGS_REG)
907 return true;
909 FOR_EACH_INSN_USE (ref, insn)
910 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
911 return true;
913 return false;
916 /* Check if comparison INSN may be transformed
917 into a vector comparison. Currently we only transform
918 zero checks that look like:
920 (set (reg:CCZ 17 flags)
921 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
922 (subreg:SI (reg:DI x) 0))
923 (const_int 0 [0]))) */
925 static bool
926 convertible_comparison_p (rtx_insn *insn)
928 if (!TARGET_SSE4_1)
929 return false;
931 rtx def_set = single_set (insn);
933 gcc_assert (def_set);
935 rtx src = SET_SRC (def_set);
936 rtx dst = SET_DEST (def_set);
938 gcc_assert (GET_CODE (src) == COMPARE);
940 if (GET_CODE (dst) != REG
941 || REGNO (dst) != FLAGS_REG
942 || GET_MODE (dst) != CCZmode)
943 return false;
945 rtx op1 = XEXP (src, 0);
946 rtx op2 = XEXP (src, 1);
948 if (op2 != CONST0_RTX (GET_MODE (op2)))
949 return false;
951 if (GET_CODE (op1) != IOR)
952 return false;
954 op2 = XEXP (op1, 1);
955 op1 = XEXP (op1, 0);
957 if (!SUBREG_P (op1)
958 || !SUBREG_P (op2)
959 || GET_MODE (op1) != SImode
960 || GET_MODE (op2) != SImode
961 || ((SUBREG_BYTE (op1) != 0
962 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
963 && (SUBREG_BYTE (op2) != 0
964 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
965 return false;
967 op1 = SUBREG_REG (op1);
968 op2 = SUBREG_REG (op2);
970 if (op1 != op2
971 || !REG_P (op1)
972 || GET_MODE (op1) != DImode)
973 return false;
975 return true;
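/* Illustrative source-level input (an assumption about typical code, not
   taken from the original comments): on -m32, a test such as

     unsigned long long x;
     if (x == 0) ...

   is expanded as an IOR of the two SImode halves of x compared against
   zero, which is exactly the CCZmode pattern accepted above. */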
978 /* The DImode version of scalar_to_vector_candidate_p. */
980 static bool
981 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
983 rtx def_set = single_set (insn);
985 if (!def_set)
986 return false;
988 if (has_non_address_hard_reg (insn))
989 return false;
991 rtx src = SET_SRC (def_set);
992 rtx dst = SET_DEST (def_set);
994 if (GET_CODE (src) == COMPARE)
995 return convertible_comparison_p (insn);
997 /* We are interested in DImode promotion only. */
998 if ((GET_MODE (src) != DImode
999 && !CONST_INT_P (src))
1000 || GET_MODE (dst) != DImode)
1001 return false;
1003 if (!REG_P (dst) && !MEM_P (dst))
1004 return false;
1006 switch (GET_CODE (src))
1008 case ASHIFTRT:
1009 if (!TARGET_AVX512VL)
1010 return false;
1011 /* FALLTHRU */
1013 case ASHIFT:
1014 case LSHIFTRT:
1015 if (!REG_P (XEXP (src, 1))
1016 && (!SUBREG_P (XEXP (src, 1))
1017 || SUBREG_BYTE (XEXP (src, 1)) != 0
1018 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1019 && (!CONST_INT_P (XEXP (src, 1))
1020 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1021 return false;
1023 if (GET_MODE (XEXP (src, 1)) != QImode
1024 && !CONST_INT_P (XEXP (src, 1)))
1025 return false;
1026 break;
1028 case PLUS:
1029 case MINUS:
1030 case IOR:
1031 case XOR:
1032 case AND:
1033 if (!REG_P (XEXP (src, 1))
1034 && !MEM_P (XEXP (src, 1))
1035 && !CONST_INT_P (XEXP (src, 1)))
1036 return false;
1038 if (GET_MODE (XEXP (src, 1)) != DImode
1039 && !CONST_INT_P (XEXP (src, 1)))
1040 return false;
1041 break;
1043 case NEG:
1044 case NOT:
1045 break;
1047 case REG:
1048 return true;
1050 case MEM:
1051 case CONST_INT:
1052 return REG_P (dst);
1054 default:
1055 return false;
1058 if (!REG_P (XEXP (src, 0))
1059 && !MEM_P (XEXP (src, 0))
1060 && !CONST_INT_P (XEXP (src, 0))
1061 /* Check for andnot case. */
1062 && (GET_CODE (src) != AND
1063 || GET_CODE (XEXP (src, 0)) != NOT
1064 || !REG_P (XEXP (XEXP (src, 0), 0))))
1065 return false;
1067 if (GET_MODE (XEXP (src, 0)) != DImode
1068 && !CONST_INT_P (XEXP (src, 0)))
1069 return false;
1071 return true;
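/* Illustrative DImode candidate on -m32 (assumed typical input, not from
   the original sources):

     unsigned long long f (unsigned long long a, unsigned long long b)
     { return a | b; }

   Without conversion the DImode IOR is split into two SImode ORs plus the
   accompanying moves; when the chain gain is positive, the STV pass
   instead performs it as a single V2DImode OR in an SSE register. */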
1074 /* The TImode version of scalar_to_vector_candidate_p. */
1076 static bool
1077 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1079 rtx def_set = single_set (insn);
1081 if (!def_set)
1082 return false;
1084 if (has_non_address_hard_reg (insn))
1085 return false;
1087 rtx src = SET_SRC (def_set);
1088 rtx dst = SET_DEST (def_set);
1090 /* Only TImode loads and stores are allowed. */
1091 if (GET_MODE (dst) != TImode)
1092 return false;
1094 if (MEM_P (dst))
1096 /* Check for a store. Memory must be aligned, or an unaligned store
1097 must be optimal. Only support stores from a register, a standard SSE
1098 constant or a CONST_WIDE_INT generated from a piecewise store.
1100 ??? Verify performance impact before enabling CONST_INT for
1101 __int128 store. */
1102 if (misaligned_operand (dst, TImode)
1103 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1104 return false;
1106 switch (GET_CODE (src))
1108 default:
1109 return false;
1111 case REG:
1112 case CONST_WIDE_INT:
1113 return true;
1115 case CONST_INT:
1116 return standard_sse_constant_p (src, TImode);
1119 else if (MEM_P (src))
1121 /* Check for a load. Memory must be aligned, or an unaligned load
1122 must be optimal. */
1123 return (REG_P (dst)
1124 && (!misaligned_operand (src, TImode)
1125 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1128 return false;
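/* Illustrative TImode candidates on -m64 (assumed typical input, not from
   the original sources):

     void copy (__int128 *dst, const __int128 *src) { *dst = *src; }

   The TImode load and store both qualify (subject to the alignment checks
   above) and can each become a single 16-byte SSE move once the chain is
   converted to V1TImode. */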
1131 /* Return 1 if INSN may be converted into a vector
1132 instruction. */
1134 static bool
1135 scalar_to_vector_candidate_p (rtx_insn *insn)
1137 if (TARGET_64BIT)
1138 return timode_scalar_to_vector_candidate_p (insn);
1139 else
1140 return dimode_scalar_to_vector_candidate_p (insn);
1143 /* The DImode version of remove_non_convertible_regs. */
1145 static void
1146 dimode_remove_non_convertible_regs (bitmap candidates)
1148 bitmap_iterator bi;
1149 unsigned id;
1150 bitmap regs = BITMAP_ALLOC (NULL);
1152 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1154 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1155 rtx reg = SET_DEST (def_set);
1157 if (!REG_P (reg)
1158 || bitmap_bit_p (regs, REGNO (reg))
1159 || HARD_REGISTER_P (reg))
1160 continue;
1162 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1163 def;
1164 def = DF_REF_NEXT_REG (def))
1166 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1168 if (dump_file)
1169 fprintf (dump_file,
1170 "r%d has non convertible definition in insn %d\n",
1171 REGNO (reg), DF_REF_INSN_UID (def));
1173 bitmap_set_bit (regs, REGNO (reg));
1174 break;
1179 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1181 for (df_ref def = DF_REG_DEF_CHAIN (id);
1182 def;
1183 def = DF_REF_NEXT_REG (def))
1184 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1186 if (dump_file)
1187 fprintf (dump_file, "Removing insn %d from candidates list\n",
1188 DF_REF_INSN_UID (def));
1190 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1194 BITMAP_FREE (regs);
1197 /* For a register REGNO, scan instructions for its defs and uses.
1198 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1200 static void
1201 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1202 unsigned int regno)
1204 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1205 def;
1206 def = DF_REF_NEXT_REG (def))
1208 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1210 if (dump_file)
1211 fprintf (dump_file,
1212 "r%d has non convertible def in insn %d\n",
1213 regno, DF_REF_INSN_UID (def));
1215 bitmap_set_bit (regs, regno);
1216 break;
1220 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1221 ref;
1222 ref = DF_REF_NEXT_REG (ref))
1224 /* Debug instructions are skipped. */
1225 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1226 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1228 if (dump_file)
1229 fprintf (dump_file,
1230 "r%d has non convertible use in insn %d\n",
1231 regno, DF_REF_INSN_UID (ref));
1233 bitmap_set_bit (regs, regno);
1234 break;
1239 /* The TImode version of remove_non_convertible_regs. */
1241 static void
1242 timode_remove_non_convertible_regs (bitmap candidates)
1244 bitmap_iterator bi;
1245 unsigned id;
1246 bitmap regs = BITMAP_ALLOC (NULL);
1248 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1250 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1251 rtx dest = SET_DEST (def_set);
1252 rtx src = SET_SRC (def_set);
1254 if ((!REG_P (dest)
1255 || bitmap_bit_p (regs, REGNO (dest))
1256 || HARD_REGISTER_P (dest))
1257 && (!REG_P (src)
1258 || bitmap_bit_p (regs, REGNO (src))
1259 || HARD_REGISTER_P (src)))
1260 continue;
1262 if (REG_P (dest))
1263 timode_check_non_convertible_regs (candidates, regs,
1264 REGNO (dest));
1266 if (REG_P (src))
1267 timode_check_non_convertible_regs (candidates, regs,
1268 REGNO (src));
1271 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1273 for (df_ref def = DF_REG_DEF_CHAIN (id);
1274 def;
1275 def = DF_REF_NEXT_REG (def))
1276 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1278 if (dump_file)
1279 fprintf (dump_file, "Removing insn %d from candidates list\n",
1280 DF_REF_INSN_UID (def));
1282 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1285 for (df_ref ref = DF_REG_USE_CHAIN (id);
1286 ref;
1287 ref = DF_REF_NEXT_REG (ref))
1288 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1290 if (dump_file)
1291 fprintf (dump_file, "Removing insn %d from candidates list\n",
1292 DF_REF_INSN_UID (ref));
1294 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1298 BITMAP_FREE (regs);
1301 /* For a given bitmap of insn UIDs scan all instructions and
1302 remove an insn from CANDIDATES in case it has both convertible
1303 and non-convertible definitions.
1305 All insns in a bitmap are conversion candidates according to
1306 scalar_to_vector_candidate_p. Currently it implies all insns
1307 are single_set. */
1309 static void
1310 remove_non_convertible_regs (bitmap candidates)
1312 if (TARGET_64BIT)
1313 timode_remove_non_convertible_regs (candidates);
1314 else
1315 dimode_remove_non_convertible_regs (candidates);
1318 class scalar_chain
1320 public:
1321 scalar_chain ();
1322 virtual ~scalar_chain ();
1324 static unsigned max_id;
1326 /* ID of a chain. */
1327 unsigned int chain_id;
1328 /* A queue of instructions to be included into a chain. */
1329 bitmap queue;
1330 /* Instructions included into a chain. */
1331 bitmap insns;
1332 /* All registers defined by a chain. */
1333 bitmap defs;
1334 /* Registers used in both vector and scalar modes. */
1335 bitmap defs_conv;
1337 void build (bitmap candidates, unsigned insn_uid);
1338 virtual int compute_convert_gain () = 0;
1339 int convert ();
1341 protected:
1342 void add_to_queue (unsigned insn_uid);
1343 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1345 private:
1346 void add_insn (bitmap candidates, unsigned insn_uid);
1347 void analyze_register_chain (bitmap candidates, df_ref ref);
1348 virtual void mark_dual_mode_def (df_ref def) = 0;
1349 virtual void convert_insn (rtx_insn *insn) = 0;
1350 virtual void convert_registers () = 0;
1353 class dimode_scalar_chain : public scalar_chain
1355 public:
1356 int compute_convert_gain ();
1357 private:
1358 void mark_dual_mode_def (df_ref def);
1359 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1360 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1361 void convert_insn (rtx_insn *insn);
1362 void convert_op (rtx *op, rtx_insn *insn);
1363 void convert_reg (unsigned regno);
1364 void make_vector_copies (unsigned regno);
1365 void convert_registers ();
1366 int vector_const_cost (rtx exp);
1369 class timode_scalar_chain : public scalar_chain
1371 public:
1372 /* Converting from TImode to V1TImode is always faster. */
1373 int compute_convert_gain () { return 1; }
1375 private:
1376 void mark_dual_mode_def (df_ref def);
1377 void fix_debug_reg_uses (rtx reg);
1378 void convert_insn (rtx_insn *insn);
1379 /* We don't convert registers to a different size. */
1380 void convert_registers () {}
1383 unsigned scalar_chain::max_id = 0;
1385 /* Initialize new chain. */
1387 scalar_chain::scalar_chain ()
1389 chain_id = ++max_id;
1391 if (dump_file)
1392 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1394 bitmap_obstack_initialize (NULL);
1395 insns = BITMAP_ALLOC (NULL);
1396 defs = BITMAP_ALLOC (NULL);
1397 defs_conv = BITMAP_ALLOC (NULL);
1398 queue = NULL;
1401 /* Free chain's data. */
1403 scalar_chain::~scalar_chain ()
1405 BITMAP_FREE (insns);
1406 BITMAP_FREE (defs);
1407 BITMAP_FREE (defs_conv);
1408 bitmap_obstack_release (NULL);
1411 /* Add instruction into the chain's queue. */
1413 void
1414 scalar_chain::add_to_queue (unsigned insn_uid)
1416 if (bitmap_bit_p (insns, insn_uid)
1417 || bitmap_bit_p (queue, insn_uid))
1418 return;
1420 if (dump_file)
1421 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1422 insn_uid, chain_id);
1423 bitmap_set_bit (queue, insn_uid);
1426 /* For DImode conversion, mark register defined by DEF as requiring
1427 conversion. */
1429 void
1430 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1432 gcc_assert (DF_REF_REG_DEF_P (def));
1434 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1435 return;
1437 if (dump_file)
1438 fprintf (dump_file,
1439 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1440 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1442 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1445 /* For TImode conversion, it is unused. */
1447 void
1448 timode_scalar_chain::mark_dual_mode_def (df_ref)
1450 gcc_unreachable ();
1453 /* Check REF's chain to add new insns into a queue
1454 and find registers requiring conversion. */
1456 void
1457 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1459 df_link *chain;
1461 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1462 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1463 add_to_queue (DF_REF_INSN_UID (ref));
1465 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1467 unsigned uid = DF_REF_INSN_UID (chain->ref);
1469 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1470 continue;
1472 if (!DF_REF_REG_MEM_P (chain->ref))
1474 if (bitmap_bit_p (insns, uid))
1475 continue;
1477 if (bitmap_bit_p (candidates, uid))
1479 add_to_queue (uid);
1480 continue;
1484 if (DF_REF_REG_DEF_P (chain->ref))
1486 if (dump_file)
1487 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1488 DF_REF_REGNO (chain->ref), uid);
1489 mark_dual_mode_def (chain->ref);
1491 else
1493 if (dump_file)
1494 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1495 DF_REF_REGNO (chain->ref), uid);
1496 mark_dual_mode_def (ref);
1501 /* Add instruction into a chain. */
1503 void
1504 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1506 if (bitmap_bit_p (insns, insn_uid))
1507 return;
1509 if (dump_file)
1510 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1512 bitmap_set_bit (insns, insn_uid);
1514 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1515 rtx def_set = single_set (insn);
1516 if (def_set && REG_P (SET_DEST (def_set))
1517 && !HARD_REGISTER_P (SET_DEST (def_set)))
1518 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1520 df_ref ref;
1521 df_ref def;
1522 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1523 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1524 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1525 def;
1526 def = DF_REF_NEXT_REG (def))
1527 analyze_register_chain (candidates, def);
1528 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1529 if (!DF_REF_REG_MEM_P (ref))
1530 analyze_register_chain (candidates, ref);
1533 /* Build a new chain starting from insn INSN_UID, recursively
1534 adding all dependent uses and definitions. */
1536 void
1537 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1539 queue = BITMAP_ALLOC (NULL);
1540 bitmap_set_bit (queue, insn_uid);
1542 if (dump_file)
1543 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1545 while (!bitmap_empty_p (queue))
1547 insn_uid = bitmap_first_set_bit (queue);
1548 bitmap_clear_bit (queue, insn_uid);
1549 bitmap_clear_bit (candidates, insn_uid);
1550 add_insn (candidates, insn_uid);
1553 if (dump_file)
1555 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1556 fprintf (dump_file, " insns: ");
1557 dump_bitmap (dump_file, insns);
1558 if (!bitmap_empty_p (defs_conv))
1560 bitmap_iterator bi;
1561 unsigned id;
1562 const char *comma = "";
1563 fprintf (dump_file, " defs to convert: ");
1564 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1566 fprintf (dump_file, "%sr%d", comma, id);
1567 comma = ", ";
1569 fprintf (dump_file, "\n");
1573 BITMAP_FREE (queue);
1576 /* Return the cost of building a vector constant
1577 instead of using a scalar one. */
1579 int
1580 dimode_scalar_chain::vector_const_cost (rtx exp)
1582 gcc_assert (CONST_INT_P (exp));
1584 if (standard_sse_constant_p (exp, V2DImode))
1585 return COSTS_N_INSNS (1);
1586 return ix86_cost->sse_load[1];
1589 /* Compute a gain for chain conversion. */
1591 int
1592 dimode_scalar_chain::compute_convert_gain ()
1594 bitmap_iterator bi;
1595 unsigned insn_uid;
1596 int gain = 0;
1597 int cost = 0;
1599 if (dump_file)
1600 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1602 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1604 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1605 rtx def_set = single_set (insn);
1606 rtx src = SET_SRC (def_set);
1607 rtx dst = SET_DEST (def_set);
1609 if (REG_P (src) && REG_P (dst))
1610 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1611 else if (REG_P (src) && MEM_P (dst))
1612 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1613 else if (MEM_P (src) && REG_P (dst))
1614 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1615 else if (GET_CODE (src) == ASHIFT
1616 || GET_CODE (src) == ASHIFTRT
1617 || GET_CODE (src) == LSHIFTRT)
1619 if (CONST_INT_P (XEXP (src, 0)))
1620 gain -= vector_const_cost (XEXP (src, 0));
1621 if (CONST_INT_P (XEXP (src, 1)))
1623 gain += ix86_cost->shift_const;
1624 if (INTVAL (XEXP (src, 1)) >= 32)
1625 gain -= COSTS_N_INSNS (1);
1627 else
1628 /* Additional gain for omitting two CMOVs. */
1629 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1631 else if (GET_CODE (src) == PLUS
1632 || GET_CODE (src) == MINUS
1633 || GET_CODE (src) == IOR
1634 || GET_CODE (src) == XOR
1635 || GET_CODE (src) == AND)
1637 gain += ix86_cost->add;
1638 /* Additional gain for andnot for targets without BMI. */
1639 if (GET_CODE (XEXP (src, 0)) == NOT
1640 && !TARGET_BMI)
1641 gain += 2 * ix86_cost->add;
1643 if (CONST_INT_P (XEXP (src, 0)))
1644 gain -= vector_const_cost (XEXP (src, 0));
1645 if (CONST_INT_P (XEXP (src, 1)))
1646 gain -= vector_const_cost (XEXP (src, 1));
1648 else if (GET_CODE (src) == NEG
1649 || GET_CODE (src) == NOT)
1650 gain += ix86_cost->add - COSTS_N_INSNS (1);
1651 else if (GET_CODE (src) == COMPARE)
1653 /* Assume comparison cost is the same. */
1655 else if (CONST_INT_P (src))
1657 if (REG_P (dst))
1658 gain += COSTS_N_INSNS (2);
1659 else if (MEM_P (dst))
1660 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1661 gain -= vector_const_cost (src);
1663 else
1664 gcc_unreachable ();
1667 if (dump_file)
1668 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1670 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1671 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1673 if (dump_file)
1674 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1676 gain -= cost;
1678 if (dump_file)
1679 fprintf (dump_file, " Total gain: %d\n", gain);
1681 return gain;
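/* Worked example of the bookkeeping above: a register-to-register DImode
   move costs COSTS_N_INSNS (2) as two SImode integer moves but only
   ix86_cost->xmm_move once vectorized, so it contributes
   COSTS_N_INSNS (2) - xmm_move to the gain; every register recorded in
   defs_conv then charges DF_REG_DEF_COUNT * mmxsse_to_integer against
   the total. */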
1684 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1686 rtx
1687 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1689 if (x == reg)
1690 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1692 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1693 int i, j;
1694 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1696 if (fmt[i] == 'e')
1697 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1698 else if (fmt[i] == 'E')
1699 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1700 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1701 reg, new_reg);
1704 return x;
1707 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1709 void
1710 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1711 rtx reg, rtx new_reg)
1713 replace_with_subreg (single_set (insn), reg, new_reg);
1716 /* Insert generated conversion instruction sequence INSNS
1717 after instruction AFTER. A new BB may be required in case
1718 the instruction has an EH region attached. */
1720 void
1721 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1723 if (!control_flow_insn_p (after))
1725 emit_insn_after (insns, after);
1726 return;
1729 basic_block bb = BLOCK_FOR_INSN (after);
1730 edge e = find_fallthru_edge (bb->succs);
1731 gcc_assert (e);
1733 basic_block new_bb = split_edge (e);
1734 emit_insn_after (insns, BB_HEAD (new_bb));
1737 /* Make vector copies for all definitions of register REGNO
1738 and replace its uses in a chain. */
1740 void
1741 dimode_scalar_chain::make_vector_copies (unsigned regno)
1743 rtx reg = regno_reg_rtx[regno];
1744 rtx vreg = gen_reg_rtx (DImode);
1745 bool count_reg = false;
1746 df_ref ref;
1748 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1749 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1751 df_ref use;
1753 /* Detect the count register of a shift instruction. */
1754 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1755 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1757 rtx_insn *insn = DF_REF_INSN (use);
1758 rtx def_set = single_set (insn);
1760 gcc_assert (def_set);
1762 rtx src = SET_SRC (def_set);
1764 if ((GET_CODE (src) == ASHIFT
1765 || GET_CODE (src) == ASHIFTRT
1766 || GET_CODE (src) == LSHIFTRT)
1767 && !CONST_INT_P (XEXP (src, 1))
1768 && reg_or_subregno (XEXP (src, 1)) == regno)
1769 count_reg = true;
1772 start_sequence ();
1773 if (count_reg)
1775 rtx qreg = gen_lowpart (QImode, reg);
1776 rtx tmp = gen_reg_rtx (SImode);
1778 if (TARGET_ZERO_EXTEND_WITH_AND
1779 && optimize_function_for_speed_p (cfun))
1781 emit_move_insn (tmp, const0_rtx);
1782 emit_insn (gen_movstrictqi
1783 (gen_lowpart (QImode, tmp), qreg));
1785 else
1786 emit_insn (gen_rtx_SET
1787 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1789 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1791 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1792 emit_move_insn (slot, tmp);
1793 tmp = copy_rtx (slot);
1796 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1798 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1800 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1801 emit_move_insn (adjust_address (tmp, SImode, 0),
1802 gen_rtx_SUBREG (SImode, reg, 0));
1803 emit_move_insn (adjust_address (tmp, SImode, 4),
1804 gen_rtx_SUBREG (SImode, reg, 4));
1805 emit_move_insn (vreg, tmp);
1807 else if (TARGET_SSE4_1)
1809 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1810 CONST0_RTX (V4SImode),
1811 gen_rtx_SUBREG (SImode, reg, 0)));
1812 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1813 gen_rtx_SUBREG (V4SImode, vreg, 0),
1814 gen_rtx_SUBREG (SImode, reg, 4),
1815 GEN_INT (2)));
1817 else
1819 rtx tmp = gen_reg_rtx (DImode);
1820 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1821 CONST0_RTX (V4SImode),
1822 gen_rtx_SUBREG (SImode, reg, 0)));
1823 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1824 CONST0_RTX (V4SImode),
1825 gen_rtx_SUBREG (SImode, reg, 4)));
1826 emit_insn (gen_vec_interleave_lowv4si
1827 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1828 gen_rtx_SUBREG (V4SImode, vreg, 0),
1829 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1831 rtx_insn *seq = get_insns ();
1832 end_sequence ();
1833 rtx_insn *insn = DF_REF_INSN (ref);
1834 emit_conversion_insns (seq, insn);
1836 if (dump_file)
1837 fprintf (dump_file,
1838 " Copied r%d to a vector register r%d for insn %d\n",
1839 regno, REGNO (vreg), INSN_UID (insn));
1842 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1843 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1845 rtx_insn *insn = DF_REF_INSN (ref);
1846 if (count_reg)
1848 rtx def_set = single_set (insn);
1849 gcc_assert (def_set);
1851 rtx src = SET_SRC (def_set);
1853 if ((GET_CODE (src) == ASHIFT
1854 || GET_CODE (src) == ASHIFTRT
1855 || GET_CODE (src) == LSHIFTRT)
1856 && !CONST_INT_P (XEXP (src, 1))
1857 && reg_or_subregno (XEXP (src, 1)) == regno)
1858 XEXP (src, 1) = vreg;
1860 else
1861 replace_with_subreg_in_insn (insn, reg, vreg);
1863 if (dump_file)
1864 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1865 regno, REGNO (vreg), INSN_UID (insn));
1869 /* Convert all definitions of register REGNO
1870 and fix its uses. Scalar copies may be created
1871 in case the register is used in a non-convertible insn. */
1873 void
1874 dimode_scalar_chain::convert_reg (unsigned regno)
1876 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1877 rtx reg = regno_reg_rtx[regno];
1878 rtx scopy = NULL_RTX;
1879 df_ref ref;
1880 bitmap conv;
1882 conv = BITMAP_ALLOC (NULL);
1883 bitmap_copy (conv, insns);
1885 if (scalar_copy)
1886 scopy = gen_reg_rtx (DImode);
1888 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1890 rtx_insn *insn = DF_REF_INSN (ref);
1891 rtx def_set = single_set (insn);
1892 rtx src = SET_SRC (def_set);
1893 rtx reg = DF_REF_REG (ref);
1895 if (!MEM_P (src))
1897 replace_with_subreg_in_insn (insn, reg, reg);
1898 bitmap_clear_bit (conv, INSN_UID (insn));
1901 if (scalar_copy)
1903 start_sequence ();
1904 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1906 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1907 emit_move_insn (tmp, reg);
1908 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1909 adjust_address (tmp, SImode, 0));
1910 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1911 adjust_address (tmp, SImode, 4));
1913 else if (TARGET_SSE4_1)
1915 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1916 emit_insn
1917 (gen_rtx_SET
1918 (gen_rtx_SUBREG (SImode, scopy, 0),
1919 gen_rtx_VEC_SELECT (SImode,
1920 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1922 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1923 emit_insn
1924 (gen_rtx_SET
1925 (gen_rtx_SUBREG (SImode, scopy, 4),
1926 gen_rtx_VEC_SELECT (SImode,
1927 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1929 else
1931 rtx vcopy = gen_reg_rtx (V2DImode);
1932 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1933 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1934 gen_rtx_SUBREG (SImode, vcopy, 0));
1935 emit_move_insn (vcopy,
1936 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1937 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1938 gen_rtx_SUBREG (SImode, vcopy, 0));
1940 rtx_insn *seq = get_insns ();
1941 end_sequence ();
1942 emit_conversion_insns (seq, insn);
1944 if (dump_file)
1945 fprintf (dump_file,
1946 " Copied r%d to a scalar register r%d for insn %d\n",
1947 regno, REGNO (scopy), INSN_UID (insn));
1951 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1952 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1954 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1956 rtx_insn *insn = DF_REF_INSN (ref);
1958 rtx def_set = single_set (insn);
1959 gcc_assert (def_set);
1961 rtx src = SET_SRC (def_set);
1962 rtx dst = SET_DEST (def_set);
1964 if ((GET_CODE (src) == ASHIFT
1965 || GET_CODE (src) == ASHIFTRT
1966 || GET_CODE (src) == LSHIFTRT)
1967 && !CONST_INT_P (XEXP (src, 1))
1968 && reg_or_subregno (XEXP (src, 1)) == regno)
1970 rtx tmp2 = gen_reg_rtx (V2DImode);
1972 start_sequence ();
1974 if (TARGET_SSE4_1)
1975 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1976 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1977 else
1979 rtx vec_cst
1980 = gen_rtx_CONST_VECTOR (V2DImode,
1981 gen_rtvec (2, GEN_INT (0xff),
1982 const0_rtx));
1983 vec_cst
1984 = validize_mem (force_const_mem (V2DImode, vec_cst));
1986 emit_insn (gen_rtx_SET
1987 (tmp2,
1988 gen_rtx_AND (V2DImode,
1989 gen_rtx_SUBREG (V2DImode, reg, 0),
1990 vec_cst)));
1992 rtx_insn *seq = get_insns ();
1993 end_sequence ();
1995 emit_insn_before (seq, insn);
1997 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
1999 else if (!MEM_P (dst) || !REG_P (src))
2000 replace_with_subreg_in_insn (insn, reg, reg);
2002 bitmap_clear_bit (conv, INSN_UID (insn));
2005 /* Skip debug insns and uninitialized uses. */
2006 else if (DF_REF_CHAIN (ref)
2007 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2009 gcc_assert (scopy);
2010 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2011 df_insn_rescan (DF_REF_INSN (ref));
2014 BITMAP_FREE (conv);
2017 /* Convert operand OP in INSN. We should handle
2018 memory operands and uninitialized registers.
2019    All other register uses are converted during
2020    register conversion.  */
2022 void
2023 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2025 *op = copy_rtx_if_shared (*op);
2027 if (GET_CODE (*op) == NOT)
2029 convert_op (&XEXP (*op, 0), insn);
2030 PUT_MODE (*op, V2DImode);
2032 else if (MEM_P (*op))
2034 rtx tmp = gen_reg_rtx (DImode);
2036 emit_insn_before (gen_move_insn (tmp, *op), insn);
2037 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2039 if (dump_file)
2040 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2041 INSN_UID (insn), REGNO (tmp));
2043 else if (REG_P (*op))
2045 /* We may not have converted this register use if the
2046    register has no definition.  Otherwise it should have
2047    been converted in convert_reg.  */
2048 df_ref ref;
2049 FOR_EACH_INSN_USE (ref, insn)
2050 if (DF_REF_REGNO (ref) == REGNO (*op))
2052 gcc_assert (!DF_REF_CHAIN (ref));
2053 break;
2055 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2057 else if (CONST_INT_P (*op))
2059 rtx vec_cst;
2060 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2062 /* Prefer an all-ones vector in case of -1.  */
2063 if (constm1_operand (*op, GET_MODE (*op)))
2064 vec_cst = CONSTM1_RTX (V2DImode);
2065 else
2066 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2067 gen_rtvec (2, *op, const0_rtx));
2069 if (!standard_sse_constant_p (vec_cst, V2DImode))
2071 start_sequence ();
2072 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2073 rtx_insn *seq = get_insns ();
2074 end_sequence ();
2075 emit_insn_before (seq, insn);
2078 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2079 *op = tmp;
2081 else
2083 gcc_assert (SUBREG_P (*op));
2084 gcc_assert (GET_MODE (*op) == V2DImode);
2088 /* Convert INSN to vector mode. */
2090 void
2091 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2093 rtx def_set = single_set (insn);
2094 rtx src = SET_SRC (def_set);
2095 rtx dst = SET_DEST (def_set);
2096 rtx subreg;
2098 if (MEM_P (dst) && !REG_P (src))
2100 /* The converted operation cannot store directly to memory,
2101    so compute into a temporary register and store it afterwards.  */
2102 rtx tmp = gen_reg_rtx (DImode);
2103 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2104 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2107 switch (GET_CODE (src))
2109 case ASHIFT:
2110 case ASHIFTRT:
2111 case LSHIFTRT:
2112 convert_op (&XEXP (src, 0), insn);
2113 PUT_MODE (src, V2DImode);
2114 break;
2116 case PLUS:
2117 case MINUS:
2118 case IOR:
2119 case XOR:
2120 case AND:
2121 convert_op (&XEXP (src, 0), insn);
2122 convert_op (&XEXP (src, 1), insn);
2123 PUT_MODE (src, V2DImode);
2124 break;
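/* NEG and NOT have no direct V2DImode form here: negation is
   open-coded as subtraction from a zero vector, and NOT as an
   XOR with an all-ones vector.  */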
2126 case NEG:
2127 src = XEXP (src, 0);
2128 convert_op (&src, insn);
2129 subreg = gen_reg_rtx (V2DImode);
2130 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2131 src = gen_rtx_MINUS (V2DImode, subreg, src);
2132 break;
2134 case NOT:
2135 src = XEXP (src, 0);
2136 convert_op (&src, insn);
2137 subreg = gen_reg_rtx (V2DImode);
2138 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2139 src = gen_rtx_XOR (V2DImode, src, subreg);
2140 break;
2142 case MEM:
2143 if (!REG_P (dst))
2144 convert_op (&src, insn);
2145 break;
2147 case REG:
2148 if (!MEM_P (dst))
2149 convert_op (&src, insn);
2150 break;
2152 case SUBREG:
2153 gcc_assert (GET_MODE (src) == V2DImode);
2154 break;
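/* A COMPARE of a DImode value against zero is rewritten as a ptest:
   the 64-bit value is duplicated into both V2DImode lanes with
   vec_interleave_lowv2di and the flags result is produced by an
   UNSPEC_PTEST of the vector with itself.  */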
2156 case COMPARE:
2157 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2159 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2160 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2162 if (REG_P (src))
2163 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2164 else
2165 subreg = copy_rtx_if_shared (src);
2166 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2167 copy_rtx_if_shared (subreg),
2168 copy_rtx_if_shared (subreg)),
2169 insn);
2170 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2171 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2172 copy_rtx_if_shared (src)),
2173 UNSPEC_PTEST);
2174 break;
2176 case CONST_INT:
2177 convert_op (&src, insn);
2178 break;
2180 default:
2181 gcc_unreachable ();
2184 SET_SRC (def_set) = src;
2185 SET_DEST (def_set) = dst;
2187 /* Drop possible dead definitions. */
2188 PATTERN (insn) = def_set;
2190 INSN_CODE (insn) = -1;
2191 recog_memoized (insn);
2192 df_insn_rescan (insn);
2195 /* Fix uses of converted REG in debug insns. */
2197 void
2198 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2200 if (!flag_var_tracking)
2201 return;
2203 df_ref ref, next;
2204 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2206 rtx_insn *insn = DF_REF_INSN (ref);
2207 /* Make sure the next ref is for a different instruction,
2208 so that we're not affected by the rescan. */
2209 next = DF_REF_NEXT_REG (ref);
2210 while (next && DF_REF_INSN (next) == insn)
2211 next = DF_REF_NEXT_REG (next);
2213 if (DEBUG_INSN_P (insn))
2215 /* It may be a debug insn with a TImode variable in
2216    a register.  */
2217 bool changed = false;
2218 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2220 rtx *loc = DF_REF_LOC (ref);
2221 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2223 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2224 changed = true;
2227 if (changed)
2228 df_insn_rescan (insn);
2233 /* Convert INSN from TImode to V1TImode.  */
2235 void
2236 timode_scalar_chain::convert_insn (rtx_insn *insn)
2238 rtx def_set = single_set (insn);
2239 rtx src = SET_SRC (def_set);
2240 rtx dst = SET_DEST (def_set);
2242 switch (GET_CODE (dst))
2244 case REG:
2246 rtx tmp = find_reg_equal_equiv_note (insn);
2247 if (tmp)
2248 PUT_MODE (XEXP (tmp, 0), V1TImode);
2249 PUT_MODE (dst, V1TImode);
2250 fix_debug_reg_uses (dst);
2252 break;
2253 case MEM:
2254 PUT_MODE (dst, V1TImode);
2255 break;
2257 default:
2258 gcc_unreachable ();
2261 switch (GET_CODE (src))
2263 case REG:
2264 PUT_MODE (src, V1TImode);
2265 /* Call fix_debug_reg_uses only if SRC is never defined. */
2266 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2267 fix_debug_reg_uses (src);
2268 break;
2270 case MEM:
2271 PUT_MODE (src, V1TImode);
2272 break;
2274 case CONST_WIDE_INT:
2275 if (NONDEBUG_INSN_P (insn))
2277 /* Since there is no instruction to store a 128-bit constant,
2278    a temporary register is required.  */
2279 rtx tmp = gen_reg_rtx (V1TImode);
2280 start_sequence ();
2281 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2282 src = validize_mem (force_const_mem (V1TImode, src));
2283 rtx_insn *seq = get_insns ();
2284 end_sequence ();
2285 if (seq)
2286 emit_insn_before (seq, insn);
2287 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2288 dst = tmp;
2290 break;
2292 case CONST_INT:
2293 switch (standard_sse_constant_p (src, TImode))
2295 case 1:
2296 src = CONST0_RTX (GET_MODE (dst));
2297 break;
2298 case 2:
2299 src = CONSTM1_RTX (GET_MODE (dst));
2300 break;
2301 default:
2302 gcc_unreachable ();
2304 if (NONDEBUG_INSN_P (insn))
2306 rtx tmp = gen_reg_rtx (V1TImode);
2307 /* Since there is no instruction to store a standard SSE
2308    constant, a temporary register is required.  */
2309 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2310 dst = tmp;
2312 break;
2314 default:
2315 gcc_unreachable ();
2318 SET_SRC (def_set) = src;
2319 SET_DEST (def_set) = dst;
2321 /* Drop possible dead definitions. */
2322 PATTERN (insn) = def_set;
2324 INSN_CODE (insn) = -1;
2325 recog_memoized (insn);
2326 df_insn_rescan (insn);
2329 void
2330 dimode_scalar_chain::convert_registers ()
2332 bitmap_iterator bi;
2333 unsigned id;
2335 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2336 convert_reg (id);
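/* Registers that need conversion but are not defined inside the
   chain are satisfied with scalar-to-vector copies instead.  */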
2338 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2339 make_vector_copies (id);
2342 /* Convert the whole chain, creating the required register
2343    conversions and copies.  */
2346 scalar_chain::convert ()
2348 bitmap_iterator bi;
2349 unsigned id;
2350 int converted_insns = 0;
2352 if (!dbg_cnt (stv_conversion))
2353 return 0;
2355 if (dump_file)
2356 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2358 convert_registers ();
2360 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2362 convert_insn (DF_INSN_UID_GET (id)->insn);
2363 converted_insns++;
2366 return converted_insns;
2369 /* Main STV pass function. Find and convert scalar
2370 instructions into vector mode when profitable. */
2372 static unsigned int
2373 convert_scalars_to_vector ()
2375 basic_block bb;
2376 bitmap candidates;
2377 int converted_insns = 0;
2379 bitmap_obstack_initialize (NULL);
2380 candidates = BITMAP_ALLOC (NULL);
2382 calculate_dominance_info (CDI_DOMINATORS);
2383 df_set_flags (DF_DEFER_INSN_RESCAN);
2384 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2385 df_md_add_problem ();
2386 df_analyze ();
2388 /* Find all instructions we want to convert into vector mode. */
2389 if (dump_file)
2390 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2392 FOR_EACH_BB_FN (bb, cfun)
2394 rtx_insn *insn;
2395 FOR_BB_INSNS (bb, insn)
2396 if (scalar_to_vector_candidate_p (insn))
2398 if (dump_file)
2399 fprintf (dump_file, " insn %d is marked as a candidate\n",
2400 INSN_UID (insn));
2402 bitmap_set_bit (candidates, INSN_UID (insn));
2406 remove_non_convertible_regs (candidates);
2408 if (bitmap_empty_p (candidates))
2409 if (dump_file)
2410 fprintf (dump_file, "There are no candidates for optimization.\n");
2412 while (!bitmap_empty_p (candidates))
2414 unsigned uid = bitmap_first_set_bit (candidates);
2415 scalar_chain *chain;
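/* In 64-bit mode the STV pass converts TImode chains; in 32-bit
   mode it converts DImode chains (see the pass_stv gate below).  */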
2417 if (TARGET_64BIT)
2418 chain = new timode_scalar_chain;
2419 else
2420 chain = new dimode_scalar_chain;
2422 /* Find the instruction chain we want to convert to vector mode.
2423    Check all uses and definitions to estimate all required
2424    conversions.  */
2425 chain->build (candidates, uid);
2427 if (chain->compute_convert_gain () > 0)
2428 converted_insns += chain->convert ();
2429 else
2430 if (dump_file)
2431 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2432 chain->chain_id);
2434 delete chain;
2437 if (dump_file)
2438 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2440 BITMAP_FREE (candidates);
2441 bitmap_obstack_release (NULL);
2442 df_process_deferred_rescans ();
2444 /* Conversion means we may have 128-bit register spills/fills,
2445    which require an aligned stack.  */
2446 if (converted_insns)
2448 if (crtl->stack_alignment_needed < 128)
2449 crtl->stack_alignment_needed = 128;
2450 if (crtl->stack_alignment_estimated < 128)
2451 crtl->stack_alignment_estimated = 128;
2452 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2453 if (TARGET_64BIT)
2454 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2455 parm; parm = DECL_CHAIN (parm))
2457 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2458 continue;
2459 if (DECL_RTL_SET_P (parm)
2460 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2462 rtx r = DECL_RTL (parm);
2463 if (REG_P (r))
2464 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2466 if (DECL_INCOMING_RTL (parm)
2467 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2469 rtx r = DECL_INCOMING_RTL (parm);
2470 if (REG_P (r))
2471 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2476 return 0;
2479 namespace {
2481 const pass_data pass_data_insert_vzeroupper =
2483 RTL_PASS, /* type */
2484 "vzeroupper", /* name */
2485 OPTGROUP_NONE, /* optinfo_flags */
2486 TV_MACH_DEP, /* tv_id */
2487 0, /* properties_required */
2488 0, /* properties_provided */
2489 0, /* properties_destroyed */
2490 0, /* todo_flags_start */
2491 TODO_df_finish, /* todo_flags_finish */
2494 class pass_insert_vzeroupper : public rtl_opt_pass
2496 public:
2497 pass_insert_vzeroupper(gcc::context *ctxt)
2498 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2501 /* opt_pass methods: */
2502 virtual bool gate (function *)
2504 return TARGET_AVX
2505 && TARGET_VZEROUPPER && flag_expensive_optimizations
2506 && !optimize_size;
2509 virtual unsigned int execute (function *)
2511 return rest_of_handle_insert_vzeroupper ();
2514 }; // class pass_insert_vzeroupper
2516 const pass_data pass_data_stv =
2518 RTL_PASS, /* type */
2519 "stv", /* name */
2520 OPTGROUP_NONE, /* optinfo_flags */
2521 TV_MACH_DEP, /* tv_id */
2522 0, /* properties_required */
2523 0, /* properties_provided */
2524 0, /* properties_destroyed */
2525 0, /* todo_flags_start */
2526 TODO_df_finish, /* todo_flags_finish */
2529 class pass_stv : public rtl_opt_pass
2531 public:
2532 pass_stv (gcc::context *ctxt)
2533 : rtl_opt_pass (pass_data_stv, ctxt),
2534 timode_p (false)
2537 /* opt_pass methods: */
2538 virtual bool gate (function *)
2540 return (timode_p == !!TARGET_64BIT
2541 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2544 virtual unsigned int execute (function *)
2546 return convert_scalars_to_vector ();
2549 opt_pass *clone ()
2551 return new pass_stv (m_ctxt);
2554 void set_pass_param (unsigned int n, bool param)
2556 gcc_assert (n == 0);
2557 timode_p = param;
2560 private:
2561 bool timode_p;
2562 }; // class pass_stv
2564 } // anon namespace
2566 rtl_opt_pass *
2567 make_pass_insert_vzeroupper (gcc::context *ctxt)
2569 return new pass_insert_vzeroupper (ctxt);
2572 rtl_opt_pass *
2573 make_pass_stv (gcc::context *ctxt)
2575 return new pass_stv (ctxt);
2578 /* Inserting ENDBRANCH instructions. */
2580 static unsigned int
2581 rest_of_insert_endbranch (void)
2583 timevar_push (TV_MACH_DEP);
2585 rtx cet_eb;
2586 rtx_insn *insn;
2587 basic_block bb;
2589 /* Currently emit ENDBR if this is a tracked function, i.e. 'nocf_check' is
2590    absent from the function attributes.  Later an optimization will be
2591    introduced that analyzes whether the address of a static function is
2592    taken.  A static function whose address is not taken will get a
2593    nocf_check attribute, which will reduce the number of ENDBR instructions.  */
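/* In addition to the function entry, the loop below also places ENDBR
   after setjmp-like calls, at the targets of switch-table jumps when
   flag_cet_switch is set, and after preserved labels (including
   deleted-label notes), whose addresses may be taken indirectly.  */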
2595 if (!lookup_attribute ("nocf_check",
2596 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2597 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2599 cet_eb = gen_nop_endbr ();
2601 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2602 insn = BB_HEAD (bb);
2603 emit_insn_before (cet_eb, insn);
2606 bb = 0;
2607 FOR_EACH_BB_FN (bb, cfun)
2609 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2610 insn = NEXT_INSN (insn))
2612 if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN)
2614 if (find_reg_note (insn, REG_SETJMP, NULL) == NULL)
2615 continue;
2616 /* Generate ENDBRANCH after a CALL that can return more than
2617    once (setjmp-like functions).  */
2619 /* Skip notes and debug insns that must be next to the
2620 call insn. ??? This might skip a lot more than
2621 that... ??? Skipping barriers and emitting code
2622 after them surely looks like a mistake; we probably
2623 won't ever hit it, for we'll hit BB_END first. */
2624 rtx_insn *next_insn = insn;
2625 while ((next_insn != BB_END (bb))
2626 && (DEBUG_INSN_P (NEXT_INSN (next_insn))
2627 || NOTE_P (NEXT_INSN (next_insn))
2628 || BARRIER_P (NEXT_INSN (next_insn))))
2629 next_insn = NEXT_INSN (next_insn);
2631 cet_eb = gen_nop_endbr ();
2632 emit_insn_after_setloc (cet_eb, next_insn, INSN_LOCATION (insn));
2633 continue;
2636 if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch)
2638 rtx target = JUMP_LABEL (insn);
2639 if (target == NULL_RTX || ANY_RETURN_P (target))
2640 continue;
2642 /* Check that the jump targets a switch table.  */
2643 rtx_insn *label = as_a<rtx_insn *> (target);
2644 rtx_insn *table = next_insn (label);
2645 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2646 continue;
2648 /* For the indirect jump, find all places it can jump to and insert
2649    ENDBRANCH there.  This is done under a special flag that
2650    controls ENDBRANCH generation for switch statements.  */
2651 edge_iterator ei;
2652 edge e;
2653 basic_block dest_blk;
2655 FOR_EACH_EDGE (e, ei, bb->succs)
2657 rtx_insn *insn;
2659 dest_blk = e->dest;
2660 insn = BB_HEAD (dest_blk);
2661 gcc_assert (LABEL_P (insn));
2662 cet_eb = gen_nop_endbr ();
2663 emit_insn_after (cet_eb, insn);
2665 continue;
2668 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2669 || (NOTE_P (insn)
2670 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2671 /* TODO. Check /s bit also. */
2673 cet_eb = gen_nop_endbr ();
2674 emit_insn_after (cet_eb, insn);
2675 continue;
2680 timevar_pop (TV_MACH_DEP);
2681 return 0;
2684 namespace {
2686 const pass_data pass_data_insert_endbranch =
2688 RTL_PASS, /* type. */
2689 "cet", /* name. */
2690 OPTGROUP_NONE, /* optinfo_flags. */
2691 TV_MACH_DEP, /* tv_id. */
2692 0, /* properties_required. */
2693 0, /* properties_provided. */
2694 0, /* properties_destroyed. */
2695 0, /* todo_flags_start. */
2696 0, /* todo_flags_finish. */
2699 class pass_insert_endbranch : public rtl_opt_pass
2701 public:
2702 pass_insert_endbranch (gcc::context *ctxt)
2703 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2706 /* opt_pass methods: */
2707 virtual bool gate (function *)
2709 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2712 virtual unsigned int execute (function *)
2714 return rest_of_insert_endbranch ();
2717 }; // class pass_insert_endbranch
2719 } // anon namespace
2721 rtl_opt_pass *
2722 make_pass_insert_endbranch (gcc::context *ctxt)
2724 return new pass_insert_endbranch (ctxt);
2727 /* Return true if a red zone is in use.  We can't use the red zone when
2728    there are local indirect jumps, like "indirect_jump" or "tablejump",
2729    which jump to another place in the function, since "call" in the
2730    indirect thunk pushes the return address onto the stack, destroying
2731    the red zone.
2733    TODO: If we can reserve the first 2 WORDs, one for PUSH and another
2734    for CALL, in the red zone, we can allow local indirect jumps with
2735    an indirect thunk.  */
2737 bool
2738 ix86_using_red_zone (void)
2740 return (TARGET_RED_ZONE
2741 && !TARGET_64BIT_MS_ABI
2742 && (!cfun->machine->has_local_indirect_jump
2743 || cfun->machine->indirect_branch_type == indirect_branch_keep));
2746 /* Return a string that documents the current -m options. The caller is
2747 responsible for freeing the string. */
2749 static char *
2750 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2751 int flags, int flags2,
2752 const char *arch, const char *tune,
2753 enum fpmath_unit fpmath, bool add_nl_p)
2755 struct ix86_target_opts
2757 const char *option; /* option string */
2758 HOST_WIDE_INT mask; /* isa mask options */
2761 /* This table is ordered so that options like -msse4.2 that imply other
2762 ISAs come first. Target string will be displayed in the same order. */
2763 static struct ix86_target_opts isa2_opts[] =
2765 { "-mcx16", OPTION_MASK_ISA_CX16 },
2766 { "-mmpx", OPTION_MASK_ISA_MPX },
2767 { "-mvaes", OPTION_MASK_ISA_VAES },
2768 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2769 { "-msgx", OPTION_MASK_ISA_SGX },
2770 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2771 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2772 { "-mibt", OPTION_MASK_ISA_IBT },
2773 { "-mhle", OPTION_MASK_ISA_HLE },
2774 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2775 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2776 { "-mmwaitx", OPTION_MASK_ISA_MWAITX }
2778 static struct ix86_target_opts isa_opts[] =
2780 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2781 { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
2782 { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ },
2783 { "-mgfni", OPTION_MASK_ISA_GFNI },
2784 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2785 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2786 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2787 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2788 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2789 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2790 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2791 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2792 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2793 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2794 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2795 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2796 { "-mfma", OPTION_MASK_ISA_FMA },
2797 { "-mxop", OPTION_MASK_ISA_XOP },
2798 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2799 { "-mf16c", OPTION_MASK_ISA_F16C },
2800 { "-mavx", OPTION_MASK_ISA_AVX },
2801 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2802 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2803 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2804 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2805 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2806 { "-msse3", OPTION_MASK_ISA_SSE3 },
2807 { "-maes", OPTION_MASK_ISA_AES },
2808 { "-msha", OPTION_MASK_ISA_SHA },
2809 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2810 { "-msse2", OPTION_MASK_ISA_SSE2 },
2811 { "-msse", OPTION_MASK_ISA_SSE },
2812 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2813 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2814 { "-mmmx", OPTION_MASK_ISA_MMX },
2815 { "-mrtm", OPTION_MASK_ISA_RTM },
2816 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2817 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2818 { "-madx", OPTION_MASK_ISA_ADX },
2819 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2820 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2821 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2822 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2823 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2824 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2825 { "-mabm", OPTION_MASK_ISA_ABM },
2826 { "-mbmi", OPTION_MASK_ISA_BMI },
2827 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2828 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2829 { "-mtbm", OPTION_MASK_ISA_TBM },
2830 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2831 { "-msahf", OPTION_MASK_ISA_SAHF },
2832 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2833 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2834 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2835 { "-mpku", OPTION_MASK_ISA_PKU },
2836 { "-mlwp", OPTION_MASK_ISA_LWP },
2837 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2838 { "-mclwb", OPTION_MASK_ISA_CLWB },
2839 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2842 /* Flag options. */
2843 static struct ix86_target_opts flag_opts[] =
2845 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2846 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2847 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2848 { "-m80387", MASK_80387 },
2849 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2850 { "-malign-double", MASK_ALIGN_DOUBLE },
2851 { "-mcld", MASK_CLD },
2852 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2853 { "-mieee-fp", MASK_IEEE_FP },
2854 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2855 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2856 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2857 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2858 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2859 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2860 { "-mno-red-zone", MASK_NO_RED_ZONE },
2861 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2862 { "-mrecip", MASK_RECIP },
2863 { "-mrtd", MASK_RTD },
2864 { "-msseregparm", MASK_SSEREGPARM },
2865 { "-mstack-arg-probe", MASK_STACK_PROBE },
2866 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2867 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2868 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2869 { "-mvzeroupper", MASK_VZEROUPPER },
2870 { "-mstv", MASK_STV },
2871 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2872 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2873 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2876 /* Additional flag options. */
2877 static struct ix86_target_opts flag2_opts[] =
2879 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2882 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2883 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2885 char isa_other[40];
2886 char isa2_other[40];
2887 char flags_other[40];
2888 char flags2_other[40];
2889 unsigned num = 0;
2890 unsigned i, j;
2891 char *ret;
2892 char *ptr;
2893 size_t len;
2894 size_t line_len;
2895 size_t sep_len;
2896 const char *abi;
2898 memset (opts, '\0', sizeof (opts));
2900 /* Add -march= option. */
2901 if (arch)
2903 opts[num][0] = "-march=";
2904 opts[num++][1] = arch;
2907 /* Add -mtune= option. */
2908 if (tune)
2910 opts[num][0] = "-mtune=";
2911 opts[num++][1] = tune;
2914 /* Add -m32/-m64/-mx32. */
2915 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2917 if ((isa & OPTION_MASK_ABI_64) != 0)
2918 abi = "-m64";
2919 else
2920 abi = "-mx32";
2921 isa &= ~ (OPTION_MASK_ISA_64BIT
2922 | OPTION_MASK_ABI_64
2923 | OPTION_MASK_ABI_X32);
2925 else
2926 abi = "-m32";
2927 opts[num++][0] = abi;
2929 /* Pick out the options in isa2 options. */
2930 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2932 if ((isa2 & isa2_opts[i].mask) != 0)
2934 opts[num++][0] = isa2_opts[i].option;
2935 isa2 &= ~ isa2_opts[i].mask;
2939 if (isa2 && add_nl_p)
2941 opts[num++][0] = isa2_other;
2942 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2945 /* Pick out the options in isa options. */
2946 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2948 if ((isa & isa_opts[i].mask) != 0)
2950 opts[num++][0] = isa_opts[i].option;
2951 isa &= ~ isa_opts[i].mask;
2955 if (isa && add_nl_p)
2957 opts[num++][0] = isa_other;
2958 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2961 /* Add flag options. */
2962 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2964 if ((flags & flag_opts[i].mask) != 0)
2966 opts[num++][0] = flag_opts[i].option;
2967 flags &= ~ flag_opts[i].mask;
2971 if (flags && add_nl_p)
2973 opts[num++][0] = flags_other;
2974 sprintf (flags_other, "(other flags: %#x)", flags);
2977 /* Add additional flag options. */
2978 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2980 if ((flags2 & flag2_opts[i].mask) != 0)
2982 opts[num++][0] = flag2_opts[i].option;
2983 flags2 &= ~ flag2_opts[i].mask;
2987 if (flags2 && add_nl_p)
2989 opts[num++][0] = flags2_other;
2990 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2993 /* Add -fpmath= option. */
2994 if (fpmath)
2996 opts[num][0] = "-mfpmath=";
2997 switch ((int) fpmath)
2999 case FPMATH_387:
3000 opts[num++][1] = "387";
3001 break;
3003 case FPMATH_SSE:
3004 opts[num++][1] = "sse";
3005 break;
3007 case FPMATH_387 | FPMATH_SSE:
3008 opts[num++][1] = "sse+387";
3009 break;
3011 default:
3012 gcc_unreachable ();
3016 /* Any options? */
3017 if (num == 0)
3018 return NULL;
3020 gcc_assert (num < ARRAY_SIZE (opts));
3022 /* Size the string. */
3023 len = 0;
3024 sep_len = (add_nl_p) ? 3 : 1;
3025 for (i = 0; i < num; i++)
3027 len += sep_len;
3028 for (j = 0; j < 2; j++)
3029 if (opts[i][j])
3030 len += strlen (opts[i][j]);
3033 /* Build the string. */
3034 ret = ptr = (char *) xmalloc (len);
3035 line_len = 0;
3037 for (i = 0; i < num; i++)
3039 size_t len2[2];
3041 for (j = 0; j < 2; j++)
3042 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3044 if (i != 0)
3046 *ptr++ = ' ';
3047 line_len++;
3049 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3051 *ptr++ = '\\';
3052 *ptr++ = '\n';
3053 line_len = 0;
3057 for (j = 0; j < 2; j++)
3058 if (opts[i][j])
3060 memcpy (ptr, opts[i][j], len2[j]);
3061 ptr += len2[j];
3062 line_len += len2[j];
3066 *ptr = '\0';
3067 gcc_assert (ret + len >= ptr);
3069 return ret;
3072 /* Return true if profiling code should be emitted before the
3073    prologue, and false otherwise.
3074    Note: for x86 the "hotfix" case is rejected with sorry ().  */
3075 static bool
3076 ix86_profile_before_prologue (void)
3078 return flag_fentry != 0;
3081 /* Function that is callable from the debugger to print the current
3082 options. */
3083 void ATTRIBUTE_UNUSED
3084 ix86_debug_options (void)
3086 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3087 target_flags, ix86_target_flags,
3088 ix86_arch_string,ix86_tune_string,
3089 ix86_fpmath, true);
3091 if (opts)
3093 fprintf (stderr, "%s\n\n", opts);
3094 free (opts);
3096 else
3097 fputs ("<no options>\n\n", stderr);
3099 return;
3102 /* Return true if T is one of the bytes we should avoid with
3103 -mmitigate-rop. */
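/* These are the opcodes of the near and far RET instructions, with
   and without an immediate stack-adjustment operand (0xc3/0xc2 and
   0xcb/0xca respectively).  */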
3105 static bool
3106 ix86_rop_should_change_byte_p (int t)
3108 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3111 static const char *stringop_alg_names[] = {
3112 #define DEF_ENUM
3113 #define DEF_ALG(alg, name) #name,
3114 #include "stringop.def"
3115 #undef DEF_ENUM
3116 #undef DEF_ALG
3119 /* Parse the parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3120    The string is of the following form (or a comma-separated list of such entries):
3122    strategy_alg:max_size:[align|noalign]
3124    where the full size range for the strategy is either [0, max_size] or
3125    [min_size, max_size], in which min_size is max_size + 1 of the
3126    preceding range.  The last size range must have max_size == -1.
3128 Examples:
3131 -mmemcpy-strategy=libcall:-1:noalign
3133 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3137 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3139 This is to tell the compiler to use the following strategy for memset
3140 1) when the expected size is between [1, 16], use rep_8byte strategy;
3141 2) when the size is between [17, 2048], use vector_loop;
3142 3) when the size is > 2048, use libcall. */
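/* As a further illustration (hypothetical invocation):
   -mmemcpy-strategy=unrolled_loop:256:align,libcall:-1:noalign
   uses an unrolled inline loop for sizes in [1, 256] and a library
   call for anything larger.  */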
3144 struct stringop_size_range
3146 int max;
3147 stringop_alg alg;
3148 bool noalign;
3151 static void
3152 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3154 const struct stringop_algs *default_algs;
3155 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3156 char *curr_range_str, *next_range_str;
3157 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3158 int i = 0, n = 0;
3160 if (is_memset)
3161 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3162 else
3163 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3165 curr_range_str = strategy_str;
3169 int maxs;
3170 char alg_name[128];
3171 char align[16];
3172 next_range_str = strchr (curr_range_str, ',');
3173 if (next_range_str)
3174 *next_range_str++ = '\0';
3176 if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
3177 align) != 3)
3179 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3180 return;
3183 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3185 error ("size ranges of option %qs should be increasing", opt);
3186 return;
3189 for (i = 0; i < last_alg; i++)
3190 if (!strcmp (alg_name, stringop_alg_names[i]))
3191 break;
3193 if (i == last_alg)
3195 error ("wrong strategy name %qs specified for option %qs",
3196 alg_name, opt);
3198 auto_vec <const char *> candidates;
3199 for (i = 0; i < last_alg; i++)
3200 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3201 candidates.safe_push (stringop_alg_names[i]);
3203 char *s;
3204 const char *hint
3205 = candidates_list_and_hint (alg_name, s, candidates);
3206 if (hint)
3207 inform (input_location,
3208 "valid arguments to %qs are: %s; did you mean %qs?",
3209 opt, s, hint);
3210 else
3211 inform (input_location, "valid arguments to %qs are: %s",
3212 opt, s);
3213 XDELETEVEC (s);
3214 return;
3217 if ((stringop_alg) i == rep_prefix_8_byte
3218 && !TARGET_64BIT)
3220 /* rep; movq isn't available in 32-bit code. */
3221 error ("strategy name %qs specified for option %qs "
3222 "not supported for 32-bit code", alg_name, opt);
3223 return;
3226 input_ranges[n].max = maxs;
3227 input_ranges[n].alg = (stringop_alg) i;
3228 if (!strcmp (align, "align"))
3229 input_ranges[n].noalign = false;
3230 else if (!strcmp (align, "noalign"))
3231 input_ranges[n].noalign = true;
3232 else
3234 error ("unknown alignment %qs specified for option %qs", align, opt);
3235 return;
3237 n++;
3238 curr_range_str = next_range_str;
3240 while (curr_range_str);
3242 if (input_ranges[n - 1].max != -1)
3244 error ("the max value for the last size range should be -1"
3245 " for option %qs", opt);
3246 return;
3249 if (n > MAX_STRINGOP_ALGS)
3251 error ("too many size ranges specified in option %qs", opt);
3252 return;
3255 /* Now override the default algs array. */
3256 for (i = 0; i < n; i++)
3258 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3259 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3260 = input_ranges[i].alg;
3261 *const_cast<int *>(&default_algs->size[i].noalign)
3262 = input_ranges[i].noalign;
3267 /* Parse the -mtune-ctrl= option.  When DUMP is true,
3268    print the features that are explicitly set.  */
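/* For example (illustrative; feature names come from x86-tune.def):
   -mtune-ctrl=^use_incdec,pad_returns clears X86_TUNE_USE_INCDEC and
   sets X86_TUNE_PAD_RETURNS; a leading '^' negates a feature.  */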
3270 static void
3271 parse_mtune_ctrl_str (bool dump)
3273 if (!ix86_tune_ctrl_string)
3274 return;
3276 char *next_feature_string = NULL;
3277 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3278 char *orig = curr_feature_string;
3279 int i;
3282 bool clear = false;
3284 next_feature_string = strchr (curr_feature_string, ',');
3285 if (next_feature_string)
3286 *next_feature_string++ = '\0';
3287 if (*curr_feature_string == '^')
3289 curr_feature_string++;
3290 clear = true;
3292 for (i = 0; i < X86_TUNE_LAST; i++)
3294 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3296 ix86_tune_features[i] = !clear;
3297 if (dump)
3298 fprintf (stderr, "Explicitly %s feature %s\n",
3299 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3300 break;
3303 if (i == X86_TUNE_LAST)
3304 error ("unknown parameter to option -mtune-ctrl: %s",
3305 clear ? curr_feature_string - 1 : curr_feature_string);
3306 curr_feature_string = next_feature_string;
3308 while (curr_feature_string);
3309 free (orig);
3312 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3313 processor type. */
3315 static void
3316 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3318 unsigned int ix86_tune_mask = 1u << ix86_tune;
3319 int i;
3321 for (i = 0; i < X86_TUNE_LAST; ++i)
3323 if (ix86_tune_no_default)
3324 ix86_tune_features[i] = 0;
3325 else
3326 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3329 if (dump)
3331 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3332 for (i = 0; i < X86_TUNE_LAST; i++)
3333 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3334 ix86_tune_features[i] ? "on" : "off");
3337 parse_mtune_ctrl_str (dump);
3341 /* Set the default align_* values from the processor table.  */
3343 static void
3344 ix86_default_align (struct gcc_options *opts)
3346 if (opts->x_align_loops == 0)
3348 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3349 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3351 if (opts->x_align_jumps == 0)
3353 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3354 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3356 if (opts->x_align_functions == 0)
3358 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3362 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3364 static void
3365 ix86_override_options_after_change (void)
3367 ix86_default_align (&global_options);
3370 /* Override various settings based on options.  If MAIN_ARGS_P, the
3371    options are from the command line, otherwise they are from
3372    attributes.  Return true if there's an error related to the
3373    -march option.  */
3375 static bool
3376 ix86_option_override_internal (bool main_args_p,
3377 struct gcc_options *opts,
3378 struct gcc_options *opts_set)
3380 int i;
3381 unsigned int ix86_arch_mask;
3382 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3384 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3385 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3386 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3387 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3388 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3389 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3390 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3391 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3392 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3393 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3394 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3395 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3396 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3397 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3398 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3399 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3400 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3401 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3402 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3403 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3404 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3405 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3406 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3407 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3408 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3409 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3410 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3411 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3412 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3413 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3414 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3415 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3416 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3417 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3418 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3419 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3420 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3421 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3422 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3423 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3424 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3425 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3426 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3427 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3428 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3429 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
3430 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
3431 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
3432 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
3433 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
3434 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
3435 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
3436 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
3437 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
3438 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
3439 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
3440 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
3441 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
3442 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
3443 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
3444 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
3445 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
3446 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
3447 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
3449 #define PTA_CORE2 \
3450 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
3451 | PTA_CX16 | PTA_FXSR)
3452 #define PTA_NEHALEM \
3453 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
3454 #define PTA_WESTMERE \
3455 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
3456 #define PTA_SANDYBRIDGE \
3457 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
3458 #define PTA_IVYBRIDGE \
3459 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
3460 #define PTA_HASWELL \
3461 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
3462 | PTA_FMA | PTA_MOVBE | PTA_HLE)
3463 #define PTA_BROADWELL \
3464 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
3465 #define PTA_SKYLAKE \
3466 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
3467 #define PTA_SKYLAKE_AVX512 \
3468 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
3469 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU | PTA_CLWB)
3470 #define PTA_CANNONLAKE \
3471 (PTA_SKYLAKE_AVX512 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA)
3472 #define PTA_KNL \
3473 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
3474 #define PTA_BONNELL \
3475 (PTA_CORE2 | PTA_MOVBE)
3476 #define PTA_SILVERMONT \
3477 (PTA_WESTMERE | PTA_MOVBE | PTA_RDRND)
3478 #define PTA_KNM \
3479 (PTA_KNL | PTA_AVX5124VNNIW | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ)
3481 /* If this reaches 64, we need to widen the struct pta flags field below.  */
3483 static struct pta
3485 const char *const name; /* processor name or nickname. */
3486 const enum processor_type processor;
3487 const enum attr_cpu schedule;
3488 const unsigned HOST_WIDE_INT flags;
3490 const processor_alias_table[] =
3492 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3493 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3494 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3495 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3496 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3497 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3498 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3499 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3500 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3501 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3502 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3503 PTA_MMX | PTA_SSE | PTA_FXSR},
3504 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3505 PTA_MMX | PTA_SSE | PTA_FXSR},
3506 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3507 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3508 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3509 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3510 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3511 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3512 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3513 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3514 PTA_MMX | PTA_SSE | PTA_FXSR},
3515 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3516 PTA_MMX | PTA_SSE | PTA_FXSR},
3517 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3518 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3519 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3520 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3521 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3522 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3523 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3524 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3525 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3526 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3527 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3528 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3529 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3530 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3531 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3532 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3533 PTA_SANDYBRIDGE},
3534 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3535 PTA_SANDYBRIDGE},
3536 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3537 PTA_IVYBRIDGE},
3538 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3539 PTA_IVYBRIDGE},
3540 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3541 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3542 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3543 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3544 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3545 PTA_SKYLAKE_AVX512},
3546 {"cannonlake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_CANNONLAKE},
3547 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3548 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3549 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3550 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3551 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3552 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3553 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3554 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3555 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3556 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3557 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3558 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3559 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3560 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3561 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3562 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3563 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3564 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3565 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3566 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3567 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3568 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3569 {"x86-64", PROCESSOR_K8, CPU_K8,
3570 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3571 {"eden-x2", PROCESSOR_K8, CPU_K8,
3572 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3573 {"nano", PROCESSOR_K8, CPU_K8,
3574 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3575 | PTA_SSSE3 | PTA_FXSR},
3576 {"nano-1000", PROCESSOR_K8, CPU_K8,
3577 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3578 | PTA_SSSE3 | PTA_FXSR},
3579 {"nano-2000", PROCESSOR_K8, CPU_K8,
3580 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3581 | PTA_SSSE3 | PTA_FXSR},
3582 {"nano-3000", PROCESSOR_K8, CPU_K8,
3583 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3584 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3585 {"nano-x2", PROCESSOR_K8, CPU_K8,
3586 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3587 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3588 {"eden-x4", PROCESSOR_K8, CPU_K8,
3589 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3590 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3591 {"nano-x4", PROCESSOR_K8, CPU_K8,
3592 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3593 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3594 {"k8", PROCESSOR_K8, CPU_K8,
3595 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3596 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3597 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3598 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3599 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3600 {"opteron", PROCESSOR_K8, CPU_K8,
3601 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3602 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3603 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3604 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3605 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3606 {"athlon64", PROCESSOR_K8, CPU_K8,
3607 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3608 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3609 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3610 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3611 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3612 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3613 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3614 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3615 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3616 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3617 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3618 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3619 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3620 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3621 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3622 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3623 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3624 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3625 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3626 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3627 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3628 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3629 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3630 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3631 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3632 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3633 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3634 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3635 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3636 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3637 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3638 | PTA_XSAVEOPT | PTA_FSGSBASE},
3639 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3640 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3641 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3642 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3643 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3644 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3645 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3646 | PTA_MOVBE | PTA_MWAITX},
3647 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3648 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3649 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3650 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3651 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3652 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3653 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3654 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3655 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3656 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3657 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3658 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3659 | PTA_FXSR | PTA_XSAVE},
3660 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3661 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3662 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3663 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3664 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3665 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3667 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3668 PTA_64BIT
3669 | PTA_HLE /* flags are only used for -march switch. */ },
3672 /* -mrecip options. */
3673 static struct
3675 const char *string; /* option name */
3676 unsigned int mask; /* mask bits to set */
3678 const recip_options[] =
3680 { "all", RECIP_MASK_ALL },
3681 { "none", RECIP_MASK_NONE },
3682 { "div", RECIP_MASK_DIV },
3683 { "sqrt", RECIP_MASK_SQRT },
3684 { "vec-div", RECIP_MASK_VEC_DIV },
3685 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3688 int const pta_size = ARRAY_SIZE (processor_alias_table);
3690 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3691 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3692 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3693 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3694 #ifdef TARGET_BI_ARCH
3695 else
3697 #if TARGET_BI_ARCH == 1
3698 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3699 is on and OPTION_MASK_ABI_X32 is off. We turn off
3700 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3701 -mx32. */
3702 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3703 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3704 #else
3705 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3706 on and OPTION_MASK_ABI_64 is off. We turn off
3707 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3708 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3709 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3710 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3711 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3712 #endif
3713 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3714 && TARGET_IAMCU_P (opts->x_target_flags))
3715 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3716 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3718 #endif
3720 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3722 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3723 OPTION_MASK_ABI_64 for TARGET_X32. */
3724 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3725 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3727 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3728 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3729 | OPTION_MASK_ABI_X32
3730 | OPTION_MASK_ABI_64);
3731 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3733 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3734 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3735 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3736 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3739 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3740 SUBTARGET_OVERRIDE_OPTIONS;
3741 #endif
3743 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3744 SUBSUBTARGET_OVERRIDE_OPTIONS;
3745 #endif
3747 /* -fPIC is the default for x86_64. */
3748 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3749 opts->x_flag_pic = 2;
3751 /* Need to check -mtune=generic first. */
3752 if (opts->x_ix86_tune_string)
3754 /* As special support for cross compilers we read -mtune=native
3755    as -mtune=generic.  With native compilers we won't see
3756    -mtune=native, as the driver has already rewritten it.  */
3757 if (!strcmp (opts->x_ix86_tune_string, "native"))
3759 opts->x_ix86_tune_string = "generic";
3761 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3762 warning (OPT_Wdeprecated,
3763 main_args_p
3764 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3765 "or %<-mtune=generic%> instead as appropriate")
3766 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3767 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3768 " instead as appropriate"));
3770 else
3772 if (opts->x_ix86_arch_string)
3773 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3774 if (!opts->x_ix86_tune_string)
3776 opts->x_ix86_tune_string
3777 = processor_target_table[TARGET_CPU_DEFAULT].name;
3778 ix86_tune_defaulted = 1;
3781 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3782 or defaulted. We need to use a sensible tune option. */
3783 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3785 opts->x_ix86_tune_string = "generic";
3789 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3790 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3792 /* rep; movq isn't available in 32-bit code. */
3793 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3794 opts->x_ix86_stringop_alg = no_stringop;
3797 if (!opts->x_ix86_arch_string)
3798 opts->x_ix86_arch_string
3799 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3800 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3801 else
3802 ix86_arch_specified = 1;
3804 if (opts_set->x_ix86_pmode)
3806 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3807 && opts->x_ix86_pmode == PMODE_SI)
3808 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3809 && opts->x_ix86_pmode == PMODE_DI))
3810 error ("address mode %qs not supported in the %s bit mode",
3811 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3812 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3814 else
3815 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3816 ? PMODE_DI : PMODE_SI;
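/* With no explicit -maddress-mode=, the pointer mode thus follows the
   ABI: PMODE_DI only for LP64, PMODE_SI for -m32 and for the ILP32 x32
   ABI.  x32 code may still request 64-bit addresses via
   -maddress-mode=long, which the check above permits.  */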
3818 if (!opts_set->x_ix86_abi)
3819 opts->x_ix86_abi = DEFAULT_ABI;
3821 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3822 error ("-mabi=ms not supported with X32 ABI");
3823 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3825 /* For targets using the ms ABI, enable ms-extensions if not
3826 explicitly turned off.  For non-ms ABIs we turn off this
3827 option.  */
3828 if (!opts_set->x_flag_ms_extensions)
3829 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3831 if (opts_set->x_ix86_cmodel)
3833 switch (opts->x_ix86_cmodel)
3835 case CM_SMALL:
3836 case CM_SMALL_PIC:
3837 if (opts->x_flag_pic)
3838 opts->x_ix86_cmodel = CM_SMALL_PIC;
3839 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3840 error ("code model %qs not supported in the %s bit mode",
3841 "small", "32");
3842 break;
3844 case CM_MEDIUM:
3845 case CM_MEDIUM_PIC:
3846 if (opts->x_flag_pic)
3847 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3848 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3849 error ("code model %qs not supported in the %s bit mode",
3850 "medium", "32");
3851 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3852 error ("code model %qs not supported in x32 mode",
3853 "medium");
3854 break;
3856 case CM_LARGE:
3857 case CM_LARGE_PIC:
3858 if (opts->x_flag_pic)
3859 opts->x_ix86_cmodel = CM_LARGE_PIC;
3860 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3861 error ("code model %qs not supported in the %s bit mode",
3862 "large", "32");
3863 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3864 error ("code model %qs not supported in x32 mode",
3865 "large");
3866 break;
3868 case CM_32:
3869 if (opts->x_flag_pic)
3870 error ("code model %s does not support PIC mode", "32");
3871 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3872 error ("code model %qs not supported in the %s bit mode",
3873 "32", "64");
3874 break;
3876 case CM_KERNEL:
3877 if (opts->x_flag_pic)
3879 error ("code model %s does not support PIC mode", "kernel");
3880 opts->x_ix86_cmodel = CM_32;
3882 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3883 error ("code model %qs not supported in the %s bit mode",
3884 "kernel", "32");
3885 break;
3887 default:
3888 gcc_unreachable ();
3891 else
3893 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3894 use of rip-relative addressing. This eliminates fixups that
3895 would otherwise be needed if this object is to be placed in a
3896 DLL, and is essentially just as efficient as direct addressing. */
3897 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3898 && (TARGET_RDOS || TARGET_PECOFF))
3899 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3900 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3901 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3902 else
3903 opts->x_ix86_cmodel = CM_32;
3905 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3907 error ("-masm=intel not supported in this configuration");
3908 opts->x_ix86_asm_dialect = ASM_ATT;
3910 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3911 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3912 sorry ("%i-bit mode not compiled in",
3913 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3915 for (i = 0; i < pta_size; i++)
3916 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3918 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3920 error (main_args_p
3921 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3922 "switch")
3923 : G_("%<generic%> CPU can be used only for "
3924 "%<target(\"tune=\")%> attribute"));
3925 return false;
3927 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3929 error (main_args_p
3930 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3931 "switch")
3932 : G_("%<intel%> CPU can be used only for "
3933 "%<target(\"tune=\")%> attribute"));
3934 return false;
3937 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3938 && !(processor_alias_table[i].flags & PTA_64BIT))
3940 error ("CPU you selected does not support x86-64 "
3941 "instruction set");
3942 return false;
3945 ix86_schedule = processor_alias_table[i].schedule;
3946 ix86_arch = processor_alias_table[i].processor;
3947 /* Default cpu tuning to the architecture. */
3948 ix86_tune = ix86_arch;
3950 if (processor_alias_table[i].flags & PTA_MMX
3951 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3952 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3953 if (processor_alias_table[i].flags & PTA_3DNOW
3954 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3955 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3956 if (processor_alias_table[i].flags & PTA_3DNOW_A
3957 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3958 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3959 if (processor_alias_table[i].flags & PTA_SSE
3960 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3961 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3962 if (processor_alias_table[i].flags & PTA_SSE2
3963 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3964 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3965 if (processor_alias_table[i].flags & PTA_SSE3
3966 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3967 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3968 if (processor_alias_table[i].flags & PTA_SSSE3
3969 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3970 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3971 if (processor_alias_table[i].flags & PTA_SSE4_1
3972 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3973 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3974 if (processor_alias_table[i].flags & PTA_SSE4_2
3975 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3976 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3977 if (processor_alias_table[i].flags & PTA_AVX
3978 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3979 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3980 if (processor_alias_table[i].flags & PTA_AVX2
3981 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3982 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3983 if (processor_alias_table[i].flags & PTA_FMA
3984 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3985 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3986 if (processor_alias_table[i].flags & PTA_SSE4A
3987 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3988 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3989 if (processor_alias_table[i].flags & PTA_FMA4
3990 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3991 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3992 if (processor_alias_table[i].flags & PTA_XOP
3993 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3994 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3995 if (processor_alias_table[i].flags & PTA_LWP
3996 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3997 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3998 if (processor_alias_table[i].flags & PTA_ABM
3999 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
4000 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
4001 if (processor_alias_table[i].flags & PTA_BMI
4002 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
4003 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
4004 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
4005 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
4006 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
4007 if (processor_alias_table[i].flags & PTA_TBM
4008 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
4009 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
4010 if (processor_alias_table[i].flags & PTA_BMI2
4011 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
4012 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
4013 if (processor_alias_table[i].flags & PTA_CX16
4014 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
4015 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
4016 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
4017 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4018 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4019 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4020 && (processor_alias_table[i].flags & PTA_NO_SAHF))
4021 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4022 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4023 if (processor_alias_table[i].flags & PTA_MOVBE
4024 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
4025 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
4026 if (processor_alias_table[i].flags & PTA_AES
4027 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4028 ix86_isa_flags |= OPTION_MASK_ISA_AES;
4029 if (processor_alias_table[i].flags & PTA_SHA
4030 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4031 ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4032 if (processor_alias_table[i].flags & PTA_PCLMUL
4033 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4034 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4035 if (processor_alias_table[i].flags & PTA_FSGSBASE
4036 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4037 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4038 if (processor_alias_table[i].flags & PTA_RDRND
4039 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4040 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4041 if (processor_alias_table[i].flags & PTA_F16C
4042 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4043 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4044 if (processor_alias_table[i].flags & PTA_RTM
4045 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4046 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4047 if (processor_alias_table[i].flags & PTA_HLE
4048 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
4049 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
4050 if (processor_alias_table[i].flags & PTA_PRFCHW
4051 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4052 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4053 if (processor_alias_table[i].flags & PTA_RDSEED
4054 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4055 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4056 if (processor_alias_table[i].flags & PTA_ADX
4057 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4058 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4059 if (processor_alias_table[i].flags & PTA_FXSR
4060 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4061 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4062 if (processor_alias_table[i].flags & PTA_XSAVE
4063 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4064 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4065 if (processor_alias_table[i].flags & PTA_XSAVEOPT
4066 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4067 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4068 if (processor_alias_table[i].flags & PTA_AVX512F
4069 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4070 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4071 if (processor_alias_table[i].flags & PTA_AVX512ER
4072 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4073 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4074 if (processor_alias_table[i].flags & PTA_AVX512PF
4075 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4076 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4077 if (processor_alias_table[i].flags & PTA_AVX512CD
4078 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4079 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4080 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
4081 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4082 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4083 if (processor_alias_table[i].flags & PTA_CLWB
4084 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4085 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4086 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
4087 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4088 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4089 if (processor_alias_table[i].flags & PTA_CLZERO
4090 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
4091 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
4092 if (processor_alias_table[i].flags & PTA_XSAVEC
4093 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4094 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4095 if (processor_alias_table[i].flags & PTA_XSAVES
4096 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4097 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4098 if (processor_alias_table[i].flags & PTA_AVX512DQ
4099 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4100 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4101 if (processor_alias_table[i].flags & PTA_AVX512BW
4102 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4103 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4104 if (processor_alias_table[i].flags & PTA_AVX512VL
4105 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4106 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4107 if (processor_alias_table[i].flags & PTA_MPX
4108 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4109 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4110 if (processor_alias_table[i].flags & PTA_AVX512VBMI
4111 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4112 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4113 if (processor_alias_table[i].flags & PTA_AVX512IFMA
4114 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4115 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4117 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
4118 && !(opts->x_ix86_isa_flags2_explicit
4119 & OPTION_MASK_ISA_AVX5124VNNIW))
4120 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4121 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
4122 && !(opts->x_ix86_isa_flags2_explicit
4123 & OPTION_MASK_ISA_AVX5124FMAPS))
4124 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4125 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
4126 && !(opts->x_ix86_isa_flags_explicit
4127 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4128 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4129 if (processor_alias_table[i].flags & PTA_SGX
4130 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4131 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4133 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
4134 x86_prefetch_sse = true;
4135 if (processor_alias_table[i].flags & PTA_MWAITX
4136 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
4137 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
4138 if (processor_alias_table[i].flags & PTA_PKU
4139 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4140 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4142 /* Don't enable x87 instructions if only
4143 general registers are allowed. */
4144 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4145 && !(opts_set->x_target_flags & MASK_80387))
4147 if (processor_alias_table[i].flags & PTA_NO_80387)
4148 opts->x_target_flags &= ~MASK_80387;
4149 else
4150 opts->x_target_flags |= MASK_80387;
4152 break;
4155 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4156 error ("Intel MPX does not support x32");
4158 if (TARGET_X32 && (ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4159 error ("Intel MPX does not support x32");
4161 if (i == pta_size)
4163 error (main_args_p
4164 ? G_("bad value (%qs) for %<-march=%> switch")
4165 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4166 opts->x_ix86_arch_string);
4168 auto_vec <const char *> candidates;
4169 for (i = 0; i < pta_size; i++)
4170 if (strcmp (processor_alias_table[i].name, "generic")
4171 && strcmp (processor_alias_table[i].name, "intel")
4172 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4173 || (processor_alias_table[i].flags & PTA_64BIT)))
4174 candidates.safe_push (processor_alias_table[i].name);
4176 char *s;
4177 const char *hint
4178 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4179 if (hint)
4180 inform (input_location,
4181 main_args_p
4182 ? G_("valid arguments to %<-march=%> switch are: "
4183 "%s; did you mean %qs?")
4184 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4185 "%s; did you mean %qs?"), s, hint);
4186 else
4187 inform (input_location,
4188 main_args_p
4189 ? G_("valid arguments to %<-march=%> switch are: %s")
4190 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4191 "are: %s"), s);
4192 XDELETEVEC (s);
4195 ix86_arch_mask = 1u << ix86_arch;
4196 for (i = 0; i < X86_ARCH_LAST; ++i)
4197 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4199 for (i = 0; i < pta_size; i++)
4200 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4202 ix86_schedule = processor_alias_table[i].schedule;
4203 ix86_tune = processor_alias_table[i].processor;
4204 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4206 if (!(processor_alias_table[i].flags & PTA_64BIT))
4208 if (ix86_tune_defaulted)
4210 opts->x_ix86_tune_string = "x86-64";
4211 for (i = 0; i < pta_size; i++)
4212 if (! strcmp (opts->x_ix86_tune_string,
4213 processor_alias_table[i].name))
4214 break;
4215 ix86_schedule = processor_alias_table[i].schedule;
4216 ix86_tune = processor_alias_table[i].processor;
4218 else
4219 error ("CPU you selected does not support x86-64 "
4220 "instruction set");
4223 /* Intel CPUs have always interpreted SSE prefetch instructions as
4224 NOPs; so, we can enable SSE prefetch instructions even when
4225 -mtune (rather than -march) points us to a processor that has them.
4226 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4227 higher processors. */
4228 if (TARGET_CMOV
4229 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
4230 x86_prefetch_sse = true;
4231 break;
4234 if (ix86_tune_specified && i == pta_size)
4236 error (main_args_p
4237 ? G_("bad value (%qs) for %<-mtune=%> switch")
4238 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4239 opts->x_ix86_tune_string);
4241 auto_vec <const char *> candidates;
4242 for (i = 0; i < pta_size; i++)
4243 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4244 || (processor_alias_table[i].flags & PTA_64BIT))
4245 candidates.safe_push (processor_alias_table[i].name);
4247 char *s;
4248 const char *hint
4249 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4250 if (hint)
4251 inform (input_location,
4252 main_args_p
4253 ? G_("valid arguments to %<-mtune=%> switch are: "
4254 "%s; did you mean %qs?")
4255 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4256 "%s; did you mean %qs?"), s, hint);
4257 else
4258 inform (input_location,
4259 main_args_p
4260 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4261 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4262 "are: %s"), s);
4263 XDELETEVEC (s);
4266 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4268 #ifndef USE_IX86_FRAME_POINTER
4269 #define USE_IX86_FRAME_POINTER 0
4270 #endif
4272 #ifndef USE_X86_64_FRAME_POINTER
4273 #define USE_X86_64_FRAME_POINTER 0
4274 #endif
4276 /* Set the default values for switches whose default depends on TARGET_64BIT
4277 in case they weren't overwritten by command line options. */
4278 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4280 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4281 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4282 if (opts->x_flag_asynchronous_unwind_tables
4283 && !opts_set->x_flag_unwind_tables
4284 && TARGET_64BIT_MS_ABI)
4285 opts->x_flag_unwind_tables = 1;
4286 if (opts->x_flag_asynchronous_unwind_tables == 2)
4287 opts->x_flag_unwind_tables
4288 = opts->x_flag_asynchronous_unwind_tables = 1;
4289 if (opts->x_flag_pcc_struct_return == 2)
4290 opts->x_flag_pcc_struct_return = 0;
4292 else
4294 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4295 opts->x_flag_omit_frame_pointer
4296 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4297 if (opts->x_flag_asynchronous_unwind_tables == 2)
4298 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4299 if (opts->x_flag_pcc_struct_return == 2)
4301 /* Intel MCU psABI specifies that -freg-struct-return should
4302 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4303 we check -miamcu so that -freg-struct-return is always
4304 turned on if -miamcu is used. */
4305 if (TARGET_IAMCU_P (opts->x_target_flags))
4306 opts->x_flag_pcc_struct_return = 0;
4307 else
4308 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4312 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4313 /* TODO: ix86_cost should be chosen at instruction or function granularity
4314 so for cold code we use size_cost even in !optimize_size compilation. */
4315 if (opts->x_optimize_size)
4316 ix86_cost = &ix86_size_cost;
4317 else
4318 ix86_cost = ix86_tune_cost;
4320 /* Arrange to set up i386_stack_locals for all functions. */
4321 init_machine_status = ix86_init_machine_status;
4323 /* Validate -mregparm= value. */
4324 if (opts_set->x_ix86_regparm)
4326 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4327 warning (0, "-mregparm is ignored in 64-bit mode");
4328 else if (TARGET_IAMCU_P (opts->x_target_flags))
4329 warning (0, "-mregparm is ignored for Intel MCU psABI");
4330 if (opts->x_ix86_regparm > REGPARM_MAX)
4332 error ("-mregparm=%d is not between 0 and %d",
4333 opts->x_ix86_regparm, REGPARM_MAX);
4334 opts->x_ix86_regparm = 0;
4337 if (TARGET_IAMCU_P (opts->x_target_flags)
4338 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4339 opts->x_ix86_regparm = REGPARM_MAX;
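/* E.g. -mregparm=3 makes 32-bit code pass the first three integer
   arguments in registers; REGPARM_MAX is 3 for 32-bit code, and the
   IAMCU and 64-bit ABIs handled above always use their register-passing
   maximum.  */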
4341 /* Default align_* from the processor table. */
4342 ix86_default_align (opts);
4344 /* Provide default for -mbranch-cost= value. */
4345 if (!opts_set->x_ix86_branch_cost)
4346 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4348 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4350 opts->x_target_flags
4351 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4353 /* Enable by default the SSE and MMX builtins. Do allow the user to
4354 explicitly disable any of these. In particular, disabling SSE and
4355 MMX for kernel code is extremely useful. */
4356 if (!ix86_arch_specified)
4357 opts->x_ix86_isa_flags
4358 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4359 | TARGET_SUBTARGET64_ISA_DEFAULT)
4360 & ~opts->x_ix86_isa_flags_explicit);
4362 if (TARGET_RTD_P (opts->x_target_flags))
4363 warning (0,
4364 main_args_p
4365 ? G_("%<-mrtd%> is ignored in 64bit mode")
4366 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4368 else
4370 opts->x_target_flags
4371 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4373 if (!ix86_arch_specified)
4374 opts->x_ix86_isa_flags
4375 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4377 /* The i386 ABI does not specify a red zone.  It still makes sense to use
4378 it when the programmer takes care to keep the stack from being destroyed.  */
4379 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4380 opts->x_target_flags |= MASK_NO_RED_ZONE;
4383 /* Keep nonleaf frame pointers. */
4384 if (opts->x_flag_omit_frame_pointer)
4385 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4386 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4387 opts->x_flag_omit_frame_pointer = 1;
4389 /* If we're doing fast math, we don't care about comparison order
4390 wrt NaNs. This lets us use a shorter comparison sequence. */
4391 if (opts->x_flag_finite_math_only)
4392 opts->x_target_flags &= ~MASK_IEEE_FP;
4394 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4395 since the insns won't need emulation. */
4396 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4397 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4399 /* Likewise, if the target doesn't have a 387, or we've specified
4400 software floating point, don't use 387 inline intrinsics. */
4401 if (!TARGET_80387_P (opts->x_target_flags))
4402 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4404 /* Turn on MMX builtins for -msse. */
4405 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4406 opts->x_ix86_isa_flags
4407 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4409 /* Enable SSE prefetch. */
4410 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4411 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4412 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4413 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4414 x86_prefetch_sse = true;
4416 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4417 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4418 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4419 opts->x_ix86_isa_flags
4420 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4422 /* Enable lzcnt instruction for -mabm. */
4423 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
4424 opts->x_ix86_isa_flags
4425 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4427 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4428 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
4429 opts->x_ix86_isa_flags
4430 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4431 & ~opts->x_ix86_isa_flags_explicit);
4433 /* Validate -mpreferred-stack-boundary= value or default it to
4434 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4435 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4436 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4438 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4439 int max = TARGET_SEH ? 4 : 12;
4441 if (opts->x_ix86_preferred_stack_boundary_arg < min
4442 || opts->x_ix86_preferred_stack_boundary_arg > max)
4444 if (min == max)
4445 error ("-mpreferred-stack-boundary is not supported "
4446 "for this target");
4447 else
4448 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4449 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4451 else
4452 ix86_preferred_stack_boundary
4453 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
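/* Worked example: -mpreferred-stack-boundary=4 yields (1 << 4) * 8
   = 128 bits, the 16-byte alignment required by the x86-64 psABI.  The
   minimum accepted argument is 2 (4 bytes) for 32-bit and 3 (8 bytes)
   for 64-bit code.  */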
4456 /* Set the default value for -mstackrealign. */
4457 if (!opts_set->x_ix86_force_align_arg_pointer)
4458 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4460 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4462 /* Validate -mincoming-stack-boundary= value or default it to
4463 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4464 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4465 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4467 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4469 if (opts->x_ix86_incoming_stack_boundary_arg < min
4470 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4471 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4472 opts->x_ix86_incoming_stack_boundary_arg, min);
4473 else
4475 ix86_user_incoming_stack_boundary
4476 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4477 ix86_incoming_stack_boundary
4478 = ix86_user_incoming_stack_boundary;
4482 #ifndef NO_PROFILE_COUNTERS
4483 if (flag_nop_mcount)
4484 error ("-mnop-mcount is not compatible with this target");
4485 #endif
4486 if (flag_nop_mcount && flag_pic)
4487 error ("-mnop-mcount is not implemented for -fPIC");
4489 /* Accept -msseregparm only if at least SSE support is enabled. */
4490 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4491 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4492 error (main_args_p
4493 ? G_("%<-msseregparm%> used without SSE enabled")
4494 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4496 if (opts_set->x_ix86_fpmath)
4498 if (opts->x_ix86_fpmath & FPMATH_SSE)
4500 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4502 if (TARGET_80387_P (opts->x_target_flags))
4504 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4505 opts->x_ix86_fpmath = FPMATH_387;
4508 else if ((opts->x_ix86_fpmath & FPMATH_387)
4509 && !TARGET_80387_P (opts->x_target_flags))
4511 warning (0, "387 instruction set disabled, using SSE arithmetics");
4512 opts->x_ix86_fpmath = FPMATH_SSE;
4516 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4517 -mfpmath=387.  The latter is nevertheless the default on many targets,
4518 since the extra 80-bit precision of temporaries is considered part of the ABI.
4519 Overwrite the default at least for -ffast-math.
4520 TODO: -mfpmath=both seems to produce equally performing code in slightly
4521 smaller binaries.  It is however not clear whether register allocation is
4522 ready for this setting.
4523 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
4524 codegen.  We may switch to 387 with -ffast-math for size optimized
4525 functions. */
4526 else if (fast_math_flags_set_p (&global_options)
4527 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4528 opts->x_ix86_fpmath = FPMATH_SSE;
4529 else
4530 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4532 /* Use external vectorized library in vectorizing intrinsics. */
4533 if (opts_set->x_ix86_veclibabi_type)
4534 switch (opts->x_ix86_veclibabi_type)
4536 case ix86_veclibabi_type_svml:
4537 ix86_veclib_handler = ix86_veclibabi_svml;
4538 break;
4540 case ix86_veclibabi_type_acml:
4541 ix86_veclib_handler = ix86_veclibabi_acml;
4542 break;
4544 default:
4545 gcc_unreachable ();
4548 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4549 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4550 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4552 /* If stack probes are required, the space used for large function
4553 arguments on the stack must also be probed, so enable
4554 -maccumulate-outgoing-args so this happens in the prologue. */
4555 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4556 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4558 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4559 warning (0,
4560 main_args_p
4561 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4562 "for correctness")
4563 : G_("stack probing requires "
4564 "%<target(\"accumulate-outgoing-args\")%> for "
4565 "correctness"));
4566 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4569 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4570 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4571 if (fixed_regs[BP_REG]
4572 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4574 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4575 warning (0,
4576 main_args_p
4577 ? G_("fixed ebp register requires "
4578 "%<-maccumulate-outgoing-args%>")
4579 : G_("fixed ebp register requires "
4580 "%<target(\"accumulate-outgoing-args\")%>"));
4581 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4584 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4586 char *p;
4587 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4588 p = strchr (internal_label_prefix, 'X');
4589 internal_label_prefix_len = p - internal_label_prefix;
4590 *p = '\0';
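/* On a typical ELF target ASM_GENERATE_INTERNAL_LABEL produces something
   like "*.LX0" here, so the stored prefix becomes "*.L" with length 3;
   the exact string depends on the target's local label conventions.  */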
4593 /* When the scheduling description is not available, disable the scheduler
4594 pass so it won't slow down compilation and make x87 code slower.  */
4595 if (!TARGET_SCHEDULE)
4596 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4598 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4599 ix86_tune_cost->simultaneous_prefetches,
4600 opts->x_param_values,
4601 opts_set->x_param_values);
4602 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4603 ix86_tune_cost->prefetch_block,
4604 opts->x_param_values,
4605 opts_set->x_param_values);
4606 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4607 ix86_tune_cost->l1_cache_size,
4608 opts->x_param_values,
4609 opts_set->x_param_values);
4610 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4611 ix86_tune_cost->l2_cache_size,
4612 opts->x_param_values,
4613 opts_set->x_param_values);
4615 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
4616 if (opts->x_flag_prefetch_loop_arrays < 0
4617 && HAVE_prefetch
4618 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4619 && !opts->x_optimize_size
4620 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4621 opts->x_flag_prefetch_loop_arrays = 1;
4623 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4624 can be optimized to ap = __builtin_next_arg (0).  */
4625 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4626 targetm.expand_builtin_va_start = NULL;
4628 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4630 ix86_gen_leave = gen_leave_rex64;
4631 if (Pmode == DImode)
4633 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4634 ix86_gen_tls_local_dynamic_base_64
4635 = gen_tls_local_dynamic_base_64_di;
4637 else
4639 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4640 ix86_gen_tls_local_dynamic_base_64
4641 = gen_tls_local_dynamic_base_64_si;
4644 else
4645 ix86_gen_leave = gen_leave;
4647 if (Pmode == DImode)
4649 ix86_gen_add3 = gen_adddi3;
4650 ix86_gen_sub3 = gen_subdi3;
4651 ix86_gen_sub3_carry = gen_subdi3_carry;
4652 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4653 ix86_gen_andsp = gen_anddi3;
4654 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4655 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4656 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4657 ix86_gen_monitor = gen_sse3_monitor_di;
4658 ix86_gen_monitorx = gen_monitorx_di;
4659 ix86_gen_clzero = gen_clzero_di;
4661 else
4663 ix86_gen_add3 = gen_addsi3;
4664 ix86_gen_sub3 = gen_subsi3;
4665 ix86_gen_sub3_carry = gen_subsi3_carry;
4666 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4667 ix86_gen_andsp = gen_andsi3;
4668 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4669 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4670 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4671 ix86_gen_monitor = gen_sse3_monitor_si;
4672 ix86_gen_monitorx = gen_monitorx_si;
4673 ix86_gen_clzero = gen_clzero_si;
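/* The ix86_gen_* hooks selected above let later expanders emit the
   SImode or DImode form of these patterns without re-checking Pmode at
   each use.  */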
4676 #ifdef USE_IX86_CLD
4677 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4678 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4679 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4680 #endif
4682 /* Set the default value for -mfentry. */
4683 if (!opts_set->x_flag_fentry)
4684 opts->x_flag_fentry = TARGET_SEH;
4685 else
4687 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4688 && opts->x_flag_fentry)
4689 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4690 "with -fpic");
4691 else if (TARGET_SEH && !opts->x_flag_fentry)
4692 sorry ("-mno-fentry isn%'t compatible with SEH");
4695 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4696 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4698 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4699 && TARGET_EMIT_VZEROUPPER)
4700 opts->x_target_flags |= MASK_VZEROUPPER;
4701 if (!(opts_set->x_target_flags & MASK_STV))
4702 opts->x_target_flags |= MASK_STV;
4703 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4704 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
4705 stack realignment is an extra cost the pass doesn't take into
4706 account, and the pass can't realign the stack.  */
4707 if (ix86_preferred_stack_boundary < 128
4708 || ix86_incoming_stack_boundary < 128
4709 || opts->x_ix86_force_align_arg_pointer)
4710 opts->x_target_flags &= ~MASK_STV;
4711 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4712 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4713 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4714 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4715 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4716 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4718 /* Enable 128-bit AVX instruction generation
4719 for the auto-vectorizer. */
4720 if (TARGET_AVX128_OPTIMAL
4721 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4722 opts->x_prefer_vector_width_type = PVW_AVX128;
4724 /* Use 256-bit AVX instruction generation
4725 in the auto-vectorizer. */
4726 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4727 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4728 opts->x_prefer_vector_width_type = PVW_AVX256;
4730 if (opts->x_ix86_recip_name)
4732 char *p = ASTRDUP (opts->x_ix86_recip_name);
4733 char *q;
4734 unsigned int mask, i;
4735 bool invert;
4737 while ((q = strtok (p, ",")) != NULL)
4739 p = NULL;
4740 if (*q == '!')
4742 invert = true;
4743 q++;
4745 else
4746 invert = false;
4748 if (!strcmp (q, "default"))
4749 mask = RECIP_MASK_ALL;
4750 else
4752 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4753 if (!strcmp (q, recip_options[i].string))
4755 mask = recip_options[i].mask;
4756 break;
4759 if (i == ARRAY_SIZE (recip_options))
4761 error ("unknown option for -mrecip=%s", q);
4762 invert = false;
4763 mask = RECIP_MASK_NONE;
4767 opts->x_recip_mask_explicit |= mask;
4768 if (invert)
4769 opts->x_recip_mask &= ~mask;
4770 else
4771 opts->x_recip_mask |= mask;
4775 if (TARGET_RECIP_P (opts->x_target_flags))
4776 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4777 else if (opts_set->x_target_flags & MASK_RECIP)
4778 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
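/* Usage sketch, assuming the recip_options table defined earlier in this
   function contains entries such as "div" and "sqrt": -mrecip=all,!sqrt
   first sets every RECIP_MASK_* bit and then clears the scalar sqrt bit,
   since the comma-separated tokens are applied left to right and a
   leading '!' inverts the token that follows it.  */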
4780 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4781 for 64-bit Bionic. Also default long double to 64-bit for Intel
4782 MCU psABI. */
4783 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4784 && !(opts_set->x_target_flags
4785 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4786 opts->x_target_flags |= (TARGET_64BIT
4787 ? MASK_LONG_DOUBLE_128
4788 : MASK_LONG_DOUBLE_64);
4790 /* Only one of them can be active. */
4791 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4792 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4794 /* Handle stack protector */
4795 if (!opts_set->x_ix86_stack_protector_guard)
4796 opts->x_ix86_stack_protector_guard
4797 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4799 #ifdef TARGET_THREAD_SSP_OFFSET
4800 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4801 #endif
4803 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4805 char *endp;
4806 const char *str = ix86_stack_protector_guard_offset_str;
4808 errno = 0;
4809 int64_t offset;
4811 #if defined(INT64_T_IS_LONG)
4812 offset = strtol (str, &endp, 0);
4813 #else
4814 offset = strtoll (str, &endp, 0);
4815 #endif
4817 if (!*str || *endp || errno)
4818 error ("%qs is not a valid number "
4819 "in -mstack-protector-guard-offset=", str);
4821 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4822 HOST_WIDE_INT_C (0x7fffffff)))
4823 error ("%qs is not a valid offset "
4824 "in -mstack-protector-guard-offset=", str);
4826 ix86_stack_protector_guard_offset = offset;
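/* For example, -mstack-protector-guard-offset=0x28 is accepted here:
   strtol/strtoll are called with base 0, so hexadecimal input is
   understood, and the result must fit in the signed 32-bit range checked
   above.  */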
4829 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4831 /* The kernel uses a different segment register for performance
4832 reasons; a system call would not have to trash the userspace
4833 segment register, which would be expensive. */
4834 if (ix86_cmodel == CM_KERNEL)
4835 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4837 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4839 const char *str = ix86_stack_protector_guard_reg_str;
4840 addr_space_t seg = ADDR_SPACE_GENERIC;
4842 /* Discard optional register prefix. */
4843 if (str[0] == '%')
4844 str++;
4846 if (strlen (str) == 2 && str[1] == 's')
4848 if (str[0] == 'f')
4849 seg = ADDR_SPACE_SEG_FS;
4850 else if (str[0] == 'g')
4851 seg = ADDR_SPACE_SEG_GS;
4854 if (seg == ADDR_SPACE_GENERIC)
4855 error ("%qs is not a valid base register "
4856 "in -mstack-protector-guard-reg=",
4857 ix86_stack_protector_guard_reg_str);
4859 ix86_stack_protector_guard_reg = seg;
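/* Only "fs" and "gs" (optionally written "%fs" / "%gs") are accepted,
   e.g. -mstack-protector-guard-reg=gs selects ADDR_SPACE_SEG_GS.  */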
4862 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4863 if (opts->x_ix86_tune_memcpy_strategy)
4865 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4866 ix86_parse_stringop_strategy_string (str, false);
4867 free (str);
4870 if (opts->x_ix86_tune_memset_strategy)
4872 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4873 ix86_parse_stringop_strategy_string (str, true);
4874 free (str);
4877 /* Save the initial options in case the user uses function specific
4878 options later.  */
4879 if (main_args_p)
4880 target_option_default_node = target_option_current_node
4881 = build_target_option_node (opts);
4883 /* Do not support control flow instrumentation if CET is not enabled. */
4884 if (opts->x_flag_cf_protection != CF_NONE)
4886 if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2)
4887 || TARGET_SHSTK_P (opts->x_ix86_isa_flags)))
4889 if (flag_cf_protection == CF_FULL)
4891 error ("%<-fcf-protection=full%> requires CET support "
4892 "on this target. Use -mcet or one of -mibt, "
4893 "-mshstk options to enable CET");
4895 else if (flag_cf_protection == CF_BRANCH)
4897 error ("%<-fcf-protection=branch%> requires CET support "
4898 "on this target. Use -mcet or one of -mibt, "
4899 "-mshstk options to enable CET");
4901 else if (flag_cf_protection == CF_RETURN)
4903 error ("%<-fcf-protection=return%> requires CET support "
4904 "on this target. Use -mcet or one of -mibt, "
4905 "-mshstk options to enable CET");
4907 flag_cf_protection = CF_NONE;
4908 return false;
4910 opts->x_flag_cf_protection =
4911 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4914 if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS])
4915 maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128,
4916 opts->x_param_values,
4917 opts_set->x_param_values);
4919 return true;
4922 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4924 static void
4925 ix86_option_override (void)
4927 ix86_option_override_internal (true, &global_options, &global_options_set);
4930 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4931 static char *
4932 ix86_offload_options (void)
4934 if (TARGET_LP64)
4935 return xstrdup ("-foffload-abi=lp64");
4936 return xstrdup ("-foffload-abi=ilp32");
4939 /* Update register usage after having seen the compiler flags. */
4941 static void
4942 ix86_conditional_register_usage (void)
4944 int i, c_mask;
4946 /* If there are no caller-saved registers, preserve all registers,
4947 except fixed_regs and registers used for the function return value,
4948 since aggregate_value_p checks call_used_regs[regno] on the return
4949 value.  */
4950 if (cfun && cfun->machine->no_caller_saved_registers)
4951 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4952 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4953 call_used_regs[i] = 0;
4955 /* For 32-bit targets, squash the REX registers. */
4956 if (! TARGET_64BIT)
4958 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4959 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4960 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4961 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4962 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4963 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4966 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4967 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
4969 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4971 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4973 /* Set/reset conditionally defined registers from
4974 CALL_USED_REGISTERS initializer. */
4975 if (call_used_regs[i] > 1)
4976 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4978 /* Calculate registers of CLOBBERED_REGS register set
4979 as call used registers from GENERAL_REGS register set. */
4980 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4981 && call_used_regs[i])
4982 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4985 /* If MMX is disabled, squash the registers. */
4986 if (! TARGET_MMX)
4987 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4988 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4989 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4991 /* If SSE is disabled, squash the registers. */
4992 if (! TARGET_SSE)
4993 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4994 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4995 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4997 /* If the FPU is disabled, squash the registers. */
4998 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4999 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5000 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
5001 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5003 /* If AVX512F is disabled, squash the registers. */
5004 if (! TARGET_AVX512F)
5006 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5007 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5009 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
5010 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5013 /* If MPX is disabled, squash the registers. */
5014 if (! TARGET_MPX)
5015 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
5016 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5019 /* Canonicalize a comparison from one we don't have to one we do have. */
5021 static void
5022 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5023 bool op0_preserve_value)
5025 /* The order of operands in the x87 ficom compare is forced by combine in
5026 the simplify_comparison () function.  The float operator is treated as
5027 RTX_OBJ with precedence over other operators and is always placed
5028 first.  Swap the condition and operands to match the ficom instruction.  */
5029 if (!op0_preserve_value
5030 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5032 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5034 /* We are called only for compares that are split to SAHF instruction.
5035 Ensure that we have setcc/jcc insn for the swapped condition. */
5036 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5038 std::swap (*op0, *op1);
5039 *code = (int) scode;
5044 /* Save the current options */
5046 static void
5047 ix86_function_specific_save (struct cl_target_option *ptr,
5048 struct gcc_options *opts)
5050 ptr->arch = ix86_arch;
5051 ptr->schedule = ix86_schedule;
5052 ptr->prefetch_sse = x86_prefetch_sse;
5053 ptr->tune = ix86_tune;
5054 ptr->branch_cost = ix86_branch_cost;
5055 ptr->tune_defaulted = ix86_tune_defaulted;
5056 ptr->arch_specified = ix86_arch_specified;
5057 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5058 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5059 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5060 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5061 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5062 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5063 ptr->x_ix86_abi = opts->x_ix86_abi;
5064 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5065 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5066 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5067 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5068 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5069 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5070 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5071 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5072 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5073 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5074 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5075 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5076 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5077 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5078 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5079 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5080 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5081 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5082 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5083 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5085 /* The fields are char but the variables are not; make sure the
5086 values fit in the fields. */
5087 gcc_assert (ptr->arch == ix86_arch);
5088 gcc_assert (ptr->schedule == ix86_schedule);
5089 gcc_assert (ptr->tune == ix86_tune);
5090 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5093 /* Restore the current options */
5095 static void
5096 ix86_function_specific_restore (struct gcc_options *opts,
5097 struct cl_target_option *ptr)
5099 enum processor_type old_tune = ix86_tune;
5100 enum processor_type old_arch = ix86_arch;
5101 unsigned int ix86_arch_mask;
5102 int i;
5104 /* We don't change -fPIC. */
5105 opts->x_flag_pic = flag_pic;
5107 ix86_arch = (enum processor_type) ptr->arch;
5108 ix86_schedule = (enum attr_cpu) ptr->schedule;
5109 ix86_tune = (enum processor_type) ptr->tune;
5110 x86_prefetch_sse = ptr->prefetch_sse;
5111 opts->x_ix86_branch_cost = ptr->branch_cost;
5112 ix86_tune_defaulted = ptr->tune_defaulted;
5113 ix86_arch_specified = ptr->arch_specified;
5114 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5115 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5116 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5117 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5118 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5119 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5120 opts->x_ix86_abi = ptr->x_ix86_abi;
5121 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5122 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5123 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5124 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5125 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5126 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5127 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5128 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5129 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5130 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5131 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5132 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5133 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5134 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5135 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5136 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5137 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5138 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5139 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5140 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5141 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5142 /* TODO: ix86_cost should be chosen at instruction or function granularity
5143 so for cold code we use size_cost even in !optimize_size compilation. */
5144 if (opts->x_optimize_size)
5145 ix86_cost = &ix86_size_cost;
5146 else
5147 ix86_cost = ix86_tune_cost;
5149 /* Recreate the arch feature tests if the arch changed */
5150 if (old_arch != ix86_arch)
5152 ix86_arch_mask = 1u << ix86_arch;
5153 for (i = 0; i < X86_ARCH_LAST; ++i)
5154 ix86_arch_features[i]
5155 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5158 /* Recreate the tune optimization tests */
5159 if (old_tune != ix86_tune)
5160 set_ix86_tune_features (ix86_tune, false);
5163 /* Adjust target options after streaming them in. This is mainly about
5164 reconciling them with global options. */
5166 static void
5167 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5169 /* flag_pic is a global option, but ix86_cmodel is target saved option
5170 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5171 for PIC, or error out. */
5172 if (flag_pic)
5173 switch (ptr->x_ix86_cmodel)
5175 case CM_SMALL:
5176 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5177 break;
5179 case CM_MEDIUM:
5180 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5181 break;
5183 case CM_LARGE:
5184 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5185 break;
5187 case CM_KERNEL:
5188 error ("code model %s does not support PIC mode", "kernel");
5189 break;
5191 default:
5192 break;
5194 else
5195 switch (ptr->x_ix86_cmodel)
5197 case CM_SMALL_PIC:
5198 ptr->x_ix86_cmodel = CM_SMALL;
5199 break;
5201 case CM_MEDIUM_PIC:
5202 ptr->x_ix86_cmodel = CM_MEDIUM;
5203 break;
5205 case CM_LARGE_PIC:
5206 ptr->x_ix86_cmodel = CM_LARGE;
5207 break;
5209 default:
5210 break;
5214 /* Print the current options */
5216 static void
5217 ix86_function_specific_print (FILE *file, int indent,
5218 struct cl_target_option *ptr)
5220 char *target_string
5221 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5222 ptr->x_target_flags, ptr->x_ix86_target_flags,
5223 NULL, NULL, ptr->x_ix86_fpmath, false);
5225 gcc_assert (ptr->arch < PROCESSOR_max);
5226 fprintf (file, "%*sarch = %d (%s)\n",
5227 indent, "",
5228 ptr->arch, processor_target_table[ptr->arch].name);
5230 gcc_assert (ptr->tune < PROCESSOR_max);
5231 fprintf (file, "%*stune = %d (%s)\n",
5232 indent, "",
5233 ptr->tune, processor_target_table[ptr->tune].name);
5235 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5237 if (target_string)
5239 fprintf (file, "%*s%s\n", indent, "", target_string);
5240 free (target_string);
5245 /* Inner function to process the attribute((target(...))), take an argument and
5246 set the current options from the argument. If we have a list, recursively go
5247 over the list. */
5249 static bool
5250 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5251 struct gcc_options *opts,
5252 struct gcc_options *opts_set,
5253 struct gcc_options *enum_opts_set)
5255 char *next_optstr;
5256 bool ret = true;
5258 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5259 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5260 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5261 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5262 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
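/* For instance, IX86_ATTR_ISA ("avx2", OPT_mavx2) expands to the table
   entry { "avx2", 4, ix86_opt_isa, OPT_mavx2, 0 }, so the parsing loop
   below can match option names by length and dispatch on their type.  */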
5264 enum ix86_opt_type
5266 ix86_opt_unknown,
5267 ix86_opt_yes,
5268 ix86_opt_no,
5269 ix86_opt_str,
5270 ix86_opt_enum,
5271 ix86_opt_isa
5274 static const struct
5276 const char *string;
5277 size_t len;
5278 enum ix86_opt_type type;
5279 int opt;
5280 int mask;
5281 } attrs[] = {
5282 /* isa options */
5283 IX86_ATTR_ISA ("sgx", OPT_msgx),
5284 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5285 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5286 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5287 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5288 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5289 IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
5291 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5292 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5293 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5294 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5295 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5296 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5297 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5298 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5299 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5300 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5301 IX86_ATTR_ISA ("fma", OPT_mfma),
5302 IX86_ATTR_ISA ("xop", OPT_mxop),
5303 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5304 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5305 IX86_ATTR_ISA ("avx", OPT_mavx),
5306 IX86_ATTR_ISA ("sse4", OPT_msse4),
5307 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5308 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5309 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5310 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5311 IX86_ATTR_ISA ("sse3", OPT_msse3),
5312 IX86_ATTR_ISA ("aes", OPT_maes),
5313 IX86_ATTR_ISA ("sha", OPT_msha),
5314 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5315 IX86_ATTR_ISA ("sse2", OPT_msse2),
5316 IX86_ATTR_ISA ("sse", OPT_msse),
5317 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5318 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5319 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5320 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5321 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5322 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5323 IX86_ATTR_ISA ("adx", OPT_madx),
5324 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5325 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5326 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5327 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5328 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5329 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5330 IX86_ATTR_ISA ("abm", OPT_mabm),
5331 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5332 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5333 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5334 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5335 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5336 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5337 IX86_ATTR_ISA ("sahf", OPT_msahf),
5338 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5339 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5340 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5341 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5342 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5343 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5344 IX86_ATTR_ISA ("pku", OPT_mpku),
5345 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5346 IX86_ATTR_ISA ("hle", OPT_mhle),
5347 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5348 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5349 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5350 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5351 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5352 IX86_ATTR_ISA ("ibt", OPT_mibt),
5353 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5354 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5355 IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
5357 /* enum options */
5358 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5360 /* string options */
5361 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5362 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5364 /* flag options */
5365 IX86_ATTR_YES ("cld",
5366 OPT_mcld,
5367 MASK_CLD),
5369 IX86_ATTR_NO ("fancy-math-387",
5370 OPT_mfancy_math_387,
5371 MASK_NO_FANCY_MATH_387),
5373 IX86_ATTR_YES ("ieee-fp",
5374 OPT_mieee_fp,
5375 MASK_IEEE_FP),
5377 IX86_ATTR_YES ("inline-all-stringops",
5378 OPT_minline_all_stringops,
5379 MASK_INLINE_ALL_STRINGOPS),
5381 IX86_ATTR_YES ("inline-stringops-dynamically",
5382 OPT_minline_stringops_dynamically,
5383 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5385 IX86_ATTR_NO ("align-stringops",
5386 OPT_mno_align_stringops,
5387 MASK_NO_ALIGN_STRINGOPS),
5389 IX86_ATTR_YES ("recip",
5390 OPT_mrecip,
5391 MASK_RECIP),
5395 /* If this is a list, recurse to get the options. */
5396 if (TREE_CODE (args) == TREE_LIST)
5398 bool ret = true;
5400 for (; args; args = TREE_CHAIN (args))
5401 if (TREE_VALUE (args)
5402 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5403 p_strings, opts, opts_set,
5404 enum_opts_set))
5405 ret = false;
5407 return ret;
5410 else if (TREE_CODE (args) != STRING_CST)
5412 error ("attribute %<target%> argument not a string");
5413 return false;
5416 /* Handle multiple arguments separated by commas. */
5417 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5419 while (next_optstr && *next_optstr != '\0')
5421 char *p = next_optstr;
5422 char *orig_p = p;
5423 char *comma = strchr (next_optstr, ',');
5424 const char *opt_string;
5425 size_t len, opt_len;
5426 int opt;
5427 bool opt_set_p;
5428 char ch;
5429 unsigned i;
5430 enum ix86_opt_type type = ix86_opt_unknown;
5431 int mask = 0;
5433 if (comma)
5435 *comma = '\0';
5436 len = comma - next_optstr;
5437 next_optstr = comma + 1;
5439 else
5441 len = strlen (p);
5442 next_optstr = NULL;
5445 /* Recognize no-xxx. */
5446 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5448 opt_set_p = false;
5449 p += 3;
5450 len -= 3;
5452 else
5453 opt_set_p = true;
5455 /* Find the option. */
5456 ch = *p;
5457 opt = N_OPTS;
5458 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5460 type = attrs[i].type;
5461 opt_len = attrs[i].len;
5462 if (ch == attrs[i].string[0]
5463 && ((type != ix86_opt_str && type != ix86_opt_enum)
5464 ? len == opt_len
5465 : len > opt_len)
5466 && memcmp (p, attrs[i].string, opt_len) == 0)
5468 opt = attrs[i].opt;
5469 mask = attrs[i].mask;
5470 opt_string = attrs[i].string;
5471 break;
5475 /* Process the option. */
5476 if (opt == N_OPTS)
5478 error ("attribute(target(\"%s\")) is unknown", orig_p);
5479 ret = false;
5482 else if (type == ix86_opt_isa)
5484 struct cl_decoded_option decoded;
5486 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5487 ix86_handle_option (opts, opts_set,
5488 &decoded, input_location);
5491 else if (type == ix86_opt_yes || type == ix86_opt_no)
5493 if (type == ix86_opt_no)
5494 opt_set_p = !opt_set_p;
5496 if (opt_set_p)
5497 opts->x_target_flags |= mask;
5498 else
5499 opts->x_target_flags &= ~mask;
5502 else if (type == ix86_opt_str)
5504 if (p_strings[opt])
5506 error ("option(\"%s\") was already specified", opt_string);
5507 ret = false;
5509 else
5510 p_strings[opt] = xstrdup (p + opt_len);
5513 else if (type == ix86_opt_enum)
5515 bool arg_ok;
5516 int value;
5518 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5519 if (arg_ok)
5520 set_option (opts, enum_opts_set, opt, value,
5521 p + opt_len, DK_UNSPECIFIED, input_location,
5522 global_dc);
5523 else
5525 error ("attribute(target(\"%s\")) is unknown", orig_p);
5526 ret = false;
5530 else
5531 gcc_unreachable ();
5534 return ret;
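/* Illustrative example (editorial addition, not part of this file): the strings
   parsed above accept comma-separated ISA names, "no-" prefixes, and the
   string/enum forms arch=, tune= and fpmath=.  A user function might request:

     __attribute__((target("avx2,no-sse4a,arch=haswell,fpmath=sse")))
     void blend (float *dst, const float *a, const float *b, int n)
     {
       for (int i = 0; i < n; i++)
         dst[i] = 0.5f * (a[i] + b[i]);
     }

   Each comma-separated item is matched against the attrs[] table above;
   unknown items produce the "attribute(target(...)) is unknown" error.  */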
5537 /* Release allocated strings. */
5538 static void
5539 release_options_strings (char **option_strings)
5541 /* Free up memory allocated to hold the strings */
5542 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5543 free (option_strings[i]);
5546 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5548 tree
5549 ix86_valid_target_attribute_tree (tree args,
5550 struct gcc_options *opts,
5551 struct gcc_options *opts_set)
5553 const char *orig_arch_string = opts->x_ix86_arch_string;
5554 const char *orig_tune_string = opts->x_ix86_tune_string;
5555 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5556 int orig_tune_defaulted = ix86_tune_defaulted;
5557 int orig_arch_specified = ix86_arch_specified;
5558 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5559 tree t = NULL_TREE;
5560 struct cl_target_option *def
5561 = TREE_TARGET_OPTION (target_option_default_node);
5562 struct gcc_options enum_opts_set;
5564 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5566 /* Process each of the options on the chain. */
5567 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5568 opts_set, &enum_opts_set))
5569 return error_mark_node;
5571 /* If the changed options are different from the default, rerun
5572 ix86_option_override_internal, and then save the options away.
5573 The string options are attribute options, and will be undone
5574 when we copy the save structure. */
5575 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5576 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5577 || opts->x_target_flags != def->x_target_flags
5578 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5579 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5580 || enum_opts_set.x_ix86_fpmath)
5582 /* If we are using the default tune= or arch=, undo the string assigned,
5583 and use the default. */
5584 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5586 opts->x_ix86_arch_string
5587 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5589 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5590 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5591 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5592 | OPTION_MASK_ABI_64
5593 | OPTION_MASK_ABI_X32
5594 | OPTION_MASK_CODE16);
5595 opts->x_ix86_isa_flags2 = 0;
5597 else if (!orig_arch_specified)
5598 opts->x_ix86_arch_string = NULL;
5600 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5601 opts->x_ix86_tune_string
5602 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5603 else if (orig_tune_defaulted)
5604 opts->x_ix86_tune_string = NULL;
5606 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5607 if (enum_opts_set.x_ix86_fpmath)
5608 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5610 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5611 bool r = ix86_option_override_internal (false, opts, opts_set);
5612 if (!r)
5614 release_options_strings (option_strings);
5615 return error_mark_node;
5618 /* Add any builtin functions with the new isa if any. */
5619 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5621 /* Save the current options unless we are validating options for
5622 #pragma. */
5623 t = build_target_option_node (opts);
5625 opts->x_ix86_arch_string = orig_arch_string;
5626 opts->x_ix86_tune_string = orig_tune_string;
5627 opts_set->x_ix86_fpmath = orig_fpmath_set;
5629 release_options_strings (option_strings);
5632 return t;
5635 /* Hook to validate attribute((target("string"))). */
5637 static bool
5638 ix86_valid_target_attribute_p (tree fndecl,
5639 tree ARG_UNUSED (name),
5640 tree args,
5641 int ARG_UNUSED (flags))
5643 struct gcc_options func_options;
5644 tree new_target, new_optimize;
5645 bool ret = true;
5647 /* attribute((target("default"))) does nothing, beyond
5648 affecting multi-versioning. */
5649 if (TREE_VALUE (args)
5650 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5651 && TREE_CHAIN (args) == NULL_TREE
5652 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5653 return true;
5655 tree old_optimize = build_optimization_node (&global_options);
5657 /* Get the optimization options of the current function. */
5658 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5660 if (!func_optimize)
5661 func_optimize = old_optimize;
5663 /* Init func_options. */
5664 memset (&func_options, 0, sizeof (func_options));
5665 init_options_struct (&func_options, NULL);
5666 lang_hooks.init_options_struct (&func_options);
5668 cl_optimization_restore (&func_options,
5669 TREE_OPTIMIZATION (func_optimize));
5671 /* Initialize func_options to the default before its target options can
5672 be set. */
5673 cl_target_option_restore (&func_options,
5674 TREE_TARGET_OPTION (target_option_default_node));
5676 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5677 &global_options_set);
5679 new_optimize = build_optimization_node (&func_options);
5681 if (new_target == error_mark_node)
5682 ret = false;
5684 else if (fndecl && new_target)
5686 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5688 if (old_optimize != new_optimize)
5689 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5692 finalize_options_struct (&func_options);
5694 return ret;
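/* Illustrative example (editorial addition, not part of this file):
   target("default") is only meaningful for function multi-versioning, which
   GCC supports for C++, where the same function is defined once per ISA
   level and once as the fallback:

     __attribute__((target("default"))) int dot (void) { return 0; }
     __attribute__((target("avx2")))    int dot (void) { return 1; }

   The hook above therefore accepts a sole "default" string without
   rebuilding the target options.  */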
5698 /* Hook to determine if one function can safely inline another. */
5700 static bool
5701 ix86_can_inline_p (tree caller, tree callee)
5703 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5704 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5705 if (!callee_tree)
5706 callee_tree = target_option_default_node;
5707 if (!caller_tree)
5708 caller_tree = target_option_default_node;
5709 if (callee_tree == caller_tree)
5710 return true;
5712 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5713 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5714 bool ret = false;
5716 /* Callee's isa options should be a subset of the caller's, i.e. a SSE4
5717 function can inline a SSE2 function but a SSE2 function can't inline
5718 a SSE4 function. */
5719 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5720 != callee_opts->x_ix86_isa_flags)
5721 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5722 != callee_opts->x_ix86_isa_flags2))
5723 ret = false;
5725 /* See if we have the same non-isa options. */
5726 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5727 ret = false;
5729 /* See if arch, tune, etc. are the same. */
5730 else if (caller_opts->arch != callee_opts->arch)
5731 ret = false;
5733 else if (caller_opts->tune != callee_opts->tune)
5734 ret = false;
5736 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5737 /* If the callee doesn't use FP expressions, differences in
5738 ix86_fpmath can be ignored.  We are called from FEs
5739 for multi-versioning call optimization, so beware of
5740 ipa_fn_summaries not being available. */
5741 && (! ipa_fn_summaries
5742 || ipa_fn_summaries->get
5743 (cgraph_node::get (callee))->fp_expressions))
5744 ret = false;
5746 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5747 ret = false;
5749 else
5750 ret = true;
5752 return ret;
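/* Illustrative example (editorial addition, not part of this file): the subset
   rule above lets an SSE4.1 function inline a plain callee, while the reverse
   direction is rejected because the callee's ISA flags would not be a subset
   of the caller's:

     static inline int plain_add (int a, int b) { return a + b; }

     __attribute__((target("sse4.1")))
     int sse41_caller (int a, int b) { return plain_add (a, b); }   // OK

     __attribute__((target("sse4.1")))
     static inline int sse41_add (int a, int b) { return a + b; }

     int plain_caller (int a, int b) { return sse41_add (a, b); }   // not inlined
*/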
5756 /* Remember the last target of ix86_set_current_function. */
5757 static GTY(()) tree ix86_previous_fndecl;
5759 /* Set target globals to the default (or current #pragma GCC target
5760 if active). Invalidate ix86_previous_fndecl cache. */
5762 void
5763 ix86_reset_previous_fndecl (void)
5765 tree new_tree = target_option_current_node;
5766 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5767 if (TREE_TARGET_GLOBALS (new_tree))
5768 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5769 else if (new_tree == target_option_default_node)
5770 restore_target_globals (&default_target_globals);
5771 else
5772 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5773 ix86_previous_fndecl = NULL_TREE;
5776 /* Set the func_type field from the function FNDECL. */
5778 static void
5779 ix86_set_func_type (tree fndecl)
5781 if (cfun->machine->func_type == TYPE_UNKNOWN)
5783 if (lookup_attribute ("interrupt",
5784 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5786 if (ix86_function_naked (fndecl))
5787 error_at (DECL_SOURCE_LOCATION (fndecl),
5788 "interrupt and naked attributes are not compatible");
5790 int nargs = 0;
5791 for (tree arg = DECL_ARGUMENTS (fndecl);
5792 arg;
5793 arg = TREE_CHAIN (arg))
5794 nargs++;
5795 cfun->machine->no_caller_saved_registers = true;
5796 cfun->machine->func_type
5797 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5799 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5801 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5802 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5803 sorry ("Only DWARF debug format is supported for interrupt "
5804 "service routine.");
5806 else
5808 cfun->machine->func_type = TYPE_NORMAL;
5809 if (lookup_attribute ("no_caller_saved_registers",
5810 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5811 cfun->machine->no_caller_saved_registers = true;
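/* Illustrative example (editorial addition, not part of this file): the
   argument count checked above distinguishes plain interrupt handlers (one
   pointer argument) from exception handlers that also receive a hardware
   error code; the uword_t typedef below is the conventional spelling from
   the GCC manual, assuming a 64-bit target:

     struct interrupt_frame;
     typedef unsigned long long uword_t;

     __attribute__((interrupt))
     void timer_isr (struct interrupt_frame *frame) { (void) frame; }

     __attribute__((interrupt))
     void pf_handler (struct interrupt_frame *frame, uword_t error_code)
     { (void) frame; (void) error_code; }
*/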
5816 /* Set the indirect_branch_type field from the function FNDECL. */
5818 static void
5819 ix86_set_indirect_branch_type (tree fndecl)
5821 if (cfun->machine->indirect_branch_type == indirect_branch_unset)
5823 tree attr = lookup_attribute ("indirect_branch",
5824 DECL_ATTRIBUTES (fndecl));
5825 if (attr != NULL)
5827 tree args = TREE_VALUE (attr);
5828 if (args == NULL)
5829 gcc_unreachable ();
5830 tree cst = TREE_VALUE (args);
5831 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5832 cfun->machine->indirect_branch_type = indirect_branch_keep;
5833 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5834 cfun->machine->indirect_branch_type = indirect_branch_thunk;
5835 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5836 cfun->machine->indirect_branch_type = indirect_branch_thunk_inline;
5837 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5838 cfun->machine->indirect_branch_type = indirect_branch_thunk_extern;
5839 else
5840 gcc_unreachable ();
5842 else
5843 cfun->machine->indirect_branch_type = ix86_indirect_branch;
5845 /* -mcmodel=large is not compatible with -mindirect-branch=thunk
5846 nor -mindirect-branch=thunk-extern. */
5847 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5848 && ((cfun->machine->indirect_branch_type
5849 == indirect_branch_thunk_extern)
5850 || (cfun->machine->indirect_branch_type
5851 == indirect_branch_thunk)))
5852 error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not "
5853 "compatible",
5854 ((cfun->machine->indirect_branch_type
5855 == indirect_branch_thunk_extern)
5856 ? "thunk-extern" : "thunk"));
5859 if (cfun->machine->function_return_type == indirect_branch_unset)
5861 tree attr = lookup_attribute ("function_return",
5862 DECL_ATTRIBUTES (fndecl));
5863 if (attr != NULL)
5865 tree args = TREE_VALUE (attr);
5866 if (args == NULL)
5867 gcc_unreachable ();
5868 tree cst = TREE_VALUE (args);
5869 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5870 cfun->machine->function_return_type = indirect_branch_keep;
5871 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5872 cfun->machine->function_return_type = indirect_branch_thunk;
5873 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5874 cfun->machine->function_return_type = indirect_branch_thunk_inline;
5875 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5876 cfun->machine->function_return_type = indirect_branch_thunk_extern;
5877 else
5878 gcc_unreachable ();
5880 else
5881 cfun->machine->function_return_type = ix86_function_return;
5883 /* -mcmodel=large is not compatible with -mfunction-return=thunk
5884 nor -mfunction-return=thunk-extern. */
5885 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5886 && ((cfun->machine->function_return_type
5887 == indirect_branch_thunk_extern)
5888 || (cfun->machine->function_return_type
5889 == indirect_branch_thunk)))
5890 error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not "
5891 "compatible",
5892 ((cfun->machine->function_return_type
5893 == indirect_branch_thunk_extern)
5894 ? "thunk-extern" : "thunk"));
5898 /* Establish appropriate back-end context for processing the function
5899 FNDECL. The argument might be NULL to indicate processing at top
5900 level, outside of any function scope. */
5901 static void
5902 ix86_set_current_function (tree fndecl)
5904 /* Only change the context if the function changes. This hook is called
5905 several times in the course of compiling a function, and we don't want to
5906 slow things down too much or call target_reinit when it isn't safe. */
5907 if (fndecl == ix86_previous_fndecl)
5909 /* There may be 2 function bodies for the same function FNDECL,
5910 one is extern inline and one isn't. Call ix86_set_func_type
5911 to set the func_type field. */
5912 if (fndecl != NULL_TREE)
5914 ix86_set_func_type (fndecl);
5915 ix86_set_indirect_branch_type (fndecl);
5917 return;
5920 tree old_tree;
5921 if (ix86_previous_fndecl == NULL_TREE)
5922 old_tree = target_option_current_node;
5923 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5924 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5925 else
5926 old_tree = target_option_default_node;
5928 if (fndecl == NULL_TREE)
5930 if (old_tree != target_option_current_node)
5931 ix86_reset_previous_fndecl ();
5932 return;
5935 ix86_set_func_type (fndecl);
5936 ix86_set_indirect_branch_type (fndecl);
5938 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5939 if (new_tree == NULL_TREE)
5940 new_tree = target_option_default_node;
5942 if (old_tree != new_tree)
5944 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5945 if (TREE_TARGET_GLOBALS (new_tree))
5946 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5947 else if (new_tree == target_option_default_node)
5948 restore_target_globals (&default_target_globals);
5949 else
5950 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5952 ix86_previous_fndecl = fndecl;
5954 static bool prev_no_caller_saved_registers;
5956 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5957 Avoid expensive re-initialization of init_regs each time we switch
5958 function context. */
5959 if (TARGET_64BIT
5960 && (call_used_regs[SI_REG]
5961 == (cfun->machine->call_abi == MS_ABI)))
5962 reinit_regs ();
5963 /* Need to re-initialize init_regs if caller-saved registers are
5964 changed. */
5965 else if (prev_no_caller_saved_registers
5966 != cfun->machine->no_caller_saved_registers)
5967 reinit_regs ();
5969 if (cfun->machine->func_type != TYPE_NORMAL
5970 || cfun->machine->no_caller_saved_registers)
5972 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
5973 may change processor state. */
5974 const char *isa;
5975 if (TARGET_MPX)
5976 isa = "MPX";
5977 else if (TARGET_SSE)
5978 isa = "SSE";
5979 else if (TARGET_MMX)
5980 isa = "MMX/3Dnow";
5981 else if (TARGET_80387)
5982 isa = "80387";
5983 else
5984 isa = NULL;
5985 if (isa != NULL)
5987 if (cfun->machine->func_type != TYPE_NORMAL)
5988 sorry ("%s instructions aren't allowed in %s service routine",
5989 isa, (cfun->machine->func_type == TYPE_EXCEPTION
5990 ? "exception" : "interrupt"));
5991 else
5992 sorry ("%s instructions aren't allowed in function with "
5993 "no_caller_saved_registers attribute", isa);
5994 /* Don't issue the same error twice. */
5995 cfun->machine->func_type = TYPE_NORMAL;
5996 cfun->machine->no_caller_saved_registers = false;
6000 prev_no_caller_saved_registers
6001 = cfun->machine->no_caller_saved_registers;
6005 /* Return true if this goes in large data/bss. */
6007 static bool
6008 ix86_in_large_data_p (tree exp)
6010 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
6011 return false;
6013 if (exp == NULL_TREE)
6014 return false;
6016 /* Functions are never large data. */
6017 if (TREE_CODE (exp) == FUNCTION_DECL)
6018 return false;
6020 /* Automatic variables are never large data. */
6021 if (VAR_P (exp) && !is_global_var (exp))
6022 return false;
6024 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
6026 const char *section = DECL_SECTION_NAME (exp);
6027 if (strcmp (section, ".ldata") == 0
6028 || strcmp (section, ".lbss") == 0)
6029 return true;
6030 return false;
6032 else
6034 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
6036 /* If this is an incomplete type with size 0, then we can't put it
6037 in data because it might be too big when completed. Also,
6038 int_size_in_bytes returns -1 if the size can vary or is larger than
6039 an integer, in which case it is also safer to assume that it goes in
6040 large data. */
6041 if (size <= 0 || size > ix86_section_threshold)
6042 return true;
6045 return false;
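/* Illustrative example (editorial addition, not part of this file): with
   -mcmodel=medium, a global whose size exceeds ix86_section_threshold
   (-mlarge-data-threshold) is treated as large data, and an explicit
   ".ldata"/".lbss" section name has the same effect:

     static char big_table[1 << 22];                           // large by size
     int small_counters[4] __attribute__((section(".ldata"))); // large by name
*/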
6048 /* i386-specific section flag to mark large sections. */
6049 #define SECTION_LARGE SECTION_MACH_DEP
6051 /* Switch to the appropriate section for output of DECL.
6052 DECL is either a `VAR_DECL' node or a constant of some sort.
6053 RELOC indicates whether forming the initial value of DECL requires
6054 link-time relocations. */
6056 ATTRIBUTE_UNUSED static section *
6057 x86_64_elf_select_section (tree decl, int reloc,
6058 unsigned HOST_WIDE_INT align)
6060 if (ix86_in_large_data_p (decl))
6062 const char *sname = NULL;
6063 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
6064 switch (categorize_decl_for_section (decl, reloc))
6066 case SECCAT_DATA:
6067 sname = ".ldata";
6068 break;
6069 case SECCAT_DATA_REL:
6070 sname = ".ldata.rel";
6071 break;
6072 case SECCAT_DATA_REL_LOCAL:
6073 sname = ".ldata.rel.local";
6074 break;
6075 case SECCAT_DATA_REL_RO:
6076 sname = ".ldata.rel.ro";
6077 break;
6078 case SECCAT_DATA_REL_RO_LOCAL:
6079 sname = ".ldata.rel.ro.local";
6080 break;
6081 case SECCAT_BSS:
6082 sname = ".lbss";
6083 flags |= SECTION_BSS;
6084 break;
6085 case SECCAT_RODATA:
6086 case SECCAT_RODATA_MERGE_STR:
6087 case SECCAT_RODATA_MERGE_STR_INIT:
6088 case SECCAT_RODATA_MERGE_CONST:
6089 sname = ".lrodata";
6090 flags &= ~SECTION_WRITE;
6091 break;
6092 case SECCAT_SRODATA:
6093 case SECCAT_SDATA:
6094 case SECCAT_SBSS:
6095 gcc_unreachable ();
6096 case SECCAT_TEXT:
6097 case SECCAT_TDATA:
6098 case SECCAT_TBSS:
6099 /* We don't split these for the medium model.  Place them into
6100 default sections and hope for the best. */
6101 break;
6103 if (sname)
6105 /* We might get called with string constants, but get_named_section
6106 doesn't like them as they are not DECLs. Also, we need to set
6107 flags in that case. */
6108 if (!DECL_P (decl))
6109 return get_section (sname, flags, NULL);
6110 return get_named_section (decl, sname, reloc);
6113 return default_elf_select_section (decl, reloc, align);
6116 /* Select a set of attributes for section NAME based on the properties
6117 of DECL and whether or not RELOC indicates that DECL's initializer
6118 might contain runtime relocations. */
6120 static unsigned int ATTRIBUTE_UNUSED
6121 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6123 unsigned int flags = default_section_type_flags (decl, name, reloc);
6125 if (ix86_in_large_data_p (decl))
6126 flags |= SECTION_LARGE;
6128 if (decl == NULL_TREE
6129 && (strcmp (name, ".ldata.rel.ro") == 0
6130 || strcmp (name, ".ldata.rel.ro.local") == 0))
6131 flags |= SECTION_RELRO;
6133 if (strcmp (name, ".lbss") == 0
6134 || strncmp (name, ".lbss.", 6) == 0
6135 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
6136 flags |= SECTION_BSS;
6138 return flags;
6141 /* Build up a unique section name, expressed as a
6142 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6143 RELOC indicates whether the initial value of EXP requires
6144 link-time relocations. */
6146 static void ATTRIBUTE_UNUSED
6147 x86_64_elf_unique_section (tree decl, int reloc)
6149 if (ix86_in_large_data_p (decl))
6151 const char *prefix = NULL;
6152 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6153 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6155 switch (categorize_decl_for_section (decl, reloc))
6157 case SECCAT_DATA:
6158 case SECCAT_DATA_REL:
6159 case SECCAT_DATA_REL_LOCAL:
6160 case SECCAT_DATA_REL_RO:
6161 case SECCAT_DATA_REL_RO_LOCAL:
6162 prefix = one_only ? ".ld" : ".ldata";
6163 break;
6164 case SECCAT_BSS:
6165 prefix = one_only ? ".lb" : ".lbss";
6166 break;
6167 case SECCAT_RODATA:
6168 case SECCAT_RODATA_MERGE_STR:
6169 case SECCAT_RODATA_MERGE_STR_INIT:
6170 case SECCAT_RODATA_MERGE_CONST:
6171 prefix = one_only ? ".lr" : ".lrodata";
6172 break;
6173 case SECCAT_SRODATA:
6174 case SECCAT_SDATA:
6175 case SECCAT_SBSS:
6176 gcc_unreachable ();
6177 case SECCAT_TEXT:
6178 case SECCAT_TDATA:
6179 case SECCAT_TBSS:
6180 /* We don't split these for the medium model.  Place them into
6181 default sections and hope for the best. */
6182 break;
6184 if (prefix)
6186 const char *name, *linkonce;
6187 char *string;
6189 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6190 name = targetm.strip_name_encoding (name);
6192 /* If we're using one_only, then there needs to be a .gnu.linkonce
6193 prefix to the section name. */
6194 linkonce = one_only ? ".gnu.linkonce" : "";
6196 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6198 set_decl_section_name (decl, string);
6199 return;
6202 default_unique_section (decl, reloc);
6205 #ifdef COMMON_ASM_OP
6207 #ifndef LARGECOMM_SECTION_ASM_OP
6208 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6209 #endif
6211 /* This says how to output assembler code to declare an
6212 uninitialized external linkage data object.
6214 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP directive
6215 for large objects. */
6216 void
6217 x86_elf_aligned_decl_common (FILE *file, tree decl,
6218 const char *name, unsigned HOST_WIDE_INT size,
6219 int align)
6221 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6222 && size > (unsigned int)ix86_section_threshold)
6224 switch_to_section (get_named_section (decl, ".lbss", 0));
6225 fputs (LARGECOMM_SECTION_ASM_OP, file);
6227 else
6228 fputs (COMMON_ASM_OP, file);
6229 assemble_name (file, name);
6230 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6231 size, align / BITS_PER_UNIT);
6233 #endif
6235 /* Utility function for targets to use in implementing
6236 ASM_OUTPUT_ALIGNED_BSS. */
6238 void
6239 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6240 unsigned HOST_WIDE_INT size, int align)
6242 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6243 && size > (unsigned int)ix86_section_threshold)
6244 switch_to_section (get_named_section (decl, ".lbss", 0));
6245 else
6246 switch_to_section (bss_section);
6247 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6248 #ifdef ASM_DECLARE_OBJECT_NAME
6249 last_assemble_variable_decl = decl;
6250 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6251 #else
6252 /* Standard thing is just output label for the object. */
6253 ASM_OUTPUT_LABEL (file, name);
6254 #endif /* ASM_DECLARE_OBJECT_NAME */
6255 ASM_OUTPUT_SKIP (file, size ? size : 1);
6258 /* Decide whether we must probe the stack before any space allocation
6259 on this target. It's essentially TARGET_STACK_PROBE except when
6260 -fstack-check causes the stack to be already probed differently. */
6262 bool
6263 ix86_target_stack_probe (void)
6265 /* Do not probe the stack twice if static stack checking is enabled. */
6266 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6267 return false;
6269 return TARGET_STACK_PROBE;
6272 /* Decide whether we can make a sibling call to a function. DECL is the
6273 declaration of the function being targeted by the call and EXP is the
6274 CALL_EXPR representing the call. */
6276 static bool
6277 ix86_function_ok_for_sibcall (tree decl, tree exp)
6279 tree type, decl_or_type;
6280 rtx a, b;
6281 bool bind_global = decl && !targetm.binds_local_p (decl);
6283 if (ix86_function_naked (current_function_decl))
6284 return false;
6286 /* Sibling call isn't OK if there are no caller-saved registers
6287 since all registers must be preserved before return. */
6288 if (cfun->machine->no_caller_saved_registers)
6289 return false;
6291 /* If we are generating position-independent code, we cannot sibcall
6292 optimize direct calls to global functions, as the PLT requires
6293 %ebx be live. (Darwin does not have a PLT.) */
6294 if (!TARGET_MACHO
6295 && !TARGET_64BIT
6296 && flag_pic
6297 && flag_plt
6298 && bind_global)
6299 return false;
6301 /* If we need to align the outgoing stack, then sibcalling would
6302 unalign the stack, which may break the called function. */
6303 if (ix86_minimum_incoming_stack_boundary (true)
6304 < PREFERRED_STACK_BOUNDARY)
6305 return false;
6307 if (decl)
6309 decl_or_type = decl;
6310 type = TREE_TYPE (decl);
6312 else
6314 /* We're looking at the CALL_EXPR, we need the type of the function. */
6315 type = CALL_EXPR_FN (exp); /* pointer expression */
6316 type = TREE_TYPE (type); /* pointer type */
6317 type = TREE_TYPE (type); /* function type */
6318 decl_or_type = type;
6321 /* Check that the return value locations are the same. Like
6322 if we are returning floats on the 80387 register stack, we cannot
6323 make a sibcall from a function that doesn't return a float to a
6324 function that does or, conversely, from a function that does return
6325 a float to a function that doesn't; the necessary stack adjustment
6326 would not be executed. This is also the place we notice
6327 differences in the return value ABI. Note that it is ok for one
6328 of the functions to have void return type as long as the return
6329 value of the other is passed in a register. */
6330 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6331 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6332 cfun->decl, false);
6333 if (STACK_REG_P (a) || STACK_REG_P (b))
6335 if (!rtx_equal_p (a, b))
6336 return false;
6338 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6340 else if (!rtx_equal_p (a, b))
6341 return false;
6343 if (TARGET_64BIT)
6345 /* The SYSV ABI has more call-clobbered registers;
6346 disallow sibcalls from MS to SYSV. */
6347 if (cfun->machine->call_abi == MS_ABI
6348 && ix86_function_type_abi (type) == SYSV_ABI)
6349 return false;
6351 else
6353 /* If this call is indirect, we'll need to be able to use a
6354 call-clobbered register for the address of the target function.
6355 Make sure that all such registers are not used for passing
6356 parameters. Note that DLLIMPORT functions and call to global
6357 function via GOT slot are indirect. */
6358 if (!decl
6359 || (bind_global && flag_pic && !flag_plt)
6360 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
6362 /* Check if regparm >= 3 since arg_reg_available is set to
6363 false if regparm == 0. If regparm is 1 or 2, there is
6364 always a call-clobbered register available.
6366 ??? The symbol indirect call doesn't need a call-clobbered
6367 register. But we don't know if this is a symbol indirect
6368 call or not here. */
6369 if (ix86_function_regparm (type, NULL) >= 3
6370 && !cfun->machine->arg_reg_available)
6371 return false;
6375 /* Otherwise okay. That also includes certain types of indirect calls. */
6376 return true;
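/* Illustrative example (editorial addition, not part of this file): a trivial
   forwarding function is the typical sibcall candidate; whether the call below
   becomes a plain jmp depends on the checks above (PIC/PLT on 32-bit, stack
   alignment, matching return-value location, and the MS/SYSV ABI mix):

     extern int compute (int);
     int forward (int x) { return compute (x); }
*/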
6379 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6380 and "sseregparm" calling convention attributes;
6381 arguments as in struct attribute_spec.handler. */
6383 static tree
6384 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6385 bool *no_add_attrs)
6387 if (TREE_CODE (*node) != FUNCTION_TYPE
6388 && TREE_CODE (*node) != METHOD_TYPE
6389 && TREE_CODE (*node) != FIELD_DECL
6390 && TREE_CODE (*node) != TYPE_DECL)
6392 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6393 name);
6394 *no_add_attrs = true;
6395 return NULL_TREE;
6398 /* Can combine regparm with all attributes but fastcall, and thiscall. */
6399 if (is_attribute_p ("regparm", name))
6401 tree cst;
6403 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6405 error ("fastcall and regparm attributes are not compatible");
6408 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6410 error ("regparam and thiscall attributes are not compatible");
6413 cst = TREE_VALUE (args);
6414 if (TREE_CODE (cst) != INTEGER_CST)
6416 warning (OPT_Wattributes,
6417 "%qE attribute requires an integer constant argument",
6418 name);
6419 *no_add_attrs = true;
6421 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6423 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6424 name, REGPARM_MAX);
6425 *no_add_attrs = true;
6428 return NULL_TREE;
6431 if (TARGET_64BIT)
6433 /* Do not warn when emulating the MS ABI. */
6434 if ((TREE_CODE (*node) != FUNCTION_TYPE
6435 && TREE_CODE (*node) != METHOD_TYPE)
6436 || ix86_function_type_abi (*node) != MS_ABI)
6437 warning (OPT_Wattributes, "%qE attribute ignored",
6438 name);
6439 *no_add_attrs = true;
6440 return NULL_TREE;
6443 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6444 if (is_attribute_p ("fastcall", name))
6446 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6448 error ("fastcall and cdecl attributes are not compatible");
6450 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6452 error ("fastcall and stdcall attributes are not compatible");
6454 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6456 error ("fastcall and regparm attributes are not compatible");
6458 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6460 error ("fastcall and thiscall attributes are not compatible");
6464 /* Can combine stdcall with fastcall (redundant), regparm and
6465 sseregparm. */
6466 else if (is_attribute_p ("stdcall", name))
6468 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6470 error ("stdcall and cdecl attributes are not compatible");
6472 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6474 error ("stdcall and fastcall attributes are not compatible");
6476 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6478 error ("stdcall and thiscall attributes are not compatible");
6482 /* Can combine cdecl with regparm and sseregparm. */
6483 else if (is_attribute_p ("cdecl", name))
6485 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6487 error ("stdcall and cdecl attributes are not compatible");
6489 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6491 error ("fastcall and cdecl attributes are not compatible");
6493 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6495 error ("cdecl and thiscall attributes are not compatible");
6498 else if (is_attribute_p ("thiscall", name))
6500 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6501 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6502 name);
6503 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6505 error ("stdcall and thiscall attributes are not compatible");
6507 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6509 error ("fastcall and thiscall attributes are not compatible");
6511 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6513 error ("cdecl and thiscall attributes are not compatible");
6517 /* Can combine sseregparm with all attributes. */
6519 return NULL_TREE;
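/* Illustrative example (editorial addition, not part of this file): typical
   32-bit uses of the attributes validated above; fastcall passes the first
   two integer arguments in %ecx/%edx, regparm(3) uses %eax/%edx/%ecx, and
   stdcall makes the callee pop its stack arguments.  Conflicting
   combinations such as fastcall together with regparm are diagnosed above:

     int __attribute__((fastcall)) f2 (int a, int b);
     int __attribute__((regparm(3))) f3 (int a, int b, int c);
     int __attribute__((stdcall)) wnd_proc (int msg, int wparam);
*/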
6522 /* The transactional memory builtins are implicitly regparm or fastcall
6523 depending on the ABI. Override the generic do-nothing attribute that
6524 these builtins were declared with, and replace it with one of the two
6525 attributes that we expect elsewhere. */
6527 static tree
6528 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6529 int flags, bool *no_add_attrs)
6531 tree alt;
6533 /* In no case do we want to add the placeholder attribute. */
6534 *no_add_attrs = true;
6536 /* The 64-bit ABI is unchanged for transactional memory. */
6537 if (TARGET_64BIT)
6538 return NULL_TREE;
6540 /* ??? Is there a better way to validate 32-bit windows? We have
6541 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6542 if (CHECK_STACK_LIMIT > 0)
6543 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6544 else
6546 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6547 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6549 decl_attributes (node, alt, flags);
6551 return NULL_TREE;
6554 /* This function determines from TYPE the calling-convention. */
6556 unsigned int
6557 ix86_get_callcvt (const_tree type)
6559 unsigned int ret = 0;
6560 bool is_stdarg;
6561 tree attrs;
6563 if (TARGET_64BIT)
6564 return IX86_CALLCVT_CDECL;
6566 attrs = TYPE_ATTRIBUTES (type);
6567 if (attrs != NULL_TREE)
6569 if (lookup_attribute ("cdecl", attrs))
6570 ret |= IX86_CALLCVT_CDECL;
6571 else if (lookup_attribute ("stdcall", attrs))
6572 ret |= IX86_CALLCVT_STDCALL;
6573 else if (lookup_attribute ("fastcall", attrs))
6574 ret |= IX86_CALLCVT_FASTCALL;
6575 else if (lookup_attribute ("thiscall", attrs))
6576 ret |= IX86_CALLCVT_THISCALL;
6578 /* Regparm isn't allowed for thiscall and fastcall. */
6579 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6581 if (lookup_attribute ("regparm", attrs))
6582 ret |= IX86_CALLCVT_REGPARM;
6583 if (lookup_attribute ("sseregparm", attrs))
6584 ret |= IX86_CALLCVT_SSEREGPARM;
6587 if (IX86_BASE_CALLCVT(ret) != 0)
6588 return ret;
6591 is_stdarg = stdarg_p (type);
6592 if (TARGET_RTD && !is_stdarg)
6593 return IX86_CALLCVT_STDCALL | ret;
6595 if (ret != 0
6596 || is_stdarg
6597 || TREE_CODE (type) != METHOD_TYPE
6598 || ix86_function_type_abi (type) != MS_ABI)
6599 return IX86_CALLCVT_CDECL | ret;
6601 return IX86_CALLCVT_THISCALL;
6604 /* Return 0 if the attributes for two types are incompatible, 1 if they
6605 are compatible, and 2 if they are nearly compatible (which causes a
6606 warning to be generated). */
6608 static int
6609 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6611 unsigned int ccvt1, ccvt2;
6613 if (TREE_CODE (type1) != FUNCTION_TYPE
6614 && TREE_CODE (type1) != METHOD_TYPE)
6615 return 1;
6617 ccvt1 = ix86_get_callcvt (type1);
6618 ccvt2 = ix86_get_callcvt (type2);
6619 if (ccvt1 != ccvt2)
6620 return 0;
6621 if (ix86_function_regparm (type1, NULL)
6622 != ix86_function_regparm (type2, NULL))
6623 return 0;
6625 return 1;
6628 /* Return the regparm value for a function with the indicated TYPE and DECL.
6629 DECL may be NULL when calling function indirectly
6630 or considering a libcall. */
6632 static int
6633 ix86_function_regparm (const_tree type, const_tree decl)
6635 tree attr;
6636 int regparm;
6637 unsigned int ccvt;
6639 if (TARGET_64BIT)
6640 return (ix86_function_type_abi (type) == SYSV_ABI
6641 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6642 ccvt = ix86_get_callcvt (type);
6643 regparm = ix86_regparm;
6645 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6647 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6648 if (attr)
6650 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6651 return regparm;
6654 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6655 return 2;
6656 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6657 return 1;
6659 /* Use register calling convention for local functions when possible. */
6660 if (decl
6661 && TREE_CODE (decl) == FUNCTION_DECL)
6663 cgraph_node *target = cgraph_node::get (decl);
6664 if (target)
6665 target = target->function_symbol ();
6667 /* Caller and callee must agree on the calling convention, so
6668 checking just the current function's optimize setting here would mean
6669 that with __attribute__((optimize (...))) the caller could use the regparm
6670 convention and the callee not, or vice versa.  Instead look at whether
6671 the callee is optimized or not. */
6672 if (target && opt_for_fn (target->decl, optimize)
6673 && !(profile_flag && !flag_fentry))
6675 cgraph_local_info *i = &target->local;
6676 if (i && i->local && i->can_change_signature)
6678 int local_regparm, globals = 0, regno;
6680 /* Make sure no regparm register is taken by a
6681 fixed register variable. */
6682 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6683 local_regparm++)
6684 if (fixed_regs[local_regparm])
6685 break;
6687 /* We don't want to use regparm(3) for nested functions as
6688 these use a static chain pointer in the third argument. */
6689 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6690 local_regparm = 2;
6692 /* Save a register for the split stack. */
6693 if (flag_split_stack)
6695 if (local_regparm == 3)
6696 local_regparm = 2;
6697 else if (local_regparm == 2
6698 && DECL_STATIC_CHAIN (target->decl))
6699 local_regparm = 1;
6702 /* Each fixed register usage increases register pressure,
6703 so fewer registers should be used for argument passing.
6704 This functionality can be overridden by an explicit
6705 regparm value. */
6706 for (regno = AX_REG; regno <= DI_REG; regno++)
6707 if (fixed_regs[regno])
6708 globals++;
6710 local_regparm
6711 = globals < local_regparm ? local_regparm - globals : 0;
6713 if (local_regparm > regparm)
6714 regparm = local_regparm;
6719 return regparm;
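/* Illustrative example (editorial addition, not part of this file): when
   optimizing, a local (static, non-escaping) function can receive the
   register-passing convention computed above even without an explicit
   regparm attribute:

     static int add3 (int a, int b, int c) { return a + b + c; }

     int sum_fixed (void) { return add3 (1, 2, 3); }
*/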
6722 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6723 DFmode (2) arguments in SSE registers for a function with the
6724 indicated TYPE and DECL. DECL may be NULL when calling function
6725 indirectly or considering a libcall. Return -1 if any FP parameter
6726 should be rejected by error.  This is used in situations where we imply the
6727 SSE calling convention but the function is called from another function with
6728 SSE disabled. Otherwise return 0. */
6730 static int
6731 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6733 gcc_assert (!TARGET_64BIT);
6735 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6736 by the sseregparm attribute. */
6737 if (TARGET_SSEREGPARM
6738 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6740 if (!TARGET_SSE)
6742 if (warn)
6744 if (decl)
6745 error ("calling %qD with attribute sseregparm without "
6746 "SSE/SSE2 enabled", decl);
6747 else
6748 error ("calling %qT with attribute sseregparm without "
6749 "SSE/SSE2 enabled", type);
6751 return 0;
6754 return 2;
6757 if (!decl)
6758 return 0;
6760 cgraph_node *target = cgraph_node::get (decl);
6761 if (target)
6762 target = target->function_symbol ();
6764 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6765 (and DFmode for SSE2) arguments in SSE registers. */
6766 if (target
6767 /* TARGET_SSE_MATH */
6768 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6769 && opt_for_fn (target->decl, optimize)
6770 && !(profile_flag && !flag_fentry))
6772 cgraph_local_info *i = &target->local;
6773 if (i && i->local && i->can_change_signature)
6775 /* Refuse to produce wrong code when a local function with SSE enabled
6776 is called from an SSE-disabled function.
6777 FIXME: We need a way to detect these cases across ltrans partitions
6778 and avoid using SSE calling conventions on local functions called
6779 from functions with SSE disabled.  For now at least delay the
6780 warning until we know we are going to produce wrong code.
6781 See PR66047 */
6782 if (!TARGET_SSE && warn)
6783 return -1;
6784 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6785 ->x_ix86_isa_flags) ? 2 : 1;
6789 return 0;
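/* Illustrative example (editorial addition, not part of this file): in 32-bit
   code with SSE enabled, the sseregparm attribute (or -msseregparm) asks for
   SFmode and DFmode arguments to be passed in SSE registers instead of on
   the stack:

     __attribute__((sseregparm))
     double mix (double a, double b, float t);
*/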
6792 /* Return true if EAX is live at the start of the function. Used by
6793 ix86_expand_prologue to determine if we need special help before
6794 calling allocate_stack_worker. */
6796 static bool
6797 ix86_eax_live_at_start_p (void)
6799 /* Cheat. Don't bother working forward from ix86_function_regparm
6800 to the function type to whether an actual argument is located in
6801 eax. Instead just look at cfg info, which is still close enough
6802 to correct at this point. This gives false positives for broken
6803 functions that might use uninitialized data that happens to be
6804 allocated in eax, but who cares? */
6805 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6808 static bool
6809 ix86_keep_aggregate_return_pointer (tree fntype)
6811 tree attr;
6813 if (!TARGET_64BIT)
6815 attr = lookup_attribute ("callee_pop_aggregate_return",
6816 TYPE_ATTRIBUTES (fntype));
6817 if (attr)
6818 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6820 /* For 32-bit MS-ABI the default is to keep aggregate
6821 return pointer. */
6822 if (ix86_function_type_abi (fntype) == MS_ABI)
6823 return true;
6825 return KEEP_AGGREGATE_RETURN_POINTER != 0;
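/* Illustrative example (editorial addition, not part of this file): on 32-bit
   targets the callee_pop_aggregate_return attribute controls who pops the
   hidden pointer to the returned aggregate; with the argument 0 the caller
   remains responsible for it, with 1 the callee pops it:

     struct pair { int lo, hi; };
     struct pair __attribute__((callee_pop_aggregate_return(0)))
     make_pair (int a, int b);
*/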
6828 /* Value is the number of bytes of arguments automatically
6829 popped when returning from a subroutine call.
6830 FUNDECL is the declaration node of the function (as a tree),
6831 FUNTYPE is the data type of the function (as a tree),
6832 or for a library call it is an identifier node for the subroutine name.
6833 SIZE is the number of bytes of arguments passed on the stack.
6835 On the 80386, the RTD insn may be used to pop them if the number
6836 of args is fixed, but if the number is variable then the caller
6837 must pop them all. RTD can't be used for library calls now
6838 because the library is compiled with the Unix compiler.
6839 Use of RTD is a selectable option, since it is incompatible with
6840 standard Unix calling sequences. If the option is not selected,
6841 the caller must always pop the args.
6843 The attribute stdcall is equivalent to RTD on a per module basis. */
6845 static poly_int64
6846 ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
6848 unsigned int ccvt;
6850 /* None of the 64-bit ABIs pop arguments. */
6851 if (TARGET_64BIT)
6852 return 0;
6854 ccvt = ix86_get_callcvt (funtype);
6856 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6857 | IX86_CALLCVT_THISCALL)) != 0
6858 && ! stdarg_p (funtype))
6859 return size;
6861 /* Lose any fake structure return argument if it is passed on the stack. */
6862 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6863 && !ix86_keep_aggregate_return_pointer (funtype))
6865 int nregs = ix86_function_regparm (funtype, fundecl);
6866 if (nregs == 0)
6867 return GET_MODE_SIZE (Pmode);
6870 return 0;
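/* Illustrative example (editorial addition, not part of this file): for the
   prototypes below, the hook above reports 8 bytes for the stdcall function
   (the callee returns with "ret $8") and 0 for the default cdecl one, where
   the caller adjusts %esp after the call:

     int __attribute__((stdcall)) std_sum (int a, int b);
     int cdecl_sum (int a, int b);
*/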
6873 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6875 static bool
6876 ix86_legitimate_combined_insn (rtx_insn *insn)
6878 int i;
6880 /* Check operand constraints in case hard registers were propagated
6881 into insn pattern. This check prevents combine pass from
6882 generating insn patterns with invalid hard register operands.
6883 These invalid insns can eventually confuse reload to error out
6884 with a spill failure. See also PRs 46829 and 46843. */
6886 gcc_assert (INSN_CODE (insn) >= 0);
6888 extract_insn (insn);
6889 preprocess_constraints (insn);
6891 int n_operands = recog_data.n_operands;
6892 int n_alternatives = recog_data.n_alternatives;
6893 for (i = 0; i < n_operands; i++)
6895 rtx op = recog_data.operand[i];
6896 machine_mode mode = GET_MODE (op);
6897 const operand_alternative *op_alt;
6898 int offset = 0;
6899 bool win;
6900 int j;
6902 /* A unary operator may be accepted by the predicate, but it
6903 is irrelevant for matching constraints. */
6904 if (UNARY_P (op))
6905 op = XEXP (op, 0);
6907 if (SUBREG_P (op))
6909 if (REG_P (SUBREG_REG (op))
6910 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6911 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6912 GET_MODE (SUBREG_REG (op)),
6913 SUBREG_BYTE (op),
6914 GET_MODE (op));
6915 op = SUBREG_REG (op);
6918 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6919 continue;
6921 op_alt = recog_op_alt;
6923 /* Operand has no constraints, anything is OK. */
6924 win = !n_alternatives;
6926 alternative_mask preferred = get_preferred_alternatives (insn);
6927 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6929 if (!TEST_BIT (preferred, j))
6930 continue;
6931 if (op_alt[i].anything_ok
6932 || (op_alt[i].matches != -1
6933 && operands_match_p
6934 (recog_data.operand[i],
6935 recog_data.operand[op_alt[i].matches]))
6936 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6938 win = true;
6939 break;
6943 if (!win)
6944 return false;
6947 return true;
6950 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
6952 static unsigned HOST_WIDE_INT
6953 ix86_asan_shadow_offset (void)
6955 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6956 : HOST_WIDE_INT_C (0x7fff8000))
6957 : (HOST_WIDE_INT_1 << 29);
6960 /* Argument support functions. */
6962 /* Return true when register may be used to pass function parameters. */
6963 bool
6964 ix86_function_arg_regno_p (int regno)
6966 int i;
6967 enum calling_abi call_abi;
6968 const int *parm_regs;
6970 if (TARGET_MPX && BND_REGNO_P (regno))
6971 return true;
6973 if (!TARGET_64BIT)
6975 if (TARGET_MACHO)
6976 return (regno < REGPARM_MAX
6977 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
6978 else
6979 return (regno < REGPARM_MAX
6980 || (TARGET_MMX && MMX_REGNO_P (regno)
6981 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
6982 || (TARGET_SSE && SSE_REGNO_P (regno)
6983 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
6986 if (TARGET_SSE && SSE_REGNO_P (regno)
6987 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
6988 return true;
6990 /* TODO: The function should depend on current function ABI but
6991 builtins.c would need updating then. Therefore we use the
6992 default ABI. */
6993 call_abi = ix86_cfun_abi ();
6995 /* RAX is used as hidden argument to va_arg functions. */
6996 if (call_abi == SYSV_ABI && regno == AX_REG)
6997 return true;
6999 if (call_abi == MS_ABI)
7000 parm_regs = x86_64_ms_abi_int_parameter_registers;
7001 else
7002 parm_regs = x86_64_int_parameter_registers;
7004 for (i = 0; i < (call_abi == MS_ABI
7005 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
7006 if (regno == parm_regs[i])
7007 return true;
7008 return false;
7011 /* Return if we do not know how to pass TYPE solely in registers. */
7013 static bool
7014 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
7016 if (must_pass_in_stack_var_size_or_pad (mode, type))
7017 return true;
7019 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
7020 The layout_type routine is crafty and tries to trick us into passing
7021 currently unsupported vector types on the stack by using TImode. */
7022 return (!TARGET_64BIT && mode == TImode
7023 && type && TREE_CODE (type) != VECTOR_TYPE);
7026 /* It returns the size, in bytes, of the area reserved for arguments passed
7027 in registers for the function represented by fndecl, depending on the
7028 ABI format used. */
7029 int
7030 ix86_reg_parm_stack_space (const_tree fndecl)
7032 enum calling_abi call_abi = SYSV_ABI;
7033 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
7034 call_abi = ix86_function_abi (fndecl);
7035 else
7036 call_abi = ix86_function_type_abi (fndecl);
7037 if (TARGET_64BIT && call_abi == MS_ABI)
7038 return 32;
7039 return 0;
7042 /* We add this as a workaround in order to use libc_has_function
7043 hook in i386.md. */
7044 bool
7045 ix86_libc_has_function (enum function_class fn_class)
7047 return targetm.libc_has_function (fn_class);
7050 /* Returns value SYSV_ABI, MS_ABI dependent on fntype,
7051 specifying the call abi used. */
7052 enum calling_abi
7053 ix86_function_type_abi (const_tree fntype)
7055 enum calling_abi abi = ix86_abi;
7057 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
7058 return abi;
7060 if (abi == SYSV_ABI
7061 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
7063 static int warned;
7064 if (TARGET_X32 && !warned)
7066 error ("X32 does not support ms_abi attribute");
7067 warned = 1;
7070 abi = MS_ABI;
7072 else if (abi == MS_ABI
7073 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
7074 abi = SYSV_ABI;
7076 return abi;
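/* Illustrative example (editorial addition, not part of this file): the 64-bit
   per-function ABI override resolved above; an ms_abi function takes its
   first integer arguments in %rcx/%rdx/%r8/%r9 and reserves 32 bytes of
   shadow space, while sysv_abi uses %rdi/%rsi/%rdx/%rcx/%r8/%r9:

     long __attribute__((ms_abi))   win64_callback (long a, long b);
     long __attribute__((sysv_abi)) sysv_helper (long a, long b);
*/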
7079 static enum calling_abi
7080 ix86_function_abi (const_tree fndecl)
7082 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
7085 /* Returns value SYSV_ABI, MS_ABI dependent on cfun,
7086 specifying the call abi used. */
7087 enum calling_abi
7088 ix86_cfun_abi (void)
7090 return cfun ? cfun->machine->call_abi : ix86_abi;
7093 static bool
7094 ix86_function_ms_hook_prologue (const_tree fn)
7096 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
7098 if (decl_function_context (fn) != NULL_TREE)
7099 error_at (DECL_SOURCE_LOCATION (fn),
7100 "ms_hook_prologue is not compatible with nested function");
7101 else
7102 return true;
7104 return false;
7107 static bool
7108 ix86_function_naked (const_tree fn)
7110 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7111 return true;
7113 return false;
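/* Illustrative example (editorial addition, not part of this file): a naked
   function gets no prologue or epilogue, so its body should consist of basic
   asm only:

     __attribute__((naked))
     void early_trampoline (void)
     {
       asm ("jmp real_entry");
     }
*/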
7116 /* Write the extra assembler code needed to declare a function properly. */
7118 void
7119 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7120 tree decl)
7122 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7124 if (is_ms_hook)
7126 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7127 unsigned int filler_cc = 0xcccccccc;
7129 for (i = 0; i < filler_count; i += 4)
7130 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7133 #ifdef SUBTARGET_ASM_UNWIND_INIT
7134 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7135 #endif
7137 ASM_OUTPUT_LABEL (asm_out_file, fname);
7139 /* Output magic byte marker, if hot-patch attribute is set. */
7140 if (is_ms_hook)
7142 if (TARGET_64BIT)
7144 /* leaq [%rsp + 0], %rsp */
7145 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7146 asm_out_file);
7148 else
7150 /* movl.s %edi, %edi
7151 push %ebp
7152 movl.s %esp, %ebp */
7153 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
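/* Illustrative example (editorial addition, not part of this file):
   ms_hook_prologue requests the hot-patchable entry emitted above (the 0xCC
   filler before the label plus the movl.s/push/movl.s byte sequence), so a
   runtime patcher can redirect the function safely:

     __attribute__((ms_hook_prologue))
     void patchable_api_entry (void);
*/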
7158 /* Implementation of the call ABI switching target hook.  The call
7159 register sets specific to FNDECL are selected.  See also
7160 ix86_conditional_register_usage for more details. */
7161 void
7162 ix86_call_abi_override (const_tree fndecl)
7164 cfun->machine->call_abi = ix86_function_abi (fndecl);
7167 /* Return 1 if pseudo register should be created and used to hold
7168 GOT address for PIC code. */
7169 bool
7170 ix86_use_pseudo_pic_reg (void)
7172 if ((TARGET_64BIT
7173 && (ix86_cmodel == CM_SMALL_PIC
7174 || TARGET_PECOFF))
7175 || !flag_pic)
7176 return false;
7177 return true;
7180 /* Initialize large model PIC register. */
7182 static void
7183 ix86_init_large_pic_reg (unsigned int tmp_regno)
7185 rtx_code_label *label;
7186 rtx tmp_reg;
7188 gcc_assert (Pmode == DImode);
7189 label = gen_label_rtx ();
7190 emit_label (label);
7191 LABEL_PRESERVE_P (label) = 1;
7192 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7193 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7194 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7195 label));
7196 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7197 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7198 pic_offset_table_rtx, tmp_reg));
7199 const char *name = LABEL_NAME (label);
7200 PUT_CODE (label, NOTE);
7201 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7202 NOTE_DELETED_LABEL_NAME (label) = name;
7205 /* Create and initialize PIC register if required. */
7206 static void
7207 ix86_init_pic_reg (void)
7209 edge entry_edge;
7210 rtx_insn *seq;
7212 if (!ix86_use_pseudo_pic_reg ())
7213 return;
7215 start_sequence ();
7217 if (TARGET_64BIT)
7219 if (ix86_cmodel == CM_LARGE_PIC)
7220 ix86_init_large_pic_reg (R11_REG);
7221 else
7222 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7224 else
7226 /* If there is a future mcount call in the function, it is more profitable
7227 to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7228 rtx reg = crtl->profile
7229 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7230 : pic_offset_table_rtx;
7231 rtx_insn *insn = emit_insn (gen_set_got (reg));
7232 RTX_FRAME_RELATED_P (insn) = 1;
7233 if (crtl->profile)
7234 emit_move_insn (pic_offset_table_rtx, reg);
7235 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7238 seq = get_insns ();
7239 end_sequence ();
7241 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7242 insert_insn_on_edge (seq, entry_edge);
7243 commit_one_edge_insertion (entry_edge);
7246 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7247 for a call to a function whose data type is FNTYPE.
7248 For a library call, FNTYPE is 0. */
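/* Illustrative sketch (an assumption about the generic expansion code in
   calls.c, not a quote of it): the middle end drives the argument-passing
   hooks below roughly as

     CUMULATIVE_ARGS cum;
     init_cumulative_args (&cum, fntype, libname, fndecl, caller);
     for each argument of mode MODE, tree type TYPE and namedness NAMED:
       rtx reg = targetm.calls.function_arg (pack_cumulative_args (&cum),
                                             MODE, TYPE, NAMED);
       ... pass the value in REG, or on the stack when REG is NULL ...
       targetm.calls.function_arg_advance (pack_cumulative_args (&cum),
                                           MODE, TYPE, NAMED);

   so CUM accumulates how many integer, SSE and MMX registers remain.  */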
7250 void
7251 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7252 tree fntype, /* tree ptr for function decl */
7253 rtx libname, /* SYMBOL_REF of library name or 0 */
7254 tree fndecl,
7255 int caller)
7257 struct cgraph_local_info *i = NULL;
7258 struct cgraph_node *target = NULL;
7260 memset (cum, 0, sizeof (*cum));
7262 if (fndecl)
7264 target = cgraph_node::get (fndecl);
7265 if (target)
7267 target = target->function_symbol ();
7268 i = cgraph_node::local_info (target->decl);
7269 cum->call_abi = ix86_function_abi (target->decl);
7271 else
7272 cum->call_abi = ix86_function_abi (fndecl);
7274 else
7275 cum->call_abi = ix86_function_type_abi (fntype);
7277 cum->caller = caller;
7279 /* Set up the number of registers to use for passing arguments. */
7280 cum->nregs = ix86_regparm;
7281 if (TARGET_64BIT)
7283 cum->nregs = (cum->call_abi == SYSV_ABI
7284 ? X86_64_REGPARM_MAX
7285 : X86_64_MS_REGPARM_MAX);
7287 if (TARGET_SSE)
7289 cum->sse_nregs = SSE_REGPARM_MAX;
7290 if (TARGET_64BIT)
7292 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7293 ? X86_64_SSE_REGPARM_MAX
7294 : X86_64_MS_SSE_REGPARM_MAX);
7297 if (TARGET_MMX)
7298 cum->mmx_nregs = MMX_REGPARM_MAX;
7299 cum->warn_avx512f = true;
7300 cum->warn_avx = true;
7301 cum->warn_sse = true;
7302 cum->warn_mmx = true;
7304 /* Because the type might mismatch between caller and callee, we need to
7305 use the actual type of the function for local calls.
7306 FIXME: cgraph_analyze can be told to actually record whether a function
7307 uses va_start, so for local functions maybe_vaarg can be made more
7308 aggressive, helping K&R code.
7309 FIXME: once the type system is fixed, we won't need this code anymore. */
7310 if (i && i->local && i->can_change_signature)
7311 fntype = TREE_TYPE (target->decl);
7312 cum->stdarg = stdarg_p (fntype);
7313 cum->maybe_vaarg = (fntype
7314 ? (!prototype_p (fntype) || stdarg_p (fntype))
7315 : !libname);
7317 cum->bnd_regno = FIRST_BND_REG;
7318 cum->bnds_in_bt = 0;
7319 cum->force_bnd_pass = 0;
7320 cum->decl = fndecl;
7322 cum->warn_empty = !warn_abi || cum->stdarg;
7323 if (!cum->warn_empty && fntype)
7325 function_args_iterator iter;
7326 tree argtype;
7327 bool seen_empty_type = false;
7328 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7330 if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7331 break;
7332 if (TYPE_EMPTY_P (argtype))
7333 seen_empty_type = true;
7334 else if (seen_empty_type)
7336 cum->warn_empty = true;
7337 break;
7342 if (!TARGET_64BIT)
7344 /* If there are variable arguments, then we won't pass anything
7345 in registers in 32-bit mode. */
7346 if (stdarg_p (fntype))
7348 cum->nregs = 0;
7349 /* Since in 32-bit mode variable arguments are always passed on
7350 the stack, there is a scratch register available for an indirect
7351 sibcall. */
7352 cfun->machine->arg_reg_available = true;
7353 cum->sse_nregs = 0;
7354 cum->mmx_nregs = 0;
7355 cum->warn_avx512f = false;
7356 cum->warn_avx = false;
7357 cum->warn_sse = false;
7358 cum->warn_mmx = false;
7359 return;
7362 /* Use ecx and edx registers if function has fastcall attribute,
7363 else look for regparm information. */
7364 if (fntype)
7366 unsigned int ccvt = ix86_get_callcvt (fntype);
7367 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7369 cum->nregs = 1;
7370 cum->fastcall = 1; /* Same first register as in fastcall. */
7372 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7374 cum->nregs = 2;
7375 cum->fastcall = 1;
7377 else
7378 cum->nregs = ix86_function_regparm (fntype, fndecl);
7381 /* Set up the number of SSE registers used for passing SFmode
7382 and DFmode arguments. Warn for mismatching ABI. */
7383 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7386 cfun->machine->arg_reg_available = (cum->nregs > 0);
7389 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7390 But in the case of vector types, it is some vector mode.
7392 When we have only some of our vector isa extensions enabled, then there
7393 are some modes for which vector_mode_supported_p is false. For these
7394 modes, the generic vector support in gcc will choose some non-vector mode
7395 in order to implement the type. By computing the natural mode, we'll
7396 select the proper ABI location for the operand and not depend on whatever
7397 the middle-end decides to do with these vector types.
7399 The middle-end can't deal with vector types > 16 bytes. In this
7400 case, we return the original mode and warn ABI change if CUM isn't
7401 NULL.
7403 If IN_RETURN is true, warn ABI change if the vector mode isn't
7404 available for function return value. */
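/* Example (illustrative, assuming a 32-byte generic vector type):

     typedef int v8si __attribute__ ((vector_size (32)));

   With AVX enabled, type_natural_mode returns V8SImode, so the value is
   passed in a %ymm register.  Without AVX it warns via -Wpsabi (when CUM
   or IN_RETURN asks for it) and falls back to TYPE_MODE, i.e. whatever
   mode the generic vector support chose.  */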
7406 static machine_mode
7407 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7408 bool in_return)
7410 machine_mode mode = TYPE_MODE (type);
7412 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7414 HOST_WIDE_INT size = int_size_in_bytes (type);
7415 if ((size == 8 || size == 16 || size == 32 || size == 64)
7416 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7417 && TYPE_VECTOR_SUBPARTS (type) > 1)
7419 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7421 /* There are no XFmode vector modes. */
7422 if (innermode == XFmode)
7423 return mode;
7425 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7426 mode = MIN_MODE_VECTOR_FLOAT;
7427 else
7428 mode = MIN_MODE_VECTOR_INT;
7430 /* Get the mode which has this inner mode and number of units. */
7431 FOR_EACH_MODE_FROM (mode, mode)
7432 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7433 && GET_MODE_INNER (mode) == innermode)
7435 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7437 static bool warnedavx512f;
7438 static bool warnedavx512f_ret;
7440 if (cum && cum->warn_avx512f && !warnedavx512f)
7442 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7443 "without AVX512F enabled changes the ABI"))
7444 warnedavx512f = true;
7446 else if (in_return && !warnedavx512f_ret)
7448 if (warning (OPT_Wpsabi, "AVX512F vector return "
7449 "without AVX512F enabled changes the ABI"))
7450 warnedavx512f_ret = true;
7453 return TYPE_MODE (type);
7455 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7457 static bool warnedavx;
7458 static bool warnedavx_ret;
7460 if (cum && cum->warn_avx && !warnedavx)
7462 if (warning (OPT_Wpsabi, "AVX vector argument "
7463 "without AVX enabled changes the ABI"))
7464 warnedavx = true;
7466 else if (in_return && !warnedavx_ret)
7468 if (warning (OPT_Wpsabi, "AVX vector return "
7469 "without AVX enabled changes the ABI"))
7470 warnedavx_ret = true;
7473 return TYPE_MODE (type);
7475 else if (((size == 8 && TARGET_64BIT) || size == 16)
7476 && !TARGET_SSE
7477 && !TARGET_IAMCU)
7479 static bool warnedsse;
7480 static bool warnedsse_ret;
7482 if (cum && cum->warn_sse && !warnedsse)
7484 if (warning (OPT_Wpsabi, "SSE vector argument "
7485 "without SSE enabled changes the ABI"))
7486 warnedsse = true;
7488 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7490 if (warning (OPT_Wpsabi, "SSE vector return "
7491 "without SSE enabled changes the ABI"))
7492 warnedsse_ret = true;
7495 else if ((size == 8 && !TARGET_64BIT)
7496 && (!cfun
7497 || cfun->machine->func_type == TYPE_NORMAL)
7498 && !TARGET_MMX
7499 && !TARGET_IAMCU)
7501 static bool warnedmmx;
7502 static bool warnedmmx_ret;
7504 if (cum && cum->warn_mmx && !warnedmmx)
7506 if (warning (OPT_Wpsabi, "MMX vector argument "
7507 "without MMX enabled changes the ABI"))
7508 warnedmmx = true;
7510 else if (in_return && !warnedmmx_ret)
7512 if (warning (OPT_Wpsabi, "MMX vector return "
7513 "without MMX enabled changes the ABI"))
7514 warnedmmx_ret = true;
7517 return mode;
7520 gcc_unreachable ();
7524 return mode;
7527 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7528 this may not agree with the mode that the type system has chosen for the
7529 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7530 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7532 static rtx
7533 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7534 unsigned int regno)
7536 rtx tmp;
7538 if (orig_mode != BLKmode)
7539 tmp = gen_rtx_REG (orig_mode, regno);
7540 else
7542 tmp = gen_rtx_REG (mode, regno);
7543 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7544 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7547 return tmp;
7550 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
7551 of this code is to classify each 8bytes of incoming argument by the register
7552 class and assign registers accordingly. */
7554 /* Return the union class of CLASS1 and CLASS2.
7555 See the x86-64 PS ABI for details. */
7557 static enum x86_64_reg_class
7558 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7560 /* Rule #1: If both classes are equal, this is the resulting class. */
7561 if (class1 == class2)
7562 return class1;
7564 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7565 the other class. */
7566 if (class1 == X86_64_NO_CLASS)
7567 return class2;
7568 if (class2 == X86_64_NO_CLASS)
7569 return class1;
7571 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7572 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7573 return X86_64_MEMORY_CLASS;
7575 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7576 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7577 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7578 return X86_64_INTEGERSI_CLASS;
7579 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7580 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7581 return X86_64_INTEGER_CLASS;
7583 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7584 MEMORY is used. */
7585 if (class1 == X86_64_X87_CLASS
7586 || class1 == X86_64_X87UP_CLASS
7587 || class1 == X86_64_COMPLEX_X87_CLASS
7588 || class2 == X86_64_X87_CLASS
7589 || class2 == X86_64_X87UP_CLASS
7590 || class2 == X86_64_COMPLEX_X87_CLASS)
7591 return X86_64_MEMORY_CLASS;
7593 /* Rule #6: Otherwise class SSE is used. */
7594 return X86_64_SSE_CLASS;
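/* Worked example (illustrative): for

     union u { float f; int i; };

   the float member classifies as X86_64_SSESF_CLASS and the int member as
   X86_64_INTEGERSI_CLASS; rule #4 above merges them into
   X86_64_INTEGERSI_CLASS, so the union is passed in an integer register.  */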
7597 /* Classify the argument of type TYPE and mode MODE.
7598 CLASSES will be filled by the register class used to pass each word
7599 of the operand. The number of words is returned. In case the parameter
7600 should be passed in memory, 0 is returned. As a special case for zero
7601 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7603 BIT_OFFSET is used internally for handling records and specifies the
7604 offset in bits modulo 512 to avoid overflow cases.
7606 See the x86-64 PS ABI for details. */
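/* Example (illustrative): for the 16-byte struct

     struct pt { double x; long id; };

   classify_argument fills classes[0] = X86_64_SSEDF_CLASS and
   classes[1] = X86_64_INTEGER_CLASS and returns 2: the first eightbyte
   is eligible for an SSE register, the second for an integer register.  */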
7609 static int
7610 classify_argument (machine_mode mode, const_tree type,
7611 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7613 HOST_WIDE_INT bytes =
7614 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7615 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7617 /* Variable sized entities are always passed/returned in memory. */
7618 if (bytes < 0)
7619 return 0;
7621 if (mode != VOIDmode
7622 && targetm.calls.must_pass_in_stack (mode, type))
7623 return 0;
7625 if (type && AGGREGATE_TYPE_P (type))
7627 int i;
7628 tree field;
7629 enum x86_64_reg_class subclasses[MAX_CLASSES];
7631 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7632 if (bytes > 64)
7633 return 0;
7635 for (i = 0; i < words; i++)
7636 classes[i] = X86_64_NO_CLASS;
7638 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
7639 signal the memory class, so handle this as a special case. */
7640 if (!words)
7642 classes[0] = X86_64_NO_CLASS;
7643 return 1;
7646 /* Classify each field of record and merge classes. */
7647 switch (TREE_CODE (type))
7649 case RECORD_TYPE:
7650 /* And now merge the fields of structure. */
7651 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7653 if (TREE_CODE (field) == FIELD_DECL)
7655 int num;
7657 if (TREE_TYPE (field) == error_mark_node)
7658 continue;
7660 /* Bitfields are always classified as integer. Handle them
7661 early, since later code would consider them to be
7662 misaligned integers. */
7663 if (DECL_BIT_FIELD (field))
7665 for (i = (int_bit_position (field)
7666 + (bit_offset % 64)) / 8 / 8;
7667 i < ((int_bit_position (field) + (bit_offset % 64))
7668 + tree_to_shwi (DECL_SIZE (field))
7669 + 63) / 8 / 8; i++)
7670 classes[i] =
7671 merge_classes (X86_64_INTEGER_CLASS,
7672 classes[i]);
7674 else
7676 int pos;
7678 type = TREE_TYPE (field);
7680 /* Flexible array member is ignored. */
7681 if (TYPE_MODE (type) == BLKmode
7682 && TREE_CODE (type) == ARRAY_TYPE
7683 && TYPE_SIZE (type) == NULL_TREE
7684 && TYPE_DOMAIN (type) != NULL_TREE
7685 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7686 == NULL_TREE))
7688 static bool warned;
7690 if (!warned && warn_psabi)
7692 warned = true;
7693 inform (input_location,
7694 "the ABI of passing struct with"
7695 " a flexible array member has"
7696 " changed in GCC 4.4");
7698 continue;
7700 num = classify_argument (TYPE_MODE (type), type,
7701 subclasses,
7702 (int_bit_position (field)
7703 + bit_offset) % 512);
7704 if (!num)
7705 return 0;
7706 pos = (int_bit_position (field)
7707 + (bit_offset % 64)) / 8 / 8;
7708 for (i = 0; i < num && (i + pos) < words; i++)
7709 classes[i + pos] =
7710 merge_classes (subclasses[i], classes[i + pos]);
7714 break;
7716 case ARRAY_TYPE:
7717 /* Arrays are handled as small records. */
7719 int num;
7720 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7721 TREE_TYPE (type), subclasses, bit_offset);
7722 if (!num)
7723 return 0;
7725 /* The partial classes are now full classes. */
7726 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7727 subclasses[0] = X86_64_SSE_CLASS;
7728 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7729 && !((bit_offset % 64) == 0 && bytes == 4))
7730 subclasses[0] = X86_64_INTEGER_CLASS;
7732 for (i = 0; i < words; i++)
7733 classes[i] = subclasses[i % num];
7735 break;
7737 case UNION_TYPE:
7738 case QUAL_UNION_TYPE:
7739 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
7741 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7743 if (TREE_CODE (field) == FIELD_DECL)
7745 int num;
7747 if (TREE_TYPE (field) == error_mark_node)
7748 continue;
7750 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7751 TREE_TYPE (field), subclasses,
7752 bit_offset);
7753 if (!num)
7754 return 0;
7755 for (i = 0; i < num && i < words; i++)
7756 classes[i] = merge_classes (subclasses[i], classes[i]);
7759 break;
7761 default:
7762 gcc_unreachable ();
7765 if (words > 2)
7767 /* When size > 16 bytes, if the first class isn't
7768 X86_64_SSE_CLASS or any of the others isn't
7769 X86_64_SSEUP_CLASS, everything should be passed in
7770 memory. */
7771 if (classes[0] != X86_64_SSE_CLASS)
7772 return 0;
7774 for (i = 1; i < words; i++)
7775 if (classes[i] != X86_64_SSEUP_CLASS)
7776 return 0;
7779 /* Final merger cleanup. */
7780 for (i = 0; i < words; i++)
7782 /* If one class is MEMORY, everything should be passed in
7783 memory. */
7784 if (classes[i] == X86_64_MEMORY_CLASS)
7785 return 0;
7787 /* The X86_64_SSEUP_CLASS should be always preceded by
7788 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7789 if (classes[i] == X86_64_SSEUP_CLASS
7790 && classes[i - 1] != X86_64_SSE_CLASS
7791 && classes[i - 1] != X86_64_SSEUP_CLASS)
7793 /* The first one should never be X86_64_SSEUP_CLASS. */
7794 gcc_assert (i != 0);
7795 classes[i] = X86_64_SSE_CLASS;
7798 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7799 everything should be passed in memory. */
7800 if (classes[i] == X86_64_X87UP_CLASS
7801 && (classes[i - 1] != X86_64_X87_CLASS))
7803 static bool warned;
7805 /* The first one should never be X86_64_X87UP_CLASS. */
7806 gcc_assert (i != 0);
7807 if (!warned && warn_psabi)
7809 warned = true;
7810 inform (input_location,
7811 "the ABI of passing union with long double"
7812 " has changed in GCC 4.4");
7814 return 0;
7817 return words;
7820 /* Compute alignment needed. We align all types to natural boundaries with
7821 the exception of XFmode, which is aligned to 64 bits. */
7822 if (mode != VOIDmode && mode != BLKmode)
7824 int mode_alignment = GET_MODE_BITSIZE (mode);
7826 if (mode == XFmode)
7827 mode_alignment = 128;
7828 else if (mode == XCmode)
7829 mode_alignment = 256;
7830 if (COMPLEX_MODE_P (mode))
7831 mode_alignment /= 2;
7832 /* Misaligned fields are always returned in memory. */
7833 if (bit_offset % mode_alignment)
7834 return 0;
7837 /* for V1xx modes, just use the base mode */
7838 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7839 && GET_MODE_UNIT_SIZE (mode) == bytes)
7840 mode = GET_MODE_INNER (mode);
7842 /* Classification of atomic types. */
7843 switch (mode)
7845 case E_SDmode:
7846 case E_DDmode:
7847 classes[0] = X86_64_SSE_CLASS;
7848 return 1;
7849 case E_TDmode:
7850 classes[0] = X86_64_SSE_CLASS;
7851 classes[1] = X86_64_SSEUP_CLASS;
7852 return 2;
7853 case E_DImode:
7854 case E_SImode:
7855 case E_HImode:
7856 case E_QImode:
7857 case E_CSImode:
7858 case E_CHImode:
7859 case E_CQImode:
7861 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7863 /* Analyze last 128 bits only. */
7864 size = (size - 1) & 0x7f;
7866 if (size < 32)
7868 classes[0] = X86_64_INTEGERSI_CLASS;
7869 return 1;
7871 else if (size < 64)
7873 classes[0] = X86_64_INTEGER_CLASS;
7874 return 1;
7876 else if (size < 64+32)
7878 classes[0] = X86_64_INTEGER_CLASS;
7879 classes[1] = X86_64_INTEGERSI_CLASS;
7880 return 2;
7882 else if (size < 64+64)
7884 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7885 return 2;
7887 else
7888 gcc_unreachable ();
7890 case E_CDImode:
7891 case E_TImode:
7892 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7893 return 2;
7894 case E_COImode:
7895 case E_OImode:
7896 /* OImode shouldn't be used directly. */
7897 gcc_unreachable ();
7898 case E_CTImode:
7899 return 0;
7900 case E_SFmode:
7901 if (!(bit_offset % 64))
7902 classes[0] = X86_64_SSESF_CLASS;
7903 else
7904 classes[0] = X86_64_SSE_CLASS;
7905 return 1;
7906 case E_DFmode:
7907 classes[0] = X86_64_SSEDF_CLASS;
7908 return 1;
7909 case E_XFmode:
7910 classes[0] = X86_64_X87_CLASS;
7911 classes[1] = X86_64_X87UP_CLASS;
7912 return 2;
7913 case E_TFmode:
7914 classes[0] = X86_64_SSE_CLASS;
7915 classes[1] = X86_64_SSEUP_CLASS;
7916 return 2;
7917 case E_SCmode:
7918 classes[0] = X86_64_SSE_CLASS;
7919 if (!(bit_offset % 64))
7920 return 1;
7921 else
7923 static bool warned;
7925 if (!warned && warn_psabi)
7927 warned = true;
7928 inform (input_location,
7929 "the ABI of passing structure with complex float"
7930 " member has changed in GCC 4.4");
7932 classes[1] = X86_64_SSESF_CLASS;
7933 return 2;
7935 case E_DCmode:
7936 classes[0] = X86_64_SSEDF_CLASS;
7937 classes[1] = X86_64_SSEDF_CLASS;
7938 return 2;
7939 case E_XCmode:
7940 classes[0] = X86_64_COMPLEX_X87_CLASS;
7941 return 1;
7942 case E_TCmode:
7943 /* This mode is larger than 16 bytes. */
7944 return 0;
7945 case E_V8SFmode:
7946 case E_V8SImode:
7947 case E_V32QImode:
7948 case E_V16HImode:
7949 case E_V4DFmode:
7950 case E_V4DImode:
7951 classes[0] = X86_64_SSE_CLASS;
7952 classes[1] = X86_64_SSEUP_CLASS;
7953 classes[2] = X86_64_SSEUP_CLASS;
7954 classes[3] = X86_64_SSEUP_CLASS;
7955 return 4;
7956 case E_V8DFmode:
7957 case E_V16SFmode:
7958 case E_V8DImode:
7959 case E_V16SImode:
7960 case E_V32HImode:
7961 case E_V64QImode:
7962 classes[0] = X86_64_SSE_CLASS;
7963 classes[1] = X86_64_SSEUP_CLASS;
7964 classes[2] = X86_64_SSEUP_CLASS;
7965 classes[3] = X86_64_SSEUP_CLASS;
7966 classes[4] = X86_64_SSEUP_CLASS;
7967 classes[5] = X86_64_SSEUP_CLASS;
7968 classes[6] = X86_64_SSEUP_CLASS;
7969 classes[7] = X86_64_SSEUP_CLASS;
7970 return 8;
7971 case E_V4SFmode:
7972 case E_V4SImode:
7973 case E_V16QImode:
7974 case E_V8HImode:
7975 case E_V2DFmode:
7976 case E_V2DImode:
7977 classes[0] = X86_64_SSE_CLASS;
7978 classes[1] = X86_64_SSEUP_CLASS;
7979 return 2;
7980 case E_V1TImode:
7981 case E_V1DImode:
7982 case E_V2SFmode:
7983 case E_V2SImode:
7984 case E_V4HImode:
7985 case E_V8QImode:
7986 classes[0] = X86_64_SSE_CLASS;
7987 return 1;
7988 case E_BLKmode:
7989 case E_VOIDmode:
7990 return 0;
7991 default:
7992 gcc_assert (VECTOR_MODE_P (mode));
7994 if (bytes > 16)
7995 return 0;
7997 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
7999 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
8000 classes[0] = X86_64_INTEGERSI_CLASS;
8001 else
8002 classes[0] = X86_64_INTEGER_CLASS;
8003 classes[1] = X86_64_INTEGER_CLASS;
8004 return 1 + (bytes > 8);
8008 /* Examine the argument and return set number of register required in each
8009 class. Return true iff parameter should be passed in memory. */
8011 static bool
8012 examine_argument (machine_mode mode, const_tree type, int in_return,
8013 int *int_nregs, int *sse_nregs)
8015 enum x86_64_reg_class regclass[MAX_CLASSES];
8016 int n = classify_argument (mode, type, regclass, 0);
8018 *int_nregs = 0;
8019 *sse_nregs = 0;
8021 if (!n)
8022 return true;
8023 for (n--; n >= 0; n--)
8024 switch (regclass[n])
8026 case X86_64_INTEGER_CLASS:
8027 case X86_64_INTEGERSI_CLASS:
8028 (*int_nregs)++;
8029 break;
8030 case X86_64_SSE_CLASS:
8031 case X86_64_SSESF_CLASS:
8032 case X86_64_SSEDF_CLASS:
8033 (*sse_nregs)++;
8034 break;
8035 case X86_64_NO_CLASS:
8036 case X86_64_SSEUP_CLASS:
8037 break;
8038 case X86_64_X87_CLASS:
8039 case X86_64_X87UP_CLASS:
8040 case X86_64_COMPLEX_X87_CLASS:
8041 if (!in_return)
8042 return true;
8043 break;
8044 case X86_64_MEMORY_CLASS:
8045 gcc_unreachable ();
8048 return false;
8051 /* Construct container for the argument used by GCC interface. See
8052 FUNCTION_ARG for the detailed description. */
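/* Example (illustrative; the register names assume the first free argument
   registers): for  struct pt { double x; long id; }  construct_container
   returns a PARALLEL along the lines of

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                (expr_list (reg:DI di)   (const_int 8))])

   i.e. bytes 0-7 travel in %xmm0 and bytes 8-15 in %rdi.  */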
8054 static rtx
8055 construct_container (machine_mode mode, machine_mode orig_mode,
8056 const_tree type, int in_return, int nintregs, int nsseregs,
8057 const int *intreg, int sse_regno)
8059 /* The following variables hold the static issued_error state. */
8060 static bool issued_sse_arg_error;
8061 static bool issued_sse_ret_error;
8062 static bool issued_x87_ret_error;
8064 machine_mode tmpmode;
8065 int bytes =
8066 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8067 enum x86_64_reg_class regclass[MAX_CLASSES];
8068 int n;
8069 int i;
8070 int nexps = 0;
8071 int needed_sseregs, needed_intregs;
8072 rtx exp[MAX_CLASSES];
8073 rtx ret;
8075 n = classify_argument (mode, type, regclass, 0);
8076 if (!n)
8077 return NULL;
8078 if (examine_argument (mode, type, in_return, &needed_intregs,
8079 &needed_sseregs))
8080 return NULL;
8081 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
8082 return NULL;
8084 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
8085 some less clueful developer tries to use floating-point anyway. */
8086 if (needed_sseregs && !TARGET_SSE)
8088 if (in_return)
8090 if (!issued_sse_ret_error)
8092 error ("SSE register return with SSE disabled");
8093 issued_sse_ret_error = true;
8096 else if (!issued_sse_arg_error)
8098 error ("SSE register argument with SSE disabled");
8099 issued_sse_arg_error = true;
8101 return NULL;
8104 /* Likewise, error if the ABI requires us to return values in the
8105 x87 registers and the user specified -mno-80387. */
8106 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
8107 for (i = 0; i < n; i++)
8108 if (regclass[i] == X86_64_X87_CLASS
8109 || regclass[i] == X86_64_X87UP_CLASS
8110 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8112 if (!issued_x87_ret_error)
8114 error ("x87 register return with x87 disabled");
8115 issued_x87_ret_error = true;
8117 return NULL;
8120 /* First construct simple cases. Avoid SCmode, since we want to use
8121 a single register to pass this type. */
8122 if (n == 1 && mode != SCmode)
8123 switch (regclass[0])
8125 case X86_64_INTEGER_CLASS:
8126 case X86_64_INTEGERSI_CLASS:
8127 return gen_rtx_REG (mode, intreg[0]);
8128 case X86_64_SSE_CLASS:
8129 case X86_64_SSESF_CLASS:
8130 case X86_64_SSEDF_CLASS:
8131 if (mode != BLKmode)
8132 return gen_reg_or_parallel (mode, orig_mode,
8133 SSE_REGNO (sse_regno));
8134 break;
8135 case X86_64_X87_CLASS:
8136 case X86_64_COMPLEX_X87_CLASS:
8137 return gen_rtx_REG (mode, FIRST_STACK_REG);
8138 case X86_64_NO_CLASS:
8139 /* Zero sized array, struct or class. */
8140 return NULL;
8141 default:
8142 gcc_unreachable ();
8144 if (n == 2
8145 && regclass[0] == X86_64_SSE_CLASS
8146 && regclass[1] == X86_64_SSEUP_CLASS
8147 && mode != BLKmode)
8148 return gen_reg_or_parallel (mode, orig_mode,
8149 SSE_REGNO (sse_regno));
8150 if (n == 4
8151 && regclass[0] == X86_64_SSE_CLASS
8152 && regclass[1] == X86_64_SSEUP_CLASS
8153 && regclass[2] == X86_64_SSEUP_CLASS
8154 && regclass[3] == X86_64_SSEUP_CLASS
8155 && mode != BLKmode)
8156 return gen_reg_or_parallel (mode, orig_mode,
8157 SSE_REGNO (sse_regno));
8158 if (n == 8
8159 && regclass[0] == X86_64_SSE_CLASS
8160 && regclass[1] == X86_64_SSEUP_CLASS
8161 && regclass[2] == X86_64_SSEUP_CLASS
8162 && regclass[3] == X86_64_SSEUP_CLASS
8163 && regclass[4] == X86_64_SSEUP_CLASS
8164 && regclass[5] == X86_64_SSEUP_CLASS
8165 && regclass[6] == X86_64_SSEUP_CLASS
8166 && regclass[7] == X86_64_SSEUP_CLASS
8167 && mode != BLKmode)
8168 return gen_reg_or_parallel (mode, orig_mode,
8169 SSE_REGNO (sse_regno));
8170 if (n == 2
8171 && regclass[0] == X86_64_X87_CLASS
8172 && regclass[1] == X86_64_X87UP_CLASS)
8173 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8175 if (n == 2
8176 && regclass[0] == X86_64_INTEGER_CLASS
8177 && regclass[1] == X86_64_INTEGER_CLASS
8178 && (mode == CDImode || mode == TImode)
8179 && intreg[0] + 1 == intreg[1])
8180 return gen_rtx_REG (mode, intreg[0]);
8182 /* Otherwise figure out the entries of the PARALLEL. */
8183 for (i = 0; i < n; i++)
8185 int pos;
8187 switch (regclass[i])
8189 case X86_64_NO_CLASS:
8190 break;
8191 case X86_64_INTEGER_CLASS:
8192 case X86_64_INTEGERSI_CLASS:
8193 /* Merge TImodes on aligned occasions here too. */
8194 if (i * 8 + 8 > bytes)
8196 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8197 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8198 /* We've requested 24 bytes we
8199 don't have a mode for. Use DImode. */
8200 tmpmode = DImode;
8202 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8203 tmpmode = SImode;
8204 else
8205 tmpmode = DImode;
8206 exp [nexps++]
8207 = gen_rtx_EXPR_LIST (VOIDmode,
8208 gen_rtx_REG (tmpmode, *intreg),
8209 GEN_INT (i*8));
8210 intreg++;
8211 break;
8212 case X86_64_SSESF_CLASS:
8213 exp [nexps++]
8214 = gen_rtx_EXPR_LIST (VOIDmode,
8215 gen_rtx_REG (SFmode,
8216 SSE_REGNO (sse_regno)),
8217 GEN_INT (i*8));
8218 sse_regno++;
8219 break;
8220 case X86_64_SSEDF_CLASS:
8221 exp [nexps++]
8222 = gen_rtx_EXPR_LIST (VOIDmode,
8223 gen_rtx_REG (DFmode,
8224 SSE_REGNO (sse_regno)),
8225 GEN_INT (i*8));
8226 sse_regno++;
8227 break;
8228 case X86_64_SSE_CLASS:
8229 pos = i;
8230 switch (n)
8232 case 1:
8233 tmpmode = DImode;
8234 break;
8235 case 2:
8236 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8238 tmpmode = TImode;
8239 i++;
8241 else
8242 tmpmode = DImode;
8243 break;
8244 case 4:
8245 gcc_assert (i == 0
8246 && regclass[1] == X86_64_SSEUP_CLASS
8247 && regclass[2] == X86_64_SSEUP_CLASS
8248 && regclass[3] == X86_64_SSEUP_CLASS);
8249 tmpmode = OImode;
8250 i += 3;
8251 break;
8252 case 8:
8253 gcc_assert (i == 0
8254 && regclass[1] == X86_64_SSEUP_CLASS
8255 && regclass[2] == X86_64_SSEUP_CLASS
8256 && regclass[3] == X86_64_SSEUP_CLASS
8257 && regclass[4] == X86_64_SSEUP_CLASS
8258 && regclass[5] == X86_64_SSEUP_CLASS
8259 && regclass[6] == X86_64_SSEUP_CLASS
8260 && regclass[7] == X86_64_SSEUP_CLASS);
8261 tmpmode = XImode;
8262 i += 7;
8263 break;
8264 default:
8265 gcc_unreachable ();
8267 exp [nexps++]
8268 = gen_rtx_EXPR_LIST (VOIDmode,
8269 gen_rtx_REG (tmpmode,
8270 SSE_REGNO (sse_regno)),
8271 GEN_INT (pos*8));
8272 sse_regno++;
8273 break;
8274 default:
8275 gcc_unreachable ();
8279 /* Empty aligned struct, union or class. */
8280 if (nexps == 0)
8281 return NULL;
8283 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8284 for (i = 0; i < nexps; i++)
8285 XVECEXP (ret, 0, i) = exp [i];
8286 return ret;
8289 /* Update the data in CUM to advance over an argument of mode MODE
8290 and data type TYPE. (TYPE is null for libcalls where that information
8291 may not be available.)
8293 Return the number of integer registers advanced over. */
8295 static int
8296 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8297 const_tree type, HOST_WIDE_INT bytes,
8298 HOST_WIDE_INT words)
8300 int res = 0;
8301 bool error_p = false;
8303 if (TARGET_IAMCU)
8305 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8306 bytes in registers. */
8307 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8308 goto pass_in_reg;
8309 return res;
8312 switch (mode)
8314 default:
8315 break;
8317 case E_BLKmode:
8318 if (bytes < 0)
8319 break;
8320 /* FALLTHRU */
8322 case E_DImode:
8323 case E_SImode:
8324 case E_HImode:
8325 case E_QImode:
8326 pass_in_reg:
8327 cum->words += words;
8328 cum->nregs -= words;
8329 cum->regno += words;
8330 if (cum->nregs >= 0)
8331 res = words;
8332 if (cum->nregs <= 0)
8334 cum->nregs = 0;
8335 cfun->machine->arg_reg_available = false;
8336 cum->regno = 0;
8338 break;
8340 case E_OImode:
8341 /* OImode shouldn't be used directly. */
8342 gcc_unreachable ();
8344 case E_DFmode:
8345 if (cum->float_in_sse == -1)
8346 error_p = true;
8347 if (cum->float_in_sse < 2)
8348 break;
8349 /* FALLTHRU */
8350 case E_SFmode:
8351 if (cum->float_in_sse == -1)
8352 error_p = true;
8353 if (cum->float_in_sse < 1)
8354 break;
8355 /* FALLTHRU */
8357 case E_V8SFmode:
8358 case E_V8SImode:
8359 case E_V64QImode:
8360 case E_V32HImode:
8361 case E_V16SImode:
8362 case E_V8DImode:
8363 case E_V16SFmode:
8364 case E_V8DFmode:
8365 case E_V32QImode:
8366 case E_V16HImode:
8367 case E_V4DFmode:
8368 case E_V4DImode:
8369 case E_TImode:
8370 case E_V16QImode:
8371 case E_V8HImode:
8372 case E_V4SImode:
8373 case E_V2DImode:
8374 case E_V4SFmode:
8375 case E_V2DFmode:
8376 if (!type || !AGGREGATE_TYPE_P (type))
8378 cum->sse_words += words;
8379 cum->sse_nregs -= 1;
8380 cum->sse_regno += 1;
8381 if (cum->sse_nregs <= 0)
8383 cum->sse_nregs = 0;
8384 cum->sse_regno = 0;
8387 break;
8389 case E_V8QImode:
8390 case E_V4HImode:
8391 case E_V2SImode:
8392 case E_V2SFmode:
8393 case E_V1TImode:
8394 case E_V1DImode:
8395 if (!type || !AGGREGATE_TYPE_P (type))
8397 cum->mmx_words += words;
8398 cum->mmx_nregs -= 1;
8399 cum->mmx_regno += 1;
8400 if (cum->mmx_nregs <= 0)
8402 cum->mmx_nregs = 0;
8403 cum->mmx_regno = 0;
8406 break;
8408 if (error_p)
8410 cum->float_in_sse = 0;
8411 error ("calling %qD with SSE calling convention without "
8412 "SSE/SSE2 enabled", cum->decl);
8413 sorry ("this is a GCC bug that can be worked around by adding "
8414 "attribute used to function called");
8417 return res;
8420 static int
8421 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8422 const_tree type, HOST_WIDE_INT words, bool named)
8424 int int_nregs, sse_nregs;
8426 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
8427 if (!named && (VALID_AVX512F_REG_MODE (mode)
8428 || VALID_AVX256_REG_MODE (mode)))
8429 return 0;
8431 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8432 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8434 cum->nregs -= int_nregs;
8435 cum->sse_nregs -= sse_nregs;
8436 cum->regno += int_nregs;
8437 cum->sse_regno += sse_nregs;
8438 return int_nregs;
8440 else
8442 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8443 cum->words = ROUND_UP (cum->words, align);
8444 cum->words += words;
8445 return 0;
8449 static int
8450 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8451 HOST_WIDE_INT words)
8453 /* Otherwise, this should be passed indirect. */
8454 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8456 cum->words += words;
8457 if (cum->nregs > 0)
8459 cum->nregs -= 1;
8460 cum->regno += 1;
8461 return 1;
8463 return 0;
8466 /* Update the data in CUM to advance over an argument of mode MODE and
8467 data type TYPE. (TYPE is null for libcalls where that information
8468 may not be available.) */
8470 static void
8471 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8472 const_tree type, bool named)
8474 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8475 HOST_WIDE_INT bytes, words;
8476 int nregs;
8478 /* The argument of interrupt handler is a special case and is
8479 handled in ix86_function_arg. */
8480 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8481 return;
8483 if (mode == BLKmode)
8484 bytes = int_size_in_bytes (type);
8485 else
8486 bytes = GET_MODE_SIZE (mode);
8487 words = CEIL (bytes, UNITS_PER_WORD);
8489 if (type)
8490 mode = type_natural_mode (type, NULL, false);
8492 if ((type && POINTER_BOUNDS_TYPE_P (type))
8493 || POINTER_BOUNDS_MODE_P (mode))
8495 /* If we pass bounds in BT then just update the remaining bounds count. */
8496 if (cum->bnds_in_bt)
8498 cum->bnds_in_bt--;
8499 return;
8503 /* Update the remaining number of bounds to force. */
8503 if (cum->force_bnd_pass)
8504 cum->force_bnd_pass--;
8506 cum->bnd_regno++;
8508 return;
8511 /* The first arg not going to Bounds Tables resets this counter. */
8512 cum->bnds_in_bt = 0;
8513 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
8514 the passed and received types do not match. If bounds do not follow an
8515 unnamed arg, still pretend the required number of bounds were passed. */
8516 if (cum->force_bnd_pass)
8518 cum->bnd_regno += cum->force_bnd_pass;
8519 cum->force_bnd_pass = 0;
8522 if (TARGET_64BIT)
8524 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8526 if (call_abi == MS_ABI)
8527 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8528 else
8529 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8531 else
8532 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8534 /* For stdarg we expect bounds to be passed for each value passed
8535 in register. */
8536 if (cum->stdarg)
8537 cum->force_bnd_pass = nregs;
8538 /* For pointers passed in memory we expect bounds passed in Bounds
8539 Table. */
8540 if (!nregs)
8542 /* Track if there are outgoing arguments on stack. */
8543 if (cum->caller)
8544 cfun->machine->outgoing_args_on_stack = true;
8546 cum->bnds_in_bt = chkp_type_bounds_count (type);
8550 /* Define where to put the arguments to a function.
8551 Value is zero to push the argument on the stack,
8552 or a hard register in which to store the argument.
8554 MODE is the argument's machine mode.
8555 TYPE is the data type of the argument (as a tree).
8556 This is null for libcalls where that information may
8557 not be available.
8558 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8559 the preceding args and about the function being called.
8560 NAMED is nonzero if this argument is a named parameter
8561 (otherwise it is an extra parameter matching an ellipsis). */
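/* Example (illustrative): with  __attribute__ ((fastcall))  the first two
   SImode-or-smaller scalar arguments come back from this function as
   (reg:SI cx) and (reg:SI dx); DImode values, aggregates and any further
   arguments fall through below and end up on the stack.  */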
8563 static rtx
8564 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8565 machine_mode orig_mode, const_tree type,
8566 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8568 bool error_p = false;
8570 /* Avoid the AL settings for the Unix64 ABI. */
8571 if (mode == VOIDmode)
8572 return constm1_rtx;
8574 if (TARGET_IAMCU)
8576 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8577 bytes in registers. */
8578 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8579 goto pass_in_reg;
8580 return NULL_RTX;
8583 switch (mode)
8585 default:
8586 break;
8588 case E_BLKmode:
8589 if (bytes < 0)
8590 break;
8591 /* FALLTHRU */
8592 case E_DImode:
8593 case E_SImode:
8594 case E_HImode:
8595 case E_QImode:
8596 pass_in_reg:
8597 if (words <= cum->nregs)
8599 int regno = cum->regno;
8601 /* Fastcall allocates the first two DWORD (SImode) or
8602 smaller arguments to ECX and EDX if the argument isn't an
8603 aggregate type. */
8604 if (cum->fastcall)
8606 if (mode == BLKmode
8607 || mode == DImode
8608 || (type && AGGREGATE_TYPE_P (type)))
8609 break;
8611 /* ECX not EAX is the first allocated register. */
8612 if (regno == AX_REG)
8613 regno = CX_REG;
8615 return gen_rtx_REG (mode, regno);
8617 break;
8619 case E_DFmode:
8620 if (cum->float_in_sse == -1)
8621 error_p = true;
8622 if (cum->float_in_sse < 2)
8623 break;
8624 /* FALLTHRU */
8625 case E_SFmode:
8626 if (cum->float_in_sse == -1)
8627 error_p = true;
8628 if (cum->float_in_sse < 1)
8629 break;
8630 /* FALLTHRU */
8631 case E_TImode:
8632 /* In 32bit, we pass TImode in xmm registers. */
8633 case E_V16QImode:
8634 case E_V8HImode:
8635 case E_V4SImode:
8636 case E_V2DImode:
8637 case E_V4SFmode:
8638 case E_V2DFmode:
8639 if (!type || !AGGREGATE_TYPE_P (type))
8641 if (cum->sse_nregs)
8642 return gen_reg_or_parallel (mode, orig_mode,
8643 cum->sse_regno + FIRST_SSE_REG);
8645 break;
8647 case E_OImode:
8648 case E_XImode:
8649 /* OImode and XImode shouldn't be used directly. */
8650 gcc_unreachable ();
8652 case E_V64QImode:
8653 case E_V32HImode:
8654 case E_V16SImode:
8655 case E_V8DImode:
8656 case E_V16SFmode:
8657 case E_V8DFmode:
8658 case E_V8SFmode:
8659 case E_V8SImode:
8660 case E_V32QImode:
8661 case E_V16HImode:
8662 case E_V4DFmode:
8663 case E_V4DImode:
8664 if (!type || !AGGREGATE_TYPE_P (type))
8666 if (cum->sse_nregs)
8667 return gen_reg_or_parallel (mode, orig_mode,
8668 cum->sse_regno + FIRST_SSE_REG);
8670 break;
8672 case E_V8QImode:
8673 case E_V4HImode:
8674 case E_V2SImode:
8675 case E_V2SFmode:
8676 case E_V1TImode:
8677 case E_V1DImode:
8678 if (!type || !AGGREGATE_TYPE_P (type))
8680 if (cum->mmx_nregs)
8681 return gen_reg_or_parallel (mode, orig_mode,
8682 cum->mmx_regno + FIRST_MMX_REG);
8684 break;
8686 if (error_p)
8688 cum->float_in_sse = 0;
8689 error ("calling %qD with SSE calling convention without "
8690 "SSE/SSE2 enabled", cum->decl);
8691 sorry ("this is a GCC bug that can be worked around by adding "
8692 "attribute used to function called");
8695 return NULL_RTX;
8698 static rtx
8699 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8700 machine_mode orig_mode, const_tree type, bool named)
8702 /* Handle a hidden AL argument containing number of registers
8703 for varargs x86-64 functions. */
8704 if (mode == VOIDmode)
8705 return GEN_INT (cum->maybe_vaarg
8706 ? (cum->sse_nregs < 0
8707 ? X86_64_SSE_REGPARM_MAX
8708 : cum->sse_regno)
8709 : -1);
8711 switch (mode)
8713 default:
8714 break;
8716 case E_V8SFmode:
8717 case E_V8SImode:
8718 case E_V32QImode:
8719 case E_V16HImode:
8720 case E_V4DFmode:
8721 case E_V4DImode:
8722 case E_V16SFmode:
8723 case E_V16SImode:
8724 case E_V64QImode:
8725 case E_V32HImode:
8726 case E_V8DFmode:
8727 case E_V8DImode:
8728 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
8729 if (!named)
8730 return NULL;
8731 break;
8734 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8735 cum->sse_nregs,
8736 &x86_64_int_parameter_registers [cum->regno],
8737 cum->sse_regno);
8740 static rtx
8741 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8742 machine_mode orig_mode, bool named,
8743 HOST_WIDE_INT bytes)
8745 unsigned int regno;
8747 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
8748 We use the value -2 to specify that the current function call is MS ABI. */
8749 if (mode == VOIDmode)
8750 return GEN_INT (-2);
8752 /* If we've run out of registers, it goes on the stack. */
8753 if (cum->nregs == 0)
8754 return NULL_RTX;
8756 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8758 /* Only floating point modes are passed in anything but integer regs. */
8759 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8761 if (named)
8762 regno = cum->regno + FIRST_SSE_REG;
8763 else
8765 rtx t1, t2;
8767 /* Unnamed floating parameters are passed in both the
8768 SSE and integer registers. */
8769 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8770 t2 = gen_rtx_REG (mode, regno);
8771 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8772 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8773 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8776 /* Handle aggregated types passed in register. */
8777 if (orig_mode == BLKmode)
8779 if (bytes > 0 && bytes <= 8)
8780 mode = (bytes > 4 ? DImode : SImode);
8781 if (mode == BLKmode)
8782 mode = DImode;
8785 return gen_reg_or_parallel (mode, orig_mode, regno);
8788 /* Return where to put the arguments to a function.
8789 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8791 MODE is the argument's machine mode. TYPE is the data type of the
8792 argument. It is null for libcalls where that information may not be
8793 available. CUM gives information about the preceding args and about
8794 the function being called. NAMED is nonzero if this argument is a
8795 named parameter (otherwise it is an extra parameter matching an
8796 ellipsis). */
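/* Example (illustrative): under the SysV x86-64 ABI, for a call such as
   f (1, 2.5) this hook returns (reg:SI di) for the int argument and
   (reg:DF xmm0) for the double, while a 32-byte by-value struct yields
   NULL_RTX and is therefore passed on the stack.  */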
8798 static rtx
8799 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8800 const_tree type, bool named)
8802 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8803 machine_mode mode = omode;
8804 HOST_WIDE_INT bytes, words;
8805 rtx arg;
8807 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8809 gcc_assert (type != NULL_TREE);
8810 if (POINTER_TYPE_P (type))
8812 /* This is the pointer argument. */
8813 gcc_assert (TYPE_MODE (type) == Pmode);
8814 /* It is at -WORD(AP) in the current frame in interrupt and
8815 exception handlers. */
8816 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8818 else
8820 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8821 && TREE_CODE (type) == INTEGER_TYPE
8822 && TYPE_MODE (type) == word_mode);
8823 /* The error code is the word-mode integer argument at
8824 -2 * WORD(AP) in the current frame of the exception
8825 handler. */
8826 arg = gen_rtx_MEM (word_mode,
8827 plus_constant (Pmode,
8828 arg_pointer_rtx,
8829 -2 * UNITS_PER_WORD));
8831 return arg;
8834 /* All pointer bounds arguments are handled separately here. */
8835 if ((type && POINTER_BOUNDS_TYPE_P (type))
8836 || POINTER_BOUNDS_MODE_P (mode))
8838 /* Return NULL if bounds are forced to go in Bounds Table. */
8839 if (cum->bnds_in_bt)
8840 arg = NULL;
8841 /* Return the next available bound reg if any. */
8842 else if (cum->bnd_regno <= LAST_BND_REG)
8843 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8844 /* Return the next special slot number otherwise. */
8845 else
8846 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8848 return arg;
8851 if (mode == BLKmode)
8852 bytes = int_size_in_bytes (type);
8853 else
8854 bytes = GET_MODE_SIZE (mode);
8855 words = CEIL (bytes, UNITS_PER_WORD);
8857 /* To simplify the code below, represent vector types with a vector mode
8858 even if MMX/SSE are not active. */
8859 if (type && TREE_CODE (type) == VECTOR_TYPE)
8860 mode = type_natural_mode (type, cum, false);
8862 if (TARGET_64BIT)
8864 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8866 if (call_abi == MS_ABI)
8867 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8868 else
8869 arg = function_arg_64 (cum, mode, omode, type, named);
8871 else
8872 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8874 /* Track if there are outgoing arguments on stack. */
8875 if (arg == NULL_RTX && cum->caller)
8876 cfun->machine->outgoing_args_on_stack = true;
8878 return arg;
8881 /* A C expression that indicates when an argument must be passed by
8882 reference. If nonzero for an argument, a copy of that argument is
8883 made in memory and a pointer to the argument is passed instead of
8884 the argument itself. The pointer is passed in whatever way is
8885 appropriate for passing a pointer to that type. */
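/* Example (illustrative): under the MS x64 ABI a 12-byte struct (size not
   1, 2, 4 or 8) and a 16-byte __m128 are passed by reference, whereas
   under the SysV x86-64 ABI only variably-sized types are; everything
   else is passed by value, possibly in memory.  */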
8887 static bool
8888 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8889 const_tree type, bool)
8891 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8893 /* Bounds are never passed by reference. */
8894 if ((type && POINTER_BOUNDS_TYPE_P (type))
8895 || POINTER_BOUNDS_MODE_P (mode))
8896 return false;
8898 if (TARGET_64BIT)
8900 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8902 /* See Windows x64 Software Convention. */
8903 if (call_abi == MS_ABI)
8905 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8907 if (type)
8909 /* Arrays are passed by reference. */
8910 if (TREE_CODE (type) == ARRAY_TYPE)
8911 return true;
8913 if (RECORD_OR_UNION_TYPE_P (type))
8915 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8916 are passed by reference. */
8917 msize = int_size_in_bytes (type);
8921 /* __m128 is passed by reference. */
8922 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8924 else if (type && int_size_in_bytes (type) == -1)
8925 return true;
8928 return false;
8931 /* Return true when TYPE should be 128bit aligned for 32bit argument
8932 passing ABI. XXX: This function is obsolete and is only used for
8933 checking psABI compatibility with previous versions of GCC. */
8935 static bool
8936 ix86_compat_aligned_value_p (const_tree type)
8938 machine_mode mode = TYPE_MODE (type);
8939 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8940 || mode == TDmode
8941 || mode == TFmode
8942 || mode == TCmode)
8943 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8944 return true;
8945 if (TYPE_ALIGN (type) < 128)
8946 return false;
8948 if (AGGREGATE_TYPE_P (type))
8950 /* Walk the aggregates recursively. */
8951 switch (TREE_CODE (type))
8953 case RECORD_TYPE:
8954 case UNION_TYPE:
8955 case QUAL_UNION_TYPE:
8957 tree field;
8959 /* Walk all the structure fields. */
8960 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8962 if (TREE_CODE (field) == FIELD_DECL
8963 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8964 return true;
8966 break;
8969 case ARRAY_TYPE:
8970 /* Just for use if some languages pass arrays by value. */
8971 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8972 return true;
8973 break;
8975 default:
8976 gcc_unreachable ();
8979 return false;
8982 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8983 XXX: This function is obsolete and is only used for checking psABI
8984 compatibility with previous versions of GCC. */
8986 static unsigned int
8987 ix86_compat_function_arg_boundary (machine_mode mode,
8988 const_tree type, unsigned int align)
8990 /* In 32bit, only _Decimal128 and __float128 are aligned to their
8991 natural boundaries. */
8992 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
8994 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
8995 make an exception for SSE modes since these require 128bit
8996 alignment.
8998 The handling here differs from field_alignment. ICC aligns MMX
8999 arguments to 4 byte boundaries, while structure fields are aligned
9000 to 8 byte boundaries. */
9001 if (!type)
9003 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
9004 align = PARM_BOUNDARY;
9006 else
9008 if (!ix86_compat_aligned_value_p (type))
9009 align = PARM_BOUNDARY;
9012 if (align > BIGGEST_ALIGNMENT)
9013 align = BIGGEST_ALIGNMENT;
9014 return align;
9017 /* Return true when TYPE should be 128bit aligned for 32bit argument
9018 passing ABI. */
9020 static bool
9021 ix86_contains_aligned_value_p (const_tree type)
9023 machine_mode mode = TYPE_MODE (type);
9025 if (mode == XFmode || mode == XCmode)
9026 return false;
9028 if (TYPE_ALIGN (type) < 128)
9029 return false;
9031 if (AGGREGATE_TYPE_P (type))
9033 /* Walk the aggregates recursively. */
9034 switch (TREE_CODE (type))
9036 case RECORD_TYPE:
9037 case UNION_TYPE:
9038 case QUAL_UNION_TYPE:
9040 tree field;
9042 /* Walk all the structure fields. */
9043 for (field = TYPE_FIELDS (type);
9044 field;
9045 field = DECL_CHAIN (field))
9047 if (TREE_CODE (field) == FIELD_DECL
9048 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
9049 return true;
9051 break;
9054 case ARRAY_TYPE:
9055 /* Just for use if some languages pass arrays by value. */
9056 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
9057 return true;
9058 break;
9060 default:
9061 gcc_unreachable ();
9064 else
9065 return TYPE_ALIGN (type) >= 128;
9067 return false;
9070 /* Gives the alignment boundary, in bits, of an argument with the
9071 specified mode and type. */
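/* Example (illustrative): in 32-bit mode a plain int or double argument
   gets PARM_BOUNDARY (32-bit) alignment, while an __m128 argument, or an
   aggregate containing one, keeps its 128-bit alignment; a -Wpsabi note
   may be emitted when this differs from what GCC before 4.6 used.  */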
9073 static unsigned int
9074 ix86_function_arg_boundary (machine_mode mode, const_tree type)
9076 unsigned int align;
9077 if (type)
9079 /* Since the main variant type is used for the call, convert the
9080 type to its main variant. */
9081 type = TYPE_MAIN_VARIANT (type);
9082 align = TYPE_ALIGN (type);
9083 if (TYPE_EMPTY_P (type))
9084 return PARM_BOUNDARY;
9086 else
9087 align = GET_MODE_ALIGNMENT (mode);
9088 if (align < PARM_BOUNDARY)
9089 align = PARM_BOUNDARY;
9090 else
9092 static bool warned;
9093 unsigned int saved_align = align;
9095 if (!TARGET_64BIT)
9097 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
9098 if (!type)
9100 if (mode == XFmode || mode == XCmode)
9101 align = PARM_BOUNDARY;
9103 else if (!ix86_contains_aligned_value_p (type))
9104 align = PARM_BOUNDARY;
9106 if (align < 128)
9107 align = PARM_BOUNDARY;
9110 if (warn_psabi
9111 && !warned
9112 && align != ix86_compat_function_arg_boundary (mode, type,
9113 saved_align))
9115 warned = true;
9116 inform (input_location,
9117 "The ABI for passing parameters with %d-byte"
9118 " alignment has changed in GCC 4.6",
9119 align / BITS_PER_UNIT);
9123 return align;
9126 /* Return true if N is a possible register number of function value. */
9128 static bool
9129 ix86_function_value_regno_p (const unsigned int regno)
9131 switch (regno)
9133 case AX_REG:
9134 return true;
9135 case DX_REG:
9136 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9137 case DI_REG:
9138 case SI_REG:
9139 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9141 case BND0_REG:
9142 case BND1_REG:
9143 return chkp_function_instrumented_p (current_function_decl);
9145 /* Complex values are returned in %st(0)/%st(1) pair. */
9146 case ST0_REG:
9147 case ST1_REG:
9148 /* TODO: The function should depend on current function ABI but
9149 builtins.c would need updating then. Therefore we use the
9150 default ABI. */
9151 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9152 return false;
9153 return TARGET_FLOAT_RETURNS_IN_80387;
9155 /* Complex values are returned in %xmm0/%xmm1 pair. */
9156 case XMM0_REG:
9157 case XMM1_REG:
9158 return TARGET_SSE;
9160 case MM0_REG:
9161 if (TARGET_MACHO || TARGET_64BIT)
9162 return false;
9163 return TARGET_MMX;
9166 return false;
9169 /* Define how to find the value returned by a function.
9170 VALTYPE is the data type of the value (as a tree).
9171 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9172 otherwise, FUNC is 0. */
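/* Example (illustrative): in 32-bit mode  int f (void)  returns in %eax,
   float and double return in %st(0) unless SSE math for local functions
   or the sseregparm attribute redirects them to %xmm0, and a 16-byte
   vector returns in %xmm0.  */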
9174 static rtx
9175 function_value_32 (machine_mode orig_mode, machine_mode mode,
9176 const_tree fntype, const_tree fn)
9178 unsigned int regno;
9180 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9181 we normally prevent this case when mmx is not available. However
9182 some ABIs may require the result to be returned like DImode. */
9183 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9184 regno = FIRST_MMX_REG;
9186 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9187 we prevent this case when sse is not available. However some ABIs
9188 may require the result to be returned like integer TImode. */
9189 else if (mode == TImode
9190 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9191 regno = FIRST_SSE_REG;
9193 /* 32-byte vector modes in %ymm0. */
9194 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9195 regno = FIRST_SSE_REG;
9197 /* 64-byte vector modes in %zmm0. */
9198 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9199 regno = FIRST_SSE_REG;
9201 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9202 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9203 regno = FIRST_FLOAT_REG;
9204 else
9205 /* Most things go in %eax. */
9206 regno = AX_REG;
9208 /* Override FP return register with %xmm0 for local functions when
9209 SSE math is enabled or for functions with sseregparm attribute. */
9210 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9212 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9213 if (sse_level == -1)
9215 error ("calling %qD with SSE calling convention without "
9216 "SSE/SSE2 enabled", fn);
9217 sorry ("this is a GCC bug that can be worked around by adding "
9218 "attribute used to function called");
9220 else if ((sse_level >= 1 && mode == SFmode)
9221 || (sse_level == 2 && mode == DFmode))
9222 regno = FIRST_SSE_REG;
9225 /* OImode shouldn't be used directly. */
9226 gcc_assert (mode != OImode);
9228 return gen_rtx_REG (orig_mode, regno);
9231 static rtx
9232 function_value_64 (machine_mode orig_mode, machine_mode mode,
9233 const_tree valtype)
9235 rtx ret;
9237 /* Handle libcalls, which don't provide a type node. */
9238 if (valtype == NULL)
9240 unsigned int regno;
9242 switch (mode)
9244 case E_SFmode:
9245 case E_SCmode:
9246 case E_DFmode:
9247 case E_DCmode:
9248 case E_TFmode:
9249 case E_SDmode:
9250 case E_DDmode:
9251 case E_TDmode:
9252 regno = FIRST_SSE_REG;
9253 break;
9254 case E_XFmode:
9255 case E_XCmode:
9256 regno = FIRST_FLOAT_REG;
9257 break;
9258 case E_TCmode:
9259 return NULL;
9260 default:
9261 regno = AX_REG;
9264 return gen_rtx_REG (mode, regno);
9266 else if (POINTER_TYPE_P (valtype))
9268 /* Pointers are always returned in word_mode. */
9269 mode = word_mode;
9272 ret = construct_container (mode, orig_mode, valtype, 1,
9273 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9274 x86_64_int_return_registers, 0);
9276 /* For zero sized structures, construct_container returns NULL, but we
9277 need to keep the rest of the compiler happy by returning a meaningful value. */
9278 if (!ret)
9279 ret = gen_rtx_REG (orig_mode, AX_REG);
9281 return ret;
9284 static rtx
9285 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9286 const_tree valtype)
9288 unsigned int regno = AX_REG;
9290 if (TARGET_SSE)
9292 switch (GET_MODE_SIZE (mode))
9294 case 16:
9295 if (valtype != NULL_TREE
9296 && !VECTOR_INTEGER_TYPE_P (valtype)
9298 && !INTEGRAL_TYPE_P (valtype)
9299 && !VECTOR_FLOAT_TYPE_P (valtype))
9300 break;
9301 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9302 && !COMPLEX_MODE_P (mode))
9303 regno = FIRST_SSE_REG;
9304 break;
9305 case 8:
9306 case 4:
9307 if (mode == SFmode || mode == DFmode)
9308 regno = FIRST_SSE_REG;
9309 break;
9310 default:
9311 break;
9314 return gen_rtx_REG (orig_mode, regno);
9317 static rtx
9318 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9319 machine_mode orig_mode, machine_mode mode)
9321 const_tree fn, fntype;
9323 fn = NULL_TREE;
9324 if (fntype_or_decl && DECL_P (fntype_or_decl))
9325 fn = fntype_or_decl;
9326 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9328 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9329 || POINTER_BOUNDS_MODE_P (mode))
9330 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9331 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9332 return function_value_ms_64 (orig_mode, mode, valtype);
9333 else if (TARGET_64BIT)
9334 return function_value_64 (orig_mode, mode, valtype);
9335 else
9336 return function_value_32 (orig_mode, mode, fntype, fn);
9339 static rtx
9340 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9342 machine_mode mode, orig_mode;
9344 orig_mode = TYPE_MODE (valtype);
9345 mode = type_natural_mode (valtype, NULL, true);
9346 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9349 /* Return an RTX representing a place where a function returns
9350 or receives pointer bounds, or NULL if no bounds are returned.
9352 VALTYPE is a data type of a value returned by the function.
9354 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9355 or FUNCTION_TYPE of the function.
9357 If OUTGOING is false, return a place in which the caller will
9358 see the return value. Otherwise, return a place where a
9359 function returns a value. */
9361 static rtx
9362 ix86_function_value_bounds (const_tree valtype,
9363 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9364 bool outgoing ATTRIBUTE_UNUSED)
9366 rtx res = NULL_RTX;
9368 if (BOUNDED_TYPE_P (valtype))
9369 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9370 else if (chkp_type_has_pointer (valtype))
9372 bitmap slots;
9373 rtx bounds[2];
9374 bitmap_iterator bi;
9375 unsigned i, bnd_no = 0;
9377 bitmap_obstack_initialize (NULL);
9378 slots = BITMAP_ALLOC (NULL);
9379 chkp_find_bound_slots (valtype, slots);
9381 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9383 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9384 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9385 gcc_assert (bnd_no < 2);
9386 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9389 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9391 BITMAP_FREE (slots);
9392 bitmap_obstack_release (NULL);
9394 else
9395 res = NULL_RTX;
9397 return res;
9400 /* Pointer function arguments and return values are promoted to
9401 word_mode for normal functions. */
9403 static machine_mode
9404 ix86_promote_function_mode (const_tree type, machine_mode mode,
9405 int *punsignedp, const_tree fntype,
9406 int for_return)
9408 if (cfun->machine->func_type == TYPE_NORMAL
9409 && type != NULL_TREE
9410 && POINTER_TYPE_P (type))
9412 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9413 return word_mode;
9415 return default_promote_function_mode (type, mode, punsignedp, fntype,
9416 for_return);
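/* Illustrative note (not from the original sources): this promotion mostly
   matters for -mx32, where ptr_mode is SImode but word_mode is DImode.  A
   pointer parameter such as

     void f (int *p);

   is then passed zero-extended in a 64-bit register, per
   POINTERS_EXTEND_UNSIGNED above, while interrupt and exception handlers
   (func_type != TYPE_NORMAL) fall through to the default promotion.  */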
9419 /* Return true if a structure, union or array with MODE containing FIELD
9420 should be accessed using BLKmode. */
9422 static bool
9423 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9425 /* Union with XFmode must be in BLKmode. */
9426 return (mode == XFmode
9427 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9428 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9431 static rtx
9432 ix86_libcall_value (machine_mode mode)
9434 return ix86_function_value_1 (NULL, NULL, mode, mode);
9437 /* Return true iff type is returned in memory. */
9439 static bool
9440 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9442 #ifdef SUBTARGET_RETURN_IN_MEMORY
9443 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9444 #else
9445 const machine_mode mode = type_natural_mode (type, NULL, true);
9446 HOST_WIDE_INT size;
9448 if (POINTER_BOUNDS_TYPE_P (type))
9449 return false;
9451 if (TARGET_64BIT)
9453 if (ix86_function_type_abi (fntype) == MS_ABI)
9455 size = int_size_in_bytes (type);
9457 /* __m128 is returned in xmm0. */
9458 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9459 || INTEGRAL_TYPE_P (type)
9460 || VECTOR_FLOAT_TYPE_P (type))
9461 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9462 && !COMPLEX_MODE_P (mode)
9463 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9464 return false;
9466 /* Otherwise, the size must be exactly in [1248]. */
9467 return size != 1 && size != 2 && size != 4 && size != 8;
9469 else
9471 int needed_intregs, needed_sseregs;
9473 return examine_argument (mode, type, 1,
9474 &needed_intregs, &needed_sseregs);
9477 else
9479 size = int_size_in_bytes (type);
9481 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9482 bytes in registers. */
9483 if (TARGET_IAMCU)
9484 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9486 if (mode == BLKmode)
9487 return true;
9489 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9490 return false;
9492 if (VECTOR_MODE_P (mode) || mode == TImode)
9494 /* User-created vectors small enough to fit in EAX. */
9495 if (size < 8)
9496 return false;
9498 /* Unless the ABI prescribes otherwise,
9499 MMX/3dNow values are returned in MM0 if available. */
9501 if (size == 8)
9502 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9504 /* SSE values are returned in XMM0 if available. */
9505 if (size == 16)
9506 return !TARGET_SSE;
9508 /* AVX values are returned in YMM0 if available. */
9509 if (size == 32)
9510 return !TARGET_AVX;
9512 /* AVX512F values are returned in ZMM0 if available. */
9513 if (size == 64)
9514 return !TARGET_AVX512F;
9517 if (mode == XFmode)
9518 return false;
9520 if (size > 12)
9521 return true;
9523 /* OImode shouldn't be used directly. */
9524 gcc_assert (mode != OImode);
9526 return false;
9528 #endif
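/* Worked examples of the decisions above (illustrative sketch, assuming the
   usual macro definitions):

     MS_ABI, 64-bit:  struct { int a, b; }     size 8  -> in RAX (not memory)
                      struct { char c[3]; }    size 3  -> in memory
                      __m128                   size 16 -> in XMM0 (not memory)
     SysV, 64-bit:    struct { long a, b, c; } size 24 -> does not fit the two
                      integer return registers, so examine_argument reports
                      memory and it is returned via a hidden pointer.  */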
9532 /* Create the va_list data type. */
9534 static tree
9535 ix86_build_builtin_va_list_64 (void)
9537 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9539 record = lang_hooks.types.make_type (RECORD_TYPE);
9540 type_decl = build_decl (BUILTINS_LOCATION,
9541 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9543 f_gpr = build_decl (BUILTINS_LOCATION,
9544 FIELD_DECL, get_identifier ("gp_offset"),
9545 unsigned_type_node);
9546 f_fpr = build_decl (BUILTINS_LOCATION,
9547 FIELD_DECL, get_identifier ("fp_offset"),
9548 unsigned_type_node);
9549 f_ovf = build_decl (BUILTINS_LOCATION,
9550 FIELD_DECL, get_identifier ("overflow_arg_area"),
9551 ptr_type_node);
9552 f_sav = build_decl (BUILTINS_LOCATION,
9553 FIELD_DECL, get_identifier ("reg_save_area"),
9554 ptr_type_node);
9556 va_list_gpr_counter_field = f_gpr;
9557 va_list_fpr_counter_field = f_fpr;
9559 DECL_FIELD_CONTEXT (f_gpr) = record;
9560 DECL_FIELD_CONTEXT (f_fpr) = record;
9561 DECL_FIELD_CONTEXT (f_ovf) = record;
9562 DECL_FIELD_CONTEXT (f_sav) = record;
9564 TYPE_STUB_DECL (record) = type_decl;
9565 TYPE_NAME (record) = type_decl;
9566 TYPE_FIELDS (record) = f_gpr;
9567 DECL_CHAIN (f_gpr) = f_fpr;
9568 DECL_CHAIN (f_fpr) = f_ovf;
9569 DECL_CHAIN (f_ovf) = f_sav;
9571 layout_type (record);
9573 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9574 NULL_TREE, TYPE_ATTRIBUTES (record));
9576 /* The correct type is an array type of one element. */
9577 return build_array_type (record, build_index_type (size_zero_node));
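/* For reference, the record built above is the familiar SysV x86-64 va_list
   layout; in C it would read roughly (illustrative, not emitted here):

     typedef struct __va_list_tag {
       unsigned int gp_offset;      // offset of next GPR slot in reg_save_area
       unsigned int fp_offset;      // offset of next SSE slot in reg_save_area
       void *overflow_arg_area;     // next stack-passed argument
       void *reg_save_area;         // start of the register save area
     } __builtin_va_list[1];        // array of one element, as returned above  */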
9580 /* Setup the builtin va_list data type and for 64-bit the additional
9581 calling convention specific va_list data types. */
9583 static tree
9584 ix86_build_builtin_va_list (void)
9586 if (TARGET_64BIT)
9588 /* Initialize ABI specific va_list builtin types.
9590 In lto1, we can encounter two va_list types:
9591 - one as a result of the type-merge across TUs, and
9592 - the one constructed here.
9593 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9594 a type identity check in canonical_va_list_type based on
9595 TYPE_MAIN_VARIANT (which we used to have) will not work.
9596 Instead, we tag each va_list_type_node with its unique attribute, and
9597 look for the attribute in the type identity check in
9598 canonical_va_list_type.
9600 Tagging sysv_va_list_type_node directly with the attribute is
9601 problematic since it's an array of one record, which will decay into a
9602 pointer to record when used as parameter (see build_va_arg comments for
9603 an example), dropping the attribute in the process. So we tag the
9604 record instead. */
9606 /* For SYSV_ABI we use an array of one record. */
9607 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9609 /* For MS_ABI we use plain pointer to argument area. */
9610 tree char_ptr_type = build_pointer_type (char_type_node);
9611 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9612 TYPE_ATTRIBUTES (char_ptr_type));
9613 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9615 return ((ix86_abi == MS_ABI)
9616 ? ms_va_list_type_node
9617 : sysv_va_list_type_node);
9619 else
9621 /* For i386 we use plain pointer to argument area. */
9622 return build_pointer_type (char_type_node);
9626 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
9628 static void
9629 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9631 rtx save_area, mem;
9632 alias_set_type set;
9633 int i, max;
9635 /* GPR size of varargs save area. */
9636 if (cfun->va_list_gpr_size)
9637 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9638 else
9639 ix86_varargs_gpr_size = 0;
9641 /* FPR size of varargs save area. We don't need it if we don't pass
9642 anything in SSE registers. */
9643 if (TARGET_SSE && cfun->va_list_fpr_size)
9644 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9645 else
9646 ix86_varargs_fpr_size = 0;
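/* Size check (under the usual SysV values X86_64_REGPARM_MAX == 6 and
   X86_64_SSE_REGPARM_MAX == 8): the GPR part is 6 * 8 = 48 bytes and the
   FPR part is 8 * 16 = 128 bytes, so a full register save area occupies
   176 bytes.  */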
9648 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9649 return;
9651 save_area = frame_pointer_rtx;
9652 set = get_varargs_alias_set ();
9654 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9655 if (max > X86_64_REGPARM_MAX)
9656 max = X86_64_REGPARM_MAX;
9658 for (i = cum->regno; i < max; i++)
9660 mem = gen_rtx_MEM (word_mode,
9661 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9662 MEM_NOTRAP_P (mem) = 1;
9663 set_mem_alias_set (mem, set);
9664 emit_move_insn (mem,
9665 gen_rtx_REG (word_mode,
9666 x86_64_int_parameter_registers[i]));
9669 if (ix86_varargs_fpr_size)
9671 machine_mode smode;
9672 rtx_code_label *label;
9673 rtx test;
9675 /* Now emit code to save SSE registers. The AX parameter contains number
9676 of SSE parameter registers used to call this function, though all we
9677 actually check here is the zero/non-zero status. */
9679 label = gen_label_rtx ();
9680 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9681 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9682 label));
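/* Note on the AX check above (explanatory sketch, not generated code): the
   SysV convention has the caller put the number of vector registers used by
   a variadic call in %al, e.g. for printf ("%f", 1.0) the caller emits

     movl	$1, %eax	# one SSE register (xmm0) carries an argument
     call	printf

   so a zero %al lets the prologue skip the comparatively expensive SSE
   register spills entirely.  */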
9684 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9685 we used movdqa (i.e. TImode) instead? Perhaps even better would
9686 be if we could determine the real mode of the data, via a hook
9687 into pass_stdarg. Ignore all that for now. */
9688 smode = V4SFmode;
9689 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9690 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9692 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9693 if (max > X86_64_SSE_REGPARM_MAX)
9694 max = X86_64_SSE_REGPARM_MAX;
9696 for (i = cum->sse_regno; i < max; ++i)
9698 mem = plus_constant (Pmode, save_area,
9699 i * 16 + ix86_varargs_gpr_size);
9700 mem = gen_rtx_MEM (smode, mem);
9701 MEM_NOTRAP_P (mem) = 1;
9702 set_mem_alias_set (mem, set);
9703 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9705 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9708 emit_label (label);
9712 static void
9713 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9715 alias_set_type set = get_varargs_alias_set ();
9716 int i;
9718 /* Reset to zero, as a sysv va_arg may have been
9719 used before. */
9720 ix86_varargs_gpr_size = 0;
9721 ix86_varargs_fpr_size = 0;
9723 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9725 rtx reg, mem;
9727 mem = gen_rtx_MEM (Pmode,
9728 plus_constant (Pmode, virtual_incoming_args_rtx,
9729 i * UNITS_PER_WORD));
9730 MEM_NOTRAP_P (mem) = 1;
9731 set_mem_alias_set (mem, set);
9733 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9734 emit_move_insn (mem, reg);
9738 static void
9739 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9740 tree type, int *, int no_rtl)
9742 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9743 CUMULATIVE_ARGS next_cum;
9744 tree fntype;
9746 /* This argument doesn't appear to be used anymore, which is good,
9747 because the old code here didn't suppress rtl generation. */
9748 gcc_assert (!no_rtl);
9750 if (!TARGET_64BIT)
9751 return;
9753 fntype = TREE_TYPE (current_function_decl);
9755 /* For varargs, we do not want to skip the dummy va_dcl argument.
9756 For stdargs, we do want to skip the last named argument. */
9757 next_cum = *cum;
9758 if (stdarg_p (fntype))
9759 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9760 true);
9762 if (cum->call_abi == MS_ABI)
9763 setup_incoming_varargs_ms_64 (&next_cum);
9764 else
9765 setup_incoming_varargs_64 (&next_cum);
9768 static void
9769 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9770 machine_mode mode,
9771 tree type,
9772 int *pretend_size ATTRIBUTE_UNUSED,
9773 int no_rtl)
9775 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9776 CUMULATIVE_ARGS next_cum;
9777 tree fntype;
9778 rtx save_area;
9779 int bnd_reg, i, max;
9781 gcc_assert (!no_rtl);
9783 /* Do nothing if we use plain pointer to argument area. */
9784 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9785 return;
9787 fntype = TREE_TYPE (current_function_decl);
9789 /* For varargs, we do not want to skip the dummy va_dcl argument.
9790 For stdargs, we do want to skip the last named argument. */
9791 next_cum = *cum;
9792 if (stdarg_p (fntype))
9793 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9794 true);
9795 save_area = frame_pointer_rtx;
9797 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9798 if (max > X86_64_REGPARM_MAX)
9799 max = X86_64_REGPARM_MAX;
9801 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9802 if (chkp_function_instrumented_p (current_function_decl))
9803 for (i = cum->regno; i < max; i++)
9805 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9806 rtx ptr = gen_rtx_REG (Pmode,
9807 x86_64_int_parameter_registers[i]);
9808 rtx bounds;
9810 if (bnd_reg <= LAST_BND_REG)
9811 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9812 else
9814 rtx ldx_addr =
9815 plus_constant (Pmode, arg_pointer_rtx,
9816 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9817 bounds = gen_reg_rtx (BNDmode);
9818 emit_insn (BNDmode == BND64mode
9819 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9820 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9823 emit_insn (BNDmode == BND64mode
9824 ? gen_bnd64_stx (addr, ptr, bounds)
9825 : gen_bnd32_stx (addr, ptr, bounds));
9827 bnd_reg++;
9832 /* Checks if TYPE is of kind va_list char *. */
9834 static bool
9835 is_va_list_char_pointer (tree type)
9837 tree canonic;
9839 /* For 32-bit it is always true. */
9840 if (!TARGET_64BIT)
9841 return true;
9842 canonic = ix86_canonical_va_list_type (type);
9843 return (canonic == ms_va_list_type_node
9844 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9847 /* Implement va_start. */
9849 static void
9850 ix86_va_start (tree valist, rtx nextarg)
9852 HOST_WIDE_INT words, n_gpr, n_fpr;
9853 tree f_gpr, f_fpr, f_ovf, f_sav;
9854 tree gpr, fpr, ovf, sav, t;
9855 tree type;
9856 rtx ovf_rtx;
9858 if (flag_split_stack
9859 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9861 unsigned int scratch_regno;
9863 /* When we are splitting the stack, we can't refer to the stack
9864 arguments using internal_arg_pointer, because they may be on
9865 the old stack. The split stack prologue will arrange to
9866 leave a pointer to the old stack arguments in a scratch
9867 register, which we here copy to a pseudo-register. The split
9868 stack prologue can't set the pseudo-register directly because
9869 it (the prologue) runs before any registers have been saved. */
9871 scratch_regno = split_stack_prologue_scratch_regno ();
9872 if (scratch_regno != INVALID_REGNUM)
9874 rtx reg;
9875 rtx_insn *seq;
9877 reg = gen_reg_rtx (Pmode);
9878 cfun->machine->split_stack_varargs_pointer = reg;
9880 start_sequence ();
9881 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9882 seq = get_insns ();
9883 end_sequence ();
9885 push_topmost_sequence ();
9886 emit_insn_after (seq, entry_of_function ());
9887 pop_topmost_sequence ();
9891 /* Only the 64-bit target needs something special. */
9892 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9894 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9895 std_expand_builtin_va_start (valist, nextarg);
9896 else
9898 rtx va_r, next;
9900 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9901 next = expand_binop (ptr_mode, add_optab,
9902 cfun->machine->split_stack_varargs_pointer,
9903 crtl->args.arg_offset_rtx,
9904 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9905 convert_move (va_r, next, 0);
9907 /* Store zero bounds for va_list. */
9908 if (chkp_function_instrumented_p (current_function_decl))
9909 chkp_expand_bounds_reset_for_mem (valist,
9910 make_tree (TREE_TYPE (valist),
9911 next));
9914 return;
9917 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9918 f_fpr = DECL_CHAIN (f_gpr);
9919 f_ovf = DECL_CHAIN (f_fpr);
9920 f_sav = DECL_CHAIN (f_ovf);
9922 valist = build_simple_mem_ref (valist);
9923 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9924 /* The following should be folded into the MEM_REF offset. */
9925 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9926 f_gpr, NULL_TREE);
9927 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9928 f_fpr, NULL_TREE);
9929 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9930 f_ovf, NULL_TREE);
9931 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9932 f_sav, NULL_TREE);
9934 /* Count number of gp and fp argument registers used. */
9935 words = crtl->args.info.words;
9936 n_gpr = crtl->args.info.regno;
9937 n_fpr = crtl->args.info.sse_regno;
9939 if (cfun->va_list_gpr_size)
9941 type = TREE_TYPE (gpr);
9942 t = build2 (MODIFY_EXPR, type,
9943 gpr, build_int_cst (type, n_gpr * 8));
9944 TREE_SIDE_EFFECTS (t) = 1;
9945 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9948 if (TARGET_SSE && cfun->va_list_fpr_size)
9950 type = TREE_TYPE (fpr);
9951 t = build2 (MODIFY_EXPR, type, fpr,
9952 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9953 TREE_SIDE_EFFECTS (t) = 1;
9954 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
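/* Worked example (illustrative): for
     void f (int a, double b, ...)
   the named arguments consume one GPR and one SSE register, so here
   n_gpr == 1 and n_fpr == 1, giving
     gp_offset = 1 * 8                           =  8
     fp_offset = 1 * 16 + 8 * X86_64_REGPARM_MAX = 64   (with REGPARM_MAX == 6)
   i.e. the first variadic GPR argument is read 8 bytes into the register
   save area and the first variadic SSE argument 64 bytes in.  */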
9957 /* Find the overflow area. */
9958 type = TREE_TYPE (ovf);
9959 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9960 ovf_rtx = crtl->args.internal_arg_pointer;
9961 else
9962 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9963 t = make_tree (type, ovf_rtx);
9964 if (words != 0)
9965 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9967 /* Store zero bounds for overflow area pointer. */
9968 if (chkp_function_instrumented_p (current_function_decl))
9969 chkp_expand_bounds_reset_for_mem (ovf, t);
9971 t = build2 (MODIFY_EXPR, type, ovf, t);
9972 TREE_SIDE_EFFECTS (t) = 1;
9973 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9975 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9977 /* Find the register save area.
9978 The function prologue saves it right above the stack frame. */
9979 type = TREE_TYPE (sav);
9980 t = make_tree (type, frame_pointer_rtx);
9981 if (!ix86_varargs_gpr_size)
9982 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
9984 /* Store zero bounds for save area pointer. */
9985 if (chkp_function_instrumented_p (current_function_decl))
9986 chkp_expand_bounds_reset_for_mem (sav, t);
9988 t = build2 (MODIFY_EXPR, type, sav, t);
9989 TREE_SIDE_EFFECTS (t) = 1;
9990 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9994 /* Implement va_arg. */
9996 static tree
9997 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9998 gimple_seq *post_p)
10000 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
10001 tree f_gpr, f_fpr, f_ovf, f_sav;
10002 tree gpr, fpr, ovf, sav, t;
10003 int size, rsize;
10004 tree lab_false, lab_over = NULL_TREE;
10005 tree addr, t2;
10006 rtx container;
10007 int indirect_p = 0;
10008 tree ptrtype;
10009 machine_mode nat_mode;
10010 unsigned int arg_boundary;
10012 /* Only the 64-bit target needs something special. */
10013 if (is_va_list_char_pointer (TREE_TYPE (valist)))
10014 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
10016 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
10017 f_fpr = DECL_CHAIN (f_gpr);
10018 f_ovf = DECL_CHAIN (f_fpr);
10019 f_sav = DECL_CHAIN (f_ovf);
10021 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
10022 valist, f_gpr, NULL_TREE);
10024 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
10025 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
10026 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
10028 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10029 if (indirect_p)
10030 type = build_pointer_type (type);
10031 size = arg_int_size_in_bytes (type);
10032 rsize = CEIL (size, UNITS_PER_WORD);
10034 nat_mode = type_natural_mode (type, NULL, false);
10035 switch (nat_mode)
10037 case E_V8SFmode:
10038 case E_V8SImode:
10039 case E_V32QImode:
10040 case E_V16HImode:
10041 case E_V4DFmode:
10042 case E_V4DImode:
10043 case E_V16SFmode:
10044 case E_V16SImode:
10045 case E_V64QImode:
10046 case E_V32HImode:
10047 case E_V8DFmode:
10048 case E_V8DImode:
10049 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
10050 if (!TARGET_64BIT_MS_ABI)
10052 container = NULL;
10053 break;
10055 /* FALLTHRU */
10057 default:
10058 container = construct_container (nat_mode, TYPE_MODE (type),
10059 type, 0, X86_64_REGPARM_MAX,
10060 X86_64_SSE_REGPARM_MAX, intreg,
10061 0);
10062 break;
10065 /* Pull the value out of the saved registers. */
10067 addr = create_tmp_var (ptr_type_node, "addr");
10069 if (container)
10071 int needed_intregs, needed_sseregs;
10072 bool need_temp;
10073 tree int_addr, sse_addr;
10075 lab_false = create_artificial_label (UNKNOWN_LOCATION);
10076 lab_over = create_artificial_label (UNKNOWN_LOCATION);
10078 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
10080 need_temp = (!REG_P (container)
10081 && ((needed_intregs && TYPE_ALIGN (type) > 64)
10082 || TYPE_ALIGN (type) > 128));
10084 /* If we are passing a structure, verify that it is a consecutive block
10085 in the register save area. If not, we need to do moves. */
10086 if (!need_temp && !REG_P (container))
10088 /* Verify that all registers are strictly consecutive */
10089 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
10091 int i;
10093 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10095 rtx slot = XVECEXP (container, 0, i);
10096 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
10097 || INTVAL (XEXP (slot, 1)) != i * 16)
10098 need_temp = true;
10101 else
10103 int i;
10105 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10107 rtx slot = XVECEXP (container, 0, i);
10108 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
10109 || INTVAL (XEXP (slot, 1)) != i * 8)
10110 need_temp = true;
10114 if (!need_temp)
10116 int_addr = addr;
10117 sse_addr = addr;
10119 else
10121 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10122 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10125 /* First ensure that we fit completely in registers. */
10126 if (needed_intregs)
10128 t = build_int_cst (TREE_TYPE (gpr),
10129 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10130 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10131 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10132 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10133 gimplify_and_add (t, pre_p);
10135 if (needed_sseregs)
10137 t = build_int_cst (TREE_TYPE (fpr),
10138 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10139 + X86_64_REGPARM_MAX * 8);
10140 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10141 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10142 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10143 gimplify_and_add (t, pre_p);
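/* Worked example of the overflow checks above (illustrative, assuming
   X86_64_REGPARM_MAX == 6): an argument needing two GPRs uses the limit
   (6 - 2 + 1) * 8 = 40, so control branches to lab_false (the stack path)
   whenever gp_offset >= 40, i.e. when fewer than two of the six register
   slots remain.  */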
10146 /* Compute index to start of area used for integer regs. */
10147 if (needed_intregs)
10149 /* int_addr = gpr + sav; */
10150 t = fold_build_pointer_plus (sav, gpr);
10151 gimplify_assign (int_addr, t, pre_p);
10153 if (needed_sseregs)
10155 /* sse_addr = fpr + sav; */
10156 t = fold_build_pointer_plus (sav, fpr);
10157 gimplify_assign (sse_addr, t, pre_p);
10159 if (need_temp)
10161 int i, prev_size = 0;
10162 tree temp = create_tmp_var (type, "va_arg_tmp");
10164 /* addr = &temp; */
10165 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10166 gimplify_assign (addr, t, pre_p);
10168 for (i = 0; i < XVECLEN (container, 0); i++)
10170 rtx slot = XVECEXP (container, 0, i);
10171 rtx reg = XEXP (slot, 0);
10172 machine_mode mode = GET_MODE (reg);
10173 tree piece_type;
10174 tree addr_type;
10175 tree daddr_type;
10176 tree src_addr, src;
10177 int src_offset;
10178 tree dest_addr, dest;
10179 int cur_size = GET_MODE_SIZE (mode);
10181 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10182 prev_size = INTVAL (XEXP (slot, 1));
10183 if (prev_size + cur_size > size)
10185 cur_size = size - prev_size;
10186 unsigned int nbits = cur_size * BITS_PER_UNIT;
10187 if (!int_mode_for_size (nbits, 1).exists (&mode))
10188 mode = QImode;
10190 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10191 if (mode == GET_MODE (reg))
10192 addr_type = build_pointer_type (piece_type);
10193 else
10194 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10195 true);
10196 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10197 true);
10199 if (SSE_REGNO_P (REGNO (reg)))
10201 src_addr = sse_addr;
10202 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10204 else
10206 src_addr = int_addr;
10207 src_offset = REGNO (reg) * 8;
10209 src_addr = fold_convert (addr_type, src_addr);
10210 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10212 dest_addr = fold_convert (daddr_type, addr);
10213 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10214 if (cur_size == GET_MODE_SIZE (mode))
10216 src = build_va_arg_indirect_ref (src_addr);
10217 dest = build_va_arg_indirect_ref (dest_addr);
10219 gimplify_assign (dest, src, pre_p);
10221 else
10223 tree copy
10224 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10225 3, dest_addr, src_addr,
10226 size_int (cur_size));
10227 gimplify_and_add (copy, pre_p);
10229 prev_size += cur_size;
10233 if (needed_intregs)
10235 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10236 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10237 gimplify_assign (gpr, t, pre_p);
10240 if (needed_sseregs)
10242 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10243 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10244 gimplify_assign (unshare_expr (fpr), t, pre_p);
10247 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10249 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10252 /* ... otherwise out of the overflow area. */
10254 /* When we align a parameter on the stack for the caller, if its
10255 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will only be
10256 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
10257 with the caller. */
10258 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10259 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10260 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10262 /* Care for on-stack alignment if needed. */
10263 if (arg_boundary <= 64 || size == 0)
10264 t = ovf;
10265 else
10267 HOST_WIDE_INT align = arg_boundary / 8;
10268 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10269 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10270 build_int_cst (TREE_TYPE (t), -align));
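/* The two statements above are the usual round-up-to-a-multiple idiom,
   addr = (ovf + align - 1) & -align.  For instance, a 32-byte aligned type
   (arg_boundary == 256, align == 32) with ovf == 0x1009 yields
   (0x1009 + 31) & ~31 = 0x1020.  */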
10273 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10274 gimplify_assign (addr, t, pre_p);
10276 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10277 gimplify_assign (unshare_expr (ovf), t, pre_p);
10279 if (container)
10280 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10282 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10283 addr = fold_convert (ptrtype, addr);
10285 if (indirect_p)
10286 addr = build_va_arg_indirect_ref (addr);
10287 return build_va_arg_indirect_ref (addr);
10290 /* Return true if OPNUM's MEM should be matched
10291 in movabs* patterns. */
10293 bool
10294 ix86_check_movabs (rtx insn, int opnum)
10296 rtx set, mem;
10298 set = PATTERN (insn);
10299 if (GET_CODE (set) == PARALLEL)
10300 set = XVECEXP (set, 0, 0);
10301 gcc_assert (GET_CODE (set) == SET);
10302 mem = XEXP (set, opnum);
10303 while (SUBREG_P (mem))
10304 mem = SUBREG_REG (mem);
10305 gcc_assert (MEM_P (mem));
10306 return volatile_ok || !MEM_VOLATILE_P (mem);
10309 /* Return false if INSN contains a MEM with a non-default address space. */
10310 bool
10311 ix86_check_no_addr_space (rtx insn)
10313 subrtx_var_iterator::array_type array;
10314 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10316 rtx x = *iter;
10317 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10318 return false;
10320 return true;
10323 /* Initialize the table of extra 80387 mathematical constants. */
10325 static void
10326 init_ext_80387_constants (void)
10328 static const char * cst[5] =
10330 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10331 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10332 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10333 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10334 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10336 int i;
10338 for (i = 0; i < 5; i++)
10340 real_from_string (&ext_80387_constants_table[i], cst[i]);
10341 /* Ensure each constant is rounded to XFmode precision. */
10342 real_convert (&ext_80387_constants_table[i],
10343 XFmode, &ext_80387_constants_table[i]);
10346 ext_80387_constants_init = 1;
10349 /* Return non-zero if the constant is something that
10350 can be loaded with a special instruction. */
10352 int
10353 standard_80387_constant_p (rtx x)
10355 machine_mode mode = GET_MODE (x);
10357 const REAL_VALUE_TYPE *r;
10359 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10360 return -1;
10362 if (x == CONST0_RTX (mode))
10363 return 1;
10364 if (x == CONST1_RTX (mode))
10365 return 2;
10367 r = CONST_DOUBLE_REAL_VALUE (x);
10369 /* For XFmode constants, try to find a special 80387 instruction when
10370 optimizing for size or on those CPUs that benefit from them. */
10371 if (mode == XFmode
10372 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10374 int i;
10376 if (! ext_80387_constants_init)
10377 init_ext_80387_constants ();
10379 for (i = 0; i < 5; i++)
10380 if (real_identical (r, &ext_80387_constants_table[i]))
10381 return i + 3;
10384 /* Load of the constant -0.0 or -1.0 will be split as
10385 fldz;fchs or fld1;fchs sequence. */
10386 if (real_isnegzero (r))
10387 return 8;
10388 if (real_identical (r, &dconstm1))
10389 return 9;
10391 return 0;
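/* Example (illustrative): when the XFmode table is consulted, a CONST_DOUBLE
   equal to pi matches ext_80387_constants_table[4], so the function returns
   4 + 3 = 7 and standard_80387_constant_opcode below maps 7 to "fldpi";
   plain 0.0 and 1.0 return 1 and 2 ("fldz" and "fld1").  */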
10394 /* Return the opcode of the special instruction to be used to load
10395 the constant X. */
10397 const char *
10398 standard_80387_constant_opcode (rtx x)
10400 switch (standard_80387_constant_p (x))
10402 case 1:
10403 return "fldz";
10404 case 2:
10405 return "fld1";
10406 case 3:
10407 return "fldlg2";
10408 case 4:
10409 return "fldln2";
10410 case 5:
10411 return "fldl2e";
10412 case 6:
10413 return "fldl2t";
10414 case 7:
10415 return "fldpi";
10416 case 8:
10417 case 9:
10418 return "#";
10419 default:
10420 gcc_unreachable ();
10424 /* Return the CONST_DOUBLE representing the 80387 constant that is
10425 loaded by the specified special instruction. The argument IDX
10426 matches the return value from standard_80387_constant_p. */
10428 rtx
10429 standard_80387_constant_rtx (int idx)
10431 int i;
10433 if (! ext_80387_constants_init)
10434 init_ext_80387_constants ();
10436 switch (idx)
10438 case 3:
10439 case 4:
10440 case 5:
10441 case 6:
10442 case 7:
10443 i = idx - 3;
10444 break;
10446 default:
10447 gcc_unreachable ();
10450 return const_double_from_real_value (ext_80387_constants_table[i],
10451 XFmode);
10454 /* Return 1 if X is all zero bits and 2 if X is all one bits
10455 in a supported SSE/AVX vector mode. */
10457 int
10458 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10460 machine_mode mode;
10462 if (!TARGET_SSE)
10463 return 0;
10465 mode = GET_MODE (x);
10467 if (x == const0_rtx || const0_operand (x, mode))
10468 return 1;
10470 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10472 /* VOIDmode integer constant, get mode from the predicate. */
10473 if (mode == VOIDmode)
10474 mode = pred_mode;
10476 switch (GET_MODE_SIZE (mode))
10478 case 64:
10479 if (TARGET_AVX512F)
10480 return 2;
10481 break;
10482 case 32:
10483 if (TARGET_AVX2)
10484 return 2;
10485 break;
10486 case 16:
10487 if (TARGET_SSE2)
10488 return 2;
10489 break;
10490 case 0:
10491 /* VOIDmode */
10492 gcc_unreachable ();
10493 default:
10494 break;
10498 return 0;
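/* Example (illustrative): a V4SImode all-ones constant vector yields 2 when
   SSE2 is available (GET_MODE_SIZE == 16), letting the move patterns
   materialize it with pcmpeqd instead of a constant-pool load; any all-zero
   vector yields 1 and is emitted as pxor/xorps.  */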
10501 /* Return the opcode of the special instruction to be used to load
10502 the constant operands[1] into operands[0]. */
10504 const char *
10505 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10507 machine_mode mode;
10508 rtx x = operands[1];
10510 gcc_assert (TARGET_SSE);
10512 mode = GET_MODE (x);
10514 if (x == const0_rtx || const0_operand (x, mode))
10516 switch (get_attr_mode (insn))
10518 case MODE_TI:
10519 if (!EXT_REX_SSE_REG_P (operands[0]))
10520 return "%vpxor\t%0, %d0";
10521 /* FALLTHRU */
10522 case MODE_XI:
10523 case MODE_OI:
10524 if (EXT_REX_SSE_REG_P (operands[0]))
10525 return (TARGET_AVX512VL
10526 ? "vpxord\t%x0, %x0, %x0"
10527 : "vpxord\t%g0, %g0, %g0");
10528 return "vpxor\t%x0, %x0, %x0";
10530 case MODE_V2DF:
10531 if (!EXT_REX_SSE_REG_P (operands[0]))
10532 return "%vxorpd\t%0, %d0";
10533 /* FALLTHRU */
10534 case MODE_V8DF:
10535 case MODE_V4DF:
10536 if (!EXT_REX_SSE_REG_P (operands[0]))
10537 return "vxorpd\t%x0, %x0, %x0";
10538 else if (TARGET_AVX512DQ)
10539 return (TARGET_AVX512VL
10540 ? "vxorpd\t%x0, %x0, %x0"
10541 : "vxorpd\t%g0, %g0, %g0");
10542 else
10543 return (TARGET_AVX512VL
10544 ? "vpxorq\t%x0, %x0, %x0"
10545 : "vpxorq\t%g0, %g0, %g0");
10547 case MODE_V4SF:
10548 if (!EXT_REX_SSE_REG_P (operands[0]))
10549 return "%vxorps\t%0, %d0";
10550 /* FALLTHRU */
10551 case MODE_V16SF:
10552 case MODE_V8SF:
10553 if (!EXT_REX_SSE_REG_P (operands[0]))
10554 return "vxorps\t%x0, %x0, %x0";
10555 else if (TARGET_AVX512DQ)
10556 return (TARGET_AVX512VL
10557 ? "vxorps\t%x0, %x0, %x0"
10558 : "vxorps\t%g0, %g0, %g0");
10559 else
10560 return (TARGET_AVX512VL
10561 ? "vpxord\t%x0, %x0, %x0"
10562 : "vpxord\t%g0, %g0, %g0");
10564 default:
10565 gcc_unreachable ();
10568 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10570 enum attr_mode insn_mode = get_attr_mode (insn);
10572 switch (insn_mode)
10574 case MODE_XI:
10575 case MODE_V8DF:
10576 case MODE_V16SF:
10577 gcc_assert (TARGET_AVX512F);
10578 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10580 case MODE_OI:
10581 case MODE_V4DF:
10582 case MODE_V8SF:
10583 gcc_assert (TARGET_AVX2);
10584 /* FALLTHRU */
10585 case MODE_TI:
10586 case MODE_V2DF:
10587 case MODE_V4SF:
10588 gcc_assert (TARGET_SSE2);
10589 if (!EXT_REX_SSE_REG_P (operands[0]))
10590 return (TARGET_AVX
10591 ? "vpcmpeqd\t%0, %0, %0"
10592 : "pcmpeqd\t%0, %0");
10593 else if (TARGET_AVX512VL)
10594 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10595 else
10596 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10598 default:
10599 gcc_unreachable ();
10603 gcc_unreachable ();
10606 /* Returns true if INSN can be transformed from a memory load
10607 to a supported FP constant load. */
10609 bool
10610 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10612 rtx src = find_constant_src (insn);
10614 gcc_assert (REG_P (dst));
10616 if (src == NULL
10617 || (SSE_REGNO_P (REGNO (dst))
10618 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10619 || (STACK_REGNO_P (REGNO (dst))
10620 && standard_80387_constant_p (src) < 1))
10621 return false;
10623 return true;
10626 /* Returns true if OP contains a symbol reference */
10628 bool
10629 symbolic_reference_mentioned_p (rtx op)
10631 const char *fmt;
10632 int i;
10634 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10635 return true;
10637 fmt = GET_RTX_FORMAT (GET_CODE (op));
10638 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10640 if (fmt[i] == 'E')
10642 int j;
10644 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10645 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10646 return true;
10649 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10650 return true;
10653 return false;
10656 /* Return true if it is appropriate to emit `ret' instructions in the
10657 body of a function. Do this only if the epilogue is simple, needing a
10658 couple of insns. Prior to reloading, we can't tell how many registers
10659 must be saved, so return false then. Return false if there is no frame
10660 marker to de-allocate. */
10662 bool
10663 ix86_can_use_return_insn_p (void)
10665 if (ix86_function_naked (current_function_decl))
10666 return false;
10668 /* Don't use `ret' instruction in interrupt handler. */
10669 if (! reload_completed
10670 || frame_pointer_needed
10671 || cfun->machine->func_type != TYPE_NORMAL)
10672 return 0;
10674 /* Don't allow more than 32k pop, since that's all we can do
10675 with one instruction. */
10676 if (crtl->args.pops_args && crtl->args.size >= 32768)
10677 return 0;
10679 struct ix86_frame &frame = cfun->machine->frame;
10680 return (frame.stack_pointer_offset == UNITS_PER_WORD
10681 && (frame.nregs + frame.nsseregs) == 0);
10684 /* Value should be nonzero if functions must have frame pointers.
10685 Zero means the frame pointer need not be set up (and parms may
10686 be accessed via the stack pointer) in functions that seem suitable. */
10688 static bool
10689 ix86_frame_pointer_required (void)
10691 /* If we accessed previous frames, then the generated code expects
10692 to be able to access the saved ebp value in our frame. */
10693 if (cfun->machine->accesses_prev_frame)
10694 return true;
10696 /* Several x86 OSes need a frame pointer for other reasons,
10697 usually pertaining to setjmp. */
10698 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10699 return true;
10701 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
10702 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10703 return true;
10705 /* Win64 SEH: very large frames need a frame pointer, as the maximum
10706 stack allocation is 4GB. */
10707 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10708 return true;
10710 /* SSE saves require a frame pointer when the stack is misaligned. */
10711 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10712 return true;
10714 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10715 turns off the frame pointer by default. Turn it back on now if
10716 we've not got a leaf function. */
10717 if (TARGET_OMIT_LEAF_FRAME_POINTER
10718 && (!crtl->is_leaf
10719 || ix86_current_function_calls_tls_descriptor))
10720 return true;
10722 if (crtl->profile && !flag_fentry)
10723 return true;
10725 return false;
10728 /* Record that the current function accesses previous call frames. */
10730 void
10731 ix86_setup_frame_addresses (void)
10733 cfun->machine->accesses_prev_frame = 1;
10736 #ifndef USE_HIDDEN_LINKONCE
10737 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10738 # define USE_HIDDEN_LINKONCE 1
10739 # else
10740 # define USE_HIDDEN_LINKONCE 0
10741 # endif
10742 #endif
10744 /* Label count used to make unique labels in call and return thunks. */
10746 static int indirectlabelno;
10748 /* True if call and return thunk functions are needed. */
10749 static bool indirect_thunk_needed = false;
10750 /* True if call and return thunk functions with the BND prefix are
10751 needed. */
10752 static bool indirect_thunk_bnd_needed = false;
10754 /* Bit masks of integer registers which contain the branch target, used
10755 by the call and return thunk functions. */
10756 static int indirect_thunks_used;
10757 /* Bit masks of integer registers which contain the branch target, used
10758 by the call and return thunk functions with the BND prefix. */
10759 static int indirect_thunks_bnd_used;
10761 #ifndef INDIRECT_LABEL
10762 # define INDIRECT_LABEL "LIND"
10763 #endif
10765 /* Fills in the label name that should be used for the indirect thunk. */
10767 static void
10768 indirect_thunk_name (char name[32], unsigned int regno,
10769 bool need_bnd_p, bool ret_p)
10771 if (regno != INVALID_REGNUM && ret_p)
10772 gcc_unreachable ();
10774 if (USE_HIDDEN_LINKONCE)
10776 const char *bnd = need_bnd_p ? "_bnd" : "";
10777 if (regno != INVALID_REGNUM)
10779 const char *reg_prefix;
10780 if (LEGACY_INT_REGNO_P (regno))
10781 reg_prefix = TARGET_64BIT ? "r" : "e";
10782 else
10783 reg_prefix = "";
10784 sprintf (name, "__x86_indirect_thunk%s_%s%s",
10785 bnd, reg_prefix, reg_names[regno]);
10787 else
10789 const char *ret = ret_p ? "return" : "indirect";
10790 sprintf (name, "__x86_%s_thunk%s", ret, bnd);
10793 else
10795 if (regno != INVALID_REGNUM)
10797 if (need_bnd_p)
10798 ASM_GENERATE_INTERNAL_LABEL (name, "LITBR", regno);
10799 else
10800 ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno);
10802 else
10804 if (ret_p)
10806 if (need_bnd_p)
10807 ASM_GENERATE_INTERNAL_LABEL (name, "LRTB", 0);
10808 else
10809 ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0);
10811 else
10813 if (need_bnd_p)
10814 ASM_GENERATE_INTERNAL_LABEL (name, "LITB", 0);
10815 else
10816 ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0);
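/* Examples of the names produced above with USE_HIDDEN_LINKONCE
   (illustrative):
     regno == AX_REG, 64-bit, !need_bnd_p  ->  "__x86_indirect_thunk_rax"
     regno == INVALID_REGNUM, !ret_p       ->  "__x86_indirect_thunk"
     regno == INVALID_REGNUM, ret_p        ->  "__x86_return_thunk"
   and with need_bnd_p the "_bnd" infix is added, e.g.
   "__x86_indirect_thunk_bnd_rax".  */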
10822 /* Output a call and return thunk for indirect branch. If BND_P is
10823 true, the BND prefix is needed. If REGNO != -1, the function
10824 address is in REGNO and the call and return thunk looks like:
10826 call L2
10827 L1:
10828 pause
10829 lfence
10830 jmp L1
10831 L2:
10832 mov %REG, (%sp)
10833 ret
10835 Otherwise, the function address is on the top of stack and the
10836 call and return thunk looks like:
10838 call L2
10839 L1:
10840 pause
10841 lfence
10842 jmp L1
10843 L2:
10844 lea WORD_SIZE(%sp), %sp
10845 ret
10846 */
10848 static void
10849 output_indirect_thunk (bool need_bnd_p, unsigned int regno)
10851 char indirectlabel1[32];
10852 char indirectlabel2[32];
10854 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL,
10855 indirectlabelno++);
10856 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL,
10857 indirectlabelno++);
10859 /* Call */
10860 if (need_bnd_p)
10861 fputs ("\tbnd call\t", asm_out_file);
10862 else
10863 fputs ("\tcall\t", asm_out_file);
10864 assemble_name_raw (asm_out_file, indirectlabel2);
10865 fputc ('\n', asm_out_file);
10867 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
10869 /* AMD and Intel CPUs each prefer a different instruction as the loop
10870 filler. Using both pause + lfence is a compromise. */
10871 fprintf (asm_out_file, "\tpause\n\tlfence\n");
10873 /* Jump. */
10874 fputs ("\tjmp\t", asm_out_file);
10875 assemble_name_raw (asm_out_file, indirectlabel1);
10876 fputc ('\n', asm_out_file);
10878 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
10880 if (regno != INVALID_REGNUM)
10882 /* MOV. */
10883 rtx xops[2];
10884 xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx);
10885 xops[1] = gen_rtx_REG (word_mode, regno);
10886 output_asm_insn ("mov\t{%1, %0|%0, %1}", xops);
10888 else
10890 /* LEA. */
10891 rtx xops[2];
10892 xops[0] = stack_pointer_rtx;
10893 xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10894 output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops);
10897 if (need_bnd_p)
10898 fputs ("\tbnd ret\n", asm_out_file);
10899 else
10900 fputs ("\tret\n", asm_out_file);
10903 /* Output a function with a call and return thunk for indirect branch.
10904 If BND_P is true, the BND prefix is needed. If REGNO != INVALID_REGNUM,
10905 the function address is in REGNO. Otherwise, the function address is
10906 on the top of stack. */
10908 static void
10909 output_indirect_thunk_function (bool need_bnd_p, unsigned int regno)
10911 char name[32];
10912 tree decl;
10914 /* Create __x86_indirect_thunk/__x86_indirect_thunk_bnd. */
10915 indirect_thunk_name (name, regno, need_bnd_p, false);
10916 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10917 get_identifier (name),
10918 build_function_type_list (void_type_node, NULL_TREE));
10919 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10920 NULL_TREE, void_type_node);
10921 TREE_PUBLIC (decl) = 1;
10922 TREE_STATIC (decl) = 1;
10923 DECL_IGNORED_P (decl) = 1;
10925 #if TARGET_MACHO
10926 if (TARGET_MACHO)
10928 switch_to_section (darwin_sections[picbase_thunk_section]);
10929 fputs ("\t.weak_definition\t", asm_out_file);
10930 assemble_name (asm_out_file, name);
10931 fputs ("\n\t.private_extern\t", asm_out_file);
10932 assemble_name (asm_out_file, name);
10933 putc ('\n', asm_out_file);
10934 ASM_OUTPUT_LABEL (asm_out_file, name);
10935 DECL_WEAK (decl) = 1;
10937 else
10938 #endif
10939 if (USE_HIDDEN_LINKONCE)
10941 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10943 targetm.asm_out.unique_section (decl, 0);
10944 switch_to_section (get_named_section (decl, NULL, 0));
10946 targetm.asm_out.globalize_label (asm_out_file, name);
10947 fputs ("\t.hidden\t", asm_out_file);
10948 assemble_name (asm_out_file, name);
10949 putc ('\n', asm_out_file);
10950 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10952 else
10954 switch_to_section (text_section);
10955 ASM_OUTPUT_LABEL (asm_out_file, name);
10958 if (regno == INVALID_REGNUM)
10960 /* Create alias for __x86_return_thunk/__x86_return_thunk_bnd. */
10961 char alias[32];
10963 indirect_thunk_name (alias, regno, need_bnd_p, true);
10964 #if TARGET_MACHO
10965 if (TARGET_MACHO)
10967 fputs ("\t.weak_definition\t", asm_out_file);
10968 assemble_name (asm_out_file, alias);
10969 fputs ("\n\t.private_extern\t", asm_out_file);
10970 assemble_name (asm_out_file, alias);
10971 putc ('\n', asm_out_file);
10972 ASM_OUTPUT_LABEL (asm_out_file, alias);
10974 #else
10975 ASM_OUTPUT_DEF (asm_out_file, alias, name);
10976 if (USE_HIDDEN_LINKONCE)
10978 fputs ("\t.globl\t", asm_out_file);
10979 assemble_name (asm_out_file, alias);
10980 putc ('\n', asm_out_file);
10981 fputs ("\t.hidden\t", asm_out_file);
10982 assemble_name (asm_out_file, alias);
10983 putc ('\n', asm_out_file);
10985 #endif
10988 DECL_INITIAL (decl) = make_node (BLOCK);
10989 current_function_decl = decl;
10990 allocate_struct_function (decl, false);
10991 init_function_start (decl);
10992 /* We're about to hide the function body from callees of final_* by
10993 emitting it directly; tell them we're a thunk, if they care. */
10994 cfun->is_thunk = true;
10995 first_function_block_is_cold = false;
10996 /* Make sure unwind info is emitted for the thunk if needed. */
10997 final_start_function (emit_barrier (), asm_out_file, 1);
10999 output_indirect_thunk (need_bnd_p, regno);
11001 final_end_function ();
11002 init_insn_lengths ();
11003 free_after_compilation (cfun);
11004 set_cfun (NULL);
11005 current_function_decl = NULL;
11008 static int pic_labels_used;
11010 /* Fills in the label name that should be used for a pc thunk for
11011 the given register. */
11013 static void
11014 get_pc_thunk_name (char name[32], unsigned int regno)
11016 gcc_assert (!TARGET_64BIT);
11018 if (USE_HIDDEN_LINKONCE)
11019 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
11020 else
11021 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
11025 /* This function generates code for -fpic that loads %ebx with
11026 the return address of the caller and then returns. */
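/* Illustrative shape of the thunk emitted below for %ebx (the body comes
   from the mov/ret sequence further down):

     __x86.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret

   The caller performs "call __x86.get_pc_thunk.bx", so the thunk copies the
   return address, i.e. the caller's PC, into %ebx.  */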
11028 static void
11029 ix86_code_end (void)
11031 rtx xops[2];
11032 unsigned int regno;
11034 if (indirect_thunk_needed)
11035 output_indirect_thunk_function (false, INVALID_REGNUM);
11036 if (indirect_thunk_bnd_needed)
11037 output_indirect_thunk_function (true, INVALID_REGNUM);
11039 for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++)
11041 unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1;
11042 if ((indirect_thunks_used & (1 << i)))
11043 output_indirect_thunk_function (false, regno);
11045 if ((indirect_thunks_bnd_used & (1 << i)))
11046 output_indirect_thunk_function (true, regno);
11049 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
11051 char name[32];
11052 tree decl;
11054 if ((indirect_thunks_used & (1 << regno)))
11055 output_indirect_thunk_function (false, regno);
11057 if ((indirect_thunks_bnd_used & (1 << regno)))
11058 output_indirect_thunk_function (true, regno);
11060 if (!(pic_labels_used & (1 << regno)))
11061 continue;
11063 get_pc_thunk_name (name, regno);
11065 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11066 get_identifier (name),
11067 build_function_type_list (void_type_node, NULL_TREE));
11068 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11069 NULL_TREE, void_type_node);
11070 TREE_PUBLIC (decl) = 1;
11071 TREE_STATIC (decl) = 1;
11072 DECL_IGNORED_P (decl) = 1;
11074 #if TARGET_MACHO
11075 if (TARGET_MACHO)
11077 switch_to_section (darwin_sections[picbase_thunk_section]);
11078 fputs ("\t.weak_definition\t", asm_out_file);
11079 assemble_name (asm_out_file, name);
11080 fputs ("\n\t.private_extern\t", asm_out_file);
11081 assemble_name (asm_out_file, name);
11082 putc ('\n', asm_out_file);
11083 ASM_OUTPUT_LABEL (asm_out_file, name);
11084 DECL_WEAK (decl) = 1;
11086 else
11087 #endif
11088 if (USE_HIDDEN_LINKONCE)
11090 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11092 targetm.asm_out.unique_section (decl, 0);
11093 switch_to_section (get_named_section (decl, NULL, 0));
11095 targetm.asm_out.globalize_label (asm_out_file, name);
11096 fputs ("\t.hidden\t", asm_out_file);
11097 assemble_name (asm_out_file, name);
11098 putc ('\n', asm_out_file);
11099 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11101 else
11103 switch_to_section (text_section);
11104 ASM_OUTPUT_LABEL (asm_out_file, name);
11107 DECL_INITIAL (decl) = make_node (BLOCK);
11108 current_function_decl = decl;
11109 allocate_struct_function (decl, false);
11110 init_function_start (decl);
11111 /* We're about to hide the function body from callees of final_* by
11112 emitting it directly; tell them we're a thunk, if they care. */
11113 cfun->is_thunk = true;
11114 first_function_block_is_cold = false;
11115 /* Make sure unwind info is emitted for the thunk if needed. */
11116 final_start_function (emit_barrier (), asm_out_file, 1);
11118 /* Pad stack IP move with 4 instructions (two NOPs count
11119 as one instruction). */
11120 if (TARGET_PAD_SHORT_FUNCTION)
11122 int i = 8;
11124 while (i--)
11125 fputs ("\tnop\n", asm_out_file);
11128 xops[0] = gen_rtx_REG (Pmode, regno);
11129 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11130 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11131 output_asm_insn ("%!ret", NULL);
11132 final_end_function ();
11133 init_insn_lengths ();
11134 free_after_compilation (cfun);
11135 set_cfun (NULL);
11136 current_function_decl = NULL;
11139 if (flag_split_stack)
11140 file_end_indicate_split_stack ();
11143 /* Emit code for the SET_GOT patterns. */
11145 const char *
11146 output_set_got (rtx dest, rtx label)
11148 rtx xops[3];
11150 xops[0] = dest;
11152 if (TARGET_VXWORKS_RTP && flag_pic)
11154 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11155 xops[2] = gen_rtx_MEM (Pmode,
11156 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11157 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11159 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11160 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11161 an unadorned address. */
11162 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11163 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11164 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11165 return "";
11168 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11170 if (flag_pic)
11172 char name[32];
11173 get_pc_thunk_name (name, REGNO (dest));
11174 pic_labels_used |= 1 << REGNO (dest);
11176 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11177 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11178 output_asm_insn ("%!call\t%X2", xops);
11180 #if TARGET_MACHO
11181 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11182 This is what will be referenced by the Mach-O PIC subsystem. */
11183 if (machopic_should_output_picbase_label () || !label)
11184 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11186 /* When we are restoring the pic base at the site of a nonlocal label,
11187 and we decided to emit the pic base above, we will still output a
11188 local label used for calculating the correction offset (even though
11189 the offset will be 0 in that case). */
11190 if (label)
11191 targetm.asm_out.internal_label (asm_out_file, "L",
11192 CODE_LABEL_NUMBER (label));
11193 #endif
11195 else
11197 if (TARGET_MACHO)
11198 /* We don't need a pic base, we're not producing pic. */
11199 gcc_unreachable ();
11201 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11202 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11203 targetm.asm_out.internal_label (asm_out_file, "L",
11204 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11207 if (!TARGET_MACHO)
11208 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11210 return "";
11213 /* Generate a "push" pattern for input ARG. */
11215 static rtx
11216 gen_push (rtx arg)
11218 struct machine_function *m = cfun->machine;
11220 if (m->fs.cfa_reg == stack_pointer_rtx)
11221 m->fs.cfa_offset += UNITS_PER_WORD;
11222 m->fs.sp_offset += UNITS_PER_WORD;
11224 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11225 arg = gen_rtx_REG (word_mode, REGNO (arg));
11227 return gen_rtx_SET (gen_rtx_MEM (word_mode,
11228 gen_rtx_PRE_DEC (Pmode,
11229 stack_pointer_rtx)),
11230 arg);
11233 /* Generate a "pop" pattern for input ARG. */
11235 static rtx
11236 gen_pop (rtx arg)
11238 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11239 arg = gen_rtx_REG (word_mode, REGNO (arg));
11241 return gen_rtx_SET (arg,
11242 gen_rtx_MEM (word_mode,
11243 gen_rtx_POST_INC (Pmode,
11244 stack_pointer_rtx)));
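/* The two helpers above build the canonical push/pop RTL; for example, on a
   64-bit target a push of %rbx is, as a sketch,

     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bx))

   and the matching pop is

     (set (reg:DI bx) (mem:DI (post_inc:DI (reg:DI sp))))

   with gen_push also book-keeping the CFA and stack pointer offsets.  */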
11247 /* Return >= 0 if there is an unused call-clobbered register available
11248 for the entire function. */
11250 static unsigned int
11251 ix86_select_alt_pic_regnum (void)
11253 if (ix86_use_pseudo_pic_reg ())
11254 return INVALID_REGNUM;
11256 if (crtl->is_leaf
11257 && !crtl->profile
11258 && !ix86_current_function_calls_tls_descriptor)
11260 int i, drap;
11261 /* Can't use the same register for both PIC and DRAP. */
11262 if (crtl->drap_reg)
11263 drap = REGNO (crtl->drap_reg);
11264 else
11265 drap = -1;
11266 for (i = 2; i >= 0; --i)
11267 if (i != drap && !df_regs_ever_live_p (i))
11268 return i;
11271 return INVALID_REGNUM;
11274 /* Return true if REGNO is used by the epilogue. */
11276 bool
11277 ix86_epilogue_uses (int regno)
11279 /* If there are no caller-saved registers, we preserve all registers,
11280 except for MMX and x87 registers which aren't supported when saving
11281 and restoring registers. Don't explicitly save SP register since
11282 it is always preserved. */
11283 return (epilogue_completed
11284 && cfun->machine->no_caller_saved_registers
11285 && !fixed_regs[regno]
11286 && !STACK_REGNO_P (regno)
11287 && !MMX_REGNO_P (regno));
11290 /* Return nonzero if register REGNO can be used as a scratch register
11291 in peephole2. */
11293 static bool
11294 ix86_hard_regno_scratch_ok (unsigned int regno)
11296 /* If there are no caller-saved registers, we can't use any register
11297 as a scratch register after epilogue and use REGNO as scratch
11298 register only if it has been used before to avoid saving and
11299 restoring it. */
11300 return (!cfun->machine->no_caller_saved_registers
11301 || (!epilogue_completed
11302 && df_regs_ever_live_p (regno)));
11305 /* Return true if register class CL should be an additional allocno
11306 class. */
11308 static bool
11309 ix86_additional_allocno_class_p (reg_class_t cl)
11311 return cl == MOD4_SSE_REGS;
11314 /* Return TRUE if we need to save REGNO. */
11316 static bool
11317 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
11319 /* If there are no caller-saved registers, we preserve all registers,
11320 except for MMX and x87 registers which aren't supported when saving
11321 and restoring registers. Don't explicitly save SP register since
11322 it is always preserved. */
11323 if (cfun->machine->no_caller_saved_registers)
11325 /* Don't preserve registers used for function return value. */
11326 rtx reg = crtl->return_rtx;
11327 if (reg)
11329 unsigned int i = REGNO (reg);
11330 unsigned int nregs = REG_NREGS (reg);
11331 while (nregs-- > 0)
11332 if ((i + nregs) == regno)
11333 return false;
11335 reg = crtl->return_bnd;
11336 if (reg)
11338 i = REGNO (reg);
11339 nregs = REG_NREGS (reg);
11340 while (nregs-- > 0)
11341 if ((i + nregs) == regno)
11342 return false;
11346 return (df_regs_ever_live_p (regno)
11347 && !fixed_regs[regno]
11348 && !STACK_REGNO_P (regno)
11349 && !MMX_REGNO_P (regno)
11350 && (regno != HARD_FRAME_POINTER_REGNUM
11351 || !frame_pointer_needed));
11354 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
11355 && pic_offset_table_rtx)
11357 if (ix86_use_pseudo_pic_reg ())
11359 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
11360 _mcount in prologue. */
11361 if (!TARGET_64BIT && flag_pic && crtl->profile)
11362 return true;
11364 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
11365 || crtl->profile
11366 || crtl->calls_eh_return
11367 || crtl->uses_const_pool
11368 || cfun->has_nonlocal_label)
11369 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
11372 if (crtl->calls_eh_return && maybe_eh_return)
11374 unsigned i;
11375 for (i = 0; ; i++)
11377 unsigned test = EH_RETURN_DATA_REGNO (i);
11378 if (test == INVALID_REGNUM)
11379 break;
11380 if (test == regno)
11381 return true;
11385 if (ignore_outlined && cfun->machine->call_ms2sysv)
11387 unsigned count = cfun->machine->call_ms2sysv_extra_regs
11388 + xlogue_layout::MIN_REGS;
11389 if (xlogue_layout::is_stub_managed_reg (regno, count))
11390 return false;
11393 if (crtl->drap_reg
11394 && regno == REGNO (crtl->drap_reg)
11395 && !cfun->machine->no_drap_save_restore)
11396 return true;
11398 return (df_regs_ever_live_p (regno)
11399 && !call_used_regs[regno]
11400 && !fixed_regs[regno]
11401 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11404 /* Return number of saved general purpose registers. */
11406 static int
11407 ix86_nsaved_regs (void)
11409 int nregs = 0;
11410 int regno;
11412 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11413 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11414 nregs ++;
11415 return nregs;
11418 /* Return number of saved SSE registers. */
11420 static int
11421 ix86_nsaved_sseregs (void)
11423 int nregs = 0;
11424 int regno;
11426 if (!TARGET_64BIT_MS_ABI)
11427 return 0;
11428 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11429 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11430 nregs ++;
11431 return nregs;
11434 /* Given FROM and TO register numbers, say whether this elimination is
11435 allowed. If stack alignment is needed, we can only replace argument
11436 pointer with hard frame pointer, or replace frame pointer with stack
11437 pointer. Otherwise, frame pointer elimination is automatically
11438 handled and all other eliminations are valid. */
11440 static bool
11441 ix86_can_eliminate (const int from, const int to)
11443 if (stack_realign_fp)
11444 return ((from == ARG_POINTER_REGNUM
11445 && to == HARD_FRAME_POINTER_REGNUM)
11446 || (from == FRAME_POINTER_REGNUM
11447 && to == STACK_POINTER_REGNUM));
11448 else
11449 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11452 /* Return the offset between two registers, one to be eliminated, and the other
11453 its replacement, at the start of a routine. */
11455 HOST_WIDE_INT
11456 ix86_initial_elimination_offset (int from, int to)
11458 struct ix86_frame &frame = cfun->machine->frame;
11460 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11461 return frame.hard_frame_pointer_offset;
11462 else if (from == FRAME_POINTER_REGNUM
11463 && to == HARD_FRAME_POINTER_REGNUM)
11464 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11465 else
11467 gcc_assert (to == STACK_POINTER_REGNUM);
11469 if (from == ARG_POINTER_REGNUM)
11470 return frame.stack_pointer_offset;
11472 gcc_assert (from == FRAME_POINTER_REGNUM);
11473 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11477 /* In a dynamically-aligned function, we can't know the offset from
11478 stack pointer to frame pointer, so we must ensure that setjmp
11479 eliminates fp against the hard fp (%ebp) rather than trying to
11480 index from %esp up to the top of the frame across a gap that is
11481 of unknown (at compile-time) size. */
11482 static rtx
11483 ix86_builtin_setjmp_frame_value (void)
11485 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11488 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11489 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11491 static bool warned_once = false;
11492 if (!warned_once)
11494 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11495 feature);
11496 warned_once = true;
11500 /* Return the probing interval for -fstack-clash-protection and -fstack-check. */
11502 static HOST_WIDE_INT
11503 get_probe_interval (void)
11505 if (flag_stack_clash_protection)
11506 return (HOST_WIDE_INT_1U
11507 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
11508 else
11509 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
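/* As a worked example (assuming the usual exponent of 12 for either
   source of the value): HOST_WIDE_INT_1U << 12 == 4096, i.e. one probe
   per 4 KiB page-sized interval of newly allocated stack.  */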
11512 /* When using -fsplit-stack, the allocation routines set a field in
11513 the TCB to the bottom of the stack plus this much space, measured
11514 in bytes. */
11516 #define SPLIT_STACK_AVAILABLE 256
11518 /* Fill structure ix86_frame about frame of currently computed function. */
11520 static void
11521 ix86_compute_frame_layout (void)
11523 struct ix86_frame *frame = &cfun->machine->frame;
11524 struct machine_function *m = cfun->machine;
11525 unsigned HOST_WIDE_INT stack_alignment_needed;
11526 HOST_WIDE_INT offset;
11527 unsigned HOST_WIDE_INT preferred_alignment;
11528 HOST_WIDE_INT size = get_frame_size ();
11529 HOST_WIDE_INT to_allocate;
11531 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11532 ms_abi functions that call a sysv function. We now need to prune away
11533 cases where it should be disabled. */
11534 if (TARGET_64BIT && m->call_ms2sysv)
11536 gcc_assert (TARGET_64BIT_MS_ABI);
11537 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11538 gcc_assert (!TARGET_SEH);
11539 gcc_assert (TARGET_SSE);
11540 gcc_assert (!ix86_using_red_zone ());
11542 if (crtl->calls_eh_return)
11544 gcc_assert (!reload_completed);
11545 m->call_ms2sysv = false;
11546 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11549 else if (ix86_static_chain_on_stack)
11551 gcc_assert (!reload_completed);
11552 m->call_ms2sysv = false;
11553 warn_once_call_ms2sysv_xlogues ("static call chains");
11556 /* Finally, compute which registers the stub will manage. */
11557 else
11559 unsigned count = xlogue_layout::count_stub_managed_regs ();
11560 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11561 m->call_ms2sysv_pad_in = 0;
11565 frame->nregs = ix86_nsaved_regs ();
11566 frame->nsseregs = ix86_nsaved_sseregs ();
11568 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11569 except for function prologues, leaf functions and when the default
11570 incoming stack boundary is overridden at the command line or via the
11571 force_align_arg_pointer attribute. */
11572 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11573 && (!crtl->is_leaf || cfun->calls_alloca != 0
11574 || ix86_current_function_calls_tls_descriptor
11575 || ix86_incoming_stack_boundary < 128))
11577 crtl->preferred_stack_boundary = 128;
11578 crtl->stack_alignment_needed = 128;
11581 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11582 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11584 gcc_assert (!size || stack_alignment_needed);
11585 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11586 gcc_assert (preferred_alignment <= stack_alignment_needed);
11588 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11589 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11590 if (TARGET_64BIT && m->call_ms2sysv)
11592 gcc_assert (stack_alignment_needed >= 16);
11593 gcc_assert (!frame->nsseregs);
11596 /* For SEH we have to limit the amount of code movement into the prologue.
11597 At present we do this via a BLOCKAGE, at which point there's very little
11598 scheduling that can be done, which means that there's very little point
11599 in doing anything except PUSHs. */
11600 if (TARGET_SEH)
11601 m->use_fast_prologue_epilogue = false;
11602 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11604 int count = frame->nregs;
11605 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11607 /* The fast prologue uses move instead of push to save registers. This
11608 is significantly longer, but also executes faster as modern hardware
11609 can execute the moves in parallel, but can't do that for push/pop.
11611 Be careful about choosing which prologue to emit: when the function
11612 takes many instructions to execute we may use the slow version, as well
11613 as when the function is known to be outside a hot spot (this is known
11614 with feedback only). Weight the size of the function by the number of
11615 registers to save, as it is cheap to use one or two push instructions
11616 but very slow to use many of them. */
11617 if (count)
11618 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11619 if (node->frequency < NODE_FREQUENCY_NORMAL
11620 || (flag_branch_probabilities
11621 && node->frequency < NODE_FREQUENCY_HOT))
11622 m->use_fast_prologue_epilogue = false;
11623 else
11624 m->use_fast_prologue_epilogue
11625 = !expensive_function_p (count);
11628 frame->save_regs_using_mov
11629 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11630 /* If static stack checking is enabled and done with probes,
11631 the registers need to be saved before allocating the frame. */
11632 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11634 /* Skip return address and error code in exception handler. */
11635 offset = INCOMING_FRAME_SP_OFFSET;
11637 /* Skip pushed static chain. */
11638 if (ix86_static_chain_on_stack)
11639 offset += UNITS_PER_WORD;
11641 /* Skip saved base pointer. */
11642 if (frame_pointer_needed)
11643 offset += UNITS_PER_WORD;
11644 frame->hfp_save_offset = offset;
11646 /* The traditional frame pointer location is at the top of the frame. */
11647 frame->hard_frame_pointer_offset = offset;
11649 /* Register save area */
11650 offset += frame->nregs * UNITS_PER_WORD;
11651 frame->reg_save_offset = offset;
11653 /* On SEH target, registers are pushed just before the frame pointer
11654 location. */
11655 if (TARGET_SEH)
11656 frame->hard_frame_pointer_offset = offset;
11658 /* Calculate the size of the va-arg area (not including padding, if any). */
11659 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11661 /* Also adjust stack_realign_offset for the largest alignment of
11662 stack slot actually used. */
11663 if (stack_realign_fp
11664 || (cfun->machine->max_used_stack_alignment != 0
11665 && (offset % cfun->machine->max_used_stack_alignment) != 0))
11667 /* We may need a 16-byte aligned stack for the remainder of the
11668 register save area, but the stack frame for the local function
11669 may require a greater alignment if using AVX, AVX2 or AVX-512. In order
11670 to avoid wasting space, we first calculate the space needed for
11671 the rest of the register saves, add that to the stack pointer,
11672 and then realign the stack to the boundary of the start of the
11673 frame for the local function. */
11674 HOST_WIDE_INT space_needed = 0;
11675 HOST_WIDE_INT sse_reg_space_needed = 0;
11677 if (TARGET_64BIT)
11679 if (m->call_ms2sysv)
11681 m->call_ms2sysv_pad_in = 0;
11682 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11685 else if (frame->nsseregs)
11686 /* The only ABI that has saved SSE registers (Win64) also has a
11687 16-byte aligned default stack. However, many programs violate
11688 the ABI, and Wine64 forces stack realignment to compensate. */
11689 space_needed = frame->nsseregs * 16;
11691 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11693 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11694 rounding to be pedantic. */
11695 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11697 else
11698 space_needed = frame->va_arg_size;
11700 /* Record the allocation size required prior to the realignment AND. */
11701 frame->stack_realign_allocate = space_needed;
11703 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11704 before this point are not directly comparable with values below
11705 this point. Use sp_valid_at to determine if the stack pointer is
11706 valid for a given offset, fp_valid_at for the frame pointer, or
11707 choose_baseaddr to have a base register chosen for you.
11709 Note that the result of (frame->stack_realign_offset
11710 & (stack_alignment_needed - 1)) may not equal zero. */
11711 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11712 frame->stack_realign_offset = offset - space_needed;
11713 frame->sse_reg_save_offset = frame->stack_realign_offset
11714 + sse_reg_space_needed;
11716 else
11718 frame->stack_realign_offset = offset;
11720 if (TARGET_64BIT && m->call_ms2sysv)
11722 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11723 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11726 /* Align and set SSE register save area. */
11727 else if (frame->nsseregs)
11729 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11730 required and the DRAP re-alignment boundary is at least 16 bytes,
11731 then we want the SSE register save area properly aligned. */
11732 if (ix86_incoming_stack_boundary >= 128
11733 || (stack_realign_drap && stack_alignment_needed >= 16))
11734 offset = ROUND_UP (offset, 16);
11735 offset += frame->nsseregs * 16;
11737 frame->sse_reg_save_offset = offset;
11738 offset += frame->va_arg_size;
11741 /* Align the start of the frame for the local function. When a function
11742 call is removed, it may become a leaf function. But if arguments may
11743 be passed on the stack, we need to align the stack when there is no
11744 tail call. */
11745 if (m->call_ms2sysv
11746 || frame->va_arg_size != 0
11747 || size != 0
11748 || !crtl->is_leaf
11749 || (!crtl->tail_call_emit
11750 && cfun->machine->outgoing_args_on_stack)
11751 || cfun->calls_alloca
11752 || ix86_current_function_calls_tls_descriptor)
11753 offset = ROUND_UP (offset, stack_alignment_needed);
11755 /* Frame pointer points here. */
11756 frame->frame_pointer_offset = offset;
11758 offset += size;
11760 /* Add the outgoing arguments area. It can be skipped if we eliminated
11761 all the function calls as dead code.
11762 Skipping is however impossible when the function calls alloca, since
11763 the alloca expander assumes that the last crtl->outgoing_args_size
11764 bytes of the stack frame are unused. */
11765 if (ACCUMULATE_OUTGOING_ARGS
11766 && (!crtl->is_leaf || cfun->calls_alloca
11767 || ix86_current_function_calls_tls_descriptor))
11769 offset += crtl->outgoing_args_size;
11770 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11772 else
11773 frame->outgoing_arguments_size = 0;
11775 /* Align stack boundary. Only needed if we're calling another function
11776 or using alloca. */
11777 if (!crtl->is_leaf || cfun->calls_alloca
11778 || ix86_current_function_calls_tls_descriptor)
11779 offset = ROUND_UP (offset, preferred_alignment);
11781 /* We've reached end of stack frame. */
11782 frame->stack_pointer_offset = offset;
11784 /* Size prologue needs to allocate. */
11785 to_allocate = offset - frame->sse_reg_save_offset;
11787 if ((!to_allocate && frame->nregs <= 1)
11788 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
11789 /* If stack clash probing needs a loop, then it needs a
11790 scratch register. But the returned register is only guaranteed
11791 to be safe to use after register saves are complete. So if
11792 stack clash protections are enabled and the allocated frame is
11793 larger than the probe interval, then use pushes to save
11794 callee saved registers. */
11795 || (flag_stack_clash_protection && to_allocate > get_probe_interval ()))
11796 frame->save_regs_using_mov = false;
11798 if (ix86_using_red_zone ()
11799 && crtl->sp_is_unchanging
11800 && crtl->is_leaf
11801 && !ix86_pc_thunk_call_expanded
11802 && !ix86_current_function_calls_tls_descriptor)
11804 frame->red_zone_size = to_allocate;
11805 if (frame->save_regs_using_mov)
11806 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11807 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11808 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11810 else
11811 frame->red_zone_size = 0;
11812 frame->stack_pointer_offset -= frame->red_zone_size;
11814 /* The SEH frame pointer location is near the bottom of the frame.
11815 This is enforced by the fact that the difference between the
11816 stack pointer and the frame pointer is limited to 240 bytes in
11817 the unwind data structure. */
11818 if (TARGET_SEH)
11820 HOST_WIDE_INT diff;
11822 /* If we can leave the frame pointer where it is, do so. Also, returns
11823 the establisher frame for __builtin_frame_address (0). */
11824 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11825 if (diff <= SEH_MAX_FRAME_SIZE
11826 && (diff > 240 || (diff & 15) != 0)
11827 && !crtl->accesses_prior_frames)
11829 /* Ideally we'd determine what portion of the local stack frame
11830 (within the constraint of the lowest 240) is most heavily used.
11831 But without that complication, simply bias the frame pointer
11832 by 128 bytes so as to maximize the amount of the local stack
11833 frame that is addressable with 8-bit offsets. */
11834 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
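/* An illustrative (non-SEH, non-realigned) picture of the layout computed
   above, from higher addresses at the top to lower addresses at the
   bottom; the named fields record how far below the incoming frame each
   area ends:

       return address
       [pushed static chain]
       [saved frame pointer]         <- hard_frame_pointer_offset
       saved GP registers            <- reg_save_offset
       [saved SSE registers]         <- sse_reg_save_offset
       [va_arg register save area]
       local variables               <- frame_pointer_offset
       outgoing argument area
                                     <- stack_pointer_offset

   When the red zone is usable, red_zone_size is subtracted from
   stack_pointer_offset so that small leaf frames need no explicit stack
   adjustment at all.  */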
11839 /* This is semi-inlined memory_address_length, but simplified
11840 since we know that we're always dealing with reg+offset, and
11841 to avoid having to create and discard all that rtl. */
11843 static inline int
11844 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11846 int len = 4;
11848 if (offset == 0)
11850 /* EBP and R13 cannot be encoded without an offset. */
11851 len = (regno == BP_REG || regno == R13_REG);
11853 else if (IN_RANGE (offset, -128, 127))
11854 len = 1;
11856 /* ESP and R12 must be encoded with a SIB byte. */
11857 if (regno == SP_REG || regno == R12_REG)
11858 len++;
11860 return len;
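/* A few illustrative cases of the value computed above (it counts only
   displacement and SIB bytes, not opcode or ModRM):

       (%ebx)     -> 0        (%ebp) or (%r13)   -> 1  (forced disp8 of 0)
       (%esp)     -> 1        8(%esp) or 8(%r12) -> 2  (SIB + disp8)
       8(%ebx)    -> 1        512(%ebx)          -> 4  (disp32)
       512(%esp)  -> 5  (SIB + disp32)  */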
11863 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11864 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11866 static bool
11867 sp_valid_at (HOST_WIDE_INT cfa_offset)
11869 const struct machine_frame_state &fs = cfun->machine->fs;
11870 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11872 /* Validate that the cfa_offset isn't in a "no-man's land". */
11873 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11874 return false;
11876 return fs.sp_valid;
11879 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11880 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11882 static inline bool
11883 fp_valid_at (HOST_WIDE_INT cfa_offset)
11885 const struct machine_frame_state &fs = cfun->machine->fs;
11886 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11888 /* Validate that the cfa_offset isn't in a "no-man's land". */
11889 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11890 return false;
11892 return fs.fp_valid;
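/* A rough picture of the two checks above when the stack has been
   re-aligned (e.g. under stack_realign_fp), with larger CFA offsets
   denoting lower addresses:

       cfa_offset <= sp_realigned_offset   : above the realignment AND;
                                             the stack pointer is rejected
                                             and the frame pointer (when
                                             fs.fp_valid) must be used.
       cfa_offset >  sp_realigned_fp_last  : below the last FP-reachable
                                             slot; only the stack pointer
                                             may be used.
       otherwise                           : either register may be usable,
                                             subject to fs.sp_valid and
                                             fs.fp_valid.  */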
11895 /* Choose a base register based upon alignment requested, speed and/or
11896 size. */
11898 static void
11899 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11900 HOST_WIDE_INT &base_offset,
11901 unsigned int align_requested, unsigned int *align)
11903 const struct machine_function *m = cfun->machine;
11904 unsigned int hfp_align;
11905 unsigned int drap_align;
11906 unsigned int sp_align;
11907 bool hfp_ok = fp_valid_at (cfa_offset);
11908 bool drap_ok = m->fs.drap_valid;
11909 bool sp_ok = sp_valid_at (cfa_offset);
11911 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11913 /* Filter out any registers that don't meet the requested alignment
11914 criteria. */
11915 if (align_requested)
11917 if (m->fs.realigned)
11918 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11919 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11920 notes (which we would need in order to use a realigned stack pointer),
11921 so disable this on SEH targets. */
11922 else if (m->fs.sp_realigned)
11923 sp_align = crtl->stack_alignment_needed;
11925 hfp_ok = hfp_ok && hfp_align >= align_requested;
11926 drap_ok = drap_ok && drap_align >= align_requested;
11927 sp_ok = sp_ok && sp_align >= align_requested;
11930 if (m->use_fast_prologue_epilogue)
11932 /* Choose the base register most likely to allow the most scheduling
11933 opportunities. Generally FP is valid throughout the function,
11934 while DRAP must be reloaded within the epilogue. But choose either
11935 over the SP due to increased encoding size. */
11937 if (hfp_ok)
11939 base_reg = hard_frame_pointer_rtx;
11940 base_offset = m->fs.fp_offset - cfa_offset;
11942 else if (drap_ok)
11944 base_reg = crtl->drap_reg;
11945 base_offset = 0 - cfa_offset;
11947 else if (sp_ok)
11949 base_reg = stack_pointer_rtx;
11950 base_offset = m->fs.sp_offset - cfa_offset;
11953 else
11955 HOST_WIDE_INT toffset;
11956 int len = 16, tlen;
11958 /* Choose the base register with the smallest address encoding.
11959 With a tie, choose FP > DRAP > SP. */
11960 if (sp_ok)
11962 base_reg = stack_pointer_rtx;
11963 base_offset = m->fs.sp_offset - cfa_offset;
11964 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
11966 if (drap_ok)
11968 toffset = 0 - cfa_offset;
11969 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
11970 if (tlen <= len)
11972 base_reg = crtl->drap_reg;
11973 base_offset = toffset;
11974 len = tlen;
11977 if (hfp_ok)
11979 toffset = m->fs.fp_offset - cfa_offset;
11980 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
11981 if (tlen <= len)
11983 base_reg = hard_frame_pointer_rtx;
11984 base_offset = toffset;
11985 len = tlen;
11990 /* Set the align return value. */
11991 if (align)
11993 if (base_reg == stack_pointer_rtx)
11994 *align = sp_align;
11995 else if (base_reg == crtl->drap_reg)
11996 *align = drap_align;
11997 else if (base_reg == hard_frame_pointer_rtx)
11998 *align = hfp_align;
12002 /* Return an RTX that points to CFA_OFFSET within the stack frame and
12003 the alignment of the address. If ALIGN is non-null, it should point to
12004 an alignment value (in bits) that is preferred or zero and will
12005 receive the alignment of the base register that was selected,
12006 irrespective of whether or not CFA_OFFSET is a multiple of that
12007 alignment value. If it is possible for the base register offset to be
12008 non-immediate then SCRATCH_REGNO should specify a scratch register to
12009 use.
12011 The valid base registers are taken from CFUN->MACHINE->FS. */
12013 static rtx
12014 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
12015 unsigned int scratch_regno = INVALID_REGNUM)
12017 rtx base_reg = NULL;
12018 HOST_WIDE_INT base_offset = 0;
12020 /* If a specific alignment is requested, try to get a base register
12021 with that alignment first. */
12022 if (align && *align)
12023 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
12025 if (!base_reg)
12026 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
12028 gcc_assert (base_reg != NULL);
12030 rtx base_offset_rtx = GEN_INT (base_offset);
12032 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
12034 gcc_assert (scratch_regno != INVALID_REGNUM);
12036 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
12037 emit_move_insn (scratch_reg, base_offset_rtx);
12039 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
12042 return plus_constant (Pmode, base_reg, base_offset);
12045 /* Emit code to save registers in the prologue. */
12047 static void
12048 ix86_emit_save_regs (void)
12050 unsigned int regno;
12051 rtx_insn *insn;
12053 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12054 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12056 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12057 RTX_FRAME_RELATED_P (insn) = 1;
12061 /* Emit a single register save at CFA - CFA_OFFSET. */
12063 static void
12064 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12065 HOST_WIDE_INT cfa_offset)
12067 struct machine_function *m = cfun->machine;
12068 rtx reg = gen_rtx_REG (mode, regno);
12069 rtx mem, addr, base, insn;
12070 unsigned int align = GET_MODE_ALIGNMENT (mode);
12072 addr = choose_baseaddr (cfa_offset, &align);
12073 mem = gen_frame_mem (mode, addr);
12075 /* The location alignment depends upon the base register. */
12076 align = MIN (GET_MODE_ALIGNMENT (mode), align);
12077 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
12078 set_mem_align (mem, align);
12080 insn = emit_insn (gen_rtx_SET (mem, reg));
12081 RTX_FRAME_RELATED_P (insn) = 1;
12083 base = addr;
12084 if (GET_CODE (base) == PLUS)
12085 base = XEXP (base, 0);
12086 gcc_checking_assert (REG_P (base));
12088 /* When saving registers into a re-aligned local stack frame, avoid
12089 any tricky guessing by dwarf2out. */
12090 if (m->fs.realigned)
12092 gcc_checking_assert (stack_realign_drap);
12094 if (regno == REGNO (crtl->drap_reg))
12096 /* A bit of a hack. We force the DRAP register to be saved in
12097 the re-aligned stack frame, which provides us with a copy
12098 of the CFA that will last past the prologue. Install it. */
12099 gcc_checking_assert (cfun->machine->fs.fp_valid);
12100 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12101 cfun->machine->fs.fp_offset - cfa_offset);
12102 mem = gen_rtx_MEM (mode, addr);
12103 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12105 else
12107 /* The frame pointer is a stable reference within the
12108 aligned frame. Use it. */
12109 gcc_checking_assert (cfun->machine->fs.fp_valid);
12110 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12111 cfun->machine->fs.fp_offset - cfa_offset);
12112 mem = gen_rtx_MEM (mode, addr);
12113 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12117 else if (base == stack_pointer_rtx && m->fs.sp_realigned
12118 && cfa_offset >= m->fs.sp_realigned_offset)
12120 gcc_checking_assert (stack_realign_fp);
12121 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12124 /* The memory may not be relative to the current CFA register,
12125 which means that we may need to generate a new pattern for
12126 use by the unwind info. */
12127 else if (base != m->fs.cfa_reg)
12129 addr = plus_constant (Pmode, m->fs.cfa_reg,
12130 m->fs.cfa_offset - cfa_offset);
12131 mem = gen_rtx_MEM (mode, addr);
12132 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12136 /* Emit code to save registers using MOV insns.
12137 First register is stored at CFA - CFA_OFFSET. */
12138 static void
12139 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12141 unsigned int regno;
12143 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12144 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12146 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12147 cfa_offset -= UNITS_PER_WORD;
12151 /* Emit code to save SSE registers using MOV insns.
12152 First register is stored at CFA - CFA_OFFSET. */
12153 static void
12154 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12156 unsigned int regno;
12158 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12159 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12161 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12162 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12166 static GTY(()) rtx queued_cfa_restores;
12168 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
12169 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12170 Don't add the note if the previously saved value will be left untouched
12171 within the stack red-zone until return, as unwinders can find the same
12172 value in the register and on the stack. */
12174 static void
12175 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12177 if (!crtl->shrink_wrapped
12178 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12179 return;
12181 if (insn)
12183 add_reg_note (insn, REG_CFA_RESTORE, reg);
12184 RTX_FRAME_RELATED_P (insn) = 1;
12186 else
12187 queued_cfa_restores
12188 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12191 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12193 static void
12194 ix86_add_queued_cfa_restore_notes (rtx insn)
12196 rtx last;
12197 if (!queued_cfa_restores)
12198 return;
12199 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12201 XEXP (last, 1) = REG_NOTES (insn);
12202 REG_NOTES (insn) = queued_cfa_restores;
12203 queued_cfa_restores = NULL_RTX;
12204 RTX_FRAME_RELATED_P (insn) = 1;
12207 /* Expand prologue or epilogue stack adjustment.
12208 The pattern exists to put a dependency on all ebp-based memory accesses.
12209 STYLE should be negative if instructions should be marked as frame related,
12210 zero if %r11 register is live and cannot be freely used and positive
12211 otherwise. */
12213 static rtx
12214 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12215 int style, bool set_cfa)
12217 struct machine_function *m = cfun->machine;
12218 rtx insn;
12219 bool add_frame_related_expr = false;
12221 if (Pmode == SImode)
12222 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12223 else if (x86_64_immediate_operand (offset, DImode))
12224 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12225 else
12227 rtx tmp;
12228 /* r11 is used by indirect sibcall return as well, set before the
12229 epilogue and used after the epilogue. */
12230 if (style)
12231 tmp = gen_rtx_REG (DImode, R11_REG);
12232 else
12234 gcc_assert (src != hard_frame_pointer_rtx
12235 && dest != hard_frame_pointer_rtx);
12236 tmp = hard_frame_pointer_rtx;
12238 insn = emit_insn (gen_rtx_SET (tmp, offset));
12239 if (style < 0)
12240 add_frame_related_expr = true;
12242 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12245 insn = emit_insn (insn);
12246 if (style >= 0)
12247 ix86_add_queued_cfa_restore_notes (insn);
12249 if (set_cfa)
12251 rtx r;
12253 gcc_assert (m->fs.cfa_reg == src);
12254 m->fs.cfa_offset += INTVAL (offset);
12255 m->fs.cfa_reg = dest;
12257 r = gen_rtx_PLUS (Pmode, src, offset);
12258 r = gen_rtx_SET (dest, r);
12259 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12260 RTX_FRAME_RELATED_P (insn) = 1;
12262 else if (style < 0)
12264 RTX_FRAME_RELATED_P (insn) = 1;
12265 if (add_frame_related_expr)
12267 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12268 r = gen_rtx_SET (dest, r);
12269 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12273 if (dest == stack_pointer_rtx)
12275 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12276 bool valid = m->fs.sp_valid;
12277 bool realigned = m->fs.sp_realigned;
12279 if (src == hard_frame_pointer_rtx)
12281 valid = m->fs.fp_valid;
12282 realigned = false;
12283 ooffset = m->fs.fp_offset;
12285 else if (src == crtl->drap_reg)
12287 valid = m->fs.drap_valid;
12288 realigned = false;
12289 ooffset = 0;
12291 else
12293 /* Else there are two possibilities: SP itself, which we set
12294 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
12295 taken care of by hand along the eh_return path. */
12296 gcc_checking_assert (src == stack_pointer_rtx
12297 || offset == const0_rtx);
12300 m->fs.sp_offset = ooffset - INTVAL (offset);
12301 m->fs.sp_valid = valid;
12302 m->fs.sp_realigned = realigned;
12304 return insn;
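/* An illustrative prologue use (mirroring the callers below, with a
   hypothetical 64-byte allocation): the insn is marked frame-related
   (STYLE < 0) and, while the CFA still follows the stack pointer, a
   REG_CFA_ADJUST_CFA note is recorded for the unwinder:

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                GEN_INT (-64), -1,
                                m->fs.cfa_reg == stack_pointer_rtx);  */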
12307 /* Find an available register to be used as the dynamic realign argument
12308 pointer register. Such a register will be written in the prologue and
12309 used at the beginning of the body, so it must not be
12310 1. a parameter passing register.
12311 2. the GOT pointer.
12312 We reuse the static-chain register if it is available. Otherwise, we
12313 use DI for i386 and R13 for x86-64. We chose R13 since it has a
12314 shorter encoding.
12316 Return: the regno of the chosen register. */
12318 static unsigned int
12319 find_drap_reg (void)
12321 tree decl = cfun->decl;
12323 /* Always use callee-saved register if there are no caller-saved
12324 registers. */
12325 if (TARGET_64BIT)
12327 /* Use R13 for a nested function or a function that needs a static
12328 chain. Since a function with a tail call may use any caller-saved
12329 register in the epilogue, the DRAP must not use a caller-saved
12330 register in that case. */
12331 if (DECL_STATIC_CHAIN (decl)
12332 || cfun->machine->no_caller_saved_registers
12333 || crtl->tail_call_emit)
12334 return R13_REG;
12336 return R10_REG;
12338 else
12340 /* Use DI for a nested function or a function that needs a static
12341 chain. Since a function with a tail call may use any caller-saved
12342 register in the epilogue, the DRAP must not use a caller-saved
12343 register in that case. */
12344 if (DECL_STATIC_CHAIN (decl)
12345 || cfun->machine->no_caller_saved_registers
12346 || crtl->tail_call_emit)
12347 return DI_REG;
12349 /* Reuse static chain register if it isn't used for parameter
12350 passing. */
12351 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12353 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12354 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12355 return CX_REG;
12357 return DI_REG;
12361 /* Handle a "force_align_arg_pointer" attribute. */
12363 static tree
12364 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12365 tree, int, bool *no_add_attrs)
12367 if (TREE_CODE (*node) != FUNCTION_TYPE
12368 && TREE_CODE (*node) != METHOD_TYPE
12369 && TREE_CODE (*node) != FIELD_DECL
12370 && TREE_CODE (*node) != TYPE_DECL)
12372 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12373 name);
12374 *no_add_attrs = true;
12377 return NULL_TREE;
12380 /* Return minimum incoming stack alignment. */
12382 static unsigned int
12383 ix86_minimum_incoming_stack_boundary (bool sibcall)
12385 unsigned int incoming_stack_boundary;
12387 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
12388 if (cfun->machine->func_type != TYPE_NORMAL)
12389 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
12390 /* Prefer the one specified at command line. */
12391 else if (ix86_user_incoming_stack_boundary)
12392 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12393 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
12394 if -mstackrealign is used, this isn't a sibcall check, and the
12395 estimated stack alignment is 128 bits. */
12396 else if (!sibcall
12397 && ix86_force_align_arg_pointer
12398 && crtl->stack_alignment_estimated == 128)
12399 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12400 else
12401 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12403 /* Incoming stack alignment can be changed on individual functions
12404 via force_align_arg_pointer attribute. We use the smallest
12405 incoming stack boundary. */
12406 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12407 && lookup_attribute (ix86_force_align_arg_pointer_string,
12408 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12409 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12411 /* The incoming stack frame has to be aligned at least at
12412 parm_stack_boundary. */
12413 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12414 incoming_stack_boundary = crtl->parm_stack_boundary;
12416 /* The stack at the entry of main is aligned by the runtime. We use the
12417 smallest incoming stack boundary. */
12418 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12419 && DECL_NAME (current_function_decl)
12420 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12421 && DECL_FILE_SCOPE_P (current_function_decl))
12422 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12424 return incoming_stack_boundary;
12427 /* Update incoming stack boundary and estimated stack alignment. */
12429 static void
12430 ix86_update_stack_boundary (void)
12432 ix86_incoming_stack_boundary
12433 = ix86_minimum_incoming_stack_boundary (false);
12435 /* x86_64 varargs needs 16-byte stack alignment for the register save
12436 area. */
12437 if (TARGET_64BIT
12438 && cfun->stdarg
12439 && crtl->stack_alignment_estimated < 128)
12440 crtl->stack_alignment_estimated = 128;
12442 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12443 if (ix86_tls_descriptor_calls_expanded_in_cfun
12444 && crtl->preferred_stack_boundary < 128)
12445 crtl->preferred_stack_boundary = 128;
12448 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12449 needed or an rtx for DRAP otherwise. */
12451 static rtx
12452 ix86_get_drap_rtx (void)
12454 /* We must use DRAP if there are outgoing arguments on stack and
12455 ACCUMULATE_OUTGOING_ARGS is false. */
12456 if (ix86_force_drap
12457 || (cfun->machine->outgoing_args_on_stack
12458 && !ACCUMULATE_OUTGOING_ARGS))
12459 crtl->need_drap = true;
12461 if (stack_realign_drap)
12463 /* Assign DRAP to vDRAP and return vDRAP. */
12464 unsigned int regno = find_drap_reg ();
12465 rtx drap_vreg;
12466 rtx arg_ptr;
12467 rtx_insn *seq, *insn;
12469 arg_ptr = gen_rtx_REG (Pmode, regno);
12470 crtl->drap_reg = arg_ptr;
12472 start_sequence ();
12473 drap_vreg = copy_to_reg (arg_ptr);
12474 seq = get_insns ();
12475 end_sequence ();
12477 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12478 if (!optimize)
12480 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12481 RTX_FRAME_RELATED_P (insn) = 1;
12483 return drap_vreg;
12485 else
12486 return NULL;
12489 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12491 static rtx
12492 ix86_internal_arg_pointer (void)
12494 return virtual_incoming_args_rtx;
12497 struct scratch_reg {
12498 rtx reg;
12499 bool saved;
12502 /* Return a short-lived scratch register for use on function entry.
12503 In 32-bit mode, it is valid only after the registers are saved
12504 in the prologue. This register must be released by means of
12505 release_scratch_register_on_entry once it is dead. */
12507 static void
12508 get_scratch_register_on_entry (struct scratch_reg *sr)
12510 int regno;
12512 sr->saved = false;
12514 if (TARGET_64BIT)
12516 /* We always use R11 in 64-bit mode. */
12517 regno = R11_REG;
12519 else
12521 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12522 bool fastcall_p
12523 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12524 bool thiscall_p
12525 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12526 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12527 int regparm = ix86_function_regparm (fntype, decl);
12528 int drap_regno
12529 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12531 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12532 for the static chain register. */
12533 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12534 && drap_regno != AX_REG)
12535 regno = AX_REG;
12536 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12537 for the static chain register. */
12538 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12539 regno = AX_REG;
12540 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12541 regno = DX_REG;
12542 /* ecx is the static chain register. */
12543 else if (regparm < 3 && !fastcall_p && !thiscall_p
12544 && !static_chain_p
12545 && drap_regno != CX_REG)
12546 regno = CX_REG;
12547 else if (ix86_save_reg (BX_REG, true, false))
12548 regno = BX_REG;
12549 /* esi is the static chain register. */
12550 else if (!(regparm == 3 && static_chain_p)
12551 && ix86_save_reg (SI_REG, true, false))
12552 regno = SI_REG;
12553 else if (ix86_save_reg (DI_REG, true, false))
12554 regno = DI_REG;
12555 else
12557 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12558 sr->saved = true;
12562 sr->reg = gen_rtx_REG (Pmode, regno);
12563 if (sr->saved)
12565 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12566 RTX_FRAME_RELATED_P (insn) = 1;
12570 /* Release a scratch register obtained from the preceding function. */
12572 static void
12573 release_scratch_register_on_entry (struct scratch_reg *sr)
12575 if (sr->saved)
12577 struct machine_function *m = cfun->machine;
12578 rtx x, insn = emit_insn (gen_pop (sr->reg));
12580 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12581 RTX_FRAME_RELATED_P (insn) = 1;
12582 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12583 x = gen_rtx_SET (stack_pointer_rtx, x);
12584 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12585 m->fs.sp_offset -= UNITS_PER_WORD;
12589 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12591 This differs from the next routine in that it tries hard to prevent
12592 attacks that jump the stack guard. Thus it is never allowed to allocate
12593 more than PROBE_INTERVAL bytes of stack space without a suitable
12594 probe.
12596 INT_REGISTERS_SAVED is true if integer registers have already been
12597 pushed on the stack. */
12599 static void
12600 ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size,
12601 const bool int_registers_saved)
12603 struct machine_function *m = cfun->machine;
12605 /* If this function does not statically allocate stack space, then
12606 no probes are needed. */
12607 if (!size)
12609 /* However, the allocation of space via pushes for register
12610 saves could be viewed as allocating space, but without the
12611 need to probe. */
12612 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12613 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12614 else
12615 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12616 return;
12619 /* If we are a noreturn function, then we have to consider the
12620 possibility that we're called via a jump rather than a call.
12622 Thus we don't have the implicit probe generated by saving the
12623 return address into the stack at the call. Thus, the stack
12624 pointer could be anywhere in the guard page. The safe thing
12625 to do is emit a probe now.
12627 The probe can be avoided if we have already emitted any callee
12628 register saves into the stack or have a frame pointer (which will
12629 have been saved as well). Those saves will function as implicit
12630 probes.
12632 ?!? This should be revamped to work like aarch64 and s390 where
12633 we track the offset from the most recent probe. Normally that
12634 offset would be zero. For a noreturn function we would reset
12635 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12636 we just probe when we cross PROBE_INTERVAL. */
12637 if (TREE_THIS_VOLATILE (cfun->decl)
12638 && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
12640 /* We can safely use any register here since we're just going to push
12641 its value and immediately pop it back. But we do try to avoid
12642 argument passing registers so as not to introduce dependencies in
12643 the pipeline. For 32-bit we use %esi and for 64-bit we use %rax. */
12644 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12645 rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
12646 rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
12647 m->fs.sp_offset -= UNITS_PER_WORD;
12648 if (m->fs.cfa_reg == stack_pointer_rtx)
12650 m->fs.cfa_offset -= UNITS_PER_WORD;
12651 rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
12652 x = gen_rtx_SET (stack_pointer_rtx, x);
12653 add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
12654 RTX_FRAME_RELATED_P (insn_push) = 1;
12655 x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
12656 x = gen_rtx_SET (stack_pointer_rtx, x);
12657 add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
12658 RTX_FRAME_RELATED_P (insn_pop) = 1;
12660 emit_insn (gen_blockage ());
12663 /* If we allocate less than the size of the guard statically,
12664 then no probing is necessary, but we do need to allocate
12665 the stack. */
12666 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12668 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12669 GEN_INT (-size), -1,
12670 m->fs.cfa_reg == stack_pointer_rtx);
12671 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12672 return;
12675 /* We're allocating a large enough stack frame that we need to
12676 emit probes. Either emit them inline or in a loop depending
12677 on the size. */
12678 HOST_WIDE_INT probe_interval = get_probe_interval ();
12679 if (size <= 4 * probe_interval)
12681 HOST_WIDE_INT i;
12682 for (i = probe_interval; i <= size; i += probe_interval)
12684 /* Allocate PROBE_INTERVAL bytes. */
12685 rtx insn
12686 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12687 GEN_INT (-probe_interval), -1,
12688 m->fs.cfa_reg == stack_pointer_rtx);
12689 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12691 /* And probe at *sp. */
12692 emit_stack_probe (stack_pointer_rtx);
12693 emit_insn (gen_blockage ());
12696 /* We need to allocate space for the residual, but we do not need
12697 to probe the residual. */
12698 HOST_WIDE_INT residual = (i - probe_interval - size);
12699 if (residual)
12700 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12701 GEN_INT (residual), -1,
12702 m->fs.cfa_reg == stack_pointer_rtx);
12703 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12705 else
12707 /* We expect the GP registers to be saved when probes are used
12708 as the probing sequences might need a scratch register and
12709 the routine to allocate one assumes the integer registers
12710 have already been saved. */
12711 gcc_assert (int_registers_saved);
12713 struct scratch_reg sr;
12714 get_scratch_register_on_entry (&sr);
12716 /* Step 1: round SIZE down to a multiple of the interval. */
12717 HOST_WIDE_INT rounded_size = size & -probe_interval;
12719 /* Step 2: compute final value of the loop counter. Use lea if
12720 possible. */
12721 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12722 rtx insn;
12723 if (address_no_seg_operand (addr, Pmode))
12724 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12725 else
12727 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12728 insn = emit_insn (gen_rtx_SET (sr.reg,
12729 gen_rtx_PLUS (Pmode, sr.reg,
12730 stack_pointer_rtx)));
12732 if (m->fs.cfa_reg == stack_pointer_rtx)
12734 add_reg_note (insn, REG_CFA_DEF_CFA,
12735 plus_constant (Pmode, sr.reg,
12736 m->fs.cfa_offset + rounded_size));
12737 RTX_FRAME_RELATED_P (insn) = 1;
12740 /* Step 3: the loop. */
12741 rtx size_rtx = GEN_INT (rounded_size);
12742 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12743 size_rtx));
12744 if (m->fs.cfa_reg == stack_pointer_rtx)
12746 m->fs.cfa_offset += rounded_size;
12747 add_reg_note (insn, REG_CFA_DEF_CFA,
12748 plus_constant (Pmode, stack_pointer_rtx,
12749 m->fs.cfa_offset));
12750 RTX_FRAME_RELATED_P (insn) = 1;
12752 m->fs.sp_offset += rounded_size;
12753 emit_insn (gen_blockage ());
12755 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12756 is equal to ROUNDED_SIZE. */
12758 if (size != rounded_size)
12759 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12760 GEN_INT (rounded_size - size), -1,
12761 m->fs.cfa_reg == stack_pointer_rtx);
12762 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12764 release_scratch_register_on_entry (&sr);
12767 /* Make sure nothing is scheduled before we are done. */
12768 emit_insn (gen_blockage ());
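/* As an illustration, with a 4096-byte probe interval a 5000-byte frame
   takes the inline path above and expands roughly to:

       sub  $4096, %rsp       ; allocate one full interval
       or   $0, (%rsp)        ; probe it
       sub  $904, %rsp        ; residual allocation, no probe required

   while anything larger than four intervals uses the scratch-register
   loop instead.  */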
12771 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12773 INT_REGISTERS_SAVED is true if integer registers have already been
12774 pushed on the stack. */
12776 static void
12777 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size,
12778 const bool int_registers_saved)
12780 /* We skip the probe for the first interval + a small dope of 4 words and
12781 probe that many bytes past the specified size to maintain a protection
12782 area at the bottom of the stack. */
12783 const int dope = 4 * UNITS_PER_WORD;
12784 rtx size_rtx = GEN_INT (size), last;
12786 /* See if we have a constant small number of probes to generate. If so,
12787 that's the easy case. The run-time loop is made up of 9 insns in the
12788 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12789 for n # of intervals. */
12790 if (size <= 4 * get_probe_interval ())
12792 HOST_WIDE_INT i, adjust;
12793 bool first_probe = true;
12795 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12796 values of N from 1 until it exceeds SIZE. If only one probe is
12797 needed, this will not generate any code. Then adjust and probe
12798 to PROBE_INTERVAL + SIZE. */
12799 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12801 if (first_probe)
12803 adjust = 2 * get_probe_interval () + dope;
12804 first_probe = false;
12806 else
12807 adjust = get_probe_interval ();
12809 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12810 plus_constant (Pmode, stack_pointer_rtx,
12811 -adjust)));
12812 emit_stack_probe (stack_pointer_rtx);
12815 if (first_probe)
12816 adjust = size + get_probe_interval () + dope;
12817 else
12818 adjust = size + get_probe_interval () - i;
12820 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12821 plus_constant (Pmode, stack_pointer_rtx,
12822 -adjust)));
12823 emit_stack_probe (stack_pointer_rtx);
12825 /* Adjust back to account for the additional first interval. */
12826 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12827 plus_constant (Pmode, stack_pointer_rtx,
12828 (get_probe_interval ()
12829 + dope))));
12832 /* Otherwise, do the same as above, but in a loop. Note that we must be
12833 extra careful with variables wrapping around because we might be at
12834 the very top (or the very bottom) of the address space and we have
12835 to be able to handle this case properly; in particular, we use an
12836 equality test for the loop condition. */
12837 else
12839 /* We expect the GP registers to be saved when probes are used
12840 as the probing sequences might need a scratch register and
12841 the routine to allocate one assumes the integer registers
12842 have already been saved. */
12843 gcc_assert (int_registers_saved);
12845 HOST_WIDE_INT rounded_size;
12846 struct scratch_reg sr;
12848 get_scratch_register_on_entry (&sr);
12851 /* Step 1: round SIZE to the previous multiple of the interval. */
12853 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12856 /* Step 2: compute initial and final value of the loop counter. */
12858 /* SP = SP_0 + PROBE_INTERVAL. */
12859 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12860 plus_constant (Pmode, stack_pointer_rtx,
12861 - (get_probe_interval () + dope))));
12863 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12864 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12865 emit_insn (gen_rtx_SET (sr.reg,
12866 plus_constant (Pmode, stack_pointer_rtx,
12867 -rounded_size)));
12868 else
12870 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12871 emit_insn (gen_rtx_SET (sr.reg,
12872 gen_rtx_PLUS (Pmode, sr.reg,
12873 stack_pointer_rtx)));
12877 /* Step 3: the loop
12881 SP = SP + PROBE_INTERVAL
12882 probe at SP
12884 while (SP != LAST_ADDR)
12886 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12887 values of N from 1 until it is equal to ROUNDED_SIZE. */
12889 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12892 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12893 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12895 if (size != rounded_size)
12897 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12898 plus_constant (Pmode, stack_pointer_rtx,
12899 rounded_size - size)));
12900 emit_stack_probe (stack_pointer_rtx);
12903 /* Adjust back to account for the additional first interval. */
12904 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12905 plus_constant (Pmode, stack_pointer_rtx,
12906 (get_probe_interval ()
12907 + dope))));
12909 release_scratch_register_on_entry (&sr);
12912 /* Even if the stack pointer isn't the CFA register, we need to correctly
12913 describe the adjustments made to it, in particular differentiate the
12914 frame-related ones from the frame-unrelated ones. */
12915 if (size > 0)
12917 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12918 XVECEXP (expr, 0, 0)
12919 = gen_rtx_SET (stack_pointer_rtx,
12920 plus_constant (Pmode, stack_pointer_rtx, -size));
12921 XVECEXP (expr, 0, 1)
12922 = gen_rtx_SET (stack_pointer_rtx,
12923 plus_constant (Pmode, stack_pointer_rtx,
12924 get_probe_interval () + dope + size));
12925 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12926 RTX_FRAME_RELATED_P (last) = 1;
12928 cfun->machine->fs.sp_offset += size;
12931 /* Make sure nothing is scheduled before we are done. */
12932 emit_insn (gen_blockage ());
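/* A worked example of the dope accounting above (assuming a 4096-byte
   probe interval and 64-bit words, so dope = 32): for SIZE = 10000 the
   inline path adjusts by 2*4096 + 32, then by 4096, then by
   10000 + 4096 - 12288 = 1808, and finally adds back 4096 + 32, for a
   net decrease of exactly 10000 bytes.  */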
12935 /* Adjust the stack pointer up to REG while probing it. */
12937 const char *
12938 output_adjust_stack_and_probe (rtx reg)
12940 static int labelno = 0;
12941 char loop_lab[32];
12942 rtx xops[2];
12944 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12946 /* Loop. */
12947 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12949 /* SP = SP + PROBE_INTERVAL. */
12950 xops[0] = stack_pointer_rtx;
12951 xops[1] = GEN_INT (get_probe_interval ());
12952 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12954 /* Probe at SP. */
12955 xops[1] = const0_rtx;
12956 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
12958 /* Test if SP == LAST_ADDR. */
12959 xops[0] = stack_pointer_rtx;
12960 xops[1] = reg;
12961 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12963 /* Branch. */
12964 fputs ("\tjne\t", asm_out_file);
12965 assemble_name_raw (asm_out_file, loop_lab);
12966 fputc ('\n', asm_out_file);
12968 return "";
12971 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
12972 inclusive. These are offsets from the current stack pointer.
12974 INT_REGISTERS_SAVED is true if integer registers have already been
12975 pushed on the stack. */
12977 static void
12978 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
12979 const bool int_registers_saved)
12981 /* See if we have a constant small number of probes to generate. If so,
12982 that's the easy case. The run-time loop is made up of 6 insns in the
12983 generic case while the compile-time loop is made up of n insns for n #
12984 of intervals. */
12985 if (size <= 6 * get_probe_interval ())
12987 HOST_WIDE_INT i;
12989 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
12990 it exceeds SIZE. If only one probe is needed, this will not
12991 generate any code. Then probe at FIRST + SIZE. */
12992 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12993 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12994 -(first + i)));
12996 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12997 -(first + size)));
13000 /* Otherwise, do the same as above, but in a loop. Note that we must be
13001 extra careful with variables wrapping around because we might be at
13002 the very top (or the very bottom) of the address space and we have
13003 to be able to handle this case properly; in particular, we use an
13004 equality test for the loop condition. */
13005 else
13007 /* We expect the GP registers to be saved when probes are used
13008 as the probing sequences might need a scratch register and
13009 the routine to allocate one assumes the integer registers
13010 have already been saved. */
13011 gcc_assert (int_registers_saved);
13013 HOST_WIDE_INT rounded_size, last;
13014 struct scratch_reg sr;
13016 get_scratch_register_on_entry (&sr);
13019 /* Step 1: round SIZE to the previous multiple of the interval. */
13021 rounded_size = ROUND_DOWN (size, get_probe_interval ());
13024 /* Step 2: compute initial and final value of the loop counter. */
13026 /* TEST_OFFSET = FIRST. */
13027 emit_move_insn (sr.reg, GEN_INT (-first));
13029 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13030 last = first + rounded_size;
13033 /* Step 3: the loop
13037 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13038 probe at TEST_ADDR
13040 while (TEST_ADDR != LAST_ADDR)
13042 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13043 until it is equal to ROUNDED_SIZE. */
13045 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13048 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13049 that SIZE is equal to ROUNDED_SIZE. */
13051 if (size != rounded_size)
13052 emit_stack_probe (plus_constant (Pmode,
13053 gen_rtx_PLUS (Pmode,
13054 stack_pointer_rtx,
13055 sr.reg),
13056 rounded_size - size));
13058 release_scratch_register_on_entry (&sr);
13061 /* Make sure nothing is scheduled before we are done. */
13062 emit_insn (gen_blockage ());
13065 /* Probe a range of stack addresses from REG to END, inclusive. These are
13066 offsets from the current stack pointer. */
13068 const char *
13069 output_probe_stack_range (rtx reg, rtx end)
13071 static int labelno = 0;
13072 char loop_lab[32];
13073 rtx xops[3];
13075 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13077 /* Loop. */
13078 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13080 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13081 xops[0] = reg;
13082 xops[1] = GEN_INT (get_probe_interval ());
13083 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13085 /* Probe at TEST_ADDR. */
13086 xops[0] = stack_pointer_rtx;
13087 xops[1] = reg;
13088 xops[2] = const0_rtx;
13089 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13091 /* Test if TEST_ADDR == LAST_ADDR. */
13092 xops[0] = reg;
13093 xops[1] = end;
13094 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13096 /* Branch. */
13097 fputs ("\tjne\t", asm_out_file);
13098 assemble_name_raw (asm_out_file, loop_lab);
13099 fputc ('\n', asm_out_file);
13101 return "";
13104 /* Return true if a stack frame is required. Update STACK_ALIGNMENT
13105 to the largest alignment, in bits, of any stack slot used, if a stack
13106 frame is required and CHECK_STACK_SLOT is true. */
13108 static bool
13109 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
13110 bool check_stack_slot)
13112 HARD_REG_SET set_up_by_prologue, prologue_used;
13113 basic_block bb;
13115 CLEAR_HARD_REG_SET (prologue_used);
13116 CLEAR_HARD_REG_SET (set_up_by_prologue);
13117 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13118 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13119 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13120 HARD_FRAME_POINTER_REGNUM);
13122 /* The preferred stack alignment is the minimum stack alignment. */
13123 if (stack_alignment > crtl->preferred_stack_boundary)
13124 stack_alignment = crtl->preferred_stack_boundary;
13126 bool require_stack_frame = false;
13128 FOR_EACH_BB_FN (bb, cfun)
13130 rtx_insn *insn;
13131 FOR_BB_INSNS (bb, insn)
13132 if (NONDEBUG_INSN_P (insn)
13133 && requires_stack_frame_p (insn, prologue_used,
13134 set_up_by_prologue))
13136 require_stack_frame = true;
13138 if (check_stack_slot)
13140 /* Find the maximum stack alignment. */
13141 subrtx_iterator::array_type array;
13142 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
13143 if (MEM_P (*iter)
13144 && (reg_mentioned_p (stack_pointer_rtx,
13145 *iter)
13146 || reg_mentioned_p (frame_pointer_rtx,
13147 *iter)))
13149 unsigned int alignment = MEM_ALIGN (*iter);
13150 if (alignment > stack_alignment)
13151 stack_alignment = alignment;
13157 return require_stack_frame;
13160 /* Finalize the stack_realign_needed and frame_pointer_needed flags, which
13161 guide the prologue/epilogue so that it is generated in the correct form. */
13163 static void
13164 ix86_finalize_stack_frame_flags (void)
13166 /* Check whether stack realignment is really needed after reload, and
13167 store the result in cfun. */
13168 unsigned int incoming_stack_boundary
13169 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13170 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13171 unsigned int stack_alignment
13172 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13173 ? crtl->max_used_stack_slot_alignment
13174 : crtl->stack_alignment_needed);
13175 unsigned int stack_realign
13176 = (incoming_stack_boundary < stack_alignment);
13177 bool recompute_frame_layout_p = false;
13179 if (crtl->stack_realign_finalized)
13181 /* After stack_realign_needed is finalized, we can no longer
13182 change it. */
13183 gcc_assert (crtl->stack_realign_needed == stack_realign);
13184 return;
13187 /* If the only reason for frame_pointer_needed is that we conservatively
13188 assumed stack realignment might be needed or -fno-omit-frame-pointer
13189 is used, but in the end nothing that needed the stack alignment was
13190 spilled and there were no stack accesses, clear frame_pointer_needed
13191 and say we don't need stack realignment. */
13192 if ((stack_realign || !flag_omit_frame_pointer)
13193 && frame_pointer_needed
13194 && crtl->is_leaf
13195 && crtl->sp_is_unchanging
13196 && !ix86_current_function_calls_tls_descriptor
13197 && !crtl->accesses_prior_frames
13198 && !cfun->calls_alloca
13199 && !crtl->calls_eh_return
13200 /* See ira_setup_eliminable_regset for the rationale. */
13201 && !(STACK_CHECK_MOVING_SP
13202 && flag_stack_check
13203 && flag_exceptions
13204 && cfun->can_throw_non_call_exceptions)
13205 && !ix86_frame_pointer_required ()
13206 && get_frame_size () == 0
13207 && ix86_nsaved_sseregs () == 0
13208 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13210 if (ix86_find_max_used_stack_alignment (stack_alignment,
13211 stack_realign))
13213 /* Stack frame is required. If stack alignment needed is less
13214 than incoming stack boundary, don't realign stack. */
13215 stack_realign = incoming_stack_boundary < stack_alignment;
13216 if (!stack_realign)
13218 crtl->max_used_stack_slot_alignment
13219 = incoming_stack_boundary;
13220 crtl->stack_alignment_needed
13221 = incoming_stack_boundary;
13222 /* Also update preferred_stack_boundary for leaf
13223 functions. */
13224 crtl->preferred_stack_boundary
13225 = incoming_stack_boundary;
13228 else
13230 /* If drap has been set, but it actually isn't live at the
13231 start of the function, there is no reason to set it up. */
13232 if (crtl->drap_reg)
13234 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13235 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
13236 REGNO (crtl->drap_reg)))
13238 crtl->drap_reg = NULL_RTX;
13239 crtl->need_drap = false;
13242 else
13243 cfun->machine->no_drap_save_restore = true;
13245 frame_pointer_needed = false;
13246 stack_realign = false;
13247 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13248 crtl->stack_alignment_needed = incoming_stack_boundary;
13249 crtl->stack_alignment_estimated = incoming_stack_boundary;
13250 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13251 crtl->preferred_stack_boundary = incoming_stack_boundary;
13252 df_finish_pass (true);
13253 df_scan_alloc (NULL);
13254 df_scan_blocks ();
13255 df_compute_regs_ever_live (true);
13256 df_analyze ();
13258 if (flag_var_tracking)
13260 /* Since frame pointer is no longer available, replace it with
13261 stack pointer - UNITS_PER_WORD in debug insns. */
13262 df_ref ref, next;
13263 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
13264 ref; ref = next)
13266 next = DF_REF_NEXT_REG (ref);
13267 if (!DF_REF_INSN_INFO (ref))
13268 continue;
13270 /* Make sure the next ref is for a different instruction,
13271 so that we're not affected by the rescan. */
13272 rtx_insn *insn = DF_REF_INSN (ref);
13273 while (next && DF_REF_INSN (next) == insn)
13274 next = DF_REF_NEXT_REG (next);
13276 if (DEBUG_INSN_P (insn))
13278 bool changed = false;
13279 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
13281 rtx *loc = DF_REF_LOC (ref);
13282 if (*loc == hard_frame_pointer_rtx)
13284 *loc = plus_constant (Pmode,
13285 stack_pointer_rtx,
13286 -UNITS_PER_WORD);
13287 changed = true;
13290 if (changed)
13291 df_insn_rescan (insn);
13296 recompute_frame_layout_p = true;
13299 else if (crtl->max_used_stack_slot_alignment
13300 > crtl->preferred_stack_boundary)
13302 /* We don't need to realign stack. But we still need to keep
13303 stack frame properly aligned to satisfy the largest alignment
13304 of stack slots. */
13305 if (ix86_find_max_used_stack_alignment (stack_alignment, true))
13306 cfun->machine->max_used_stack_alignment
13307 = stack_alignment / BITS_PER_UNIT;
13310 if (crtl->stack_realign_needed != stack_realign)
13311 recompute_frame_layout_p = true;
13312 crtl->stack_realign_needed = stack_realign;
13313 crtl->stack_realign_finalized = true;
13314 if (recompute_frame_layout_p)
13315 ix86_compute_frame_layout ();
13318 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13320 static void
13321 ix86_elim_entry_set_got (rtx reg)
13323 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13324 rtx_insn *c_insn = BB_HEAD (bb);
13325 if (!NONDEBUG_INSN_P (c_insn))
13326 c_insn = next_nonnote_nondebug_insn (c_insn);
13327 if (c_insn && NONJUMP_INSN_P (c_insn))
13329 rtx pat = PATTERN (c_insn);
13330 if (GET_CODE (pat) == PARALLEL)
13332 rtx vec = XVECEXP (pat, 0, 0);
13333 if (GET_CODE (vec) == SET
13334 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13335 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13336 delete_insn (c_insn);
13341 static rtx
13342 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
13344 rtx addr, mem;
13346 if (offset)
13347 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
13348 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
13349 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
13352 static inline rtx
13353 gen_frame_load (rtx reg, rtx frame_reg, int offset)
13355 return gen_frame_set (reg, frame_reg, offset, false);
13358 static inline rtx
13359 gen_frame_store (rtx reg, rtx frame_reg, int offset)
13361 return gen_frame_set (reg, frame_reg, offset, true);
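/* Save the clobbered MS-ABI registers via the out-of-line ms2sysv save stub:
   set up RAX as the stub's base pointer and emit one frame-related PARALLEL
   containing a USE of the stub symbol plus the frame stores it performs. */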
13364 static void
13365 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
13367 struct machine_function *m = cfun->machine;
13368 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13369 + m->call_ms2sysv_extra_regs;
13370 rtvec v = rtvec_alloc (ncregs + 1);
13371 unsigned int align, i, vi = 0;
13372 rtx_insn *insn;
13373 rtx sym, addr;
13374 rtx rax = gen_rtx_REG (word_mode, AX_REG);
13375 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13377 /* AL should only be live with sysv_abi. */
13378 gcc_assert (!ix86_eax_live_at_start_p ());
13379 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
13381 /* Set up RAX as the stub's base pointer. We use stack_realign_offset so
13382 the address is correct whether or not we've actually realigned the stack. */
13383 align = GET_MODE_ALIGNMENT (V4SFmode);
13384 addr = choose_baseaddr (frame.stack_realign_offset
13385 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
13386 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13388 emit_insn (gen_rtx_SET (rax, addr));
13390 /* Get the stub symbol. */
13391 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
13392 : XLOGUE_STUB_SAVE);
13393 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13395 for (i = 0; i < ncregs; ++i)
13397 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13398 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
13399 r.regno);
13400 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
13403 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
13405 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
13406 RTX_FRAME_RELATED_P (insn) = true;
13409 /* Expand the prologue into a bunch of separate insns. */
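/* Broadly, the steps below are: handle the ms_hook_prologue and
   static-chain-on-stack special cases, set up DRAP when the incoming stack
   must be realigned (stack_realign_drap), push the frame pointer and save
   the integer registers, perform frame-pointer-based realignment
   (stack_realign_fp), probe and allocate the stack frame, establish the
   frame pointer if that hasn't happened yet, save any remaining integer and
   SSE registers (possibly via the ms2sysv stub), and finally emit the PIC
   register set-up and the scheduling barriers some configurations need. */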
13411 void
13412 ix86_expand_prologue (void)
13414 struct machine_function *m = cfun->machine;
13415 rtx insn, t;
13416 HOST_WIDE_INT allocate;
13417 bool int_registers_saved;
13418 bool sse_registers_saved;
13419 bool save_stub_call_needed;
13420 rtx static_chain = NULL_RTX;
13422 if (ix86_function_naked (current_function_decl))
13423 return;
13425 ix86_finalize_stack_frame_flags ();
13427 /* DRAP should not coexist with stack_realign_fp */
13428 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13430 memset (&m->fs, 0, sizeof (m->fs));
13432 /* Initialize CFA state for before the prologue. */
13433 m->fs.cfa_reg = stack_pointer_rtx;
13434 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13436 /* Track SP offset to the CFA. We continue tracking this after we've
13437 swapped the CFA register away from SP. In the case of re-alignment
13438 this is fudged; we're interested in offsets within the local frame. */
13439 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13440 m->fs.sp_valid = true;
13441 m->fs.sp_realigned = false;
13443 const struct ix86_frame &frame = cfun->machine->frame;
13445 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13447 /* We should have already generated an error for any use of
13448 ms_hook on a nested function. */
13449 gcc_checking_assert (!ix86_static_chain_on_stack);
13451 /* Check if profiling is active and we shall use the
13452 profiling-before-prologue variant. If so, sorry. */
13453 if (crtl->profile && flag_fentry != 0)
13454 sorry ("ms_hook_prologue attribute isn%'t compatible "
13455 "with -mfentry for 32-bit");
13457 /* In ix86_asm_output_function_label we emitted:
13458 8b ff movl.s %edi,%edi
13459 55 push %ebp
13460 8b ec movl.s %esp,%ebp
13462 This matches the hookable function prologue in Win32 API
13463 functions in Microsoft Windows XP Service Pack 2 and newer.
13464 Wine uses this to enable Windows apps to hook the Win32 API
13465 functions provided by Wine.
13467 What that means is that we've already set up the frame pointer. */
13469 if (frame_pointer_needed
13470 && !(crtl->drap_reg && crtl->stack_realign_needed))
13472 rtx push, mov;
13474 /* We've decided to use the frame pointer already set up.
13475 Describe this to the unwinder by pretending that both
13476 push and mov insns happen right here.
13478 Putting the unwind info here at the end of the ms_hook
13479 is done so that we can make absolutely certain we get
13480 the required byte sequence at the start of the function,
13481 rather than relying on an assembler that can produce
13482 the exact encoding required.
13484 However it does mean (in the unpatched case) that we have
13485 a 1 insn window where the asynchronous unwind info is
13486 incorrect. However, if we placed the unwind info at
13487 its correct location we would have incorrect unwind info
13488 in the patched case. Which is probably all moot since
13489 I don't expect Wine generates dwarf2 unwind info for the
13490 system libraries that use this feature. */
13492 insn = emit_insn (gen_blockage ());
13494 push = gen_push (hard_frame_pointer_rtx);
13495 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13496 stack_pointer_rtx);
13497 RTX_FRAME_RELATED_P (push) = 1;
13498 RTX_FRAME_RELATED_P (mov) = 1;
13500 RTX_FRAME_RELATED_P (insn) = 1;
13501 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13502 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13504 /* Note that gen_push incremented m->fs.cfa_offset, even
13505 though we didn't emit the push insn here. */
13506 m->fs.cfa_reg = hard_frame_pointer_rtx;
13507 m->fs.fp_offset = m->fs.cfa_offset;
13508 m->fs.fp_valid = true;
13510 else
13512 /* The frame pointer is not needed so pop %ebp again.
13513 This leaves us with a pristine state. */
13514 emit_insn (gen_pop (hard_frame_pointer_rtx));
13518 /* The first insn of a function that accepts its static chain on the
13519 stack is to push the register that would be filled in by a direct
13520 call. This insn will be skipped by the trampoline. */
13521 else if (ix86_static_chain_on_stack)
13523 static_chain = ix86_static_chain (cfun->decl, false);
13524 insn = emit_insn (gen_push (static_chain));
13525 emit_insn (gen_blockage ());
13527 /* We don't want to interpret this push insn as a register save,
13528 only as a stack adjustment. The real copy of the register as
13529 a save will be done later, if needed. */
13530 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13531 t = gen_rtx_SET (stack_pointer_rtx, t);
13532 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13533 RTX_FRAME_RELATED_P (insn) = 1;
13536 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13537 DRAP is needed and stack realignment is really needed after reload. */
13538 if (stack_realign_drap)
13540 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13542 /* Can't use DRAP in interrupt function. */
13543 if (cfun->machine->func_type != TYPE_NORMAL)
13544 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13545 "in interrupt service routine. This may be worked "
13546 "around by avoiding functions with aggregate return.");
13548 /* Only need to push parameter pointer reg if it is caller saved. */
13549 if (!call_used_regs[REGNO (crtl->drap_reg)])
13551 /* Push arg pointer reg */
13552 insn = emit_insn (gen_push (crtl->drap_reg));
13553 RTX_FRAME_RELATED_P (insn) = 1;
13556 /* Grab the argument pointer. */
13557 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13558 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13559 RTX_FRAME_RELATED_P (insn) = 1;
13560 m->fs.cfa_reg = crtl->drap_reg;
13561 m->fs.cfa_offset = 0;
13563 /* Align the stack. */
13564 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13565 stack_pointer_rtx,
13566 GEN_INT (-align_bytes)));
13567 RTX_FRAME_RELATED_P (insn) = 1;
13569 /* Replicate the return address on the stack so that return
13570 address can be reached via (argp - 1) slot. This is needed
13571 to implement macro RETURN_ADDR_RTX and intrinsic function
13572 expand_builtin_return_addr etc. */
13573 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13574 t = gen_frame_mem (word_mode, t);
13575 insn = emit_insn (gen_push (t));
13576 RTX_FRAME_RELATED_P (insn) = 1;
13578 /* For the purposes of frame and register save area addressing,
13579 we've started over with a new frame. */
13580 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13581 m->fs.realigned = true;
13583 if (static_chain)
13585 /* Replicate static chain on the stack so that static chain
13586 can be reached via (argp - 2) slot. This is needed for
13587 nested function with stack realignment. */
13588 insn = emit_insn (gen_push (static_chain));
13589 RTX_FRAME_RELATED_P (insn) = 1;
13593 int_registers_saved = (frame.nregs == 0);
13594 sse_registers_saved = (frame.nsseregs == 0);
13595 save_stub_call_needed = (m->call_ms2sysv);
13596 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13598 if (frame_pointer_needed && !m->fs.fp_valid)
13600 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13601 slower on all targets. Also sdb didn't like it. */
13602 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13603 RTX_FRAME_RELATED_P (insn) = 1;
13605 /* Push registers now, before setting the frame pointer
13606 on SEH target. */
13607 if (!int_registers_saved
13608 && TARGET_SEH
13609 && !frame.save_regs_using_mov)
13611 ix86_emit_save_regs ();
13612 int_registers_saved = true;
13613 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13616 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13618 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13619 RTX_FRAME_RELATED_P (insn) = 1;
13621 if (m->fs.cfa_reg == stack_pointer_rtx)
13622 m->fs.cfa_reg = hard_frame_pointer_rtx;
13623 m->fs.fp_offset = m->fs.sp_offset;
13624 m->fs.fp_valid = true;
13628 if (!int_registers_saved)
13630 /* If saving registers via PUSH, do so now. */
13631 if (!frame.save_regs_using_mov)
13633 ix86_emit_save_regs ();
13634 int_registers_saved = true;
13635 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13638 /* When using the red zone we may start register saving before allocating
13639 the stack frame, saving one cycle of the prologue. However, avoid
13640 doing this if we have to probe the stack; at least on x86_64 the
13641 stack probe can turn into a call that clobbers a red zone location. */
13642 else if (ix86_using_red_zone ()
13643 && (! TARGET_STACK_PROBE
13644 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13646 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13647 int_registers_saved = true;
13651 if (stack_realign_fp)
13653 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13654 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13656 /* Record last valid frame pointer offset. */
13657 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13659 /* The computation of the size of the re-aligned stack frame means
13660 that we must allocate the size of the register save area before
13661 performing the actual alignment. Otherwise we cannot guarantee
13662 that there's enough storage above the realignment point. */
13663 allocate = frame.reg_save_offset - m->fs.sp_offset
13664 + frame.stack_realign_allocate;
13665 if (allocate)
13666 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13667 GEN_INT (-allocate), -1, false);
13669 /* Align the stack. */
13670 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13671 stack_pointer_rtx,
13672 GEN_INT (-align_bytes)));
13673 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13674 m->fs.sp_realigned_offset = m->fs.sp_offset
13675 - frame.stack_realign_allocate;
13676 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13677 Beyond this point, stack access should be done via choose_baseaddr or
13678 by using sp_valid_at and fp_valid_at to determine the correct base
13679 register. Henceforth, any CFA offset should be thought of as logical
13680 and not physical. */
13681 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13682 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13683 m->fs.sp_realigned = true;
13685 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13686 is needed to describe where a register is saved using a realigned
13687 stack pointer, so we need to invalidate the stack pointer for that
13688 target. */
13689 if (TARGET_SEH)
13690 m->fs.sp_valid = false;
13692 /* If SP offset is non-immediate after allocation of the stack frame,
13693 then emit SSE saves or stub call prior to allocating the rest of the
13694 stack frame. This is less efficient for the out-of-line stub because
13695 we can't combine allocations across the call barrier, but it's better
13696 than using a scratch register. */
13697 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13698 - m->fs.sp_realigned_offset),
13699 Pmode))
13701 if (!sse_registers_saved)
13703 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13704 sse_registers_saved = true;
13706 else if (save_stub_call_needed)
13708 ix86_emit_outlined_ms2sysv_save (frame);
13709 save_stub_call_needed = false;
13714 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13716 if (flag_stack_usage_info)
13718 /* We start to count from ARG_POINTER. */
13719 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13721 /* If it was realigned, take into account the fake frame. */
13722 if (stack_realign_drap)
13724 if (ix86_static_chain_on_stack)
13725 stack_size += UNITS_PER_WORD;
13727 if (!call_used_regs[REGNO (crtl->drap_reg)])
13728 stack_size += UNITS_PER_WORD;
13730 /* This over-estimates by 1 minimal-stack-alignment-unit but
13731 mitigates that by counting in the new return address slot. */
13732 current_function_dynamic_stack_size
13733 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13736 current_function_static_stack_size = stack_size;
13739 /* On SEH target with very large frame size, allocate an area to save
13740 SSE registers (as the very large allocation won't be described). */
13741 if (TARGET_SEH
13742 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13743 && !sse_registers_saved)
13745 HOST_WIDE_INT sse_size =
13746 frame.sse_reg_save_offset - frame.reg_save_offset;
13748 gcc_assert (int_registers_saved);
13750 /* No need to do stack checking as the area will be immediately
13751 written. */
13752 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13753 GEN_INT (-sse_size), -1,
13754 m->fs.cfa_reg == stack_pointer_rtx);
13755 allocate -= sse_size;
13756 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13757 sse_registers_saved = true;
13760 /* The stack has already been decremented by the instruction calling us
13761 so probe if the size is non-negative to preserve the protection area. */
13762 if (allocate >= 0
13763 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13764 || flag_stack_clash_protection))
13766 if (flag_stack_clash_protection)
13768 ix86_adjust_stack_and_probe_stack_clash (allocate,
13769 int_registers_saved);
13770 allocate = 0;
13772 else if (STACK_CHECK_MOVING_SP)
13774 if (!(crtl->is_leaf && !cfun->calls_alloca
13775 && allocate <= get_probe_interval ()))
13777 ix86_adjust_stack_and_probe (allocate, int_registers_saved);
13778 allocate = 0;
13781 else
13783 HOST_WIDE_INT size = allocate;
13785 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13786 size = 0x80000000 - get_stack_check_protect () - 1;
13788 if (TARGET_STACK_PROBE)
13790 if (crtl->is_leaf && !cfun->calls_alloca)
13792 if (size > get_probe_interval ())
13793 ix86_emit_probe_stack_range (0, size, int_registers_saved);
13795 else
13796 ix86_emit_probe_stack_range (0,
13797 size + get_stack_check_protect (),
13798 int_registers_saved);
13800 else
13802 if (crtl->is_leaf && !cfun->calls_alloca)
13804 if (size > get_probe_interval ()
13805 && size > get_stack_check_protect ())
13806 ix86_emit_probe_stack_range (get_stack_check_protect (),
13807 (size
13808 - get_stack_check_protect ()),
13809 int_registers_saved);
13811 else
13812 ix86_emit_probe_stack_range (get_stack_check_protect (), size,
13813 int_registers_saved);
13818 if (allocate == 0)
13820 else if (!ix86_target_stack_probe ()
13821 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13823 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13824 GEN_INT (-allocate), -1,
13825 m->fs.cfa_reg == stack_pointer_rtx);
13827 else
13829 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13830 rtx r10 = NULL;
13831 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13832 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13833 bool eax_live = ix86_eax_live_at_start_p ();
13834 bool r10_live = false;
13836 if (TARGET_64BIT)
13837 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13839 if (eax_live)
13841 insn = emit_insn (gen_push (eax));
13842 allocate -= UNITS_PER_WORD;
13843 /* Note that SEH directives need to continue tracking the stack
13844 pointer even after the frame pointer has been set up. */
13845 if (sp_is_cfa_reg || TARGET_SEH)
13847 if (sp_is_cfa_reg)
13848 m->fs.cfa_offset += UNITS_PER_WORD;
13849 RTX_FRAME_RELATED_P (insn) = 1;
13850 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13851 gen_rtx_SET (stack_pointer_rtx,
13852 plus_constant (Pmode, stack_pointer_rtx,
13853 -UNITS_PER_WORD)));
13857 if (r10_live)
13859 r10 = gen_rtx_REG (Pmode, R10_REG);
13860 insn = emit_insn (gen_push (r10));
13861 allocate -= UNITS_PER_WORD;
13862 if (sp_is_cfa_reg || TARGET_SEH)
13864 if (sp_is_cfa_reg)
13865 m->fs.cfa_offset += UNITS_PER_WORD;
13866 RTX_FRAME_RELATED_P (insn) = 1;
13867 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13868 gen_rtx_SET (stack_pointer_rtx,
13869 plus_constant (Pmode, stack_pointer_rtx,
13870 -UNITS_PER_WORD)));
13874 emit_move_insn (eax, GEN_INT (allocate));
13875 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13877 /* Use the fact that AX still contains ALLOCATE. */
13878 adjust_stack_insn = (Pmode == DImode
13879 ? gen_pro_epilogue_adjust_stack_di_sub
13880 : gen_pro_epilogue_adjust_stack_si_sub);
13882 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13883 stack_pointer_rtx, eax));
13885 if (sp_is_cfa_reg || TARGET_SEH)
13887 if (sp_is_cfa_reg)
13888 m->fs.cfa_offset += allocate;
13889 RTX_FRAME_RELATED_P (insn) = 1;
13890 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13891 gen_rtx_SET (stack_pointer_rtx,
13892 plus_constant (Pmode, stack_pointer_rtx,
13893 -allocate)));
13895 m->fs.sp_offset += allocate;
13897 /* Use stack_pointer_rtx for relative addressing so that code
13898 works for realigned stack, too. */
13899 if (r10_live && eax_live)
13901 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13902 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13903 gen_frame_mem (word_mode, t));
13904 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13905 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13906 gen_frame_mem (word_mode, t));
13908 else if (eax_live || r10_live)
13910 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13911 emit_move_insn (gen_rtx_REG (word_mode,
13912 (eax_live ? AX_REG : R10_REG)),
13913 gen_frame_mem (word_mode, t));
13916 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13918 /* If we haven't already set up the frame pointer, do so now. */
13919 if (frame_pointer_needed && !m->fs.fp_valid)
13921 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13922 GEN_INT (frame.stack_pointer_offset
13923 - frame.hard_frame_pointer_offset));
13924 insn = emit_insn (insn);
13925 RTX_FRAME_RELATED_P (insn) = 1;
13926 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13928 if (m->fs.cfa_reg == stack_pointer_rtx)
13929 m->fs.cfa_reg = hard_frame_pointer_rtx;
13930 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13931 m->fs.fp_valid = true;
13934 if (!int_registers_saved)
13935 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13936 if (!sse_registers_saved)
13937 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13938 else if (save_stub_call_needed)
13939 ix86_emit_outlined_ms2sysv_save (frame);
13941 /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
13942 in the prologue. */
13943 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13945 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13946 insn = emit_insn (gen_set_got (pic));
13947 RTX_FRAME_RELATED_P (insn) = 1;
13948 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13949 emit_insn (gen_prologue_use (pic));
13950 /* Delete an already emitted SET_GOT if it exists and is allocated to
13951 REAL_PIC_OFFSET_TABLE_REGNUM. */
13952 ix86_elim_entry_set_got (pic);
13955 if (crtl->drap_reg && !crtl->stack_realign_needed)
13957 /* vDRAP is set up, but after reload it turns out stack realignment
13958 isn't necessary; emit the prologue here to set up DRAP without
13959 the stack realignment adjustment. */
13960 t = choose_baseaddr (0, NULL);
13961 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13964 /* Prevent instructions from being scheduled into register save push
13965 sequence when access to the redzone area is done through frame pointer.
13966 The offset between the frame pointer and the stack pointer is calculated
13967 relative to the value of the stack pointer at the end of the function
13968 prologue, and moving instructions that access redzone area via frame
13969 pointer inside push sequence violates this assumption. */
13970 if (frame_pointer_needed && frame.red_zone_size)
13971 emit_insn (gen_memory_blockage ());
13973 /* SEH requires that the prologue end within 256 bytes of the start of
13974 the function. Prevent instruction schedules that would extend that.
13975 Further, prevent alloca modifications to the stack pointer from being
13976 combined with prologue modifications. */
13977 if (TARGET_SEH)
13978 emit_insn (gen_prologue_use (stack_pointer_rtx));
13981 /* Emit code to restore REG using a POP insn. */
13983 static void
13984 ix86_emit_restore_reg_using_pop (rtx reg)
13986 struct machine_function *m = cfun->machine;
13987 rtx_insn *insn = emit_insn (gen_pop (reg));
13989 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
13990 m->fs.sp_offset -= UNITS_PER_WORD;
13992 if (m->fs.cfa_reg == crtl->drap_reg
13993 && REGNO (reg) == REGNO (crtl->drap_reg))
13995 /* Previously we'd represented the CFA as an expression
13996 like *(%ebp - 8). We've just popped that value from
13997 the stack, which means we need to reset the CFA to
13998 the drap register. This will remain until we restore
13999 the stack pointer. */
14000 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14001 RTX_FRAME_RELATED_P (insn) = 1;
14003 /* This means that the DRAP register is valid for addressing too. */
14004 m->fs.drap_valid = true;
14005 return;
14008 if (m->fs.cfa_reg == stack_pointer_rtx)
14010 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14011 x = gen_rtx_SET (stack_pointer_rtx, x);
14012 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14013 RTX_FRAME_RELATED_P (insn) = 1;
14015 m->fs.cfa_offset -= UNITS_PER_WORD;
14018 /* When the frame pointer is the CFA, and we pop it, we are
14019 swapping back to the stack pointer as the CFA. This happens
14020 for stack frames that don't allocate other data, so we assume
14021 the stack pointer is now pointing at the return address, i.e.
14022 the function entry state, which makes the offset be 1 word. */
14023 if (reg == hard_frame_pointer_rtx)
14025 m->fs.fp_valid = false;
14026 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14028 m->fs.cfa_reg = stack_pointer_rtx;
14029 m->fs.cfa_offset -= UNITS_PER_WORD;
14031 add_reg_note (insn, REG_CFA_DEF_CFA,
14032 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14033 GEN_INT (m->fs.cfa_offset)));
14034 RTX_FRAME_RELATED_P (insn) = 1;
14039 /* Emit code to restore saved registers using POP insns. */
14041 static void
14042 ix86_emit_restore_regs_using_pop (void)
14044 unsigned int regno;
14046 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14047 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14048 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14051 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
14052 omits the emit and only attaches the notes. */
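/* Note that leave is equivalent to "mov %ebp, %esp; pop %ebp" (or the
   64-bit counterparts), which is why the frame state below switches the
   valid base back to the stack pointer and sets sp_offset to
   fp_offset - UNITS_PER_WORD. */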
14054 static void
14055 ix86_emit_leave (rtx_insn *insn)
14057 struct machine_function *m = cfun->machine;
14058 if (!insn)
14059 insn = emit_insn (ix86_gen_leave ());
14061 ix86_add_queued_cfa_restore_notes (insn);
14063 gcc_assert (m->fs.fp_valid);
14064 m->fs.sp_valid = true;
14065 m->fs.sp_realigned = false;
14066 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14067 m->fs.fp_valid = false;
14069 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14071 m->fs.cfa_reg = stack_pointer_rtx;
14072 m->fs.cfa_offset = m->fs.sp_offset;
14074 add_reg_note (insn, REG_CFA_DEF_CFA,
14075 plus_constant (Pmode, stack_pointer_rtx,
14076 m->fs.sp_offset));
14077 RTX_FRAME_RELATED_P (insn) = 1;
14079 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14080 m->fs.fp_offset);
14083 /* Emit code to restore saved registers using MOV insns.
14084 First register is restored from CFA - CFA_OFFSET. */
14085 static void
14086 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14087 bool maybe_eh_return)
14089 struct machine_function *m = cfun->machine;
14090 unsigned int regno;
14092 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14093 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14095 rtx reg = gen_rtx_REG (word_mode, regno);
14096 rtx mem;
14097 rtx_insn *insn;
14099 mem = choose_baseaddr (cfa_offset, NULL);
14100 mem = gen_frame_mem (word_mode, mem);
14101 insn = emit_move_insn (reg, mem);
14103 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14105 /* Previously we'd represented the CFA as an expression
14106 like *(%ebp - 8). We've just restored that value from
14107 the stack, which means we need to reset the CFA to
14108 the drap register. This will remain until we restore
14109 the stack pointer. */
14110 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14111 RTX_FRAME_RELATED_P (insn) = 1;
14113 /* This means that the DRAP register is valid for addressing. */
14114 m->fs.drap_valid = true;
14116 else
14117 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14119 cfa_offset -= UNITS_PER_WORD;
14123 /* Emit code to restore saved SSE registers using MOV insns.
14124 First register is restored from CFA - CFA_OFFSET. */
14125 static void
14126 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14127 bool maybe_eh_return)
14129 unsigned int regno;
14131 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14132 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14134 rtx reg = gen_rtx_REG (V4SFmode, regno);
14135 rtx mem;
14136 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
14138 mem = choose_baseaddr (cfa_offset, &align);
14139 mem = gen_rtx_MEM (V4SFmode, mem);
14141 /* The location alignment depends upon the base register. */
14142 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
14143 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
14144 set_mem_align (mem, align);
14145 emit_insn (gen_rtx_SET (reg, mem));
14147 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14149 cfa_offset -= GET_MODE_SIZE (V4SFmode);
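/* Restore the clobbered MS-ABI registers via the out-of-line ms2sysv
   restore stub. RSI is set up as the stub's base pointer; depending on
   USE_CALL the stub is either called, or jumped to as the tail of the
   epilogue (the *_TAIL stub variants, which also perform the return). */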
14153 static void
14154 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
14155 bool use_call, int style)
14157 struct machine_function *m = cfun->machine;
14158 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14159 + m->call_ms2sysv_extra_regs;
14160 rtvec v;
14161 unsigned int elems_needed, align, i, vi = 0;
14162 rtx_insn *insn;
14163 rtx sym, tmp;
14164 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
14165 rtx r10 = NULL_RTX;
14166 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14167 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
14168 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
14169 rtx rsi_frame_load = NULL_RTX;
14170 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
14171 enum xlogue_stub stub;
14173 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
14175 /* If using a realigned stack, we should never start with padding. */
14176 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
14178 /* Setup RSI as the stub's base pointer. */
14179 align = GET_MODE_ALIGNMENT (V4SFmode);
14180 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
14181 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14183 emit_insn (gen_rtx_SET (rsi, tmp));
14185 /* Get a symbol for the stub. */
14186 if (frame_pointer_needed)
14187 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
14188 : XLOGUE_STUB_RESTORE_HFP_TAIL;
14189 else
14190 stub = use_call ? XLOGUE_STUB_RESTORE
14191 : XLOGUE_STUB_RESTORE_TAIL;
14192 sym = xlogue.get_stub_rtx (stub);
14194 elems_needed = ncregs;
14195 if (use_call)
14196 elems_needed += 1;
14197 else
14198 elems_needed += frame_pointer_needed ? 5 : 3;
14199 v = rtvec_alloc (elems_needed);
14201 /* We call the epilogue stub when we need to pop incoming args or we are
14202 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
14203 epilogue stub and the stub itself becomes the tail call. */
14204 if (use_call)
14205 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14206 else
14208 RTVEC_ELT (v, vi++) = ret_rtx;
14209 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14210 if (frame_pointer_needed)
14212 rtx rbp = gen_rtx_REG (DImode, BP_REG);
14213 gcc_assert (m->fs.fp_valid);
14214 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
14216 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
14217 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
14218 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
14219 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
14220 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
14222 else
14224 /* If no hard frame pointer, we set R10 to the SP restore value. */
14225 gcc_assert (!m->fs.fp_valid);
14226 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14227 gcc_assert (m->fs.sp_valid);
14229 r10 = gen_rtx_REG (DImode, R10_REG);
14230 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
14231 emit_insn (gen_rtx_SET (r10, tmp));
14233 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
14237 /* Generate frame load insns and restore notes. */
14238 for (i = 0; i < ncregs; ++i)
14240 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14241 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
14242 rtx reg, frame_load;
14244 reg = gen_rtx_REG (mode, r.regno);
14245 frame_load = gen_frame_load (reg, rsi, r.offset);
14247 /* Save RSI frame load insn & note to add last. */
14248 if (r.regno == SI_REG)
14250 gcc_assert (!rsi_frame_load);
14251 rsi_frame_load = frame_load;
14252 rsi_restore_offset = r.offset;
14254 else
14256 RTVEC_ELT (v, vi++) = frame_load;
14257 ix86_add_cfa_restore_note (NULL, reg, r.offset);
14261 /* Add RSI frame load & restore note at the end. */
14262 gcc_assert (rsi_frame_load);
14263 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
14264 RTVEC_ELT (v, vi++) = rsi_frame_load;
14265 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
14266 rsi_restore_offset);
14268 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
14269 if (!use_call && !frame_pointer_needed)
14271 gcc_assert (m->fs.sp_valid);
14272 gcc_assert (!m->fs.sp_realigned);
14274 /* At this point, R10 should point to frame.stack_realign_offset. */
14275 if (m->fs.cfa_reg == stack_pointer_rtx)
14276 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
14277 m->fs.sp_offset = frame.stack_realign_offset;
14280 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
14281 tmp = gen_rtx_PARALLEL (VOIDmode, v);
14282 if (use_call)
14283 insn = emit_insn (tmp);
14284 else
14286 insn = emit_jump_insn (tmp);
14287 JUMP_LABEL (insn) = ret_rtx;
14289 if (frame_pointer_needed)
14290 ix86_emit_leave (insn);
14291 else
14293 /* Need CFA adjust note. */
14294 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
14295 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
14299 RTX_FRAME_RELATED_P (insn) = true;
14300 ix86_add_queued_cfa_restore_notes (insn);
14302 /* If we're not doing a tail-call, we need to adjust the stack. */
14303 if (use_call && m->fs.sp_valid)
14305 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
14306 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14307 GEN_INT (dealloc), style,
14308 m->fs.cfa_reg == stack_pointer_rtx);
14312 /* Restore function stack, frame, and registers. */
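/* Broadly, the steps below are: re-derive the frame state left by the
   prologue, decide whether registers are restored with moves or pops,
   restore the SSE registers and any ms2sysv stub-saved registers, undo
   stack realignment or the DRAP adjustment, deallocate the remaining
   frame, and emit the appropriate return (simple, argument-popping, or an
   indirect jump for eh_return with control-flow protection). */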
14314 void
14315 ix86_expand_epilogue (int style)
14317 struct machine_function *m = cfun->machine;
14318 struct machine_frame_state frame_state_save = m->fs;
14319 bool restore_regs_via_mov;
14320 bool using_drap;
14321 bool restore_stub_is_tail = false;
14323 if (ix86_function_naked (current_function_decl))
14325 /* The program should not reach this point. */
14326 emit_insn (gen_ud2 ());
14327 return;
14330 ix86_finalize_stack_frame_flags ();
14331 const struct ix86_frame &frame = cfun->machine->frame;
14333 m->fs.sp_realigned = stack_realign_fp;
14334 m->fs.sp_valid = stack_realign_fp
14335 || !frame_pointer_needed
14336 || crtl->sp_is_unchanging;
14337 gcc_assert (!m->fs.sp_valid
14338 || m->fs.sp_offset == frame.stack_pointer_offset);
14340 /* The FP must be valid if the frame pointer is present. */
14341 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14342 gcc_assert (!m->fs.fp_valid
14343 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14345 /* We must have *some* valid pointer to the stack frame. */
14346 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14348 /* The DRAP is never valid at this point. */
14349 gcc_assert (!m->fs.drap_valid);
14351 /* See the comment about red zone and frame
14352 pointer usage in ix86_expand_prologue. */
14353 if (frame_pointer_needed && frame.red_zone_size)
14354 emit_insn (gen_memory_blockage ());
14356 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14357 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14359 /* Determine the CFA offset of the end of the red-zone. */
14360 m->fs.red_zone_offset = 0;
14361 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14363 /* The red-zone begins below return address and error code in
14364 exception handler. */
14365 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
14367 /* When the register save area is in the aligned portion of
14368 the stack, determine the maximum runtime displacement that
14369 matches up with the aligned frame. */
14370 if (stack_realign_drap)
14371 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14372 + UNITS_PER_WORD);
14375 HOST_WIDE_INT reg_save_offset = frame.reg_save_offset;
14377 /* Special care must be taken for the normal return case of a function
14378 using eh_return: the eax and edx registers are marked as saved, but
14379 not restored along this path. Adjust the save location to match. */
14380 if (crtl->calls_eh_return && style != 2)
14381 reg_save_offset -= 2 * UNITS_PER_WORD;
14383 /* EH_RETURN requires the use of moves to function properly. */
14384 if (crtl->calls_eh_return)
14385 restore_regs_via_mov = true;
14386 /* SEH requires the use of pops to identify the epilogue. */
14387 else if (TARGET_SEH)
14388 restore_regs_via_mov = false;
14389 /* If we're only restoring one register and sp cannot be used then
14390 use a move instruction to restore the register, since it's
14391 less work than reloading sp and popping the register. */
14392 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
14393 restore_regs_via_mov = true;
14394 else if (TARGET_EPILOGUE_USING_MOVE
14395 && cfun->machine->use_fast_prologue_epilogue
14396 && (frame.nregs > 1
14397 || m->fs.sp_offset != reg_save_offset))
14398 restore_regs_via_mov = true;
14399 else if (frame_pointer_needed
14400 && !frame.nregs
14401 && m->fs.sp_offset != reg_save_offset)
14402 restore_regs_via_mov = true;
14403 else if (frame_pointer_needed
14404 && TARGET_USE_LEAVE
14405 && cfun->machine->use_fast_prologue_epilogue
14406 && frame.nregs == 1)
14407 restore_regs_via_mov = true;
14408 else
14409 restore_regs_via_mov = false;
14411 if (restore_regs_via_mov || frame.nsseregs)
14413 /* Ensure that the entire register save area is addressable via
14414 the stack pointer, if we will restore SSE regs via sp. */
14415 if (TARGET_64BIT
14416 && m->fs.sp_offset > 0x7fffffff
14417 && sp_valid_at (frame.stack_realign_offset + 1)
14418 && (frame.nsseregs + frame.nregs) != 0)
14420 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14421 GEN_INT (m->fs.sp_offset
14422 - frame.sse_reg_save_offset),
14423 style,
14424 m->fs.cfa_reg == stack_pointer_rtx);
14428 /* If there are any SSE registers to restore, then we have to do it
14429 via moves, since there's obviously no pop for SSE regs. */
14430 if (frame.nsseregs)
14431 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14432 style == 2);
14434 if (m->call_ms2sysv)
14436 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
14438 /* We cannot use a tail-call for the stub if:
14439 1. We have to pop incoming args,
14440 2. We have additional int regs to restore, or
14441 3. A sibling call will be the tail-call, or
14442 4. We are emitting an eh_return_internal epilogue.
14444 TODO: Item 4 has not yet been tested!
14446 If any of the above are true, we will call the stub rather than
14447 jump to it. */
14448 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
14449 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
14452 /* If using an out-of-line stub that is a tail call, then... */
14453 if (m->call_ms2sysv && restore_stub_is_tail)
14455 /* TODO: paranoid tests. (remove eventually) */
14456 gcc_assert (m->fs.sp_valid);
14457 gcc_assert (!m->fs.sp_realigned);
14458 gcc_assert (!m->fs.fp_valid);
14459 gcc_assert (!m->fs.realigned);
14460 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
14461 gcc_assert (!crtl->drap_reg);
14462 gcc_assert (!frame.nregs);
14464 else if (restore_regs_via_mov)
14466 rtx t;
14468 if (frame.nregs)
14469 ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2);
14471 /* eh_return epilogues need %ecx added to the stack pointer. */
14472 if (style == 2)
14474 rtx sa = EH_RETURN_STACKADJ_RTX;
14475 rtx_insn *insn;
14477 /* %ecx can't be used for both DRAP register and eh_return. */
14478 if (crtl->drap_reg)
14479 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14481 /* regparm nested functions don't work with eh_return. */
14482 gcc_assert (!ix86_static_chain_on_stack);
14484 if (frame_pointer_needed)
14486 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14487 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14488 emit_insn (gen_rtx_SET (sa, t));
14490 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14491 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14493 /* Note that we use SA as a temporary CFA, as the return
14494 address is at the proper place relative to it. We
14495 pretend this happens at the FP restore insn because
14496 prior to this insn the FP would be stored at the wrong
14497 offset relative to SA, and after this insn we have no
14498 other reasonable register to use for the CFA. We don't
14499 bother resetting the CFA to the SP for the duration of
14500 the return insn, unless the control flow instrumentation
14501 is done. In this case the SP is used later and we have
14502 to reset CFA to SP. */
14503 add_reg_note (insn, REG_CFA_DEF_CFA,
14504 plus_constant (Pmode, sa, UNITS_PER_WORD));
14505 ix86_add_queued_cfa_restore_notes (insn);
14506 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14507 RTX_FRAME_RELATED_P (insn) = 1;
14509 m->fs.cfa_reg = sa;
14510 m->fs.cfa_offset = UNITS_PER_WORD;
14511 m->fs.fp_valid = false;
14513 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14514 const0_rtx, style,
14515 flag_cf_protection);
14517 else
14519 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14520 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14521 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14522 ix86_add_queued_cfa_restore_notes (insn);
14524 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14525 if (m->fs.cfa_offset != UNITS_PER_WORD)
14527 m->fs.cfa_offset = UNITS_PER_WORD;
14528 add_reg_note (insn, REG_CFA_DEF_CFA,
14529 plus_constant (Pmode, stack_pointer_rtx,
14530 UNITS_PER_WORD));
14531 RTX_FRAME_RELATED_P (insn) = 1;
14534 m->fs.sp_offset = UNITS_PER_WORD;
14535 m->fs.sp_valid = true;
14536 m->fs.sp_realigned = false;
14539 else
14541 /* SEH requires that the function end with (1) a stack adjustment
14542 if necessary, (2) a sequence of pops, and (3) a return or
14543 jump instruction. Prevent insns from the function body from
14544 being scheduled into this sequence. */
14545 if (TARGET_SEH)
14547 /* Prevent a catch region from being adjacent to the standard
14548 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14549 several other flags that would be interesting to test are
14550 set up yet. */
14551 if (flag_non_call_exceptions)
14552 emit_insn (gen_nops (const1_rtx));
14553 else
14554 emit_insn (gen_blockage ());
14557 /* First step is to deallocate the stack frame so that we can
14558 pop the registers. If the stack pointer was realigned, it needs
14559 to be restored now. Also do it on SEH targets for very large
14560 frames, as the emitted instructions aren't allowed by the ABI
14561 in epilogues. */
14562 if (!m->fs.sp_valid || m->fs.sp_realigned
14563 || (TARGET_SEH
14564 && (m->fs.sp_offset - reg_save_offset
14565 >= SEH_MAX_FRAME_SIZE)))
14567 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14568 GEN_INT (m->fs.fp_offset
14569 - reg_save_offset),
14570 style, false);
14572 else if (m->fs.sp_offset != reg_save_offset)
14574 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14575 GEN_INT (m->fs.sp_offset
14576 - reg_save_offset),
14577 style,
14578 m->fs.cfa_reg == stack_pointer_rtx);
14581 ix86_emit_restore_regs_using_pop ();
14584 /* If we used a frame pointer and haven't already got rid of it,
14585 then do so now. */
14586 if (m->fs.fp_valid)
14588 /* If the stack pointer is valid and pointing at the frame
14589 pointer store address, then we only need a pop. */
14590 if (sp_valid_at (frame.hfp_save_offset)
14591 && m->fs.sp_offset == frame.hfp_save_offset)
14592 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14593 /* Leave results in shorter dependency chains on CPUs that are
14594 able to grok it fast. */
14595 else if (TARGET_USE_LEAVE
14596 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14597 || !cfun->machine->use_fast_prologue_epilogue)
14598 ix86_emit_leave (NULL);
14599 else
14601 pro_epilogue_adjust_stack (stack_pointer_rtx,
14602 hard_frame_pointer_rtx,
14603 const0_rtx, style, !using_drap);
14604 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14608 if (using_drap)
14610 int param_ptr_offset = UNITS_PER_WORD;
14611 rtx_insn *insn;
14613 gcc_assert (stack_realign_drap);
14615 if (ix86_static_chain_on_stack)
14616 param_ptr_offset += UNITS_PER_WORD;
14617 if (!call_used_regs[REGNO (crtl->drap_reg)])
14618 param_ptr_offset += UNITS_PER_WORD;
14620 insn = emit_insn (gen_rtx_SET
14621 (stack_pointer_rtx,
14622 gen_rtx_PLUS (Pmode,
14623 crtl->drap_reg,
14624 GEN_INT (-param_ptr_offset))));
14625 m->fs.cfa_reg = stack_pointer_rtx;
14626 m->fs.cfa_offset = param_ptr_offset;
14627 m->fs.sp_offset = param_ptr_offset;
14628 m->fs.realigned = false;
14630 add_reg_note (insn, REG_CFA_DEF_CFA,
14631 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14632 GEN_INT (param_ptr_offset)));
14633 RTX_FRAME_RELATED_P (insn) = 1;
14635 if (!call_used_regs[REGNO (crtl->drap_reg)])
14636 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14639 /* At this point the stack pointer must be valid, and we must have
14640 restored all of the registers. We may not have deallocated the
14641 entire stack frame. We've delayed this until now because it may
14642 be possible to merge the local stack deallocation with the
14643 deallocation forced by ix86_static_chain_on_stack. */
14644 gcc_assert (m->fs.sp_valid);
14645 gcc_assert (!m->fs.sp_realigned);
14646 gcc_assert (!m->fs.fp_valid);
14647 gcc_assert (!m->fs.realigned);
14648 if (m->fs.sp_offset != UNITS_PER_WORD)
14650 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14651 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14652 style, true);
14654 else
14655 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14657 /* Sibcall epilogues don't want a return instruction. */
14658 if (style == 0)
14660 m->fs = frame_state_save;
14661 return;
14664 if (cfun->machine->func_type != TYPE_NORMAL)
14665 emit_jump_insn (gen_interrupt_return ());
14666 else if (crtl->args.pops_args && crtl->args.size)
14668 rtx popc = GEN_INT (crtl->args.pops_args);
14670 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
14671 address, do an explicit add, and jump indirectly to the caller. */
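/* Roughly, the sequence emitted for that case is (32-bit only, since no
   64-bit ABI pops arguments in the callee):

	popl	%ecx		# return address
	addl	$N, %esp	# N = crtl->args.pops_args
	jmp	*%ecx
   */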
14673 if (crtl->args.pops_args >= 65536)
14675 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14676 rtx_insn *insn;
14678 /* There is no "pascal" calling convention in any 64bit ABI. */
14679 gcc_assert (!TARGET_64BIT);
14681 insn = emit_insn (gen_pop (ecx));
14682 m->fs.cfa_offset -= UNITS_PER_WORD;
14683 m->fs.sp_offset -= UNITS_PER_WORD;
14685 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14686 x = gen_rtx_SET (stack_pointer_rtx, x);
14687 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14688 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14689 RTX_FRAME_RELATED_P (insn) = 1;
14691 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14692 popc, -1, true);
14693 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14695 else
14696 emit_jump_insn (gen_simple_return_pop_internal (popc));
14698 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14700 /* In the case of a return from EH, a simple return cannot be used
14701 because the return address would be compared against the shadow
14702 stack return address. Use an indirect jump instead. */
14703 if (style == 2 && flag_cf_protection)
14705 /* Register used in indirect jump must be in word_mode. But
14706 Pmode may not be the same as word_mode for x32. */
14707 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14708 rtx_insn *insn;
14710 insn = emit_insn (gen_pop (ecx));
14711 m->fs.cfa_offset -= UNITS_PER_WORD;
14712 m->fs.sp_offset -= UNITS_PER_WORD;
14714 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14715 x = gen_rtx_SET (stack_pointer_rtx, x);
14716 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14717 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14718 RTX_FRAME_RELATED_P (insn) = 1;
14720 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14722 else
14723 emit_jump_insn (gen_simple_return_internal ());
14726 /* Restore the state back to the state from the prologue,
14727 so that it's correct for the next epilogue. */
14728 m->fs = frame_state_save;
14731 /* Reset from the function's potential modifications. */
14733 static void
14734 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14736 if (pic_offset_table_rtx
14737 && !ix86_use_pseudo_pic_reg ())
14738 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14740 if (TARGET_MACHO)
14742 rtx_insn *insn = get_last_insn ();
14743 rtx_insn *deleted_debug_label = NULL;
14745 /* Mach-O doesn't support labels at the end of objects, so if
14746 it looks like we might want one, take special action.
14747 First, collect any sequence of deleted debug labels. */
14748 while (insn
14749 && NOTE_P (insn)
14750 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14752 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
14753 notes only, instead set their CODE_LABEL_NUMBER to -1,
14754 otherwise there would be code generation differences
14755 between -g and -g0. */
14756 if (NOTE_P (insn) && NOTE_KIND (insn)
14757 == NOTE_INSN_DELETED_DEBUG_LABEL)
14758 deleted_debug_label = insn;
14759 insn = PREV_INSN (insn);
14762 /* If we have:
14763 label:
14764 barrier
14765 then this needs to be detected, so skip past the barrier. */
14767 if (insn && BARRIER_P (insn))
14768 insn = PREV_INSN (insn);
14770 /* Up to now we've only seen notes or barriers. */
14771 if (insn)
14773 if (LABEL_P (insn)
14774 || (NOTE_P (insn)
14775 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14776 /* Trailing label. */
14777 fputs ("\tnop\n", file);
14778 else if (cfun && ! cfun->is_thunk)
14780 /* See if we have a completely empty function body, skipping
14781 the special case of the picbase thunk emitted as asm. */
14782 while (insn && ! INSN_P (insn))
14783 insn = PREV_INSN (insn);
14784 /* If we don't find any insns, we've got an empty function body;
14785 i.e. completely empty - without a return or branch. This is
14786 taken as the case where a function body has been removed
14787 because it contains an inline __builtin_unreachable(). GCC
14788 declares that reaching __builtin_unreachable() means UB so
14789 we're not obliged to do anything special; however, we want
14790 non-zero-sized function bodies. To meet this, and help the
14791 user out, let's trap the case. */
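/* For instance, a body consisting only of
     __builtin_unreachable ();
   is removed entirely; the "ud2" emitted below both gives the function
   a non-zero size and traps if control ever reaches it.  */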
14792 if (insn == NULL)
14793 fputs ("\tud2\n", file);
14796 else if (deleted_debug_label)
14797 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14798 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14799 CODE_LABEL_NUMBER (insn) = -1;
14803 /* Return a scratch register to use in the split stack prologue. The
14804 split stack prologue is used for -fsplit-stack. It consists of the first
14805 instructions in the function, emitted even before the regular prologue.
14806 The scratch register can be any caller-saved register which is not
14807 used for parameters or for the static chain. */
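/* Roughly, the choice made below is: 64-bit targets always use %r11;
   32-bit fastcall functions use %eax (and cannot support a static
   chain), thiscall functions use %edx or %eax, and other functions
   with fewer than three register parameters use %ecx or %edx, with
   "sorry" issued when every candidate is taken.  */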
14809 static unsigned int
14810 split_stack_prologue_scratch_regno (void)
14812 if (TARGET_64BIT)
14813 return R11_REG;
14814 else
14816 bool is_fastcall, is_thiscall;
14817 int regparm;
14819 is_fastcall = (lookup_attribute ("fastcall",
14820 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14821 != NULL);
14822 is_thiscall = (lookup_attribute ("thiscall",
14823 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14824 != NULL);
14825 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14827 if (is_fastcall)
14829 if (DECL_STATIC_CHAIN (cfun->decl))
14831 sorry ("-fsplit-stack does not support fastcall with "
14832 "nested function");
14833 return INVALID_REGNUM;
14835 return AX_REG;
14837 else if (is_thiscall)
14839 if (!DECL_STATIC_CHAIN (cfun->decl))
14840 return DX_REG;
14841 return AX_REG;
14843 else if (regparm < 3)
14845 if (!DECL_STATIC_CHAIN (cfun->decl))
14846 return CX_REG;
14847 else
14849 if (regparm >= 2)
14851 sorry ("-fsplit-stack does not support 2 register "
14852 "parameters for a nested function");
14853 return INVALID_REGNUM;
14855 return DX_REG;
14858 else
14860 /* FIXME: We could make this work by pushing a register
14861 around the addition and comparison. */
14862 sorry ("-fsplit-stack does not support 3 register parameters");
14863 return INVALID_REGNUM;
14868 /* A SYMBOL_REF for the function which allocates new stack space for
14869 -fsplit-stack. */
14871 static GTY(()) rtx split_stack_fn;
14873 /* A SYMBOL_REF for the more stack function when using the large
14874 model. */
14876 static GTY(()) rtx split_stack_fn_large;
14878 /* Return location of the stack guard value in the TLS block. */
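/* On glibc targets this resolves to a segment-relative load, e.g. the
   word at %fs:TARGET_THREAD_SPLIT_STACK_OFFSET in 64-bit mode or the
   corresponding %gs: slot in 32-bit mode, since DEFAULT_TLS_SEG_REG
   selects the TLS segment register.  */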
14880 static rtx
14881 ix86_split_stack_guard (void)
14883 int offset;
14884 addr_space_t as = DEFAULT_TLS_SEG_REG;
14885 rtx r;
14887 gcc_assert (flag_split_stack);
14889 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14890 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14891 #else
14892 gcc_unreachable ();
14893 #endif
14895 r = GEN_INT (offset);
14896 r = gen_const_mem (Pmode, r);
14897 set_mem_addr_space (r, as);
14899 return r;
14902 /* Handle -fsplit-stack. These are the first instructions in the
14903 function, even before the regular prologue. */
14905 void
14906 ix86_expand_split_stack_prologue (void)
14908 HOST_WIDE_INT allocate;
14909 unsigned HOST_WIDE_INT args_size;
14910 rtx_code_label *label;
14911 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14912 rtx scratch_reg = NULL_RTX;
14913 rtx_code_label *varargs_label = NULL;
14914 rtx fn;
14916 gcc_assert (flag_split_stack && reload_completed);
14918 ix86_finalize_stack_frame_flags ();
14919 struct ix86_frame &frame = cfun->machine->frame;
14920 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14922 /* This is the label we will branch to if we have enough stack
14923 space. We expect the basic block reordering pass to reverse this
14924 branch if optimizing, so that we branch in the unlikely case. */
14925 label = gen_label_rtx ();
14927 /* We need to compare the stack pointer minus the frame size with
14928 the stack boundary in the TCB. The stack boundary always gives
14929 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14930 can compare directly. Otherwise we need to do an addition. */
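/* In pseudo-code the check emitted below is roughly

     if ((sp or sp - allocate) >= guard_in_TCB)   (unsigned compare)
       goto label;
     call __morestack;
   label:

   with the branch annotated as very likely taken.  */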
14932 limit = ix86_split_stack_guard ();
14934 if (allocate < SPLIT_STACK_AVAILABLE)
14935 current = stack_pointer_rtx;
14936 else
14938 unsigned int scratch_regno;
14939 rtx offset;
14941 /* We need a scratch register to hold the stack pointer minus
14942 the required frame size. Since this is the very start of the
14943 function, the scratch register can be any caller-saved
14944 register which is not used for parameters. */
14945 offset = GEN_INT (- allocate);
14946 scratch_regno = split_stack_prologue_scratch_regno ();
14947 if (scratch_regno == INVALID_REGNUM)
14948 return;
14949 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14950 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14952 /* We don't use ix86_gen_add3 in this case because it will
14953 want to split to lea, but when not optimizing the insn
14954 will not be split after this point. */
14955 emit_insn (gen_rtx_SET (scratch_reg,
14956 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14957 offset)));
14959 else
14961 emit_move_insn (scratch_reg, offset);
14962 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14963 stack_pointer_rtx));
14965 current = scratch_reg;
14968 ix86_expand_branch (GEU, current, limit, label);
14969 rtx_insn *jump_insn = get_last_insn ();
14970 JUMP_LABEL (jump_insn) = label;
14972 /* Mark the jump as very likely to be taken. */
14973 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
14975 if (split_stack_fn == NULL_RTX)
14977 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14978 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14980 fn = split_stack_fn;
14982 /* Get more stack space. We pass in the desired stack space and the
14983 size of the arguments to copy to the new stack. In 32-bit mode
14984 we push the parameters; __morestack will return on a new stack
14985 anyhow. In 64-bit mode we pass the parameters in r10 and
14986 r11. */
14987 allocate_rtx = GEN_INT (allocate);
14988 args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
14989 call_fusage = NULL_RTX;
14990 rtx pop = NULL_RTX;
14991 if (TARGET_64BIT)
14993 rtx reg10, reg11;
14995 reg10 = gen_rtx_REG (Pmode, R10_REG);
14996 reg11 = gen_rtx_REG (Pmode, R11_REG);
14998 /* If this function uses a static chain, it will be in %r10.
14999 Preserve it across the call to __morestack. */
15000 if (DECL_STATIC_CHAIN (cfun->decl))
15002 rtx rax;
15004 rax = gen_rtx_REG (word_mode, AX_REG);
15005 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
15006 use_reg (&call_fusage, rax);
15009 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
15010 && !TARGET_PECOFF)
15012 HOST_WIDE_INT argval;
15014 gcc_assert (Pmode == DImode);
15015 /* When using the large model we need to load the address
15016 into a register, and we've run out of registers. So we
15017 switch to a different calling convention, and we call a
15018 different function: __morestack_large_model. We pass the
15019 argument size in the upper 32 bits of r10 and pass the
15020 frame size in the lower 32 bits. */
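/* For example, with args_size == 0x10 and allocate == 0x2000 the
   value computed below and loaded into %r10 is 0x0000001000002000.  */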
15021 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15022 gcc_assert ((args_size & 0xffffffff) == args_size);
15024 if (split_stack_fn_large == NULL_RTX)
15026 split_stack_fn_large =
15027 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15028 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15030 if (ix86_cmodel == CM_LARGE_PIC)
15032 rtx_code_label *label;
15033 rtx x;
15035 label = gen_label_rtx ();
15036 emit_label (label);
15037 LABEL_PRESERVE_P (label) = 1;
15038 emit_insn (gen_set_rip_rex64 (reg10, label));
15039 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15040 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15041 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15042 UNSPEC_GOT);
15043 x = gen_rtx_CONST (Pmode, x);
15044 emit_move_insn (reg11, x);
15045 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15046 x = gen_const_mem (Pmode, x);
15047 emit_move_insn (reg11, x);
15049 else
15050 emit_move_insn (reg11, split_stack_fn_large);
15052 fn = reg11;
15054 argval = ((args_size << 16) << 16) + allocate;
15055 emit_move_insn (reg10, GEN_INT (argval));
15057 else
15059 emit_move_insn (reg10, allocate_rtx);
15060 emit_move_insn (reg11, GEN_INT (args_size));
15061 use_reg (&call_fusage, reg11);
15064 use_reg (&call_fusage, reg10);
15066 else
15068 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15069 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15070 insn = emit_insn (gen_push (allocate_rtx));
15071 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15072 pop = GEN_INT (2 * UNITS_PER_WORD);
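/* The 32-bit sequence is therefore roughly

     push $args_size
     push $allocate
     call __morestack

   with the two pushed words removed again by the call itself, which is
   what the POP value set above tells ix86_expand_call.  */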
15074 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15075 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15076 pop, false);
15077 add_function_usage_to (call_insn, call_fusage);
15078 if (!TARGET_64BIT)
15079 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15080 /* Indicate that this function can't jump to non-local gotos. */
15081 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
15083 /* In order to make call/return prediction work right, we now need
15084 to execute a return instruction. See
15085 libgcc/config/i386/morestack.S for the details on how this works.
15087 For flow purposes gcc must not see this as a return
15088 instruction--we need control flow to continue at the subsequent
15089 label. Therefore, we use an unspec. */
15090 gcc_assert (crtl->args.pops_args < 65536);
15091 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15093 /* If we are in 64-bit mode and this function uses a static chain,
15094 we saved %r10 in %rax before calling __morestack. */
15095 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15096 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15097 gen_rtx_REG (word_mode, AX_REG));
15099 /* If this function calls va_start, we need to store a pointer to
15100 the arguments on the old stack, because they may not all have been
15101 copied to the new stack. At this point the old stack can be
15102 found at the frame pointer value used by __morestack, because
15103 __morestack has set that up before calling back to us. Here we
15104 store that pointer in a scratch register, and in
15105 ix86_expand_prologue we store the scratch register in a stack
15106 slot. */
15107 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15109 unsigned int scratch_regno;
15110 rtx frame_reg;
15111 int words;
15113 scratch_regno = split_stack_prologue_scratch_regno ();
15114 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15115 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15117 /* 64-bit:
15118 fp -> old fp value
15119 return address within this function
15120 return address of caller of this function
15121 stack arguments
15122 So we add three words to get to the stack arguments.
15124 32-bit:
15125 fp -> old fp value
15126 return address within this function
15127 first argument to __morestack
15128 second argument to __morestack
15129 return address of caller of this function
15130 stack arguments
15131 So we add five words to get to the stack arguments.
15132 */
15133 words = TARGET_64BIT ? 3 : 5;
15134 emit_insn (gen_rtx_SET (scratch_reg,
15135 gen_rtx_PLUS (Pmode, frame_reg,
15136 GEN_INT (words * UNITS_PER_WORD))));
15138 varargs_label = gen_label_rtx ();
15139 emit_jump_insn (gen_jump (varargs_label));
15140 JUMP_LABEL (get_last_insn ()) = varargs_label;
15142 emit_barrier ();
15145 emit_label (label);
15146 LABEL_NUSES (label) = 1;
15148 /* If this function calls va_start, we now have to set the scratch
15149 register for the case where we do not call __morestack. In this
15150 case we need to set it based on the stack pointer. */
15151 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15153 emit_insn (gen_rtx_SET (scratch_reg,
15154 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15155 GEN_INT (UNITS_PER_WORD))));
15157 emit_label (varargs_label);
15158 LABEL_NUSES (varargs_label) = 1;
15162 /* We may have to tell the dataflow pass that the split stack prologue
15163 is initializing a scratch register. */
15165 static void
15166 ix86_live_on_entry (bitmap regs)
15168 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15170 gcc_assert (flag_split_stack);
15171 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15175 /* Extract the parts of an RTL expression that is a valid memory address
15176 for an instruction. Return 0 if the structure of the address is
15177 grossly off. Return -1 if the address contains ASHIFT, so it is not
15178 strictly valid, but still used for computing length of lea instruction. */
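/* For example, (plus:SI (reg:SI bx) (mult:SI (reg:SI cx) (const_int 4)))
   decomposes into base = %ebx, index = %ecx, scale = 4 with no
   displacement, while a lone SYMBOL_REF becomes a pure displacement.  */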
15180 int
15181 ix86_decompose_address (rtx addr, struct ix86_address *out)
15183 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15184 rtx base_reg, index_reg;
15185 HOST_WIDE_INT scale = 1;
15186 rtx scale_rtx = NULL_RTX;
15187 rtx tmp;
15188 int retval = 1;
15189 addr_space_t seg = ADDR_SPACE_GENERIC;
15191 /* Allow zero-extended SImode addresses,
15192 they will be emitted with addr32 prefix. */
15193 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15195 if (GET_CODE (addr) == ZERO_EXTEND
15196 && GET_MODE (XEXP (addr, 0)) == SImode)
15198 addr = XEXP (addr, 0);
15199 if (CONST_INT_P (addr))
15200 return 0;
15202 else if (GET_CODE (addr) == AND
15203 && const_32bit_mask (XEXP (addr, 1), DImode))
15205 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15206 if (addr == NULL_RTX)
15207 return 0;
15209 if (CONST_INT_P (addr))
15210 return 0;
15214 /* Allow SImode subregs of DImode addresses,
15215 they will be emitted with addr32 prefix. */
15216 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15218 if (SUBREG_P (addr)
15219 && GET_MODE (SUBREG_REG (addr)) == DImode)
15221 addr = SUBREG_REG (addr);
15222 if (CONST_INT_P (addr))
15223 return 0;
15227 if (REG_P (addr))
15228 base = addr;
15229 else if (SUBREG_P (addr))
15231 if (REG_P (SUBREG_REG (addr)))
15232 base = addr;
15233 else
15234 return 0;
15236 else if (GET_CODE (addr) == PLUS)
15238 rtx addends[4], op;
15239 int n = 0, i;
15241 op = addr;
15244 if (n >= 4)
15245 return 0;
15246 addends[n++] = XEXP (op, 1);
15247 op = XEXP (op, 0);
15249 while (GET_CODE (op) == PLUS);
15250 if (n >= 4)
15251 return 0;
15252 addends[n] = op;
15254 for (i = n; i >= 0; --i)
15256 op = addends[i];
15257 switch (GET_CODE (op))
15259 case MULT:
15260 if (index)
15261 return 0;
15262 index = XEXP (op, 0);
15263 scale_rtx = XEXP (op, 1);
15264 break;
15266 case ASHIFT:
15267 if (index)
15268 return 0;
15269 index = XEXP (op, 0);
15270 tmp = XEXP (op, 1);
15271 if (!CONST_INT_P (tmp))
15272 return 0;
15273 scale = INTVAL (tmp);
15274 if ((unsigned HOST_WIDE_INT) scale > 3)
15275 return 0;
15276 scale = 1 << scale;
15277 break;
15279 case ZERO_EXTEND:
15280 op = XEXP (op, 0);
15281 if (GET_CODE (op) != UNSPEC)
15282 return 0;
15283 /* FALLTHRU */
15285 case UNSPEC:
15286 if (XINT (op, 1) == UNSPEC_TP
15287 && TARGET_TLS_DIRECT_SEG_REFS
15288 && seg == ADDR_SPACE_GENERIC)
15289 seg = DEFAULT_TLS_SEG_REG;
15290 else
15291 return 0;
15292 break;
15294 case SUBREG:
15295 if (!REG_P (SUBREG_REG (op)))
15296 return 0;
15297 /* FALLTHRU */
15299 case REG:
15300 if (!base)
15301 base = op;
15302 else if (!index)
15303 index = op;
15304 else
15305 return 0;
15306 break;
15308 case CONST:
15309 case CONST_INT:
15310 case SYMBOL_REF:
15311 case LABEL_REF:
15312 if (disp)
15313 return 0;
15314 disp = op;
15315 break;
15317 default:
15318 return 0;
15322 else if (GET_CODE (addr) == MULT)
15324 index = XEXP (addr, 0); /* index*scale */
15325 scale_rtx = XEXP (addr, 1);
15327 else if (GET_CODE (addr) == ASHIFT)
15329 /* We're called for lea too, which implements ashift on occasion. */
15330 index = XEXP (addr, 0);
15331 tmp = XEXP (addr, 1);
15332 if (!CONST_INT_P (tmp))
15333 return 0;
15334 scale = INTVAL (tmp);
15335 if ((unsigned HOST_WIDE_INT) scale > 3)
15336 return 0;
15337 scale = 1 << scale;
15338 retval = -1;
15340 else
15341 disp = addr; /* displacement */
15343 if (index)
15345 if (REG_P (index))
15347 else if (SUBREG_P (index)
15348 && REG_P (SUBREG_REG (index)))
15350 else
15351 return 0;
15354 /* Extract the integral value of scale. */
15355 if (scale_rtx)
15357 if (!CONST_INT_P (scale_rtx))
15358 return 0;
15359 scale = INTVAL (scale_rtx);
15362 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15363 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15365 /* Avoid useless 0 displacement. */
15366 if (disp == const0_rtx && (base || index))
15367 disp = NULL_RTX;
15369 /* Allow arg pointer and stack pointer as index if there is no scaling. */
15370 if (base_reg && index_reg && scale == 1
15371 && (REGNO (index_reg) == ARG_POINTER_REGNUM
15372 || REGNO (index_reg) == FRAME_POINTER_REGNUM
15373 || REGNO (index_reg) == SP_REG))
15375 std::swap (base, index);
15376 std::swap (base_reg, index_reg);
15379 /* Special case: %ebp cannot be encoded as a base without a displacement.
15380 Similarly %r13. */
15381 if (!disp && base_reg
15382 && (REGNO (base_reg) == ARG_POINTER_REGNUM
15383 || REGNO (base_reg) == FRAME_POINTER_REGNUM
15384 || REGNO (base_reg) == BP_REG
15385 || REGNO (base_reg) == R13_REG))
15386 disp = const0_rtx;
15388 /* Special case: on K6, [%esi] makes the instruction vector decoded.
15389 Avoid this by transforming to [%esi+0].
15390 Reload calls address legitimization without cfun defined, so we need
15391 to test cfun for being non-NULL. */
15392 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15393 && base_reg && !index_reg && !disp
15394 && REGNO (base_reg) == SI_REG)
15395 disp = const0_rtx;
15397 /* Special case: encode reg+reg instead of reg*2. */
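/* E.g. an index-only address such as (,%eax,2) is rewritten as
   (%eax,%eax), avoiding the displacement that an index without a base
   would otherwise require.  */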
15398 if (!base && index && scale == 2)
15399 base = index, base_reg = index_reg, scale = 1;
15401 /* Special case: scaling cannot be encoded without base or displacement. */
15402 if (!base && !disp && index && scale != 1)
15403 disp = const0_rtx;
15405 out->base = base;
15406 out->index = index;
15407 out->disp = disp;
15408 out->scale = scale;
15409 out->seg = seg;
15411 return retval;
15414 /* Return cost of the memory address x.
15415 For i386, it is better to use a complex address than let gcc copy
15416 the address into a reg and make a new pseudo. But not if the address
15417 requires two regs - that would mean more pseudos with longer
15418 lifetimes. */
15419 static int
15420 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15422 struct ix86_address parts;
15423 int cost = 1;
15424 int ok = ix86_decompose_address (x, &parts);
15426 gcc_assert (ok);
15428 if (parts.base && SUBREG_P (parts.base))
15429 parts.base = SUBREG_REG (parts.base);
15430 if (parts.index && SUBREG_P (parts.index))
15431 parts.index = SUBREG_REG (parts.index);
15433 /* Attempt to minimize number of registers in the address by increasing
15434 address cost for each used register. We don't increase address cost
15435 for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
15436 is not invariant itself it most likely means that base or index is not
15437 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15438 which is not profitable for x86. */
15439 if (parts.base
15440 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15441 && (current_pass->type == GIMPLE_PASS
15442 || !pic_offset_table_rtx
15443 || !REG_P (parts.base)
15444 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15445 cost++;
15447 if (parts.index
15448 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15449 && (current_pass->type == GIMPLE_PASS
15450 || !pic_offset_table_rtx
15451 || !REG_P (parts.index)
15452 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15453 cost++;
15455 /* The AMD K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15456 since its predecode logic can't detect the length of instructions
15457 and the instruction degenerates to vector decoding. Increase the cost
15458 of such addresses here. The penalty is at least 2 cycles. It may be
15459 worthwhile to split such addresses or even refuse them at all.
15461 The following addressing modes are affected:
15462 [base+scale*index]
15463 [scale*index+disp]
15464 [base+index]
15466 The first and last cases may be avoidable by explicitly coding a zero
15467 displacement in the memory address, but I don't have an AMD K6 machine
15468 handy to check this theory. */
15470 if (TARGET_K6
15471 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15472 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15473 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15474 cost += 10;
15476 return cost;
15479 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15480 this is used to form addresses to local data when -fPIC is in
15481 use. */
15483 static bool
15484 darwin_local_data_pic (rtx disp)
15486 return (GET_CODE (disp) == UNSPEC
15487 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15490 /* True if operand X should be loaded from GOT. */
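/* For example, with -fno-plt and without -fpic, a call to an undefined
   global function on x86-64 is emitted as "call *foo@GOTPCREL(%rip)"
   rather than going through a PLT entry.  */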
15492 bool
15493 ix86_force_load_from_GOT_p (rtx x)
15495 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15496 && !TARGET_PECOFF && !TARGET_MACHO
15497 && !flag_plt && !flag_pic
15498 && ix86_cmodel != CM_LARGE
15499 && GET_CODE (x) == SYMBOL_REF
15500 && SYMBOL_REF_FUNCTION_P (x)
15501 && !SYMBOL_REF_LOCAL_P (x));
15504 /* Determine if a given RTX is a valid constant. We already know this
15505 satisfies CONSTANT_P. */
15507 static bool
15508 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15510 /* Pointer bounds constants are not valid. */
15511 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15512 return false;
15514 switch (GET_CODE (x))
15516 case CONST:
15517 x = XEXP (x, 0);
15519 if (GET_CODE (x) == PLUS)
15521 if (!CONST_INT_P (XEXP (x, 1)))
15522 return false;
15523 x = XEXP (x, 0);
15526 if (TARGET_MACHO && darwin_local_data_pic (x))
15527 return true;
15529 /* Only some unspecs are valid as "constants". */
15530 if (GET_CODE (x) == UNSPEC)
15531 switch (XINT (x, 1))
15533 case UNSPEC_GOT:
15534 case UNSPEC_GOTOFF:
15535 case UNSPEC_PLTOFF:
15536 return TARGET_64BIT;
15537 case UNSPEC_TPOFF:
15538 case UNSPEC_NTPOFF:
15539 x = XVECEXP (x, 0, 0);
15540 return (GET_CODE (x) == SYMBOL_REF
15541 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15542 case UNSPEC_DTPOFF:
15543 x = XVECEXP (x, 0, 0);
15544 return (GET_CODE (x) == SYMBOL_REF
15545 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15546 default:
15547 return false;
15550 /* We must have drilled down to a symbol. */
15551 if (GET_CODE (x) == LABEL_REF)
15552 return true;
15553 if (GET_CODE (x) != SYMBOL_REF)
15554 return false;
15555 /* FALLTHRU */
15557 case SYMBOL_REF:
15558 /* TLS symbols are never valid. */
15559 if (SYMBOL_REF_TLS_MODEL (x))
15560 return false;
15562 /* DLLIMPORT symbols are never valid. */
15563 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15564 && SYMBOL_REF_DLLIMPORT_P (x))
15565 return false;
15567 #if TARGET_MACHO
15568 /* mdynamic-no-pic */
15569 if (MACHO_DYNAMIC_NO_PIC_P)
15570 return machopic_symbol_defined_p (x);
15571 #endif
15573 /* External function address should be loaded
15574 via the GOT slot to avoid PLT. */
15575 if (ix86_force_load_from_GOT_p (x))
15576 return false;
15578 break;
15580 CASE_CONST_SCALAR_INT:
15581 switch (mode)
15583 case E_TImode:
15584 if (TARGET_64BIT)
15585 return true;
15586 /* FALLTHRU */
15587 case E_OImode:
15588 case E_XImode:
15589 if (!standard_sse_constant_p (x, mode))
15590 return false;
15591 default:
15592 break;
15594 break;
15596 case CONST_VECTOR:
15597 if (!standard_sse_constant_p (x, mode))
15598 return false;
15600 default:
15601 break;
15604 /* Otherwise we handle everything else in the move patterns. */
15605 return true;
15608 /* Determine if it's legal to put X into the constant pool. This
15609 is not possible for the address of thread-local symbols, which
15610 is checked above. */
15612 static bool
15613 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15615 /* We can put any immediate constant in memory. */
15616 switch (GET_CODE (x))
15618 CASE_CONST_ANY:
15619 return false;
15621 default:
15622 break;
15625 return !ix86_legitimate_constant_p (mode, x);
15628 /* Nonzero if the symbol is marked as dllimport, or as stub-variable,
15629 otherwise zero. */
15631 static bool
15632 is_imported_p (rtx x)
15634 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15635 || GET_CODE (x) != SYMBOL_REF)
15636 return false;
15638 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15642 /* Nonzero if the constant value X is a legitimate general operand
15643 when generating PIC code. It is given that flag_pic is on and
15644 that X satisfies CONSTANT_P. */
15646 bool
15647 legitimate_pic_operand_p (rtx x)
15649 rtx inner;
15651 switch (GET_CODE (x))
15653 case CONST:
15654 inner = XEXP (x, 0);
15655 if (GET_CODE (inner) == PLUS
15656 && CONST_INT_P (XEXP (inner, 1)))
15657 inner = XEXP (inner, 0);
15659 /* Only some unspecs are valid as "constants". */
15660 if (GET_CODE (inner) == UNSPEC)
15661 switch (XINT (inner, 1))
15663 case UNSPEC_GOT:
15664 case UNSPEC_GOTOFF:
15665 case UNSPEC_PLTOFF:
15666 return TARGET_64BIT;
15667 case UNSPEC_TPOFF:
15668 x = XVECEXP (inner, 0, 0);
15669 return (GET_CODE (x) == SYMBOL_REF
15670 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15671 case UNSPEC_MACHOPIC_OFFSET:
15672 return legitimate_pic_address_disp_p (x);
15673 default:
15674 return false;
15676 /* FALLTHRU */
15678 case SYMBOL_REF:
15679 case LABEL_REF:
15680 return legitimate_pic_address_disp_p (x);
15682 default:
15683 return true;
15687 /* Determine if a given CONST RTX is a valid memory displacement
15688 in PIC mode. */
15690 bool
15691 legitimate_pic_address_disp_p (rtx disp)
15693 bool saw_plus;
15695 /* In 64bit mode we can allow direct addresses of symbols and labels
15696 when they are not dynamic symbols. */
15697 if (TARGET_64BIT)
15699 rtx op0 = disp, op1;
15701 switch (GET_CODE (disp))
15703 case LABEL_REF:
15704 return true;
15706 case CONST:
15707 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15708 break;
15709 op0 = XEXP (XEXP (disp, 0), 0);
15710 op1 = XEXP (XEXP (disp, 0), 1);
15711 if (!CONST_INT_P (op1))
15712 break;
15713 if (GET_CODE (op0) == UNSPEC
15714 && (XINT (op0, 1) == UNSPEC_DTPOFF
15715 || XINT (op0, 1) == UNSPEC_NTPOFF)
15716 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15717 return true;
15718 if (INTVAL (op1) >= 16*1024*1024
15719 || INTVAL (op1) < -16*1024*1024)
15720 break;
15721 if (GET_CODE (op0) == LABEL_REF)
15722 return true;
15723 if (GET_CODE (op0) == CONST
15724 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15725 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15726 return true;
15727 if (GET_CODE (op0) == UNSPEC
15728 && XINT (op0, 1) == UNSPEC_PCREL)
15729 return true;
15730 if (GET_CODE (op0) != SYMBOL_REF)
15731 break;
15732 /* FALLTHRU */
15734 case SYMBOL_REF:
15735 /* TLS references should always be enclosed in UNSPEC.
15736 The dllimported symbol always needs to be resolved. */
15737 if (SYMBOL_REF_TLS_MODEL (op0)
15738 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15739 return false;
15741 if (TARGET_PECOFF)
15743 if (is_imported_p (op0))
15744 return true;
15746 if (SYMBOL_REF_FAR_ADDR_P (op0)
15747 || !SYMBOL_REF_LOCAL_P (op0))
15748 break;
15750 /* Function-symbols need to be resolved only for
15751 large-model.
15752 For the small-model we don't need to resolve anything
15753 here. */
15754 if ((ix86_cmodel != CM_LARGE_PIC
15755 && SYMBOL_REF_FUNCTION_P (op0))
15756 || ix86_cmodel == CM_SMALL_PIC)
15757 return true;
15758 /* Non-external symbols don't need to be resolved for
15759 the large and medium models. */
15760 if ((ix86_cmodel == CM_LARGE_PIC
15761 || ix86_cmodel == CM_MEDIUM_PIC)
15762 && !SYMBOL_REF_EXTERNAL_P (op0))
15763 return true;
15765 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15766 && (SYMBOL_REF_LOCAL_P (op0)
15767 || (HAVE_LD_PIE_COPYRELOC
15768 && flag_pie
15769 && !SYMBOL_REF_WEAK (op0)
15770 && !SYMBOL_REF_FUNCTION_P (op0)))
15771 && ix86_cmodel != CM_LARGE_PIC)
15772 return true;
15773 break;
15775 default:
15776 break;
15779 if (GET_CODE (disp) != CONST)
15780 return false;
15781 disp = XEXP (disp, 0);
15783 if (TARGET_64BIT)
15785 /* It is not safe to allow PLUS expressions here; this limits the
15786 allowed reach into the GOT tables. We should not need these anyway. */
15787 if (GET_CODE (disp) != UNSPEC
15788 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15789 && XINT (disp, 1) != UNSPEC_GOTOFF
15790 && XINT (disp, 1) != UNSPEC_PCREL
15791 && XINT (disp, 1) != UNSPEC_PLTOFF))
15792 return false;
15794 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15795 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15796 return false;
15797 return true;
15800 saw_plus = false;
15801 if (GET_CODE (disp) == PLUS)
15803 if (!CONST_INT_P (XEXP (disp, 1)))
15804 return false;
15805 disp = XEXP (disp, 0);
15806 saw_plus = true;
15809 if (TARGET_MACHO && darwin_local_data_pic (disp))
15810 return true;
15812 if (GET_CODE (disp) != UNSPEC)
15813 return false;
15815 switch (XINT (disp, 1))
15817 case UNSPEC_GOT:
15818 if (saw_plus)
15819 return false;
15820 /* We need to check for both symbols and labels because VxWorks loads
15821 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15822 details. */
15823 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15824 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15825 case UNSPEC_GOTOFF:
15826 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15827 While the ABI also specifies a 32bit relocation, we don't produce it in the
15828 small PIC model at all. */
15829 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15830 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15831 && !TARGET_64BIT)
15832 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15833 return false;
15834 case UNSPEC_GOTTPOFF:
15835 case UNSPEC_GOTNTPOFF:
15836 case UNSPEC_INDNTPOFF:
15837 if (saw_plus)
15838 return false;
15839 disp = XVECEXP (disp, 0, 0);
15840 return (GET_CODE (disp) == SYMBOL_REF
15841 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15842 case UNSPEC_NTPOFF:
15843 disp = XVECEXP (disp, 0, 0);
15844 return (GET_CODE (disp) == SYMBOL_REF
15845 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15846 case UNSPEC_DTPOFF:
15847 disp = XVECEXP (disp, 0, 0);
15848 return (GET_CODE (disp) == SYMBOL_REF
15849 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15852 return false;
15855 /* Determine if OP is a suitable RTX for an address register.
15856 Return the naked register if a register or a register subreg is
15857 found, otherwise return NULL_RTX. */
15859 static rtx
15860 ix86_validate_address_register (rtx op)
15862 machine_mode mode = GET_MODE (op);
15864 /* Only SImode or DImode registers can form the address. */
15865 if (mode != SImode && mode != DImode)
15866 return NULL_RTX;
15868 if (REG_P (op))
15869 return op;
15870 else if (SUBREG_P (op))
15872 rtx reg = SUBREG_REG (op);
15874 if (!REG_P (reg))
15875 return NULL_RTX;
15877 mode = GET_MODE (reg);
15879 /* Don't allow SUBREGs that span more than a word. It can
15880 lead to spill failures when the register is one word out
15881 of a two word structure. */
15882 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15883 return NULL_RTX;
15885 /* Allow only SUBREGs of non-eliminable hard registers. */
15886 if (register_no_elim_operand (reg, mode))
15887 return reg;
15890 /* Op is not a register. */
15891 return NULL_RTX;
15894 /* Recognizes RTL expressions that are valid memory addresses for an
15895 instruction. The MODE argument is the machine mode for the MEM
15896 expression that wants to use this address.
15898 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15899 convert common non-canonical forms to canonical form so that they will
15900 be recognized. */
15902 static bool
15903 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15905 struct ix86_address parts;
15906 rtx base, index, disp;
15907 HOST_WIDE_INT scale;
15908 addr_space_t seg;
15910 if (ix86_decompose_address (addr, &parts) <= 0)
15911 /* Decomposition failed. */
15912 return false;
15914 base = parts.base;
15915 index = parts.index;
15916 disp = parts.disp;
15917 scale = parts.scale;
15918 seg = parts.seg;
15920 /* Validate base register. */
15921 if (base)
15923 rtx reg = ix86_validate_address_register (base);
15925 if (reg == NULL_RTX)
15926 return false;
15928 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15929 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15930 /* Base is not valid. */
15931 return false;
15934 /* Validate index register. */
15935 if (index)
15937 rtx reg = ix86_validate_address_register (index);
15939 if (reg == NULL_RTX)
15940 return false;
15942 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15943 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15944 /* Index is not valid. */
15945 return false;
15948 /* Index and base should have the same mode. */
15949 if (base && index
15950 && GET_MODE (base) != GET_MODE (index))
15951 return false;
15953 /* Address override works only on the (%reg) part of %fs:(%reg). */
15954 if (seg != ADDR_SPACE_GENERIC
15955 && ((base && GET_MODE (base) != word_mode)
15956 || (index && GET_MODE (index) != word_mode)))
15957 return false;
15959 /* Validate scale factor. */
15960 if (scale != 1)
15962 if (!index)
15963 /* Scale without index. */
15964 return false;
15966 if (scale != 2 && scale != 4 && scale != 8)
15967 /* Scale is not a valid multiplier. */
15968 return false;
15971 /* Validate displacement. */
15972 if (disp)
15974 if (GET_CODE (disp) == CONST
15975 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15976 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15977 switch (XINT (XEXP (disp, 0), 1))
15979 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15980 when used. While the ABI also specifies 32bit relocations, we
15981 don't produce them at all and use IP-relative addressing instead.
15982 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15983 should be loaded via the GOT. */
15984 case UNSPEC_GOT:
15985 if (!TARGET_64BIT
15986 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15987 goto is_legitimate_pic;
15988 /* FALLTHRU */
15989 case UNSPEC_GOTOFF:
15990 gcc_assert (flag_pic);
15991 if (!TARGET_64BIT)
15992 goto is_legitimate_pic;
15994 /* 64bit address unspec. */
15995 return false;
15997 case UNSPEC_GOTPCREL:
15998 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15999 goto is_legitimate_pic;
16000 /* FALLTHRU */
16001 case UNSPEC_PCREL:
16002 gcc_assert (flag_pic);
16003 goto is_legitimate_pic;
16005 case UNSPEC_GOTTPOFF:
16006 case UNSPEC_GOTNTPOFF:
16007 case UNSPEC_INDNTPOFF:
16008 case UNSPEC_NTPOFF:
16009 case UNSPEC_DTPOFF:
16010 break;
16012 default:
16013 /* Invalid address unspec. */
16014 return false;
16017 else if (SYMBOLIC_CONST (disp)
16018 && (flag_pic
16019 || (TARGET_MACHO
16020 #if TARGET_MACHO
16021 && MACHOPIC_INDIRECT
16022 && !machopic_operand_p (disp)
16023 #endif
16027 is_legitimate_pic:
16028 if (TARGET_64BIT && (index || base))
16030 /* foo@dtpoff(%rX) is ok. */
16031 if (GET_CODE (disp) != CONST
16032 || GET_CODE (XEXP (disp, 0)) != PLUS
16033 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16034 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16035 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16036 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16037 /* Non-constant pic memory reference. */
16038 return false;
16040 else if ((!TARGET_MACHO || flag_pic)
16041 && ! legitimate_pic_address_disp_p (disp))
16042 /* Displacement is an invalid pic construct. */
16043 return false;
16044 #if TARGET_MACHO
16045 else if (MACHO_DYNAMIC_NO_PIC_P
16046 && !ix86_legitimate_constant_p (Pmode, disp))
16047 /* displacement must be referenced via non_lazy_pointer */
16048 return false;
16049 #endif
16051 /* This code used to verify that a symbolic pic displacement
16052 includes the pic_offset_table_rtx register.
16054 While this is a good idea, unfortunately these constructs may
16055 be created by the "adds using lea" optimization for incorrect
16056 code like:
16058 int a;
16059 int foo(int i)
16060 {
16061 return *(&a+i);
16062 }
16064 This code is nonsensical, but results in addressing the
16065 GOT table with pic_offset_table_rtx as a base. We can't
16066 just refuse it easily, since it gets matched by the
16067 "addsi3" pattern, which later gets split to lea when the
16068 output register differs from the input. While this
16069 could be handled by a separate addsi pattern for this case
16070 that never results in lea, disabling this test seems to be
16071 the easier and correct fix for the crash. */
16073 else if (GET_CODE (disp) != LABEL_REF
16074 && !CONST_INT_P (disp)
16075 && (GET_CODE (disp) != CONST
16076 || !ix86_legitimate_constant_p (Pmode, disp))
16077 && (GET_CODE (disp) != SYMBOL_REF
16078 || !ix86_legitimate_constant_p (Pmode, disp)))
16079 /* Displacement is not constant. */
16080 return false;
16081 else if (TARGET_64BIT
16082 && !x86_64_immediate_operand (disp, VOIDmode))
16083 /* Displacement is out of range. */
16084 return false;
16085 /* In x32 mode, constant addresses are sign extended to 64bit, so
16086 we have to prevent addresses from 0x80000000 to 0xffffffff. */
16087 else if (TARGET_X32 && !(index || base)
16088 && CONST_INT_P (disp)
16089 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16090 return false;
16093 /* Everything looks valid. */
16094 return true;
16097 /* Determine if a given RTX is a valid constant address. */
16099 bool
16100 constant_address_p (rtx x)
16102 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16105 /* Return a unique alias set for the GOT. */
16107 static alias_set_type
16108 ix86_GOT_alias_set (void)
16110 static alias_set_type set = -1;
16111 if (set == -1)
16112 set = new_alias_set ();
16113 return set;
16116 /* Return a legitimate reference for ORIG (an address) using the
16117 register REG. If REG is 0, a new pseudo is generated.
16119 There are two types of references that must be handled:
16121 1. Global data references must load the address from the GOT, via
16122 the PIC reg. An insn is emitted to do this load, and the reg is
16123 returned.
16125 2. Static data references, constant pool addresses, and code labels
16126 compute the address as an offset from the GOT, whose base is in
16127 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16128 differentiate them from global data objects. The returned
16129 address is the PIC reg + an unspec constant.
16131 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16132 reg also appears in the address. */
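/* As a concrete ia32 example, a global "extern int x" is loaded through
   its GOT slot, x@GOT(%ebx), while a file-local static object is
   addressed as x@GOTOFF(%ebx); 64-bit code uses x@GOTPCREL(%rip) for
   the GOT load instead.  */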
16134 static rtx
16135 legitimize_pic_address (rtx orig, rtx reg)
16137 rtx addr = orig;
16138 rtx new_rtx = orig;
16140 #if TARGET_MACHO
16141 if (TARGET_MACHO && !TARGET_64BIT)
16143 if (reg == 0)
16144 reg = gen_reg_rtx (Pmode);
16145 /* Use the generic Mach-O PIC machinery. */
16146 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16148 #endif
16150 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16152 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16153 if (tmp)
16154 return tmp;
16157 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16158 new_rtx = addr;
16159 else if ((!TARGET_64BIT
16160 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16161 && !TARGET_PECOFF
16162 && gotoff_operand (addr, Pmode))
16164 /* This symbol may be referenced via a displacement
16165 from the PIC base address (@GOTOFF). */
16166 if (GET_CODE (addr) == CONST)
16167 addr = XEXP (addr, 0);
16169 if (GET_CODE (addr) == PLUS)
16171 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16172 UNSPEC_GOTOFF);
16173 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16175 else
16176 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16178 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16180 if (TARGET_64BIT)
16181 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16183 if (reg != 0)
16185 gcc_assert (REG_P (reg));
16186 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16187 new_rtx, reg, 1, OPTAB_DIRECT);
16189 else
16190 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16192 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16193 /* We can't use @GOTOFF for text labels
16194 on VxWorks, see gotoff_operand. */
16195 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16197 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16198 if (tmp)
16199 return tmp;
16201 /* For x64 PE-COFF there is no GOT table,
16202 so we use the address directly. */
16203 if (TARGET_64BIT && TARGET_PECOFF)
16205 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16206 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16208 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16210 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16211 UNSPEC_GOTPCREL);
16212 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16213 new_rtx = gen_const_mem (Pmode, new_rtx);
16214 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16216 else
16218 /* This symbol must be referenced via a load
16219 from the Global Offset Table (@GOT). */
16220 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16221 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16222 if (TARGET_64BIT)
16223 new_rtx = force_reg (Pmode, new_rtx);
16224 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16225 new_rtx = gen_const_mem (Pmode, new_rtx);
16226 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16229 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16231 else
16233 if (CONST_INT_P (addr)
16234 && !x86_64_immediate_operand (addr, VOIDmode))
16235 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16236 else if (GET_CODE (addr) == CONST)
16238 addr = XEXP (addr, 0);
16240 /* We must match stuff we generate before. Assume the only
16241 unspecs that can get here are ours. Not that we could do
16242 anything with them anyway.... */
16243 if (GET_CODE (addr) == UNSPEC
16244 || (GET_CODE (addr) == PLUS
16245 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16246 return orig;
16247 gcc_assert (GET_CODE (addr) == PLUS);
16250 if (GET_CODE (addr) == PLUS)
16252 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16254 /* Check first to see if this is a constant
16255 offset from a @GOTOFF symbol reference. */
16256 if (!TARGET_PECOFF
16257 && gotoff_operand (op0, Pmode)
16258 && CONST_INT_P (op1))
16260 if (!TARGET_64BIT)
16262 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16263 UNSPEC_GOTOFF);
16264 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16265 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16267 if (reg != 0)
16269 gcc_assert (REG_P (reg));
16270 new_rtx = expand_simple_binop (Pmode, PLUS,
16271 pic_offset_table_rtx,
16272 new_rtx, reg, 1,
16273 OPTAB_DIRECT);
16275 else
16276 new_rtx
16277 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16279 else
16281 if (INTVAL (op1) < -16*1024*1024
16282 || INTVAL (op1) >= 16*1024*1024)
16284 if (!x86_64_immediate_operand (op1, Pmode))
16285 op1 = force_reg (Pmode, op1);
16287 new_rtx
16288 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16292 else
16294 rtx base = legitimize_pic_address (op0, reg);
16295 machine_mode mode = GET_MODE (base);
16296 new_rtx
16297 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16299 if (CONST_INT_P (new_rtx))
16301 if (INTVAL (new_rtx) < -16*1024*1024
16302 || INTVAL (new_rtx) >= 16*1024*1024)
16304 if (!x86_64_immediate_operand (new_rtx, mode))
16305 new_rtx = force_reg (mode, new_rtx);
16307 new_rtx
16308 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16310 else
16311 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16313 else
16315 /* For %rip addressing, we have to use
16316 just disp32, with neither base nor index. */
16317 if (TARGET_64BIT
16318 && (GET_CODE (base) == SYMBOL_REF
16319 || GET_CODE (base) == LABEL_REF))
16320 base = force_reg (mode, base);
16321 if (GET_CODE (new_rtx) == PLUS
16322 && CONSTANT_P (XEXP (new_rtx, 1)))
16324 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16325 new_rtx = XEXP (new_rtx, 1);
16327 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16332 return new_rtx;
16335 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16337 static rtx
16338 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16340 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16342 if (GET_MODE (tp) != tp_mode)
16344 gcc_assert (GET_MODE (tp) == SImode);
16345 gcc_assert (tp_mode == DImode);
16347 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16350 if (to_reg)
16351 tp = copy_to_mode_reg (tp_mode, tp);
16353 return tp;
16356 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16358 static GTY(()) rtx ix86_tls_symbol;
16360 static rtx
16361 ix86_tls_get_addr (void)
16363 if (!ix86_tls_symbol)
16365 const char *sym
16366 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16367 ? "___tls_get_addr" : "__tls_get_addr");
16369 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16372 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16374 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16375 UNSPEC_PLTOFF);
16376 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16377 gen_rtx_CONST (Pmode, unspec));
16380 return ix86_tls_symbol;
16383 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16385 static GTY(()) rtx ix86_tls_module_base_symbol;
16387 static rtx
16388 ix86_tls_module_base (void)
16390 if (!ix86_tls_module_base_symbol)
16392 ix86_tls_module_base_symbol
16393 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16395 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16396 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16399 return ix86_tls_module_base_symbol;
16402 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16403 false if we expect this to be used for a memory address and true if
16404 we expect to load the address into a register. */
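/* Roughly: the global- and local-dynamic models below end up calling
   __tls_get_addr (or using the GNU2 TLS descriptor sequences when
   TARGET_GNU2_TLS), while the initial- and local-exec models add the
   symbol's offset, read from the GOT or encoded directly, to the
   thread pointer.  */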
16406 static rtx
16407 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16409 rtx dest, base, off;
16410 rtx pic = NULL_RTX, tp = NULL_RTX;
16411 machine_mode tp_mode = Pmode;
16412 int type;
16414 /* Fall back to global dynamic model if tool chain cannot support local
16415 dynamic. */
16416 if (TARGET_SUN_TLS && !TARGET_64BIT
16417 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16418 && model == TLS_MODEL_LOCAL_DYNAMIC)
16419 model = TLS_MODEL_GLOBAL_DYNAMIC;
16421 switch (model)
16423 case TLS_MODEL_GLOBAL_DYNAMIC:
16424 dest = gen_reg_rtx (Pmode);
16426 if (!TARGET_64BIT)
16428 if (flag_pic && !TARGET_PECOFF)
16429 pic = pic_offset_table_rtx;
16430 else
16432 pic = gen_reg_rtx (Pmode);
16433 emit_insn (gen_set_got (pic));
16437 if (TARGET_GNU2_TLS)
16439 if (TARGET_64BIT)
16440 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16441 else
16442 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16444 tp = get_thread_pointer (Pmode, true);
16445 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16447 if (GET_MODE (x) != Pmode)
16448 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16450 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16452 else
16454 rtx caddr = ix86_tls_get_addr ();
16456 if (TARGET_64BIT)
16458 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16459 rtx_insn *insns;
16461 start_sequence ();
16462 emit_call_insn
16463 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16464 insns = get_insns ();
16465 end_sequence ();
16467 if (GET_MODE (x) != Pmode)
16468 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16470 RTL_CONST_CALL_P (insns) = 1;
16471 emit_libcall_block (insns, dest, rax, x);
16473 else
16474 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16476 break;
16478 case TLS_MODEL_LOCAL_DYNAMIC:
16479 base = gen_reg_rtx (Pmode);
16481 if (!TARGET_64BIT)
16483 if (flag_pic)
16484 pic = pic_offset_table_rtx;
16485 else
16487 pic = gen_reg_rtx (Pmode);
16488 emit_insn (gen_set_got (pic));
16492 if (TARGET_GNU2_TLS)
16494 rtx tmp = ix86_tls_module_base ();
16496 if (TARGET_64BIT)
16497 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16498 else
16499 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16501 tp = get_thread_pointer (Pmode, true);
16502 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16503 gen_rtx_MINUS (Pmode, tmp, tp));
16505 else
16507 rtx caddr = ix86_tls_get_addr ();
16509 if (TARGET_64BIT)
16511 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16512 rtx_insn *insns;
16513 rtx eqv;
16515 start_sequence ();
16516 emit_call_insn
16517 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16518 insns = get_insns ();
16519 end_sequence ();
16521 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16522 share the LD_BASE result with other LD model accesses. */
16523 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16524 UNSPEC_TLS_LD_BASE);
16526 RTL_CONST_CALL_P (insns) = 1;
16527 emit_libcall_block (insns, base, rax, eqv);
16529 else
16530 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16533 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16534 off = gen_rtx_CONST (Pmode, off);
16536 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16538 if (TARGET_GNU2_TLS)
16540 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16542 if (GET_MODE (x) != Pmode)
16543 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16545 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16547 break;
16549 case TLS_MODEL_INITIAL_EXEC:
16550 if (TARGET_64BIT)
16552 if (TARGET_SUN_TLS && !TARGET_X32)
16554 /* The Sun linker took the AMD64 TLS spec literally
16555 and can only handle %rax as destination of the
16556 initial executable code sequence. */
16558 dest = gen_reg_rtx (DImode);
16559 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16560 return dest;
16563 /* Generate DImode references to avoid %fs:(%reg32)
16564 problems and the linker IE->LE relaxation bug. */
16565 tp_mode = DImode;
16566 pic = NULL;
16567 type = UNSPEC_GOTNTPOFF;
16569 else if (flag_pic)
16571 pic = pic_offset_table_rtx;
16572 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16574 else if (!TARGET_ANY_GNU_TLS)
16576 pic = gen_reg_rtx (Pmode);
16577 emit_insn (gen_set_got (pic));
16578 type = UNSPEC_GOTTPOFF;
16580 else
16582 pic = NULL;
16583 type = UNSPEC_INDNTPOFF;
16586 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16587 off = gen_rtx_CONST (tp_mode, off);
16588 if (pic)
16589 off = gen_rtx_PLUS (tp_mode, pic, off);
16590 off = gen_const_mem (tp_mode, off);
16591 set_mem_alias_set (off, ix86_GOT_alias_set ());
16593 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16595 base = get_thread_pointer (tp_mode,
16596 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16597 off = force_reg (tp_mode, off);
16598 dest = gen_rtx_PLUS (tp_mode, base, off);
16599 if (tp_mode != Pmode)
16600 dest = convert_to_mode (Pmode, dest, 1);
16602 else
16604 base = get_thread_pointer (Pmode, true);
16605 dest = gen_reg_rtx (Pmode);
16606 emit_insn (ix86_gen_sub3 (dest, base, off));
16608 break;
16610 case TLS_MODEL_LOCAL_EXEC:
16611 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16612 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16613 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16614 off = gen_rtx_CONST (Pmode, off);
16616 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16618 base = get_thread_pointer (Pmode,
16619 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16620 return gen_rtx_PLUS (Pmode, base, off);
16622 else
16624 base = get_thread_pointer (Pmode, true);
16625 dest = gen_reg_rtx (Pmode);
16626 emit_insn (ix86_gen_sub3 (dest, base, off));
16628 break;
16630 default:
16631 gcc_unreachable ();
16634 return dest;
16637 /* Return true if OP refers to a TLS address. */
16638 bool
16639 ix86_tls_address_pattern_p (rtx op)
16641 subrtx_var_iterator::array_type array;
16642 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16644 rtx op = *iter;
16645 if (MEM_P (op))
16647 rtx *x = &XEXP (op, 0);
16648 while (GET_CODE (*x) == PLUS)
16650 int i;
16651 for (i = 0; i < 2; i++)
16653 rtx u = XEXP (*x, i);
16654 if (GET_CODE (u) == ZERO_EXTEND)
16655 u = XEXP (u, 0);
16656 if (GET_CODE (u) == UNSPEC
16657 && XINT (u, 1) == UNSPEC_TP)
16658 return true;
16660 x = &XEXP (*x, 0);
16663 iter.skip_subrtxes ();
16667 return false;
16670 /* Rewrite *LOC so that it refers to a default TLS address space. */
16671 void
16672 ix86_rewrite_tls_address_1 (rtx *loc)
16674 subrtx_ptr_iterator::array_type array;
16675 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16677 rtx *loc = *iter;
16678 if (MEM_P (*loc))
16680 rtx addr = XEXP (*loc, 0);
16681 rtx *x = &addr;
16682 while (GET_CODE (*x) == PLUS)
16684 int i;
16685 for (i = 0; i < 2; i++)
16687 rtx u = XEXP (*x, i);
16688 if (GET_CODE (u) == ZERO_EXTEND)
16689 u = XEXP (u, 0);
16690 if (GET_CODE (u) == UNSPEC
16691 && XINT (u, 1) == UNSPEC_TP)
16693 addr_space_t as = DEFAULT_TLS_SEG_REG;
16695 *x = XEXP (*x, 1 - i);
16697 *loc = replace_equiv_address_nv (*loc, addr, true);
16698 set_mem_addr_space (*loc, as);
16699 return;
16702 x = &XEXP (*x, 0);
16705 iter.skip_subrtxes ();
16710 /* Rewrite an instruction pattern involving a TLS address
16711 so that it refers to a default TLS address space. */
16712 rtx
16713 ix86_rewrite_tls_address (rtx pattern)
16715 pattern = copy_insn (pattern);
16716 ix86_rewrite_tls_address_1 (&pattern);
16717 return pattern;
16720 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16721 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16722 unique refptr-DECL symbol corresponding to symbol DECL. */
16724 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16726 static inline hashval_t hash (tree_map *m) { return m->hash; }
16727 static inline bool
16728 equal (tree_map *a, tree_map *b)
16730 return a->base.from == b->base.from;
16733 static int
16734 keep_cache_entry (tree_map *&m)
16736 return ggc_marked_p (m->base.from);
16740 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16742 static tree
16743 get_dllimport_decl (tree decl, bool beimport)
16745 struct tree_map *h, in;
16746 const char *name;
16747 const char *prefix;
16748 size_t namelen, prefixlen;
16749 char *imp_name;
16750 tree to;
16751 rtx rtl;
16753 if (!dllimport_map)
16754 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16756 in.hash = htab_hash_pointer (decl);
16757 in.base.from = decl;
16758 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16759 h = *loc;
16760 if (h)
16761 return h->to;
16763 *loc = h = ggc_alloc<tree_map> ();
16764 h->hash = in.hash;
16765 h->base.from = decl;
16766 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16767 VAR_DECL, NULL, ptr_type_node);
16768 DECL_ARTIFICIAL (to) = 1;
16769 DECL_IGNORED_P (to) = 1;
16770 DECL_EXTERNAL (to) = 1;
16771 TREE_READONLY (to) = 1;
16773 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16774 name = targetm.strip_name_encoding (name);
16775 if (beimport)
16776 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16777 ? "*__imp_" : "*__imp__";
16778 else
16779 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
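/* E.g. a dllimported "foo" yields the import slot "__imp_foo" on
   targets without a user label prefix and "__imp__foo" where the
   prefix is "_"; the refptr variants are named analogously with the
   "refptr." prefix.  */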
16780 namelen = strlen (name);
16781 prefixlen = strlen (prefix);
16782 imp_name = (char *) alloca (namelen + prefixlen + 1);
16783 memcpy (imp_name, prefix, prefixlen);
16784 memcpy (imp_name + prefixlen, name, namelen + 1);
16786 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16787 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16788 SET_SYMBOL_REF_DECL (rtl, to);
16789 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16790 if (!beimport)
16792 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16793 #ifdef SUB_TARGET_RECORD_STUB
16794 SUB_TARGET_RECORD_STUB (name);
16795 #endif
16798 rtl = gen_const_mem (Pmode, rtl);
16799 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16801 SET_DECL_RTL (to, rtl);
16802 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16804 return to;
16807 /* Expand SYMBOL into its corresponding far-address symbol.
16808 WANT_REG is true if we require the result to be a register. */
16810 static rtx
16811 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16813 tree imp_decl;
16814 rtx x;
16816 gcc_assert (SYMBOL_REF_DECL (symbol));
16817 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16819 x = DECL_RTL (imp_decl);
16820 if (want_reg)
16821 x = force_reg (Pmode, x);
16822 return x;
16825 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16826 true if we require the result be a register. */
16828 static rtx
16829 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16831 tree imp_decl;
16832 rtx x;
16834 gcc_assert (SYMBOL_REF_DECL (symbol));
16835 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16837 x = DECL_RTL (imp_decl);
16838 if (want_reg)
16839 x = force_reg (Pmode, x);
16840 return x;
16843 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
16844 is true if we require the result be a register. */
16846 static rtx
16847 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16849 if (!TARGET_PECOFF)
16850 return NULL_RTX;
16852 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16854 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16855 return legitimize_dllimport_symbol (addr, inreg);
16856 if (GET_CODE (addr) == CONST
16857 && GET_CODE (XEXP (addr, 0)) == PLUS
16858 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16859 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16861 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16862 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16866 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16867 return NULL_RTX;
16868 if (GET_CODE (addr) == SYMBOL_REF
16869 && !is_imported_p (addr)
16870 && SYMBOL_REF_EXTERNAL_P (addr)
16871 && SYMBOL_REF_DECL (addr))
16872 return legitimize_pe_coff_extern_decl (addr, inreg);
16874 if (GET_CODE (addr) == CONST
16875 && GET_CODE (XEXP (addr, 0)) == PLUS
16876 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16877 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16878 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16879 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16881 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16882 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16884 return NULL_RTX;
16887 /* Try machine-dependent ways of modifying an illegitimate address
16888 to be legitimate. If we find one, return the new, valid address.
16889 This macro is used in only one place: `memory_address' in explow.c.
16891 OLDX is the address as it was before break_out_memory_refs was called.
16892 In some cases it is useful to look at this to decide what needs to be done.
16894 It is always safe for this macro to do nothing. It exists to recognize
16895 opportunities to optimize the output.
16897 For the 80386, we handle X+REG by loading X into a register R and
16898 using R+REG. R will go in a general reg and indexing will be used.
16899 However, if REG is a broken-out memory address or multiplication,
16900 nothing needs to be done because REG can certainly go in a general reg.
16902 When -fpic is used, special handling is needed for symbolic references.
16903 See comments by legitimize_pic_address in i386.c for details. */
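/* For example, an address such as
       (plus (ashift (reg A) (const_int 2)) (reg B))
   is canonicalized below into
       (plus (mult (reg A) (const_int 4)) (reg B))
   which matches the base + index*scale form accepted by
   ix86_legitimate_address_p. */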
16905 static rtx
16906 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16908 bool changed = false;
16909 unsigned log;
16911 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16912 if (log)
16913 return legitimize_tls_address (x, (enum tls_model) log, false);
16914 if (GET_CODE (x) == CONST
16915 && GET_CODE (XEXP (x, 0)) == PLUS
16916 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16917 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16919 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16920 (enum tls_model) log, false);
16921 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16924 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16926 rtx tmp = legitimize_pe_coff_symbol (x, true);
16927 if (tmp)
16928 return tmp;
16931 if (flag_pic && SYMBOLIC_CONST (x))
16932 return legitimize_pic_address (x, 0);
16934 #if TARGET_MACHO
16935 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16936 return machopic_indirect_data_reference (x, 0);
16937 #endif
16939 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16940 if (GET_CODE (x) == ASHIFT
16941 && CONST_INT_P (XEXP (x, 1))
16942 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16944 changed = true;
16945 log = INTVAL (XEXP (x, 1));
16946 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16947 GEN_INT (1 << log));
16950 if (GET_CODE (x) == PLUS)
16952 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16954 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16955 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16956 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16958 changed = true;
16959 log = INTVAL (XEXP (XEXP (x, 0), 1));
16960 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16961 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16962 GEN_INT (1 << log));
16965 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16966 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16967 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16969 changed = true;
16970 log = INTVAL (XEXP (XEXP (x, 1), 1));
16971 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16972 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16973 GEN_INT (1 << log));
16976 /* Put multiply first if it isn't already. */
16977 if (GET_CODE (XEXP (x, 1)) == MULT)
16979 std::swap (XEXP (x, 0), XEXP (x, 1));
16980 changed = true;
16983 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16984 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16985 created by virtual register instantiation, register elimination, and
16986 similar optimizations. */
16987 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16989 changed = true;
16990 x = gen_rtx_PLUS (Pmode,
16991 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16992 XEXP (XEXP (x, 1), 0)),
16993 XEXP (XEXP (x, 1), 1));
16996 /* Canonicalize
16997 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16998 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16999 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
17000 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
17001 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
17002 && CONSTANT_P (XEXP (x, 1)))
17004 rtx constant;
17005 rtx other = NULL_RTX;
17007 if (CONST_INT_P (XEXP (x, 1)))
17009 constant = XEXP (x, 1);
17010 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17012 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17014 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17015 other = XEXP (x, 1);
17017 else
17018 constant = 0;
17020 if (constant)
17022 changed = true;
17023 x = gen_rtx_PLUS (Pmode,
17024 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17025 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17026 plus_constant (Pmode, other,
17027 INTVAL (constant)));
17031 if (changed && ix86_legitimate_address_p (mode, x, false))
17032 return x;
17034 if (GET_CODE (XEXP (x, 0)) == MULT)
17036 changed = true;
17037 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17040 if (GET_CODE (XEXP (x, 1)) == MULT)
17042 changed = true;
17043 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17046 if (changed
17047 && REG_P (XEXP (x, 1))
17048 && REG_P (XEXP (x, 0)))
17049 return x;
17051 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17053 changed = true;
17054 x = legitimize_pic_address (x, 0);
17057 if (changed && ix86_legitimate_address_p (mode, x, false))
17058 return x;
17060 if (REG_P (XEXP (x, 0)))
17062 rtx temp = gen_reg_rtx (Pmode);
17063 rtx val = force_operand (XEXP (x, 1), temp);
17064 if (val != temp)
17066 val = convert_to_mode (Pmode, val, 1);
17067 emit_move_insn (temp, val);
17070 XEXP (x, 1) = temp;
17071 return x;
17074 else if (REG_P (XEXP (x, 1)))
17076 rtx temp = gen_reg_rtx (Pmode);
17077 rtx val = force_operand (XEXP (x, 0), temp);
17078 if (val != temp)
17080 val = convert_to_mode (Pmode, val, 1);
17081 emit_move_insn (temp, val);
17084 XEXP (x, 0) = temp;
17085 return x;
17089 return x;
17092 /* Print an integer constant expression in assembler syntax. Addition
17093 and subtraction are the only arithmetic that may appear in these
17094 expressions. FILE is the stdio stream to write to, X is the rtx, and
17095 CODE is the operand print code from the output string. */
17097 static void
17098 output_pic_addr_const (FILE *file, rtx x, int code)
17100 char buf[256];
17102 switch (GET_CODE (x))
17104 case PC:
17105 gcc_assert (flag_pic);
17106 putc ('.', file);
17107 break;
17109 case SYMBOL_REF:
17110 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17111 output_addr_const (file, x);
17112 else
17114 const char *name = XSTR (x, 0);
17116 /* Mark the decl as referenced so that cgraph will
17117 output the function. */
17118 if (SYMBOL_REF_DECL (x))
17119 mark_decl_referenced (SYMBOL_REF_DECL (x));
17121 #if TARGET_MACHO
17122 if (MACHOPIC_INDIRECT
17123 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17124 name = machopic_indirection_name (x, /*stub_p=*/true);
17125 #endif
17126 assemble_name (file, name);
17128 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17129 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17130 fputs ("@PLT", file);
17131 break;
17133 case LABEL_REF:
17134 x = XEXP (x, 0);
17135 /* FALLTHRU */
17136 case CODE_LABEL:
17137 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17138 assemble_name (asm_out_file, buf);
17139 break;
17141 case CONST_INT:
17142 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17143 break;
17145 case CONST:
17146 /* This used to output parentheses around the expression,
17147 but that does not work on the 386 (either ATT or BSD assembler). */
17148 output_pic_addr_const (file, XEXP (x, 0), code);
17149 break;
17151 case CONST_DOUBLE:
17152 /* We can't handle floating point constants;
17153 TARGET_PRINT_OPERAND must handle them. */
17154 output_operand_lossage ("floating constant misused");
17155 break;
17157 case PLUS:
17158 /* Some assemblers need integer constants to appear first. */
17159 if (CONST_INT_P (XEXP (x, 0)))
17161 output_pic_addr_const (file, XEXP (x, 0), code);
17162 putc ('+', file);
17163 output_pic_addr_const (file, XEXP (x, 1), code);
17165 else
17167 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17168 output_pic_addr_const (file, XEXP (x, 1), code);
17169 putc ('+', file);
17170 output_pic_addr_const (file, XEXP (x, 0), code);
17172 break;
17174 case MINUS:
17175 if (!TARGET_MACHO)
17176 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17177 output_pic_addr_const (file, XEXP (x, 0), code);
17178 putc ('-', file);
17179 output_pic_addr_const (file, XEXP (x, 1), code);
17180 if (!TARGET_MACHO)
17181 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17182 break;
17184 case UNSPEC:
17185 gcc_assert (XVECLEN (x, 0) == 1);
17186 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17187 switch (XINT (x, 1))
17189 case UNSPEC_GOT:
17190 fputs ("@GOT", file);
17191 break;
17192 case UNSPEC_GOTOFF:
17193 fputs ("@GOTOFF", file);
17194 break;
17195 case UNSPEC_PLTOFF:
17196 fputs ("@PLTOFF", file);
17197 break;
17198 case UNSPEC_PCREL:
17199 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17200 "(%rip)" : "[rip]", file);
17201 break;
17202 case UNSPEC_GOTPCREL:
17203 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17204 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17205 break;
17206 case UNSPEC_GOTTPOFF:
17207 /* FIXME: This might be @TPOFF in Sun ld too. */
17208 fputs ("@gottpoff", file);
17209 break;
17210 case UNSPEC_TPOFF:
17211 fputs ("@tpoff", file);
17212 break;
17213 case UNSPEC_NTPOFF:
17214 if (TARGET_64BIT)
17215 fputs ("@tpoff", file);
17216 else
17217 fputs ("@ntpoff", file);
17218 break;
17219 case UNSPEC_DTPOFF:
17220 fputs ("@dtpoff", file);
17221 break;
17222 case UNSPEC_GOTNTPOFF:
17223 if (TARGET_64BIT)
17224 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17225 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17226 else
17227 fputs ("@gotntpoff", file);
17228 break;
17229 case UNSPEC_INDNTPOFF:
17230 fputs ("@indntpoff", file);
17231 break;
17232 #if TARGET_MACHO
17233 case UNSPEC_MACHOPIC_OFFSET:
17234 putc ('-', file);
17235 machopic_output_function_base_name (file);
17236 break;
17237 #endif
17238 default:
17239 output_operand_lossage ("invalid UNSPEC as operand");
17240 break;
17242 break;
17244 default:
17245 output_operand_lossage ("invalid expression as operand");
17249 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17250 We need to emit DTP-relative relocations. */
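/* For example, assuming ASM_LONG expands to ".long", a 4-byte
   DTP-relative reference to symbol foo is emitted as
       .long foo@dtpoff
   and an 8-byte one as
       .long foo@dtpoff, 0
   with the upper half padded with zero. */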
17252 static void ATTRIBUTE_UNUSED
17253 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17255 fputs (ASM_LONG, file);
17256 output_addr_const (file, x);
17257 fputs ("@dtpoff", file);
17258 switch (size)
17260 case 4:
17261 break;
17262 case 8:
17263 fputs (", 0", file);
17264 break;
17265 default:
17266 gcc_unreachable ();
17270 /* Return true if X is a representation of the PIC register. This copes
17271 with calls from ix86_find_base_term, where the register might have
17272 been replaced by a cselib value. */
17274 static bool
17275 ix86_pic_register_p (rtx x)
17277 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17278 return (pic_offset_table_rtx
17279 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17280 else if (!REG_P (x))
17281 return false;
17282 else if (pic_offset_table_rtx)
17284 if (REGNO (x) == REGNO (pic_offset_table_rtx))
17285 return true;
17286 if (HARD_REGISTER_P (x)
17287 && !HARD_REGISTER_P (pic_offset_table_rtx)
17288 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17289 return true;
17290 return false;
17292 else
17293 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17296 /* Helper function for ix86_delegitimize_address.
17297 Attempt to delegitimize TLS local-exec accesses. */
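/* For example, a local-exec access whose address decomposes to the
   DEFAULT_TLS_SEG_REG segment with a displacement of
       (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF))
   is rewritten back to (symbol_ref "x"), with any base register,
   index*scale part and constant offset re-added around it. */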
17299 static rtx
17300 ix86_delegitimize_tls_address (rtx orig_x)
17302 rtx x = orig_x, unspec;
17303 struct ix86_address addr;
17305 if (!TARGET_TLS_DIRECT_SEG_REFS)
17306 return orig_x;
17307 if (MEM_P (x))
17308 x = XEXP (x, 0);
17309 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17310 return orig_x;
17311 if (ix86_decompose_address (x, &addr) == 0
17312 || addr.seg != DEFAULT_TLS_SEG_REG
17313 || addr.disp == NULL_RTX
17314 || GET_CODE (addr.disp) != CONST)
17315 return orig_x;
17316 unspec = XEXP (addr.disp, 0);
17317 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17318 unspec = XEXP (unspec, 0);
17319 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17320 return orig_x;
17321 x = XVECEXP (unspec, 0, 0);
17322 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17323 if (unspec != XEXP (addr.disp, 0))
17324 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17325 if (addr.index)
17327 rtx idx = addr.index;
17328 if (addr.scale != 1)
17329 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17330 x = gen_rtx_PLUS (Pmode, idx, x);
17332 if (addr.base)
17333 x = gen_rtx_PLUS (Pmode, addr.base, x);
17334 if (MEM_P (orig_x))
17335 x = replace_equiv_address_nv (orig_x, x);
17336 return x;
17339 /* In the name of slightly smaller debug output, and to cater to
17340 general assembler lossage, recognize PIC+GOTOFF and turn it back
17341 into a direct symbol reference.
17343 On Darwin, this is necessary to avoid a crash, because Darwin
17344 has a different PIC label for each routine but the DWARF debugging
17345 information is not associated with any particular routine, so it's
17346 necessary to remove references to the PIC label from RTL stored by
17347 the DWARF output code.
17349 This helper is used in the normal ix86_delegitimize_address
17350 entrypoint (e.g. used in the target delegitimization hook) and
17351 in ix86_find_base_term. As a compile-time memory optimization, we
17352 avoid allocating rtxes that would not change the outcome for the
17353 callers (find_base_value and find_base_term). */
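/* For example, with -m32 -fpic an access such as
       (mem (plus (reg %ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOT))))
   or its non-memory UNSPEC_GOTOFF counterpart is mapped back to
   (symbol_ref "foo"), re-wrapping any register or constant addends that
   were part of the original address. */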
17355 static inline rtx
17356 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
17358 rtx orig_x = delegitimize_mem_from_attrs (x);
17359 /* addend is NULL or some rtx if x is something+GOTOFF where
17360 something doesn't include the PIC register. */
17361 rtx addend = NULL_RTX;
17362 /* reg_addend is NULL or a multiple of some register. */
17363 rtx reg_addend = NULL_RTX;
17364 /* const_addend is NULL or a const_int. */
17365 rtx const_addend = NULL_RTX;
17366 /* This is the result, or NULL. */
17367 rtx result = NULL_RTX;
17369 x = orig_x;
17371 if (MEM_P (x))
17372 x = XEXP (x, 0);
17374 if (TARGET_64BIT)
17376 if (GET_CODE (x) == CONST
17377 && GET_CODE (XEXP (x, 0)) == PLUS
17378 && GET_MODE (XEXP (x, 0)) == Pmode
17379 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17380 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17381 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17383 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
17384 base. A CONST can't be arg_pointer_rtx based. */
17385 if (base_term_p && MEM_P (orig_x))
17386 return orig_x;
17387 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17388 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17389 if (MEM_P (orig_x))
17390 x = replace_equiv_address_nv (orig_x, x);
17391 return x;
17394 if (GET_CODE (x) == CONST
17395 && GET_CODE (XEXP (x, 0)) == UNSPEC
17396 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17397 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17398 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17400 x = XVECEXP (XEXP (x, 0), 0, 0);
17401 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17403 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17404 if (x == NULL_RTX)
17405 return orig_x;
17407 return x;
17410 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17411 return ix86_delegitimize_tls_address (orig_x);
17413 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17414 and -mcmodel=medium -fpic. */
17417 if (GET_CODE (x) != PLUS
17418 || GET_CODE (XEXP (x, 1)) != CONST)
17419 return ix86_delegitimize_tls_address (orig_x);
17421 if (ix86_pic_register_p (XEXP (x, 0)))
17422 /* %ebx + GOT/GOTOFF */
17424 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17426 /* %ebx + %reg * scale + GOT/GOTOFF */
17427 reg_addend = XEXP (x, 0);
17428 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17429 reg_addend = XEXP (reg_addend, 1);
17430 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17431 reg_addend = XEXP (reg_addend, 0);
17432 else
17434 reg_addend = NULL_RTX;
17435 addend = XEXP (x, 0);
17438 else
17439 addend = XEXP (x, 0);
17441 x = XEXP (XEXP (x, 1), 0);
17442 if (GET_CODE (x) == PLUS
17443 && CONST_INT_P (XEXP (x, 1)))
17445 const_addend = XEXP (x, 1);
17446 x = XEXP (x, 0);
17449 if (GET_CODE (x) == UNSPEC
17450 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17451 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17452 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17453 && !MEM_P (orig_x) && !addend)))
17454 result = XVECEXP (x, 0, 0);
17456 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17457 && !MEM_P (orig_x))
17458 result = XVECEXP (x, 0, 0);
17460 if (! result)
17461 return ix86_delegitimize_tls_address (orig_x);
17463 /* For (PLUS something CONST_INT) both find_base_{value,term} just
17464 recurse on the first operand. */
17465 if (const_addend && !base_term_p)
17466 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17467 if (reg_addend)
17468 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17469 if (addend)
17471 /* If the rest of original X doesn't involve the PIC register, add
17472 addend and subtract pic_offset_table_rtx. This can happen e.g.
17473 for code like:
17474 leal (%ebx, %ecx, 4), %ecx
17476 movl foo@GOTOFF(%ecx), %edx
17477 in which case we return (%ecx - %ebx) + foo
17478 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17479 and reload has completed. Don't do the latter for debug,
17480 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17481 if (pic_offset_table_rtx
17482 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17483 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17484 pic_offset_table_rtx),
17485 result);
17486 else if (base_term_p
17487 && pic_offset_table_rtx
17488 && !TARGET_MACHO
17489 && !TARGET_VXWORKS_RTP)
17491 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17492 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17493 result = gen_rtx_PLUS (Pmode, tmp, result);
17495 else
17496 return orig_x;
17498 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17500 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17501 if (result == NULL_RTX)
17502 return orig_x;
17504 return result;
17507 /* The normal instantiation of the above template. */
17509 static rtx
17510 ix86_delegitimize_address (rtx x)
17512 return ix86_delegitimize_address_1 (x, false);
17515 /* If X is a machine specific address (i.e. a symbol or label being
17516 referenced as a displacement from the GOT implemented using an
17517 UNSPEC), then return the base term. Otherwise return X. */
17520 ix86_find_base_term (rtx x)
17522 rtx term;
17524 if (TARGET_64BIT)
17526 if (GET_CODE (x) != CONST)
17527 return x;
17528 term = XEXP (x, 0);
17529 if (GET_CODE (term) == PLUS
17530 && CONST_INT_P (XEXP (term, 1)))
17531 term = XEXP (term, 0);
17532 if (GET_CODE (term) != UNSPEC
17533 || (XINT (term, 1) != UNSPEC_GOTPCREL
17534 && XINT (term, 1) != UNSPEC_PCREL))
17535 return x;
17537 return XVECEXP (term, 0, 0);
17540 return ix86_delegitimize_address_1 (x, true);
17543 /* Return true if X shouldn't be emitted into the debug info.
17544 Disallow UNSPECs other than @gotoff - we can't emit the
17545 _GLOBAL_OFFSET_TABLE_ symbol easily into the .debug_info section,
17546 so we don't delegitimize, but instead assemble it as @gotoff.
17547 Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17548 assembles that as a _GLOBAL_OFFSET_TABLE_-. expression. */
17550 static bool
17551 ix86_const_not_ok_for_debug_p (rtx x)
17553 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17554 return true;
17556 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17557 return true;
17559 return false;
17562 static void
17563 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17564 bool fp, FILE *file)
17566 const char *suffix;
17568 if (mode == CCFPmode)
17570 code = ix86_fp_compare_code_to_integer (code);
17571 mode = CCmode;
17573 if (reverse)
17574 code = reverse_condition (code);
17576 switch (code)
17578 case EQ:
17579 gcc_assert (mode != CCGZmode);
17580 switch (mode)
17582 case E_CCAmode:
17583 suffix = "a";
17584 break;
17585 case E_CCCmode:
17586 suffix = "c";
17587 break;
17588 case E_CCOmode:
17589 suffix = "o";
17590 break;
17591 case E_CCPmode:
17592 suffix = "p";
17593 break;
17594 case E_CCSmode:
17595 suffix = "s";
17596 break;
17597 default:
17598 suffix = "e";
17599 break;
17601 break;
17602 case NE:
17603 gcc_assert (mode != CCGZmode);
17604 switch (mode)
17606 case E_CCAmode:
17607 suffix = "na";
17608 break;
17609 case E_CCCmode:
17610 suffix = "nc";
17611 break;
17612 case E_CCOmode:
17613 suffix = "no";
17614 break;
17615 case E_CCPmode:
17616 suffix = "np";
17617 break;
17618 case E_CCSmode:
17619 suffix = "ns";
17620 break;
17621 default:
17622 suffix = "ne";
17623 break;
17625 break;
17626 case GT:
17627 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17628 suffix = "g";
17629 break;
17630 case GTU:
17631 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17632 Those same assemblers have the same but opposite lossage on cmov. */
17633 if (mode == CCmode)
17634 suffix = fp ? "nbe" : "a";
17635 else
17636 gcc_unreachable ();
17637 break;
17638 case LT:
17639 switch (mode)
17641 case E_CCNOmode:
17642 case E_CCGOCmode:
17643 suffix = "s";
17644 break;
17646 case E_CCmode:
17647 case E_CCGCmode:
17648 case E_CCGZmode:
17649 suffix = "l";
17650 break;
17652 default:
17653 gcc_unreachable ();
17655 break;
17656 case LTU:
17657 if (mode == CCmode || mode == CCGZmode)
17658 suffix = "b";
17659 else if (mode == CCCmode)
17660 suffix = fp ? "b" : "c";
17661 else
17662 gcc_unreachable ();
17663 break;
17664 case GE:
17665 switch (mode)
17667 case E_CCNOmode:
17668 case E_CCGOCmode:
17669 suffix = "ns";
17670 break;
17672 case E_CCmode:
17673 case E_CCGCmode:
17674 case E_CCGZmode:
17675 suffix = "ge";
17676 break;
17678 default:
17679 gcc_unreachable ();
17681 break;
17682 case GEU:
17683 if (mode == CCmode || mode == CCGZmode)
17684 suffix = "nb";
17685 else if (mode == CCCmode)
17686 suffix = fp ? "nb" : "nc";
17687 else
17688 gcc_unreachable ();
17689 break;
17690 case LE:
17691 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17692 suffix = "le";
17693 break;
17694 case LEU:
17695 if (mode == CCmode)
17696 suffix = "be";
17697 else
17698 gcc_unreachable ();
17699 break;
17700 case UNORDERED:
17701 suffix = fp ? "u" : "p";
17702 break;
17703 case ORDERED:
17704 suffix = fp ? "nu" : "np";
17705 break;
17706 default:
17707 gcc_unreachable ();
17709 fputs (suffix, file);
17712 /* Print the name of register X to FILE based on its machine mode and number.
17713 If CODE is 'w', pretend the mode is HImode.
17714 If CODE is 'b', pretend the mode is QImode.
17715 If CODE is 'k', pretend the mode is SImode.
17716 If CODE is 'q', pretend the mode is DImode.
17717 If CODE is 'x', pretend the mode is V4SFmode.
17718 If CODE is 't', pretend the mode is V8SFmode.
17719 If CODE is 'g', pretend the mode is V16SFmode.
17720 If CODE is 'h', pretend the reg is the 'high' byte register.
17721 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack register.
17722 If CODE is 'd', duplicate the operand for AVX instruction.
17723 If CODE is 'V', print naked full integer register name without %.
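/* For example, for the AX register the size overrides select
   'q' -> "rax", 'k' -> "eax", 'w' -> "ax", 'b' -> "al", 'h' -> "ah",
   each preceded by '%' in AT&T syntax. */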
17726 void
17727 print_reg (rtx x, int code, FILE *file)
17729 const char *reg;
17730 int msize;
17731 unsigned int regno;
17732 bool duplicated;
17734 if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V')
17735 putc ('%', file);
17737 if (x == pc_rtx)
17739 gcc_assert (TARGET_64BIT);
17740 fputs ("rip", file);
17741 return;
17744 if (code == 'y' && STACK_TOP_P (x))
17746 fputs ("st(0)", file);
17747 return;
17750 if (code == 'w')
17751 msize = 2;
17752 else if (code == 'b')
17753 msize = 1;
17754 else if (code == 'k')
17755 msize = 4;
17756 else if (code == 'q')
17757 msize = 8;
17758 else if (code == 'h')
17759 msize = 0;
17760 else if (code == 'x')
17761 msize = 16;
17762 else if (code == 't')
17763 msize = 32;
17764 else if (code == 'g')
17765 msize = 64;
17766 else
17767 msize = GET_MODE_SIZE (GET_MODE (x));
17769 regno = REGNO (x);
17771 if (regno == ARG_POINTER_REGNUM
17772 || regno == FRAME_POINTER_REGNUM
17773 || regno == FPSR_REG
17774 || regno == FPCR_REG)
17776 output_operand_lossage
17777 ("invalid use of register '%s'", reg_names[regno]);
17778 return;
17780 else if (regno == FLAGS_REG)
17782 output_operand_lossage ("invalid use of asm flag output");
17783 return;
17786 if (code == 'V')
17788 if (GENERAL_REGNO_P (regno))
17789 msize = GET_MODE_SIZE (word_mode);
17790 else
17791 error ("'V' modifier on non-integer register");
17794 duplicated = code == 'd' && TARGET_AVX;
17796 switch (msize)
17798 case 16:
17799 case 12:
17800 case 8:
17801 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17802 warning (0, "unsupported size for integer register");
17803 /* FALLTHRU */
17804 case 4:
17805 if (LEGACY_INT_REGNO_P (regno))
17806 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17807 /* FALLTHRU */
17808 case 2:
17809 normal:
17810 reg = hi_reg_name[regno];
17811 break;
17812 case 1:
17813 if (regno >= ARRAY_SIZE (qi_reg_name))
17814 goto normal;
17815 if (!ANY_QI_REGNO_P (regno))
17816 error ("unsupported size for integer register");
17817 reg = qi_reg_name[regno];
17818 break;
17819 case 0:
17820 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17821 goto normal;
17822 reg = qi_high_reg_name[regno];
17823 break;
17824 case 32:
17825 case 64:
17826 if (SSE_REGNO_P (regno))
17828 gcc_assert (!duplicated);
17829 putc (msize == 32 ? 'y' : 'z', file);
17830 reg = hi_reg_name[regno] + 1;
17831 break;
17833 goto normal;
17834 default:
17835 gcc_unreachable ();
17838 fputs (reg, file);
17840 /* Irritatingly, AMD extended registers use a
17841 different naming convention: "r%d[bwd]". */
17842 if (REX_INT_REGNO_P (regno))
17844 gcc_assert (TARGET_64BIT);
17845 switch (msize)
17847 case 0:
17848 error ("extended registers have no high halves");
17849 break;
17850 case 1:
17851 putc ('b', file);
17852 break;
17853 case 2:
17854 putc ('w', file);
17855 break;
17856 case 4:
17857 putc ('d', file);
17858 break;
17859 case 8:
17860 /* no suffix */
17861 break;
17862 default:
17863 error ("unsupported operand size for extended register");
17864 break;
17866 return;
17869 if (duplicated)
17871 if (ASSEMBLER_DIALECT == ASM_ATT)
17872 fprintf (file, ", %%%s", reg);
17873 else
17874 fprintf (file, ", %s", reg);
17878 /* Meaning of CODE:
17879 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17880 C -- print opcode suffix for set/cmov insn.
17881 c -- like C, but print reversed condition
17882 F,f -- likewise, but for floating-point.
17883 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17884 otherwise nothing
17885 R -- print embedded rounding and sae.
17886 r -- print only sae.
17887 z -- print the opcode suffix for the size of the current operand.
17888 Z -- likewise, with special suffixes for x87 instructions.
17889 * -- print a star (in certain assembler syntax)
17890 A -- print an absolute memory reference.
17891 E -- print address with DImode register names if TARGET_64BIT.
17892 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17893 s -- print a shift double count, followed by the assembler's argument
17894 delimiter.
17895 b -- print the QImode name of the register for the indicated operand.
17896 %b0 would print %al if operands[0] is reg 0.
17897 w -- likewise, print the HImode name of the register.
17898 k -- likewise, print the SImode name of the register.
17899 q -- likewise, print the DImode name of the register.
17900 x -- likewise, print the V4SFmode name of the register.
17901 t -- likewise, print the V8SFmode name of the register.
17902 g -- likewise, print the V16SFmode name of the register.
17903 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17904 y -- print "st(0)" instead of "st" as a register.
17905 d -- print duplicated register operand for AVX instruction.
17906 D -- print condition for SSE cmp instruction.
17907 P -- if PIC, print an @PLT suffix.
17908 p -- print raw symbol name.
17909 X -- don't print any sort of PIC '@' suffix for a symbol.
17910 & -- print some in-use local-dynamic symbol name.
17911 H -- print a memory address offset by 8; used for sse high-parts
17912 Y -- print condition for XOP pcom* instruction.
17913 V -- print naked full integer register name without %.
17914 + -- print a branch hint as 'cs' or 'ds' prefix
17915 ; -- print a semicolon (after prefixes, due to a bug in older gas).
17916 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17917 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17918 ! -- print MPX prefix for jxx/call/ret instructions if required.
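/* For example, in an insn template like "mov%z0\t{%1, %0|%0, %1}" the
   'z' modifier emits the operand-size suffix in AT&T syntax ("movl" for
   an SImode destination) and nothing in Intel syntax. */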
17921 void
17922 ix86_print_operand (FILE *file, rtx x, int code)
17924 if (code)
17926 switch (code)
17928 case 'A':
17929 switch (ASSEMBLER_DIALECT)
17931 case ASM_ATT:
17932 putc ('*', file);
17933 break;
17935 case ASM_INTEL:
17936 /* Intel syntax. For absolute addresses, registers should not
17937 be surrounded by brackets. */
17938 if (!REG_P (x))
17940 putc ('[', file);
17941 ix86_print_operand (file, x, 0);
17942 putc (']', file);
17943 return;
17945 break;
17947 default:
17948 gcc_unreachable ();
17951 ix86_print_operand (file, x, 0);
17952 return;
17954 case 'E':
17955 /* Wrap address in an UNSPEC to declare special handling. */
17956 if (TARGET_64BIT)
17957 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17959 output_address (VOIDmode, x);
17960 return;
17962 case 'L':
17963 if (ASSEMBLER_DIALECT == ASM_ATT)
17964 putc ('l', file);
17965 return;
17967 case 'W':
17968 if (ASSEMBLER_DIALECT == ASM_ATT)
17969 putc ('w', file);
17970 return;
17972 case 'B':
17973 if (ASSEMBLER_DIALECT == ASM_ATT)
17974 putc ('b', file);
17975 return;
17977 case 'Q':
17978 if (ASSEMBLER_DIALECT == ASM_ATT)
17979 putc ('l', file);
17980 return;
17982 case 'S':
17983 if (ASSEMBLER_DIALECT == ASM_ATT)
17984 putc ('s', file);
17985 return;
17987 case 'T':
17988 if (ASSEMBLER_DIALECT == ASM_ATT)
17989 putc ('t', file);
17990 return;
17992 case 'O':
17993 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17994 if (ASSEMBLER_DIALECT != ASM_ATT)
17995 return;
17997 switch (GET_MODE_SIZE (GET_MODE (x)))
17999 case 2:
18000 putc ('w', file);
18001 break;
18003 case 4:
18004 putc ('l', file);
18005 break;
18007 case 8:
18008 putc ('q', file);
18009 break;
18011 default:
18012 output_operand_lossage ("invalid operand size for operand "
18013 "code 'O'");
18014 return;
18017 putc ('.', file);
18018 #endif
18019 return;
18021 case 'z':
18022 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18024 /* Opcodes don't get size suffixes if using Intel opcodes. */
18025 if (ASSEMBLER_DIALECT == ASM_INTEL)
18026 return;
18028 switch (GET_MODE_SIZE (GET_MODE (x)))
18030 case 1:
18031 putc ('b', file);
18032 return;
18034 case 2:
18035 putc ('w', file);
18036 return;
18038 case 4:
18039 putc ('l', file);
18040 return;
18042 case 8:
18043 putc ('q', file);
18044 return;
18046 default:
18047 output_operand_lossage ("invalid operand size for operand "
18048 "code 'z'");
18049 return;
18053 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18054 warning (0, "non-integer operand used with operand code 'z'");
18055 /* FALLTHRU */
18057 case 'Z':
18058 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
18059 if (ASSEMBLER_DIALECT == ASM_INTEL)
18060 return;
18062 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18064 switch (GET_MODE_SIZE (GET_MODE (x)))
18066 case 2:
18067 #ifdef HAVE_AS_IX86_FILDS
18068 putc ('s', file);
18069 #endif
18070 return;
18072 case 4:
18073 putc ('l', file);
18074 return;
18076 case 8:
18077 #ifdef HAVE_AS_IX86_FILDQ
18078 putc ('q', file);
18079 #else
18080 fputs ("ll", file);
18081 #endif
18082 return;
18084 default:
18085 break;
18088 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18090 /* 387 opcodes don't get size suffixes
18091 if the operands are registers. */
18092 if (STACK_REG_P (x))
18093 return;
18095 switch (GET_MODE_SIZE (GET_MODE (x)))
18097 case 4:
18098 putc ('s', file);
18099 return;
18101 case 8:
18102 putc ('l', file);
18103 return;
18105 case 12:
18106 case 16:
18107 putc ('t', file);
18108 return;
18110 default:
18111 break;
18114 else
18116 output_operand_lossage ("invalid operand type used with "
18117 "operand code 'Z'");
18118 return;
18121 output_operand_lossage ("invalid operand size for operand code 'Z'");
18122 return;
18124 case 'd':
18125 case 'b':
18126 case 'w':
18127 case 'k':
18128 case 'q':
18129 case 'h':
18130 case 't':
18131 case 'g':
18132 case 'y':
18133 case 'x':
18134 case 'X':
18135 case 'P':
18136 case 'p':
18137 case 'V':
18138 break;
18140 case 's':
18141 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18143 ix86_print_operand (file, x, 0);
18144 fputs (", ", file);
18146 return;
18148 case 'Y':
18149 switch (GET_CODE (x))
18151 case NE:
18152 fputs ("neq", file);
18153 break;
18154 case EQ:
18155 fputs ("eq", file);
18156 break;
18157 case GE:
18158 case GEU:
18159 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18160 break;
18161 case GT:
18162 case GTU:
18163 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18164 break;
18165 case LE:
18166 case LEU:
18167 fputs ("le", file);
18168 break;
18169 case LT:
18170 case LTU:
18171 fputs ("lt", file);
18172 break;
18173 case UNORDERED:
18174 fputs ("unord", file);
18175 break;
18176 case ORDERED:
18177 fputs ("ord", file);
18178 break;
18179 case UNEQ:
18180 fputs ("ueq", file);
18181 break;
18182 case UNGE:
18183 fputs ("nlt", file);
18184 break;
18185 case UNGT:
18186 fputs ("nle", file);
18187 break;
18188 case UNLE:
18189 fputs ("ule", file);
18190 break;
18191 case UNLT:
18192 fputs ("ult", file);
18193 break;
18194 case LTGT:
18195 fputs ("une", file);
18196 break;
18197 default:
18198 output_operand_lossage ("operand is not a condition code, "
18199 "invalid operand code 'Y'");
18200 return;
18202 return;
18204 case 'D':
18205 /* A little bit of brain damage here. The SSE compare instructions
18206 use completely different names for the comparisons than the
18207 fp conditional moves do. */
18208 switch (GET_CODE (x))
18210 case UNEQ:
18211 if (TARGET_AVX)
18213 fputs ("eq_us", file);
18214 break;
18216 /* FALLTHRU */
18217 case EQ:
18218 fputs ("eq", file);
18219 break;
18220 case UNLT:
18221 if (TARGET_AVX)
18223 fputs ("nge", file);
18224 break;
18226 /* FALLTHRU */
18227 case LT:
18228 fputs ("lt", file);
18229 break;
18230 case UNLE:
18231 if (TARGET_AVX)
18233 fputs ("ngt", file);
18234 break;
18236 /* FALLTHRU */
18237 case LE:
18238 fputs ("le", file);
18239 break;
18240 case UNORDERED:
18241 fputs ("unord", file);
18242 break;
18243 case LTGT:
18244 if (TARGET_AVX)
18246 fputs ("neq_oq", file);
18247 break;
18249 /* FALLTHRU */
18250 case NE:
18251 fputs ("neq", file);
18252 break;
18253 case GE:
18254 if (TARGET_AVX)
18256 fputs ("ge", file);
18257 break;
18259 /* FALLTHRU */
18260 case UNGE:
18261 fputs ("nlt", file);
18262 break;
18263 case GT:
18264 if (TARGET_AVX)
18266 fputs ("gt", file);
18267 break;
18269 /* FALLTHRU */
18270 case UNGT:
18271 fputs ("nle", file);
18272 break;
18273 case ORDERED:
18274 fputs ("ord", file);
18275 break;
18276 default:
18277 output_operand_lossage ("operand is not a condition code, "
18278 "invalid operand code 'D'");
18279 return;
18281 return;
18283 case 'F':
18284 case 'f':
18285 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18286 if (ASSEMBLER_DIALECT == ASM_ATT)
18287 putc ('.', file);
18288 gcc_fallthrough ();
18289 #endif
18291 case 'C':
18292 case 'c':
18293 if (!COMPARISON_P (x))
18295 output_operand_lossage ("operand is not a condition code, "
18296 "invalid operand code '%c'", code);
18297 return;
18299 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18300 code == 'c' || code == 'f',
18301 code == 'F' || code == 'f',
18302 file);
18303 return;
18305 case 'H':
18306 if (!offsettable_memref_p (x))
18308 output_operand_lossage ("operand is not an offsettable memory "
18309 "reference, invalid operand code 'H'");
18310 return;
18312 /* It doesn't actually matter what mode we use here, as we're
18313 only going to use this for printing. */
18314 x = adjust_address_nv (x, DImode, 8);
18315 /* Output 'qword ptr' for intel assembler dialect. */
18316 if (ASSEMBLER_DIALECT == ASM_INTEL)
18317 code = 'q';
18318 break;
18320 case 'K':
18321 if (!CONST_INT_P (x))
18323 output_operand_lossage ("operand is not an integer, invalid "
18324 "operand code 'K'");
18325 return;
18328 if (INTVAL (x) & IX86_HLE_ACQUIRE)
18329 #ifdef HAVE_AS_IX86_HLE
18330 fputs ("xacquire ", file);
18331 #else
18332 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18333 #endif
18334 else if (INTVAL (x) & IX86_HLE_RELEASE)
18335 #ifdef HAVE_AS_IX86_HLE
18336 fputs ("xrelease ", file);
18337 #else
18338 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18339 #endif
18340 /* We do not want to print the value of the operand. */
18341 return;
18343 case 'N':
18344 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18345 fputs ("{z}", file);
18346 return;
18348 case 'r':
18349 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
18351 output_operand_lossage ("operand is not a specific integer, "
18352 "invalid operand code 'r'");
18353 return;
18356 if (ASSEMBLER_DIALECT == ASM_INTEL)
18357 fputs (", ", file);
18359 fputs ("{sae}", file);
18361 if (ASSEMBLER_DIALECT == ASM_ATT)
18362 fputs (", ", file);
18364 return;
18366 case 'R':
18367 if (!CONST_INT_P (x))
18369 output_operand_lossage ("operand is not an integer, invalid "
18370 "operand code 'R'");
18371 return;
18374 if (ASSEMBLER_DIALECT == ASM_INTEL)
18375 fputs (", ", file);
18377 switch (INTVAL (x))
18379 case ROUND_NEAREST_INT | ROUND_SAE:
18380 fputs ("{rn-sae}", file);
18381 break;
18382 case ROUND_NEG_INF | ROUND_SAE:
18383 fputs ("{rd-sae}", file);
18384 break;
18385 case ROUND_POS_INF | ROUND_SAE:
18386 fputs ("{ru-sae}", file);
18387 break;
18388 case ROUND_ZERO | ROUND_SAE:
18389 fputs ("{rz-sae}", file);
18390 break;
18391 default:
18392 output_operand_lossage ("operand is not a specific integer, "
18393 "invalid operand code 'R'");
18396 if (ASSEMBLER_DIALECT == ASM_ATT)
18397 fputs (", ", file);
18399 return;
18401 case '*':
18402 if (ASSEMBLER_DIALECT == ASM_ATT)
18403 putc ('*', file);
18404 return;
18406 case '&':
18408 const char *name = get_some_local_dynamic_name ();
18409 if (name == NULL)
18410 output_operand_lossage ("'%%&' used without any "
18411 "local dynamic TLS references");
18412 else
18413 assemble_name (file, name);
18414 return;
18417 case '+':
18419 rtx x;
18421 if (!optimize
18422 || optimize_function_for_size_p (cfun)
18423 || !TARGET_BRANCH_PREDICTION_HINTS)
18424 return;
18426 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18427 if (x)
18429 int pred_val = profile_probability::from_reg_br_prob_note
18430 (XINT (x, 0)).to_reg_br_prob_base ();
18432 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18433 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18435 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18436 bool cputaken
18437 = final_forward_branch_p (current_output_insn) == 0;
18439 /* Emit hints only in the case default branch prediction
18440 heuristics would fail. */
18441 if (taken != cputaken)
18443 /* We use 3e (DS) prefix for taken branches and
18444 2e (CS) prefix for not taken branches. */
18445 if (taken)
18446 fputs ("ds ; ", file);
18447 else
18448 fputs ("cs ; ", file);
18452 return;
18455 case ';':
18456 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18457 putc (';', file);
18458 #endif
18459 return;
18461 case '~':
18462 putc (TARGET_AVX2 ? 'i' : 'f', file);
18463 return;
18465 case '^':
18466 if (TARGET_64BIT && Pmode != word_mode)
18467 fputs ("addr32 ", file);
18468 return;
18470 case '!':
18471 if (ix86_bnd_prefixed_insn_p (current_output_insn))
18472 fputs ("bnd ", file);
18473 if (ix86_notrack_prefixed_insn_p (current_output_insn))
18474 fputs ("notrack ", file);
18475 return;
18477 default:
18478 output_operand_lossage ("invalid operand code '%c'", code);
18482 if (REG_P (x))
18483 print_reg (x, code, file);
18485 else if (MEM_P (x))
18487 rtx addr = XEXP (x, 0);
18489 /* No `byte ptr' prefix for call instructions ... */
18490 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18492 machine_mode mode = GET_MODE (x);
18493 const char *size;
18495 /* Check for explicit size override codes. */
18496 if (code == 'b')
18497 size = "BYTE";
18498 else if (code == 'w')
18499 size = "WORD";
18500 else if (code == 'k')
18501 size = "DWORD";
18502 else if (code == 'q')
18503 size = "QWORD";
18504 else if (code == 'x')
18505 size = "XMMWORD";
18506 else if (code == 't')
18507 size = "YMMWORD";
18508 else if (code == 'g')
18509 size = "ZMMWORD";
18510 else if (mode == BLKmode)
18511 /* ... or BLKmode operands, when not overridden. */
18512 size = NULL;
18513 else
18514 switch (GET_MODE_SIZE (mode))
18516 case 1: size = "BYTE"; break;
18517 case 2: size = "WORD"; break;
18518 case 4: size = "DWORD"; break;
18519 case 8: size = "QWORD"; break;
18520 case 12: size = "TBYTE"; break;
18521 case 16:
18522 if (mode == XFmode)
18523 size = "TBYTE";
18524 else
18525 size = "XMMWORD";
18526 break;
18527 case 32: size = "YMMWORD"; break;
18528 case 64: size = "ZMMWORD"; break;
18529 default:
18530 gcc_unreachable ();
18532 if (size)
18534 fputs (size, file);
18535 fputs (" PTR ", file);
18539 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18540 output_operand_lossage ("invalid constraints for operand");
18541 else
18542 ix86_print_operand_address_as
18543 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18546 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18548 long l;
18550 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18552 if (ASSEMBLER_DIALECT == ASM_ATT)
18553 putc ('$', file);
18554 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18555 if (code == 'q')
18556 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18557 (unsigned long long) (int) l);
18558 else
18559 fprintf (file, "0x%08x", (unsigned int) l);
18562 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18564 long l[2];
18566 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18568 if (ASSEMBLER_DIALECT == ASM_ATT)
18569 putc ('$', file);
18570 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18573 /* These float cases don't actually occur as immediate operands. */
18574 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18576 char dstr[30];
18578 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18579 fputs (dstr, file);
18582 else
18584 /* We have patterns that allow zero sets of memory, for instance.
18585 In 64-bit mode, we should probably support all 8-byte vectors,
18586 since we can in fact encode that into an immediate. */
18587 if (GET_CODE (x) == CONST_VECTOR)
18589 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18590 x = const0_rtx;
18593 if (code != 'P' && code != 'p')
18595 if (CONST_INT_P (x))
18597 if (ASSEMBLER_DIALECT == ASM_ATT)
18598 putc ('$', file);
18600 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18601 || GET_CODE (x) == LABEL_REF)
18603 if (ASSEMBLER_DIALECT == ASM_ATT)
18604 putc ('$', file);
18605 else
18606 fputs ("OFFSET FLAT:", file);
18609 if (CONST_INT_P (x))
18610 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18611 else if (flag_pic || MACHOPIC_INDIRECT)
18612 output_pic_addr_const (file, x, code);
18613 else
18614 output_addr_const (file, x);
18618 static bool
18619 ix86_print_operand_punct_valid_p (unsigned char code)
18621 return (code == '*' || code == '+' || code == '&' || code == ';'
18622 || code == '~' || code == '^' || code == '!');
18625 /* Print a memory operand whose address is ADDR. */
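/* For example, a base + index*scale + disp address is printed as
   "disp(%base,%index,scale)" in AT&T syntax and as
   "[base+disp+index*scale]" in Intel syntax, with any segment override
   such as %fs:/fs: emitted first. */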
18627 static void
18628 ix86_print_operand_address_as (FILE *file, rtx addr,
18629 addr_space_t as, bool no_rip)
18631 struct ix86_address parts;
18632 rtx base, index, disp;
18633 int scale;
18634 int ok;
18635 bool vsib = false;
18636 int code = 0;
18638 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18640 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18641 gcc_assert (parts.index == NULL_RTX);
18642 parts.index = XVECEXP (addr, 0, 1);
18643 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18644 addr = XVECEXP (addr, 0, 0);
18645 vsib = true;
18647 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18649 gcc_assert (TARGET_64BIT);
18650 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18651 code = 'q';
18653 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18655 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18656 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18657 if (parts.base != NULL_RTX)
18659 parts.index = parts.base;
18660 parts.scale = 1;
18662 parts.base = XVECEXP (addr, 0, 0);
18663 addr = XVECEXP (addr, 0, 0);
18665 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18667 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18668 gcc_assert (parts.index == NULL_RTX);
18669 parts.index = XVECEXP (addr, 0, 1);
18670 addr = XVECEXP (addr, 0, 0);
18672 else
18673 ok = ix86_decompose_address (addr, &parts);
18675 gcc_assert (ok);
18677 base = parts.base;
18678 index = parts.index;
18679 disp = parts.disp;
18680 scale = parts.scale;
18682 if (ADDR_SPACE_GENERIC_P (as))
18683 as = parts.seg;
18684 else
18685 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18687 if (!ADDR_SPACE_GENERIC_P (as))
18689 const char *string;
18691 if (as == ADDR_SPACE_SEG_FS)
18692 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18693 else if (as == ADDR_SPACE_SEG_GS)
18694 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18695 else
18696 gcc_unreachable ();
18697 fputs (string, file);
18700 /* Use the one byte shorter RIP-relative addressing for 64-bit mode. */
18701 if (TARGET_64BIT && !base && !index && !no_rip)
18703 rtx symbol = disp;
18705 if (GET_CODE (disp) == CONST
18706 && GET_CODE (XEXP (disp, 0)) == PLUS
18707 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18708 symbol = XEXP (XEXP (disp, 0), 0);
18710 if (GET_CODE (symbol) == LABEL_REF
18711 || (GET_CODE (symbol) == SYMBOL_REF
18712 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18713 base = pc_rtx;
18716 if (!base && !index)
18718 /* A displacement-only address requires special attention. */
18719 if (CONST_INT_P (disp))
18721 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18722 fputs ("ds:", file);
18723 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18725 /* Load the external function address via the GOT slot to avoid PLT. */
18726 else if (GET_CODE (disp) == CONST
18727 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18728 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18729 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18730 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18731 output_pic_addr_const (file, disp, 0);
18732 else if (flag_pic)
18733 output_pic_addr_const (file, disp, 0);
18734 else
18735 output_addr_const (file, disp);
18737 else
18739 /* Print SImode register names to force addr32 prefix. */
18740 if (SImode_address_operand (addr, VOIDmode))
18742 if (flag_checking)
18744 gcc_assert (TARGET_64BIT);
18745 switch (GET_CODE (addr))
18747 case SUBREG:
18748 gcc_assert (GET_MODE (addr) == SImode);
18749 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18750 break;
18751 case ZERO_EXTEND:
18752 case AND:
18753 gcc_assert (GET_MODE (addr) == DImode);
18754 break;
18755 default:
18756 gcc_unreachable ();
18759 gcc_assert (!code);
18760 code = 'k';
18762 else if (code == 0
18763 && TARGET_X32
18764 && disp
18765 && CONST_INT_P (disp)
18766 && INTVAL (disp) < -16*1024*1024)
18768 /* X32 runs in 64-bit mode, where displacement, DISP, in
18769 address DISP(%r64), is encoded as 32-bit immediate sign-
18770 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18771 address is %r64 + 0xffffffffbffffd00. When %r64 <
18772 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18773 which is invalid for x32. The correct address is %r64
18774 - 0x40000300 == 0xf7ffdd64. To properly encode
18775 -0x40000300(%r64) for x32, we zero-extend negative
18776 displacement by forcing addr32 prefix which truncates
18777 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18778 zero-extend all negative displacements, including -1(%rsp).
18779 However, for small negative displacements, sign-extension
18780 won't cause overflow. We only zero-extend negative
18781 displacements if they < -16*1024*1024, which is also used
18782 to check legitimate address displacements for PIC. */
18783 code = 'k';
18786 /* Since the upper 32 bits of RSP are always zero for x32,
18787 we can encode %esp as %rsp to avoid 0x67 prefix if
18788 there is no index register. */
18789 if (TARGET_X32 && Pmode == SImode
18790 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18791 code = 'q';
18793 if (ASSEMBLER_DIALECT == ASM_ATT)
18795 if (disp)
18797 if (flag_pic)
18798 output_pic_addr_const (file, disp, 0);
18799 else if (GET_CODE (disp) == LABEL_REF)
18800 output_asm_label (disp);
18801 else
18802 output_addr_const (file, disp);
18805 putc ('(', file);
18806 if (base)
18807 print_reg (base, code, file);
18808 if (index)
18810 putc (',', file);
18811 print_reg (index, vsib ? 0 : code, file);
18812 if (scale != 1 || vsib)
18813 fprintf (file, ",%d", scale);
18815 putc (')', file);
18817 else
18819 rtx offset = NULL_RTX;
18821 if (disp)
18823 /* Pull out the offset of a symbol; print any symbol itself. */
18824 if (GET_CODE (disp) == CONST
18825 && GET_CODE (XEXP (disp, 0)) == PLUS
18826 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18828 offset = XEXP (XEXP (disp, 0), 1);
18829 disp = gen_rtx_CONST (VOIDmode,
18830 XEXP (XEXP (disp, 0), 0));
18833 if (flag_pic)
18834 output_pic_addr_const (file, disp, 0);
18835 else if (GET_CODE (disp) == LABEL_REF)
18836 output_asm_label (disp);
18837 else if (CONST_INT_P (disp))
18838 offset = disp;
18839 else
18840 output_addr_const (file, disp);
18843 putc ('[', file);
18844 if (base)
18846 print_reg (base, code, file);
18847 if (offset)
18849 if (INTVAL (offset) >= 0)
18850 putc ('+', file);
18851 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18854 else if (offset)
18855 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18856 else
18857 putc ('0', file);
18859 if (index)
18861 putc ('+', file);
18862 print_reg (index, vsib ? 0 : code, file);
18863 if (scale != 1 || vsib)
18864 fprintf (file, "*%d", scale);
18866 putc (']', file);
18871 static void
18872 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18874 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18877 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18879 static bool
18880 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18882 rtx op;
18884 if (GET_CODE (x) != UNSPEC)
18885 return false;
18887 op = XVECEXP (x, 0, 0);
18888 switch (XINT (x, 1))
18890 case UNSPEC_GOTOFF:
18891 output_addr_const (file, op);
18892 fputs ("@gotoff", file);
18893 break;
18894 case UNSPEC_GOTTPOFF:
18895 output_addr_const (file, op);
18896 /* FIXME: This might be @TPOFF in Sun ld. */
18897 fputs ("@gottpoff", file);
18898 break;
18899 case UNSPEC_TPOFF:
18900 output_addr_const (file, op);
18901 fputs ("@tpoff", file);
18902 break;
18903 case UNSPEC_NTPOFF:
18904 output_addr_const (file, op);
18905 if (TARGET_64BIT)
18906 fputs ("@tpoff", file);
18907 else
18908 fputs ("@ntpoff", file);
18909 break;
18910 case UNSPEC_DTPOFF:
18911 output_addr_const (file, op);
18912 fputs ("@dtpoff", file);
18913 break;
18914 case UNSPEC_GOTNTPOFF:
18915 output_addr_const (file, op);
18916 if (TARGET_64BIT)
18917 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18918 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18919 else
18920 fputs ("@gotntpoff", file);
18921 break;
18922 case UNSPEC_INDNTPOFF:
18923 output_addr_const (file, op);
18924 fputs ("@indntpoff", file);
18925 break;
18926 #if TARGET_MACHO
18927 case UNSPEC_MACHOPIC_OFFSET:
18928 output_addr_const (file, op);
18929 putc ('-', file);
18930 machopic_output_function_base_name (file);
18931 break;
18932 #endif
18934 default:
18935 return false;
18938 return true;
18941 /* Split one or more double-mode RTL references into pairs of half-mode
18942 references. The RTL can be REG, offsettable MEM, integer constant, or
18943 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18944 split and "num" is its length. lo_half and hi_half are output arrays
18945 that parallel "operands". */
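/* For example, a DImode pseudo is split into its SImode lowpart and
   highpart subregs (byte offsets 0 and 4), and a DImode memory operand
   into two SImode memory references at offsets 0 and 4. */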
18947 void
18948 split_double_mode (machine_mode mode, rtx operands[],
18949 int num, rtx lo_half[], rtx hi_half[])
18951 machine_mode half_mode;
18952 unsigned int byte;
18954 switch (mode)
18956 case E_TImode:
18957 half_mode = DImode;
18958 break;
18959 case E_DImode:
18960 half_mode = SImode;
18961 break;
18962 default:
18963 gcc_unreachable ();
18966 byte = GET_MODE_SIZE (half_mode);
18968 while (num--)
18970 rtx op = operands[num];
18972 /* simplify_subreg refuses to split volatile memory addresses,
18973 but we still have to handle them. */
18974 if (MEM_P (op))
18976 lo_half[num] = adjust_address (op, half_mode, 0);
18977 hi_half[num] = adjust_address (op, half_mode, byte);
18979 else
18981 lo_half[num] = simplify_gen_subreg (half_mode, op,
18982 GET_MODE (op) == VOIDmode
18983 ? mode : GET_MODE (op), 0);
18984 hi_half[num] = simplify_gen_subreg (half_mode, op,
18985 GET_MODE (op) == VOIDmode
18986 ? mode : GET_MODE (op), byte);
18991 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18992 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18993 is the expression of the binary operation. The output may either be
18994 emitted here, or returned to the caller, like all output_* functions.
18996 There is no guarantee that the operands are in the same mode, as they
18997 might be within FLOAT or FLOAT_EXTEND expressions. */
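/* For example, an SSE DFmode addition assembles the template
   "%vaddsd\t{%2, %0|%0, %2}" (or the three-operand
   "%vaddsd\t{%2, %1, %0|%0, %1, %2}" form when AVX is enabled), while
   the x87 path picks an fadd/fiadd variant based on the operand modes. */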
18999 #ifndef SYSV386_COMPAT
19000 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
19001 wants to fix the assemblers because that causes incompatibility
19002 with gcc. No-one wants to fix gcc because that causes
19003 incompatibility with assemblers... You can use the option of
19004 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19005 #define SYSV386_COMPAT 1
19006 #endif
19008 const char *
19009 output_387_binary_op (rtx_insn *insn, rtx *operands)
19011 static char buf[40];
19012 const char *p;
19013 bool is_sse
19014 = (SSE_REG_P (operands[0])
19015 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
19017 if (is_sse)
19018 p = "%v";
19019 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19020 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19021 p = "fi";
19022 else
19023 p = "f";
19025 strcpy (buf, p);
19027 switch (GET_CODE (operands[3]))
19029 case PLUS:
19030 p = "add"; break;
19031 case MINUS:
19032 p = "sub"; break;
19033 case MULT:
19034 p = "mul"; break;
19035 case DIV:
19036 p = "div"; break;
19037 default:
19038 gcc_unreachable ();
19041 strcat (buf, p);
19043 if (is_sse)
19045 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
19046 strcat (buf, p);
19048 if (TARGET_AVX)
19049 p = "\t{%2, %1, %0|%0, %1, %2}";
19050 else
19051 p = "\t{%2, %0|%0, %2}";
19053 strcat (buf, p);
19054 return buf;
19057 /* Even if we do not want to check the inputs, this documents the input
19058 constraints, which helps in understanding the following code. */
19059 if (flag_checking)
19061 if (STACK_REG_P (operands[0])
19062 && ((REG_P (operands[1])
19063 && REGNO (operands[0]) == REGNO (operands[1])
19064 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19065 || (REG_P (operands[2])
19066 && REGNO (operands[0]) == REGNO (operands[2])
19067 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19068 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19069 ; /* ok */
19070 else
19071 gcc_unreachable ();
19074 switch (GET_CODE (operands[3]))
19076 case MULT:
19077 case PLUS:
19078 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19079 std::swap (operands[1], operands[2]);
19081 /* We know operands[0] == operands[1]. */
19083 if (MEM_P (operands[2]))
19085 p = "%Z2\t%2";
19086 break;
19089 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19091 if (STACK_TOP_P (operands[0]))
19092 /* How is it that we are storing to a dead operand[2]?
19093 Well, presumably operands[1] is dead too. We can't
19094 store the result to st(0) as st(0) gets popped on this
19095 instruction. Instead store to operands[2] (which I
19096 think has to be st(1)). st(1) will be popped later.
19097 gcc <= 2.8.1 didn't have this check and generated
19098 assembly code that the Unixware assembler rejected. */
19099 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19100 else
19101 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19102 break;
19105 if (STACK_TOP_P (operands[0]))
19106 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19107 else
19108 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19109 break;
19111 case MINUS:
19112 case DIV:
19113 if (MEM_P (operands[1]))
19115 p = "r%Z1\t%1";
19116 break;
19119 if (MEM_P (operands[2]))
19121 p = "%Z2\t%2";
19122 break;
19125 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19127 #if SYSV386_COMPAT
19128 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19129 derived assemblers, confusingly reverse the direction of
19130 the operation for fsub{r} and fdiv{r} when the
19131 destination register is not st(0). The Intel assembler
19132 doesn't have this brain damage. Read !SYSV386_COMPAT to
19133 figure out what the hardware really does. */
19134 if (STACK_TOP_P (operands[0]))
19135 p = "{p\t%0, %2|rp\t%2, %0}";
19136 else
19137 p = "{rp\t%2, %0|p\t%0, %2}";
19138 #else
19139 if (STACK_TOP_P (operands[0]))
19140 /* As above for fmul/fadd, we can't store to st(0). */
19141 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19142 else
19143 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19144 #endif
19145 break;
19148 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19150 #if SYSV386_COMPAT
19151 if (STACK_TOP_P (operands[0]))
19152 p = "{rp\t%0, %1|p\t%1, %0}";
19153 else
19154 p = "{p\t%1, %0|rp\t%0, %1}";
19155 #else
19156 if (STACK_TOP_P (operands[0]))
19157 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19158 else
19159 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19160 #endif
19161 break;
19164 if (STACK_TOP_P (operands[0]))
19166 if (STACK_TOP_P (operands[1]))
19167 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19168 else
19169 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19170 break;
19172 else if (STACK_TOP_P (operands[1]))
19174 #if SYSV386_COMPAT
19175 p = "{\t%1, %0|r\t%0, %1}";
19176 #else
19177 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19178 #endif
19180 else
19182 #if SYSV386_COMPAT
19183 p = "{r\t%2, %0|\t%0, %2}";
19184 #else
19185 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19186 #endif
19188 break;
19190 default:
19191 gcc_unreachable ();
19194 strcat (buf, p);
19195 return buf;
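/* For example (derived from the template strings above, AT&T output
   assumed): an SSE SFmode add with AVX enabled yields
   "%vaddss\t{%2, %1, %0|%0, %1, %2}", i.e. roughly
   "vaddss %xmm2, %xmm1, %xmm0", while the x87 path with a memory second
   operand yields "fadd%Z2\t%2", where %Z prints the memory operand's size
   suffix.  */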
19198 /* Return needed mode for entity in optimize_mode_switching pass. */
19200 static int
19201 ix86_dirflag_mode_needed (rtx_insn *insn)
19203 if (CALL_P (insn))
19205 if (cfun->machine->func_type == TYPE_NORMAL)
19206 return X86_DIRFLAG_ANY;
19207 else
19208 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19209 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19212 if (recog_memoized (insn) < 0)
19213 return X86_DIRFLAG_ANY;
19215 if (get_attr_type (insn) == TYPE_STR)
19217 /* Emit cld instruction if stringops are used in the function. */
19218 if (cfun->machine->func_type == TYPE_NORMAL)
19219 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19220 else
19221 return X86_DIRFLAG_RESET;
19224 return X86_DIRFLAG_ANY;
19227 /* Check if a 256bit or 512bit AVX register is referenced inside EXP. */
19229 static bool
19230 ix86_check_avx_upper_register (const_rtx exp)
19232 if (SUBREG_P (exp))
19233 exp = SUBREG_REG (exp);
19235 return (REG_P (exp)
19236 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
19237 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
19240 /* Return needed mode for entity in optimize_mode_switching pass. */
19242 static int
19243 ix86_avx_u128_mode_needed (rtx_insn *insn)
19245 if (CALL_P (insn))
19247 rtx link;
19249 /* Needed mode is set to AVX_U128_CLEAN if there are
19250 no 256bit or 512bit modes used in function arguments. */
19251 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19252 link;
19253 link = XEXP (link, 1))
19255 if (GET_CODE (XEXP (link, 0)) == USE)
19257 rtx arg = XEXP (XEXP (link, 0), 0);
19259 if (ix86_check_avx_upper_register (arg))
19260 return AVX_U128_DIRTY;
19264 return AVX_U128_CLEAN;
19267 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
19268 Hardware changes state only when a 256bit register is written to,
19269 but we need to prevent the compiler from moving the optimal insertion
19270 point above an eventual read from a 256bit or 512bit register. */
19271 subrtx_iterator::array_type array;
19272 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19273 if (ix86_check_avx_upper_register (*iter))
19274 return AVX_U128_DIRTY;
19276 return AVX_U128_ANY;
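/* Sketch of the effect (register names are only for exposition): an insn
   that references a 256bit register makes the needed mode AVX_U128_DIRTY,
   and mode switching will then place a vzeroupper before a call whose
   needed mode is AVX_U128_CLEAN, roughly

     vaddps  %ymm1, %ymm0, %ymm0     # upper halves become dirty
     vzeroupper                      # inserted by mode switching
     call    legacy_sse_function

   which avoids the AVX/SSE transition penalty on CPUs that have one.  */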
19279 /* Return mode that i387 must be switched into
19280 prior to the execution of insn. */
19282 static int
19283 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19285 enum attr_i387_cw mode;
19287 /* The mode UNINITIALIZED is used to store the control word after a
19288 function call or ASM pattern. The mode ANY specifies that the function
19289 has no requirements on the control word and makes no changes in the
19290 bits we are interested in. */
19292 if (CALL_P (insn)
19293 || (NONJUMP_INSN_P (insn)
19294 && (asm_noperands (PATTERN (insn)) >= 0
19295 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19296 return I387_CW_UNINITIALIZED;
19298 if (recog_memoized (insn) < 0)
19299 return I387_CW_ANY;
19301 mode = get_attr_i387_cw (insn);
19303 switch (entity)
19305 case I387_TRUNC:
19306 if (mode == I387_CW_TRUNC)
19307 return mode;
19308 break;
19310 case I387_FLOOR:
19311 if (mode == I387_CW_FLOOR)
19312 return mode;
19313 break;
19315 case I387_CEIL:
19316 if (mode == I387_CW_CEIL)
19317 return mode;
19318 break;
19320 case I387_MASK_PM:
19321 if (mode == I387_CW_MASK_PM)
19322 return mode;
19323 break;
19325 default:
19326 gcc_unreachable ();
19329 return I387_CW_ANY;
19332 /* Return mode that entity must be switched into
19333 prior to the execution of insn. */
19335 static int
19336 ix86_mode_needed (int entity, rtx_insn *insn)
19338 switch (entity)
19340 case X86_DIRFLAG:
19341 return ix86_dirflag_mode_needed (insn);
19342 case AVX_U128:
19343 return ix86_avx_u128_mode_needed (insn);
19344 case I387_TRUNC:
19345 case I387_FLOOR:
19346 case I387_CEIL:
19347 case I387_MASK_PM:
19348 return ix86_i387_mode_needed (entity, insn);
19349 default:
19350 gcc_unreachable ();
19352 return 0;
19355 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
19357 static void
19358 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
19360 if (ix86_check_avx_upper_register (dest))
19362 bool *used = (bool *) data;
19363 *used = true;
19367 /* Calculate mode of upper 128bit AVX registers after the insn. */
19369 static int
19370 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19372 rtx pat = PATTERN (insn);
19374 if (vzeroupper_operation (pat, VOIDmode)
19375 || vzeroall_operation (pat, VOIDmode))
19376 return AVX_U128_CLEAN;
19378 /* We know that state is clean after CALL insn if there are no
19379 256bit or 512bit registers used in the function return register. */
19380 if (CALL_P (insn))
19382 bool avx_upper_reg_found = false;
19383 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
19385 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19388 /* Otherwise, return current mode. Remember that if insn
19389 references AVX 256bit or 512bit registers, the mode was already
19390 changed to DIRTY from MODE_NEEDED. */
19391 return mode;
19394 /* Return the mode that an insn results in. */
19396 static int
19397 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19399 switch (entity)
19401 case X86_DIRFLAG:
19402 return mode;
19403 case AVX_U128:
19404 return ix86_avx_u128_mode_after (mode, insn);
19405 case I387_TRUNC:
19406 case I387_FLOOR:
19407 case I387_CEIL:
19408 case I387_MASK_PM:
19409 return mode;
19410 default:
19411 gcc_unreachable ();
19415 static int
19416 ix86_dirflag_mode_entry (void)
19418 /* For TARGET_CLD or in the interrupt handler we can't assume
19419 direction flag state at function entry. */
19420 if (TARGET_CLD
19421 || cfun->machine->func_type != TYPE_NORMAL)
19422 return X86_DIRFLAG_ANY;
19424 return X86_DIRFLAG_RESET;
19427 static int
19428 ix86_avx_u128_mode_entry (void)
19430 tree arg;
19432 /* Entry mode is set to AVX_U128_DIRTY if there are
19433 256bit or 512bit modes used in function arguments. */
19434 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19435 arg = TREE_CHAIN (arg))
19437 rtx incoming = DECL_INCOMING_RTL (arg);
19439 if (incoming && ix86_check_avx_upper_register (incoming))
19440 return AVX_U128_DIRTY;
19443 return AVX_U128_CLEAN;
19446 /* Return a mode that ENTITY is assumed to be
19447 switched to at function entry. */
19449 static int
19450 ix86_mode_entry (int entity)
19452 switch (entity)
19454 case X86_DIRFLAG:
19455 return ix86_dirflag_mode_entry ();
19456 case AVX_U128:
19457 return ix86_avx_u128_mode_entry ();
19458 case I387_TRUNC:
19459 case I387_FLOOR:
19460 case I387_CEIL:
19461 case I387_MASK_PM:
19462 return I387_CW_ANY;
19463 default:
19464 gcc_unreachable ();
19468 static int
19469 ix86_avx_u128_mode_exit (void)
19471 rtx reg = crtl->return_rtx;
19473 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
19474 or 512bit modes used in the function return register. */
19475 if (reg && ix86_check_avx_upper_register (reg))
19476 return AVX_U128_DIRTY;
19478 return AVX_U128_CLEAN;
19481 /* Return a mode that ENTITY is assumed to be
19482 switched to at function exit. */
19484 static int
19485 ix86_mode_exit (int entity)
19487 switch (entity)
19489 case X86_DIRFLAG:
19490 return X86_DIRFLAG_ANY;
19491 case AVX_U128:
19492 return ix86_avx_u128_mode_exit ();
19493 case I387_TRUNC:
19494 case I387_FLOOR:
19495 case I387_CEIL:
19496 case I387_MASK_PM:
19497 return I387_CW_ANY;
19498 default:
19499 gcc_unreachable ();
19503 static int
19504 ix86_mode_priority (int, int n)
19506 return n;
19509 /* Output code to initialize control word copies used by trunc?f?i and
19510 rounding patterns. CURRENT_MODE is set to current control word,
19511 while NEW_MODE is set to new control word. */
19513 static void
19514 emit_i387_cw_initialization (int mode)
19516 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19517 rtx new_mode;
19519 enum ix86_stack_slot slot;
19521 rtx reg = gen_reg_rtx (HImode);
19523 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19524 emit_move_insn (reg, copy_rtx (stored_mode));
19526 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19527 || optimize_insn_for_size_p ())
19529 switch (mode)
19531 case I387_CW_TRUNC:
19532 /* round toward zero (truncate) */
19533 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19534 slot = SLOT_CW_TRUNC;
19535 break;
19537 case I387_CW_FLOOR:
19538 /* round down toward -oo */
19539 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19540 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19541 slot = SLOT_CW_FLOOR;
19542 break;
19544 case I387_CW_CEIL:
19545 /* round up toward +oo */
19546 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19547 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19548 slot = SLOT_CW_CEIL;
19549 break;
19551 case I387_CW_MASK_PM:
19552 /* mask precision exception for nearbyint() */
19553 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19554 slot = SLOT_CW_MASK_PM;
19555 break;
19557 default:
19558 gcc_unreachable ();
19561 else
19563 switch (mode)
19565 case I387_CW_TRUNC:
19566 /* round toward zero (truncate) */
19567 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19568 slot = SLOT_CW_TRUNC;
19569 break;
19571 case I387_CW_FLOOR:
19572 /* round down toward -oo */
19573 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19574 slot = SLOT_CW_FLOOR;
19575 break;
19577 case I387_CW_CEIL:
19578 /* round up toward +oo */
19579 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19580 slot = SLOT_CW_CEIL;
19581 break;
19583 case I387_CW_MASK_PM:
19584 /* mask precision exception for nearbyint() */
19585 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19586 slot = SLOT_CW_MASK_PM;
19587 break;
19589 default:
19590 gcc_unreachable ();
19594 gcc_assert (slot < MAX_386_STACK_LOCALS);
19596 new_mode = assign_386_stack_local (HImode, slot);
19597 emit_move_insn (new_mode, reg);
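/* The masks above follow the layout of the x87 control word: the
   rounding-control field is bits 11:10 (0x0400 selects round-down, 0x0800
   round-up, 0x0c00 truncation; 0x0000 is round-to-nearest), and bit 5
   (0x0020) is the precision exception mask used for nearbyint.  */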
19600 /* Emit vzeroupper. */
19602 void
19603 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19605 int i;
19607 /* Cancel automatic vzeroupper insertion if there are
19608 live call-saved SSE registers at the insertion point. */
19610 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19611 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19612 return;
19614 if (TARGET_64BIT)
19615 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19616 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19617 return;
19619 emit_insn (gen_avx_vzeroupper ());
19624 /* Generate one or more insns to set ENTITY to MODE. REGS_LIVE
19625 is the set of hard registers live at the point where the insn(s)
19626 are to be inserted. */
19628 static void
19629 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19630 HARD_REG_SET regs_live)
19632 switch (entity)
19634 case X86_DIRFLAG:
19635 if (mode == X86_DIRFLAG_RESET)
19636 emit_insn (gen_cld ());
19637 break;
19638 case AVX_U128:
19639 if (mode == AVX_U128_CLEAN)
19640 ix86_avx_emit_vzeroupper (regs_live);
19641 break;
19642 case I387_TRUNC:
19643 case I387_FLOOR:
19644 case I387_CEIL:
19645 case I387_MASK_PM:
19646 if (mode != I387_CW_ANY
19647 && mode != I387_CW_UNINITIALIZED)
19648 emit_i387_cw_initialization (mode);
19649 break;
19650 default:
19651 gcc_unreachable ();
19655 /* Output code for INSN to convert a float to a signed int. OPERANDS
19656 are the insn operands. The output may be [HSD]Imode and the input
19657 operand may be [SDX]Fmode. */
19659 const char *
19660 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19662 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19663 bool dimode_p = GET_MODE (operands[0]) == DImode;
19664 int round_mode = get_attr_i387_cw (insn);
19666 static char buf[40];
19667 const char *p;
19669 /* Jump through a hoop or two for DImode, since the hardware has no
19670 non-popping instruction. We used to do this a different way, but
19671 that was somewhat fragile and broke with post-reload splitters. */
19672 if ((dimode_p || fisttp) && !stack_top_dies)
19673 output_asm_insn ("fld\t%y1", operands);
19675 gcc_assert (STACK_TOP_P (operands[1]));
19676 gcc_assert (MEM_P (operands[0]));
19677 gcc_assert (GET_MODE (operands[1]) != TFmode);
19679 if (fisttp)
19680 return "fisttp%Z0\t%0";
19682 strcpy (buf, "fist");
19684 if (round_mode != I387_CW_ANY)
19685 output_asm_insn ("fldcw\t%3", operands);
19687 p = "p%Z0\t%0";
19688 strcat (buf, p + !(stack_top_dies || dimode_p));
19690 output_asm_insn (buf, operands);
19692 if (round_mode != I387_CW_ANY)
19693 output_asm_insn ("fldcw\t%2", operands);
19695 return "";
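/* For instance, truncating an SFmode value in %st(0) to SImode memory
   typically assembles to something like

     fldcw   %3        # switch to the control word for this rounding mode
     fistpl  %0        # store the integer and pop %st(0)
     fldcw   %2        # restore the previous control word

   with the surrounding fldcw pair omitted when the insn's i387_cw
   attribute is I387_CW_ANY, and fisttp used instead when SSE3's truncating
   store is available.  */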
19698 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19699 have the values zero or one, indicates the ffreep insn's operand
19700 from the OPERANDS array. */
19702 static const char *
19703 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19705 if (TARGET_USE_FFREEP)
19706 #ifdef HAVE_AS_IX86_FFREEP
19707 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19708 #else
19710 static char retval[32];
19711 int regno = REGNO (operands[opno]);
19713 gcc_assert (STACK_REGNO_P (regno));
19715 regno -= FIRST_STACK_REG;
19717 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19718 return retval;
19720 #endif
19722 return opno ? "fstp\t%y1" : "fstp\t%y0";
19726 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19727 should be used. UNORDERED_P is true when fucom should be used. */
19729 const char *
19730 output_fp_compare (rtx_insn *insn, rtx *operands,
19731 bool eflags_p, bool unordered_p)
19733 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19734 bool stack_top_dies;
19736 static char buf[40];
19737 const char *p;
19739 gcc_assert (STACK_TOP_P (xops[0]));
19741 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19743 if (eflags_p)
19745 p = unordered_p ? "fucomi" : "fcomi";
19746 strcpy (buf, p);
19748 p = "p\t{%y1, %0|%0, %y1}";
19749 strcat (buf, p + !stack_top_dies);
19751 return buf;
19754 if (STACK_REG_P (xops[1])
19755 && stack_top_dies
19756 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19758 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19760 /* If the top of the 387 stack dies, and the other operand
19761 is also a stack register that dies, then this must be a
19762 `fcompp' float compare. */
19763 p = unordered_p ? "fucompp" : "fcompp";
19764 strcpy (buf, p);
19766 else if (const0_operand (xops[1], VOIDmode))
19768 gcc_assert (!unordered_p);
19769 strcpy (buf, "ftst");
19771 else
19773 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19775 gcc_assert (!unordered_p);
19776 p = "ficom";
19778 else
19779 p = unordered_p ? "fucom" : "fcom";
19781 strcpy (buf, p);
19783 p = "p%Z2\t%y2";
19784 strcat (buf, p + !stack_top_dies);
19787 output_asm_insn (buf, operands);
19788 return "fnstsw\t%0";
19791 void
19792 ix86_output_addr_vec_elt (FILE *file, int value)
19794 const char *directive = ASM_LONG;
19796 #ifdef ASM_QUAD
19797 if (TARGET_LP64)
19798 directive = ASM_QUAD;
19799 #else
19800 gcc_assert (!TARGET_64BIT);
19801 #endif
19803 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19806 void
19807 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19809 const char *directive = ASM_LONG;
19811 #ifdef ASM_QUAD
19812 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19813 directive = ASM_QUAD;
19814 #else
19815 gcc_assert (!TARGET_64BIT);
19816 #endif
19817 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19818 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19819 fprintf (file, "%s%s%d-%s%d\n",
19820 directive, LPREFIX, value, LPREFIX, rel);
19821 else if (HAVE_AS_GOTOFF_IN_DATA)
19822 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19823 #if TARGET_MACHO
19824 else if (TARGET_MACHO)
19826 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19827 machopic_output_function_base_name (file);
19828 putc ('\n', file);
19830 #endif
19831 else
19832 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19833 GOT_SYMBOL_NAME, LPREFIX, value);
19836 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19837 for the target. */
19839 void
19840 ix86_expand_clear (rtx dest)
19842 rtx tmp;
19844 /* We play register width games, which are only valid after reload. */
19845 gcc_assert (reload_completed);
19847 /* Avoid HImode and its attendant prefix byte. */
19848 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19849 dest = gen_rtx_REG (SImode, REGNO (dest));
19850 tmp = gen_rtx_SET (dest, const0_rtx);
19852 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19854 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19855 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19858 emit_insn (tmp);
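/* The two forms this expands to: "xorl %eax, %eax" is shorter than
   "movl $0, %eax" (2 bytes vs. 5 for this example register) but clobbers
   the flags, which is why the xor form carries the explicit CLOBBER of
   FLAGS_REG; many CPUs also treat the xor idiom as dependency-breaking.
   The register name here is only an example.  */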
19861 void
19862 ix86_expand_move (machine_mode mode, rtx operands[])
19864 rtx op0, op1;
19865 rtx tmp, addend = NULL_RTX;
19866 enum tls_model model;
19868 op0 = operands[0];
19869 op1 = operands[1];
19871 switch (GET_CODE (op1))
19873 case CONST:
19874 tmp = XEXP (op1, 0);
19876 if (GET_CODE (tmp) != PLUS
19877 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19878 break;
19880 op1 = XEXP (tmp, 0);
19881 addend = XEXP (tmp, 1);
19882 /* FALLTHRU */
19884 case SYMBOL_REF:
19885 model = SYMBOL_REF_TLS_MODEL (op1);
19887 if (model)
19888 op1 = legitimize_tls_address (op1, model, true);
19889 else if (ix86_force_load_from_GOT_p (op1))
19891 /* Load the external function address via GOT slot to avoid PLT. */
19892 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19893 (TARGET_64BIT
19894 ? UNSPEC_GOTPCREL
19895 : UNSPEC_GOT));
19896 op1 = gen_rtx_CONST (Pmode, op1);
19897 op1 = gen_const_mem (Pmode, op1);
19898 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19900 else
19902 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19903 if (tmp)
19905 op1 = tmp;
19906 if (!addend)
19907 break;
19909 else
19911 op1 = operands[1];
19912 break;
19916 if (addend)
19918 op1 = force_operand (op1, NULL_RTX);
19919 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19920 op0, 1, OPTAB_DIRECT);
19922 else
19923 op1 = force_operand (op1, op0);
19925 if (op1 == op0)
19926 return;
19928 op1 = convert_to_mode (mode, op1, 1);
19930 default:
19931 break;
19934 if ((flag_pic || MACHOPIC_INDIRECT)
19935 && symbolic_operand (op1, mode))
19937 if (TARGET_MACHO && !TARGET_64BIT)
19939 #if TARGET_MACHO
19940 /* dynamic-no-pic */
19941 if (MACHOPIC_INDIRECT)
19943 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19944 ? op0 : gen_reg_rtx (Pmode);
19945 op1 = machopic_indirect_data_reference (op1, temp);
19946 if (MACHOPIC_PURE)
19947 op1 = machopic_legitimize_pic_address (op1, mode,
19948 temp == op1 ? 0 : temp);
19950 if (op0 != op1 && GET_CODE (op0) != MEM)
19952 rtx insn = gen_rtx_SET (op0, op1);
19953 emit_insn (insn);
19954 return;
19956 if (GET_CODE (op0) == MEM)
19957 op1 = force_reg (Pmode, op1);
19958 else
19960 rtx temp = op0;
19961 if (GET_CODE (temp) != REG)
19962 temp = gen_reg_rtx (Pmode);
19963 temp = legitimize_pic_address (op1, temp);
19964 if (temp == op0)
19965 return;
19966 op1 = temp;
19968 /* dynamic-no-pic */
19969 #endif
19971 else
19973 if (MEM_P (op0))
19974 op1 = force_reg (mode, op1);
19975 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19977 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19978 op1 = legitimize_pic_address (op1, reg);
19979 if (op0 == op1)
19980 return;
19981 op1 = convert_to_mode (mode, op1, 1);
19985 else
19987 if (MEM_P (op0)
19988 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19989 || !push_operand (op0, mode))
19990 && MEM_P (op1))
19991 op1 = force_reg (mode, op1);
19993 if (push_operand (op0, mode)
19994 && ! general_no_elim_operand (op1, mode))
19995 op1 = copy_to_mode_reg (mode, op1);
19997 /* Force large constants in 64bit compilation into a register
19998 to get them CSEed. */
19999 if (can_create_pseudo_p ()
20000 && (mode == DImode) && TARGET_64BIT
20001 && immediate_operand (op1, mode)
20002 && !x86_64_zext_immediate_operand (op1, VOIDmode)
20003 && !register_operand (op0, mode)
20004 && optimize)
20005 op1 = copy_to_mode_reg (mode, op1);
20007 if (can_create_pseudo_p ()
20008 && CONST_DOUBLE_P (op1))
20010 /* If we are loading a floating point constant to a register,
20011 force the value to memory now, since we'll get better code
20012 out of the back end. */
20014 op1 = validize_mem (force_const_mem (mode, op1));
20015 if (!register_operand (op0, mode))
20017 rtx temp = gen_reg_rtx (mode);
20018 emit_insn (gen_rtx_SET (temp, op1));
20019 emit_move_insn (op0, temp);
20020 return;
20025 emit_insn (gen_rtx_SET (op0, op1));
20028 void
20029 ix86_expand_vector_move (machine_mode mode, rtx operands[])
20031 rtx op0 = operands[0], op1 = operands[1];
20032 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
20033 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
20034 unsigned int align = (TARGET_IAMCU
20035 ? GET_MODE_BITSIZE (mode)
20036 : GET_MODE_ALIGNMENT (mode));
20038 if (push_operand (op0, VOIDmode))
20039 op0 = emit_move_resolve_push (mode, op0);
20041 /* Force constants other than zero into memory. We do not know how
20042 the instructions used to build constants modify the upper 64 bits
20043 of the register; once we have that information we may be able
20044 to handle some of them more efficiently. */
20045 if (can_create_pseudo_p ()
20046 && (CONSTANT_P (op1)
20047 || (SUBREG_P (op1)
20048 && CONSTANT_P (SUBREG_REG (op1))))
20049 && ((register_operand (op0, mode)
20050 && !standard_sse_constant_p (op1, mode))
20051 /* ix86_expand_vector_move_misalign() does not like constants. */
20052 || (SSE_REG_MODE_P (mode)
20053 && MEM_P (op0)
20054 && MEM_ALIGN (op0) < align)))
20056 if (SUBREG_P (op1))
20058 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20059 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20060 if (r)
20061 r = validize_mem (r);
20062 else
20063 r = force_reg (imode, SUBREG_REG (op1));
20064 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20066 else
20067 op1 = validize_mem (force_const_mem (mode, op1));
20070 /* We need to check memory alignment for SSE modes since attributes
20071 can make operands unaligned. */
20072 if (can_create_pseudo_p ()
20073 && SSE_REG_MODE_P (mode)
20074 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20075 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20077 rtx tmp[2];
20079 /* ix86_expand_vector_move_misalign() does not like both
20080 arguments in memory. */
20081 if (!register_operand (op0, mode)
20082 && !register_operand (op1, mode))
20083 op1 = force_reg (mode, op1);
20085 tmp[0] = op0; tmp[1] = op1;
20086 ix86_expand_vector_move_misalign (mode, tmp);
20087 return;
20090 /* Make operand1 a register if it isn't already. */
20091 if (can_create_pseudo_p ()
20092 && !register_operand (op0, mode)
20093 && !register_operand (op1, mode))
20095 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20096 return;
20099 emit_insn (gen_rtx_SET (op0, op1));
20102 /* Split 32-byte AVX unaligned load and store if needed. */
20104 static void
20105 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20107 rtx m;
20108 rtx (*extract) (rtx, rtx, rtx);
20109 machine_mode mode;
20111 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20112 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20114 emit_insn (gen_rtx_SET (op0, op1));
20115 return;
20118 rtx orig_op0 = NULL_RTX;
20119 mode = GET_MODE (op0);
20120 switch (GET_MODE_CLASS (mode))
20122 case MODE_VECTOR_INT:
20123 case MODE_INT:
20124 if (mode != V32QImode)
20126 if (!MEM_P (op0))
20128 orig_op0 = op0;
20129 op0 = gen_reg_rtx (V32QImode);
20131 else
20132 op0 = gen_lowpart (V32QImode, op0);
20133 op1 = gen_lowpart (V32QImode, op1);
20134 mode = V32QImode;
20136 break;
20137 case MODE_VECTOR_FLOAT:
20138 break;
20139 default:
20140 gcc_unreachable ();
20143 switch (mode)
20145 default:
20146 gcc_unreachable ();
20147 case E_V32QImode:
20148 extract = gen_avx_vextractf128v32qi;
20149 mode = V16QImode;
20150 break;
20151 case E_V8SFmode:
20152 extract = gen_avx_vextractf128v8sf;
20153 mode = V4SFmode;
20154 break;
20155 case E_V4DFmode:
20156 extract = gen_avx_vextractf128v4df;
20157 mode = V2DFmode;
20158 break;
20161 if (MEM_P (op1))
20163 rtx r = gen_reg_rtx (mode);
20164 m = adjust_address (op1, mode, 0);
20165 emit_move_insn (r, m);
20166 m = adjust_address (op1, mode, 16);
20167 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20168 emit_move_insn (op0, r);
20170 else if (MEM_P (op0))
20172 m = adjust_address (op0, mode, 0);
20173 emit_insn (extract (m, op1, const0_rtx));
20174 m = adjust_address (op0, mode, 16);
20175 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20177 else
20178 gcc_unreachable ();
20180 if (orig_op0)
20181 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
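/* Roughly, a split unaligned 32-byte load and store come out as
   (register and address operands are only illustrative)

     vmovups      (%rsi), %xmm0
     vinsertf128  $1, 16(%rsi), %ymm0, %ymm0

   and

     vextractf128 $0, %ymm0, (%rdi)
     vextractf128 $1, %ymm0, 16(%rdi)

   so that each half is a 16-byte access.  */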
20184 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20185 straight to ix86_expand_vector_move. */
20186 /* Code generation for scalar reg-reg moves of single and double precision data:
20187 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20188 movaps reg, reg
20189 else
20190 movss reg, reg
20191 if (x86_sse_partial_reg_dependency == true)
20192 movapd reg, reg
20193 else
20194 movsd reg, reg
20196 Code generation for scalar loads of double precision data:
20197 if (x86_sse_split_regs == true)
20198 movlpd mem, reg (gas syntax)
20199 else
20200 movsd mem, reg
20202 Code generation for unaligned packed loads of single precision data
20203 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20204 if (x86_sse_unaligned_move_optimal)
20205 movups mem, reg
20207 if (x86_sse_partial_reg_dependency == true)
20209 xorps reg, reg
20210 movlps mem, reg
20211 movhps mem+8, reg
20213 else
20215 movlps mem, reg
20216 movhps mem+8, reg
20219 Code generation for unaligned packed loads of double precision data
20220 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20221 if (x86_sse_unaligned_move_optimal)
20222 movupd mem, reg
20224 if (x86_sse_split_regs == true)
20226 movlpd mem, reg
20227 movhpd mem+8, reg
20229 else
20231 movsd mem, reg
20232 movhpd mem+8, reg
20236 void
20237 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20239 rtx op0, op1, m;
20241 op0 = operands[0];
20242 op1 = operands[1];
20244 /* Use unaligned load/store for AVX512 or when optimizing for size. */
20245 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20247 emit_insn (gen_rtx_SET (op0, op1));
20248 return;
20251 if (TARGET_AVX)
20253 if (GET_MODE_SIZE (mode) == 32)
20254 ix86_avx256_split_vector_move_misalign (op0, op1);
20255 else
20256 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
20257 emit_insn (gen_rtx_SET (op0, op1));
20258 return;
20261 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20262 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20264 emit_insn (gen_rtx_SET (op0, op1));
20265 return;
20268 /* ??? If we have typed data, then it would appear that using
20269 movdqu is the only way to get unaligned data loaded with
20270 integer type. */
20271 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20273 emit_insn (gen_rtx_SET (op0, op1));
20274 return;
20277 if (MEM_P (op1))
20279 if (TARGET_SSE2 && mode == V2DFmode)
20281 rtx zero;
20283 /* When SSE registers are split into halves, we can avoid
20284 writing to the top half twice. */
20285 if (TARGET_SSE_SPLIT_REGS)
20287 emit_clobber (op0);
20288 zero = op0;
20290 else
20292 /* ??? Not sure about the best option for the Intel chips.
20293 The following would seem to satisfy; the register is
20294 entirely cleared, breaking the dependency chain. We
20295 then store to the upper half, with a dependency depth
20296 of one. A rumor has it that Intel recommends two movsd
20297 followed by an unpacklpd, but this is unconfirmed. And
20298 given that the dependency depth of the unpacklpd would
20299 still be one, I'm not sure why this would be better. */
20300 zero = CONST0_RTX (V2DFmode);
20303 m = adjust_address (op1, DFmode, 0);
20304 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20305 m = adjust_address (op1, DFmode, 8);
20306 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20308 else
20310 rtx t;
20312 if (mode != V4SFmode)
20313 t = gen_reg_rtx (V4SFmode);
20314 else
20315 t = op0;
20317 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20318 emit_move_insn (t, CONST0_RTX (V4SFmode));
20319 else
20320 emit_clobber (t);
20322 m = adjust_address (op1, V2SFmode, 0);
20323 emit_insn (gen_sse_loadlps (t, t, m));
20324 m = adjust_address (op1, V2SFmode, 8);
20325 emit_insn (gen_sse_loadhps (t, t, m));
20326 if (mode != V4SFmode)
20327 emit_move_insn (op0, gen_lowpart (mode, t));
20330 else if (MEM_P (op0))
20332 if (TARGET_SSE2 && mode == V2DFmode)
20334 m = adjust_address (op0, DFmode, 0);
20335 emit_insn (gen_sse2_storelpd (m, op1));
20336 m = adjust_address (op0, DFmode, 8);
20337 emit_insn (gen_sse2_storehpd (m, op1));
20339 else
20341 if (mode != V4SFmode)
20342 op1 = gen_lowpart (V4SFmode, op1);
20344 m = adjust_address (op0, V2SFmode, 0);
20345 emit_insn (gen_sse_storelps (m, op1));
20346 m = adjust_address (op0, V2SFmode, 8);
20347 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20350 else
20351 gcc_unreachable ();
20354 /* Helper function of ix86_fixup_binary_operands to canonicalize
20355 operand order. Returns true if the operands should be swapped. */
20357 static bool
20358 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20359 rtx operands[])
20361 rtx dst = operands[0];
20362 rtx src1 = operands[1];
20363 rtx src2 = operands[2];
20365 /* If the operation is not commutative, we can't do anything. */
20366 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
20367 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
20368 return false;
20370 /* Highest priority is that src1 should match dst. */
20371 if (rtx_equal_p (dst, src1))
20372 return false;
20373 if (rtx_equal_p (dst, src2))
20374 return true;
20376 /* Next highest priority is that immediate constants come second. */
20377 if (immediate_operand (src2, mode))
20378 return false;
20379 if (immediate_operand (src1, mode))
20380 return true;
20382 /* Lowest priority is that memory references should come second. */
20383 if (MEM_P (src2))
20384 return false;
20385 if (MEM_P (src1))
20386 return true;
20388 return false;
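/* The motivation is x86's two-address form: for a commutative operation
   where the destination matches the second source, e.g. a = b + a,
   swapping the sources lets the insn be emitted directly as
   "addl %ebx, %eax" (%eax += %ebx) instead of requiring an extra move of b
   into the destination first.  Register names are only illustrative.  */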
20392 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20393 destination to use for the operation. If different from the true
20394 destination in operands[0], a copy operation will be required. */
20397 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20398 rtx operands[])
20400 rtx dst = operands[0];
20401 rtx src1 = operands[1];
20402 rtx src2 = operands[2];
20404 /* Canonicalize operand order. */
20405 if (ix86_swap_binary_operands_p (code, mode, operands))
20407 /* It is invalid to swap operands of different modes. */
20408 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20410 std::swap (src1, src2);
20413 /* Both source operands cannot be in memory. */
20414 if (MEM_P (src1) && MEM_P (src2))
20416 /* Optimization: Only read from memory once. */
20417 if (rtx_equal_p (src1, src2))
20419 src2 = force_reg (mode, src2);
20420 src1 = src2;
20422 else if (rtx_equal_p (dst, src1))
20423 src2 = force_reg (mode, src2);
20424 else
20425 src1 = force_reg (mode, src1);
20428 /* If the destination is memory, and we do not have matching source
20429 operands, do things in registers. */
20430 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20431 dst = gen_reg_rtx (mode);
20433 /* Source 1 cannot be a constant. */
20434 if (CONSTANT_P (src1))
20435 src1 = force_reg (mode, src1);
20437 /* Source 1 cannot be a non-matching memory. */
20438 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20439 src1 = force_reg (mode, src1);
20441 /* Improve address combine. */
20442 if (code == PLUS
20443 && GET_MODE_CLASS (mode) == MODE_INT
20444 && MEM_P (src2))
20445 src2 = force_reg (mode, src2);
20447 operands[1] = src1;
20448 operands[2] = src2;
20449 return dst;
20452 /* Similarly, but assume that the destination has already been
20453 set up properly. */
20455 void
20456 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20457 machine_mode mode, rtx operands[])
20459 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20460 gcc_assert (dst == operands[0]);
20463 /* Attempt to expand a binary operator. Make the expansion closer to the
20464 actual machine than just general_operand, which would allow 3 separate
20465 memory references (one output, two input) in a single insn. */
20467 void
20468 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20469 rtx operands[])
20471 rtx src1, src2, dst, op, clob;
20473 dst = ix86_fixup_binary_operands (code, mode, operands);
20474 src1 = operands[1];
20475 src2 = operands[2];
20477 /* Emit the instruction. */
20479 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20481 if (reload_completed
20482 && code == PLUS
20483 && !rtx_equal_p (dst, src1))
20485 /* This is going to be an LEA; avoid splitting it later. */
20486 emit_insn (op);
20488 else
20490 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20491 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20494 /* Fix up the destination if needed. */
20495 if (dst != operands[0])
20496 emit_move_insn (operands[0], dst);
20499 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20500 the given OPERANDS. */
20502 void
20503 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20504 rtx operands[])
20506 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20507 if (SUBREG_P (operands[1]))
20509 op1 = operands[1];
20510 op2 = operands[2];
20512 else if (SUBREG_P (operands[2]))
20514 op1 = operands[2];
20515 op2 = operands[1];
20517 /* Optimize (__m128i) d | (__m128i) e and similar code
20518 when d and e are float vectors into a float vector logical
20519 insn. In C/C++ without using intrinsics there is no other way
20520 to express a vector logical operation on float vectors than
20521 to cast them temporarily to integer vectors. */
20522 if (op1
20523 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20524 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20525 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20526 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20527 && SUBREG_BYTE (op1) == 0
20528 && (GET_CODE (op2) == CONST_VECTOR
20529 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20530 && SUBREG_BYTE (op2) == 0))
20531 && can_create_pseudo_p ())
20533 rtx dst;
20534 switch (GET_MODE (SUBREG_REG (op1)))
20536 case E_V4SFmode:
20537 case E_V8SFmode:
20538 case E_V16SFmode:
20539 case E_V2DFmode:
20540 case E_V4DFmode:
20541 case E_V8DFmode:
20542 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20543 if (GET_CODE (op2) == CONST_VECTOR)
20545 op2 = gen_lowpart (GET_MODE (dst), op2);
20546 op2 = force_reg (GET_MODE (dst), op2);
20548 else
20550 op1 = operands[1];
20551 op2 = SUBREG_REG (operands[2]);
20552 if (!vector_operand (op2, GET_MODE (dst)))
20553 op2 = force_reg (GET_MODE (dst), op2);
20555 op1 = SUBREG_REG (op1);
20556 if (!vector_operand (op1, GET_MODE (dst)))
20557 op1 = force_reg (GET_MODE (dst), op1);
20558 emit_insn (gen_rtx_SET (dst,
20559 gen_rtx_fmt_ee (code, GET_MODE (dst),
20560 op1, op2)));
20561 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20562 return;
20563 default:
20564 break;
20567 if (!vector_operand (operands[1], mode))
20568 operands[1] = force_reg (mode, operands[1]);
20569 if (!vector_operand (operands[2], mode))
20570 operands[2] = force_reg (mode, operands[2]);
20571 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20572 emit_insn (gen_rtx_SET (operands[0],
20573 gen_rtx_fmt_ee (code, mode, operands[1],
20574 operands[2])));
20577 /* Return TRUE or FALSE depending on whether the binary operator meets the
20578 appropriate constraints. */
20580 bool
20581 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20582 rtx operands[3])
20584 rtx dst = operands[0];
20585 rtx src1 = operands[1];
20586 rtx src2 = operands[2];
20588 /* Both source operands cannot be in memory. */
20589 if (MEM_P (src1) && MEM_P (src2))
20590 return false;
20592 /* Canonicalize operand order for commutative operators. */
20593 if (ix86_swap_binary_operands_p (code, mode, operands))
20594 std::swap (src1, src2);
20596 /* If the destination is memory, we must have a matching source operand. */
20597 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20598 return false;
20600 /* Source 1 cannot be a constant. */
20601 if (CONSTANT_P (src1))
20602 return false;
20604 /* Source 1 cannot be a non-matching memory. */
20605 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20606 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20607 return (code == AND
20608 && (mode == HImode
20609 || mode == SImode
20610 || (TARGET_64BIT && mode == DImode))
20611 && satisfies_constraint_L (src2));
20613 return true;
20616 /* Attempt to expand a unary operator. Make the expansion closer to the
20617 actual machine than just general_operand, which would allow 2 separate
20618 memory references (one output, one input) in a single insn. */
20620 void
20621 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20622 rtx operands[])
20624 bool matching_memory = false;
20625 rtx src, dst, op, clob;
20627 dst = operands[0];
20628 src = operands[1];
20630 /* If the destination is memory, and we do not have matching source
20631 operands, do things in registers. */
20632 if (MEM_P (dst))
20634 if (rtx_equal_p (dst, src))
20635 matching_memory = true;
20636 else
20637 dst = gen_reg_rtx (mode);
20640 /* When source operand is memory, destination must match. */
20641 if (MEM_P (src) && !matching_memory)
20642 src = force_reg (mode, src);
20644 /* Emit the instruction. */
20646 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20648 if (code == NOT)
20649 emit_insn (op);
20650 else
20652 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20653 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20656 /* Fix up the destination if needed. */
20657 if (dst != operands[0])
20658 emit_move_insn (operands[0], dst);
20661 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20662 divisor are within the range [0-255]. */
20664 void
20665 ix86_split_idivmod (machine_mode mode, rtx operands[],
20666 bool signed_p)
20668 rtx_code_label *end_label, *qimode_label;
20669 rtx div, mod;
20670 rtx_insn *insn;
20671 rtx scratch, tmp0, tmp1, tmp2;
20672 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20673 rtx (*gen_zero_extend) (rtx, rtx);
20674 rtx (*gen_test_ccno_1) (rtx, rtx);
20676 switch (mode)
20678 case E_SImode:
20679 if (GET_MODE (operands[0]) == SImode)
20681 if (GET_MODE (operands[1]) == SImode)
20682 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20683 else
20684 gen_divmod4_1
20685 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20686 gen_zero_extend = gen_zero_extendqisi2;
20688 else
20690 gen_divmod4_1
20691 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20692 gen_zero_extend = gen_zero_extendqidi2;
20694 gen_test_ccno_1 = gen_testsi_ccno_1;
20695 break;
20696 case E_DImode:
20697 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20698 gen_test_ccno_1 = gen_testdi_ccno_1;
20699 gen_zero_extend = gen_zero_extendqidi2;
20700 break;
20701 default:
20702 gcc_unreachable ();
20705 end_label = gen_label_rtx ();
20706 qimode_label = gen_label_rtx ();
20708 scratch = gen_reg_rtx (mode);
20710 /* Use 8bit unsigned divmod if the dividend and divisor are within
20711 the range [0-255]. */
20712 emit_move_insn (scratch, operands[2]);
20713 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20714 scratch, 1, OPTAB_DIRECT);
20715 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20716 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20717 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20718 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20719 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20720 pc_rtx);
20721 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20722 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20723 JUMP_LABEL (insn) = qimode_label;
20725 /* Generate the original signed/unsigned divmod. */
20726 div = gen_divmod4_1 (operands[0], operands[1],
20727 operands[2], operands[3]);
20728 emit_insn (div);
20730 /* Branch to the end. */
20731 emit_jump_insn (gen_jump (end_label));
20732 emit_barrier ();
20734 /* Generate 8bit unsigned divide. */
20735 emit_label (qimode_label);
20736 /* Don't use operands[0] for result of 8bit divide since not all
20737 registers support QImode ZERO_EXTRACT. */
20738 tmp0 = lowpart_subreg (HImode, scratch, mode);
20739 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20740 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20741 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20743 if (signed_p)
20745 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20746 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20748 else
20750 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20751 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20753 if (mode == SImode)
20755 if (GET_MODE (operands[0]) != SImode)
20756 div = gen_rtx_ZERO_EXTEND (DImode, div);
20757 if (GET_MODE (operands[1]) != SImode)
20758 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20761 /* Extract remainder from AH. */
20762 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20763 tmp0, GEN_INT (8), GEN_INT (8));
20764 if (REG_P (operands[1]))
20765 insn = emit_move_insn (operands[1], tmp1);
20766 else
20768 /* Need a new scratch register since the old one has result
20769 of 8bit divide. */
20770 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20771 emit_move_insn (scratch, tmp1);
20772 insn = emit_move_insn (operands[1], scratch);
20774 set_unique_reg_note (insn, REG_EQUAL, mod);
20776 /* Zero extend quotient from AL. */
20777 tmp1 = gen_lowpart (QImode, tmp0);
20778 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20779 set_unique_reg_note (insn, REG_EQUAL, div);
20781 emit_label (end_label);
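/* Rough shape of the emitted code (labels and registers are only
   illustrative):

     movl    dividend, %ecx
     orl     divisor, %ecx
     testl   $-256, %ecx          # both operands within [0-255]?
     je      .Lqimode
     ...full 32/64-bit (i)div...
     jmp     .Ldone
   .Lqimode:
     ...8-bit divb; quotient ends up in %al, remainder in %ah...
   .Ldone:

   The REG_EQUAL notes added above let later passes still see the result as
   a plain DIV/MOD (or UDIV/UMOD).  */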
20784 #define LEA_MAX_STALL (3)
20785 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20787 /* Increase given DISTANCE in half-cycles according to
20788 dependencies between PREV and NEXT instructions.
20789 Add 1 half-cycle if there is no dependency and
20790 go to the next cycle if there is some dependency. */
20792 static unsigned int
20793 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20795 df_ref def, use;
20797 if (!prev || !next)
20798 return distance + (distance & 1) + 2;
20800 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20801 return distance + 1;
20803 FOR_EACH_INSN_USE (use, next)
20804 FOR_EACH_INSN_DEF (def, prev)
20805 if (!DF_REF_IS_ARTIFICIAL (def)
20806 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20807 return distance + (distance & 1) + 2;
20809 return distance + 1;
20812 /* Function checks if instruction INSN defines register number
20813 REGNO1 or REGNO2. */
20815 static bool
20816 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20817 rtx_insn *insn)
20819 df_ref def;
20821 FOR_EACH_INSN_DEF (def, insn)
20822 if (DF_REF_REG_DEF_P (def)
20823 && !DF_REF_IS_ARTIFICIAL (def)
20824 && (regno1 == DF_REF_REGNO (def)
20825 || regno2 == DF_REF_REGNO (def)))
20826 return true;
20828 return false;
20831 /* Function checks if instruction INSN uses register number
20832 REGNO as a part of address expression. */
20834 static bool
20835 insn_uses_reg_mem (unsigned int regno, rtx insn)
20837 df_ref use;
20839 FOR_EACH_INSN_USE (use, insn)
20840 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20841 return true;
20843 return false;
20846 /* Search backward for non-agu definition of register number REGNO1
20847 or register number REGNO2 in basic block starting from instruction
20848 START up to head of basic block or instruction INSN.
20850 Function puts true value into *FOUND var if definition was found
20851 and false otherwise.
20853 Distance in half-cycles between START and found instruction or head
20854 of BB is added to DISTANCE and returned. */
20856 static int
20857 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20858 rtx_insn *insn, int distance,
20859 rtx_insn *start, bool *found)
20861 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20862 rtx_insn *prev = start;
20863 rtx_insn *next = NULL;
20865 *found = false;
20867 while (prev
20868 && prev != insn
20869 && distance < LEA_SEARCH_THRESHOLD)
20871 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20873 distance = increase_distance (prev, next, distance);
20874 if (insn_defines_reg (regno1, regno2, prev))
20876 if (recog_memoized (prev) < 0
20877 || get_attr_type (prev) != TYPE_LEA)
20879 *found = true;
20880 return distance;
20884 next = prev;
20886 if (prev == BB_HEAD (bb))
20887 break;
20889 prev = PREV_INSN (prev);
20892 return distance;
20895 /* Search backward for non-agu definition of register number REGNO1
20896 or register number REGNO2 in INSN's basic block until
20897 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20898 2. Reach neighbor BBs boundary, or
20899 3. Reach agu definition.
20900 Returns the distance between the non-agu definition point and INSN.
20901 If no definition point, returns -1. */
20903 static int
20904 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20905 rtx_insn *insn)
20907 basic_block bb = BLOCK_FOR_INSN (insn);
20908 int distance = 0;
20909 bool found = false;
20911 if (insn != BB_HEAD (bb))
20912 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20913 distance, PREV_INSN (insn),
20914 &found);
20916 if (!found && distance < LEA_SEARCH_THRESHOLD)
20918 edge e;
20919 edge_iterator ei;
20920 bool simple_loop = false;
20922 FOR_EACH_EDGE (e, ei, bb->preds)
20923 if (e->src == bb)
20925 simple_loop = true;
20926 break;
20929 if (simple_loop)
20930 distance = distance_non_agu_define_in_bb (regno1, regno2,
20931 insn, distance,
20932 BB_END (bb), &found);
20933 else
20935 int shortest_dist = -1;
20936 bool found_in_bb = false;
20938 FOR_EACH_EDGE (e, ei, bb->preds)
20940 int bb_dist
20941 = distance_non_agu_define_in_bb (regno1, regno2,
20942 insn, distance,
20943 BB_END (e->src),
20944 &found_in_bb);
20945 if (found_in_bb)
20947 if (shortest_dist < 0)
20948 shortest_dist = bb_dist;
20949 else if (bb_dist > 0)
20950 shortest_dist = MIN (bb_dist, shortest_dist);
20952 found = true;
20956 distance = shortest_dist;
20960 /* get_attr_type may modify recog data. We want to make sure
20961 that recog data is valid for instruction INSN, on which
20962 distance_non_agu_define is called. INSN is unchanged here. */
20963 extract_insn_cached (insn);
20965 if (!found)
20966 return -1;
20968 return distance >> 1;
20971 /* Return the distance in half-cycles between INSN and the next
20972 insn that uses register number REGNO in a memory address, added
20973 to DISTANCE. Return -1 if REGNO is set.
20975 Put true value into *FOUND if register usage was found and
20976 false otherwise.
20977 Put true value into *REDEFINED if register redefinition was
20978 found and false otherwise. */
20980 static int
20981 distance_agu_use_in_bb (unsigned int regno,
20982 rtx_insn *insn, int distance, rtx_insn *start,
20983 bool *found, bool *redefined)
20985 basic_block bb = NULL;
20986 rtx_insn *next = start;
20987 rtx_insn *prev = NULL;
20989 *found = false;
20990 *redefined = false;
20992 if (start != NULL_RTX)
20994 bb = BLOCK_FOR_INSN (start);
20995 if (start != BB_HEAD (bb))
20996 /* If insn and start belong to the same bb, set prev to insn,
20997 so the call to increase_distance will increase the distance
20998 between insns by 1. */
20999 prev = insn;
21002 while (next
21003 && next != insn
21004 && distance < LEA_SEARCH_THRESHOLD)
21006 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
21008 distance = increase_distance(prev, next, distance);
21009 if (insn_uses_reg_mem (regno, next))
21011 /* Return DISTANCE if OP0 is used in memory
21012 address in NEXT. */
21013 *found = true;
21014 return distance;
21017 if (insn_defines_reg (regno, INVALID_REGNUM, next))
21019 /* Return -1 if OP0 is set in NEXT. */
21020 *redefined = true;
21021 return -1;
21024 prev = next;
21027 if (next == BB_END (bb))
21028 break;
21030 next = NEXT_INSN (next);
21033 return distance;
21036 /* Return the distance between INSN and the next insn that uses
21037 register number REGNO0 in a memory address. Return -1 if no such
21038 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
21040 static int
21041 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21043 basic_block bb = BLOCK_FOR_INSN (insn);
21044 int distance = 0;
21045 bool found = false;
21046 bool redefined = false;
21048 if (insn != BB_END (bb))
21049 distance = distance_agu_use_in_bb (regno0, insn, distance,
21050 NEXT_INSN (insn),
21051 &found, &redefined);
21053 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21055 edge e;
21056 edge_iterator ei;
21057 bool simple_loop = false;
21059 FOR_EACH_EDGE (e, ei, bb->succs)
21060 if (e->dest == bb)
21062 simple_loop = true;
21063 break;
21066 if (simple_loop)
21067 distance = distance_agu_use_in_bb (regno0, insn,
21068 distance, BB_HEAD (bb),
21069 &found, &redefined);
21070 else
21072 int shortest_dist = -1;
21073 bool found_in_bb = false;
21074 bool redefined_in_bb = false;
21076 FOR_EACH_EDGE (e, ei, bb->succs)
21078 int bb_dist
21079 = distance_agu_use_in_bb (regno0, insn,
21080 distance, BB_HEAD (e->dest),
21081 &found_in_bb, &redefined_in_bb);
21082 if (found_in_bb)
21084 if (shortest_dist < 0)
21085 shortest_dist = bb_dist;
21086 else if (bb_dist > 0)
21087 shortest_dist = MIN (bb_dist, shortest_dist);
21089 found = true;
21093 distance = shortest_dist;
21097 if (!found || redefined)
21098 return -1;
21100 return distance >> 1;
21103 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21104 there is a dilemma of choosing LEA or ADD.
21105 Negative value: ADD is preferred over LEA
21106 Zero: Neutral
21107 Positive value: LEA is preferred over ADD. */
21108 #define IX86_LEA_PRIORITY 0
21110 /* Return true if use of the lea INSN has a performance advantage
21111 over a sequence of instructions. The instruction sequence has
21112 SPLIT_COST cycles higher latency than the lea latency. */
21114 static bool
21115 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21116 unsigned int regno2, int split_cost, bool has_scale)
21118 int dist_define, dist_use;
21120 /* For Silvermont, if a 2-source or 3-source LEA is used for a
21121 non-destructive destination, or because the ability to use
21122 SCALE is wanted, the use of LEA is justified. */
21123 if (TARGET_SILVERMONT || TARGET_INTEL)
21125 if (has_scale)
21126 return true;
21127 if (split_cost < 1)
21128 return false;
21129 if (regno0 == regno1 || regno0 == regno2)
21130 return false;
21131 return true;
21134 dist_define = distance_non_agu_define (regno1, regno2, insn);
21135 dist_use = distance_agu_use (regno0, insn);
21137 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21139 /* If there is no non-AGU operand definition, no AGU
21140 operand usage and the split cost is 0, then both the lea
21141 and non-lea variants have the same priority. Currently
21142 we prefer lea for 64-bit code and non-lea for 32-bit
21143 code. */
21144 if (dist_use < 0 && split_cost == 0)
21145 return TARGET_64BIT || IX86_LEA_PRIORITY;
21146 else
21147 return true;
21150 /* With a longer definition distance, lea is preferable.
21151 Here we adjust the distance to take the splitting cost and
21152 lea priority into account. */
21153 dist_define += split_cost + IX86_LEA_PRIORITY;
21155 /* If there is no use in a memory address then we just check
21156 that the split cost exceeds the AGU stall. */
21157 if (dist_use < 0)
21158 return dist_define > LEA_MAX_STALL;
21160 /* If this insn has both backward non-agu dependence and forward
21161 agu dependence, the one with the shorter distance takes effect. */
21162 return dist_define >= dist_use;
21165 /* Return true if it is legal to clobber flags by INSN and
21166 false otherwise. */
21168 static bool
21169 ix86_ok_to_clobber_flags (rtx_insn *insn)
21171 basic_block bb = BLOCK_FOR_INSN (insn);
21172 df_ref use;
21173 bitmap live;
21175 while (insn)
21177 if (NONDEBUG_INSN_P (insn))
21179 FOR_EACH_INSN_USE (use, insn)
21180 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21181 return false;
21183 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21184 return true;
21187 if (insn == BB_END (bb))
21188 break;
21190 insn = NEXT_INSN (insn);
21193 live = df_get_live_out (bb);
21194 return !REGNO_REG_SET_P (live, FLAGS_REG);
21197 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21198 move and add to avoid AGU stalls. */
21200 bool
21201 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21203 unsigned int regno0, regno1, regno2;
21205 /* Check if we need to optimize. */
21206 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21207 return false;
21209 /* Check it is correct to split here. */
21210 if (!ix86_ok_to_clobber_flags (insn))
21211 return false;
21213 regno0 = true_regnum (operands[0]);
21214 regno1 = true_regnum (operands[1]);
21215 regno2 = true_regnum (operands[2]);
21217 /* We only need to split adds with a non-destructive
21218 destination operand.  */
21219 if (regno0 == regno1 || regno0 == regno2)
21220 return false;
21221 else
21222 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21225 /* Return true if we should emit lea instruction instead of mov
21226 instruction. */
21228 bool
21229 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21231 unsigned int regno0, regno1;
21233 /* Check if we need to optimize. */
21234 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21235 return false;
21237 /* Use lea for reg to reg moves only. */
21238 if (!REG_P (operands[0]) || !REG_P (operands[1]))
21239 return false;
21241 regno0 = true_regnum (operands[0]);
21242 regno1 = true_regnum (operands[1]);
21244 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21247 /* Return true if we need to split lea into a sequence of
21248 instructions to avoid AGU stalls. */
21250 bool
21251 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21253 unsigned int regno0, regno1, regno2;
21254 int split_cost;
21255 struct ix86_address parts;
21256 int ok;
21258 /* Check we need to optimize. */
21259 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21260 return false;
21262 /* The "at least two components" test below might not catch simple
21263 move or zero extension insns if parts.base is non-NULL and parts.disp
21264 is const0_rtx as the only components in the address, e.g. if the
21265 register is %rbp or %r13. As this test is much cheaper and moves or
21266 zero extensions are the common case, do this check first. */
21267 if (REG_P (operands[1])
21268 || (SImode_address_operand (operands[1], VOIDmode)
21269 && REG_P (XEXP (operands[1], 0))))
21270 return false;
21272 /* Check if it is OK to split here. */
21273 if (!ix86_ok_to_clobber_flags (insn))
21274 return false;
21276 ok = ix86_decompose_address (operands[1], &parts);
21277 gcc_assert (ok);
21279 /* There should be at least two components in the address. */
21280 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21281 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21282 return false;
21284 /* We should not split into an add if a non-legitimate PIC
21285 operand is used as the displacement.  */
21286 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21287 return false;
21289 regno0 = true_regnum (operands[0]);
21290 regno1 = INVALID_REGNUM;
21291 regno2 = INVALID_REGNUM;
21293 if (parts.base)
21294 regno1 = true_regnum (parts.base);
21295 if (parts.index)
21296 regno2 = true_regnum (parts.index);
21298 split_cost = 0;
21300 /* Compute how many cycles we will add to the execution time
21301 if we split the lea into a sequence of instructions.  */
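   /* Illustrative example (not taken from real compiler output):
      splitting "lea 0x4(%rbx,%rcx,2), %rax" needs a mov, a shift and
      two adds (mov %rcx,%rax; shl $1,%rax; add %rbx,%rax; add $4,%rax),
      so the accounting below yields 1 + 1 + 1 + 1 - 1 = 3 extra cycles
      over the single lea.  */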
21302 if (parts.base || parts.index)
21304 /* Have to use a mov instruction if the non-destructive
21305 destination form is used.  */
21306 if (regno1 != regno0 && regno2 != regno0)
21307 split_cost += 1;
21309 /* Have to add index to base if both exist. */
21310 if (parts.base && parts.index)
21311 split_cost += 1;
21313 /* Have to use shift and adds if scale is 2 or greater. */
21314 if (parts.scale > 1)
21316 if (regno0 != regno1)
21317 split_cost += 1;
21318 else if (regno2 == regno0)
21319 split_cost += 4;
21320 else
21321 split_cost += parts.scale;
21324 /* Have to use an add instruction with an immediate if
21325 disp is nonzero.  */
21326 if (parts.disp && parts.disp != const0_rtx)
21327 split_cost += 1;
21329 /* Subtract the price of lea. */
21330 split_cost -= 1;
21333 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21334 parts.scale > 1);
21337 /* Emit the x86 binary operation CODE in mode MODE, where the first operand
21338 matches the destination.  The emitted RTX includes a clobber of FLAGS_REG.  */
21340 static void
21341 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21342 rtx dst, rtx src)
21344 rtx op, clob;
21346 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21347 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21349 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21352 /* Return true if the definition of regno1 is nearer to INSN than that of regno2.  */
21354 static bool
21355 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21357 rtx_insn *prev = insn;
21358 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21360 if (insn == start)
21361 return false;
21362 while (prev && prev != start)
21364 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21366 prev = PREV_INSN (prev);
21367 continue;
21369 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21370 return true;
21371 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21372 return false;
21373 prev = PREV_INSN (prev);
21376 /* None of the regs is defined in the bb. */
21377 return false;
21380 /* Split lea instructions into a sequence of instructions
21381 which are executed on the ALU to avoid AGU stalls.
21382 It is assumed that it is allowed to clobber the flags register
21383 at the lea position.  */
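/* For instance (illustrative), "lea (%rax,%rbx,2), %rax" falls into the
   regno1 == regno0 case below and is replaced by the repeated additions
   "add %rbx,%rax; add %rbx,%rax", while "lea 0x4(%rbx,%rcx,2), %rax"
   becomes "mov %rcx,%rax; shl $1,%rax; add %rbx,%rax; add $4,%rax".  */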
21385 void
21386 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21388 unsigned int regno0, regno1, regno2;
21389 struct ix86_address parts;
21390 rtx target, tmp;
21391 int ok, adds;
21393 ok = ix86_decompose_address (operands[1], &parts);
21394 gcc_assert (ok);
21396 target = gen_lowpart (mode, operands[0]);
21398 regno0 = true_regnum (target);
21399 regno1 = INVALID_REGNUM;
21400 regno2 = INVALID_REGNUM;
21402 if (parts.base)
21404 parts.base = gen_lowpart (mode, parts.base);
21405 regno1 = true_regnum (parts.base);
21408 if (parts.index)
21410 parts.index = gen_lowpart (mode, parts.index);
21411 regno2 = true_regnum (parts.index);
21414 if (parts.disp)
21415 parts.disp = gen_lowpart (mode, parts.disp);
21417 if (parts.scale > 1)
21419 /* Case r1 = r1 + ... */
21420 if (regno1 == regno0)
21422 /* If we have the case r1 = r1 + C * r2 then we
21423 would have to use multiplication, which is very
21424 expensive.  Assume the cost model is wrong if we
21425 reach such a case here.  */
21426 gcc_assert (regno2 != regno0);
21428 for (adds = parts.scale; adds > 0; adds--)
21429 ix86_emit_binop (PLUS, mode, target, parts.index);
21431 else
21433 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21434 if (regno0 != regno2)
21435 emit_insn (gen_rtx_SET (target, parts.index));
21437 /* Use shift for scaling. */
21438 ix86_emit_binop (ASHIFT, mode, target,
21439 GEN_INT (exact_log2 (parts.scale)));
21441 if (parts.base)
21442 ix86_emit_binop (PLUS, mode, target, parts.base);
21444 if (parts.disp && parts.disp != const0_rtx)
21445 ix86_emit_binop (PLUS, mode, target, parts.disp);
21448 else if (!parts.base && !parts.index)
21450 gcc_assert (parts.disp);
21451 emit_insn (gen_rtx_SET (target, parts.disp));
21453 else
21455 if (!parts.base)
21457 if (regno0 != regno2)
21458 emit_insn (gen_rtx_SET (target, parts.index));
21460 else if (!parts.index)
21462 if (regno0 != regno1)
21463 emit_insn (gen_rtx_SET (target, parts.base));
21465 else
21467 if (regno0 == regno1)
21468 tmp = parts.index;
21469 else if (regno0 == regno2)
21470 tmp = parts.base;
21471 else
21473 rtx tmp1;
21475 /* Find better operand for SET instruction, depending
21476 on which definition is farther from the insn. */
21477 if (find_nearest_reg_def (insn, regno1, regno2))
21478 tmp = parts.index, tmp1 = parts.base;
21479 else
21480 tmp = parts.base, tmp1 = parts.index;
21482 emit_insn (gen_rtx_SET (target, tmp));
21484 if (parts.disp && parts.disp != const0_rtx)
21485 ix86_emit_binop (PLUS, mode, target, parts.disp);
21487 ix86_emit_binop (PLUS, mode, target, tmp1);
21488 return;
21491 ix86_emit_binop (PLUS, mode, target, tmp);
21494 if (parts.disp && parts.disp != const0_rtx)
21495 ix86_emit_binop (PLUS, mode, target, parts.disp);
21499 /* Return true if it is OK to optimize an ADD operation to an LEA
21500 operation to avoid flag register consumption.  For most processors,
21501 ADD is faster than LEA.  For processors like BONNELL, if the
21502 destination register of the LEA holds an actual address which will be
21503 used soon, LEA is better; otherwise ADD is better.  */
21505 bool
21506 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21508 unsigned int regno0 = true_regnum (operands[0]);
21509 unsigned int regno1 = true_regnum (operands[1]);
21510 unsigned int regno2 = true_regnum (operands[2]);
21512 /* If a = b + c, (a!=b && a!=c), must use lea form. */
21513 if (regno0 != regno1 && regno0 != regno2)
21514 return true;
21516 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21517 return false;
21519 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21522 /* Return true if destination reg of SET_BODY is shift count of
21523 USE_BODY. */
21525 static bool
21526 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21528 rtx set_dest;
21529 rtx shift_rtx;
21530 int i;
21532 /* Retrieve destination of SET_BODY. */
21533 switch (GET_CODE (set_body))
21535 case SET:
21536 set_dest = SET_DEST (set_body);
21537 if (!set_dest || !REG_P (set_dest))
21538 return false;
21539 break;
21540 case PARALLEL:
21541 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21542 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21543 use_body))
21544 return true;
21545 /* FALLTHROUGH */
21546 default:
21547 return false;
21550 /* Retrieve shift count of USE_BODY. */
21551 switch (GET_CODE (use_body))
21553 case SET:
21554 shift_rtx = XEXP (use_body, 1);
21555 break;
21556 case PARALLEL:
21557 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21558 if (ix86_dep_by_shift_count_body (set_body,
21559 XVECEXP (use_body, 0, i)))
21560 return true;
21561 /* FALLTHROUGH */
21562 default:
21563 return false;
21566 if (shift_rtx
21567 && (GET_CODE (shift_rtx) == ASHIFT
21568 || GET_CODE (shift_rtx) == LSHIFTRT
21569 || GET_CODE (shift_rtx) == ASHIFTRT
21570 || GET_CODE (shift_rtx) == ROTATE
21571 || GET_CODE (shift_rtx) == ROTATERT))
21573 rtx shift_count = XEXP (shift_rtx, 1);
21575 /* Return true if shift count is dest of SET_BODY. */
21576 if (REG_P (shift_count))
21578 /* Add a check since this can be invoked before register
21579 allocation by the pre-reload scheduler.  */
21580 if (reload_completed
21581 && true_regnum (set_dest) == true_regnum (shift_count))
21582 return true;
21583 else if (REGNO (set_dest) == REGNO (shift_count))
21584 return true;
21588 return false;
21591 /* Return true if destination reg of SET_INSN is shift count of
21592 USE_INSN. */
21594 bool
21595 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21597 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21598 PATTERN (use_insn));
21601 /* Return TRUE or FALSE depending on whether the unary operator meets the
21602 appropriate constraints. */
21604 bool
21605 ix86_unary_operator_ok (enum rtx_code,
21606 machine_mode,
21607 rtx operands[2])
21609 /* If one of operands is memory, source and destination must match. */
21610 if ((MEM_P (operands[0])
21611 || MEM_P (operands[1]))
21612 && ! rtx_equal_p (operands[0], operands[1]))
21613 return false;
21614 return true;
21617 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21618 are ok, keeping in mind the possible movddup alternative. */
21620 bool
21621 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21623 if (MEM_P (operands[0]))
21624 return rtx_equal_p (operands[0], operands[1 + high]);
21625 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21626 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21627 return true;
21630 /* Post-reload splitter for converting an SF or DFmode value in an
21631 SSE register into an unsigned SImode. */
21633 void
21634 ix86_split_convert_uns_si_sse (rtx operands[])
21636 machine_mode vecmode;
21637 rtx value, large, zero_or_two31, input, two31, x;
21639 large = operands[1];
21640 zero_or_two31 = operands[2];
21641 input = operands[3];
21642 two31 = operands[4];
21643 vecmode = GET_MODE (large);
21644 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21646 /* Load up the value into the low element. We must ensure that the other
21647 elements are valid floats -- zero is the easiest such value. */
21648 if (MEM_P (input))
21650 if (vecmode == V4SFmode)
21651 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21652 else
21653 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21655 else
21657 input = gen_rtx_REG (vecmode, REGNO (input));
21658 emit_move_insn (value, CONST0_RTX (vecmode));
21659 if (vecmode == V4SFmode)
21660 emit_insn (gen_sse_movss (value, value, input));
21661 else
21662 emit_insn (gen_sse2_movsd (value, value, input));
21665 emit_move_insn (large, two31);
21666 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21668 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21669 emit_insn (gen_rtx_SET (large, x));
21671 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21672 emit_insn (gen_rtx_SET (zero_or_two31, x));
21674 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21675 emit_insn (gen_rtx_SET (value, x));
21677 large = gen_rtx_REG (V4SImode, REGNO (large));
21678 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21680 x = gen_rtx_REG (V4SImode, REGNO (value));
21681 if (vecmode == V4SFmode)
21682 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21683 else
21684 emit_insn (gen_sse2_cvttpd2dq (x, value));
21685 value = x;
21687 emit_insn (gen_xorv4si3 (value, value, large));
21690 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21691 Expects the 64-bit DImode to be supplied in a pair of integral
21692 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21693 -mfpmath=sse, !optimize_size only. */
21695 void
21696 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21698 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21699 rtx int_xmm, fp_xmm;
21700 rtx biases, exponents;
21701 rtx x;
21703 int_xmm = gen_reg_rtx (V4SImode);
21704 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21705 emit_insn (gen_movdi_to_sse (int_xmm, input));
21706 else if (TARGET_SSE_SPLIT_REGS)
21708 emit_clobber (int_xmm);
21709 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21711 else
21713 x = gen_reg_rtx (V2DImode);
21714 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21715 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21718 x = gen_rtx_CONST_VECTOR (V4SImode,
21719 gen_rtvec (4, GEN_INT (0x43300000UL),
21720 GEN_INT (0x45300000UL),
21721 const0_rtx, const0_rtx));
21722 exponents = validize_mem (force_const_mem (V4SImode, x));
21724 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21725 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21727 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
21728 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21729 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21730 (0x1.0p84 + double(fp_value_hi_xmm)).
21731 Note these exponents differ by 32. */
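   /* Illustrative example: for the input 0x100000005 (2^32 + 5) the low
      half becomes 0x1.0p52 + 5.0 and the high half 0x1.0p84 + 0x1.0p32;
      after the bias subtraction below and the final addition the result
      is 4294967301.0 == 2^32 + 5.  */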
21733 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21735 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21736 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21737 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21738 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21739 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21740 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21741 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21742 biases = validize_mem (force_const_mem (V2DFmode, biases));
21743 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21745 /* Add the upper and lower DFmode values together. */
21746 if (TARGET_SSE3)
21747 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21748 else
21750 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21751 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21752 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21755 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21758 /* Not used, but eases macroization of patterns. */
21759 void
21760 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21762 gcc_unreachable ();
21765 /* Convert an unsigned SImode value into a DFmode. Only currently used
21766 for SSE, but applicable anywhere. */
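/* Sketch of the trick used below: compute x = input - 2^31 with
   wrap-around integer arithmetic, convert x as a signed value, then add
   2^31 back as a double.  E.g. (illustrative) input 5 gives
   x = -2147483643, fp = -2147483643.0 and the final sum 5.0; input
   0x80000003 gives x = 3 and the sum 2147483651.0.  */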
21768 void
21769 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21771 REAL_VALUE_TYPE TWO31r;
21772 rtx x, fp;
21774 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21775 NULL, 1, OPTAB_DIRECT);
21777 fp = gen_reg_rtx (DFmode);
21778 emit_insn (gen_floatsidf2 (fp, x));
21780 real_ldexp (&TWO31r, &dconst1, 31);
21781 x = const_double_from_real_value (TWO31r, DFmode);
21783 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21784 if (x != target)
21785 emit_move_insn (target, x);
21788 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21789 32-bit mode; otherwise we have a direct convert instruction. */
21791 void
21792 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21794 REAL_VALUE_TYPE TWO32r;
21795 rtx fp_lo, fp_hi, x;
21797 fp_lo = gen_reg_rtx (DFmode);
21798 fp_hi = gen_reg_rtx (DFmode);
21800 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21802 real_ldexp (&TWO32r, &dconst1, 32);
21803 x = const_double_from_real_value (TWO32r, DFmode);
21804 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21806 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21808 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21809 0, OPTAB_DIRECT);
21810 if (x != target)
21811 emit_move_insn (target, x);
21814 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21815 For x86_32, -mfpmath=sse, !optimize_size only. */
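/* Sketch of the method: split the input into 16-bit halves, convert each
   half exactly to SFmode, then compute hi * 0x1.0p16 + lo.  E.g.
   (illustrative) the input 0x50003 yields 5.0 * 65536.0 + 3.0 = 327683.0.  */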
21816 void
21817 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21819 REAL_VALUE_TYPE ONE16r;
21820 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21822 real_ldexp (&ONE16r, &dconst1, 16);
21823 x = const_double_from_real_value (ONE16r, SFmode);
21824 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
21825 NULL, 0, OPTAB_DIRECT);
21826 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
21827 NULL, 0, OPTAB_DIRECT);
21828 fp_hi = gen_reg_rtx (SFmode);
21829 fp_lo = gen_reg_rtx (SFmode);
21830 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21831 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21832 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21833 0, OPTAB_DIRECT);
21834 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21835 0, OPTAB_DIRECT);
21836 if (!rtx_equal_p (target, fp_hi))
21837 emit_move_insn (target, fp_hi);
21840 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21841 a vector of unsigned ints VAL to vector of floats TARGET. */
21843 void
21844 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21846 rtx tmp[8];
21847 REAL_VALUE_TYPE TWO16r;
21848 machine_mode intmode = GET_MODE (val);
21849 machine_mode fltmode = GET_MODE (target);
21850 rtx (*cvt) (rtx, rtx);
21852 if (intmode == V4SImode)
21853 cvt = gen_floatv4siv4sf2;
21854 else
21855 cvt = gen_floatv8siv8sf2;
21856 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21857 tmp[0] = force_reg (intmode, tmp[0]);
21858 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21859 OPTAB_DIRECT);
21860 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21861 NULL_RTX, 1, OPTAB_DIRECT);
21862 tmp[3] = gen_reg_rtx (fltmode);
21863 emit_insn (cvt (tmp[3], tmp[1]));
21864 tmp[4] = gen_reg_rtx (fltmode);
21865 emit_insn (cvt (tmp[4], tmp[2]));
21866 real_ldexp (&TWO16r, &dconst1, 16);
21867 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21868 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21869 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21870 OPTAB_DIRECT);
21871 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21872 OPTAB_DIRECT);
21873 if (tmp[7] != target)
21874 emit_move_insn (target, tmp[7]);
21877 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21878 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21879 This is done by doing just a signed conversion if the value is < 0x1p31, and
21880 otherwise by subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
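/* Illustrative example: 3000000000.0 is >= 0x1p31, so it is adjusted to
   852516352.0; the signed fix_trunc then produces 0x32d05e00 and xoring
   in the 0x80000000 bit from *XORP restores 0xb2d05e00 == 3000000000.  */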
21883 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21885 REAL_VALUE_TYPE TWO31r;
21886 rtx two31r, tmp[4];
21887 machine_mode mode = GET_MODE (val);
21888 machine_mode scalarmode = GET_MODE_INNER (mode);
21889 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21890 rtx (*cmp) (rtx, rtx, rtx, rtx);
21891 int i;
21893 for (i = 0; i < 3; i++)
21894 tmp[i] = gen_reg_rtx (mode);
21895 real_ldexp (&TWO31r, &dconst1, 31);
21896 two31r = const_double_from_real_value (TWO31r, scalarmode);
21897 two31r = ix86_build_const_vector (mode, 1, two31r);
21898 two31r = force_reg (mode, two31r);
21899 switch (mode)
21901 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21902 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21903 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21904 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21905 default: gcc_unreachable ();
21907 tmp[3] = gen_rtx_LE (mode, two31r, val);
21908 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21909 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21910 0, OPTAB_DIRECT);
21911 if (intmode == V4SImode || TARGET_AVX2)
21912 *xorp = expand_simple_binop (intmode, ASHIFT,
21913 gen_lowpart (intmode, tmp[0]),
21914 GEN_INT (31), NULL_RTX, 0,
21915 OPTAB_DIRECT);
21916 else
21918 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21919 two31 = ix86_build_const_vector (intmode, 1, two31);
21920 *xorp = expand_simple_binop (intmode, AND,
21921 gen_lowpart (intmode, tmp[0]),
21922 two31, NULL_RTX, 0,
21923 OPTAB_DIRECT);
21925 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21926 0, OPTAB_DIRECT);
21929 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21930 then replicate the value for all elements of the vector
21931 register. */
21934 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21936 int i, n_elt;
21937 rtvec v;
21938 machine_mode scalar_mode;
21940 switch (mode)
21942 case E_V64QImode:
21943 case E_V32QImode:
21944 case E_V16QImode:
21945 case E_V32HImode:
21946 case E_V16HImode:
21947 case E_V8HImode:
21948 case E_V16SImode:
21949 case E_V8SImode:
21950 case E_V4SImode:
21951 case E_V8DImode:
21952 case E_V4DImode:
21953 case E_V2DImode:
21954 gcc_assert (vect);
21955 /* FALLTHRU */
21956 case E_V16SFmode:
21957 case E_V8SFmode:
21958 case E_V4SFmode:
21959 case E_V8DFmode:
21960 case E_V4DFmode:
21961 case E_V2DFmode:
21962 n_elt = GET_MODE_NUNITS (mode);
21963 v = rtvec_alloc (n_elt);
21964 scalar_mode = GET_MODE_INNER (mode);
21966 RTVEC_ELT (v, 0) = value;
21968 for (i = 1; i < n_elt; ++i)
21969 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21971 return gen_rtx_CONST_VECTOR (mode, v);
21973 default:
21974 gcc_unreachable ();
21978 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21979 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21980 for an SSE register. If VECT is true, then replicate the mask for
21981 all elements of the vector register. If INVERT is true, then create
21982 a mask excluding the sign bit. */
21985 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21987 machine_mode vec_mode, imode;
21988 wide_int w;
21989 rtx mask, v;
21991 switch (mode)
21993 case E_V16SImode:
21994 case E_V16SFmode:
21995 case E_V8SImode:
21996 case E_V4SImode:
21997 case E_V8SFmode:
21998 case E_V4SFmode:
21999 vec_mode = mode;
22000 imode = SImode;
22001 break;
22003 case E_V8DImode:
22004 case E_V4DImode:
22005 case E_V2DImode:
22006 case E_V8DFmode:
22007 case E_V4DFmode:
22008 case E_V2DFmode:
22009 vec_mode = mode;
22010 imode = DImode;
22011 break;
22013 case E_TImode:
22014 case E_TFmode:
22015 vec_mode = VOIDmode;
22016 imode = TImode;
22017 break;
22019 default:
22020 gcc_unreachable ();
22023 machine_mode inner_mode = GET_MODE_INNER (mode);
22024 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
22025 GET_MODE_BITSIZE (inner_mode));
22026 if (invert)
22027 w = wi::bit_not (w);
22029 /* Force this value into the low part of a fp vector constant. */
22030 mask = immed_wide_int_const (w, imode);
22031 mask = gen_lowpart (inner_mode, mask);
22033 if (vec_mode == VOIDmode)
22034 return force_reg (inner_mode, mask);
22036 v = ix86_build_const_vector (vec_mode, vect, mask);
22037 return force_reg (vec_mode, v);
22040 /* Generate code for floating point ABS or NEG. */
22042 void
22043 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22044 rtx operands[])
22046 rtx mask, set, dst, src;
22047 bool use_sse = false;
22048 bool vector_mode = VECTOR_MODE_P (mode);
22049 machine_mode vmode = mode;
22051 if (vector_mode)
22052 use_sse = true;
22053 else if (mode == TFmode)
22054 use_sse = true;
22055 else if (TARGET_SSE_MATH)
22057 use_sse = SSE_FLOAT_MODE_P (mode);
22058 if (mode == SFmode)
22059 vmode = V4SFmode;
22060 else if (mode == DFmode)
22061 vmode = V2DFmode;
22064 /* NEG and ABS performed with SSE use bitwise mask operations.
22065 Create the appropriate mask now. */
22066 if (use_sse)
22067 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22068 else
22069 mask = NULL_RTX;
22071 dst = operands[0];
22072 src = operands[1];
22074 set = gen_rtx_fmt_e (code, mode, src);
22075 set = gen_rtx_SET (dst, set);
22077 if (mask)
22079 rtx use, clob;
22080 rtvec par;
22082 use = gen_rtx_USE (VOIDmode, mask);
22083 if (vector_mode)
22084 par = gen_rtvec (2, set, use);
22085 else
22087 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22088 par = gen_rtvec (3, set, use, clob);
22090 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22092 else
22093 emit_insn (set);
22096 /* Expand a copysign operation. Special case operand 0 being a constant. */
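/* In terms of bit operations, copysign (a, b) is
   (a & ~signmask) | (b & signmask), where signmask has only the sign bit
   set.  The constant case below pre-computes fabs (a) at expand time, so
   only the final AND and IOR remain at split time.  */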
22098 void
22099 ix86_expand_copysign (rtx operands[])
22101 machine_mode mode, vmode;
22102 rtx dest, op0, op1, mask, nmask;
22104 dest = operands[0];
22105 op0 = operands[1];
22106 op1 = operands[2];
22108 mode = GET_MODE (dest);
22110 if (mode == SFmode)
22111 vmode = V4SFmode;
22112 else if (mode == DFmode)
22113 vmode = V2DFmode;
22114 else
22115 vmode = mode;
22117 if (CONST_DOUBLE_P (op0))
22119 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22121 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22122 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22124 if (mode == SFmode || mode == DFmode)
22126 if (op0 == CONST0_RTX (mode))
22127 op0 = CONST0_RTX (vmode);
22128 else
22130 rtx v = ix86_build_const_vector (vmode, false, op0);
22132 op0 = force_reg (vmode, v);
22135 else if (op0 != CONST0_RTX (mode))
22136 op0 = force_reg (mode, op0);
22138 mask = ix86_build_signbit_mask (vmode, 0, 0);
22140 if (mode == SFmode)
22141 copysign_insn = gen_copysignsf3_const;
22142 else if (mode == DFmode)
22143 copysign_insn = gen_copysigndf3_const;
22144 else
22145 copysign_insn = gen_copysigntf3_const;
22147 emit_insn (copysign_insn (dest, op0, op1, mask));
22149 else
22151 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22153 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22154 mask = ix86_build_signbit_mask (vmode, 0, 0);
22156 if (mode == SFmode)
22157 copysign_insn = gen_copysignsf3_var;
22158 else if (mode == DFmode)
22159 copysign_insn = gen_copysigndf3_var;
22160 else
22161 copysign_insn = gen_copysigntf3_var;
22163 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22167 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22168 be a constant, and so has already been expanded into a vector constant. */
22170 void
22171 ix86_split_copysign_const (rtx operands[])
22173 machine_mode mode, vmode;
22174 rtx dest, op0, mask, x;
22176 dest = operands[0];
22177 op0 = operands[1];
22178 mask = operands[3];
22180 mode = GET_MODE (dest);
22181 vmode = GET_MODE (mask);
22183 dest = lowpart_subreg (vmode, dest, mode);
22184 x = gen_rtx_AND (vmode, dest, mask);
22185 emit_insn (gen_rtx_SET (dest, x));
22187 if (op0 != CONST0_RTX (vmode))
22189 x = gen_rtx_IOR (vmode, dest, op0);
22190 emit_insn (gen_rtx_SET (dest, x));
22194 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22195 so we have to do two masks. */
22197 void
22198 ix86_split_copysign_var (rtx operands[])
22200 machine_mode mode, vmode;
22201 rtx dest, scratch, op0, op1, mask, nmask, x;
22203 dest = operands[0];
22204 scratch = operands[1];
22205 op0 = operands[2];
22206 op1 = operands[3];
22207 nmask = operands[4];
22208 mask = operands[5];
22210 mode = GET_MODE (dest);
22211 vmode = GET_MODE (mask);
22213 if (rtx_equal_p (op0, op1))
22215 /* Shouldn't happen often (it's useless, obviously), but when it does
22216 we'd generate incorrect code if we continue below. */
22217 emit_move_insn (dest, op0);
22218 return;
22221 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22223 gcc_assert (REGNO (op1) == REGNO (scratch));
22225 x = gen_rtx_AND (vmode, scratch, mask);
22226 emit_insn (gen_rtx_SET (scratch, x));
22228 dest = mask;
22229 op0 = lowpart_subreg (vmode, op0, mode);
22230 x = gen_rtx_NOT (vmode, dest);
22231 x = gen_rtx_AND (vmode, x, op0);
22232 emit_insn (gen_rtx_SET (dest, x));
22234 else
22236 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
22238 x = gen_rtx_AND (vmode, scratch, mask);
22240 else /* alternative 2,4 */
22242 gcc_assert (REGNO (mask) == REGNO (scratch));
22243 op1 = lowpart_subreg (vmode, op1, mode);
22244 x = gen_rtx_AND (vmode, scratch, op1);
22246 emit_insn (gen_rtx_SET (scratch, x));
22248 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
22250 dest = lowpart_subreg (vmode, op0, mode);
22251 x = gen_rtx_AND (vmode, dest, nmask);
22253 else /* alternative 3,4 */
22255 gcc_assert (REGNO (nmask) == REGNO (dest));
22256 dest = nmask;
22257 op0 = lowpart_subreg (vmode, op0, mode);
22258 x = gen_rtx_AND (vmode, dest, op0);
22260 emit_insn (gen_rtx_SET (dest, x));
22263 x = gen_rtx_IOR (vmode, dest, scratch);
22264 emit_insn (gen_rtx_SET (dest, x));
22267 /* Return TRUE or FALSE depending on whether the first SET in INSN
22268 has a source and destination with matching CC modes and whether the
22269 CC mode is at least as constrained as REQ_MODE.  */
22271 bool
22272 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22274 rtx set;
22275 machine_mode set_mode;
22277 set = PATTERN (insn);
22278 if (GET_CODE (set) == PARALLEL)
22279 set = XVECEXP (set, 0, 0);
22280 gcc_assert (GET_CODE (set) == SET);
22281 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22283 set_mode = GET_MODE (SET_DEST (set));
22284 switch (set_mode)
22286 case E_CCNOmode:
22287 if (req_mode != CCNOmode
22288 && (req_mode != CCmode
22289 || XEXP (SET_SRC (set), 1) != const0_rtx))
22290 return false;
22291 break;
22292 case E_CCmode:
22293 if (req_mode == CCGCmode)
22294 return false;
22295 /* FALLTHRU */
22296 case E_CCGCmode:
22297 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22298 return false;
22299 /* FALLTHRU */
22300 case E_CCGOCmode:
22301 if (req_mode == CCZmode)
22302 return false;
22303 /* FALLTHRU */
22304 case E_CCZmode:
22305 break;
22307 case E_CCGZmode:
22309 case E_CCAmode:
22310 case E_CCCmode:
22311 case E_CCOmode:
22312 case E_CCPmode:
22313 case E_CCSmode:
22314 if (set_mode != req_mode)
22315 return false;
22316 break;
22318 default:
22319 gcc_unreachable ();
22322 return GET_MODE (SET_SRC (set)) == set_mode;
22325 /* Generate insn patterns to do an integer compare of OPERANDS. */
22327 static rtx
22328 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22330 machine_mode cmpmode;
22331 rtx tmp, flags;
22333 cmpmode = SELECT_CC_MODE (code, op0, op1);
22334 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22336 /* This is very simple, but making the interface the same as in the
22337 FP case makes the rest of the code easier. */
22338 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22339 emit_insn (gen_rtx_SET (flags, tmp));
22341 /* Return the test that should be put into the flags user, i.e.
22342 the bcc, scc, or cmov instruction. */
22343 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22346 /* Figure out whether to use unordered fp comparisons. */
22348 static bool
22349 ix86_unordered_fp_compare (enum rtx_code code)
22351 if (!TARGET_IEEE_FP)
22352 return false;
22354 switch (code)
22356 case GT:
22357 case GE:
22358 case LT:
22359 case LE:
22360 return false;
22362 case EQ:
22363 case NE:
22365 case LTGT:
22366 case UNORDERED:
22367 case ORDERED:
22368 case UNLT:
22369 case UNLE:
22370 case UNGT:
22371 case UNGE:
22372 case UNEQ:
22373 return true;
22375 default:
22376 gcc_unreachable ();
22380 machine_mode
22381 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22383 machine_mode mode = GET_MODE (op0);
22385 if (SCALAR_FLOAT_MODE_P (mode))
22387 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22388 return CCFPmode;
22391 switch (code)
22393 /* Only zero flag is needed. */
22394 case EQ: /* ZF=0 */
22395 case NE: /* ZF!=0 */
22396 return CCZmode;
22397 /* Codes needing carry flag. */
22398 case GEU: /* CF=0 */
22399 case LTU: /* CF=1 */
22400 /* Detect overflow checks. They need just the carry flag. */
22401 if (GET_CODE (op0) == PLUS
22402 && (rtx_equal_p (op1, XEXP (op0, 0))
22403 || rtx_equal_p (op1, XEXP (op0, 1))))
22404 return CCCmode;
22405 else
22406 return CCmode;
22407 case GTU: /* CF=0 & ZF=0 */
22408 case LEU: /* CF=1 | ZF=1 */
22409 return CCmode;
22410 /* Codes possibly doable only with sign flag when
22411 comparing against zero. */
22412 case GE: /* SF=OF or SF=0 */
22413 case LT: /* SF<>OF or SF=1 */
22414 if (op1 == const0_rtx)
22415 return CCGOCmode;
22416 else
22417 /* For other cases Carry flag is not required. */
22418 return CCGCmode;
22419 /* Codes doable only with the sign flag when comparing
22420 against zero, but for which we lack a jump instruction,
22421 so we need to use relational tests against overflow,
22422 which thus needs to be zero. */
22423 case GT: /* ZF=0 & SF=OF */
22424 case LE: /* ZF=1 | SF<>OF */
22425 if (op1 == const0_rtx)
22426 return CCNOmode;
22427 else
22428 return CCGCmode;
22429 /* The strcmp pattern does (use flags) and combine may ask us for a
22430 proper mode. */
22431 case USE:
22432 return CCmode;
22433 default:
22434 gcc_unreachable ();
22438 /* Return the fixed registers used for condition codes. */
22440 static bool
22441 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22443 *p1 = FLAGS_REG;
22444 *p2 = FPSR_REG;
22445 return true;
22448 /* If two condition code modes are compatible, return a condition code
22449 mode which is compatible with both. Otherwise, return
22450 VOIDmode. */
22452 static machine_mode
22453 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22455 if (m1 == m2)
22456 return m1;
22458 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22459 return VOIDmode;
22461 if ((m1 == CCGCmode && m2 == CCGOCmode)
22462 || (m1 == CCGOCmode && m2 == CCGCmode))
22463 return CCGCmode;
22465 if ((m1 == CCNOmode && m2 == CCGOCmode)
22466 || (m1 == CCGOCmode && m2 == CCNOmode))
22467 return CCNOmode;
22469 if (m1 == CCZmode
22470 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
22471 return m2;
22472 else if (m2 == CCZmode
22473 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
22474 return m1;
22476 switch (m1)
22478 default:
22479 gcc_unreachable ();
22481 case E_CCmode:
22482 case E_CCGCmode:
22483 case E_CCGOCmode:
22484 case E_CCNOmode:
22485 case E_CCAmode:
22486 case E_CCCmode:
22487 case E_CCOmode:
22488 case E_CCPmode:
22489 case E_CCSmode:
22490 case E_CCZmode:
22491 switch (m2)
22493 default:
22494 return VOIDmode;
22496 case E_CCmode:
22497 case E_CCGCmode:
22498 case E_CCGOCmode:
22499 case E_CCNOmode:
22500 case E_CCAmode:
22501 case E_CCCmode:
22502 case E_CCOmode:
22503 case E_CCPmode:
22504 case E_CCSmode:
22505 case E_CCZmode:
22506 return CCmode;
22509 case E_CCFPmode:
22510 /* These are only compatible with themselves, which we already
22511 checked above. */
22512 return VOIDmode;
22517 /* Return a comparison we can do that is equivalent to
22518 swap_condition (code), apart possibly from orderedness.
22519 But never change orderedness if TARGET_IEEE_FP, returning
22520 UNKNOWN in that case if necessary. */
22522 static enum rtx_code
22523 ix86_fp_swap_condition (enum rtx_code code)
22525 switch (code)
22527 case GT: /* GTU - CF=0 & ZF=0 */
22528 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22529 case GE: /* GEU - CF=0 */
22530 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22531 case UNLT: /* LTU - CF=1 */
22532 return TARGET_IEEE_FP ? UNKNOWN : GT;
22533 case UNLE: /* LEU - CF=1 | ZF=1 */
22534 return TARGET_IEEE_FP ? UNKNOWN : GE;
22535 default:
22536 return swap_condition (code);
22540 /* Return the cost of comparison CODE using the best strategy for performance.
22541 All of the following functions use the number of instructions as a cost metric.
22542 In the future this should be tweaked to compute bytes for optimize_size and
22543 take into account the performance of various instructions on various CPUs. */
22545 static int
22546 ix86_fp_comparison_cost (enum rtx_code code)
22548 int arith_cost;
22550 /* The cost of code using bit-twiddling on %ah. */
22551 switch (code)
22553 case UNLE:
22554 case UNLT:
22555 case LTGT:
22556 case GT:
22557 case GE:
22558 case UNORDERED:
22559 case ORDERED:
22560 case UNEQ:
22561 arith_cost = 4;
22562 break;
22563 case LT:
22564 case NE:
22565 case EQ:
22566 case UNGE:
22567 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22568 break;
22569 case LE:
22570 case UNGT:
22571 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22572 break;
22573 default:
22574 gcc_unreachable ();
22577 switch (ix86_fp_comparison_strategy (code))
22579 case IX86_FPCMP_COMI:
22580 return arith_cost > 4 ? 3 : 2;
22581 case IX86_FPCMP_SAHF:
22582 return arith_cost > 4 ? 4 : 3;
22583 default:
22584 return arith_cost;
22588 /* Return the strategy to use for floating-point comparisons.  We assume that
22589 fcomi is always preferable where available, since that is also true when looking
22590 at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22592 enum ix86_fpcmp_strategy
22593 ix86_fp_comparison_strategy (enum rtx_code)
22595 /* Do fcomi/sahf based test when profitable. */
22597 if (TARGET_CMOVE)
22598 return IX86_FPCMP_COMI;
22600 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22601 return IX86_FPCMP_SAHF;
22603 return IX86_FPCMP_ARITH;
22606 /* Swap, force into registers, or otherwise massage the two operands
22607 to a fp comparison. The operands are updated in place; the new
22608 comparison code is returned. */
22610 static enum rtx_code
22611 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22613 bool unordered_compare = ix86_unordered_fp_compare (code);
22614 rtx op0 = *pop0, op1 = *pop1;
22615 machine_mode op_mode = GET_MODE (op0);
22616 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22618 /* All of the unordered compare instructions only work on registers.
22619 The same is true of the fcomi compare instructions. The XFmode
22620 compare instructions require registers except when comparing
22621 against zero or when converting operand 1 from fixed point to
22622 floating point. */
22624 if (!is_sse
22625 && (unordered_compare
22626 || (op_mode == XFmode
22627 && ! (standard_80387_constant_p (op0) == 1
22628 || standard_80387_constant_p (op1) == 1)
22629 && GET_CODE (op1) != FLOAT)
22630 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22632 op0 = force_reg (op_mode, op0);
22633 op1 = force_reg (op_mode, op1);
22635 else
22637 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22638 things around if they appear profitable, otherwise force op0
22639 into a register. */
22641 if (standard_80387_constant_p (op0) == 0
22642 || (MEM_P (op0)
22643 && ! (standard_80387_constant_p (op1) == 0
22644 || MEM_P (op1))))
22646 enum rtx_code new_code = ix86_fp_swap_condition (code);
22647 if (new_code != UNKNOWN)
22649 std::swap (op0, op1);
22650 code = new_code;
22654 if (!REG_P (op0))
22655 op0 = force_reg (op_mode, op0);
22657 if (CONSTANT_P (op1))
22659 int tmp = standard_80387_constant_p (op1);
22660 if (tmp == 0)
22661 op1 = validize_mem (force_const_mem (op_mode, op1));
22662 else if (tmp == 1)
22664 if (TARGET_CMOVE)
22665 op1 = force_reg (op_mode, op1);
22667 else
22668 op1 = force_reg (op_mode, op1);
22672 /* Try to rearrange the comparison to make it cheaper. */
22673 if (ix86_fp_comparison_cost (code)
22674 > ix86_fp_comparison_cost (swap_condition (code))
22675 && (REG_P (op1) || can_create_pseudo_p ()))
22677 std::swap (op0, op1);
22678 code = swap_condition (code);
22679 if (!REG_P (op0))
22680 op0 = force_reg (op_mode, op0);
22683 *pop0 = op0;
22684 *pop1 = op1;
22685 return code;
22688 /* Convert the comparison codes we use to represent FP comparisons to the
22689 integer code that will result in a proper branch.  Return UNKNOWN if no
22690 such code is available. */
22692 enum rtx_code
22693 ix86_fp_compare_code_to_integer (enum rtx_code code)
22695 switch (code)
22697 case GT:
22698 return GTU;
22699 case GE:
22700 return GEU;
22701 case ORDERED:
22702 case UNORDERED:
22703 return code;
22704 case UNEQ:
22705 return EQ;
22706 case UNLT:
22707 return LTU;
22708 case UNLE:
22709 return LEU;
22710 case LTGT:
22711 return NE;
22712 default:
22713 return UNKNOWN;
22717 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22719 static rtx
22720 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22722 bool unordered_compare = ix86_unordered_fp_compare (code);
22723 machine_mode intcmp_mode;
22724 rtx tmp, tmp2;
22726 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22728 /* Do fcomi/sahf based test when profitable. */
22729 switch (ix86_fp_comparison_strategy (code))
22731 case IX86_FPCMP_COMI:
22732 intcmp_mode = CCFPmode;
22733 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22734 if (unordered_compare)
22735 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22736 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22737 break;
22739 case IX86_FPCMP_SAHF:
22740 intcmp_mode = CCFPmode;
22741 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22742 if (unordered_compare)
22743 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22744 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22745 if (!scratch)
22746 scratch = gen_reg_rtx (HImode);
22747 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22748 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22749 break;
22751 case IX86_FPCMP_ARITH:
22752 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22753 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22754 if (unordered_compare)
22755 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22756 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22757 if (!scratch)
22758 scratch = gen_reg_rtx (HImode);
22759 emit_insn (gen_rtx_SET (scratch, tmp));
22761 /* In the unordered case, we have to check C2 for NaN's, which
22762 doesn't happen to work out to anything nice combination-wise.
22763 So do some bit twiddling on the value we've got in AH to come
22764 up with an appropriate set of condition codes. */
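   /* After fnstsw the FPU condition bits end up in AH: C0 is bit 0
      (0x01), C2 is bit 2 (0x04) and C3 is bit 6 (0x40), which is where
      the 0x45, 0x44, 0x40, 0x05, 0x04 and 0x01 masks below come from.  */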
22766 intcmp_mode = CCNOmode;
22767 switch (code)
22769 case GT:
22770 case UNGT:
22771 if (code == GT || !TARGET_IEEE_FP)
22773 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22774 code = EQ;
22776 else
22778 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22779 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22780 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22781 intcmp_mode = CCmode;
22782 code = GEU;
22784 break;
22785 case LT:
22786 case UNLT:
22787 if (code == LT && TARGET_IEEE_FP)
22789 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22790 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22791 intcmp_mode = CCmode;
22792 code = EQ;
22794 else
22796 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22797 code = NE;
22799 break;
22800 case GE:
22801 case UNGE:
22802 if (code == GE || !TARGET_IEEE_FP)
22804 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22805 code = EQ;
22807 else
22809 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22810 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22811 code = NE;
22813 break;
22814 case LE:
22815 case UNLE:
22816 if (code == LE && TARGET_IEEE_FP)
22818 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22819 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22820 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22821 intcmp_mode = CCmode;
22822 code = LTU;
22824 else
22826 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22827 code = NE;
22829 break;
22830 case EQ:
22831 case UNEQ:
22832 if (code == EQ && TARGET_IEEE_FP)
22834 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22835 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22836 intcmp_mode = CCmode;
22837 code = EQ;
22839 else
22841 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22842 code = NE;
22844 break;
22845 case NE:
22846 case LTGT:
22847 if (code == NE && TARGET_IEEE_FP)
22849 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22850 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22851 GEN_INT (0x40)));
22852 code = NE;
22854 else
22856 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22857 code = EQ;
22859 break;
22861 case UNORDERED:
22862 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22863 code = NE;
22864 break;
22865 case ORDERED:
22866 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22867 code = EQ;
22868 break;
22870 default:
22871 gcc_unreachable ();
22873 break;
22875 default:
22876 gcc_unreachable();
22879 /* Return the test that should be put into the flags user, i.e.
22880 the bcc, scc, or cmov instruction. */
22881 return gen_rtx_fmt_ee (code, VOIDmode,
22882 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22883 const0_rtx);
22886 static rtx
22887 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22889 rtx ret;
22891 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22892 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22894 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22896 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22897 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22899 else
22900 ret = ix86_expand_int_compare (code, op0, op1);
22902 return ret;
22905 void
22906 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22908 machine_mode mode = GET_MODE (op0);
22909 rtx tmp;
22911 /* Handle the special case of a vector comparison with a boolean result;
22912 transform it using the ptest instruction. */
22913 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22915 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22916 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22918 gcc_assert (code == EQ || code == NE);
22919 /* Generate an XOR since we can't check that one operand is a zero vector. */
22920 tmp = gen_reg_rtx (mode);
22921 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22922 tmp = gen_lowpart (p_mode, tmp);
22923 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22924 gen_rtx_UNSPEC (CCmode,
22925 gen_rtvec (2, tmp, tmp),
22926 UNSPEC_PTEST)));
22927 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22928 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22929 gen_rtx_LABEL_REF (VOIDmode, label),
22930 pc_rtx);
22931 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22932 return;
22935 switch (mode)
22937 case E_SFmode:
22938 case E_DFmode:
22939 case E_XFmode:
22940 case E_QImode:
22941 case E_HImode:
22942 case E_SImode:
22943 simple:
22944 tmp = ix86_expand_compare (code, op0, op1);
22945 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22946 gen_rtx_LABEL_REF (VOIDmode, label),
22947 pc_rtx);
22948 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22949 return;
22951 case E_DImode:
22952 if (TARGET_64BIT)
22953 goto simple;
22954 /* For a 32-bit target, a DImode comparison may be performed in
22955 SSE registers.  To allow this we should avoid the split
22956 to SImode, which is achieved by doing the xor in DImode
22957 and then comparing against zero (which is recognized by
22958 the STV pass).  We don't compare using xor when optimizing
22959 for size. */
22960 if (!optimize_insn_for_size_p ()
22961 && TARGET_STV
22962 && (code == EQ || code == NE))
22964 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22965 op1 = const0_rtx;
22967 /* FALLTHRU */
22968 case E_TImode:
22969 /* Expand DImode branch into multiple compare+branch. */
22971 rtx lo[2], hi[2];
22972 rtx_code_label *label2;
22973 enum rtx_code code1, code2, code3;
22974 machine_mode submode;
22976 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22978 std::swap (op0, op1);
22979 code = swap_condition (code);
22982 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22983 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22985 submode = mode == DImode ? SImode : DImode;
22987 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22988 avoid two branches. This costs one extra insn, so disable when
22989 optimizing for size. */
22991 if ((code == EQ || code == NE)
22992 && (!optimize_insn_for_size_p ()
22993 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22995 rtx xor0, xor1;
22997 xor1 = hi[0];
22998 if (hi[1] != const0_rtx)
22999 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
23000 NULL_RTX, 0, OPTAB_WIDEN);
23002 xor0 = lo[0];
23003 if (lo[1] != const0_rtx)
23004 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
23005 NULL_RTX, 0, OPTAB_WIDEN);
23007 tmp = expand_binop (submode, ior_optab, xor1, xor0,
23008 NULL_RTX, 0, OPTAB_WIDEN);
23010 ix86_expand_branch (code, tmp, const0_rtx, label);
23011 return;
23014 /* Otherwise, if we are doing a less-than or greater-than-or-equal
23015 comparison, op1 is a constant and its low word is zero, then we can
23016 just examine the high word.  Similarly for a low word of -1 with a
23017 less-than-or-equal or greater-than comparison. */
23019 if (CONST_INT_P (hi[1]))
23020 switch (code)
23022 case LT: case LTU: case GE: case GEU:
23023 if (lo[1] == const0_rtx)
23025 ix86_expand_branch (code, hi[0], hi[1], label);
23026 return;
23028 break;
23029 case LE: case LEU: case GT: case GTU:
23030 if (lo[1] == constm1_rtx)
23032 ix86_expand_branch (code, hi[0], hi[1], label);
23033 return;
23035 break;
23036 default:
23037 break;
23040 /* Emulate comparisons that do not depend on the Zero flag with a
23041 double-word subtraction.  Note that only the Overflow, Sign
23042 and Carry flags are valid, so swap the arguments and condition
23043 of comparisons that would otherwise test the Zero flag. */
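   /* E.g. (illustrative) an unsigned a < b on a 32-bit target becomes
      "cmp lo(a),lo(b); sbb scratch,hi(a),hi(b)" with the subtraction
      result discarded; the branch below then only needs to test the
      resulting carry flag.  */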
23045 switch (code)
23047 case LE: case LEU: case GT: case GTU:
23048 std::swap (lo[0], lo[1]);
23049 std::swap (hi[0], hi[1]);
23050 code = swap_condition (code);
23051 /* FALLTHRU */
23053 case LT: case LTU: case GE: case GEU:
23055 rtx (*cmp_insn) (rtx, rtx);
23056 rtx (*sbb_insn) (rtx, rtx, rtx);
23057 bool uns = (code == LTU || code == GEU);
23059 if (TARGET_64BIT)
23061 cmp_insn = gen_cmpdi_1;
23062 sbb_insn
23063 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
23065 else
23067 cmp_insn = gen_cmpsi_1;
23068 sbb_insn
23069 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
23072 if (!nonimmediate_operand (lo[0], submode))
23073 lo[0] = force_reg (submode, lo[0]);
23074 if (!x86_64_general_operand (lo[1], submode))
23075 lo[1] = force_reg (submode, lo[1]);
23077 if (!register_operand (hi[0], submode))
23078 hi[0] = force_reg (submode, hi[0]);
23079 if ((uns && !nonimmediate_operand (hi[1], submode))
23080 || (!uns && !x86_64_general_operand (hi[1], submode)))
23081 hi[1] = force_reg (submode, hi[1]);
23083 emit_insn (cmp_insn (lo[0], lo[1]));
23084 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
23086 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
23088 ix86_expand_branch (code, tmp, const0_rtx, label);
23089 return;
23092 default:
23093 break;
23096 /* Otherwise, we need two or three jumps. */
23098 label2 = gen_label_rtx ();
23100 code1 = code;
23101 code2 = swap_condition (code);
23102 code3 = unsigned_condition (code);
23104 switch (code)
23106 case LT: case GT: case LTU: case GTU:
23107 break;
23109 case LE: code1 = LT; code2 = GT; break;
23110 case GE: code1 = GT; code2 = LT; break;
23111 case LEU: code1 = LTU; code2 = GTU; break;
23112 case GEU: code1 = GTU; code2 = LTU; break;
23114 case EQ: code1 = UNKNOWN; code2 = NE; break;
23115 case NE: code2 = UNKNOWN; break;
23117 default:
23118 gcc_unreachable ();
23122 * a < b =>
23123 * if (hi(a) < hi(b)) goto true;
23124 * if (hi(a) > hi(b)) goto false;
23125 * if (lo(a) < lo(b)) goto true;
23126 * false:
23129 if (code1 != UNKNOWN)
23130 ix86_expand_branch (code1, hi[0], hi[1], label);
23131 if (code2 != UNKNOWN)
23132 ix86_expand_branch (code2, hi[0], hi[1], label2);
23134 ix86_expand_branch (code3, lo[0], lo[1], label);
23136 if (code2 != UNKNOWN)
23137 emit_label (label2);
23138 return;
23141 default:
23142 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23143 goto simple;
23147 void
23148 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23150 rtx ret;
23152 gcc_assert (GET_MODE (dest) == QImode);
23154 ret = ix86_expand_compare (code, op0, op1);
23155 PUT_MODE (ret, QImode);
23156 emit_insn (gen_rtx_SET (dest, ret));
23159 /* Expand a comparison setting or clearing the carry flag.  Return true
23160 when successful and set *POP to the comparison operation. */
23161 static bool
23162 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23164 machine_mode mode =
23165 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23167 /* Do not handle double-mode compares that go through special path. */
23168 if (mode == (TARGET_64BIT ? TImode : DImode))
23169 return false;
23171 if (SCALAR_FLOAT_MODE_P (mode))
23173 rtx compare_op;
23174 rtx_insn *compare_seq;
23176 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23178 /* Shortcut: the following common codes never translate
23179 into carry flag compares. */
23180 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23181 || code == ORDERED || code == UNORDERED)
23182 return false;
23184 /* These comparisons require the zero flag; swap the operands so they won't. */
23185 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23186 && !TARGET_IEEE_FP)
23188 std::swap (op0, op1);
23189 code = swap_condition (code);
23192 /* Try to expand the comparison and verify that we end up with
23193 a carry-flag-based comparison.  This fails only when we decide
23194 to expand the comparison using arithmetic, which is not a very
23195 common scenario. */
23196 start_sequence ();
23197 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23198 compare_seq = get_insns ();
23199 end_sequence ();
23201 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
23202 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23203 else
23204 code = GET_CODE (compare_op);
23206 if (code != LTU && code != GEU)
23207 return false;
23209 emit_insn (compare_seq);
23210 *pop = compare_op;
23211 return true;
23214 if (!INTEGRAL_MODE_P (mode))
23215 return false;
23217 switch (code)
23219 case LTU:
23220 case GEU:
23221 break;
23223 /* Convert a==0 into (unsigned)a<1. */
23224 case EQ:
23225 case NE:
23226 if (op1 != const0_rtx)
23227 return false;
23228 op1 = const1_rtx;
23229 code = (code == EQ ? LTU : GEU);
23230 break;
23232 /* Convert a>b into b<a or a>=b+1. */
23233 case GTU:
23234 case LEU:
23235 if (CONST_INT_P (op1))
23237 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23238 /* Bail out on overflow.  We could still swap the operands, but that
23239 would force loading of the constant into a register. */
23240 if (op1 == const0_rtx
23241 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23242 return false;
23243 code = (code == GTU ? GEU : LTU);
23245 else
23247 std::swap (op0, op1);
23248 code = (code == GTU ? LTU : GEU);
23250 break;
23252 /* Convert a>=0 into (unsigned)a<0x80000000. */
23253 case LT:
23254 case GE:
23255 if (mode == DImode || op1 != const0_rtx)
23256 return false;
23257 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23258 code = (code == LT ? GEU : LTU);
23259 break;
23260 case LE:
23261 case GT:
23262 if (mode == DImode || op1 != constm1_rtx)
23263 return false;
23264 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23265 code = (code == LE ? GEU : LTU);
23266 break;
23268 default:
23269 return false;
23271 /* Swapping operands may cause constant to appear as first operand. */
23272 if (!nonimmediate_operand (op0, VOIDmode))
23274 if (!can_create_pseudo_p ())
23275 return false;
23276 op0 = force_reg (mode, op0);
23278 *pop = ix86_expand_compare (code, op0, op1);
23279 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23280 return true;
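/* Illustrative examples (added, not in the original source) of the
   conversions performed above, each aiming at a carry-flag test
   (LTU or GEU) that the callers can turn into sbb/adc sequences:

       a == 0               ->  (unsigned) a < 1             (EQ  -> LTU)
       a >  b, b constant   ->  (unsigned) a >= b + 1        (GTU -> GEU)
       a >= 0               ->  (unsigned) a < 0x80000000    (GE  -> LTU)  */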
23283 bool
23284 ix86_expand_int_movcc (rtx operands[])
23286 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23287 rtx_insn *compare_seq;
23288 rtx compare_op;
23289 machine_mode mode = GET_MODE (operands[0]);
23290 bool sign_bit_compare_p = false;
23291 rtx op0 = XEXP (operands[1], 0);
23292 rtx op1 = XEXP (operands[1], 1);
23294 if (GET_MODE (op0) == TImode
23295 || (GET_MODE (op0) == DImode
23296 && !TARGET_64BIT))
23297 return false;
23299 start_sequence ();
23300 compare_op = ix86_expand_compare (code, op0, op1);
23301 compare_seq = get_insns ();
23302 end_sequence ();
23304 compare_code = GET_CODE (compare_op);
23306 if ((op1 == const0_rtx && (code == GE || code == LT))
23307 || (op1 == constm1_rtx && (code == GT || code == LE)))
23308 sign_bit_compare_p = true;
23310 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23311 HImode insns, we'd be swallowed in word prefix ops. */
23313 if ((mode != HImode || TARGET_FAST_PREFIX)
23314 && (mode != (TARGET_64BIT ? TImode : DImode))
23315 && CONST_INT_P (operands[2])
23316 && CONST_INT_P (operands[3]))
23318 rtx out = operands[0];
23319 HOST_WIDE_INT ct = INTVAL (operands[2]);
23320 HOST_WIDE_INT cf = INTVAL (operands[3]);
23321 HOST_WIDE_INT diff;
23323 diff = ct - cf;
23324 /* Sign-bit compares are better done using shifts than by using
23325 sbb. */
23326 if (sign_bit_compare_p
23327 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23329 /* Detect overlap between destination and compare sources. */
23330 rtx tmp = out;
23332 if (!sign_bit_compare_p)
23334 rtx flags;
23335 bool fpcmp = false;
23337 compare_code = GET_CODE (compare_op);
23339 flags = XEXP (compare_op, 0);
23341 if (GET_MODE (flags) == CCFPmode)
23343 fpcmp = true;
23344 compare_code
23345 = ix86_fp_compare_code_to_integer (compare_code);
23348 /* To simplify the rest of the code, restrict to the GEU case. */
23349 if (compare_code == LTU)
23351 std::swap (ct, cf);
23352 compare_code = reverse_condition (compare_code);
23353 code = reverse_condition (code);
23355 else
23357 if (fpcmp)
23358 PUT_CODE (compare_op,
23359 reverse_condition_maybe_unordered
23360 (GET_CODE (compare_op)));
23361 else
23362 PUT_CODE (compare_op,
23363 reverse_condition (GET_CODE (compare_op)));
23365 diff = ct - cf;
23367 if (reg_overlap_mentioned_p (out, op0)
23368 || reg_overlap_mentioned_p (out, op1))
23369 tmp = gen_reg_rtx (mode);
23371 if (mode == DImode)
23372 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23373 else
23374 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23375 flags, compare_op));
23377 else
23379 if (code == GT || code == GE)
23380 code = reverse_condition (code);
23381 else
23383 std::swap (ct, cf);
23384 diff = ct - cf;
23386 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23389 if (diff == 1)
23392 * cmpl op0,op1
23393 * sbbl dest,dest
23394 * [addl dest, ct]
23396 * Size 5 - 8.
23398 if (ct)
23399 tmp = expand_simple_binop (mode, PLUS,
23400 tmp, GEN_INT (ct),
23401 copy_rtx (tmp), 1, OPTAB_DIRECT);
23403 else if (cf == -1)
23406 * cmpl op0,op1
23407 * sbbl dest,dest
23408 * orl $ct, dest
23410 * Size 8.
23412 tmp = expand_simple_binop (mode, IOR,
23413 tmp, GEN_INT (ct),
23414 copy_rtx (tmp), 1, OPTAB_DIRECT);
23416 else if (diff == -1 && ct)
23419 * cmpl op0,op1
23420 * sbbl dest,dest
23421 * notl dest
23422 * [addl dest, cf]
23424 * Size 8 - 11.
23426 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23427 if (cf)
23428 tmp = expand_simple_binop (mode, PLUS,
23429 copy_rtx (tmp), GEN_INT (cf),
23430 copy_rtx (tmp), 1, OPTAB_DIRECT);
23432 else
23435 * cmpl op0,op1
23436 * sbbl dest,dest
23437 * [notl dest]
23438 * andl cf - ct, dest
23439 * [addl dest, ct]
23441 * Size 8 - 11.
23444 if (cf == 0)
23446 cf = ct;
23447 ct = 0;
23448 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23451 tmp = expand_simple_binop (mode, AND,
23452 copy_rtx (tmp),
23453 gen_int_mode (cf - ct, mode),
23454 copy_rtx (tmp), 1, OPTAB_DIRECT);
23455 if (ct)
23456 tmp = expand_simple_binop (mode, PLUS,
23457 copy_rtx (tmp), GEN_INT (ct),
23458 copy_rtx (tmp), 1, OPTAB_DIRECT);
23461 if (!rtx_equal_p (tmp, out))
23462 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23464 return true;
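/* Worked example for the sbb-based path above (added, not in the
   original source).  For unsigned operands,

       x = (a < b) ? 3 : 9;

   starts with ct = 3, cf = 9; restricting to the GEU case swaps them
   (ct = 9, cf = 3, diff = 6) and the code emits, schematically:

       cmpl  b, a          set CF when a < b
       sbbl  %eax, %eax    %eax = a < b ? -1 : 0
       andl  $-6, %eax     a < b ? -6 : 0      (cf - ct)
       addl  $9, %eax      a < b ?  3 : 9      (+ ct)

   Register names are illustrative only.  */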
23467 if (diff < 0)
23469 machine_mode cmp_mode = GET_MODE (op0);
23470 enum rtx_code new_code;
23472 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23474 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23476 /* We may be reversing an unordered compare to a normal compare, which
23477 is not valid in general (we may convert a non-trapping condition
23478 to a trapping one); however, on i386 we currently emit all
23479 comparisons unordered. */
23480 new_code = reverse_condition_maybe_unordered (code);
23482 else
23483 new_code = ix86_reverse_condition (code, cmp_mode);
23484 if (new_code != UNKNOWN)
23486 std::swap (ct, cf);
23487 diff = -diff;
23488 code = new_code;
23492 compare_code = UNKNOWN;
23493 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23494 && CONST_INT_P (op1))
23496 if (op1 == const0_rtx
23497 && (code == LT || code == GE))
23498 compare_code = code;
23499 else if (op1 == constm1_rtx)
23501 if (code == LE)
23502 compare_code = LT;
23503 else if (code == GT)
23504 compare_code = GE;
23508 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23509 if (compare_code != UNKNOWN
23510 && GET_MODE (op0) == GET_MODE (out)
23511 && (cf == -1 || ct == -1))
23513 /* If lea code below could be used, only optimize
23514 if it results in a 2 insn sequence. */
23516 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23517 || diff == 3 || diff == 5 || diff == 9)
23518 || (compare_code == LT && ct == -1)
23519 || (compare_code == GE && cf == -1))
23522 * notl op1 (if necessary)
23523 * sarl $31, op1
23524 * orl cf, op1
23526 if (ct != -1)
23528 cf = ct;
23529 ct = -1;
23530 code = reverse_condition (code);
23533 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23535 out = expand_simple_binop (mode, IOR,
23536 out, GEN_INT (cf),
23537 out, 1, OPTAB_DIRECT);
23538 if (out != operands[0])
23539 emit_move_insn (operands[0], out);
23541 return true;
23546 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23547 || diff == 3 || diff == 5 || diff == 9)
23548 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23549 && (mode != DImode
23550 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23553 * xorl dest,dest
23554 * cmpl op1,op2
23555 * setcc dest
23556 * lea cf(dest*(ct-cf)),dest
23558 * Size 14.
23560 * This also catches the degenerate setcc-only case.
23563 rtx tmp;
23564 int nops;
23566 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23568 nops = 0;
23569 /* On x86_64 the lea instruction operates on Pmode, so we need
23570 to get the arithmetic done in the proper mode to match. */
23571 if (diff == 1)
23572 tmp = copy_rtx (out);
23573 else
23575 rtx out1;
23576 out1 = copy_rtx (out);
23577 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23578 nops++;
23579 if (diff & 1)
23581 tmp = gen_rtx_PLUS (mode, tmp, out1);
23582 nops++;
23585 if (cf != 0)
23587 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23588 nops++;
23590 if (!rtx_equal_p (tmp, out))
23592 if (nops == 1)
23593 out = force_operand (tmp, copy_rtx (out));
23594 else
23595 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23597 if (!rtx_equal_p (out, operands[0]))
23598 emit_move_insn (operands[0], copy_rtx (out));
23600 return true;
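/* Worked example for the setcc/lea path above (added, not in the
   original source).  For unsigned operands,

       x = (a < b) ? 5 : 3;

   has diff = ct - cf = 2, which is representable by an lea scale, so
   roughly the following is produced:

       xorl  %eax, %eax
       cmpl  b, a
       setb  %al                   %eax = a < b ? 1 : 0
       leal  3(,%eax,2), %eax      %eax = %eax * diff + cf

   Register names are illustrative only.  */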
23604 * General case:                  Jumpful:
23605 *   xorl dest,dest                cmpl op1, op2
23606 *   cmpl op1, op2                 movl ct, dest
23607 *   setcc dest                    jcc 1f
23608 *   decl dest                     movl cf, dest
23609 *   andl (cf-ct),dest             1:
23610 *   addl ct,dest
23612 * Size 20.                        Size 14.
23614 * This is reasonably steep, but branch mispredict costs are
23615 * high on modern cpus, so consider failing only if optimizing
23616 * for space.
23619 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23620 && BRANCH_COST (optimize_insn_for_speed_p (),
23621 false) >= 2)
23623 if (cf == 0)
23625 machine_mode cmp_mode = GET_MODE (op0);
23626 enum rtx_code new_code;
23628 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23630 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23632 /* We may be reversing an unordered compare to a normal compare,
23633 which is not valid in general (we may convert a non-trapping
23634 condition to a trapping one); however, on i386 we currently
23635 emit all comparisons unordered. */
23636 new_code = reverse_condition_maybe_unordered (code);
23638 else
23640 new_code = ix86_reverse_condition (code, cmp_mode);
23641 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23642 compare_code = reverse_condition (compare_code);
23645 if (new_code != UNKNOWN)
23647 cf = ct;
23648 ct = 0;
23649 code = new_code;
23653 if (compare_code != UNKNOWN)
23655 /* notl op1 (if needed)
23656 sarl $31, op1
23657 andl (cf-ct), op1
23658 addl ct, op1
23660 For x < 0 (resp. x <= -1) there will be no notl,
23661 so if possible swap the constants to get rid of the
23662 complement.
23663 True/false will be -1/0 while code below (store flag
23664 followed by decrement) is 0/-1, so the constants need
23665 to be exchanged once more. */
23667 if (compare_code == GE || !cf)
23669 code = reverse_condition (code);
23670 compare_code = LT;
23672 else
23673 std::swap (ct, cf);
23675 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23677 else
23679 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23681 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23682 constm1_rtx,
23683 copy_rtx (out), 1, OPTAB_DIRECT);
23686 out = expand_simple_binop (mode, AND, copy_rtx (out),
23687 gen_int_mode (cf - ct, mode),
23688 copy_rtx (out), 1, OPTAB_DIRECT);
23689 if (ct)
23690 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23691 copy_rtx (out), 1, OPTAB_DIRECT);
23692 if (!rtx_equal_p (out, operands[0]))
23693 emit_move_insn (operands[0], copy_rtx (out));
23695 return true;
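/* Worked example for the general branchless case above (added, not in
   the original source).  When cmov cannot be used,

       x = (a == b) ? 7 : 1;

   has diff = 6, which no lea form covers, so the setcc/decl/andl/addl
   sequence from the comment above is produced:

       xorl  %eax, %eax
       cmpl  b, a
       sete  %al           1 when equal, else 0
       decl  %eax          0 when equal, else -1
       andl  $-6, %eax     0 or cf - ct = -6
       addl  $7, %eax      7 when a == b, else 1

   Register names are illustrative only.  */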
23699 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23701 /* Try a few things more with specific constants and a variable. */
23703 optab op;
23704 rtx var, orig_out, out, tmp;
23706 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23707 return false;
23709 /* If one of the two operands is an interesting constant, load a
23710 constant with the above and mask it in with a logical operation. */
23712 if (CONST_INT_P (operands[2]))
23714 var = operands[3];
23715 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23716 operands[3] = constm1_rtx, op = and_optab;
23717 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23718 operands[3] = const0_rtx, op = ior_optab;
23719 else
23720 return false;
23722 else if (CONST_INT_P (operands[3]))
23724 var = operands[2];
23725 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23726 operands[2] = constm1_rtx, op = and_optab;
23727 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23728 operands[2] = const0_rtx, op = ior_optab;
23729 else
23730 return false;
23732 else
23733 return false;
23735 orig_out = operands[0];
23736 tmp = gen_reg_rtx (mode);
23737 operands[0] = tmp;
23739 /* Recurse to get the constant loaded. */
23740 if (!ix86_expand_int_movcc (operands))
23741 return false;
23743 /* Mask in the interesting variable. */
23744 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23745 OPTAB_WIDEN);
23746 if (!rtx_equal_p (out, orig_out))
23747 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23749 return true;
23753 * For comparison with above,
23755 * movl cf,dest
23756 * movl ct,tmp
23757 * cmpl op1,op2
23758 * cmovcc tmp,dest
23760 * Size 15.
23763 if (! nonimmediate_operand (operands[2], mode))
23764 operands[2] = force_reg (mode, operands[2]);
23765 if (! nonimmediate_operand (operands[3], mode))
23766 operands[3] = force_reg (mode, operands[3]);
23768 if (! register_operand (operands[2], VOIDmode)
23769 && (mode == QImode
23770 || ! register_operand (operands[3], VOIDmode)))
23771 operands[2] = force_reg (mode, operands[2]);
23773 if (mode == QImode
23774 && ! register_operand (operands[3], VOIDmode))
23775 operands[3] = force_reg (mode, operands[3]);
23777 emit_insn (compare_seq);
23778 emit_insn (gen_rtx_SET (operands[0],
23779 gen_rtx_IF_THEN_ELSE (mode,
23780 compare_op, operands[2],
23781 operands[3])));
23782 return true;
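/* Illustration of the cmov path above (added, not in the original
   source).  The same "x = (a == b) ? 7 : 1" on a cmov-capable target
   follows the "Size 15" comment sequence, roughly:

       movl  $1, %eax      cf
       movl  $7, %edx      ct
       cmpl  b, a
       cmove %edx, %eax

   Register names are illustrative only.  */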
23785 /* Swap, force into registers, or otherwise massage the two operands
23786 to an sse comparison with a mask result. Thus we differ a bit from
23787 ix86_prepare_fp_compare_args which expects to produce a flags result.
23789 The DEST operand exists to help determine whether to commute commutative
23790 operators. The POP0/POP1 operands are updated in place. The new
23791 comparison code is returned, or UNKNOWN if not implementable. */
23793 static enum rtx_code
23794 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23795 rtx *pop0, rtx *pop1)
23797 switch (code)
23799 case LTGT:
23800 case UNEQ:
23801 /* AVX supports all the needed comparisons. */
23802 if (TARGET_AVX)
23803 break;
23804 /* We have no LTGT as an operator. We could implement it with
23805 NE & ORDERED, but this requires an extra temporary. It's
23806 not clear that it's worth it. */
23807 return UNKNOWN;
23809 case LT:
23810 case LE:
23811 case UNGT:
23812 case UNGE:
23813 /* These are supported directly. */
23814 break;
23816 case EQ:
23817 case NE:
23818 case UNORDERED:
23819 case ORDERED:
23820 /* AVX has 3 operand comparisons, no need to swap anything. */
23821 if (TARGET_AVX)
23822 break;
23823 /* For commutative operators, try to canonicalize the destination
23824 operand to be first in the comparison - this helps reload to
23825 avoid extra moves. */
23826 if (!dest || !rtx_equal_p (dest, *pop1))
23827 break;
23828 /* FALLTHRU */
23830 case GE:
23831 case GT:
23832 case UNLE:
23833 case UNLT:
23834 /* These are not supported directly before AVX, and furthermore
23835 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23836 comparison operands to transform into something that is
23837 supported. */
23838 std::swap (*pop0, *pop1);
23839 code = swap_condition (code);
23840 break;
23842 default:
23843 gcc_unreachable ();
23846 return code;
23849 /* Detect conditional moves that exactly match min/max operational
23850 semantics. Note that this is IEEE safe, as long as we don't
23851 interchange the operands.
23853 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23854 and TRUE if the operation is successful and instructions are emitted. */
23856 static bool
23857 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23858 rtx cmp_op1, rtx if_true, rtx if_false)
23860 machine_mode mode;
23861 bool is_min;
23862 rtx tmp;
23864 if (code == LT)
23866 else if (code == UNGE)
23867 std::swap (if_true, if_false);
23868 else
23869 return false;
23871 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23872 is_min = true;
23873 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23874 is_min = false;
23875 else
23876 return false;
23878 mode = GET_MODE (dest);
23880 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23881 but MODE may be a vector mode and thus not appropriate. */
23882 if (!flag_finite_math_only || flag_signed_zeros)
23884 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23885 rtvec v;
23887 if_true = force_reg (mode, if_true);
23888 v = gen_rtvec (2, if_true, if_false);
23889 tmp = gen_rtx_UNSPEC (mode, v, u);
23891 else
23893 code = is_min ? SMIN : SMAX;
23894 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23897 emit_insn (gen_rtx_SET (dest, tmp));
23898 return true;
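/* Illustration (added, not in the original source): with
   flag_finite_math_only set and flag_signed_zeros clear, a scalar
   select such as

       d = (a < b) ? a : b;

   matches the is_min case and is emitted as a plain SMIN, i.e. a
   single minss/minsd; otherwise the UNSPEC_IEEE_MIN/MAX form is used
   so the operand order, and with it the IEEE-safe behaviour noted
   above, is preserved.  */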
23901 /* Expand an sse vector comparison. Return the register with the result. */
23903 static rtx
23904 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23905 rtx op_true, rtx op_false)
23907 machine_mode mode = GET_MODE (dest);
23908 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23910 /* In the general case the result of the comparison can differ from the operands' type. */
23911 machine_mode cmp_mode;
23913 /* In AVX512F the result of comparison is an integer mask. */
23914 bool maskcmp = false;
23915 rtx x;
23917 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23919 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23920 cmp_mode = int_mode_for_size (nbits, 0).require ();
23921 maskcmp = true;
23923 else
23924 cmp_mode = cmp_ops_mode;
23927 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23928 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23929 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23931 if (optimize
23932 || (maskcmp && cmp_mode != mode)
23933 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23934 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23935 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23937 /* Compare patterns for int modes are unspec in AVX512F only. */
23938 if (maskcmp && (code == GT || code == EQ))
23940 rtx (*gen)(rtx, rtx, rtx);
23942 switch (cmp_ops_mode)
23944 case E_V64QImode:
23945 gcc_assert (TARGET_AVX512BW);
23946 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23947 break;
23948 case E_V32HImode:
23949 gcc_assert (TARGET_AVX512BW);
23950 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23951 break;
23952 case E_V16SImode:
23953 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23954 break;
23955 case E_V8DImode:
23956 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23957 break;
23958 default:
23959 gen = NULL;
23962 if (gen)
23964 emit_insn (gen (dest, cmp_op0, cmp_op1));
23965 return dest;
23968 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23970 if (cmp_mode != mode && !maskcmp)
23972 x = force_reg (cmp_ops_mode, x);
23973 convert_move (dest, x, false);
23975 else
23976 emit_insn (gen_rtx_SET (dest, x));
23978 return dest;
23981 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23982 operations. This is used for both scalar and vector conditional moves. */
23984 void
23985 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23987 machine_mode mode = GET_MODE (dest);
23988 machine_mode cmpmode = GET_MODE (cmp);
23990 /* In AVX512F the result of comparison is an integer mask. */
23991 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23993 rtx t2, t3, x;
23995 /* If we have an integer mask and an FP value then we need
23996 to cast the mask to the FP mode. */
23997 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23999 cmp = force_reg (cmpmode, cmp);
24000 cmp = gen_rtx_SUBREG (mode, cmp, 0);
24003 if (vector_all_ones_operand (op_true, mode)
24004 && rtx_equal_p (op_false, CONST0_RTX (mode))
24005 && !maskcmp)
24007 emit_insn (gen_rtx_SET (dest, cmp));
24009 else if (op_false == CONST0_RTX (mode)
24010 && !maskcmp)
24012 op_true = force_reg (mode, op_true);
24013 x = gen_rtx_AND (mode, cmp, op_true);
24014 emit_insn (gen_rtx_SET (dest, x));
24016 else if (op_true == CONST0_RTX (mode)
24017 && !maskcmp)
24019 op_false = force_reg (mode, op_false);
24020 x = gen_rtx_NOT (mode, cmp);
24021 x = gen_rtx_AND (mode, x, op_false);
24022 emit_insn (gen_rtx_SET (dest, x));
24024 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
24025 && !maskcmp)
24027 op_false = force_reg (mode, op_false);
24028 x = gen_rtx_IOR (mode, cmp, op_false);
24029 emit_insn (gen_rtx_SET (dest, x));
24031 else if (TARGET_XOP
24032 && !maskcmp)
24034 op_true = force_reg (mode, op_true);
24036 if (!nonimmediate_operand (op_false, mode))
24037 op_false = force_reg (mode, op_false);
24039 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24040 op_true,
24041 op_false)));
24043 else
24045 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24046 rtx d = dest;
24048 if (!nonimmediate_operand (op_true, mode))
24049 op_true = force_reg (mode, op_true);
24051 op_false = force_reg (mode, op_false);
24053 switch (mode)
24055 case E_V4SFmode:
24056 if (TARGET_SSE4_1)
24057 gen = gen_sse4_1_blendvps;
24058 break;
24059 case E_V2DFmode:
24060 if (TARGET_SSE4_1)
24061 gen = gen_sse4_1_blendvpd;
24062 break;
24063 case E_V16QImode:
24064 case E_V8HImode:
24065 case E_V4SImode:
24066 case E_V2DImode:
24067 if (TARGET_SSE4_1)
24069 gen = gen_sse4_1_pblendvb;
24070 if (mode != V16QImode)
24071 d = gen_reg_rtx (V16QImode);
24072 op_false = gen_lowpart (V16QImode, op_false);
24073 op_true = gen_lowpart (V16QImode, op_true);
24074 cmp = gen_lowpart (V16QImode, cmp);
24076 break;
24077 case E_V8SFmode:
24078 if (TARGET_AVX)
24079 gen = gen_avx_blendvps256;
24080 break;
24081 case E_V4DFmode:
24082 if (TARGET_AVX)
24083 gen = gen_avx_blendvpd256;
24084 break;
24085 case E_V32QImode:
24086 case E_V16HImode:
24087 case E_V8SImode:
24088 case E_V4DImode:
24089 if (TARGET_AVX2)
24091 gen = gen_avx2_pblendvb;
24092 if (mode != V32QImode)
24093 d = gen_reg_rtx (V32QImode);
24094 op_false = gen_lowpart (V32QImode, op_false);
24095 op_true = gen_lowpart (V32QImode, op_true);
24096 cmp = gen_lowpart (V32QImode, cmp);
24098 break;
24100 case E_V64QImode:
24101 gen = gen_avx512bw_blendmv64qi;
24102 break;
24103 case E_V32HImode:
24104 gen = gen_avx512bw_blendmv32hi;
24105 break;
24106 case E_V16SImode:
24107 gen = gen_avx512f_blendmv16si;
24108 break;
24109 case E_V8DImode:
24110 gen = gen_avx512f_blendmv8di;
24111 break;
24112 case E_V8DFmode:
24113 gen = gen_avx512f_blendmv8df;
24114 break;
24115 case E_V16SFmode:
24116 gen = gen_avx512f_blendmv16sf;
24117 break;
24119 default:
24120 break;
24123 if (gen != NULL)
24125 emit_insn (gen (d, op_false, op_true, cmp));
24126 if (d != dest)
24127 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24129 else
24131 op_true = force_reg (mode, op_true);
24133 t2 = gen_reg_rtx (mode);
24134 if (optimize)
24135 t3 = gen_reg_rtx (mode);
24136 else
24137 t3 = dest;
24139 x = gen_rtx_AND (mode, op_true, cmp);
24140 emit_insn (gen_rtx_SET (t2, x));
24142 x = gen_rtx_NOT (mode, cmp);
24143 x = gen_rtx_AND (mode, x, op_false);
24144 emit_insn (gen_rtx_SET (t3, x));
24146 x = gen_rtx_IOR (mode, t3, t2);
24147 emit_insn (gen_rtx_SET (dest, x));
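/* Sketch of the fallback above (added, not in the original source):
   without a blend instruction, "dest = cmp ? op_true : op_false" is
   built from the all-ones/all-zeros comparison mask as

       t2   = op_true  &  cmp
       t3   = op_false & ~cmp
       dest = t2 | t3

   i.e. an and/andnot/or triple such as andps/andnps/orps.  */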
24152 /* Expand a floating-point conditional move. Return true if successful. */
24154 bool
24155 ix86_expand_fp_movcc (rtx operands[])
24157 machine_mode mode = GET_MODE (operands[0]);
24158 enum rtx_code code = GET_CODE (operands[1]);
24159 rtx tmp, compare_op;
24160 rtx op0 = XEXP (operands[1], 0);
24161 rtx op1 = XEXP (operands[1], 1);
24163 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24165 machine_mode cmode;
24167 /* Since we've no cmove for sse registers, don't force bad register
24168 allocation just to gain access to it. Deny movcc when the
24169 comparison mode doesn't match the move mode. */
24170 cmode = GET_MODE (op0);
24171 if (cmode == VOIDmode)
24172 cmode = GET_MODE (op1);
24173 if (cmode != mode)
24174 return false;
24176 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24177 if (code == UNKNOWN)
24178 return false;
24180 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24181 operands[2], operands[3]))
24182 return true;
24184 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24185 operands[2], operands[3]);
24186 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24187 return true;
24190 if (GET_MODE (op0) == TImode
24191 || (GET_MODE (op0) == DImode
24192 && !TARGET_64BIT))
24193 return false;
24195 /* The floating point conditional move instructions don't directly
24196 support conditions resulting from a signed integer comparison. */
24198 compare_op = ix86_expand_compare (code, op0, op1);
24199 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24201 tmp = gen_reg_rtx (QImode);
24202 ix86_expand_setcc (tmp, code, op0, op1);
24204 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24207 emit_insn (gen_rtx_SET (operands[0],
24208 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24209 operands[2], operands[3])));
24211 return true;
24214 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24216 static int
24217 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24219 switch (code)
24221 case EQ:
24222 return 0;
24223 case LT:
24224 case LTU:
24225 return 1;
24226 case LE:
24227 case LEU:
24228 return 2;
24229 case NE:
24230 return 4;
24231 case GE:
24232 case GEU:
24233 return 5;
24234 case GT:
24235 case GTU:
24236 return 6;
24237 default:
24238 gcc_unreachable ();
24242 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24244 static int
24245 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24247 switch (code)
24249 case EQ:
24250 return 0x00;
24251 case NE:
24252 return 0x04;
24253 case GT:
24254 return 0x0e;
24255 case LE:
24256 return 0x02;
24257 case GE:
24258 return 0x0d;
24259 case LT:
24260 return 0x01;
24261 case UNLE:
24262 return 0x0a;
24263 case UNLT:
24264 return 0x09;
24265 case UNGE:
24266 return 0x05;
24267 case UNGT:
24268 return 0x06;
24269 case UNEQ:
24270 return 0x18;
24271 case LTGT:
24272 return 0x0c;
24273 case ORDERED:
24274 return 0x07;
24275 case UNORDERED:
24276 return 0x03;
24277 default:
24278 gcc_unreachable ();
24282 /* Return immediate value to be used in UNSPEC_PCMP
24283 for comparison CODE in MODE. */
24285 static int
24286 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24288 if (FLOAT_MODE_P (mode))
24289 return ix86_fp_cmp_code_to_pcmp_immediate (code);
24290 return ix86_int_cmp_code_to_pcmp_immediate (code);
24293 /* Expand AVX-512 vector comparison. */
24295 bool
24296 ix86_expand_mask_vec_cmp (rtx operands[])
24298 machine_mode mask_mode = GET_MODE (operands[0]);
24299 machine_mode cmp_mode = GET_MODE (operands[2]);
24300 enum rtx_code code = GET_CODE (operands[1]);
24301 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24302 int unspec_code;
24303 rtx unspec;
24305 switch (code)
24307 case LEU:
24308 case GTU:
24309 case GEU:
24310 case LTU:
24311 unspec_code = UNSPEC_UNSIGNED_PCMP;
24312 break;
24314 default:
24315 unspec_code = UNSPEC_PCMP;
24318 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24319 operands[3], imm),
24320 unspec_code);
24321 emit_insn (gen_rtx_SET (operands[0], unspec));
24323 return true;
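/* Illustration (added, not in the original source): a signed LE
   comparison of two V16SImode operands maps to immediate 2 and
   UNSPEC_PCMP, i.e. a vpcmpd with predicate 2 writing a 16-bit mask
   register; the unsigned LEU variant selects UNSPEC_UNSIGNED_PCMP and
   hence vpcmpud instead.  */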
24326 /* Expand fp vector comparison. */
24328 bool
24329 ix86_expand_fp_vec_cmp (rtx operands[])
24331 enum rtx_code code = GET_CODE (operands[1]);
24332 rtx cmp;
24334 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24335 &operands[2], &operands[3]);
24336 if (code == UNKNOWN)
24338 rtx temp;
24339 switch (GET_CODE (operands[1]))
24341 case LTGT:
24342 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24343 operands[3], NULL, NULL);
24344 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24345 operands[3], NULL, NULL);
24346 code = AND;
24347 break;
24348 case UNEQ:
24349 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24350 operands[3], NULL, NULL);
24351 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24352 operands[3], NULL, NULL);
24353 code = IOR;
24354 break;
24355 default:
24356 gcc_unreachable ();
24358 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24359 OPTAB_DIRECT);
24361 else
24362 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24363 operands[1], operands[2]);
24365 if (operands[0] != cmp)
24366 emit_move_insn (operands[0], cmp);
24368 return true;
24371 static rtx
24372 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24373 rtx op_true, rtx op_false, bool *negate)
24375 machine_mode data_mode = GET_MODE (dest);
24376 machine_mode mode = GET_MODE (cop0);
24377 rtx x;
24379 *negate = false;
24381 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24382 if (TARGET_XOP
24383 && (mode == V16QImode || mode == V8HImode
24384 || mode == V4SImode || mode == V2DImode))
24386 else
24388 /* Canonicalize the comparison to EQ, GT, GTU. */
24389 switch (code)
24391 case EQ:
24392 case GT:
24393 case GTU:
24394 break;
24396 case NE:
24397 case LE:
24398 case LEU:
24399 code = reverse_condition (code);
24400 *negate = true;
24401 break;
24403 case GE:
24404 case GEU:
24405 code = reverse_condition (code);
24406 *negate = true;
24407 /* FALLTHRU */
24409 case LT:
24410 case LTU:
24411 std::swap (cop0, cop1);
24412 code = swap_condition (code);
24413 break;
24415 default:
24416 gcc_unreachable ();
24419 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24420 if (mode == V2DImode)
24422 switch (code)
24424 case EQ:
24425 /* SSE4.1 supports EQ. */
24426 if (!TARGET_SSE4_1)
24427 return NULL;
24428 break;
24430 case GT:
24431 case GTU:
24432 /* SSE4.2 supports GT/GTU. */
24433 if (!TARGET_SSE4_2)
24434 return NULL;
24435 break;
24437 default:
24438 gcc_unreachable ();
24442 /* Unsigned parallel compare is not supported by the hardware.
24443 Play some tricks to turn this into a signed comparison
24444 against 0. */
24445 if (code == GTU)
24447 cop0 = force_reg (mode, cop0);
24449 switch (mode)
24451 case E_V16SImode:
24452 case E_V8DImode:
24453 case E_V8SImode:
24454 case E_V4DImode:
24455 case E_V4SImode:
24456 case E_V2DImode:
24458 rtx t1, t2, mask;
24459 rtx (*gen_sub3) (rtx, rtx, rtx);
24461 switch (mode)
24463 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
24464 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
24465 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
24466 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
24467 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
24468 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
24469 default:
24470 gcc_unreachable ();
24472 /* Subtract (-(INT MAX) - 1) from both operands to make
24473 them signed. */
24474 mask = ix86_build_signbit_mask (mode, true, false);
24475 t1 = gen_reg_rtx (mode);
24476 emit_insn (gen_sub3 (t1, cop0, mask));
24478 t2 = gen_reg_rtx (mode);
24479 emit_insn (gen_sub3 (t2, cop1, mask));
24481 cop0 = t1;
24482 cop1 = t2;
24483 code = GT;
24485 break;
24487 case E_V64QImode:
24488 case E_V32HImode:
24489 case E_V32QImode:
24490 case E_V16HImode:
24491 case E_V16QImode:
24492 case E_V8HImode:
24493 /* Perform a parallel unsigned saturating subtraction. */
24494 x = gen_reg_rtx (mode);
24495 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24496 cop1)));
24498 cop0 = x;
24499 cop1 = CONST0_RTX (mode);
24500 code = EQ;
24501 *negate = !*negate;
24502 break;
24504 default:
24505 gcc_unreachable ();
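/* Summary of the GTU tricks above (added, not in the original source),
   needed because the ISA only provides signed pcmpgt:

     - for dword/qword elements, subtracting the sign-bit mask from
       both operands flips their sign bits, and  a >u b  becomes
       (a ^ msb) >s (b ^ msb);

     - for byte/word elements, a parallel unsigned saturating
       subtraction is used:  a >u b  iff  (a -us b) != 0, expressed as
       an EQ against zero with the *negate flag flipped.  */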
24510 if (*negate)
24511 std::swap (op_true, op_false);
24513 /* Allow the comparison to be done in one mode, but the movcc to
24514 happen in another mode. */
24515 if (data_mode == mode)
24517 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24518 op_true, op_false);
24520 else
24522 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24523 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24524 op_true, op_false);
24525 if (GET_MODE (x) == mode)
24526 x = gen_lowpart (data_mode, x);
24529 return x;
24532 /* Expand integer vector comparison. */
24534 bool
24535 ix86_expand_int_vec_cmp (rtx operands[])
24537 rtx_code code = GET_CODE (operands[1]);
24538 bool negate = false;
24539 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24540 operands[3], NULL, NULL, &negate);
24542 if (!cmp)
24543 return false;
24545 if (negate)
24546 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24547 CONST0_RTX (GET_MODE (cmp)),
24548 NULL, NULL, &negate);
24550 gcc_assert (!negate);
24552 if (operands[0] != cmp)
24553 emit_move_insn (operands[0], cmp);
24555 return true;
24558 /* Expand a floating-point vector conditional move; a vcond operation
24559 rather than a movcc operation. */
24561 bool
24562 ix86_expand_fp_vcond (rtx operands[])
24564 enum rtx_code code = GET_CODE (operands[3]);
24565 rtx cmp;
24567 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24568 &operands[4], &operands[5]);
24569 if (code == UNKNOWN)
24571 rtx temp;
24572 switch (GET_CODE (operands[3]))
24574 case LTGT:
24575 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24576 operands[5], operands[0], operands[0]);
24577 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24578 operands[5], operands[1], operands[2]);
24579 code = AND;
24580 break;
24581 case UNEQ:
24582 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24583 operands[5], operands[0], operands[0]);
24584 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24585 operands[5], operands[1], operands[2]);
24586 code = IOR;
24587 break;
24588 default:
24589 gcc_unreachable ();
24591 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24592 OPTAB_DIRECT);
24593 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24594 return true;
24597 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24598 operands[5], operands[1], operands[2]))
24599 return true;
24601 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24602 operands[1], operands[2]);
24603 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24604 return true;
24607 /* Expand a signed/unsigned integral vector conditional move. */
24609 bool
24610 ix86_expand_int_vcond (rtx operands[])
24612 machine_mode data_mode = GET_MODE (operands[0]);
24613 machine_mode mode = GET_MODE (operands[4]);
24614 enum rtx_code code = GET_CODE (operands[3]);
24615 bool negate = false;
24616 rtx x, cop0, cop1;
24618 cop0 = operands[4];
24619 cop1 = operands[5];
24621 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24622 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24623 if ((code == LT || code == GE)
24624 && data_mode == mode
24625 && cop1 == CONST0_RTX (mode)
24626 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24627 && GET_MODE_UNIT_SIZE (data_mode) > 1
24628 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24629 && (GET_MODE_SIZE (data_mode) == 16
24630 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24632 rtx negop = operands[2 - (code == LT)];
24633 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24634 if (negop == CONST1_RTX (data_mode))
24636 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24637 operands[0], 1, OPTAB_DIRECT);
24638 if (res != operands[0])
24639 emit_move_insn (operands[0], res);
24640 return true;
24642 else if (GET_MODE_INNER (data_mode) != DImode
24643 && vector_all_ones_operand (negop, data_mode))
24645 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24646 operands[0], 0, OPTAB_DIRECT);
24647 if (res != operands[0])
24648 emit_move_insn (operands[0], res);
24649 return true;
24653 if (!nonimmediate_operand (cop1, mode))
24654 cop1 = force_reg (mode, cop1);
24655 if (!general_operand (operands[1], data_mode))
24656 operands[1] = force_reg (data_mode, operands[1]);
24657 if (!general_operand (operands[2], data_mode))
24658 operands[2] = force_reg (data_mode, operands[2]);
24660 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24661 operands[1], operands[2], &negate);
24663 if (!x)
24664 return false;
24666 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24667 operands[2-negate]);
24668 return true;
24671 /* AVX512F does support 64-byte integer vector operations,
24672 thus the longest vector we are faced with is V64QImode. */
24673 #define MAX_VECT_LEN 64
24675 struct expand_vec_perm_d
24677 rtx target, op0, op1;
24678 unsigned char perm[MAX_VECT_LEN];
24679 machine_mode vmode;
24680 unsigned char nelt;
24681 bool one_operand_p;
24682 bool testing_p;
24685 static bool
24686 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24687 struct expand_vec_perm_d *d)
24689 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24690 expander, so args are either in d, or in op0, op1 etc. */
24691 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24692 machine_mode maskmode = mode;
24693 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24695 switch (mode)
24697 case E_V8HImode:
24698 if (TARGET_AVX512VL && TARGET_AVX512BW)
24699 gen = gen_avx512vl_vpermt2varv8hi3;
24700 break;
24701 case E_V16HImode:
24702 if (TARGET_AVX512VL && TARGET_AVX512BW)
24703 gen = gen_avx512vl_vpermt2varv16hi3;
24704 break;
24705 case E_V64QImode:
24706 if (TARGET_AVX512VBMI)
24707 gen = gen_avx512bw_vpermt2varv64qi3;
24708 break;
24709 case E_V32HImode:
24710 if (TARGET_AVX512BW)
24711 gen = gen_avx512bw_vpermt2varv32hi3;
24712 break;
24713 case E_V4SImode:
24714 if (TARGET_AVX512VL)
24715 gen = gen_avx512vl_vpermt2varv4si3;
24716 break;
24717 case E_V8SImode:
24718 if (TARGET_AVX512VL)
24719 gen = gen_avx512vl_vpermt2varv8si3;
24720 break;
24721 case E_V16SImode:
24722 if (TARGET_AVX512F)
24723 gen = gen_avx512f_vpermt2varv16si3;
24724 break;
24725 case E_V4SFmode:
24726 if (TARGET_AVX512VL)
24728 gen = gen_avx512vl_vpermt2varv4sf3;
24729 maskmode = V4SImode;
24731 break;
24732 case E_V8SFmode:
24733 if (TARGET_AVX512VL)
24735 gen = gen_avx512vl_vpermt2varv8sf3;
24736 maskmode = V8SImode;
24738 break;
24739 case E_V16SFmode:
24740 if (TARGET_AVX512F)
24742 gen = gen_avx512f_vpermt2varv16sf3;
24743 maskmode = V16SImode;
24745 break;
24746 case E_V2DImode:
24747 if (TARGET_AVX512VL)
24748 gen = gen_avx512vl_vpermt2varv2di3;
24749 break;
24750 case E_V4DImode:
24751 if (TARGET_AVX512VL)
24752 gen = gen_avx512vl_vpermt2varv4di3;
24753 break;
24754 case E_V8DImode:
24755 if (TARGET_AVX512F)
24756 gen = gen_avx512f_vpermt2varv8di3;
24757 break;
24758 case E_V2DFmode:
24759 if (TARGET_AVX512VL)
24761 gen = gen_avx512vl_vpermt2varv2df3;
24762 maskmode = V2DImode;
24764 break;
24765 case E_V4DFmode:
24766 if (TARGET_AVX512VL)
24768 gen = gen_avx512vl_vpermt2varv4df3;
24769 maskmode = V4DImode;
24771 break;
24772 case E_V8DFmode:
24773 if (TARGET_AVX512F)
24775 gen = gen_avx512f_vpermt2varv8df3;
24776 maskmode = V8DImode;
24778 break;
24779 default:
24780 break;
24783 if (gen == NULL)
24784 return false;
24786 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24787 expander, so args are either in d, or in op0, op1 etc. */
24788 if (d)
24790 rtx vec[64];
24791 target = d->target;
24792 op0 = d->op0;
24793 op1 = d->op1;
24794 for (int i = 0; i < d->nelt; ++i)
24795 vec[i] = GEN_INT (d->perm[i]);
24796 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24799 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24800 return true;
24803 /* Expand a variable vector permutation. */
24805 void
24806 ix86_expand_vec_perm (rtx operands[])
24808 rtx target = operands[0];
24809 rtx op0 = operands[1];
24810 rtx op1 = operands[2];
24811 rtx mask = operands[3];
24812 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24813 machine_mode mode = GET_MODE (op0);
24814 machine_mode maskmode = GET_MODE (mask);
24815 int w, e, i;
24816 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24818 /* Number of elements in the vector. */
24819 w = GET_MODE_NUNITS (mode);
24820 e = GET_MODE_UNIT_SIZE (mode);
24821 gcc_assert (w <= 64);
24823 if (TARGET_AVX512F && one_operand_shuffle)
24825 rtx (*gen) (rtx, rtx, rtx) = NULL;
24826 switch (mode)
24828 case E_V16SImode:
24829 gen = gen_avx512f_permvarv16si;
24830 break;
24831 case E_V16SFmode:
24832 gen = gen_avx512f_permvarv16sf;
24833 break;
24834 case E_V8DImode:
24835 gen = gen_avx512f_permvarv8di;
24836 break;
24837 case E_V8DFmode:
24838 gen = gen_avx512f_permvarv8df;
24839 break;
24840 default:
24841 break;
24843 if (gen != NULL)
24845 emit_insn (gen (target, op0, mask));
24846 return;
24850 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24851 return;
24853 if (TARGET_AVX2)
24855 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24857 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24858 a constant shuffle operand.  With a tiny bit of effort we can
24859 use VPERMD instead.  A re-interpretation stall for V4DFmode is
24860 unfortunate but there's no avoiding it.
24861 Similarly for V16HImode we don't have instructions for variable
24862 shuffling, while for V32QImode we can use vpshufb; vpshufb;
24863 vpermq; vpor after preparing suitable masks. */
24865 if (mode == V16HImode)
24867 maskmode = mode = V32QImode;
24868 w = 32;
24869 e = 1;
24871 else
24873 maskmode = mode = V8SImode;
24874 w = 8;
24875 e = 4;
24877 t1 = gen_reg_rtx (maskmode);
24879 /* Replicate the low bits of the V4DImode mask into V8SImode:
24880 mask = { A B C D }
24881 t1 = { A A B B C C D D }. */
24882 for (i = 0; i < w / 2; ++i)
24883 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24884 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24885 vt = force_reg (maskmode, vt);
24886 mask = gen_lowpart (maskmode, mask);
24887 if (maskmode == V8SImode)
24888 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24889 else
24890 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24892 /* Multiply the shuffle indices by two. */
24893 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24894 OPTAB_DIRECT);
24896 /* Add one to the odd shuffle indices:
24897 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24898 for (i = 0; i < w / 2; ++i)
24900 vec[i * 2] = const0_rtx;
24901 vec[i * 2 + 1] = const1_rtx;
24903 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24904 vt = validize_mem (force_const_mem (maskmode, vt));
24905 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24906 OPTAB_DIRECT);
24908 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24909 operands[3] = mask = t1;
24910 target = gen_reg_rtx (mode);
24911 op0 = gen_lowpart (mode, op0);
24912 op1 = gen_lowpart (mode, op1);
24915 switch (mode)
24917 case E_V8SImode:
24918 /* The VPERMD and VPERMPS instructions already properly ignore
24919 the high bits of the shuffle elements. No need for us to
24920 perform an AND ourselves. */
24921 if (one_operand_shuffle)
24923 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24924 if (target != operands[0])
24925 emit_move_insn (operands[0],
24926 gen_lowpart (GET_MODE (operands[0]), target));
24928 else
24930 t1 = gen_reg_rtx (V8SImode);
24931 t2 = gen_reg_rtx (V8SImode);
24932 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24933 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24934 goto merge_two;
24936 return;
24938 case E_V8SFmode:
24939 mask = gen_lowpart (V8SImode, mask);
24940 if (one_operand_shuffle)
24941 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24942 else
24944 t1 = gen_reg_rtx (V8SFmode);
24945 t2 = gen_reg_rtx (V8SFmode);
24946 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24947 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24948 goto merge_two;
24950 return;
24952 case E_V4SImode:
24953 /* By combining the two 128-bit input vectors into one 256-bit
24954 input vector, we can use VPERMD and VPERMPS for the full
24955 two-operand shuffle. */
24956 t1 = gen_reg_rtx (V8SImode);
24957 t2 = gen_reg_rtx (V8SImode);
24958 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24959 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24960 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24961 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24962 return;
24964 case E_V4SFmode:
24965 t1 = gen_reg_rtx (V8SFmode);
24966 t2 = gen_reg_rtx (V8SImode);
24967 mask = gen_lowpart (V4SImode, mask);
24968 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24969 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24970 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24971 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24972 return;
24974 case E_V32QImode:
24975 t1 = gen_reg_rtx (V32QImode);
24976 t2 = gen_reg_rtx (V32QImode);
24977 t3 = gen_reg_rtx (V32QImode);
24978 vt2 = GEN_INT (-128);
24979 vt = gen_const_vec_duplicate (V32QImode, vt2);
24980 vt = force_reg (V32QImode, vt);
24981 for (i = 0; i < 32; i++)
24982 vec[i] = i < 16 ? vt2 : const0_rtx;
24983 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24984 vt2 = force_reg (V32QImode, vt2);
24985 /* From mask create two adjusted masks, which contain the same
24986 bits as mask in the low 7 bits of each vector element.
24987 The first mask will have the most significant bit clear
24988 if it requests element from the same 128-bit lane
24989 and MSB set if it requests element from the other 128-bit lane.
24990 The second mask will have the opposite values of the MSB,
24991 and additionally will have its 128-bit lanes swapped.
24992 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24993 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24994 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24995 stands for other 12 bytes. */
24996 /* The bit telling whether an element is from the same lane or the other
24997 lane is bit 4, so shift it up by 3 to the MSB position. */
24998 t5 = gen_reg_rtx (V4DImode);
24999 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
25000 GEN_INT (3)));
25001 /* Clear MSB bits from the mask just in case it had them set. */
25002 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
25003 /* After this t1 will have MSB set for elements from other lane. */
25004 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
25005 /* Clear bits other than MSB. */
25006 emit_insn (gen_andv32qi3 (t1, t1, vt));
25007 /* Or in the lower bits from mask into t3. */
25008 emit_insn (gen_iorv32qi3 (t3, t1, t2));
25009 /* And invert MSB bits in t1, so MSB is set for elements from the same
25010 lane. */
25011 emit_insn (gen_xorv32qi3 (t1, t1, vt));
25012 /* Swap 128-bit lanes in t3. */
25013 t6 = gen_reg_rtx (V4DImode);
25014 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
25015 const2_rtx, GEN_INT (3),
25016 const0_rtx, const1_rtx));
25017 /* And or in the lower bits from mask into t1. */
25018 emit_insn (gen_iorv32qi3 (t1, t1, t2));
25019 if (one_operand_shuffle)
25021 /* Each of these shuffles will put 0s in places where
25022 element from the other 128-bit lane is needed, otherwise
25023 will shuffle in the requested value. */
25024 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
25025 gen_lowpart (V32QImode, t6)));
25026 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
25027 /* For t3 the 128-bit lanes are swapped again. */
25028 t7 = gen_reg_rtx (V4DImode);
25029 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
25030 const2_rtx, GEN_INT (3),
25031 const0_rtx, const1_rtx));
25032 /* And oring both together leads to the result. */
25033 emit_insn (gen_iorv32qi3 (target, t1,
25034 gen_lowpart (V32QImode, t7)));
25035 if (target != operands[0])
25036 emit_move_insn (operands[0],
25037 gen_lowpart (GET_MODE (operands[0]), target));
25038 return;
25041 t4 = gen_reg_rtx (V32QImode);
25042 /* Similar to the above one_operand_shuffle code, just
25043 repeated twice, once for each operand.  The merge_two:
25044 code below will merge the two results together. */
25045 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25046 gen_lowpart (V32QImode, t6)));
25047 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25048 gen_lowpart (V32QImode, t6)));
25049 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25050 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25051 t7 = gen_reg_rtx (V4DImode);
25052 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25053 const2_rtx, GEN_INT (3),
25054 const0_rtx, const1_rtx));
25055 t8 = gen_reg_rtx (V4DImode);
25056 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25057 const2_rtx, GEN_INT (3),
25058 const0_rtx, const1_rtx));
25059 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25060 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25061 t1 = t4;
25062 t2 = t3;
25063 goto merge_two;
25065 default:
25066 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25067 break;
25071 if (TARGET_XOP)
25073 /* The XOP VPPERM insn supports three inputs. By ignoring the
25074 one_operand_shuffle special case, we avoid creating another
25075 set of constant vectors in memory. */
25076 one_operand_shuffle = false;
25078 /* mask = mask & {2*w-1, ...} */
25079 vt = GEN_INT (2*w - 1);
25081 else
25083 /* mask = mask & {w-1, ...} */
25084 vt = GEN_INT (w - 1);
25087 vt = gen_const_vec_duplicate (maskmode, vt);
25088 mask = expand_simple_binop (maskmode, AND, mask, vt,
25089 NULL_RTX, 0, OPTAB_DIRECT);
25091 /* For non-QImode operations, convert the word permutation control
25092 into a byte permutation control. */
25093 if (mode != V16QImode)
25095 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25096 GEN_INT (exact_log2 (e)),
25097 NULL_RTX, 0, OPTAB_DIRECT);
25099 /* Convert mask to vector of chars. */
25100 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25102 /* Replicate each of the input bytes into byte positions:
25103 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25104 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25105 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25106 for (i = 0; i < 16; ++i)
25107 vec[i] = GEN_INT (i/e * e);
25108 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25109 vt = validize_mem (force_const_mem (V16QImode, vt));
25110 if (TARGET_XOP)
25111 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25112 else
25113 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25115 /* Convert it into the byte positions by doing
25116 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25117 for (i = 0; i < 16; ++i)
25118 vec[i] = GEN_INT (i % e);
25119 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25120 vt = validize_mem (force_const_mem (V16QImode, vt));
25121 emit_insn (gen_addv16qi3 (mask, mask, vt));
25124 /* The actual shuffle operations all operate on V16QImode. */
25125 op0 = gen_lowpart (V16QImode, op0);
25126 op1 = gen_lowpart (V16QImode, op1);
25128 if (TARGET_XOP)
25130 if (GET_MODE (target) != V16QImode)
25131 target = gen_reg_rtx (V16QImode);
25132 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25133 if (target != operands[0])
25134 emit_move_insn (operands[0],
25135 gen_lowpart (GET_MODE (operands[0]), target));
25137 else if (one_operand_shuffle)
25139 if (GET_MODE (target) != V16QImode)
25140 target = gen_reg_rtx (V16QImode);
25141 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25142 if (target != operands[0])
25143 emit_move_insn (operands[0],
25144 gen_lowpart (GET_MODE (operands[0]), target));
25146 else
25148 rtx xops[6];
25149 bool ok;
25151 /* Shuffle the two input vectors independently. */
25152 t1 = gen_reg_rtx (V16QImode);
25153 t2 = gen_reg_rtx (V16QImode);
25154 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25155 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25157 merge_two:
25158 /* Then merge them together. The key is whether any given control
25159 element contained a bit set that indicates the second word. */
25160 mask = operands[3];
25161 vt = GEN_INT (w);
25162 if (maskmode == V2DImode && !TARGET_SSE4_1)
25164 /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
25165 more shuffle to convert the V2DI input mask into a V4SI
25166 input mask.  At that point the masking done by expand_int_vcond
25167 will work as desired. */
25168 rtx t3 = gen_reg_rtx (V4SImode);
25169 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25170 const0_rtx, const0_rtx,
25171 const2_rtx, const2_rtx));
25172 mask = t3;
25173 maskmode = V4SImode;
25174 e = w = 4;
25177 vt = gen_const_vec_duplicate (maskmode, vt);
25178 vt = force_reg (maskmode, vt);
25179 mask = expand_simple_binop (maskmode, AND, mask, vt,
25180 NULL_RTX, 0, OPTAB_DIRECT);
25182 if (GET_MODE (target) != mode)
25183 target = gen_reg_rtx (mode);
25184 xops[0] = target;
25185 xops[1] = gen_lowpart (mode, t2);
25186 xops[2] = gen_lowpart (mode, t1);
25187 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25188 xops[4] = mask;
25189 xops[5] = vt;
25190 ok = ix86_expand_int_vcond (xops);
25191 gcc_assert (ok);
25192 if (target != operands[0])
25193 emit_move_insn (operands[0],
25194 gen_lowpart (GET_MODE (operands[0]), target));
25198 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
25199 true if we should do zero extension, else sign extension. HIGH_P is
25200 true if we want the N/2 high elements, else the low elements. */
25202 void
25203 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25205 machine_mode imode = GET_MODE (src);
25206 rtx tmp;
25208 if (TARGET_SSE4_1)
25210 rtx (*unpack)(rtx, rtx);
25211 rtx (*extract)(rtx, rtx) = NULL;
25212 machine_mode halfmode = BLKmode;
25214 switch (imode)
25216 case E_V64QImode:
25217 if (unsigned_p)
25218 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25219 else
25220 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25221 halfmode = V32QImode;
25222 extract
25223 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25224 break;
25225 case E_V32QImode:
25226 if (unsigned_p)
25227 unpack = gen_avx2_zero_extendv16qiv16hi2;
25228 else
25229 unpack = gen_avx2_sign_extendv16qiv16hi2;
25230 halfmode = V16QImode;
25231 extract
25232 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25233 break;
25234 case E_V32HImode:
25235 if (unsigned_p)
25236 unpack = gen_avx512f_zero_extendv16hiv16si2;
25237 else
25238 unpack = gen_avx512f_sign_extendv16hiv16si2;
25239 halfmode = V16HImode;
25240 extract
25241 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25242 break;
25243 case E_V16HImode:
25244 if (unsigned_p)
25245 unpack = gen_avx2_zero_extendv8hiv8si2;
25246 else
25247 unpack = gen_avx2_sign_extendv8hiv8si2;
25248 halfmode = V8HImode;
25249 extract
25250 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25251 break;
25252 case E_V16SImode:
25253 if (unsigned_p)
25254 unpack = gen_avx512f_zero_extendv8siv8di2;
25255 else
25256 unpack = gen_avx512f_sign_extendv8siv8di2;
25257 halfmode = V8SImode;
25258 extract
25259 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25260 break;
25261 case E_V8SImode:
25262 if (unsigned_p)
25263 unpack = gen_avx2_zero_extendv4siv4di2;
25264 else
25265 unpack = gen_avx2_sign_extendv4siv4di2;
25266 halfmode = V4SImode;
25267 extract
25268 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25269 break;
25270 case E_V16QImode:
25271 if (unsigned_p)
25272 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25273 else
25274 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25275 break;
25276 case E_V8HImode:
25277 if (unsigned_p)
25278 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25279 else
25280 unpack = gen_sse4_1_sign_extendv4hiv4si2;
25281 break;
25282 case E_V4SImode:
25283 if (unsigned_p)
25284 unpack = gen_sse4_1_zero_extendv2siv2di2;
25285 else
25286 unpack = gen_sse4_1_sign_extendv2siv2di2;
25287 break;
25288 default:
25289 gcc_unreachable ();
25292 if (GET_MODE_SIZE (imode) >= 32)
25294 tmp = gen_reg_rtx (halfmode);
25295 emit_insn (extract (tmp, src));
25297 else if (high_p)
25299 /* Shift higher 8 bytes to lower 8 bytes. */
25300 tmp = gen_reg_rtx (V1TImode);
25301 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25302 GEN_INT (64)));
25303 tmp = gen_lowpart (imode, tmp);
25305 else
25306 tmp = src;
25308 emit_insn (unpack (dest, tmp));
25310 else
25312 rtx (*unpack)(rtx, rtx, rtx);
25314 switch (imode)
25316 case E_V16QImode:
25317 if (high_p)
25318 unpack = gen_vec_interleave_highv16qi;
25319 else
25320 unpack = gen_vec_interleave_lowv16qi;
25321 break;
25322 case E_V8HImode:
25323 if (high_p)
25324 unpack = gen_vec_interleave_highv8hi;
25325 else
25326 unpack = gen_vec_interleave_lowv8hi;
25327 break;
25328 case E_V4SImode:
25329 if (high_p)
25330 unpack = gen_vec_interleave_highv4si;
25331 else
25332 unpack = gen_vec_interleave_lowv4si;
25333 break;
25334 default:
25335 gcc_unreachable ();
25338 if (unsigned_p)
25339 tmp = force_reg (imode, CONST0_RTX (imode));
25340 else
25341 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25342 src, pc_rtx, pc_rtx);
25344 rtx tmp2 = gen_reg_rtx (imode);
25345 emit_insn (unpack (tmp2, src, tmp));
25346 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
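/* An illustrative sketch (not emitted literally) of the expansion above for a
   V8HImode source widened to V4SImode, in rough AT&T assembly:
     SSE4.1, low half:      pmovzxwd %xmm0, %xmm1      (pmovsxwd when signed)
     SSE4.1, high half:     psrldq $8, %xmm0           then pmovzxwd/pmovsxwd
     pre-SSE4.1, unsigned:  pxor %xmm2, %xmm2
                            punpcklwd %xmm2, %xmm0     (interleave with zero)
     pre-SSE4.1, signed:    pxor %xmm2, %xmm2
                            pcmpgtw %xmm0, %xmm2       (all-ones where negative)
                            punpcklwd %xmm2, %xmm0     (interleave with sign mask)
   The pre-SSE4.1 high half uses punpckhwd instead, and 256/512-bit sources on
   the SSE4.1 path first extract the requested half with vextract*.  */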
25350 /* Expand conditional increment or decrement using adc/sbb instructions.
25351 The default case using setcc followed by the conditional move can be
25352 done by generic code. */
25353 bool
25354 ix86_expand_int_addcc (rtx operands[])
25356 enum rtx_code code = GET_CODE (operands[1]);
25357 rtx flags;
25358 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25359 rtx compare_op;
25360 rtx val = const0_rtx;
25361 bool fpcmp = false;
25362 machine_mode mode;
25363 rtx op0 = XEXP (operands[1], 0);
25364 rtx op1 = XEXP (operands[1], 1);
25366 if (operands[3] != const1_rtx
25367 && operands[3] != constm1_rtx)
25368 return false;
25369 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25370 return false;
25371 code = GET_CODE (compare_op);
25373 flags = XEXP (compare_op, 0);
25375 if (GET_MODE (flags) == CCFPmode)
25377 fpcmp = true;
25378 code = ix86_fp_compare_code_to_integer (code);
25381 if (code != LTU)
25383 val = constm1_rtx;
25384 if (fpcmp)
25385 PUT_CODE (compare_op,
25386 reverse_condition_maybe_unordered
25387 (GET_CODE (compare_op)));
25388 else
25389 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25392 mode = GET_MODE (operands[0]);
25394 /* Construct either adc or sbb insn. */
25395 if ((code == LTU) == (operands[3] == constm1_rtx))
25397 switch (mode)
25399 case E_QImode:
25400 insn = gen_subqi3_carry;
25401 break;
25402 case E_HImode:
25403 insn = gen_subhi3_carry;
25404 break;
25405 case E_SImode:
25406 insn = gen_subsi3_carry;
25407 break;
25408 case E_DImode:
25409 insn = gen_subdi3_carry;
25410 break;
25411 default:
25412 gcc_unreachable ();
25415 else
25417 switch (mode)
25419 case E_QImode:
25420 insn = gen_addqi3_carry;
25421 break;
25422 case E_HImode:
25423 insn = gen_addhi3_carry;
25424 break;
25425 case E_SImode:
25426 insn = gen_addsi3_carry;
25427 break;
25428 case E_DImode:
25429 insn = gen_adddi3_carry;
25430 break;
25431 default:
25432 gcc_unreachable ();
25435 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25437 return true;
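/* A minimal sketch of the intent, assuming unsigned operands: a statement like
     x = y + (a < b);
   is expanded as a compare that leaves the carry flag equal to (a < b),
   followed by an add-with-carry of zero, roughly
     cmp  ...        ; CF = (a < b)
     adc  $0, x
   while x = y - (a < b) uses sbb instead.  When the comparison has to be
   reversed, VAL becomes -1 so the inverted carry still yields the same sum.  */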
25441 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
25442 but works for floating point parameters and non-offsettable memories.
25443 For pushes, it returns just stack offsets; the values will be saved
25444 in the right order.  At most four parts are generated.  */
25446 static int
25447 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25449 int size;
25451 if (!TARGET_64BIT)
25452 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25453 else
25454 size = (GET_MODE_SIZE (mode) + 4) / 8;
25456 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25457 gcc_assert (size >= 2 && size <= 4);
25459 /* Optimize constant pool reference to immediates. This is used by fp
25460 moves, that force all constants to memory to allow combining. */
25461 if (MEM_P (operand) && MEM_READONLY_P (operand))
25462 operand = avoid_constant_pool_reference (operand);
25464 if (MEM_P (operand) && !offsettable_memref_p (operand))
25466 /* The only non-offsettable memories we handle are pushes.  */
25467 int ok = push_operand (operand, VOIDmode);
25469 gcc_assert (ok);
25471 operand = copy_rtx (operand);
25472 PUT_MODE (operand, word_mode);
25473 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25474 return size;
25477 if (GET_CODE (operand) == CONST_VECTOR)
25479 scalar_int_mode imode = int_mode_for_mode (mode).require ();
25480 /* Caution: if we looked through a constant pool memory above,
25481 the operand may actually have a different mode now. That's
25482 ok, since we want to pun this all the way back to an integer. */
25483 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25484 gcc_assert (operand != NULL);
25485 mode = imode;
25488 if (!TARGET_64BIT)
25490 if (mode == DImode)
25491 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25492 else
25494 int i;
25496 if (REG_P (operand))
25498 gcc_assert (reload_completed);
25499 for (i = 0; i < size; i++)
25500 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25502 else if (offsettable_memref_p (operand))
25504 operand = adjust_address (operand, SImode, 0);
25505 parts[0] = operand;
25506 for (i = 1; i < size; i++)
25507 parts[i] = adjust_address (operand, SImode, 4 * i);
25509 else if (CONST_DOUBLE_P (operand))
25511 const REAL_VALUE_TYPE *r;
25512 long l[4];
25514 r = CONST_DOUBLE_REAL_VALUE (operand);
25515 switch (mode)
25517 case E_TFmode:
25518 real_to_target (l, r, mode);
25519 parts[3] = gen_int_mode (l[3], SImode);
25520 parts[2] = gen_int_mode (l[2], SImode);
25521 break;
25522 case E_XFmode:
25523 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25524 long double may not be 80-bit. */
25525 real_to_target (l, r, mode);
25526 parts[2] = gen_int_mode (l[2], SImode);
25527 break;
25528 case E_DFmode:
25529 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25530 break;
25531 default:
25532 gcc_unreachable ();
25534 parts[1] = gen_int_mode (l[1], SImode);
25535 parts[0] = gen_int_mode (l[0], SImode);
25537 else
25538 gcc_unreachable ();
25541 else
25543 if (mode == TImode)
25544 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25545 if (mode == XFmode || mode == TFmode)
25547 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25548 if (REG_P (operand))
25550 gcc_assert (reload_completed);
25551 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25552 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25554 else if (offsettable_memref_p (operand))
25556 operand = adjust_address (operand, DImode, 0);
25557 parts[0] = operand;
25558 parts[1] = adjust_address (operand, upper_mode, 8);
25560 else if (CONST_DOUBLE_P (operand))
25562 long l[4];
25564 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25566 /* real_to_target puts 32-bit pieces in each long. */
25567 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25568 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25569 << 32), DImode);
25571 if (upper_mode == SImode)
25572 parts[1] = gen_int_mode (l[2], SImode);
25573 else
25574 parts[1]
25575 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25576 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25577 << 32), DImode);
25579 else
25580 gcc_unreachable ();
25584 return size;
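/* For example (a sketch): on !TARGET_64BIT a DFmode constant such as 1.0
   (IEEE bit pattern 0x3ff0000000000000) is split into two SImode immediates,
   parts[0] = 0x00000000 (low word) and parts[1] = 0x3ff00000 (high word);
   an XFmode value yields three SImode parts and a TFmode value four.  */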
25587 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25588 The value is split into up to four parts; operands 2-5 receive the
25589 destination parts in the correct order and operands 6-9 receive the
25590 corresponding source parts.  */
25592 void
25593 ix86_split_long_move (rtx operands[])
25595 rtx part[2][4];
25596 int nparts, i, j;
25597 int push = 0;
25598 int collisions = 0;
25599 machine_mode mode = GET_MODE (operands[0]);
25600 bool collisionparts[4];
25602 /* The DFmode expanders may ask us to move a double.
25603 For a 64-bit target this is a single move.  By hiding that fact
25604 here we simplify the i386.md splitters.  */
25605 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25607 /* Optimize constant pool reference to immediates. This is used by
25608 fp moves, that force all constants to memory to allow combining. */
25610 if (MEM_P (operands[1])
25611 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25612 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25613 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25614 if (push_operand (operands[0], VOIDmode))
25616 operands[0] = copy_rtx (operands[0]);
25617 PUT_MODE (operands[0], word_mode);
25619 else
25620 operands[0] = gen_lowpart (DImode, operands[0]);
25621 operands[1] = gen_lowpart (DImode, operands[1]);
25622 emit_move_insn (operands[0], operands[1]);
25623 return;
25626 /* The only non-offsettable memory we handle is push. */
25627 if (push_operand (operands[0], VOIDmode))
25628 push = 1;
25629 else
25630 gcc_assert (!MEM_P (operands[0])
25631 || offsettable_memref_p (operands[0]));
25633 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25634 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25636 /* When emitting push, take care for source operands on the stack. */
25637 if (push && MEM_P (operands[1])
25638 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25640 rtx src_base = XEXP (part[1][nparts - 1], 0);
25642 /* Compensate for the stack decrement by 4. */
25643 if (!TARGET_64BIT && nparts == 3
25644 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25645 src_base = plus_constant (Pmode, src_base, 4);
25647 /* src_base refers to the stack pointer and is
25648 automatically decreased by emitted push. */
25649 for (i = 0; i < nparts; i++)
25650 part[1][i] = change_address (part[1][i],
25651 GET_MODE (part[1][i]), src_base);
25654 /* We need to do the copy in the right order in case an address register
25655 of the source overlaps the destination.  */
25656 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25658 rtx tmp;
25660 for (i = 0; i < nparts; i++)
25662 collisionparts[i]
25663 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25664 if (collisionparts[i])
25665 collisions++;
25668 /* Collision in the middle part can be handled by reordering. */
25669 if (collisions == 1 && nparts == 3 && collisionparts [1])
25671 std::swap (part[0][1], part[0][2]);
25672 std::swap (part[1][1], part[1][2]);
25674 else if (collisions == 1
25675 && nparts == 4
25676 && (collisionparts [1] || collisionparts [2]))
25678 if (collisionparts [1])
25680 std::swap (part[0][1], part[0][2]);
25681 std::swap (part[1][1], part[1][2]);
25683 else
25685 std::swap (part[0][2], part[0][3]);
25686 std::swap (part[1][2], part[1][3]);
25690 /* If there are more collisions, we can't handle it by reordering.
25691 Do an lea to the last part and use only one colliding move. */
25692 else if (collisions > 1)
25694 rtx base, addr;
25696 collisions = 1;
25698 base = part[0][nparts - 1];
25700 /* Handle the case when the last part isn't valid for lea.
25701 Happens in 64-bit mode storing the 12-byte XFmode. */
25702 if (GET_MODE (base) != Pmode)
25703 base = gen_rtx_REG (Pmode, REGNO (base));
25705 addr = XEXP (part[1][0], 0);
25706 if (TARGET_TLS_DIRECT_SEG_REFS)
25708 struct ix86_address parts;
25709 int ok = ix86_decompose_address (addr, &parts);
25710 gcc_assert (ok);
25711 /* It is not valid to use %gs: or %fs: in lea. */
25712 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25714 emit_insn (gen_rtx_SET (base, addr));
25715 part[1][0] = replace_equiv_address (part[1][0], base);
25716 for (i = 1; i < nparts; i++)
25718 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25719 part[1][i] = replace_equiv_address (part[1][i], tmp);
25724 if (push)
25726 if (!TARGET_64BIT)
25728 if (nparts == 3)
25730 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25731 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25732 stack_pointer_rtx, GEN_INT (-4)));
25733 emit_move_insn (part[0][2], part[1][2]);
25735 else if (nparts == 4)
25737 emit_move_insn (part[0][3], part[1][3]);
25738 emit_move_insn (part[0][2], part[1][2]);
25741 else
25743 /* In 64-bit mode we don't have a 32-bit push available.  If this is
25744 a register, that is OK - we will just use the larger counterpart.  We also
25745 retype memories - these come from an attempt to avoid a REX prefix on
25746 the move of the second half of a TFmode value.  */
25747 if (GET_MODE (part[1][1]) == SImode)
25749 switch (GET_CODE (part[1][1]))
25751 case MEM:
25752 part[1][1] = adjust_address (part[1][1], DImode, 0);
25753 break;
25755 case REG:
25756 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25757 break;
25759 default:
25760 gcc_unreachable ();
25763 if (GET_MODE (part[1][0]) == SImode)
25764 part[1][0] = part[1][1];
25767 emit_move_insn (part[0][1], part[1][1]);
25768 emit_move_insn (part[0][0], part[1][0]);
25769 return;
25772 /* Choose the correct order so as not to overwrite the source before it is copied.  */
25773 if ((REG_P (part[0][0])
25774 && REG_P (part[1][1])
25775 && (REGNO (part[0][0]) == REGNO (part[1][1])
25776 || (nparts == 3
25777 && REGNO (part[0][0]) == REGNO (part[1][2]))
25778 || (nparts == 4
25779 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25780 || (collisions > 0
25781 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25783 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25785 operands[2 + i] = part[0][j];
25786 operands[6 + i] = part[1][j];
25789 else
25791 for (i = 0; i < nparts; i++)
25793 operands[2 + i] = part[0][i];
25794 operands[6 + i] = part[1][i];
25798 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25799 if (optimize_insn_for_size_p ())
25801 for (j = 0; j < nparts - 1; j++)
25802 if (CONST_INT_P (operands[6 + j])
25803 && operands[6 + j] != const0_rtx
25804 && REG_P (operands[2 + j]))
25805 for (i = j; i < nparts - 1; i++)
25806 if (CONST_INT_P (operands[7 + i])
25807 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25808 operands[7 + i] = operands[2 + j];
25811 for (i = 0; i < nparts; i++)
25812 emit_move_insn (operands[2 + i], operands[6 + i]);
25814 return;
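/* As a concrete illustration (a sketch of the common case): a DImode move on
   ia32 becomes two SImode moves of the low and high words.  When a destination
   register also serves as the base register of a memory source, the parts are
   emitted in reverse order where that suffices, and otherwise the source
   address is first copied into the last destination part (an lea-style SET)
   so that only one colliding move remains.  */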
25817 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25818 left shift by a constant, either using a single shift or
25819 a sequence of add instructions. */
25821 static void
25822 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25824 rtx (*insn)(rtx, rtx, rtx);
25826 if (count == 1
25827 || (count * ix86_cost->add <= ix86_cost->shift_const
25828 && !optimize_insn_for_size_p ()))
25830 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25831 while (count-- > 0)
25832 emit_insn (insn (operand, operand, operand));
25834 else
25836 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25837 emit_insn (insn (operand, operand, GEN_INT (count)));
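/* For instance, shifting one of the half-word parts left by 1 is emitted as a
   single "add reg, reg"; by 2 it may become two such adds when the add cost is
   low enough and we are not optimizing for size, otherwise a plain shift by
   the constant is used.  */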
25841 void
25842 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25844 rtx (*gen_ashl3)(rtx, rtx, rtx);
25845 rtx (*gen_shld)(rtx, rtx, rtx);
25846 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25848 rtx low[2], high[2];
25849 int count;
25851 if (CONST_INT_P (operands[2]))
25853 split_double_mode (mode, operands, 2, low, high);
25854 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25856 if (count >= half_width)
25858 emit_move_insn (high[0], low[1]);
25859 emit_move_insn (low[0], const0_rtx);
25861 if (count > half_width)
25862 ix86_expand_ashl_const (high[0], count - half_width, mode);
25864 else
25866 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25868 if (!rtx_equal_p (operands[0], operands[1]))
25869 emit_move_insn (operands[0], operands[1]);
25871 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25872 ix86_expand_ashl_const (low[0], count, mode);
25874 return;
25877 split_double_mode (mode, operands, 1, low, high);
25879 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25881 if (operands[1] == const1_rtx)
25883 /* Assuming we've chosen QImode-capable registers, then 1 << N
25884 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
25885 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25887 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25889 ix86_expand_clear (low[0]);
25890 ix86_expand_clear (high[0]);
25891 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25893 d = gen_lowpart (QImode, low[0]);
25894 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25895 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25896 emit_insn (gen_rtx_SET (d, s));
25898 d = gen_lowpart (QImode, high[0]);
25899 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25900 s = gen_rtx_NE (QImode, flags, const0_rtx);
25901 emit_insn (gen_rtx_SET (d, s));
25904 /* Otherwise, we can get the same results by manually performing
25905 a bit extract operation on bit 5/6, and then performing the two
25906 shifts. The two methods of getting 0/1 into low/high are exactly
25907 the same size. Avoiding the shift in the bit extract case helps
25908 pentium4 a bit; no one else seems to care much either way. */
25909 else
25911 machine_mode half_mode;
25912 rtx (*gen_lshr3)(rtx, rtx, rtx);
25913 rtx (*gen_and3)(rtx, rtx, rtx);
25914 rtx (*gen_xor3)(rtx, rtx, rtx);
25915 HOST_WIDE_INT bits;
25916 rtx x;
25918 if (mode == DImode)
25920 half_mode = SImode;
25921 gen_lshr3 = gen_lshrsi3;
25922 gen_and3 = gen_andsi3;
25923 gen_xor3 = gen_xorsi3;
25924 bits = 5;
25926 else
25928 half_mode = DImode;
25929 gen_lshr3 = gen_lshrdi3;
25930 gen_and3 = gen_anddi3;
25931 gen_xor3 = gen_xordi3;
25932 bits = 6;
25935 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25936 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25937 else
25938 x = gen_lowpart (half_mode, operands[2]);
25939 emit_insn (gen_rtx_SET (high[0], x));
25941 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25942 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25943 emit_move_insn (low[0], high[0]);
25944 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25947 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25948 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25949 return;
25952 if (operands[1] == constm1_rtx)
25954 /* For -1 << N, we can avoid the shld instruction, because we
25955 know that we're shifting 0...31/63 ones into a -1. */
25956 emit_move_insn (low[0], constm1_rtx);
25957 if (optimize_insn_for_size_p ())
25958 emit_move_insn (high[0], low[0]);
25959 else
25960 emit_move_insn (high[0], constm1_rtx);
25962 else
25964 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25966 if (!rtx_equal_p (operands[0], operands[1]))
25967 emit_move_insn (operands[0], operands[1]);
25969 split_double_mode (mode, operands, 1, low, high);
25970 emit_insn (gen_shld (high[0], low[0], operands[2]));
25973 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25975 if (TARGET_CMOVE && scratch)
25977 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25978 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25980 ix86_expand_clear (scratch);
25981 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25983 else
25985 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25986 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25988 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
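/* A rough sketch of the variable-count DImode case on ia32 (value in
   high:low, count in %ecx), for the common shld path:
     shld %cl, low, high     ; shift count mod 32 bits from low into high
     sal  %cl, low
   followed by the adj_1/adj_2 pattern, which implements
     if (count & 32) { high = low; low = 0; }
   either with cmov (when a scratch register is available) or with a short
   branch sequence.  */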
25992 void
25993 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25995 rtx (*gen_ashr3)(rtx, rtx, rtx)
25996 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25997 rtx (*gen_shrd)(rtx, rtx, rtx);
25998 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26000 rtx low[2], high[2];
26001 int count;
26003 if (CONST_INT_P (operands[2]))
26005 split_double_mode (mode, operands, 2, low, high);
26006 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26008 if (count == GET_MODE_BITSIZE (mode) - 1)
26010 emit_move_insn (high[0], high[1]);
26011 emit_insn (gen_ashr3 (high[0], high[0],
26012 GEN_INT (half_width - 1)));
26013 emit_move_insn (low[0], high[0]);
26016 else if (count >= half_width)
26018 emit_move_insn (low[0], high[1]);
26019 emit_move_insn (high[0], low[0]);
26020 emit_insn (gen_ashr3 (high[0], high[0],
26021 GEN_INT (half_width - 1)));
26023 if (count > half_width)
26024 emit_insn (gen_ashr3 (low[0], low[0],
26025 GEN_INT (count - half_width)));
26027 else
26029 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26031 if (!rtx_equal_p (operands[0], operands[1]))
26032 emit_move_insn (operands[0], operands[1]);
26034 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26035 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
26038 else
26040 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26042 if (!rtx_equal_p (operands[0], operands[1]))
26043 emit_move_insn (operands[0], operands[1]);
26045 split_double_mode (mode, operands, 1, low, high);
26047 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26048 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26050 if (TARGET_CMOVE && scratch)
26052 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26053 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26055 emit_move_insn (scratch, high[0]);
26056 emit_insn (gen_ashr3 (scratch, scratch,
26057 GEN_INT (half_width - 1)));
26058 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26059 scratch));
26061 else
26063 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26064 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26066 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26071 void
26072 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26074 rtx (*gen_lshr3)(rtx, rtx, rtx)
26075 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26076 rtx (*gen_shrd)(rtx, rtx, rtx);
26077 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26079 rtx low[2], high[2];
26080 int count;
26082 if (CONST_INT_P (operands[2]))
26084 split_double_mode (mode, operands, 2, low, high);
26085 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26087 if (count >= half_width)
26089 emit_move_insn (low[0], high[1]);
26090 ix86_expand_clear (high[0]);
26092 if (count > half_width)
26093 emit_insn (gen_lshr3 (low[0], low[0],
26094 GEN_INT (count - half_width)));
26096 else
26098 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26100 if (!rtx_equal_p (operands[0], operands[1]))
26101 emit_move_insn (operands[0], operands[1]);
26103 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26104 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26107 else
26109 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26111 if (!rtx_equal_p (operands[0], operands[1]))
26112 emit_move_insn (operands[0], operands[1]);
26114 split_double_mode (mode, operands, 1, low, high);
26116 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26117 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26119 if (TARGET_CMOVE && scratch)
26121 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26122 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26124 ix86_expand_clear (scratch);
26125 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26126 scratch));
26128 else
26130 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26131 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26133 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26138 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
26139 static void
26140 predict_jump (int prob)
26142 rtx_insn *insn = get_last_insn ();
26143 gcc_assert (JUMP_P (insn));
26144 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26147 /* Helper function for the string operations below.  Test whether
26148 VARIABLE & VALUE is zero and, if so, jump to the returned label.  */
26149 static rtx_code_label *
26150 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26152 rtx_code_label *label = gen_label_rtx ();
26153 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26154 if (GET_MODE (variable) == DImode)
26155 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26156 else
26157 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26158 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26159 1, label);
26160 if (epilogue)
26161 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26162 else
26163 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26164 return label;
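/* For example, ix86_expand_aligntest (count, 4, true) emits roughly
   "test $4, count; je <label>" and returns <label>, so the caller can place a
   4-byte move before the label that is skipped whenever that bit of COUNT is
   clear.  */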
26167 /* Decrease COUNTREG by VALUE.  */
26168 static void
26169 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26171 rtx (*gen_add)(rtx, rtx, rtx)
26172 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26174 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26177 /* Zero extend possibly SImode EXP to Pmode register. */
26179 ix86_zero_extend_to_Pmode (rtx exp)
26181 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26184 /* Divide COUNTREG by SCALE. */
26185 static rtx
26186 scale_counter (rtx countreg, int scale)
26188 rtx sc;
26190 if (scale == 1)
26191 return countreg;
26192 if (CONST_INT_P (countreg))
26193 return GEN_INT (INTVAL (countreg) / scale);
26194 gcc_assert (REG_P (countreg));
26196 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26197 GEN_INT (exact_log2 (scale)),
26198 NULL, 1, OPTAB_DIRECT);
26199 return sc;
26202 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26203 DImode for constant loop counts. */
26205 static machine_mode
26206 counter_mode (rtx count_exp)
26208 if (GET_MODE (count_exp) != VOIDmode)
26209 return GET_MODE (count_exp);
26210 if (!CONST_INT_P (count_exp))
26211 return Pmode;
26212 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26213 return DImode;
26214 return SImode;
26217 /* Copy the address to a Pmode register. This is used for x32 to
26218 truncate DImode TLS address to a SImode register. */
26220 static rtx
26221 ix86_copy_addr_to_reg (rtx addr)
26223 rtx reg;
26224 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26226 reg = copy_addr_to_reg (addr);
26227 REG_POINTER (reg) = 1;
26228 return reg;
26230 else
26232 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26233 reg = copy_to_mode_reg (DImode, addr);
26234 REG_POINTER (reg) = 1;
26235 return gen_rtx_SUBREG (SImode, reg, 0);
26239 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
26240 by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the overall
26241 size is COUNT specified in bytes.  When ISSETMEM is TRUE, output the
26242 equivalent loop to fill the memory with VALUE (supposed to be in MODE).
26244 The size is rounded down to a whole number of chunks moved at once.
26245 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info.  */
26248 static void
26249 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26250 rtx destptr, rtx srcptr, rtx value,
26251 rtx count, machine_mode mode, int unroll,
26252 int expected_size, bool issetmem)
26254 rtx_code_label *out_label, *top_label;
26255 rtx iter, tmp;
26256 machine_mode iter_mode = counter_mode (count);
26257 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26258 rtx piece_size = GEN_INT (piece_size_n);
26259 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26260 rtx size;
26261 int i;
26263 top_label = gen_label_rtx ();
26264 out_label = gen_label_rtx ();
26265 iter = gen_reg_rtx (iter_mode);
26267 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26268 NULL, 1, OPTAB_DIRECT);
26269 /* Those two should combine. */
26270 if (piece_size == const1_rtx)
26272 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26273 true, out_label);
26274 predict_jump (REG_BR_PROB_BASE * 10 / 100);
26276 emit_move_insn (iter, const0_rtx);
26278 emit_label (top_label);
26280 tmp = convert_modes (Pmode, iter_mode, iter, true);
26282 /* This assert could be relaxed - in this case we'd need to compute the
26283 smallest power of two containing PIECE_SIZE_N and pass it to
26284 offset_address.  */
26285 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26286 destmem = offset_address (destmem, tmp, piece_size_n);
26287 destmem = adjust_address (destmem, mode, 0);
26289 if (!issetmem)
26291 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26292 srcmem = adjust_address (srcmem, mode, 0);
26294 /* When unrolling for chips that reorder memory reads and writes,
26295 we can save registers by using a single temporary.
26296 Also, using 4 temporaries is overkill in 32-bit mode.  */
26297 if (!TARGET_64BIT && 0)
26299 for (i = 0; i < unroll; i++)
26301 if (i)
26303 destmem =
26304 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26305 srcmem =
26306 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26308 emit_move_insn (destmem, srcmem);
26311 else
26313 rtx tmpreg[4];
26314 gcc_assert (unroll <= 4);
26315 for (i = 0; i < unroll; i++)
26317 tmpreg[i] = gen_reg_rtx (mode);
26318 if (i)
26320 srcmem =
26321 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26323 emit_move_insn (tmpreg[i], srcmem);
26325 for (i = 0; i < unroll; i++)
26327 if (i)
26329 destmem =
26330 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26332 emit_move_insn (destmem, tmpreg[i]);
26336 else
26337 for (i = 0; i < unroll; i++)
26339 if (i)
26340 destmem =
26341 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26342 emit_move_insn (destmem, value);
26345 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26346 true, OPTAB_LIB_WIDEN);
26347 if (tmp != iter)
26348 emit_move_insn (iter, tmp);
26350 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26351 true, top_label);
26352 if (expected_size != -1)
26354 expected_size /= GET_MODE_SIZE (mode) * unroll;
26355 if (expected_size == 0)
26356 predict_jump (0);
26357 else if (expected_size > REG_BR_PROB_BASE)
26358 predict_jump (REG_BR_PROB_BASE - 1);
26359 else
26360 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26362 else
26363 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26364 iter = ix86_zero_extend_to_Pmode (iter);
26365 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26366 true, OPTAB_LIB_WIDEN);
26367 if (tmp != destptr)
26368 emit_move_insn (destptr, tmp);
26369 if (!issetmem)
26371 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26372 true, OPTAB_LIB_WIDEN);
26373 if (tmp != srcptr)
26374 emit_move_insn (srcptr, tmp);
26376 emit_label (out_label);
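/* The generated loop has roughly this shape (a sketch; for ISSETMEM the loads
   from SRC are replaced by stores of VALUE):
       size = count & -(MODE_SIZE * UNROLL);
       iter = 0;
     top:
       copy UNROLL chunks of MODE_SIZE bytes from src + iter to dest + iter;
       iter += MODE_SIZE * UNROLL;
       if (iter < size) goto top;
       destptr += iter;  srcptr += iter;
     out:
   with an initial "if (size == 0) goto out" guard when the chunk size is a
   single byte.  */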
26379 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM argument.
26380 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26381 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26382 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26383 ORIG_VALUE is the original value passed to memset to fill the memory with.
26384 Other arguments have the same meaning as for the previous function.  */
26386 static void
26387 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26388 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26389 rtx count,
26390 machine_mode mode, bool issetmem)
26392 rtx destexp;
26393 rtx srcexp;
26394 rtx countreg;
26395 HOST_WIDE_INT rounded_count;
26397 /* If possible, it is shorter to use rep movs.
26398 TODO: Maybe it is better to move this logic to decide_alg. */
26399 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26400 && (!issetmem || orig_value == const0_rtx))
26401 mode = SImode;
26403 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26404 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26406 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26407 GET_MODE_SIZE (mode)));
26408 if (mode != QImode)
26410 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26411 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26412 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26414 else
26415 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26416 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26418 rounded_count
26419 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26420 destmem = shallow_copy_rtx (destmem);
26421 set_mem_size (destmem, rounded_count);
26423 else if (MEM_SIZE_KNOWN_P (destmem))
26424 clear_mem_size (destmem);
26426 if (issetmem)
26428 value = force_reg (mode, gen_lowpart (mode, value));
26429 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26431 else
26433 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26434 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26435 if (mode != QImode)
26437 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26438 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26439 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26441 else
26442 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26443 if (CONST_INT_P (count))
26445 rounded_count
26446 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26447 srcmem = shallow_copy_rtx (srcmem);
26448 set_mem_size (srcmem, rounded_count);
26450 else
26452 if (MEM_SIZE_KNOWN_P (srcmem))
26453 clear_mem_size (srcmem);
26455 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26456 destexp, srcexp));
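/* For instance (a sketch): a memcpy whose constant COUNT is a multiple of 4
   is promoted from QImode to SImode above, so the emitted sequence amounts to
   scaling the count (divided at compile time for a constant, or by a logical
   shift otherwise) and issuing "rep movsd"; a zeroing memset is promoted the
   same way and becomes "rep stosd".  */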
26460 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26461 DESTMEM.
26462 SRCMEM is passed by pointer so it can be updated on return.
26463 The return value is the updated DESTMEM.  */
26464 static rtx
26465 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26466 HOST_WIDE_INT size_to_move)
26468 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26469 enum insn_code code;
26470 machine_mode move_mode;
26471 int piece_size, i;
26473 /* Find the widest mode in which we could perform moves.
26474 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
26475 it until a move of that size is supported.  */
26476 piece_size = 1 << floor_log2 (size_to_move);
26477 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
26478 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26480 gcc_assert (piece_size > 1);
26481 piece_size >>= 1;
26484 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26485 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26486 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26488 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26489 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
26490 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26492 move_mode = word_mode;
26493 piece_size = GET_MODE_SIZE (move_mode);
26494 code = optab_handler (mov_optab, move_mode);
26497 gcc_assert (code != CODE_FOR_nothing);
26499 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26500 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26502 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
26503 gcc_assert (size_to_move % piece_size == 0);
26504 adjust = GEN_INT (piece_size);
26505 for (i = 0; i < size_to_move; i += piece_size)
26507 /* We move from memory to memory, so we'll need to do it via
26508 a temporary register. */
26509 tempreg = gen_reg_rtx (move_mode);
26510 emit_insn (GEN_FCN (code) (tempreg, src));
26511 emit_insn (GEN_FCN (code) (dst, tempreg));
26513 emit_move_insn (destptr,
26514 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26515 emit_move_insn (srcptr,
26516 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26518 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26519 piece_size);
26520 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26521 piece_size);
26524 /* Update DST and SRC rtx. */
26525 *srcmem = src;
26526 return dst;
26529 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26530 static void
26531 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26532 rtx destptr, rtx srcptr, rtx count, int max_size)
26534 rtx src, dest;
26535 if (CONST_INT_P (count))
26537 HOST_WIDE_INT countval = INTVAL (count);
26538 HOST_WIDE_INT epilogue_size = countval % max_size;
26539 int i;
26541 /* For now MAX_SIZE should be a power of 2. This assert could be
26542 relaxed, but it'll require a bit more complicated epilogue
26543 expanding. */
26544 gcc_assert ((max_size & (max_size - 1)) == 0);
26545 for (i = max_size; i >= 1; i >>= 1)
26547 if (epilogue_size & i)
26548 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26550 return;
26552 if (max_size > 8)
26554 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26555 count, 1, OPTAB_DIRECT);
26556 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26557 count, QImode, 1, 4, false);
26558 return;
26561 /* When there are stringops, we can cheaply increase dest and src pointers.
26562 Otherwise we save code size by maintaining offset (zero is readily
26563 available from preceding rep operation) and using x86 addressing modes.
26565 if (TARGET_SINGLE_STRINGOP)
26567 if (max_size > 4)
26569 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26570 src = change_address (srcmem, SImode, srcptr);
26571 dest = change_address (destmem, SImode, destptr);
26572 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26573 emit_label (label);
26574 LABEL_NUSES (label) = 1;
26576 if (max_size > 2)
26578 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26579 src = change_address (srcmem, HImode, srcptr);
26580 dest = change_address (destmem, HImode, destptr);
26581 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26582 emit_label (label);
26583 LABEL_NUSES (label) = 1;
26585 if (max_size > 1)
26587 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26588 src = change_address (srcmem, QImode, srcptr);
26589 dest = change_address (destmem, QImode, destptr);
26590 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26591 emit_label (label);
26592 LABEL_NUSES (label) = 1;
26595 else
26597 rtx offset = force_reg (Pmode, const0_rtx);
26598 rtx tmp;
26600 if (max_size > 4)
26602 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26603 src = change_address (srcmem, SImode, srcptr);
26604 dest = change_address (destmem, SImode, destptr);
26605 emit_move_insn (dest, src);
26606 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26607 true, OPTAB_LIB_WIDEN);
26608 if (tmp != offset)
26609 emit_move_insn (offset, tmp);
26610 emit_label (label);
26611 LABEL_NUSES (label) = 1;
26613 if (max_size > 2)
26615 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26616 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26617 src = change_address (srcmem, HImode, tmp);
26618 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26619 dest = change_address (destmem, HImode, tmp);
26620 emit_move_insn (dest, src);
26621 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26622 true, OPTAB_LIB_WIDEN);
26623 if (tmp != offset)
26624 emit_move_insn (offset, tmp);
26625 emit_label (label);
26626 LABEL_NUSES (label) = 1;
26628 if (max_size > 1)
26630 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26631 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26632 src = change_address (srcmem, QImode, tmp);
26633 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26634 dest = change_address (destmem, QImode, tmp);
26635 emit_move_insn (dest, src);
26636 emit_label (label);
26637 LABEL_NUSES (label) = 1;
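/* Example of the constant-count case (a sketch): with MAX_SIZE == 16 and a
   residual of COUNT % 16 == 7, the code above emits a 4-byte, a 2-byte and a
   1-byte move - one per set bit of the residual - with no loop or conditional
   jumps.  */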
26642 /* This function emits stores to fill SIZE_TO_MOVE bytes starting at DESTMEM
26643 with the value PROMOTED_VAL.
26644 DESTPTR is advanced as the stores are emitted.
26645 The return value is the updated DESTMEM.  */
26646 static rtx
26647 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26648 HOST_WIDE_INT size_to_move)
26650 rtx dst = destmem, adjust;
26651 enum insn_code code;
26652 machine_mode move_mode;
26653 int piece_size, i;
26655 /* Choose the mode in which to perform the stores: the mode of
26656 PROMOTED_VAL (QImode when it is VOIDmode), narrowed to an integer
26657 mode when SIZE_TO_MOVE is smaller than that mode.  */
26658 move_mode = GET_MODE (promoted_val);
26659 if (move_mode == VOIDmode)
26660 move_mode = QImode;
26661 if (size_to_move < GET_MODE_SIZE (move_mode))
26663 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26664 move_mode = int_mode_for_size (move_bits, 0).require ();
26665 promoted_val = gen_lowpart (move_mode, promoted_val);
26667 piece_size = GET_MODE_SIZE (move_mode);
26668 code = optab_handler (mov_optab, move_mode);
26669 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26671 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26673 /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZE moves.  */
26674 gcc_assert (size_to_move % piece_size == 0);
26675 adjust = GEN_INT (piece_size);
26676 for (i = 0; i < size_to_move; i += piece_size)
26678 if (piece_size <= GET_MODE_SIZE (word_mode))
26680 emit_insn (gen_strset (destptr, dst, promoted_val));
26681 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26682 piece_size);
26683 continue;
26686 emit_insn (GEN_FCN (code) (dst, promoted_val));
26688 emit_move_insn (destptr,
26689 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26691 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26692 piece_size);
26695 /* Update DST rtx. */
26696 return dst;
26698 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
26699 static void
26700 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26701 rtx count, int max_size)
26703 count =
26704 expand_simple_binop (counter_mode (count), AND, count,
26705 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26706 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26707 gen_lowpart (QImode, value), count, QImode,
26708 1, max_size / 2, true);
26711 /* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
26712 static void
26713 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26714 rtx count, int max_size)
26716 rtx dest;
26718 if (CONST_INT_P (count))
26720 HOST_WIDE_INT countval = INTVAL (count);
26721 HOST_WIDE_INT epilogue_size = countval % max_size;
26722 int i;
26724 /* For now MAX_SIZE should be a power of 2. This assert could be
26725 relaxed, but it'll require a bit more complicated epilogue
26726 expanding. */
26727 gcc_assert ((max_size & (max_size - 1)) == 0);
26728 for (i = max_size; i >= 1; i >>= 1)
26730 if (epilogue_size & i)
26732 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26733 destmem = emit_memset (destmem, destptr, vec_value, i);
26734 else
26735 destmem = emit_memset (destmem, destptr, value, i);
26738 return;
26740 if (max_size > 32)
26742 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26743 return;
26745 if (max_size > 16)
26747 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26748 if (TARGET_64BIT)
26750 dest = change_address (destmem, DImode, destptr);
26751 emit_insn (gen_strset (destptr, dest, value));
26752 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26753 emit_insn (gen_strset (destptr, dest, value));
26755 else
26757 dest = change_address (destmem, SImode, destptr);
26758 emit_insn (gen_strset (destptr, dest, value));
26759 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26760 emit_insn (gen_strset (destptr, dest, value));
26761 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26762 emit_insn (gen_strset (destptr, dest, value));
26763 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26764 emit_insn (gen_strset (destptr, dest, value));
26766 emit_label (label);
26767 LABEL_NUSES (label) = 1;
26769 if (max_size > 8)
26771 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26772 if (TARGET_64BIT)
26774 dest = change_address (destmem, DImode, destptr);
26775 emit_insn (gen_strset (destptr, dest, value));
26777 else
26779 dest = change_address (destmem, SImode, destptr);
26780 emit_insn (gen_strset (destptr, dest, value));
26781 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26782 emit_insn (gen_strset (destptr, dest, value));
26784 emit_label (label);
26785 LABEL_NUSES (label) = 1;
26787 if (max_size > 4)
26789 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26790 dest = change_address (destmem, SImode, destptr);
26791 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26792 emit_label (label);
26793 LABEL_NUSES (label) = 1;
26795 if (max_size > 2)
26797 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26798 dest = change_address (destmem, HImode, destptr);
26799 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26800 emit_label (label);
26801 LABEL_NUSES (label) = 1;
26803 if (max_size > 1)
26805 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26806 dest = change_address (destmem, QImode, destptr);
26807 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26808 emit_label (label);
26809 LABEL_NUSES (label) = 1;
26813 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM, or
26814 store enough bytes into DESTMEM, to align it to DESIRED_ALIGNMENT.  The
26815 original alignment is ALIGN.  Depending on ISSETMEM, either the arguments
26816 SRCMEM/SRCPTR or VALUE/VEC_VALUE are ignored.
26817 The return value is the updated DESTMEM.  */
26818 static rtx
26819 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26820 rtx destptr, rtx srcptr, rtx value,
26821 rtx vec_value, rtx count, int align,
26822 int desired_alignment, bool issetmem)
26824 int i;
26825 for (i = 1; i < desired_alignment; i <<= 1)
26827 if (align <= i)
26829 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26830 if (issetmem)
26832 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26833 destmem = emit_memset (destmem, destptr, vec_value, i);
26834 else
26835 destmem = emit_memset (destmem, destptr, value, i);
26837 else
26838 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26839 ix86_adjust_counter (count, i);
26840 emit_label (label);
26841 LABEL_NUSES (label) = 1;
26842 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26845 return destmem;
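/* For example (a sketch): with ALIGN == 1 and DESIRED_ALIGNMENT == 16 the
   prologue emits four guarded steps - test bits 1, 2, 4 and 8 of DESTPTR and,
   when the bit is set, copy (or store) that many bytes and decrease COUNT -
   after which DESTMEM is known to be 16-byte aligned.  */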
26848 /* Test if COUNT & SIZE is nonzero and, if so, expand a movmem
26849 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26850 and jump to DONE_LABEL.  */
26851 static void
26852 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26853 rtx destptr, rtx srcptr,
26854 rtx value, rtx vec_value,
26855 rtx count, int size,
26856 rtx done_label, bool issetmem)
26858 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26859 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26860 rtx modesize;
26861 int n;
26863 /* If we do not have a vector value to copy, we must reduce the size.  */
26864 if (issetmem)
26866 if (!vec_value)
26868 if (GET_MODE (value) == VOIDmode && size > 8)
26869 mode = Pmode;
26870 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26871 mode = GET_MODE (value);
26873 else
26874 mode = GET_MODE (vec_value), value = vec_value;
26876 else
26878 /* Choose appropriate vector mode. */
26879 if (size >= 32)
26880 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26881 else if (size >= 16)
26882 mode = TARGET_SSE ? V16QImode : DImode;
26883 srcmem = change_address (srcmem, mode, srcptr);
26885 destmem = change_address (destmem, mode, destptr);
26886 modesize = GEN_INT (GET_MODE_SIZE (mode));
26887 gcc_assert (GET_MODE_SIZE (mode) <= size);
26888 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26890 if (issetmem)
26891 emit_move_insn (destmem, gen_lowpart (mode, value));
26892 else
26894 emit_move_insn (destmem, srcmem);
26895 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26897 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26900 destmem = offset_address (destmem, count, 1);
26901 destmem = offset_address (destmem, GEN_INT (-2 * size),
26902 GET_MODE_SIZE (mode));
26903 if (!issetmem)
26905 srcmem = offset_address (srcmem, count, 1);
26906 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26907 GET_MODE_SIZE (mode));
26909 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26911 if (issetmem)
26912 emit_move_insn (destmem, gen_lowpart (mode, value));
26913 else
26915 emit_move_insn (destmem, srcmem);
26916 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26918 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26920 emit_jump_insn (gen_jump (done_label));
26921 emit_barrier ();
26923 emit_label (label);
26924 LABEL_NUSES (label) = 1;
26927 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26928 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26929 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT so that we can
26930 proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
26931 DONE_LABEL is a label after the whole copying sequence.  The label is created
26932 on demand if *DONE_LABEL is NULL.
26933 MIN_SIZE is the minimal size of the copied block.  This value gets adjusted for the new
26934 bounds after the initial copies.
26936 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26937 DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
26938 we will dispatch to a library call for large blocks.
26940 In pseudocode we do:
26942 if (COUNT < SIZE)
26944 Assume that SIZE is 4. Bigger sizes are handled analogously
26945 if (COUNT & 4)
26947 copy 4 bytes from SRCPTR to DESTPTR
26948 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26949 goto done_label
26951 if (!COUNT)
26952 goto done_label;
26953 copy 1 byte from SRCPTR to DESTPTR
26954 if (COUNT & 2)
26956 copy 2 bytes from SRCPTR to DESTPTR
26957 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26960 else
26962 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26963 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26965 OLD_DESTPTR = DESTPTR;
26966 Align DESTPTR up to DESIRED_ALIGN
26967 SRCPTR += DESTPTR - OLD_DESTPTR
26968 COUNT -= DESTPTR - OLD_DESTPTR
26969 if (DYNAMIC_CHECK)
26970 Round COUNT down to multiple of SIZE
26971 << optional caller supplied zero size guard is here >>
26972 << optional caller supplied dynamic check is here >>
26973 << caller supplied main copy loop is here >>
26975 done_label:
26977 static void
26978 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26979 rtx *destptr, rtx *srcptr,
26980 machine_mode mode,
26981 rtx value, rtx vec_value,
26982 rtx *count,
26983 rtx_code_label **done_label,
26984 int size,
26985 int desired_align,
26986 int align,
26987 unsigned HOST_WIDE_INT *min_size,
26988 bool dynamic_check,
26989 bool issetmem)
26991 rtx_code_label *loop_label = NULL, *label;
26992 int n;
26993 rtx modesize;
26994 int prolog_size = 0;
26995 rtx mode_value;
26997 /* Choose the proper value to copy.  */
26998 if (issetmem && VECTOR_MODE_P (mode))
26999 mode_value = vec_value;
27000 else
27001 mode_value = value;
27002 gcc_assert (GET_MODE_SIZE (mode) <= size);
27004 /* See if block is big or small, handle small blocks. */
27005 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
27007 int size2 = size;
27008 loop_label = gen_label_rtx ();
27010 if (!*done_label)
27011 *done_label = gen_label_rtx ();
27013 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
27014 1, loop_label);
27015 size2 >>= 1;
27017 /* Handle sizes > 3. */
27018 for (;size2 > 2; size2 >>= 1)
27019 expand_small_movmem_or_setmem (destmem, srcmem,
27020 *destptr, *srcptr,
27021 value, vec_value,
27022 *count,
27023 size2, *done_label, issetmem);
27024 /* Nothing to copy? Jump to DONE_LABEL if so */
27025 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
27026 1, *done_label);
27028 /* Do a byte copy. */
27029 destmem = change_address (destmem, QImode, *destptr);
27030 if (issetmem)
27031 emit_move_insn (destmem, gen_lowpart (QImode, value));
27032 else
27034 srcmem = change_address (srcmem, QImode, *srcptr);
27035 emit_move_insn (destmem, srcmem);
27038 /* Handle sizes 2 and 3. */
27039 label = ix86_expand_aligntest (*count, 2, false);
27040 destmem = change_address (destmem, HImode, *destptr);
27041 destmem = offset_address (destmem, *count, 1);
27042 destmem = offset_address (destmem, GEN_INT (-2), 2);
27043 if (issetmem)
27044 emit_move_insn (destmem, gen_lowpart (HImode, value));
27045 else
27047 srcmem = change_address (srcmem, HImode, *srcptr);
27048 srcmem = offset_address (srcmem, *count, 1);
27049 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27050 emit_move_insn (destmem, srcmem);
27053 emit_label (label);
27054 LABEL_NUSES (label) = 1;
27055 emit_jump_insn (gen_jump (*done_label));
27056 emit_barrier ();
27058 else
27059 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27060 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27062 /* Start memcpy for COUNT >= SIZE. */
27063 if (loop_label)
27065 emit_label (loop_label);
27066 LABEL_NUSES (loop_label) = 1;
27069 /* Copy first desired_align bytes. */
27070 if (!issetmem)
27071 srcmem = change_address (srcmem, mode, *srcptr);
27072 destmem = change_address (destmem, mode, *destptr);
27073 modesize = GEN_INT (GET_MODE_SIZE (mode));
27074 for (n = 0; prolog_size < desired_align - align; n++)
27076 if (issetmem)
27077 emit_move_insn (destmem, mode_value);
27078 else
27080 emit_move_insn (destmem, srcmem);
27081 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27083 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27084 prolog_size += GET_MODE_SIZE (mode);
27088 /* Copy last SIZE bytes. */
27089 destmem = offset_address (destmem, *count, 1);
27090 destmem = offset_address (destmem,
27091 GEN_INT (-size - prolog_size),
27093 if (issetmem)
27094 emit_move_insn (destmem, mode_value);
27095 else
27097 srcmem = offset_address (srcmem, *count, 1);
27098 srcmem = offset_address (srcmem,
27099 GEN_INT (-size - prolog_size),
27101 emit_move_insn (destmem, srcmem);
27103 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27105 destmem = offset_address (destmem, modesize, 1);
27106 if (issetmem)
27107 emit_move_insn (destmem, mode_value);
27108 else
27110 srcmem = offset_address (srcmem, modesize, 1);
27111 emit_move_insn (destmem, srcmem);
27115 /* Align destination. */
27116 if (desired_align > 1 && desired_align > align)
27118 rtx saveddest = *destptr;
27120 gcc_assert (desired_align <= size);
27121 /* Align destptr up, place it to new register. */
27122 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27123 GEN_INT (prolog_size),
27124 NULL_RTX, 1, OPTAB_DIRECT);
27125 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27126 REG_POINTER (*destptr) = 1;
27127 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27128 GEN_INT (-desired_align),
27129 *destptr, 1, OPTAB_DIRECT);
27130 /* See how many bytes we skipped. */
27131 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27132 *destptr,
27133 saveddest, 1, OPTAB_DIRECT);
27134 /* Adjust srcptr and count. */
27135 if (!issetmem)
27136 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27137 saveddest, *srcptr, 1, OPTAB_DIRECT);
27138 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27139 saveddest, *count, 1, OPTAB_DIRECT);
27140 /* We copied at most size + prolog_size. */
27141 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27142 *min_size
27143 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27144 else
27145 *min_size = 0;
27147 /* Our loops always round down the block size, but for dispatch to the
27148 library we need the precise value.  */
27149 if (dynamic_check)
27150 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27151 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27153 else
27155 gcc_assert (prolog_size == 0);
27156 /* Decrease count, so we won't end up copying last word twice. */
27157 if (!CONST_INT_P (*count))
27158 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27159 constm1_rtx, *count, 1, OPTAB_DIRECT);
27160 else
27161 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27162 (unsigned HOST_WIDE_INT)size));
27163 if (*min_size)
27164 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27169 /* This function is like the previous one, except here we know how many bytes
27170 need to be copied. That allows us to update alignment not only of DST, which
27171 is returned, but also of SRC, which is passed as a pointer for that
27172 reason. */
27173 static rtx
27174 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27175 rtx srcreg, rtx value, rtx vec_value,
27176 int desired_align, int align_bytes,
27177 bool issetmem)
27179 rtx src = NULL;
27180 rtx orig_dst = dst;
27181 rtx orig_src = NULL;
27182 int piece_size = 1;
27183 int copied_bytes = 0;
27185 if (!issetmem)
27187 gcc_assert (srcp != NULL);
27188 src = *srcp;
27189 orig_src = src;
27192 for (piece_size = 1;
27193 piece_size <= desired_align && copied_bytes < align_bytes;
27194 piece_size <<= 1)
27196 if (align_bytes & piece_size)
27198 if (issetmem)
27200 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27201 dst = emit_memset (dst, destreg, vec_value, piece_size);
27202 else
27203 dst = emit_memset (dst, destreg, value, piece_size);
27205 else
27206 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27207 copied_bytes += piece_size;
27210 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27211 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27212 if (MEM_SIZE_KNOWN_P (orig_dst))
27213 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27215 if (!issetmem)
27217 int src_align_bytes = get_mem_align_offset (src, desired_align
27218 * BITS_PER_UNIT);
27219 if (src_align_bytes >= 0)
27220 src_align_bytes = desired_align - src_align_bytes;
27221 if (src_align_bytes >= 0)
27223 unsigned int src_align;
27224 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27226 if ((src_align_bytes & (src_align - 1))
27227 == (align_bytes & (src_align - 1)))
27228 break;
27230 if (src_align > (unsigned int) desired_align)
27231 src_align = desired_align;
27232 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27233 set_mem_align (src, src_align * BITS_PER_UNIT);
27235 if (MEM_SIZE_KNOWN_P (orig_src))
27236 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27237 *srcp = src;
27240 return dst;
27243 /* Return true if ALG can be used in current context.
27244 Assume we expand memset if MEMSET is true. */
27245 static bool
27246 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27248 if (alg == no_stringop)
27249 return false;
27250 if (alg == vector_loop)
27251 return TARGET_SSE || TARGET_AVX;
27252 /* Algorithms using the rep prefix want at least edi and ecx;
27253 additionally, memset wants eax and memcpy wants esi. Don't
27254 consider such algorithms if the user has appropriated those
27255 registers for their own purposes, or if we have a non-default
27256 address space, since some string insns cannot override the segment. */
27257 if (alg == rep_prefix_1_byte
27258 || alg == rep_prefix_4_byte
27259 || alg == rep_prefix_8_byte)
27261 if (have_as)
27262 return false;
27263 if (fixed_regs[CX_REG]
27264 || fixed_regs[DI_REG]
27265 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27266 return false;
27268 return true;
27271 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
27272 static enum stringop_alg
27273 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27274 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27275 bool memset, bool zero_memset, bool have_as,
27276 int *dynamic_check, bool *noalign, bool recur)
27278 const struct stringop_algs *algs;
27279 bool optimize_for_speed;
27280 int max = 0;
27281 const struct processor_costs *cost;
27282 int i;
27283 bool any_alg_usable_p = false;
27285 *noalign = false;
27286 *dynamic_check = -1;
27288 /* Even if the string operation call is cold, we still might spend a lot
27289 of time processing large blocks. */
27290 if (optimize_function_for_size_p (cfun)
27291 || (optimize_insn_for_size_p ()
27292 && (max_size < 256
27293 || (expected_size != -1 && expected_size < 256))))
27294 optimize_for_speed = false;
27295 else
27296 optimize_for_speed = true;
27298 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27299 if (memset)
27300 algs = &cost->memset[TARGET_64BIT != 0];
27301 else
27302 algs = &cost->memcpy[TARGET_64BIT != 0];
27304 /* See maximal size for user defined algorithm. */
27305 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27307 enum stringop_alg candidate = algs->size[i].alg;
27308 bool usable = alg_usable_p (candidate, memset, have_as);
27309 any_alg_usable_p |= usable;
27311 if (candidate != libcall && candidate && usable)
27312 max = algs->size[i].max;
27315 /* If the expected size is not known but the max size is small enough
27316 that the inline version is a win, set the expected size into
27317 the range. */
27318 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27319 && expected_size == -1)
27320 expected_size = min_size / 2 + max_size / 2;
27322 /* If user specified the algorithm, honor it if possible. */
27323 if (ix86_stringop_alg != no_stringop
27324 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27325 return ix86_stringop_alg;
27326 /* rep; movq or rep; movl is the smallest variant. */
27327 else if (!optimize_for_speed)
27329 *noalign = true;
27330 if (!count || (count & 3) || (memset && !zero_memset))
27331 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27332 ? rep_prefix_1_byte : loop_1_byte;
27333 else
27334 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27335 ? rep_prefix_4_byte : loop;
27337 /* Very tiny blocks are best handled via the loop; REP is expensive to
27338 set up. */
27339 else if (expected_size != -1 && expected_size < 4)
27340 return loop_1_byte;
27341 else if (expected_size != -1)
27343 enum stringop_alg alg = libcall;
27344 bool alg_noalign = false;
27345 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27347 /* We get here if the algorithms that were not libcall-based
27348 were rep-prefix based and we are unable to use rep prefixes
27349 based on global register usage. Break out of the loop and
27350 use the heuristic below. */
27351 if (algs->size[i].max == 0)
27352 break;
27353 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27355 enum stringop_alg candidate = algs->size[i].alg;
27357 if (candidate != libcall
27358 && alg_usable_p (candidate, memset, have_as))
27360 alg = candidate;
27361 alg_noalign = algs->size[i].noalign;
27363 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27364 last non-libcall inline algorithm. */
27365 if (TARGET_INLINE_ALL_STRINGOPS)
27367 /* When the current size is best copied by a libcall,
27368 but we are still forced to inline, run the heuristic below
27369 that will pick code for medium-sized blocks. */
27370 if (alg != libcall)
27372 *noalign = alg_noalign;
27373 return alg;
27375 else if (!any_alg_usable_p)
27376 break;
27378 else if (alg_usable_p (candidate, memset, have_as))
27380 *noalign = algs->size[i].noalign;
27381 return candidate;
27386 /* When asked to inline the call anyway, try to pick a meaningful choice.
27387 We look for the maximal size of a block that is faster to copy by hand
27388 and take blocks of at most that size, guessing that the average size
27389 will be roughly half of the block.
27391 If this turns out to be bad, we might simply specify the preferred
27392 choice in ix86_costs. */
27393 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27394 && (algs->unknown_size == libcall
27395 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27397 enum stringop_alg alg;
27398 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27400 /* If there aren't any usable algorithms or if recursing already,
27401 then recursing on smaller sizes or same size isn't going to
27402 find anything. Just return the simple byte-at-a-time copy loop. */
27403 if (!any_alg_usable_p || recur)
27405 /* Pick something reasonable. */
27406 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27407 *dynamic_check = 128;
27408 return loop_1_byte;
27410 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27411 zero_memset, have_as, dynamic_check, noalign, true);
27412 gcc_assert (*dynamic_check == -1);
27413 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27414 *dynamic_check = max;
27415 else
27416 gcc_assert (alg != libcall);
27417 return alg;
27419 return (alg_usable_p (algs->unknown_size, memset, have_as)
27420 ? algs->unknown_size : libcall);
27423 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27424 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
27425 static int
27426 decide_alignment (int align,
27427 enum stringop_alg alg,
27428 int expected_size,
27429 machine_mode move_mode)
27431 int desired_align = 0;
27433 gcc_assert (alg != no_stringop);
27435 if (alg == libcall)
27436 return 0;
27437 if (move_mode == VOIDmode)
27438 return 0;
27440 desired_align = GET_MODE_SIZE (move_mode);
27441 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
27442 copying a whole cache line at once. */
27443 if (TARGET_PENTIUMPRO
27444 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27445 desired_align = 8;
27447 if (optimize_size)
27448 desired_align = 1;
27449 if (desired_align < align)
27450 desired_align = align;
27451 if (expected_size != -1 && expected_size < 4)
27452 desired_align = align;
27454 return desired_align;
27458 /* Helper function for memset. For QImode value 0xXY produce
27459 0xXYXYXYXY of the width specified by MODE. This is essentially
27460 a multiplication by 0x01010101, but we can do slightly better than
27461 synth_mult by unwinding the sequence by hand on CPUs with
27462 slow multiply. */
27463 static rtx
27464 promote_duplicated_reg (machine_mode mode, rtx val)
27466 machine_mode valmode = GET_MODE (val);
27467 rtx tmp;
27468 int nops = mode == DImode ? 3 : 2;
27470 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27471 if (val == const0_rtx)
27472 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27473 if (CONST_INT_P (val))
27475 HOST_WIDE_INT v = INTVAL (val) & 255;
27477 v |= v << 8;
27478 v |= v << 16;
27479 if (mode == DImode)
27480 v |= (v << 16) << 16;
27481 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27484 if (valmode == VOIDmode)
27485 valmode = QImode;
27486 if (valmode != QImode)
27487 val = gen_lowpart (QImode, val);
27488 if (mode == QImode)
27489 return val;
27490 if (!TARGET_PARTIAL_REG_STALL)
27491 nops--;
27492 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27493 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27494 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27495 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27497 rtx reg = convert_modes (mode, QImode, val, true);
27498 tmp = promote_duplicated_reg (mode, const1_rtx);
27499 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27500 OPTAB_DIRECT);
27502 else
27504 rtx reg = convert_modes (mode, QImode, val, true);
27506 if (!TARGET_PARTIAL_REG_STALL)
27507 if (mode == SImode)
27508 emit_insn (gen_insvsi_1 (reg, reg));
27509 else
27510 emit_insn (gen_insvdi_1 (reg, reg));
27511 else
27513 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27514 NULL, 1, OPTAB_DIRECT);
27515 reg =
27516 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27518 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27519 NULL, 1, OPTAB_DIRECT);
27520 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27521 if (mode == SImode)
27522 return reg;
27523 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27524 NULL, 1, OPTAB_DIRECT);
27525 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27526 return reg;
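/* Illustrative sketch, not part of the original source: a plain C model of
   the byte broadcast this helper emits, assuming a 64-bit value and the
   hypothetical name broadcast_byte_sketch.  For VAL = 0xAB it yields
   0xABABABABABABABAB, i.e. VAL * 0x0101010101010101.  */
#if 0
static unsigned long long
broadcast_byte_sketch (unsigned char val)
{
  unsigned long long v = val;	/* 0x00000000000000AB */
  v |= v << 8;			/* 0x000000000000ABAB */
  v |= v << 16;			/* 0x00000000ABABABAB */
  v |= v << 32;			/* 0xABABABABABABABAB */
  return v;
}
#endif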
27530 /* Duplicate value VAL using promote_duplicated_reg into the maximal size
27531 that will be needed by the main loop copying SIZE_NEEDED chunks and by
27532 the prologue getting alignment from ALIGN to DESIRED_ALIGN. */
27533 static rtx
27534 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27535 int align)
27537 rtx promoted_val;
27539 if (TARGET_64BIT
27540 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27541 promoted_val = promote_duplicated_reg (DImode, val);
27542 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27543 promoted_val = promote_duplicated_reg (SImode, val);
27544 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27545 promoted_val = promote_duplicated_reg (HImode, val);
27546 else
27547 promoted_val = val;
27549 return promoted_val;
27552 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27553 operations when profitable. The code depends upon architecture, block size
27554 and alignment, but always has one of the following overall structures:
27556 Aligned move sequence:
27558 1) Prologue guard: Conditional that jumps up to epilogues for small
27559 blocks that can be handled by the epilogue alone. This is faster
27560 but also needed for correctness, since the prologue assumes the block
27561 is larger than the desired alignment.
27563 Optional dynamic check for size and libcall for large
27564 blocks is emitted here too, with -minline-stringops-dynamically.
27566 2) Prologue: copy first few bytes in order to get destination
27567 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27568 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27569 copied. We emit either a jump tree on power of two sized
27570 blocks, or a byte loop.
27572 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27573 with specified algorithm.
27575 4) Epilogue: code copying tail of the block that is too small to be
27576 handled by main body (or up to size guarded by prologue guard).
27578 Misaligned move sequence
27580 1) misaligned move prologue/epilogue containing:
27581 a) Prologue handling small memory blocks and jumping to done_label
27582 (skipped if blocks are known to be large enough)
27583 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
27584 needed by single possibly misaligned move
27585 (skipped if alignment is not needed)
27586 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27588 2) Zero size guard dispatching to done_label, if needed
27590 3) Dispatch to a library call, if needed,
27592 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27593 with specified algorithm. */
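/* A minimal, self-contained C sketch (an illustration under assumptions, not
   the RTL this function actually emits) of the "Aligned move sequence"
   structure described above, for a memset expanded with a word-sized main
   loop; aligned_memset_sketch is a hypothetical name.  */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
aligned_memset_sketch (unsigned char *dst, unsigned char val, size_t count)
{
  const size_t chunk = sizeof (uintptr_t);		/* SIZE_NEEDED */
  uintptr_t wide = (uintptr_t) val * (uintptr_t) 0x0101010101010101ULL;

  /* 1) Prologue guard: small blocks are left to the epilogue alone.  */
  if (count >= chunk)
    {
      /* 2) Prologue: store single bytes until DST is chunk-aligned.  */
      while ((uintptr_t) dst % chunk)
	{
	  *dst++ = val;
	  count--;
	}
      /* 3) Main body: store one promoted word per iteration.  */
      for (; count >= chunk; count -= chunk, dst += chunk)
	memcpy (dst, &wide, sizeof wide);
    }
  /* 4) Epilogue: the tail smaller than one chunk (also reached directly
     by the guard for small blocks).  */
  while (count--)
    *dst++ = val;
}
#endif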
27594 bool
27595 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27596 rtx align_exp, rtx expected_align_exp,
27597 rtx expected_size_exp, rtx min_size_exp,
27598 rtx max_size_exp, rtx probable_max_size_exp,
27599 bool issetmem)
27601 rtx destreg;
27602 rtx srcreg = NULL;
27603 rtx_code_label *label = NULL;
27604 rtx tmp;
27605 rtx_code_label *jump_around_label = NULL;
27606 HOST_WIDE_INT align = 1;
27607 unsigned HOST_WIDE_INT count = 0;
27608 HOST_WIDE_INT expected_size = -1;
27609 int size_needed = 0, epilogue_size_needed;
27610 int desired_align = 0, align_bytes = 0;
27611 enum stringop_alg alg;
27612 rtx promoted_val = NULL;
27613 rtx vec_promoted_val = NULL;
27614 bool force_loopy_epilogue = false;
27615 int dynamic_check;
27616 bool need_zero_guard = false;
27617 bool noalign;
27618 machine_mode move_mode = VOIDmode;
27619 machine_mode wider_mode;
27620 int unroll_factor = 1;
27621 /* TODO: Once value ranges are available, fill in proper data. */
27622 unsigned HOST_WIDE_INT min_size = 0;
27623 unsigned HOST_WIDE_INT max_size = -1;
27624 unsigned HOST_WIDE_INT probable_max_size = -1;
27625 bool misaligned_prologue_used = false;
27626 bool have_as;
27628 if (CONST_INT_P (align_exp))
27629 align = INTVAL (align_exp);
27630 /* i386 can do misaligned access at a reasonable extra cost. */
27631 if (CONST_INT_P (expected_align_exp)
27632 && INTVAL (expected_align_exp) > align)
27633 align = INTVAL (expected_align_exp);
27634 /* ALIGN is the minimum of destination and source alignment, but we care here
27635 just about destination alignment. */
27636 else if (!issetmem
27637 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27638 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27640 if (CONST_INT_P (count_exp))
27642 min_size = max_size = probable_max_size = count = expected_size
27643 = INTVAL (count_exp);
27644 /* When COUNT is 0, there is nothing to do. */
27645 if (!count)
27646 return true;
27648 else
27650 if (min_size_exp)
27651 min_size = INTVAL (min_size_exp);
27652 if (max_size_exp)
27653 max_size = INTVAL (max_size_exp);
27654 if (probable_max_size_exp)
27655 probable_max_size = INTVAL (probable_max_size_exp);
27656 if (CONST_INT_P (expected_size_exp))
27657 expected_size = INTVAL (expected_size_exp);
27660 /* Make sure we don't need to care about overflow later on. */
27661 if (count > (HOST_WIDE_INT_1U << 30))
27662 return false;
27664 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27665 if (!issetmem)
27666 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27668 /* Step 0: Decide on preferred algorithm, desired alignment and
27669 size of chunks to be copied by main loop. */
27670 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27671 issetmem,
27672 issetmem && val_exp == const0_rtx, have_as,
27673 &dynamic_check, &noalign, false);
27674 if (alg == libcall)
27675 return false;
27676 gcc_assert (alg != no_stringop);
27678 /* For now the vector version of memset is generated only for memory zeroing,
27679 as creating the promoted vector value is very cheap in this case. */
27680 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27681 alg = unrolled_loop;
27683 if (!count)
27684 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27685 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27686 if (!issetmem)
27687 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27689 unroll_factor = 1;
27690 move_mode = word_mode;
27691 switch (alg)
27693 case libcall:
27694 case no_stringop:
27695 case last_alg:
27696 gcc_unreachable ();
27697 case loop_1_byte:
27698 need_zero_guard = true;
27699 move_mode = QImode;
27700 break;
27701 case loop:
27702 need_zero_guard = true;
27703 break;
27704 case unrolled_loop:
27705 need_zero_guard = true;
27706 unroll_factor = (TARGET_64BIT ? 4 : 2);
27707 break;
27708 case vector_loop:
27709 need_zero_guard = true;
27710 unroll_factor = 4;
27711 /* Find the widest supported mode. */
27712 move_mode = word_mode;
27713 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27714 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27715 move_mode = wider_mode;
27717 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27718 move_mode = TImode;
27720 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27721 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27722 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27724 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27725 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27726 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27727 move_mode = word_mode;
27729 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27730 break;
27731 case rep_prefix_8_byte:
27732 move_mode = DImode;
27733 break;
27734 case rep_prefix_4_byte:
27735 move_mode = SImode;
27736 break;
27737 case rep_prefix_1_byte:
27738 move_mode = QImode;
27739 break;
27741 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27742 epilogue_size_needed = size_needed;
27744 /* If we are going to emit any library calls conditionally, make sure any
27745 pending stack adjustments happen before the first conditional branch;
27746 otherwise they will be emitted only before the library call and won't
27747 happen on the other branches. */
27748 if (dynamic_check != -1)
27749 do_pending_stack_adjust ();
27751 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27752 if (!TARGET_ALIGN_STRINGOPS || noalign)
27753 align = desired_align;
27755 /* Step 1: Prologue guard. */
27757 /* Alignment code needs count to be in register. */
27758 if (CONST_INT_P (count_exp) && desired_align > align)
27760 if (INTVAL (count_exp) > desired_align
27761 && INTVAL (count_exp) > size_needed)
27763 align_bytes
27764 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27765 if (align_bytes <= 0)
27766 align_bytes = 0;
27767 else
27768 align_bytes = desired_align - align_bytes;
27770 if (align_bytes == 0)
27771 count_exp = force_reg (counter_mode (count_exp), count_exp);
27773 gcc_assert (desired_align >= 1 && align >= 1);
27775 /* Misaligned move sequences handle both prologue and epilogue at once.
27776 Default code generation results in smaller code for large alignments
27777 and also avoids redundant work when sizes are known precisely. */
27778 misaligned_prologue_used
27779 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27780 && MAX (desired_align, epilogue_size_needed) <= 32
27781 && desired_align <= epilogue_size_needed
27782 && ((desired_align > align && !align_bytes)
27783 || (!count && epilogue_size_needed > 1)));
27785 /* Do the cheap promotion to allow better CSE across the
27786 main loop and epilogue (i.e. one load of the big constant in
27787 front of all the code).
27788 For now the misaligned move sequences do not have a fast path
27789 without broadcasting. */
27790 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27792 if (alg == vector_loop)
27794 gcc_assert (val_exp == const0_rtx);
27795 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27796 promoted_val = promote_duplicated_reg_to_size (val_exp,
27797 GET_MODE_SIZE (word_mode),
27798 desired_align, align);
27800 else
27802 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27803 desired_align, align);
27806 /* Misaligned move sequences handle both prologues and epilogues at once.
27807 Default code generation results in smaller code for large alignments and
27808 also avoids redundant work when sizes are known precisely. */
27809 if (misaligned_prologue_used)
27811 /* The misaligned move prologue handles small blocks by itself. */
27812 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27813 (dst, src, &destreg, &srcreg,
27814 move_mode, promoted_val, vec_promoted_val,
27815 &count_exp,
27816 &jump_around_label,
27817 desired_align < align
27818 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27819 desired_align, align, &min_size, dynamic_check, issetmem);
27820 if (!issetmem)
27821 src = change_address (src, BLKmode, srcreg);
27822 dst = change_address (dst, BLKmode, destreg);
27823 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27824 epilogue_size_needed = 0;
27825 if (need_zero_guard
27826 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27828 /* It is possible that we copied enough so the main loop will not
27829 execute. */
27830 gcc_assert (size_needed > 1);
27831 if (jump_around_label == NULL_RTX)
27832 jump_around_label = gen_label_rtx ();
27833 emit_cmp_and_jump_insns (count_exp,
27834 GEN_INT (size_needed),
27835 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27836 if (expected_size == -1
27837 || expected_size < (desired_align - align) / 2 + size_needed)
27838 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27839 else
27840 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27843 /* Ensure that alignment prologue won't copy past end of block. */
27844 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27846 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27847 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27848 Make sure it is power of 2. */
27849 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27851 /* To improve performance of small blocks, we jump around the code that
27852 promotes VAL. This means that if the promoted VAL is not constant,
27853 we might not use it in the epilogue and have to use the byte
27854 loop variant. */
27855 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27856 force_loopy_epilogue = true;
27857 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27858 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27860 /* If main algorithm works on QImode, no epilogue is needed.
27861 For small sizes just don't align anything. */
27862 if (size_needed == 1)
27863 desired_align = align;
27864 else
27865 goto epilogue;
27867 else if (!count
27868 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27870 label = gen_label_rtx ();
27871 emit_cmp_and_jump_insns (count_exp,
27872 GEN_INT (epilogue_size_needed),
27873 LTU, 0, counter_mode (count_exp), 1, label);
27874 if (expected_size == -1 || expected_size < epilogue_size_needed)
27875 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27876 else
27877 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27881 /* Emit code to decide at runtime whether a library call or inline code
27882 should be used. */
27883 if (dynamic_check != -1)
27885 if (!issetmem && CONST_INT_P (count_exp))
27887 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27889 emit_block_copy_via_libcall (dst, src, count_exp);
27890 count_exp = const0_rtx;
27891 goto epilogue;
27894 else
27896 rtx_code_label *hot_label = gen_label_rtx ();
27897 if (jump_around_label == NULL_RTX)
27898 jump_around_label = gen_label_rtx ();
27899 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27900 LEU, 0, counter_mode (count_exp),
27901 1, hot_label);
27902 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27903 if (issetmem)
27904 set_storage_via_libcall (dst, count_exp, val_exp);
27905 else
27906 emit_block_copy_via_libcall (dst, src, count_exp);
27907 emit_jump (jump_around_label);
27908 emit_label (hot_label);
27912 /* Step 2: Alignment prologue. */
27913 /* Do the expensive promotion once we branched off the small blocks. */
27914 if (issetmem && !promoted_val)
27915 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27916 desired_align, align);
27918 if (desired_align > align && !misaligned_prologue_used)
27920 if (align_bytes == 0)
27922 /* Except for the first move in the prologue, we no longer know
27923 the constant offset in the aliasing info. It doesn't seem worth
27924 the pain to maintain it for the first move, so throw away
27925 the info early. */
27926 dst = change_address (dst, BLKmode, destreg);
27927 if (!issetmem)
27928 src = change_address (src, BLKmode, srcreg);
27929 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27930 promoted_val, vec_promoted_val,
27931 count_exp, align, desired_align,
27932 issetmem);
27933 /* At most desired_align - align bytes are copied. */
27934 if (min_size < (unsigned)(desired_align - align))
27935 min_size = 0;
27936 else
27937 min_size -= desired_align - align;
27939 else
27941 /* If we know how many bytes need to be stored before dst is
27942 sufficiently aligned, maintain aliasing info accurately. */
27943 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27944 srcreg,
27945 promoted_val,
27946 vec_promoted_val,
27947 desired_align,
27948 align_bytes,
27949 issetmem);
27951 count_exp = plus_constant (counter_mode (count_exp),
27952 count_exp, -align_bytes);
27953 count -= align_bytes;
27954 min_size -= align_bytes;
27955 max_size -= align_bytes;
27957 if (need_zero_guard
27958 && min_size < (unsigned HOST_WIDE_INT) size_needed
27959 && (count < (unsigned HOST_WIDE_INT) size_needed
27960 || (align_bytes == 0
27961 && count < ((unsigned HOST_WIDE_INT) size_needed
27962 + desired_align - align))))
27964 /* It is possible that we copied enough so the main loop will not
27965 execute. */
27966 gcc_assert (size_needed > 1);
27967 if (label == NULL_RTX)
27968 label = gen_label_rtx ();
27969 emit_cmp_and_jump_insns (count_exp,
27970 GEN_INT (size_needed),
27971 LTU, 0, counter_mode (count_exp), 1, label);
27972 if (expected_size == -1
27973 || expected_size < (desired_align - align) / 2 + size_needed)
27974 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27975 else
27976 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27979 if (label && size_needed == 1)
27981 emit_label (label);
27982 LABEL_NUSES (label) = 1;
27983 label = NULL;
27984 epilogue_size_needed = 1;
27985 if (issetmem)
27986 promoted_val = val_exp;
27988 else if (label == NULL_RTX && !misaligned_prologue_used)
27989 epilogue_size_needed = size_needed;
27991 /* Step 3: Main loop. */
27993 switch (alg)
27995 case libcall:
27996 case no_stringop:
27997 case last_alg:
27998 gcc_unreachable ();
27999 case loop_1_byte:
28000 case loop:
28001 case unrolled_loop:
28002 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
28003 count_exp, move_mode, unroll_factor,
28004 expected_size, issetmem);
28005 break;
28006 case vector_loop:
28007 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
28008 vec_promoted_val, count_exp, move_mode,
28009 unroll_factor, expected_size, issetmem);
28010 break;
28011 case rep_prefix_8_byte:
28012 case rep_prefix_4_byte:
28013 case rep_prefix_1_byte:
28014 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
28015 val_exp, count_exp, move_mode, issetmem);
28016 break;
28018 /* Adjust properly the offset of src and dest memory for aliasing. */
28019 if (CONST_INT_P (count_exp))
28021 if (!issetmem)
28022 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
28023 (count / size_needed) * size_needed);
28024 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
28025 (count / size_needed) * size_needed);
28027 else
28029 if (!issetmem)
28030 src = change_address (src, BLKmode, srcreg);
28031 dst = change_address (dst, BLKmode, destreg);
28034 /* Step 4: Epilogue to copy the remaining bytes. */
28035 epilogue:
28036 if (label)
28038 /* When the main loop is done, COUNT_EXP might hold original count,
28039 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
28040 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
28041 bytes. Compensate if needed. */
28043 if (size_needed < epilogue_size_needed)
28045 tmp =
28046 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28047 GEN_INT (size_needed - 1), count_exp, 1,
28048 OPTAB_DIRECT);
28049 if (tmp != count_exp)
28050 emit_move_insn (count_exp, tmp);
28052 emit_label (label);
28053 LABEL_NUSES (label) = 1;
28056 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28058 if (force_loopy_epilogue)
28059 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28060 epilogue_size_needed);
28061 else
28063 if (issetmem)
28064 expand_setmem_epilogue (dst, destreg, promoted_val,
28065 vec_promoted_val, count_exp,
28066 epilogue_size_needed);
28067 else
28068 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28069 epilogue_size_needed);
28072 if (jump_around_label)
28073 emit_label (jump_around_label);
28074 return true;
28078 /* Expand the appropriate insns for doing strlen if not just doing
28079 repnz; scasb
28081 out = result, initialized with the start address
28082 align_rtx = alignment of the address.
28083 scratch = scratch register, initialized with the start address when
28084 not aligned, otherwise undefined
28086 This is just the body. It needs the initializations mentioned above and
28087 some address computing at the end. These things are done in i386.md. */
28089 static void
28090 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28092 int align;
28093 rtx tmp;
28094 rtx_code_label *align_2_label = NULL;
28095 rtx_code_label *align_3_label = NULL;
28096 rtx_code_label *align_4_label = gen_label_rtx ();
28097 rtx_code_label *end_0_label = gen_label_rtx ();
28098 rtx mem;
28099 rtx tmpreg = gen_reg_rtx (SImode);
28100 rtx scratch = gen_reg_rtx (SImode);
28101 rtx cmp;
28103 align = 0;
28104 if (CONST_INT_P (align_rtx))
28105 align = INTVAL (align_rtx);
28107 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28109 /* Is there a known alignment and is it less than 4? */
28110 if (align < 4)
28112 rtx scratch1 = gen_reg_rtx (Pmode);
28113 emit_move_insn (scratch1, out);
28114 /* Is there a known alignment and is it not 2? */
28115 if (align != 2)
28117 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28118 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28120 /* Leave just the 3 lower bits. */
28121 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28122 NULL_RTX, 0, OPTAB_WIDEN);
28124 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28125 Pmode, 1, align_4_label);
28126 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28127 Pmode, 1, align_2_label);
28128 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28129 Pmode, 1, align_3_label);
28131 else
28133 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28134 check whether it is aligned to a 4-byte boundary. */
28136 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28137 NULL_RTX, 0, OPTAB_WIDEN);
28139 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28140 Pmode, 1, align_4_label);
28143 mem = change_address (src, QImode, out);
28145 /* Now compare the bytes. */
28147 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
28148 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28149 QImode, 1, end_0_label);
28151 /* Increment the address. */
28152 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28154 /* Not needed with an alignment of 2 */
28155 if (align != 2)
28157 emit_label (align_2_label);
28159 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28160 end_0_label);
28162 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28164 emit_label (align_3_label);
28167 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28168 end_0_label);
28170 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28173 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28174 align this loop; that only enlarges the program and does not help
28175 to speed it up. */
28176 emit_label (align_4_label);
28178 mem = change_address (src, SImode, out);
28179 emit_move_insn (scratch, mem);
28180 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28182 /* This formula yields a nonzero result iff one of the bytes is zero.
28183 This saves three branches inside loop and many cycles. */
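/* Worked example (illustrative, not from the original source): the value
   computed above is (v - 0x01010101) & ~v & 0x80808080.  For v = 0x12340056
   this is 0x1132FF55 & 0xEDCBFFA9 & 0x80808080 = 0x00008000: the 0x80 marker
   lands in the position of the zero byte, which is what the 0x8080 test and
   the 16-bit shift below rely on to locate it.  For v = 0x11111111 the value
   is 0, so the loop continues.  */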
28185 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28186 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28187 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28188 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28189 gen_int_mode (0x80808080, SImode)));
28190 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28191 align_4_label);
28193 if (TARGET_CMOVE)
28195 rtx reg = gen_reg_rtx (SImode);
28196 rtx reg2 = gen_reg_rtx (Pmode);
28197 emit_move_insn (reg, tmpreg);
28198 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28200 /* If zero is not in the first two bytes, move two bytes forward. */
28201 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28202 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28203 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28204 emit_insn (gen_rtx_SET (tmpreg,
28205 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28206 reg,
28207 tmpreg)));
28208 /* Emit lea manually to avoid clobbering of flags. */
28209 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28211 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28212 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28213 emit_insn (gen_rtx_SET (out,
28214 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28215 reg2,
28216 out)));
28218 else
28220 rtx_code_label *end_2_label = gen_label_rtx ();
28221 /* Is zero in the first two bytes? */
28223 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28224 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28225 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28226 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28227 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28228 pc_rtx);
28229 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28230 JUMP_LABEL (tmp) = end_2_label;
28232 /* Not in the first two. Move two bytes forward. */
28233 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28234 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28236 emit_label (end_2_label);
28240 /* Avoid branch in fixing the byte. */
28241 tmpreg = gen_lowpart (QImode, tmpreg);
28242 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28243 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28244 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28245 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28247 emit_label (end_0_label);
28250 /* Expand strlen. */
28252 bool
28253 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28255 rtx addr, scratch1, scratch2, scratch3, scratch4;
28257 /* The generic case of the strlen expander is long. Avoid expanding
28258 it unless TARGET_INLINE_ALL_STRINGOPS. */
28260 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28261 && !TARGET_INLINE_ALL_STRINGOPS
28262 && !optimize_insn_for_size_p ()
28263 && (!CONST_INT_P (align) || INTVAL (align) < 4))
28264 return false;
28266 addr = force_reg (Pmode, XEXP (src, 0));
28267 scratch1 = gen_reg_rtx (Pmode);
28269 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28270 && !optimize_insn_for_size_p ())
28272 /* Well, it seems that some optimizer does not combine a call like
28273 foo(strlen(bar), strlen(bar));
28274 when the move and the subtraction are done here. It does calculate
28275 the length just once when these instructions are done inside of
28276 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
28277 often used and I use one fewer register for the lifetime of
28278 output_strlen_unroll() this is better. */
28280 emit_move_insn (out, addr);
28282 ix86_expand_strlensi_unroll_1 (out, src, align);
28284 /* strlensi_unroll_1 returns the address of the zero at the end of
28285 the string, like memchr(), so compute the length by subtracting
28286 the start address. */
28287 emit_insn (ix86_gen_sub3 (out, out, addr));
28289 else
28291 rtx unspec;
28293 /* Can't use this if the user has appropriated eax, ecx, or edi. */
28294 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28295 return false;
28296 /* Can't use this for non-default address spaces. */
28297 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28298 return false;
28300 scratch2 = gen_reg_rtx (Pmode);
28301 scratch3 = gen_reg_rtx (Pmode);
28302 scratch4 = force_reg (Pmode, constm1_rtx);
28304 emit_move_insn (scratch3, addr);
28305 eoschar = force_reg (QImode, eoschar);
28307 src = replace_equiv_address_nv (src, scratch3);
28309 /* If .md starts supporting :P, this can be done in .md. */
28310 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28311 scratch4), UNSPEC_SCAS);
28312 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28313 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28314 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28316 return true;
28319 /* For a given symbol (function), construct code to compute the address of
28320 its PLT entry in the large x86-64 PIC model. */
28321 static rtx
28322 construct_plt_address (rtx symbol)
28324 rtx tmp, unspec;
28326 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28327 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28328 gcc_assert (Pmode == DImode);
28330 tmp = gen_reg_rtx (Pmode);
28331 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28333 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28334 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28335 return tmp;
28339 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28340 rtx callarg2,
28341 rtx pop, bool sibcall)
28343 rtx vec[3];
28344 rtx use = NULL, call;
28345 unsigned int vec_len = 0;
28346 tree fndecl;
28348 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28350 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28351 if (fndecl
28352 && (lookup_attribute ("interrupt",
28353 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28354 error ("interrupt service routine can't be called directly");
28356 else
28357 fndecl = NULL_TREE;
28359 if (pop == const0_rtx)
28360 pop = NULL;
28361 gcc_assert (!TARGET_64BIT || !pop);
28363 if (TARGET_MACHO && !TARGET_64BIT)
28365 #if TARGET_MACHO
28366 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28367 fnaddr = machopic_indirect_call_target (fnaddr);
28368 #endif
28370 else
28372 /* Static functions and indirect calls don't need the pic register. Also,
28373 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
28374 it an indirect call. */
28375 rtx addr = XEXP (fnaddr, 0);
28376 if (flag_pic
28377 && GET_CODE (addr) == SYMBOL_REF
28378 && !SYMBOL_REF_LOCAL_P (addr))
28380 if (flag_plt
28381 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28382 || !lookup_attribute ("noplt",
28383 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28385 if (!TARGET_64BIT
28386 || (ix86_cmodel == CM_LARGE_PIC
28387 && DEFAULT_ABI != MS_ABI))
28389 use_reg (&use, gen_rtx_REG (Pmode,
28390 REAL_PIC_OFFSET_TABLE_REGNUM));
28391 if (ix86_use_pseudo_pic_reg ())
28392 emit_move_insn (gen_rtx_REG (Pmode,
28393 REAL_PIC_OFFSET_TABLE_REGNUM),
28394 pic_offset_table_rtx);
28397 else if (!TARGET_PECOFF && !TARGET_MACHO)
28399 if (TARGET_64BIT)
28401 fnaddr = gen_rtx_UNSPEC (Pmode,
28402 gen_rtvec (1, addr),
28403 UNSPEC_GOTPCREL);
28404 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28406 else
28408 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28409 UNSPEC_GOT);
28410 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28411 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28412 fnaddr);
28414 fnaddr = gen_const_mem (Pmode, fnaddr);
28415 /* Pmode may not be the same as word_mode for x32, which
28416 doesn't support indirect branch via 32-bit memory slot.
28417 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28418 indirect branch via x32 GOT slot is OK. */
28419 if (GET_MODE (fnaddr) != word_mode)
28420 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28421 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28426 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28427 parameters passed in vector registers. */
28428 if (TARGET_64BIT
28429 && (INTVAL (callarg2) > 0
28430 || (INTVAL (callarg2) == 0
28431 && (TARGET_SSE || !flag_skip_rax_setup))))
28433 rtx al = gen_rtx_REG (QImode, AX_REG);
28434 emit_move_insn (al, callarg2);
28435 use_reg (&use, al);
28438 if (ix86_cmodel == CM_LARGE_PIC
28439 && !TARGET_PECOFF
28440 && MEM_P (fnaddr)
28441 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28442 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28443 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28444 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28445 branch via x32 GOT slot is OK. */
28446 else if (!(TARGET_X32
28447 && MEM_P (fnaddr)
28448 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28449 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28450 && (sibcall
28451 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28452 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28454 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28455 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28458 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28460 if (retval)
28462 /* We should add bounds as a destination register in case
28463 a pointer with bounds may be returned. */
28464 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
28466 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
28467 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
28468 if (GET_CODE (retval) == PARALLEL)
28470 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
28471 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
28472 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
28473 retval = chkp_join_splitted_slot (retval, par);
28475 else
28477 retval = gen_rtx_PARALLEL (VOIDmode,
28478 gen_rtvec (3, retval, b0, b1));
28479 chkp_put_regs_to_expr_list (retval);
28483 call = gen_rtx_SET (retval, call);
28485 vec[vec_len++] = call;
28487 if (pop)
28489 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28490 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28491 vec[vec_len++] = pop;
28494 if (cfun->machine->no_caller_saved_registers
28495 && (!fndecl
28496 || (!TREE_THIS_VOLATILE (fndecl)
28497 && !lookup_attribute ("no_caller_saved_registers",
28498 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28500 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28501 bool is_64bit_ms_abi = (TARGET_64BIT
28502 && ix86_function_abi (fndecl) == MS_ABI);
28503 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28505 /* If there are no caller-saved registers, add all registers
28506 that are clobbered by the call which returns. */
28507 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28508 if (!fixed_regs[i]
28509 && (ix86_call_used_regs[i] == 1
28510 || (ix86_call_used_regs[i] & c_mask))
28511 && !STACK_REGNO_P (i)
28512 && !MMX_REGNO_P (i))
28513 clobber_reg (&use,
28514 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28516 else if (TARGET_64BIT_MS_ABI
28517 && (!callarg2 || INTVAL (callarg2) != -2))
28519 unsigned i;
28521 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28523 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28524 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28526 clobber_reg (&use, gen_rtx_REG (mode, regno));
28529 /* Set here, but it may get cleared later. */
28530 if (TARGET_CALL_MS2SYSV_XLOGUES)
28532 if (!TARGET_SSE)
28535 /* Don't break hot-patched functions. */
28536 else if (ix86_function_ms_hook_prologue (current_function_decl))
28539 /* TODO: Cases not yet examined. */
28540 else if (flag_split_stack)
28541 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28543 else
28545 gcc_assert (!reload_completed);
28546 cfun->machine->call_ms2sysv = true;
28551 if (vec_len > 1)
28552 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28553 call = emit_call_insn (call);
28554 if (use)
28555 CALL_INSN_FUNCTION_USAGE (call) = use;
28557 return call;
28560 /* Return true if the function being called was marked with attribute
28561 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28562 to handle the non-PIC case in the backend because there is no easy
28563 interface for the front-end to force non-PLT calls to use the GOT.
28564 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28565 to call the function marked "noplt" indirectly. */
28567 static bool
28568 ix86_nopic_noplt_attribute_p (rtx call_op)
28570 if (flag_pic || ix86_cmodel == CM_LARGE
28571 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28572 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28573 || SYMBOL_REF_LOCAL_P (call_op))
28574 return false;
28576 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28578 if (!flag_plt
28579 || (symbol_decl != NULL_TREE
28580 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28581 return true;
28583 return false;
28586 /* Output indirect branch via a call and return thunk. CALL_OP is a
28587 register which contains the branch target. Branch is a tail call
28588 if SIBCALL_P is true.
28589 A normal call is converted to:
28591 call __x86_indirect_thunk_reg
28593 and a tail call is converted to:
28595 jmp __x86_indirect_thunk_reg
28598 static void
28599 ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p)
28601 char thunk_name_buf[32];
28602 char *thunk_name;
28603 bool need_bnd_p = ix86_bnd_prefixed_insn_p (current_output_insn);
28604 int regno = REGNO (call_op);
28606 if (cfun->machine->indirect_branch_type
28607 != indirect_branch_thunk_inline)
28609 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28611 int i = regno;
28612 if (i >= FIRST_REX_INT_REG)
28613 i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1);
28614 if (need_bnd_p)
28615 indirect_thunks_bnd_used |= 1 << i;
28616 else
28617 indirect_thunks_used |= 1 << i;
28619 indirect_thunk_name (thunk_name_buf, regno, need_bnd_p, false);
28620 thunk_name = thunk_name_buf;
28622 else
28623 thunk_name = NULL;
28625 if (sibcall_p)
28627 if (thunk_name != NULL)
28629 if (need_bnd_p)
28630 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28631 else
28632 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28634 else
28635 output_indirect_thunk (need_bnd_p, regno);
28637 else
28639 if (thunk_name != NULL)
28641 if (need_bnd_p)
28642 fprintf (asm_out_file, "\tbnd call\t%s\n", thunk_name);
28643 else
28644 fprintf (asm_out_file, "\tcall\t%s\n", thunk_name);
28645 return;
28648 char indirectlabel1[32];
28649 char indirectlabel2[32];
28651 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28652 INDIRECT_LABEL,
28653 indirectlabelno++);
28654 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28655 INDIRECT_LABEL,
28656 indirectlabelno++);
28658 /* Jump. */
28659 if (need_bnd_p)
28660 fputs ("\tbnd jmp\t", asm_out_file);
28661 else
28662 fputs ("\tjmp\t", asm_out_file);
28663 assemble_name_raw (asm_out_file, indirectlabel2);
28664 fputc ('\n', asm_out_file);
28666 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28668 if (thunk_name != NULL)
28670 if (need_bnd_p)
28671 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28672 else
28673 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28675 else
28676 output_indirect_thunk (need_bnd_p, regno);
28678 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28680 /* Call. */
28681 if (need_bnd_p)
28682 fputs ("\tbnd call\t", asm_out_file);
28683 else
28684 fputs ("\tcall\t", asm_out_file);
28685 assemble_name_raw (asm_out_file, indirectlabel1);
28686 fputc ('\n', asm_out_file);
28690 /* Output indirect branch via a call and return thunk. CALL_OP is
28691 the branch target. XASM is the assembly template for CALL_OP.
28692 Branch is a tail call if SIBCALL_P is true. A normal call is
28693 converted to:
28695 jmp L2
28696 L1:
28697 push CALL_OP
28698 jmp __x86_indirect_thunk
28699 L2:
28700 call L1
28702 and a tail call is converted to:
28704 push CALL_OP
28705 jmp __x86_indirect_thunk
28708 static void
28709 ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm,
28710 bool sibcall_p)
28712 char thunk_name_buf[32];
28713 char *thunk_name;
28714 char push_buf[64];
28715 bool need_bnd_p = ix86_bnd_prefixed_insn_p (current_output_insn);
28716 int regno = -1;
28718 if (cfun->machine->indirect_branch_type
28719 != indirect_branch_thunk_inline)
28721 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28723 if (need_bnd_p)
28724 indirect_thunk_bnd_needed = true;
28725 else
28726 indirect_thunk_needed = true;
28728 indirect_thunk_name (thunk_name_buf, regno, need_bnd_p, false);
28729 thunk_name = thunk_name_buf;
28731 else
28732 thunk_name = NULL;
28734 snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s",
28735 TARGET_64BIT ? 'q' : 'l', xasm);
28737 if (sibcall_p)
28739 output_asm_insn (push_buf, &call_op);
28740 if (thunk_name != NULL)
28742 if (need_bnd_p)
28743 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28744 else
28745 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28747 else
28748 output_indirect_thunk (need_bnd_p, regno);
28750 else
28752 char indirectlabel1[32];
28753 char indirectlabel2[32];
28755 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28756 INDIRECT_LABEL,
28757 indirectlabelno++);
28758 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28759 INDIRECT_LABEL,
28760 indirectlabelno++);
28762 /* Jump. */
28763 if (need_bnd_p)
28764 fputs ("\tbnd jmp\t", asm_out_file);
28765 else
28766 fputs ("\tjmp\t", asm_out_file);
28767 assemble_name_raw (asm_out_file, indirectlabel2);
28768 fputc ('\n', asm_out_file);
28770 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28772 /* An external function may be called via GOT, instead of PLT. */
28773 if (MEM_P (call_op))
28775 struct ix86_address parts;
28776 rtx addr = XEXP (call_op, 0);
28777 if (ix86_decompose_address (addr, &parts)
28778 && parts.base == stack_pointer_rtx)
28780 /* Since call will adjust stack by -UNITS_PER_WORD,
28781 we must convert "disp(stack, index, scale)" to
28782 "disp+UNITS_PER_WORD(stack, index, scale)". */
28783 if (parts.index)
28785 addr = gen_rtx_MULT (Pmode, parts.index,
28786 GEN_INT (parts.scale));
28787 addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
28788 addr);
28790 else
28791 addr = stack_pointer_rtx;
28793 rtx disp;
28794 if (parts.disp != NULL_RTX)
28795 disp = plus_constant (Pmode, parts.disp,
28796 UNITS_PER_WORD);
28797 else
28798 disp = GEN_INT (UNITS_PER_WORD);
28800 addr = gen_rtx_PLUS (Pmode, addr, disp);
28801 call_op = gen_rtx_MEM (GET_MODE (call_op), addr);
28805 output_asm_insn (push_buf, &call_op);
28807 if (thunk_name != NULL)
28809 if (need_bnd_p)
28810 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28811 else
28812 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28814 else
28815 output_indirect_thunk (need_bnd_p, regno);
28817 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28819 /* Call. */
28820 if (need_bnd_p)
28821 fputs ("\tbnd call\t", asm_out_file);
28822 else
28823 fputs ("\tcall\t", asm_out_file);
28824 assemble_name_raw (asm_out_file, indirectlabel1);
28825 fputc ('\n', asm_out_file);
28829 /* Output indirect branch via a call and return thunk. CALL_OP is
28830 the branch target. XASM is the assembly template for CALL_OP.
28831 Branch is a tail call if SIBCALL_P is true. */
28833 static void
28834 ix86_output_indirect_branch (rtx call_op, const char *xasm,
28835 bool sibcall_p)
28837 if (REG_P (call_op))
28838 ix86_output_indirect_branch_via_reg (call_op, sibcall_p);
28839 else
28840 ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p);
28842 /* Output indirect jump. CALL_OP is the jump target. Jump is a
28843 function return if RET_P is true. */
28845 const char *
28846 ix86_output_indirect_jmp (rtx call_op, bool ret_p)
28848 if (cfun->machine->indirect_branch_type != indirect_branch_keep)
28850 /* We can't have a red zone if this isn't a function return, since
28851 "call" in the indirect thunk pushes the return address onto the
28852 stack, destroying the red zone. */
28853 if (!ret_p && ix86_red_zone_size != 0)
28854 gcc_unreachable ();
28856 ix86_output_indirect_branch (call_op, "%0", true);
28857 return "";
28859 else
28860 return "%!jmp\t%A0";
28863 /* Output function return. Add a REP prefix to RET if LONG_P is true
28864 and the function return is kept. */
28866 const char *
28867 ix86_output_function_return (bool long_p)
28869 if (cfun->machine->function_return_type != indirect_branch_keep)
28871 char thunk_name[32];
28872 bool need_bnd_p = ix86_bnd_prefixed_insn_p (current_output_insn);
28874 if (cfun->machine->function_return_type
28875 != indirect_branch_thunk_inline)
28877 bool need_thunk = (cfun->machine->function_return_type
28878 == indirect_branch_thunk);
28879 indirect_thunk_name (thunk_name, -1, need_bnd_p, true);
28880 if (need_bnd_p)
28882 indirect_thunk_bnd_needed |= need_thunk;
28883 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28885 else
28887 indirect_thunk_needed |= need_thunk;
28888 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28891 else
28892 output_indirect_thunk (need_bnd_p, -1);
28894 return "";
28897 if (!long_p || ix86_bnd_prefixed_insn_p (current_output_insn))
28898 return "%!ret";
28900 return "rep%; ret";
28903 /* Output the assembly for a call instruction. */
28905 const char *
28906 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28908 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28909 bool output_indirect_p
28910 = (!TARGET_SEH
28911 && cfun->machine->indirect_branch_type != indirect_branch_keep);
28912 bool seh_nop_p = false;
28913 const char *xasm;
28915 if (SIBLING_CALL_P (insn))
28917 if (direct_p)
28919 if (ix86_nopic_noplt_attribute_p (call_op))
28921 direct_p = false;
28922 if (TARGET_64BIT)
28924 if (output_indirect_p)
28925 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28926 else
28927 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28929 else
28931 if (output_indirect_p)
28932 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
28933 else
28934 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28937 else
28938 xasm = "%!jmp\t%P0";
28940 /* SEH epilogue detection requires the indirect branch case
28941 to include REX.W. */
28942 else if (TARGET_SEH)
28943 xasm = "%!rex.W jmp\t%A0";
28944 else
28946 if (output_indirect_p)
28947 xasm = "%0";
28948 else
28949 xasm = "%!jmp\t%A0";
28952 if (output_indirect_p && !direct_p)
28953 ix86_output_indirect_branch (call_op, xasm, true);
28954 else
28955 output_asm_insn (xasm, &call_op);
28956 return "";
28959 /* SEH unwinding can require an extra nop to be emitted in several
28960 circumstances. Determine if we have one of those. */
28961 if (TARGET_SEH)
28963 rtx_insn *i;
28965 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28967 /* If we get to another real insn, we don't need the nop. */
28968 if (INSN_P (i))
28969 break;
28971 /* If we get to the epilogue note, prevent a catch region from
28972 being adjacent to the standard epilogue sequence. With non-call
28973 exceptions, we'll have done this during epilogue emission. */
28974 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28975 && !flag_non_call_exceptions
28976 && !can_throw_internal (insn))
28978 seh_nop_p = true;
28979 break;
28983 /* If we didn't find a real insn following the call, prevent the
28984 unwinder from looking into the next function. */
28985 if (i == NULL)
28986 seh_nop_p = true;
28989 if (direct_p)
28991 if (ix86_nopic_noplt_attribute_p (call_op))
28993 direct_p = false;
28994 if (TARGET_64BIT)
28996 if (output_indirect_p)
28997 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28998 else
28999 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29001 else
29003 if (output_indirect_p)
29004 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29005 else
29006 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29009 else
29010 xasm = "%!call\t%P0";
29012 else
29014 if (output_indirect_p)
29015 xasm = "%0";
29016 else
29017 xasm = "%!call\t%A0";
29020 if (output_indirect_p && !direct_p)
29021 ix86_output_indirect_branch (call_op, xasm, false);
29022 else
29023 output_asm_insn (xasm, &call_op);
29025 if (seh_nop_p)
29026 return "nop";
29028 return "";
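/* The templates above mix plain text with operand substitutions (%p0, %A0,
   %P0), the optional-prefix escape %! (bnd/notrack) and {att|intel} groups.
   The sketch below is a standalone illustration -- not part of this file and
   not GCC's implementation -- of just the brace-and-bar convention: inside
   {...}, '|' separates one alternative per assembler dialect, and
   ASSEMBLER_DIALECT (0 = AT&T, 1 = Intel) selects which one output_asm_insn
   emits.  Operand substitution is ignored here.  */
#if 0
#include <stdio.h>

static void
emit_for_dialect (const char *tmpl, int dialect)
{
  for (const char *p = tmpl; *p; p++)
    {
      if (*p == '{')
        {
          int alt = 0;
          for (p++; *p && *p != '}'; p++)
            {
              if (*p == '|')
                alt++;
              else if (alt == dialect)
                putchar (*p);
            }
          if (!*p)
            break;
        }
      else
        putchar (*p);
    }
  putchar ('\n');
}

int
main (void)
{
  const char *tmpl = "call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
  emit_for_dialect (tmpl, 0);   /* AT&T:  call *%p0@GOT */
  emit_for_dialect (tmpl, 1);   /* Intel: call [DWORD PTR %p0@GOT] */
  return 0;
}
#endif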
29031 /* Clear stack slot assignments remembered from previous functions.
29032 This is called from INIT_EXPANDERS once before RTL is emitted for each
29033 function. */
29035 static struct machine_function *
29036 ix86_init_machine_status (void)
29038 struct machine_function *f;
29040 f = ggc_cleared_alloc<machine_function> ();
29041 f->call_abi = ix86_abi;
29043 return f;
29046 /* Return a MEM corresponding to a stack slot with mode MODE.
29047 Allocate a new slot if necessary.
29049 The RTL for a function can have several slots available: N is
29050 which slot to use. */
29053 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29055 struct stack_local_entry *s;
29057 gcc_assert (n < MAX_386_STACK_LOCALS);
29059 for (s = ix86_stack_locals; s; s = s->next)
29060 if (s->mode == mode && s->n == n)
29061 return validize_mem (copy_rtx (s->rtl));
29063 s = ggc_alloc<stack_local_entry> ();
29064 s->n = n;
29065 s->mode = mode;
29066 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29068 s->next = ix86_stack_locals;
29069 ix86_stack_locals = s;
29070 return validize_mem (copy_rtx (s->rtl));
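/* A standalone illustration -- not part of this file -- of the interning
   pattern used by assign_386_stack_local above: slots are keyed by
   (mode, n), remembered on a per-function list, and reused on later
   requests instead of allocating a fresh stack slot each time.  The RTL
   slot is stood in for by an int.  */
#if 0
#include <stdio.h>
#include <stdlib.h>

struct slot { int mode, n, rtl; struct slot *next; };

static struct slot *slots;
static int next_slot = 100;     /* stand-in for assign_stack_local */

static int
get_slot (int mode, int n)
{
  for (struct slot *s = slots; s; s = s->next)
    if (s->mode == mode && s->n == n)
      return s->rtl;            /* reuse the remembered slot */

  struct slot *s = (struct slot *) malloc (sizeof *s);
  s->mode = mode;
  s->n = n;
  s->rtl = next_slot++;
  s->next = slots;
  slots = s;
  return s->rtl;
}

int
main (void)
{
  /* Same (mode, n) key returns the same slot; a new key allocates.  */
  printf ("%d %d %d\n", get_slot (2, 0), get_slot (2, 0), get_slot (3, 0));
  return 0;
}
#endif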
29073 static void
29074 ix86_instantiate_decls (void)
29076 struct stack_local_entry *s;
29078 for (s = ix86_stack_locals; s; s = s->next)
29079 if (s->rtl != NULL_RTX)
29080 instantiate_decl_rtl (s->rtl);
29083 /* Return the number used for encoding REG, in the range 0..7. */
29085 static int
29086 reg_encoded_number (rtx reg)
29088 unsigned regno = REGNO (reg);
29089 switch (regno)
29091 case AX_REG:
29092 return 0;
29093 case CX_REG:
29094 return 1;
29095 case DX_REG:
29096 return 2;
29097 case BX_REG:
29098 return 3;
29099 case SP_REG:
29100 return 4;
29101 case BP_REG:
29102 return 5;
29103 case SI_REG:
29104 return 6;
29105 case DI_REG:
29106 return 7;
29107 default:
29108 break;
29110 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29111 return regno - FIRST_STACK_REG;
29112 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29113 return regno - FIRST_SSE_REG;
29114 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29115 return regno - FIRST_MMX_REG;
29116 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29117 return regno - FIRST_REX_SSE_REG;
29118 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29119 return regno - FIRST_REX_INT_REG;
29120 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29121 return regno - FIRST_MASK_REG;
29122 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
29123 return regno - FIRST_BND_REG;
29124 return -1;
29127 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29128 in its encoding if it could be relevant for ROP mitigation, otherwise
29129 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29130 used for calculating it into them. */
29132 static int
29133 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29134 int *popno0 = 0, int *popno1 = 0)
29136 if (asm_noperands (PATTERN (insn)) >= 0)
29137 return -1;
29138 int has_modrm = get_attr_modrm (insn);
29139 if (!has_modrm)
29140 return -1;
29141 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29142 rtx op0, op1;
29143 switch (cls)
29145 case MODRM_CLASS_OP02:
29146 gcc_assert (noperands >= 3);
29147 if (popno0)
29149 *popno0 = 0;
29150 *popno1 = 2;
29152 op0 = operands[0];
29153 op1 = operands[2];
29154 break;
29155 case MODRM_CLASS_OP01:
29156 gcc_assert (noperands >= 2);
29157 if (popno0)
29159 *popno0 = 0;
29160 *popno1 = 1;
29162 op0 = operands[0];
29163 op1 = operands[1];
29164 break;
29165 default:
29166 return -1;
29168 if (REG_P (op0) && REG_P (op1))
29170 int enc0 = reg_encoded_number (op0);
29171 int enc1 = reg_encoded_number (op1);
29172 return 0xc0 + (enc1 << 3) + enc0;
29174 return -1;
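/* A standalone sketch -- not part of this file -- of the register-to-register
   ModR/M byte computed above: mod = 11 (0xc0), the reg field is the second
   operand's 3-bit encoding shifted left by 3, and the r/m field is the first
   operand's encoding.  The ROP-mitigation pass cares because some of these
   bytes double as gadget-forming opcodes; e.g. with op0 = %ebx (3) and
   op1 = %eax (0) the byte is 0xc3, which is also the encoding of RET.  */
#if 0
#include <stdio.h>

static unsigned char
modrm_reg_reg (unsigned reg_field, unsigned rm_field)
{
  return 0xc0 | ((reg_field & 7) << 3) | (rm_field & 7);
}

int
main (void)
{
  printf ("0x%02x\n", modrm_reg_reg (0 /* eax */, 3 /* ebx */));  /* 0xc3 */
  return 0;
}
#endif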
29177 /* Check whether x86 address PARTS is a pc-relative address. */
29179 bool
29180 ix86_rip_relative_addr_p (struct ix86_address *parts)
29182 rtx base, index, disp;
29184 base = parts->base;
29185 index = parts->index;
29186 disp = parts->disp;
29188 if (disp && !base && !index)
29190 if (TARGET_64BIT)
29192 rtx symbol = disp;
29194 if (GET_CODE (disp) == CONST)
29195 symbol = XEXP (disp, 0);
29196 if (GET_CODE (symbol) == PLUS
29197 && CONST_INT_P (XEXP (symbol, 1)))
29198 symbol = XEXP (symbol, 0);
29200 if (GET_CODE (symbol) == LABEL_REF
29201 || (GET_CODE (symbol) == SYMBOL_REF
29202 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29203 || (GET_CODE (symbol) == UNSPEC
29204 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29205 || XINT (symbol, 1) == UNSPEC_PCREL
29206 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29207 return true;
29210 return false;
29213 /* Calculate the length of the memory address in the instruction encoding.
29214 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29215 or other prefixes. We never generate addr32 prefix for LEA insn. */
29218 memory_address_length (rtx addr, bool lea)
29220 struct ix86_address parts;
29221 rtx base, index, disp;
29222 int len;
29223 int ok;
29225 if (GET_CODE (addr) == PRE_DEC
29226 || GET_CODE (addr) == POST_INC
29227 || GET_CODE (addr) == PRE_MODIFY
29228 || GET_CODE (addr) == POST_MODIFY)
29229 return 0;
29231 ok = ix86_decompose_address (addr, &parts);
29232 gcc_assert (ok);
29234 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29236 /* If this is not LEA instruction, add the length of addr32 prefix. */
29237 if (TARGET_64BIT && !lea
29238 && (SImode_address_operand (addr, VOIDmode)
29239 || (parts.base && GET_MODE (parts.base) == SImode)
29240 || (parts.index && GET_MODE (parts.index) == SImode)))
29241 len++;
29243 base = parts.base;
29244 index = parts.index;
29245 disp = parts.disp;
29247 if (base && SUBREG_P (base))
29248 base = SUBREG_REG (base);
29249 if (index && SUBREG_P (index))
29250 index = SUBREG_REG (index);
29252 gcc_assert (base == NULL_RTX || REG_P (base));
29253 gcc_assert (index == NULL_RTX || REG_P (index));
29255 /* Rule of thumb:
29256 - esp as the base always wants an index,
29257 - ebp as the base always wants a displacement,
29258 - r12 as the base always wants an index,
29259 - r13 as the base always wants a displacement. */
29261 /* Register Indirect. */
29262 if (base && !index && !disp)
29264 /* esp (for its index) and ebp (for its displacement) need
29265 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29266 code. */
29267 if (base == arg_pointer_rtx
29268 || base == frame_pointer_rtx
29269 || REGNO (base) == SP_REG
29270 || REGNO (base) == BP_REG
29271 || REGNO (base) == R12_REG
29272 || REGNO (base) == R13_REG)
29273 len++;
29276 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29277 is not disp32, but disp32(%rip), so for disp32 a
29278 SIB byte is needed, unless print_operand_address
29279 optimizes it into disp32(%rip) or (%rip) is implied
29280 by an UNSPEC. */
29281 else if (disp && !base && !index)
29283 len += 4;
29284 if (!ix86_rip_relative_addr_p (&parts))
29285 len++;
29287 else
29289 /* Find the length of the displacement constant. */
29290 if (disp)
29292 if (base && satisfies_constraint_K (disp))
29293 len += 1;
29294 else
29295 len += 4;
29297 /* ebp always wants a displacement. Similarly r13. */
29298 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29299 len++;
29301 /* An index requires the two-byte modrm form.... */
29302 if (index
29303 /* ...like esp (or r12), which always wants an index. */
29304 || base == arg_pointer_rtx
29305 || base == frame_pointer_rtx
29306 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29307 len++;
29310 return len;
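/* A standalone, deliberately simplified model -- not part of this file -- of
   the address-byte count computed above.  It covers only the SIB/disp rules
   spelled out in the comments (esp/r12 bases force a SIB byte, ebp/r13 bases
   force at least a disp8, small displacements use disp8, everything else
   disp32) and omits the segment-override and addr32 prefixes that
   memory_address_length also accounts for.  */
#if 0
#include <stdbool.h>
#include <stdio.h>

static int
addr_bytes (bool has_base, bool base_is_sp_or_r12, bool base_is_bp_or_r13,
            bool has_index, bool has_disp, long disp)
{
  int len = 0;

  if (has_index || base_is_sp_or_r12)
    len += 1;                                   /* SIB byte */

  if (has_disp)
    len += (has_base && disp >= -128 && disp <= 127) ? 1 : 4;
  else if (base_is_bp_or_r13)
    len += 1;                                   /* forced zero disp8 */

  return len;
}

int
main (void)
{
  /* (%rsp) -> 1 (SIB), 8(%rbp) -> 1 (disp8), foo(,%rax,4) -> 5 (SIB+disp32) */
  printf ("%d %d %d\n",
          addr_bytes (true, true, false, false, false, 0),
          addr_bytes (true, false, true, false, true, 8),
          addr_bytes (false, false, false, true, true, 0x1000));
  return 0;
}
#endif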
29313 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29314 is set, expect that the insn has an 8bit immediate alternative. */
29316 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29318 int len = 0;
29319 int i;
29320 extract_insn_cached (insn);
29321 for (i = recog_data.n_operands - 1; i >= 0; --i)
29322 if (CONSTANT_P (recog_data.operand[i]))
29324 enum attr_mode mode = get_attr_mode (insn);
29326 gcc_assert (!len);
29327 if (shortform && CONST_INT_P (recog_data.operand[i]))
29329 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29330 switch (mode)
29332 case MODE_QI:
29333 len = 1;
29334 continue;
29335 case MODE_HI:
29336 ival = trunc_int_for_mode (ival, HImode);
29337 break;
29338 case MODE_SI:
29339 ival = trunc_int_for_mode (ival, SImode);
29340 break;
29341 default:
29342 break;
29344 if (IN_RANGE (ival, -128, 127))
29346 len = 1;
29347 continue;
29350 switch (mode)
29352 case MODE_QI:
29353 len = 1;
29354 break;
29355 case MODE_HI:
29356 len = 2;
29357 break;
29358 case MODE_SI:
29359 len = 4;
29360 break;
29361 /* Immediates for DImode instructions are encoded
29362 as 32bit sign extended values. */
29363 case MODE_DI:
29364 len = 4;
29365 break;
29366 default:
29367 fatal_insn ("unknown insn mode", insn);
29370 return len;
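/* A standalone illustration -- not part of this file -- of the short-immediate
   test above: an insn with an 8bit-immediate alternative can use it when the
   constant, truncated to the operand mode, fits in a signed byte.  The
   truncate-then-sign-extend step mirrors trunc_int_for_mode.  */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool
fits_imm8 (long long ival, int mode_bits)
{
  if (mode_bits < 64)
    {
      unsigned long long mask = (1ull << mode_bits) - 1;
      ival &= mask;                            /* truncate to the mode...   */
      if (ival & (1ull << (mode_bits - 1)))
        ival -= (long long) (mask + 1);        /* ...then sign-extend again */
    }
  return ival >= -128 && ival <= 127;
}

int
main (void)
{
  printf ("%d %d %d\n",
          fits_imm8 (100, 32),          /* 1: fits in imm8 */
          fits_imm8 (0xffffff80LL, 32), /* 1: truncates to -128 */
          fits_imm8 (200, 32));         /* 0: needs a full imm32 */
  return 0;
}
#endif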
29373 /* Compute default value for "length_address" attribute. */
29375 ix86_attr_length_address_default (rtx_insn *insn)
29377 int i;
29379 if (get_attr_type (insn) == TYPE_LEA)
29381 rtx set = PATTERN (insn), addr;
29383 if (GET_CODE (set) == PARALLEL)
29384 set = XVECEXP (set, 0, 0);
29386 gcc_assert (GET_CODE (set) == SET);
29388 addr = SET_SRC (set);
29390 return memory_address_length (addr, true);
29393 extract_insn_cached (insn);
29394 for (i = recog_data.n_operands - 1; i >= 0; --i)
29396 rtx op = recog_data.operand[i];
29397 if (MEM_P (op))
29399 constrain_operands_cached (insn, reload_completed);
29400 if (which_alternative != -1)
29402 const char *constraints = recog_data.constraints[i];
29403 int alt = which_alternative;
29405 while (*constraints == '=' || *constraints == '+')
29406 constraints++;
29407 while (alt-- > 0)
29408 while (*constraints++ != ',')
29410 /* Skip ignored operands. */
29411 if (*constraints == 'X')
29412 continue;
29415 int len = memory_address_length (XEXP (op, 0), false);
29417 /* Account for segment prefix for non-default addr spaces. */
29418 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29419 len++;
29421 return len;
29424 return 0;
29427 /* Compute default value for "length_vex" attribute. It includes
29428 2 or 3 byte VEX prefix and 1 opcode byte. */
29431 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29432 bool has_vex_w)
29434 int i;
29436 /* Only the 0f opcode map can use the 2 byte VEX prefix, and the VEX W
29437 bit requires the 3 byte VEX prefix. */
29438 if (!has_0f_opcode || has_vex_w)
29439 return 3 + 1;
29441 /* We can always use 2 byte VEX prefix in 32bit. */
29442 if (!TARGET_64BIT)
29443 return 2 + 1;
29445 extract_insn_cached (insn);
29447 for (i = recog_data.n_operands - 1; i >= 0; --i)
29448 if (REG_P (recog_data.operand[i]))
29450 /* REX.W bit uses 3 byte VEX prefix. */
29451 if (GET_MODE (recog_data.operand[i]) == DImode
29452 && GENERAL_REG_P (recog_data.operand[i]))
29453 return 3 + 1;
29455 else
29457 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29458 if (MEM_P (recog_data.operand[i])
29459 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29460 return 3 + 1;
29463 return 2 + 1;
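/* A standalone sketch -- not part of this file -- of the 2- vs 3-byte VEX
   decision made above.  The 2-byte form (C5) can only encode the 0F opcode
   map and cannot express VEX.W, VEX.X or VEX.B, so a non-0F map, a W bit,
   a 64-bit general-register operand (which needs W) or an extended register
   in the memory operand (which needs X or B) all force the 3-byte form (C4).
   The returned length includes the opcode byte, as in the function above.  */
#if 0
#include <stdbool.h>
#include <stdio.h>

static int
vex_prefix_and_opcode_len (bool opcode_in_0f_map, bool vex_w,
                           bool dimode_general_reg_operand,
                           bool mem_uses_extended_reg)
{
  if (!opcode_in_0f_map || vex_w
      || dimode_general_reg_operand   /* needs REX.W */
      || mem_uses_extended_reg)       /* needs REX.X or REX.B */
    return 3 + 1;
  return 2 + 1;
}

int
main (void)
{
  /* vaddps %xmm1, %xmm2, %xmm3 -> 2-byte VEX;
     vaddps (%r9), %xmm2, %xmm3 -> 3-byte VEX (REX.B needed).  */
  printf ("%d %d\n",
          vex_prefix_and_opcode_len (true, false, false, false),
          vex_prefix_and_opcode_len (true, false, false, true));
  return 0;
}
#endif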
29467 static bool
29468 ix86_class_likely_spilled_p (reg_class_t);
29470 /* Return true if the lhs of INSN is a HW function argument register, and
29471 set IS_SPILLED to true if it is a likely-spilled HW register. */
29472 static bool
29473 insn_is_function_arg (rtx insn, bool* is_spilled)
29475 rtx dst;
29477 if (!NONDEBUG_INSN_P (insn))
29478 return false;
29479 /* Call instructions are not movable; ignore them. */
29480 if (CALL_P (insn))
29481 return false;
29482 insn = PATTERN (insn);
29483 if (GET_CODE (insn) == PARALLEL)
29484 insn = XVECEXP (insn, 0, 0);
29485 if (GET_CODE (insn) != SET)
29486 return false;
29487 dst = SET_DEST (insn);
29488 if (REG_P (dst) && HARD_REGISTER_P (dst)
29489 && ix86_function_arg_regno_p (REGNO (dst)))
29491 /* Is it likely spilled HW register? */
29492 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29493 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29494 *is_spilled = true;
29495 return true;
29497 return false;
29500 /* Add output dependencies for a chain of adjacent function arguments, but
29501 only if there is a move to a likely-spilled HW register. Return the first
29502 argument if at least one dependence was added, or NULL otherwise. */
29503 static rtx_insn *
29504 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29506 rtx_insn *insn;
29507 rtx_insn *last = call;
29508 rtx_insn *first_arg = NULL;
29509 bool is_spilled = false;
29511 head = PREV_INSN (head);
29513 /* Find the argument-passing instruction nearest to the call. */
29514 while (true)
29516 last = PREV_INSN (last);
29517 if (last == head)
29518 return NULL;
29519 if (!NONDEBUG_INSN_P (last))
29520 continue;
29521 if (insn_is_function_arg (last, &is_spilled))
29522 break;
29523 return NULL;
29526 first_arg = last;
29527 while (true)
29529 insn = PREV_INSN (last);
29530 if (!INSN_P (insn))
29531 break;
29532 if (insn == head)
29533 break;
29534 if (!NONDEBUG_INSN_P (insn))
29536 last = insn;
29537 continue;
29539 if (insn_is_function_arg (insn, &is_spilled))
29541 /* Add an output dependence between two function arguments if the chain
29542 of output arguments contains likely-spilled HW registers. */
29543 if (is_spilled)
29544 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29545 first_arg = last = insn;
29547 else
29548 break;
29550 if (!is_spilled)
29551 return NULL;
29552 return first_arg;
29555 /* Add output or anti dependency from insn to first_arg to restrict its code
29556 motion. */
29557 static void
29558 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29560 rtx set;
29561 rtx tmp;
29563 /* Add anti dependencies for bounds stores. */
29564 if (INSN_P (insn)
29565 && GET_CODE (PATTERN (insn)) == PARALLEL
29566 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29567 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29569 add_dependence (first_arg, insn, REG_DEP_ANTI);
29570 return;
29573 set = single_set (insn);
29574 if (!set)
29575 return;
29576 tmp = SET_DEST (set);
29577 if (REG_P (tmp))
29579 /* Add output dependency to the first function argument. */
29580 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29581 return;
29583 /* Add anti dependency. */
29584 add_dependence (first_arg, insn, REG_DEP_ANTI);
29587 /* Avoid cross-block motion of a function argument by adding a dependency
29588 from the first non-jump instruction in BB. */
29589 static void
29590 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29592 rtx_insn *insn = BB_END (bb);
29594 while (insn)
29596 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29598 rtx set = single_set (insn);
29599 if (set)
29601 avoid_func_arg_motion (arg, insn);
29602 return;
29605 if (insn == BB_HEAD (bb))
29606 return;
29607 insn = PREV_INSN (insn);
29611 /* Hook for pre-reload schedule - avoid motion of function arguments
29612 passed in likely spilled HW registers. */
29613 static void
29614 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29616 rtx_insn *insn;
29617 rtx_insn *first_arg = NULL;
29618 if (reload_completed)
29619 return;
29620 while (head != tail && DEBUG_INSN_P (head))
29621 head = NEXT_INSN (head);
29622 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29623 if (INSN_P (insn) && CALL_P (insn))
29625 first_arg = add_parameter_dependencies (insn, head);
29626 if (first_arg)
29628 /* Add a dependee for the first argument to predecessors, but only if the
29629 region contains more than one block. */
29630 basic_block bb = BLOCK_FOR_INSN (insn);
29631 int rgn = CONTAINING_RGN (bb->index);
29632 int nr_blks = RGN_NR_BLOCKS (rgn);
29633 /* Skip trivial regions and region head blocks that can have
29634 predecessors outside of region. */
29635 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29637 edge e;
29638 edge_iterator ei;
29640 /* Regions are SCCs with the exception of selective
29641 scheduling with pipelining of outer blocks enabled.
29642 So also check that immediate predecessors of a non-head
29643 block are in the same region. */
29644 FOR_EACH_EDGE (e, ei, bb->preds)
29646 /* Avoid creating loop-carried dependencies by using the
29647 topological ordering in the region. */
29648 if (rgn == CONTAINING_RGN (e->src->index)
29649 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29650 add_dependee_for_func_arg (first_arg, e->src);
29653 insn = first_arg;
29654 if (insn == head)
29655 break;
29658 else if (first_arg)
29659 avoid_func_arg_motion (first_arg, insn);
29662 /* Hook for pre-reload schedule - set the priority of moves from likely-spilled
29663 HW registers to the maximum, to schedule them as soon as possible. These are
29664 moves from function argument registers at the top of the function entry
29665 and moves from function return value registers after a call. */
29666 static int
29667 ix86_adjust_priority (rtx_insn *insn, int priority)
29669 rtx set;
29671 if (reload_completed)
29672 return priority;
29674 if (!NONDEBUG_INSN_P (insn))
29675 return priority;
29677 set = single_set (insn);
29678 if (set)
29680 rtx tmp = SET_SRC (set);
29681 if (REG_P (tmp)
29682 && HARD_REGISTER_P (tmp)
29683 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29684 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29685 return current_sched_info->sched_max_insns_priority;
29688 return priority;
29691 /* Prepare for scheduling pass. */
29692 static void
29693 ix86_sched_init_global (FILE *, int, int)
29695 /* Install scheduling hooks for current CPU. Some of these hooks are used
29696 in time-critical parts of the scheduler, so we only set them up when
29697 they are actually used. */
29698 switch (ix86_tune)
29700 case PROCESSOR_CORE2:
29701 case PROCESSOR_NEHALEM:
29702 case PROCESSOR_SANDYBRIDGE:
29703 case PROCESSOR_HASWELL:
29704 case PROCESSOR_GENERIC:
29705 /* Do not perform multipass scheduling for pre-reload schedule
29706 to save compile time. */
29707 if (reload_completed)
29709 ix86_core2i7_init_hooks ();
29710 break;
29712 /* Fall through. */
29713 default:
29714 targetm.sched.dfa_post_advance_cycle = NULL;
29715 targetm.sched.first_cycle_multipass_init = NULL;
29716 targetm.sched.first_cycle_multipass_begin = NULL;
29717 targetm.sched.first_cycle_multipass_issue = NULL;
29718 targetm.sched.first_cycle_multipass_backtrack = NULL;
29719 targetm.sched.first_cycle_multipass_end = NULL;
29720 targetm.sched.first_cycle_multipass_fini = NULL;
29721 break;
29726 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
29728 static HOST_WIDE_INT
29729 ix86_static_rtx_alignment (machine_mode mode)
29731 if (mode == DFmode)
29732 return 64;
29733 if (ALIGN_MODE_128 (mode))
29734 return MAX (128, GET_MODE_ALIGNMENT (mode));
29735 return GET_MODE_ALIGNMENT (mode);
29738 /* Implement TARGET_CONSTANT_ALIGNMENT. */
29740 static HOST_WIDE_INT
29741 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
29743 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
29744 || TREE_CODE (exp) == INTEGER_CST)
29746 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
29747 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
29748 return MAX (mode_align, align);
29750 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
29751 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
29752 return BITS_PER_WORD;
29754 return align;
29757 /* Implement TARGET_EMPTY_RECORD_P. */
29759 static bool
29760 ix86_is_empty_record (const_tree type)
29762 if (!TARGET_64BIT)
29763 return false;
29764 return default_is_empty_record (type);
29767 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
29769 static void
29770 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
29772 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
29774 if (!cum->warn_empty)
29775 return;
29777 if (!TYPE_EMPTY_P (type))
29778 return;
29780 const_tree ctx = get_ultimate_context (cum->decl);
29781 if (ctx != NULL_TREE
29782 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
29783 return;
29785 /* If the actual size of the type is zero, then there is no change
29786 in how objects of this size are passed. */
29787 if (int_size_in_bytes (type) == 0)
29788 return;
29790 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
29791 "changes in -fabi-version=12 (GCC 8)", type);
29793 /* Only warn once. */
29794 cum->warn_empty = false;
29797 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
29798 the data type, and ALIGN is the alignment that the object would
29799 ordinarily have. */
29801 static int
29802 iamcu_alignment (tree type, int align)
29804 machine_mode mode;
29806 if (align < 32 || TYPE_USER_ALIGN (type))
29807 return align;
29809 /* The Intel MCU psABI specifies that scalar types larger than 4 bytes are
29810 aligned to only 4 bytes. */
29811 mode = TYPE_MODE (strip_array_types (type));
29812 switch (GET_MODE_CLASS (mode))
29814 case MODE_INT:
29815 case MODE_COMPLEX_INT:
29816 case MODE_COMPLEX_FLOAT:
29817 case MODE_FLOAT:
29818 case MODE_DECIMAL_FLOAT:
29819 return 32;
29820 default:
29821 return align;
29825 /* Compute the alignment for a static variable.
29826 TYPE is the data type, and ALIGN is the alignment that
29827 the object would ordinarily have. The value of this function is used
29828 instead of that alignment to align the object. */
29831 ix86_data_alignment (tree type, int align, bool opt)
29833 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
29834 for symbols from other compilation units or symbols that don't need
29835 to bind locally. In order to preserve some ABI compatibility with
29836 those compilers, ensure we don't decrease alignment from what we
29837 used to assume. */
29839 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
29841 /* A data structure equal to or greater than the size of a cache line
29842 (64 bytes in the Pentium 4 and other recent Intel processors, including
29843 processors based on the Intel Core microarchitecture) should be aligned
29844 so that its base address is a multiple of the cache line size. */
29846 int max_align
29847 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
29849 if (max_align < BITS_PER_WORD)
29850 max_align = BITS_PER_WORD;
29852 switch (ix86_align_data_type)
29854 case ix86_align_data_type_abi: opt = false; break;
29855 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
29856 case ix86_align_data_type_cacheline: break;
29859 if (TARGET_IAMCU)
29860 align = iamcu_alignment (type, align);
29862 if (opt
29863 && AGGREGATE_TYPE_P (type)
29864 && TYPE_SIZE (type)
29865 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
29867 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
29868 && align < max_align_compat)
29869 align = max_align_compat;
29870 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
29871 && align < max_align)
29872 align = max_align;
29875 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
29876 to a 16byte boundary. */
29877 if (TARGET_64BIT)
29879 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
29880 && TYPE_SIZE (type)
29881 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29882 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29883 && align < 128)
29884 return 128;
29887 if (!opt)
29888 return align;
29890 if (TREE_CODE (type) == ARRAY_TYPE)
29892 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29893 return 64;
29894 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29895 return 128;
29897 else if (TREE_CODE (type) == COMPLEX_TYPE)
29900 if (TYPE_MODE (type) == DCmode && align < 64)
29901 return 64;
29902 if ((TYPE_MODE (type) == XCmode
29903 || TYPE_MODE (type) == TCmode) && align < 128)
29904 return 128;
29906 else if ((TREE_CODE (type) == RECORD_TYPE
29907 || TREE_CODE (type) == UNION_TYPE
29908 || TREE_CODE (type) == QUAL_UNION_TYPE)
29909 && TYPE_FIELDS (type))
29911 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29912 return 64;
29913 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29914 return 128;
29916 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29917 || TREE_CODE (type) == INTEGER_TYPE)
29919 if (TYPE_MODE (type) == DFmode && align < 64)
29920 return 64;
29921 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29922 return 128;
29925 return align;
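/* A standalone, simplified model -- not part of this file -- of the size
   thresholds applied above, assuming a 64-byte cache line (the real code
   takes it from ix86_tune_cost->prefetch_block and also honors
   MAX_OFILE_ALIGNMENT and -malign-data, all omitted here).  Alignments and
   sizes are in bits, as in ix86_data_alignment.  */
#if 0
#include <stdio.h>

#define CACHE_LINE_BITS (64 * 8)

static int
data_align_bits (long long size_bits, int align_bits, int opt)
{
  if (size_bits >= 128 && align_bits < 128)
    align_bits = 128;                  /* x86-64: aggregates of >= 16 bytes */
  if (opt)
    {
      if (size_bits >= 256 && align_bits < 256)
        align_bits = 256;              /* GCC <= 4.8 compatibility bump */
      if (size_bits >= CACHE_LINE_BITS && align_bits < CACHE_LINE_BITS)
        align_bits = CACHE_LINE_BITS;  /* cache-line align large objects */
    }
  return align_bits;
}

int
main (void)
{
  /* 8-byte, 24-byte and 256-byte aggregates starting at 32-bit alignment.  */
  printf ("%d %d %d\n",
          data_align_bits (8 * 8, 32, 1),     /* 32  (unchanged)  */
          data_align_bits (24 * 8, 32, 1),    /* 128 (ABI rule)   */
          data_align_bits (256 * 8, 32, 1));  /* 512 (cache line) */
  return 0;
}
#endif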
29928 /* Compute the alignment for a local variable or a stack slot. EXP is
29929 the data type or decl itself, MODE is the widest mode available and
29930 ALIGN is the alignment that the object would ordinarily have. The
29931 value of this macro is used instead of that alignment to align the
29932 object. */
29934 unsigned int
29935 ix86_local_alignment (tree exp, machine_mode mode,
29936 unsigned int align)
29938 tree type, decl;
29940 if (exp && DECL_P (exp))
29942 type = TREE_TYPE (exp);
29943 decl = exp;
29945 else
29947 type = exp;
29948 decl = NULL;
29951 /* Don't do dynamic stack realignment for long long objects with
29952 -mpreferred-stack-boundary=2. */
29953 if (!TARGET_64BIT
29954 && align == 64
29955 && ix86_preferred_stack_boundary < 64
29956 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
29957 && (!type || !TYPE_USER_ALIGN (type))
29958 && (!decl || !DECL_USER_ALIGN (decl)))
29959 align = 32;
29961 /* If TYPE is NULL, we are allocating a stack slot for caller-save
29962 register in MODE. We will return the largest alignment of XF
29963 and DF. */
29964 if (!type)
29966 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
29967 align = GET_MODE_ALIGNMENT (DFmode);
29968 return align;
29971 /* Don't increase alignment for Intel MCU psABI. */
29972 if (TARGET_IAMCU)
29973 return align;
29975 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
29976 to a 16byte boundary. The exact wording is:
29978 An array uses the same alignment as its elements, except that a local or
29979 global array variable of length at least 16 bytes or
29980 a C99 variable-length array variable always has alignment of at least 16 bytes.
29982 This was added to allow use of aligned SSE instructions on arrays. The
29983 rule is meant for static storage (where the compiler cannot do the analysis
29984 by itself). We follow it for automatic variables only when convenient:
29985 we fully control everything in the function being compiled, and functions
29986 from other units cannot rely on the alignment.
29988 Exclude the va_list type. It is the common case of a local array where
29989 we cannot benefit from the alignment.
29991 TODO: Probably one should optimize for size only when var is not escaping. */
29992 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
29993 && TARGET_SSE)
29995 if (AGGREGATE_TYPE_P (type)
29996 && (va_list_type_node == NULL_TREE
29997 || (TYPE_MAIN_VARIANT (type)
29998 != TYPE_MAIN_VARIANT (va_list_type_node)))
29999 && TYPE_SIZE (type)
30000 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30001 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30002 && align < 128)
30003 return 128;
30005 if (TREE_CODE (type) == ARRAY_TYPE)
30007 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30008 return 64;
30009 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30010 return 128;
30012 else if (TREE_CODE (type) == COMPLEX_TYPE)
30014 if (TYPE_MODE (type) == DCmode && align < 64)
30015 return 64;
30016 if ((TYPE_MODE (type) == XCmode
30017 || TYPE_MODE (type) == TCmode) && align < 128)
30018 return 128;
30020 else if ((TREE_CODE (type) == RECORD_TYPE
30021 || TREE_CODE (type) == UNION_TYPE
30022 || TREE_CODE (type) == QUAL_UNION_TYPE)
30023 && TYPE_FIELDS (type))
30025 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30026 return 64;
30027 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30028 return 128;
30030 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30031 || TREE_CODE (type) == INTEGER_TYPE)
30034 if (TYPE_MODE (type) == DFmode && align < 64)
30035 return 64;
30036 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30037 return 128;
30039 return align;
30042 /* Compute the minimum required alignment for dynamic stack realignment
30043 purposes for a local variable, parameter or a stack slot. EXP is
30044 the data type or decl itself, MODE is its mode and ALIGN is the
30045 alignment that the object would ordinarily have. */
30047 unsigned int
30048 ix86_minimum_alignment (tree exp, machine_mode mode,
30049 unsigned int align)
30051 tree type, decl;
30053 if (exp && DECL_P (exp))
30055 type = TREE_TYPE (exp);
30056 decl = exp;
30058 else
30060 type = exp;
30061 decl = NULL;
30064 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30065 return align;
30067 /* Don't do dynamic stack realignment for long long objects with
30068 -mpreferred-stack-boundary=2. */
30069 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30070 && (!type || !TYPE_USER_ALIGN (type))
30071 && (!decl || !DECL_USER_ALIGN (decl)))
30073 gcc_checking_assert (!TARGET_STV);
30074 return 32;
30077 return align;
30080 /* Find a location for the static chain incoming to a nested function.
30081 This is a register, unless all free registers are used by arguments. */
30083 static rtx
30084 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30086 unsigned regno;
30088 if (TARGET_64BIT)
30090 /* We always use R10 in 64-bit mode. */
30091 regno = R10_REG;
30093 else
30095 const_tree fntype, fndecl;
30096 unsigned int ccvt;
30098 /* By default in 32-bit mode we use ECX to pass the static chain. */
30099 regno = CX_REG;
30101 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30103 fntype = TREE_TYPE (fndecl_or_type);
30104 fndecl = fndecl_or_type;
30106 else
30108 fntype = fndecl_or_type;
30109 fndecl = NULL;
30112 ccvt = ix86_get_callcvt (fntype);
30113 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30115 /* Fastcall functions use ecx/edx for arguments, which leaves
30116 us with EAX for the static chain.
30117 Thiscall functions use ecx for arguments, which also
30118 leaves us with EAX for the static chain. */
30119 regno = AX_REG;
30121 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30123 /* Thiscall functions use ecx for arguments, which leaves
30124 us with EAX and EDX for the static chain.
30125 We use EAX for ABI compatibility. */
30126 regno = AX_REG;
30128 else if (ix86_function_regparm (fntype, fndecl) == 3)
30130 /* For regparm 3, we have no free call-clobbered registers in
30131 which to store the static chain. In order to implement this,
30132 we have the trampoline push the static chain to the stack.
30133 However, we can't push a value below the return address when
30134 we call the nested function directly, so we have to use an
30135 alternate entry point. For this we use ESI, and have the
30136 alternate entry point push ESI, so that things appear the
30137 same once we're executing the nested function. */
30138 if (incoming_p)
30140 if (fndecl == current_function_decl
30141 && !ix86_static_chain_on_stack)
30143 gcc_assert (!reload_completed);
30144 ix86_static_chain_on_stack = true;
30146 return gen_frame_mem (SImode,
30147 plus_constant (Pmode,
30148 arg_pointer_rtx, -8));
30150 regno = SI_REG;
30154 return gen_rtx_REG (Pmode, regno);
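/* A standalone sketch -- not part of this file -- of the 32-bit static-chain
   choice described above, reduced to a lookup (64-bit code always uses
   %r10).  The calling-convention names are illustrative stand-ins for the
   IX86_CALLCVT_* and regparm checks in the real function.  */
#if 0
#include <stdio.h>

enum callcvt { CVT_DEFAULT, CVT_FASTCALL, CVT_THISCALL, CVT_REGPARM3 };

static const char *
static_chain_loc (enum callcvt cvt)
{
  switch (cvt)
    {
    case CVT_FASTCALL:    /* ecx/edx carry arguments */
    case CVT_THISCALL:    /* ecx carries `this' */
      return "%eax";
    case CVT_REGPARM3:    /* eax/edx/ecx all carry arguments */
      return "stack slot (via the %esi alternate entry point)";
    default:
      return "%ecx";
    }
}

int
main (void)
{
  printf ("%s\n", static_chain_loc (CVT_FASTCALL));
  return 0;
}
#endif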
30157 /* Emit RTL insns to initialize the variable parts of a trampoline.
30158 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30159 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30160 to be passed to the target function. */
30162 static void
30163 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30165 rtx mem, fnaddr;
30166 int opcode;
30167 int offset = 0;
30169 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30171 if (TARGET_64BIT)
30173 int size;
30175 /* Load the function address into r11. Try to load the address using
30176 the shorter movl instead of movabs. We may want to support
30177 movq for kernel mode, but the kernel does not use trampolines at
30178 the moment. FNADDR is a 32bit address and may not be in
30179 DImode when ptr_mode == SImode. Always use movl in this
30180 case. */
30181 if (ptr_mode == SImode
30182 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30184 fnaddr = copy_addr_to_reg (fnaddr);
30186 mem = adjust_address (m_tramp, HImode, offset);
30187 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30189 mem = adjust_address (m_tramp, SImode, offset + 2);
30190 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30191 offset += 6;
30193 else
30195 mem = adjust_address (m_tramp, HImode, offset);
30196 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30198 mem = adjust_address (m_tramp, DImode, offset + 2);
30199 emit_move_insn (mem, fnaddr);
30200 offset += 10;
30203 /* Load static chain using movabs to r10. Use the shorter movl
30204 instead of movabs when ptr_mode == SImode. */
30205 if (ptr_mode == SImode)
30207 opcode = 0xba41;
30208 size = 6;
30210 else
30212 opcode = 0xba49;
30213 size = 10;
30216 mem = adjust_address (m_tramp, HImode, offset);
30217 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30219 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30220 emit_move_insn (mem, chain_value);
30221 offset += size;
30223 /* Jump to r11; the last (unused) byte is a nop, only there to
30224 pad the write out to a single 32-bit store. */
30225 mem = adjust_address (m_tramp, SImode, offset);
30226 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30227 offset += 4;
30229 else
30231 rtx disp, chain;
30233 /* Depending on the static chain location, either load a register
30234 with a constant, or push the constant to the stack. All of the
30235 instructions are the same size. */
30236 chain = ix86_static_chain (fndecl, true);
30237 if (REG_P (chain))
30239 switch (REGNO (chain))
30241 case AX_REG:
30242 opcode = 0xb8; break;
30243 case CX_REG:
30244 opcode = 0xb9; break;
30245 default:
30246 gcc_unreachable ();
30249 else
30250 opcode = 0x68;
30252 mem = adjust_address (m_tramp, QImode, offset);
30253 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30255 mem = adjust_address (m_tramp, SImode, offset + 1);
30256 emit_move_insn (mem, chain_value);
30257 offset += 5;
30259 mem = adjust_address (m_tramp, QImode, offset);
30260 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30262 mem = adjust_address (m_tramp, SImode, offset + 1);
30264 /* Compute offset from the end of the jmp to the target function.
30265 In the case in which the trampoline stores the static chain on
30266 the stack, we need to skip the first insn which pushes the
30267 (call-saved) register static chain; this push is 1 byte. */
30268 offset += 5;
30269 disp = expand_binop (SImode, sub_optab, fnaddr,
30270 plus_constant (Pmode, XEXP (m_tramp, 0),
30271 offset - (MEM_P (chain) ? 1 : 0)),
30272 NULL_RTX, 1, OPTAB_DIRECT);
30273 emit_move_insn (mem, disp);
30276 gcc_assert (offset <= TRAMPOLINE_SIZE);
30278 #ifdef HAVE_ENABLE_EXECUTE_STACK
30279 #ifdef CHECK_EXECUTE_STACK_ENABLED
30280 if (CHECK_EXECUTE_STACK_ENABLED)
30281 #endif
30282 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30283 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
30284 #endif
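/* A standalone sketch -- not part of this file -- of the 64-bit trampoline
   bytes written above when the target address needs the full movabs form
   (the real code also has shorter movl variants and the ptr_mode == SImode
   case, omitted here).  The layout matches the stores above:
     49 bb <imm64>   movabs $fnaddr, %r11
     49 ba <imm64>   movabs $chain,  %r10
     49 ff e3 90     jmpq *%r11 ; nop pad to a 4-byte store
   The memcpy of the immediates assumes a little-endian host, which is the
   only case that matters for x86 trampolines.  */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
build_tramp64 (unsigned char *buf, uint64_t fnaddr, uint64_t chain)
{
  int off = 0;

  buf[off++] = 0x49; buf[off++] = 0xbb;           /* movabs $fnaddr, %r11 */
  memcpy (buf + off, &fnaddr, 8); off += 8;

  buf[off++] = 0x49; buf[off++] = 0xba;           /* movabs $chain, %r10 */
  memcpy (buf + off, &chain, 8); off += 8;

  buf[off++] = 0x49; buf[off++] = 0xff;           /* jmpq *%r11 */
  buf[off++] = 0xe3; buf[off++] = 0x90;           /* nop pad */

  return off;                                     /* 24 bytes total */
}

int
main (void)
{
  unsigned char buf[32];
  int n = build_tramp64 (buf, 0x1122334455667788ULL, 0xdeadbeefULL);
  for (int i = 0; i < n; i++)
    printf ("%02x ", buf[i]);
  printf ("\n");
  return 0;
}
#endif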
30287 static bool
30288 ix86_allocate_stack_slots_for_args (void)
30290 /* Naked functions should not allocate stack slots for arguments. */
30291 return !ix86_function_naked (current_function_decl);
30294 static bool
30295 ix86_warn_func_return (tree decl)
30297 /* Naked functions are implemented entirely in assembly, including the
30298 return sequence, so suppress warnings about this. */
30299 return !ix86_function_naked (decl);
30302 /* The following file contains several enumerations and data structures
30303 built from the definitions in i386-builtin-types.def. */
30305 #include "i386-builtin-types.inc"
30307 /* Table for the ix86 builtin non-function types. */
30308 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30310 /* Retrieve an element from the above table, building some of
30311 the types lazily. */
30313 static tree
30314 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30316 unsigned int index;
30317 tree type, itype;
30319 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30321 type = ix86_builtin_type_tab[(int) tcode];
30322 if (type != NULL)
30323 return type;
30325 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30326 if (tcode <= IX86_BT_LAST_VECT)
30328 machine_mode mode;
30330 index = tcode - IX86_BT_LAST_PRIM - 1;
30331 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30332 mode = ix86_builtin_type_vect_mode[index];
30334 type = build_vector_type_for_mode (itype, mode);
30336 else
30338 int quals;
30340 index = tcode - IX86_BT_LAST_VECT - 1;
30341 if (tcode <= IX86_BT_LAST_PTR)
30342 quals = TYPE_UNQUALIFIED;
30343 else
30344 quals = TYPE_QUAL_CONST;
30346 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30347 if (quals != TYPE_UNQUALIFIED)
30348 itype = build_qualified_type (itype, quals);
30350 type = build_pointer_type (itype);
30353 ix86_builtin_type_tab[(int) tcode] = type;
30354 return type;
30357 /* Table for the ix86 builtin function types. */
30358 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30360 /* Retrieve an element from the above table, building some of
30361 the types lazily. */
30363 static tree
30364 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30366 tree type;
30368 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30370 type = ix86_builtin_func_type_tab[(int) tcode];
30371 if (type != NULL)
30372 return type;
30374 if (tcode <= IX86_BT_LAST_FUNC)
30376 unsigned start = ix86_builtin_func_start[(int) tcode];
30377 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30378 tree rtype, atype, args = void_list_node;
30379 unsigned i;
30381 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30382 for (i = after - 1; i > start; --i)
30384 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30385 args = tree_cons (NULL, atype, args);
30388 type = build_function_type (rtype, args);
30390 else
30392 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30393 enum ix86_builtin_func_type icode;
30395 icode = ix86_builtin_func_alias_base[index];
30396 type = ix86_get_builtin_func_type (icode);
30399 ix86_builtin_func_type_tab[(int) tcode] = type;
30400 return type;
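/* A standalone illustration -- not part of this file -- of the flattened
   table scheme used above, with type codes replaced by strings: for function
   type I, the entries [start[I], start[I + 1]) hold the return type followed
   by the argument types.  (The real code walks the arguments in reverse to
   build a TREE_LIST terminated by void_list_node.)  */
#if 0
#include <stdio.h>

static const char *const type_codes[] = { "int", "float", "float",
                                          "void", "int" };
static const unsigned start[] = { 0, 3, 5 };    /* two function types */

static void
print_func_type (int i)
{
  unsigned s = start[i], e = start[i + 1];

  printf ("%s (", type_codes[s]);
  for (unsigned j = s + 1; j < e; j++)
    printf ("%s%s", type_codes[j], j + 1 < e ? ", " : "");
  printf (")\n");
}

int
main (void)
{
  print_func_type (0);          /* int (float, float) */
  print_func_type (1);          /* void (int) */
  return 0;
}
#endif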
30404 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30405 bdesc_* arrays below should come first, then builtins for each bdesc_*
30406 array in ascending order, so that we can use direct array accesses. */
30407 enum ix86_builtins
30409 IX86_BUILTIN_MASKMOVQ,
30410 IX86_BUILTIN_LDMXCSR,
30411 IX86_BUILTIN_STMXCSR,
30412 IX86_BUILTIN_MASKMOVDQU,
30413 IX86_BUILTIN_PSLLDQ128,
30414 IX86_BUILTIN_CLFLUSH,
30415 IX86_BUILTIN_MONITOR,
30416 IX86_BUILTIN_MWAIT,
30417 IX86_BUILTIN_CLZERO,
30418 IX86_BUILTIN_VEC_INIT_V2SI,
30419 IX86_BUILTIN_VEC_INIT_V4HI,
30420 IX86_BUILTIN_VEC_INIT_V8QI,
30421 IX86_BUILTIN_VEC_EXT_V2DF,
30422 IX86_BUILTIN_VEC_EXT_V2DI,
30423 IX86_BUILTIN_VEC_EXT_V4SF,
30424 IX86_BUILTIN_VEC_EXT_V4SI,
30425 IX86_BUILTIN_VEC_EXT_V8HI,
30426 IX86_BUILTIN_VEC_EXT_V2SI,
30427 IX86_BUILTIN_VEC_EXT_V4HI,
30428 IX86_BUILTIN_VEC_EXT_V16QI,
30429 IX86_BUILTIN_VEC_SET_V2DI,
30430 IX86_BUILTIN_VEC_SET_V4SF,
30431 IX86_BUILTIN_VEC_SET_V4SI,
30432 IX86_BUILTIN_VEC_SET_V8HI,
30433 IX86_BUILTIN_VEC_SET_V4HI,
30434 IX86_BUILTIN_VEC_SET_V16QI,
30435 IX86_BUILTIN_GATHERSIV2DF,
30436 IX86_BUILTIN_GATHERSIV4DF,
30437 IX86_BUILTIN_GATHERDIV2DF,
30438 IX86_BUILTIN_GATHERDIV4DF,
30439 IX86_BUILTIN_GATHERSIV4SF,
30440 IX86_BUILTIN_GATHERSIV8SF,
30441 IX86_BUILTIN_GATHERDIV4SF,
30442 IX86_BUILTIN_GATHERDIV8SF,
30443 IX86_BUILTIN_GATHERSIV2DI,
30444 IX86_BUILTIN_GATHERSIV4DI,
30445 IX86_BUILTIN_GATHERDIV2DI,
30446 IX86_BUILTIN_GATHERDIV4DI,
30447 IX86_BUILTIN_GATHERSIV4SI,
30448 IX86_BUILTIN_GATHERSIV8SI,
30449 IX86_BUILTIN_GATHERDIV4SI,
30450 IX86_BUILTIN_GATHERDIV8SI,
30451 IX86_BUILTIN_VFMSUBSD3_MASK3,
30452 IX86_BUILTIN_VFMSUBSS3_MASK3,
30453 IX86_BUILTIN_GATHER3SIV8SF,
30454 IX86_BUILTIN_GATHER3SIV4SF,
30455 IX86_BUILTIN_GATHER3SIV4DF,
30456 IX86_BUILTIN_GATHER3SIV2DF,
30457 IX86_BUILTIN_GATHER3DIV8SF,
30458 IX86_BUILTIN_GATHER3DIV4SF,
30459 IX86_BUILTIN_GATHER3DIV4DF,
30460 IX86_BUILTIN_GATHER3DIV2DF,
30461 IX86_BUILTIN_GATHER3SIV8SI,
30462 IX86_BUILTIN_GATHER3SIV4SI,
30463 IX86_BUILTIN_GATHER3SIV4DI,
30464 IX86_BUILTIN_GATHER3SIV2DI,
30465 IX86_BUILTIN_GATHER3DIV8SI,
30466 IX86_BUILTIN_GATHER3DIV4SI,
30467 IX86_BUILTIN_GATHER3DIV4DI,
30468 IX86_BUILTIN_GATHER3DIV2DI,
30469 IX86_BUILTIN_SCATTERSIV8SF,
30470 IX86_BUILTIN_SCATTERSIV4SF,
30471 IX86_BUILTIN_SCATTERSIV4DF,
30472 IX86_BUILTIN_SCATTERSIV2DF,
30473 IX86_BUILTIN_SCATTERDIV8SF,
30474 IX86_BUILTIN_SCATTERDIV4SF,
30475 IX86_BUILTIN_SCATTERDIV4DF,
30476 IX86_BUILTIN_SCATTERDIV2DF,
30477 IX86_BUILTIN_SCATTERSIV8SI,
30478 IX86_BUILTIN_SCATTERSIV4SI,
30479 IX86_BUILTIN_SCATTERSIV4DI,
30480 IX86_BUILTIN_SCATTERSIV2DI,
30481 IX86_BUILTIN_SCATTERDIV8SI,
30482 IX86_BUILTIN_SCATTERDIV4SI,
30483 IX86_BUILTIN_SCATTERDIV4DI,
30484 IX86_BUILTIN_SCATTERDIV2DI,
30485 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30486 where all operands are 32-byte or 64-byte wide respectively. */
30487 IX86_BUILTIN_GATHERALTSIV4DF,
30488 IX86_BUILTIN_GATHERALTDIV8SF,
30489 IX86_BUILTIN_GATHERALTSIV4DI,
30490 IX86_BUILTIN_GATHERALTDIV8SI,
30491 IX86_BUILTIN_GATHER3ALTDIV16SF,
30492 IX86_BUILTIN_GATHER3ALTDIV16SI,
30493 IX86_BUILTIN_GATHER3ALTSIV4DF,
30494 IX86_BUILTIN_GATHER3ALTDIV8SF,
30495 IX86_BUILTIN_GATHER3ALTSIV4DI,
30496 IX86_BUILTIN_GATHER3ALTDIV8SI,
30497 IX86_BUILTIN_GATHER3ALTSIV8DF,
30498 IX86_BUILTIN_GATHER3ALTSIV8DI,
30499 IX86_BUILTIN_GATHER3DIV16SF,
30500 IX86_BUILTIN_GATHER3DIV16SI,
30501 IX86_BUILTIN_GATHER3DIV8DF,
30502 IX86_BUILTIN_GATHER3DIV8DI,
30503 IX86_BUILTIN_GATHER3SIV16SF,
30504 IX86_BUILTIN_GATHER3SIV16SI,
30505 IX86_BUILTIN_GATHER3SIV8DF,
30506 IX86_BUILTIN_GATHER3SIV8DI,
30507 IX86_BUILTIN_SCATTERALTSIV8DF,
30508 IX86_BUILTIN_SCATTERALTDIV16SF,
30509 IX86_BUILTIN_SCATTERALTSIV8DI,
30510 IX86_BUILTIN_SCATTERALTDIV16SI,
30511 IX86_BUILTIN_SCATTERDIV16SF,
30512 IX86_BUILTIN_SCATTERDIV16SI,
30513 IX86_BUILTIN_SCATTERDIV8DF,
30514 IX86_BUILTIN_SCATTERDIV8DI,
30515 IX86_BUILTIN_SCATTERSIV16SF,
30516 IX86_BUILTIN_SCATTERSIV16SI,
30517 IX86_BUILTIN_SCATTERSIV8DF,
30518 IX86_BUILTIN_SCATTERSIV8DI,
30519 IX86_BUILTIN_GATHERPFQPD,
30520 IX86_BUILTIN_GATHERPFDPS,
30521 IX86_BUILTIN_GATHERPFDPD,
30522 IX86_BUILTIN_GATHERPFQPS,
30523 IX86_BUILTIN_SCATTERPFDPD,
30524 IX86_BUILTIN_SCATTERPFDPS,
30525 IX86_BUILTIN_SCATTERPFQPD,
30526 IX86_BUILTIN_SCATTERPFQPS,
30527 IX86_BUILTIN_CLWB,
30528 IX86_BUILTIN_CLFLUSHOPT,
30529 IX86_BUILTIN_INFQ,
30530 IX86_BUILTIN_HUGE_VALQ,
30531 IX86_BUILTIN_NANQ,
30532 IX86_BUILTIN_NANSQ,
30533 IX86_BUILTIN_XABORT,
30534 IX86_BUILTIN_ADDCARRYX32,
30535 IX86_BUILTIN_ADDCARRYX64,
30536 IX86_BUILTIN_SBB32,
30537 IX86_BUILTIN_SBB64,
30538 IX86_BUILTIN_RDRAND16_STEP,
30539 IX86_BUILTIN_RDRAND32_STEP,
30540 IX86_BUILTIN_RDRAND64_STEP,
30541 IX86_BUILTIN_RDSEED16_STEP,
30542 IX86_BUILTIN_RDSEED32_STEP,
30543 IX86_BUILTIN_RDSEED64_STEP,
30544 IX86_BUILTIN_MONITORX,
30545 IX86_BUILTIN_MWAITX,
30546 IX86_BUILTIN_CFSTRING,
30547 IX86_BUILTIN_CPU_INIT,
30548 IX86_BUILTIN_CPU_IS,
30549 IX86_BUILTIN_CPU_SUPPORTS,
30550 IX86_BUILTIN_READ_FLAGS,
30551 IX86_BUILTIN_WRITE_FLAGS,
30553 /* All the remaining builtins are tracked in bdesc_* arrays in
30554 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30555 this point. */
30556 #define BDESC(mask, icode, name, code, comparison, flag) \
30557 code,
30558 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30559 code, \
30560 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30561 #define BDESC_END(kind, next_kind)
30563 #include "i386-builtin.def"
30565 #undef BDESC
30566 #undef BDESC_FIRST
30567 #undef BDESC_END
30569 IX86_BUILTIN_MAX,
30571 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30573 /* Now just the aliases for bdesc_* start/end. */
30574 #define BDESC(mask, icode, name, code, comparison, flag)
30575 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30576 #define BDESC_END(kind, next_kind) \
30577 IX86_BUILTIN__BDESC_##kind##_LAST \
30578 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30580 #include "i386-builtin.def"
30582 #undef BDESC
30583 #undef BDESC_FIRST
30584 #undef BDESC_END
30586 /* Just to make sure there is no comma after the last enumerator. */
30587 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
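/* A standalone illustration -- not part of this file -- of the X-macro
   pattern used above: the same list of builtin descriptors is expanded
   twice with different definitions of the BDESC* macros, first to produce
   the enumerators (and the *_FIRST aliases), then -- after redefining the
   macros -- to produce the *_LAST range aliases.  Here the contents of the
   .def file are inlined as MY_DEF for brevity, and only the first expansion
   is shown.  */
#if 0
#include <stdio.h>

#define MY_DEF \
  BDESC_FIRST (special, SPECIAL, MY_BUILTIN_FOO) \
  BDESC (MY_BUILTIN_BAR) \
  BDESC (MY_BUILTIN_BAZ)

enum my_builtins
{
#define BDESC(code) code,
#define BDESC_FIRST(kind, kindu, code) code, MY__BDESC_##kindu##_FIRST = code,
  MY_DEF
#undef BDESC
#undef BDESC_FIRST
  MY_BUILTIN_MAX
};

int
main (void)
{
  /* FOO = 0, SPECIAL_FIRST = 0, BAR = 1, BAZ = 2, MAX = 3.  */
  printf ("first=%d max=%d\n",
          (int) MY__BDESC_SPECIAL_FIRST, (int) MY_BUILTIN_MAX);
  return 0;
}
#endif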
30590 /* Table for the ix86 builtin decls. */
30591 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30593 /* Table of all of the builtin functions that are possible with different ISAs
30594 but are waiting to be built until a function is declared to use that
30595 ISA. */
30596 struct builtin_isa {
30597 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30598 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
30599 const char *name; /* function name */
30600 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30601 unsigned char const_p:1; /* true if the declaration is constant */
30602 unsigned char pure_p:1; /* true if the declaration has pure attribute */
30603 bool leaf_p; /* true if the declaration has leaf attribute */
30604 bool nothrow_p; /* true if the declaration has nothrow attribute */
30605 bool set_and_not_built_p;
30608 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30610 /* Bits that can still enable any inclusion of a builtin. */
30611 static HOST_WIDE_INT deferred_isa_values = 0;
30612 static HOST_WIDE_INT deferred_isa_values2 = 0;
30614 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30615 of which isa_flags to use in the ix86_builtins_isa array. Store the
30616 function decl in the ix86_builtins array. Return the function decl, or
30617 NULL_TREE if the builtin was not added.
30619 If the front end has a special hook for builtin functions, delay adding
30620 builtin functions that aren't in the current ISA until the ISA is changed
30621 with function-specific optimization. Doing so can save about 300K for the
30622 default compiler. When the builtin is expanded, check at that time whether
30623 it is valid.
30625 If the front end doesn't have a special hook, record all builtins, even
30626 those not in the current ISA, in case the user uses function-specific
30627 options for a different ISA, so that we don't get scope
30628 errors if a builtin is added in the middle of a function scope. */
30630 static inline tree
30631 def_builtin (HOST_WIDE_INT mask, const char *name,
30632 enum ix86_builtin_func_type tcode,
30633 enum ix86_builtins code)
30635 tree decl = NULL_TREE;
30637 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30639 ix86_builtins_isa[(int) code].isa = mask;
30641 mask &= ~OPTION_MASK_ISA_64BIT;
30643 /* Filter out the masks most often ORed together with others. */
30644 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30645 && mask != OPTION_MASK_ISA_AVX512VL)
30646 mask &= ~OPTION_MASK_ISA_AVX512VL;
30647 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
30648 && mask != OPTION_MASK_ISA_AVX512BW)
30649 mask &= ~OPTION_MASK_ISA_AVX512BW;
30651 if (mask == 0
30652 || (mask & ix86_isa_flags) != 0
30653 || (lang_hooks.builtin_function
30654 == lang_hooks.builtin_function_ext_scope))
30656 tree type = ix86_get_builtin_func_type (tcode);
30657 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30658 NULL, NULL_TREE);
30659 ix86_builtins[(int) code] = decl;
30660 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30662 else
30664 /* Just a MASK where set_and_not_built_p == true can potentially
30665 include a builtin. */
30666 deferred_isa_values |= mask;
30667 ix86_builtins[(int) code] = NULL_TREE;
30668 ix86_builtins_isa[(int) code].tcode = tcode;
30669 ix86_builtins_isa[(int) code].name = name;
30670 ix86_builtins_isa[(int) code].leaf_p = false;
30671 ix86_builtins_isa[(int) code].nothrow_p = false;
30672 ix86_builtins_isa[(int) code].const_p = false;
30673 ix86_builtins_isa[(int) code].pure_p = false;
30674 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30678 return decl;
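/* A standalone sketch -- not part of this file -- of the defer-until-needed
   registration described in the comment above def_builtin: a builtin whose
   ISA mask is not currently enabled is only recorded, and its mask is ORed
   into a deferred set so that ix86_add_new_builtins can create the decl
   later, when a target attribute or pragma turns the ISA on.  The names and
   masks below are illustrative only.  */
#if 0
#include <stdio.h>

#define ISA_SSE2 (1u << 0)
#define ISA_AVX2 (1u << 1)

struct entry { const char *name; unsigned isa; int built; };

static unsigned enabled_isa = ISA_SSE2;
static unsigned deferred_isa;

static struct entry table[] = {
  { "__builtin_demo_a", ISA_SSE2, 0 },
  { "__builtin_demo_b", ISA_AVX2, 0 },
};

static void
def_one (struct entry *e)
{
  if (e->isa == 0 || (e->isa & enabled_isa) != 0)
    e->built = 1;               /* register the decl immediately */
  else
    deferred_isa |= e->isa;     /* remember it for later */
}

int
main (void)
{
  for (unsigned i = 0; i < sizeof table / sizeof table[0]; i++)
    def_one (&table[i]);
  printf ("built: %d %d, deferred mask: %#x\n",
          table[0].built, table[1].built, deferred_isa);
  return 0;
}
#endif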
30681 /* Like def_builtin, but also marks the function decl "const". */
30683 static inline tree
30684 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30685 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30687 tree decl = def_builtin (mask, name, tcode, code);
30688 if (decl)
30689 TREE_READONLY (decl) = 1;
30690 else
30691 ix86_builtins_isa[(int) code].const_p = true;
30693 return decl;
30696 /* Like def_builtin, but also marks the function decl "pure". */
30698 static inline tree
30699 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
30700 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30702 tree decl = def_builtin (mask, name, tcode, code);
30703 if (decl)
30704 DECL_PURE_P (decl) = 1;
30705 else
30706 ix86_builtins_isa[(int) code].pure_p = true;
30708 return decl;
30711 /* Like def_builtin, but for additional isa2 flags. */
30713 static inline tree
30714 def_builtin2 (HOST_WIDE_INT mask, const char *name,
30715 enum ix86_builtin_func_type tcode,
30716 enum ix86_builtins code)
30718 tree decl = NULL_TREE;
30720 ix86_builtins_isa[(int) code].isa2 = mask;
30722 if (mask == 0
30723 || (mask & ix86_isa_flags2) != 0
30724 || (lang_hooks.builtin_function
30725 == lang_hooks.builtin_function_ext_scope))
30728 tree type = ix86_get_builtin_func_type (tcode);
30729 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30730 NULL, NULL_TREE);
30731 ix86_builtins[(int) code] = decl;
30732 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30734 else
30736 /* Just a MASK where set_and_not_built_p == true can potentially
30737 include a builtin. */
30738 deferred_isa_values2 |= mask;
30739 ix86_builtins[(int) code] = NULL_TREE;
30740 ix86_builtins_isa[(int) code].tcode = tcode;
30741 ix86_builtins_isa[(int) code].name = name;
30742 ix86_builtins_isa[(int) code].leaf_p = false;
30743 ix86_builtins_isa[(int) code].nothrow_p = false;
30744 ix86_builtins_isa[(int) code].const_p = false;
30745 ix86_builtins_isa[(int) code].pure_p = false;
30746 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30749 return decl;
30752 /* Like def_builtin2, but also marks the function decl "const". */
30754 static inline tree
30755 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
30756 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30758 tree decl = def_builtin2 (mask, name, tcode, code);
30759 if (decl)
30760 TREE_READONLY (decl) = 1;
30761 else
30762 ix86_builtins_isa[(int) code].const_p = true;
30764 return decl;
30767 /* Like def_builtin2, but also marks the function decl "pure". */
30769 static inline tree
30770 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
30771 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30773 tree decl = def_builtin2 (mask, name, tcode, code);
30774 if (decl)
30775 DECL_PURE_P (decl) = 1;
30776 else
30777 ix86_builtins_isa[(int) code].pure_p = true;
30779 return decl;
30782 /* Add any new builtin functions for a given ISA that may not have been
30783 declared. This saves a bit of space compared to adding all of the
30784 declarations to the tree, even if we didn't use them. */
30786 static void
30787 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
30789 isa &= ~OPTION_MASK_ISA_64BIT;
30791 if ((isa & deferred_isa_values) == 0
30792 && (isa2 & deferred_isa_values2) == 0)
30793 return;
30795 /* Bits in ISA value can be removed from potential isa values. */
30796 deferred_isa_values &= ~isa;
30797 deferred_isa_values2 &= ~isa2;
30799 int i;
30800 tree saved_current_target_pragma = current_target_pragma;
30801 current_target_pragma = NULL_TREE;
30803 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
30805 if (((ix86_builtins_isa[i].isa & isa) != 0
30806 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
30807 && ix86_builtins_isa[i].set_and_not_built_p)
30809 tree decl, type;
30811 /* Don't define the builtin again. */
30812 ix86_builtins_isa[i].set_and_not_built_p = false;
30814 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
30815 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
30816 type, i, BUILT_IN_MD, NULL,
30817 NULL_TREE);
30819 ix86_builtins[i] = decl;
30820 if (ix86_builtins_isa[i].const_p)
30821 TREE_READONLY (decl) = 1;
30822 if (ix86_builtins_isa[i].pure_p)
30823 DECL_PURE_P (decl) = 1;
30824 if (ix86_builtins_isa[i].leaf_p)
30825 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30826 NULL_TREE);
30827 if (ix86_builtins_isa[i].nothrow_p)
30828 TREE_NOTHROW (decl) = 1;
30832 current_target_pragma = saved_current_target_pragma;
30835 /* Bits for builtin_description.flag. */
30837 /* Set when we don't support the comparison natively, and should
30838 swap_comparison in order to support it. */
30839 #define BUILTIN_DESC_SWAP_OPERANDS 1
30841 struct builtin_description
30843 const HOST_WIDE_INT mask;
30844 const enum insn_code icode;
30845 const char *const name;
30846 const enum ix86_builtins code;
30847 const enum rtx_code comparison;
30848 const int flag;
30851 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30852 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30853 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30854 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30855 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30856 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30857 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30858 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30859 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30860 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30861 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30862 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30863 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30864 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30865 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30866 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30867 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30868 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30869 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30870 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30871 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30872 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30873 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30874 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30875 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30876 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30877 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30878 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30879 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30880 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30881 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30882 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30883 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30884 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30885 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30886 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30887 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
30888 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
30889 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
30890 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
30891 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
30892 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
30893 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
30894 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
30895 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
30896 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
30897 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
30898 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
30899 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
30900 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
30901 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
30902 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
30904 #define BDESC(mask, icode, name, code, comparison, flag) \
30905 { mask, icode, name, code, comparison, flag },
30906 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30907 static const struct builtin_description bdesc_##kind[] = \
30909 BDESC (mask, icode, name, code, comparison, flag)
30910 #define BDESC_END(kind, next_kind) \
30913 #include "i386-builtin.def"
30915 #undef BDESC
30916 #undef BDESC_FIRST
30917 #undef BDESC_END
30919 /* TM vector builtins. */
30921 /* Reuse the existing x86-specific `struct builtin_description' because
30922 we're lazy. Add casts to make them fit. */
30923 static const struct builtin_description bdesc_tm[] =
30925 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30926 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30927 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30928 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30929 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30930 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30931 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30933 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30934 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30935 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30936 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30937 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30938 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30939 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30941 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30942 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30943 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30944 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30945 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30946 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30947 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30949 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
30950 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
30951 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
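/* A rough usage sketch (not exact generated code): with -fgnu-tm, a
   transactional store of a __m128 value inside a __transaction_atomic
   block is lowered to a libitm call along the lines of

     void _ITM_WM128 (__m128 *, __m128);
     ...
     _ITM_WM128 (&dst, src);

   which is why the M128 write entries above reuse VOID_FTYPE_PV4SF_V4SF.  */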
30954 /* Initialize the transactional memory vector load/store builtins. */
30956 static void
30957 ix86_init_tm_builtins (void)
30959 enum ix86_builtin_func_type ftype;
30960 const struct builtin_description *d;
30961 size_t i;
30962 tree decl;
30963 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
30964 tree attrs_log, attrs_type_log;
30966 if (!flag_tm)
30967 return;
30969 /* If there are no builtins defined, we must be compiling in a
30970 language without trans-mem support. */
30971 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
30972 return;
30974 /* Use whatever attributes a normal TM load has. */
30975 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
30976 attrs_load = DECL_ATTRIBUTES (decl);
30977 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30978 /* Use whatever attributes a normal TM store has. */
30979 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
30980 attrs_store = DECL_ATTRIBUTES (decl);
30981 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30982 /* Use whatever attributes a normal TM log has. */
30983 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
30984 attrs_log = DECL_ATTRIBUTES (decl);
30985 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30987 for (i = 0, d = bdesc_tm;
30988 i < ARRAY_SIZE (bdesc_tm);
30989 i++, d++)
30991 if ((d->mask & ix86_isa_flags) != 0
30992 || (lang_hooks.builtin_function
30993 == lang_hooks.builtin_function_ext_scope))
30995 tree type, attrs, attrs_type;
30996 enum built_in_function code = (enum built_in_function) d->code;
30998 ftype = (enum ix86_builtin_func_type) d->flag;
30999 type = ix86_get_builtin_func_type (ftype);
31001 if (BUILTIN_TM_LOAD_P (code))
31003 attrs = attrs_load;
31004 attrs_type = attrs_type_load;
31006 else if (BUILTIN_TM_STORE_P (code))
31008 attrs = attrs_store;
31009 attrs_type = attrs_type_store;
31011 else
31013 attrs = attrs_log;
31014 attrs_type = attrs_type_log;
31016 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31017 /* The builtin name without the "__builtin_" prefix, for
31018 calling it directly. */
31019 d->name + strlen ("__builtin_"),
31020 attrs);
31021 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31022 set the TYPE_ATTRIBUTES. */
31023 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31025 set_builtin_decl (code, decl, false);
31030 /* Macros for verification of enum ix86_builtins order. */
31031 #define BDESC_VERIFY(x, y, z) \
31032 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31033 #define BDESC_VERIFYS(x, y, z) \
31034 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31036 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31037 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31038 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31039 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31040 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31041 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31042 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31043 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31044 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31045 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31046 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31047 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31048 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
31049 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31050 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31051 IX86_BUILTIN__BDESC_MPX_LAST, 1);
31052 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31053 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
31054 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
31055 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31056 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31057 IX86_BUILTIN__BDESC_CET_LAST, 1);
31058 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31059 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
31061 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
31062 in the current target ISA, so that the user can compile particular modules
31063 with target-specific options that differ from the command-line
31064 options. */
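/* User-level sketch (the function name is hypothetical): a module compiled
   without -mavx2 on the command line can still use an AVX2 intrinsic inside
   a function that enables the ISA locally,

     #include <immintrin.h>

     __attribute__ ((target ("avx2")))
     __m256i
     add256 (__m256i a, __m256i b)
     {
       return _mm256_add_epi32 (a, b);
     }

   which only works because the builtin behind _mm256_add_epi32 was defined
   up front rather than only when -mavx2 is given.  */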
31065 static void
31066 ix86_init_mmx_sse_builtins (void)
31068 const struct builtin_description * d;
31069 enum ix86_builtin_func_type ftype;
31070 size_t i;
31072 /* Add all special builtins with variable number of operands. */
31073 for (i = 0, d = bdesc_special_args;
31074 i < ARRAY_SIZE (bdesc_special_args);
31075 i++, d++)
31077 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31078 if (d->name == 0)
31079 continue;
31081 ftype = (enum ix86_builtin_func_type) d->flag;
31082 def_builtin (d->mask, d->name, ftype, d->code);
31084 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31085 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31086 ARRAY_SIZE (bdesc_special_args) - 1);
31088 /* Add all builtins with variable number of operands. */
31089 for (i = 0, d = bdesc_args;
31090 i < ARRAY_SIZE (bdesc_args);
31091 i++, d++)
31093 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31094 if (d->name == 0)
31095 continue;
31097 ftype = (enum ix86_builtin_func_type) d->flag;
31098 def_builtin_const (d->mask, d->name, ftype, d->code);
31100 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31101 IX86_BUILTIN__BDESC_ARGS_FIRST,
31102 ARRAY_SIZE (bdesc_args) - 1);
31104 /* Add all builtins with variable number of operands. */
31105 for (i = 0, d = bdesc_args2;
31106 i < ARRAY_SIZE (bdesc_args2);
31107 i++, d++)
31109 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
31110 if (d->name == 0)
31111 continue;
31113 ftype = (enum ix86_builtin_func_type) d->flag;
31114 def_builtin_const2 (d->mask, d->name, ftype, d->code);
31116 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
31117 IX86_BUILTIN__BDESC_ARGS2_FIRST,
31118 ARRAY_SIZE (bdesc_args2) - 1);
31120 /* Add all builtins with rounding. */
31121 for (i = 0, d = bdesc_round_args;
31122 i < ARRAY_SIZE (bdesc_round_args);
31123 i++, d++)
31125 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31126 if (d->name == 0)
31127 continue;
31129 ftype = (enum ix86_builtin_func_type) d->flag;
31130 def_builtin_const (d->mask, d->name, ftype, d->code);
31132 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31133 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31134 ARRAY_SIZE (bdesc_round_args) - 1);
31136 /* pcmpestr[im] insns. */
31137 for (i = 0, d = bdesc_pcmpestr;
31138 i < ARRAY_SIZE (bdesc_pcmpestr);
31139 i++, d++)
31141 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31142 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31143 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31144 else
31145 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31146 def_builtin_const (d->mask, d->name, ftype, d->code);
31148 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31149 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31150 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31152 /* pcmpistr[im] insns. */
31153 for (i = 0, d = bdesc_pcmpistr;
31154 i < ARRAY_SIZE (bdesc_pcmpistr);
31155 i++, d++)
31157 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31158 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31159 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31160 else
31161 ftype = INT_FTYPE_V16QI_V16QI_INT;
31162 def_builtin_const (d->mask, d->name, ftype, d->code);
31164 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31165 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31166 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31168 /* comi/ucomi insns. */
31169 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31171 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31172 if (d->mask == OPTION_MASK_ISA_SSE2)
31173 ftype = INT_FTYPE_V2DF_V2DF;
31174 else
31175 ftype = INT_FTYPE_V4SF_V4SF;
31176 def_builtin_const (d->mask, d->name, ftype, d->code);
31178 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31179 IX86_BUILTIN__BDESC_COMI_FIRST,
31180 ARRAY_SIZE (bdesc_comi) - 1);
31182 /* SSE */
31183 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31184 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31185 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31186 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31188 /* SSE or 3DNow!A */
31189 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31190 /* As it uses V4HImode, we have to require -mmmx too. */
31191 | OPTION_MASK_ISA_MMX,
31192 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31193 IX86_BUILTIN_MASKMOVQ);
31195 /* SSE2 */
31196 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31197 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31199 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31200 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31201 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31202 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31204 /* SSE3. */
31205 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31206 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31207 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31208 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31210 /* AES */
31211 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
31212 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31213 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
31214 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31215 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
31216 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31217 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
31218 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31219 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
31220 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31221 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
31222 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31224 /* PCLMUL */
31225 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
31226 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31228 /* RDRND */
31229 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31230 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31231 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31232 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31233 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31234 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31235 IX86_BUILTIN_RDRAND64_STEP);
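/* Usage sketch (hedged; _rdrand32_step is the immintrin.h wrapper around
   __builtin_ia32_rdrand32_step): the *_step builtins return nonzero on
   success and store the random value through the pointer, so with -mrdrnd
   a typical caller retries until the hardware RNG delivers a value:

     #include <immintrin.h>

     unsigned int
     get_random (void)
     {
       unsigned int r;
       while (!_rdrand32_step (&r))
         ;
       return r;
     }
*/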
31237 /* AVX2 */
31238 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31239 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31240 IX86_BUILTIN_GATHERSIV2DF);
31242 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31243 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31244 IX86_BUILTIN_GATHERSIV4DF);
31246 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31247 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31248 IX86_BUILTIN_GATHERDIV2DF);
31250 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31251 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31252 IX86_BUILTIN_GATHERDIV4DF);
31254 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31255 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31256 IX86_BUILTIN_GATHERSIV4SF);
31258 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31259 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31260 IX86_BUILTIN_GATHERSIV8SF);
31262 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31263 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31264 IX86_BUILTIN_GATHERDIV4SF);
31266 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31267 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31268 IX86_BUILTIN_GATHERDIV8SF);
31270 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31271 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31272 IX86_BUILTIN_GATHERSIV2DI);
31274 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31275 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31276 IX86_BUILTIN_GATHERSIV4DI);
31278 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31279 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31280 IX86_BUILTIN_GATHERDIV2DI);
31282 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31283 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31284 IX86_BUILTIN_GATHERDIV4DI);
31286 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31287 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31288 IX86_BUILTIN_GATHERSIV4SI);
31290 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31291 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31292 IX86_BUILTIN_GATHERSIV8SI);
31294 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31295 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31296 IX86_BUILTIN_GATHERDIV4SI);
31298 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31299 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31300 IX86_BUILTIN_GATHERDIV8SI);
31302 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31303 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31304 IX86_BUILTIN_GATHERALTSIV4DF);
31306 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31307 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31308 IX86_BUILTIN_GATHERALTDIV8SF);
31310 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31311 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31312 IX86_BUILTIN_GATHERALTSIV4DI);
31314 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31315 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31316 IX86_BUILTIN_GATHERALTDIV8SI);
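/* Usage sketch (assuming -mavx2; the wrapper name comes from avx2intrin.h):
   each gather builtin takes a source vector, a base pointer, an index
   vector, a mask and a scale.  For instance _mm_i32gather_ps, which wraps
   __builtin_ia32_gathersiv4sf, loads base[idx[0..3]] with the indices
   scaled by 4 bytes:

     #include <immintrin.h>

     __m128
     gather4 (const float *base, __m128i idx)
     {
       return _mm_i32gather_ps (base, idx, 4);
     }
*/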
31318 /* AVX512F */
31319 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31320 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
31321 IX86_BUILTIN_GATHER3SIV16SF);
31323 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31324 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
31325 IX86_BUILTIN_GATHER3SIV8DF);
31327 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31328 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
31329 IX86_BUILTIN_GATHER3DIV16SF);
31331 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31332 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
31333 IX86_BUILTIN_GATHER3DIV8DF);
31335 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31336 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
31337 IX86_BUILTIN_GATHER3SIV16SI);
31339 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31340 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
31341 IX86_BUILTIN_GATHER3SIV8DI);
31343 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31344 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
31345 IX86_BUILTIN_GATHER3DIV16SI);
31347 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31348 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
31349 IX86_BUILTIN_GATHER3DIV8DI);
31351 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31352 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31353 IX86_BUILTIN_GATHER3ALTSIV8DF);
31355 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31356 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31357 IX86_BUILTIN_GATHER3ALTDIV16SF);
31359 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31360 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31361 IX86_BUILTIN_GATHER3ALTSIV8DI);
31363 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31364 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31365 IX86_BUILTIN_GATHER3ALTDIV16SI);
31367 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31368 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
31369 IX86_BUILTIN_SCATTERSIV16SF);
31371 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31372 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
31373 IX86_BUILTIN_SCATTERSIV8DF);
31375 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31376 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
31377 IX86_BUILTIN_SCATTERDIV16SF);
31379 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31380 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
31381 IX86_BUILTIN_SCATTERDIV8DF);
31383 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31384 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
31385 IX86_BUILTIN_SCATTERSIV16SI);
31387 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31388 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
31389 IX86_BUILTIN_SCATTERSIV8DI);
31391 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31392 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
31393 IX86_BUILTIN_SCATTERDIV16SI);
31395 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31396 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
31397 IX86_BUILTIN_SCATTERDIV8DI);
31399 /* AVX512VL */
31400 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31401 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
31402 IX86_BUILTIN_GATHER3SIV2DF);
31404 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31405 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
31406 IX86_BUILTIN_GATHER3SIV4DF);
31408 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31409 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
31410 IX86_BUILTIN_GATHER3DIV2DF);
31412 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31413 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
31414 IX86_BUILTIN_GATHER3DIV4DF);
31416 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31417 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
31418 IX86_BUILTIN_GATHER3SIV4SF);
31420 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31421 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
31422 IX86_BUILTIN_GATHER3SIV8SF);
31424 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31425 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
31426 IX86_BUILTIN_GATHER3DIV4SF);
31428 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31429 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
31430 IX86_BUILTIN_GATHER3DIV8SF);
31432 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31433 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
31434 IX86_BUILTIN_GATHER3SIV2DI);
31436 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31437 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
31438 IX86_BUILTIN_GATHER3SIV4DI);
31440 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31441 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
31442 IX86_BUILTIN_GATHER3DIV2DI);
31444 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31445 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
31446 IX86_BUILTIN_GATHER3DIV4DI);
31448 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31449 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
31450 IX86_BUILTIN_GATHER3SIV4SI);
31452 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31453 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
31454 IX86_BUILTIN_GATHER3SIV8SI);
31456 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31457 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
31458 IX86_BUILTIN_GATHER3DIV4SI);
31460 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31461 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
31462 IX86_BUILTIN_GATHER3DIV8SI);
31464 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31465 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31466 IX86_BUILTIN_GATHER3ALTSIV4DF);
31468 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31469 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31470 IX86_BUILTIN_GATHER3ALTDIV8SF);
31472 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31473 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31474 IX86_BUILTIN_GATHER3ALTSIV4DI);
31476 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31477 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31478 IX86_BUILTIN_GATHER3ALTDIV8SI);
31480 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31481 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
31482 IX86_BUILTIN_SCATTERSIV8SF);
31484 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31485 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
31486 IX86_BUILTIN_SCATTERSIV4SF);
31488 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31489 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
31490 IX86_BUILTIN_SCATTERSIV4DF);
31492 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31493 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
31494 IX86_BUILTIN_SCATTERSIV2DF);
31496 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31497 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
31498 IX86_BUILTIN_SCATTERDIV8SF);
31500 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31501 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
31502 IX86_BUILTIN_SCATTERDIV4SF);
31504 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31505 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
31506 IX86_BUILTIN_SCATTERDIV4DF);
31508 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31509 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
31510 IX86_BUILTIN_SCATTERDIV2DF);
31512 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31513 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
31514 IX86_BUILTIN_SCATTERSIV8SI);
31516 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31517 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
31518 IX86_BUILTIN_SCATTERSIV4SI);
31520 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31521 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
31522 IX86_BUILTIN_SCATTERSIV4DI);
31524 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31525 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
31526 IX86_BUILTIN_SCATTERSIV2DI);
31528 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31529 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
31530 IX86_BUILTIN_SCATTERDIV8SI);
31532 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31533 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
31534 IX86_BUILTIN_SCATTERDIV4SI);
31536 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31537 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
31538 IX86_BUILTIN_SCATTERDIV4DI);
31540 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31541 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
31542 IX86_BUILTIN_SCATTERDIV2DI);
31543 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31544 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31545 IX86_BUILTIN_SCATTERALTSIV8DF);
31547 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31548 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31549 IX86_BUILTIN_SCATTERALTDIV16SF);
31551 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31552 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31553 IX86_BUILTIN_SCATTERALTSIV8DI);
31555 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31556 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31557 IX86_BUILTIN_SCATTERALTDIV16SI);
31559 /* AVX512PF */
31560 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31561 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31562 IX86_BUILTIN_GATHERPFDPD);
31563 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31564 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31565 IX86_BUILTIN_GATHERPFDPS);
31566 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31567 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31568 IX86_BUILTIN_GATHERPFQPD);
31569 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31570 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31571 IX86_BUILTIN_GATHERPFQPS);
31572 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31573 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31574 IX86_BUILTIN_SCATTERPFDPD);
31575 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31576 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31577 IX86_BUILTIN_SCATTERPFDPS);
31578 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31579 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31580 IX86_BUILTIN_SCATTERPFQPD);
31581 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31582 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31583 IX86_BUILTIN_SCATTERPFQPS);
31585 /* SHA */
31586 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31587 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31588 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31589 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31590 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31591 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31592 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31593 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31594 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31595 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31596 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31597 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31598 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31599 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31601 /* RTM. */
31602 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31603 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31605 /* MMX access to the vec_init patterns. */
31606 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31607 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31609 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31610 V4HI_FTYPE_HI_HI_HI_HI,
31611 IX86_BUILTIN_VEC_INIT_V4HI);
31613 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31614 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31615 IX86_BUILTIN_VEC_INIT_V8QI);
31617 /* Access to the vec_extract patterns. */
31618 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31619 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31620 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31621 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31622 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31623 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31624 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31625 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31626 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31627 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31629 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31630 /* As it uses V4HImode, we have to require -mmmx too. */
31631 | OPTION_MASK_ISA_MMX,
31632 "__builtin_ia32_vec_ext_v4hi",
31633 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31635 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31636 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31638 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31639 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31641 /* Access to the vec_set patterns. */
31642 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31643 "__builtin_ia32_vec_set_v2di",
31644 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31646 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31647 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31649 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31650 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31652 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31653 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31655 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31656 /* As it uses V4HImode, we have to require -mmmx too. */
31657 | OPTION_MASK_ISA_MMX,
31658 "__builtin_ia32_vec_set_v4hi",
31659 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31661 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31662 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31664 /* RDSEED */
31665 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31666 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31667 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31668 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31669 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31670 "__builtin_ia32_rdseed_di_step",
31671 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31673 /* ADCX */
31674 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31675 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31676 def_builtin (OPTION_MASK_ISA_64BIT,
31677 "__builtin_ia32_addcarryx_u64",
31678 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31679 IX86_BUILTIN_ADDCARRYX64);
31681 /* SBB */
31682 def_builtin (0, "__builtin_ia32_sbb_u32",
31683 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31684 def_builtin (OPTION_MASK_ISA_64BIT,
31685 "__builtin_ia32_sbb_u64",
31686 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31687 IX86_BUILTIN_SBB64);
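/* Usage sketch (the helper below is hypothetical): the builtin takes a
   carry-in, two operands and a pointer for the sum, and returns the
   carry-out, so multi-word additions chain naturally:

     void
     add64 (const unsigned int a[2], const unsigned int b[2],
            unsigned int r[2])
     {
       unsigned char c;
       c = __builtin_ia32_addcarryx_u32 (0, a[0], b[0], &r[0]);
       (void) __builtin_ia32_addcarryx_u32 (c, a[1], b[1], &r[1]);
     }
*/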
31689 /* Read/write FLAGS. */
31690 def_builtin (0, "__builtin_ia32_readeflags_u32",
31691 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31692 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31693 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31694 def_builtin (0, "__builtin_ia32_writeeflags_u32",
31695 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31696 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31697 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31699 /* CLFLUSHOPT. */
31700 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31701 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31703 /* CLWB. */
31704 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31705 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31707 /* MONITORX and MWAITX. */
31708 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31709 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31710 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31711 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31713 /* CLZERO. */
31714 def_builtin2 (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31715 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31717 /* Add FMA4 multi-argument instructions. */
31718 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31720 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31721 if (d->name == 0)
31722 continue;
31724 ftype = (enum ix86_builtin_func_type) d->flag;
31725 def_builtin_const (d->mask, d->name, ftype, d->code);
31727 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
31728 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31729 ARRAY_SIZE (bdesc_multi_arg) - 1);
31731 /* Add CET intrinsics. */
31732 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
31734 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
31735 if (d->name == 0)
31736 continue;
31738 ftype = (enum ix86_builtin_func_type) d->flag;
31739 def_builtin (d->mask, d->name, ftype, d->code);
31741 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
31742 IX86_BUILTIN__BDESC_CET_FIRST,
31743 ARRAY_SIZE (bdesc_cet) - 1);
31745 for (i = 0, d = bdesc_cet_rdssp;
31746 i < ARRAY_SIZE (bdesc_cet_rdssp);
31747 i++, d++)
31749 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
31750 if (d->name == 0)
31751 continue;
31753 ftype = (enum ix86_builtin_func_type) d->flag;
31754 def_builtin (d->mask, d->name, ftype, d->code);
31756 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
31757 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31758 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
31761 static void
31762 ix86_init_mpx_builtins ()
31764 const struct builtin_description * d;
31765 enum ix86_builtin_func_type ftype;
31766 tree decl;
31767 size_t i;
31769 for (i = 0, d = bdesc_mpx;
31770 i < ARRAY_SIZE (bdesc_mpx);
31771 i++, d++)
31773 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
31774 if (d->name == 0)
31775 continue;
31777 ftype = (enum ix86_builtin_func_type) d->flag;
31778 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
31780 /* Without the leaf and nothrow flags, MPX builtin calls
31781 may be followed by abnormal edges when setjmp is
31782 present in the function. Since there may be many
31783 MPX builtin calls, this creates lots of useless
31784 edges and enormous PHI nodes. To avoid this we mark
31785 MPX builtins as leaf and nothrow. */
31786 if (decl)
31788 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31789 NULL_TREE);
31790 TREE_NOTHROW (decl) = 1;
31792 else
31794 ix86_builtins_isa[(int)d->code].leaf_p = true;
31795 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31798 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
31799 IX86_BUILTIN__BDESC_MPX_FIRST,
31800 ARRAY_SIZE (bdesc_mpx) - 1);
31802 for (i = 0, d = bdesc_mpx_const;
31803 i < ARRAY_SIZE (bdesc_mpx_const);
31804 i++, d++)
31806 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
31807 if (d->name == 0)
31808 continue;
31810 ftype = (enum ix86_builtin_func_type) d->flag;
31811 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
31813 if (decl)
31815 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
31816 NULL_TREE);
31817 TREE_NOTHROW (decl) = 1;
31819 else
31821 ix86_builtins_isa[(int)d->code].leaf_p = true;
31822 ix86_builtins_isa[(int)d->code].nothrow_p = true;
31825 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
31826 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
31827 ARRAY_SIZE (bdesc_mpx_const) - 1);
31829 #undef BDESC_VERIFY
31830 #undef BDESC_VERIFYS
31832 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31833 to return a pointer to VERSION_DECL if the outcome of the expression
31834 formed by PREDICATE_CHAIN is true. This function will be called during
31835 version dispatch to decide which function version to execute. It returns
31836 the basic block at the end, to which more conditions can be added. */
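/* Roughly, for one (PREDICATE_CHAIN, VERSION_DECL) pair the code added to
   NEW_BB behaves like the following sketch (feature name and symbol are
   illustrative only):

     if (__builtin_cpu_supports ("avx2"))
       return (void *) &foo_avx2;

   and otherwise falls through to the next condition or to the default;
   chains with several features are combined via MIN_EXPR before the
   compare.  */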
31838 static basic_block
31839 add_condition_to_bb (tree function_decl, tree version_decl,
31840 tree predicate_chain, basic_block new_bb)
31842 gimple *return_stmt;
31843 tree convert_expr, result_var;
31844 gimple *convert_stmt;
31845 gimple *call_cond_stmt;
31846 gimple *if_else_stmt;
31848 basic_block bb1, bb2, bb3;
31849 edge e12, e23;
31851 tree cond_var, and_expr_var = NULL_TREE;
31852 gimple_seq gseq;
31854 tree predicate_decl, predicate_arg;
31856 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31858 gcc_assert (new_bb != NULL);
31859 gseq = bb_seq (new_bb);
31862 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31863 build_fold_addr_expr (version_decl));
31864 result_var = create_tmp_var (ptr_type_node);
31865 convert_stmt = gimple_build_assign (result_var, convert_expr);
31866 return_stmt = gimple_build_return (result_var);
31868 if (predicate_chain == NULL_TREE)
31870 gimple_seq_add_stmt (&gseq, convert_stmt);
31871 gimple_seq_add_stmt (&gseq, return_stmt);
31872 set_bb_seq (new_bb, gseq);
31873 gimple_set_bb (convert_stmt, new_bb);
31874 gimple_set_bb (return_stmt, new_bb);
31875 pop_cfun ();
31876 return new_bb;
31879 while (predicate_chain != NULL)
31881 cond_var = create_tmp_var (integer_type_node);
31882 predicate_decl = TREE_PURPOSE (predicate_chain);
31883 predicate_arg = TREE_VALUE (predicate_chain);
31884 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31885 gimple_call_set_lhs (call_cond_stmt, cond_var);
31887 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31888 gimple_set_bb (call_cond_stmt, new_bb);
31889 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31891 predicate_chain = TREE_CHAIN (predicate_chain);
31893 if (and_expr_var == NULL)
31894 and_expr_var = cond_var;
31895 else
31897 gimple *assign_stmt;
31898 /* Use MIN_EXPR to check whether any of the integers is zero:
31899 and_expr_var = min_expr <cond_var, and_expr_var>. */
31900 assign_stmt = gimple_build_assign (and_expr_var,
31901 build2 (MIN_EXPR, integer_type_node,
31902 cond_var, and_expr_var));
31904 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31905 gimple_set_bb (assign_stmt, new_bb);
31906 gimple_seq_add_stmt (&gseq, assign_stmt);
31910 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31911 integer_zero_node,
31912 NULL_TREE, NULL_TREE);
31913 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31914 gimple_set_bb (if_else_stmt, new_bb);
31915 gimple_seq_add_stmt (&gseq, if_else_stmt);
31917 gimple_seq_add_stmt (&gseq, convert_stmt);
31918 gimple_seq_add_stmt (&gseq, return_stmt);
31919 set_bb_seq (new_bb, gseq);
31921 bb1 = new_bb;
31922 e12 = split_block (bb1, if_else_stmt);
31923 bb2 = e12->dest;
31924 e12->flags &= ~EDGE_FALLTHRU;
31925 e12->flags |= EDGE_TRUE_VALUE;
31927 e23 = split_block (bb2, return_stmt);
31929 gimple_set_bb (convert_stmt, bb2);
31930 gimple_set_bb (return_stmt, bb2);
31932 bb3 = e23->dest;
31933 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31935 remove_edge (e23);
31936 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
31938 pop_cfun ();
31940 return bb3;
31943 /* This parses the "target" attribute arguments in DECL and determines
31944 the right predicate builtins to use to match the platform specification.
31945 It returns the priority value for this version decl. If PREDICATE_LIST
31946 is not NULL, it stores the list of cpu features that need to be checked
31947 before dispatching this function. */
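/* For example (illustrative declarations, not taken from this file), the
   attribute strings parsed here come from function versions such as

     __attribute__ ((target ("default"))) int foo (void);
     __attribute__ ((target ("arch=haswell"))) int foo (void);
     __attribute__ ((target ("sse4.2,popcnt"))) int foo (void);

   where "arch=..." selects a processor priority and the remaining
   comma-separated names are matched against feature_list below.  */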
31949 static unsigned int
31950 get_builtin_code_for_version (tree decl, tree *predicate_list)
31952 tree attrs;
31953 struct cl_target_option cur_target;
31954 tree target_node;
31955 struct cl_target_option *new_target;
31956 const char *arg_str = NULL;
31957 const char *attrs_str = NULL;
31958 char *tok_str = NULL;
31959 char *token;
31961 /* Priority of i386 features, greater value is higher priority. This is
31962 used to decide the order in which function dispatch must happen. For
31963 instance, a version specialized for SSE4.2 should be checked for dispatch
31964 before a version for SSE3, as SSE4.2 implies SSE3. */
31965 enum feature_priority
31967 P_ZERO = 0,
31968 P_MMX,
31969 P_SSE,
31970 P_SSE2,
31971 P_SSE3,
31972 P_SSSE3,
31973 P_PROC_SSSE3,
31974 P_SSE4_A,
31975 P_PROC_SSE4_A,
31976 P_SSE4_1,
31977 P_SSE4_2,
31978 P_PROC_SSE4_2,
31979 P_POPCNT,
31980 P_AES,
31981 P_PCLMUL,
31982 P_AVX,
31983 P_PROC_AVX,
31984 P_BMI,
31985 P_PROC_BMI,
31986 P_FMA4,
31987 P_XOP,
31988 P_PROC_XOP,
31989 P_FMA,
31990 P_PROC_FMA,
31991 P_BMI2,
31992 P_AVX2,
31993 P_PROC_AVX2,
31994 P_AVX512F,
31995 P_PROC_AVX512F
31998 enum feature_priority priority = P_ZERO;
32000 /* These are the target attribute strings for which a dispatcher is
32001 available, from fold_builtin_cpu. */
32003 static struct _feature_list
32005 const char *const name;
32006 const enum feature_priority priority;
32008 const feature_list[] =
32010 {"mmx", P_MMX},
32011 {"sse", P_SSE},
32012 {"sse2", P_SSE2},
32013 {"sse3", P_SSE3},
32014 {"sse4a", P_SSE4_A},
32015 {"ssse3", P_SSSE3},
32016 {"sse4.1", P_SSE4_1},
32017 {"sse4.2", P_SSE4_2},
32018 {"popcnt", P_POPCNT},
32019 {"aes", P_AES},
32020 {"pclmul", P_PCLMUL},
32021 {"avx", P_AVX},
32022 {"bmi", P_BMI},
32023 {"fma4", P_FMA4},
32024 {"xop", P_XOP},
32025 {"fma", P_FMA},
32026 {"bmi2", P_BMI2},
32027 {"avx2", P_AVX2},
32028 {"avx512f", P_AVX512F}
32032 static unsigned int NUM_FEATURES
32033 = sizeof (feature_list) / sizeof (struct _feature_list);
32035 unsigned int i;
32037 tree predicate_chain = NULL_TREE;
32038 tree predicate_decl, predicate_arg;
32040 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32041 gcc_assert (attrs != NULL);
32043 attrs = TREE_VALUE (TREE_VALUE (attrs));
32045 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32046 attrs_str = TREE_STRING_POINTER (attrs);
32048 /* Return priority zero for default function. */
32049 if (strcmp (attrs_str, "default") == 0)
32050 return 0;
32052 /* Handle arch= if specified. For priority, set it to be 1 more than
32053 the best instruction set the processor can handle. For instance, if
32054 there is a version for atom and a version for ssse3 (the highest ISA
32055 priority for atom), the atom version must be checked for dispatch
32056 before the ssse3 version. */
32057 if (strstr (attrs_str, "arch=") != NULL)
32059 cl_target_option_save (&cur_target, &global_options);
32060 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32061 &global_options_set);
32063 gcc_assert (target_node);
32064 new_target = TREE_TARGET_OPTION (target_node);
32065 gcc_assert (new_target);
32067 if (new_target->arch_specified && new_target->arch > 0)
32069 switch (new_target->arch)
32071 case PROCESSOR_CORE2:
32072 arg_str = "core2";
32073 priority = P_PROC_SSSE3;
32074 break;
32075 case PROCESSOR_NEHALEM:
32076 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32078 arg_str = "westmere";
32079 priority = P_AES;
32081 else
32083 /* We translate "arch=corei7" and "arch=nehalem" to
32084 "corei7" so that they are mapped to the M_INTEL_COREI7
32085 cpu type, covering all M_INTEL_COREI7_XXX subtypes. */
32086 arg_str = "corei7";
32087 priority = P_PROC_SSE4_2;
32089 break;
32090 case PROCESSOR_SANDYBRIDGE:
32091 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32092 arg_str = "ivybridge";
32093 else
32094 arg_str = "sandybridge";
32095 priority = P_PROC_AVX;
32096 break;
32097 case PROCESSOR_HASWELL:
32098 case PROCESSOR_SKYLAKE_AVX512:
32099 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VBMI)
32100 arg_str = "cannonlake";
32101 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
32102 arg_str = "skylake-avx512";
32103 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
32104 arg_str = "skylake";
32105 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32106 arg_str = "broadwell";
32107 else
32108 arg_str = "haswell";
32109 priority = P_PROC_AVX2;
32110 break;
32111 case PROCESSOR_BONNELL:
32112 arg_str = "bonnell";
32113 priority = P_PROC_SSSE3;
32114 break;
32115 case PROCESSOR_KNL:
32116 arg_str = "knl";
32117 priority = P_PROC_AVX512F;
32118 break;
32119 case PROCESSOR_KNM:
32120 arg_str = "knm";
32121 priority = P_PROC_AVX512F;
32122 break;
32123 case PROCESSOR_SILVERMONT:
32124 arg_str = "silvermont";
32125 priority = P_PROC_SSE4_2;
32126 break;
32127 case PROCESSOR_AMDFAM10:
32128 arg_str = "amdfam10h";
32129 priority = P_PROC_SSE4_A;
32130 break;
32131 case PROCESSOR_BTVER1:
32132 arg_str = "btver1";
32133 priority = P_PROC_SSE4_A;
32134 break;
32135 case PROCESSOR_BTVER2:
32136 arg_str = "btver2";
32137 priority = P_PROC_BMI;
32138 break;
32139 case PROCESSOR_BDVER1:
32140 arg_str = "bdver1";
32141 priority = P_PROC_XOP;
32142 break;
32143 case PROCESSOR_BDVER2:
32144 arg_str = "bdver2";
32145 priority = P_PROC_FMA;
32146 break;
32147 case PROCESSOR_BDVER3:
32148 arg_str = "bdver3";
32149 priority = P_PROC_FMA;
32150 break;
32151 case PROCESSOR_BDVER4:
32152 arg_str = "bdver4";
32153 priority = P_PROC_AVX2;
32154 break;
32155 case PROCESSOR_ZNVER1:
32156 arg_str = "znver1";
32157 priority = P_PROC_AVX2;
32158 break;
32162 cl_target_option_restore (&global_options, &cur_target);
32164 if (predicate_list && arg_str == NULL)
32166 error_at (DECL_SOURCE_LOCATION (decl),
32167 "No dispatcher found for the versioning attributes");
32168 return 0;
32171 if (predicate_list)
32173 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32174 /* For a C string literal the length includes the trailing NULL. */
32175 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32176 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32177 predicate_chain);
32181 /* Process feature name. */
32182 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32183 strcpy (tok_str, attrs_str);
32184 token = strtok (tok_str, ",");
32185 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32187 while (token != NULL)
32189 /* Do not process "arch=" */
32190 if (strncmp (token, "arch=", 5) == 0)
32192 token = strtok (NULL, ",");
32193 continue;
32195 for (i = 0; i < NUM_FEATURES; ++i)
32197 if (strcmp (token, feature_list[i].name) == 0)
32199 if (predicate_list)
32201 predicate_arg = build_string_literal (
32202 strlen (feature_list[i].name) + 1,
32203 feature_list[i].name);
32204 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32205 predicate_chain);
32207 /* Find the maximum priority feature. */
32208 if (feature_list[i].priority > priority)
32209 priority = feature_list[i].priority;
32211 break;
32214 if (predicate_list && i == NUM_FEATURES)
32216 error_at (DECL_SOURCE_LOCATION (decl),
32217 "No dispatcher found for %s", token);
32218 return 0;
32220 token = strtok (NULL, ",");
32222 free (tok_str);
32224 if (predicate_list && predicate_chain == NULL_TREE)
32226 error_at (DECL_SOURCE_LOCATION (decl),
32227 "No dispatcher found for the versioning attributes : %s",
32228 attrs_str);
32229 return 0;
32231 else if (predicate_list)
32233 predicate_chain = nreverse (predicate_chain);
32234 *predicate_list = predicate_chain;
32237 return priority;
32240 /* This compares the priority of target features in function DECL1
32241 and DECL2. It returns positive value if DECL1 is higher priority,
32242 negative value if DECL2 is higher priority and 0 if they are the
32243 same. */
32245 static int
32246 ix86_compare_version_priority (tree decl1, tree decl2)
32248 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32249 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32251 return (int)priority1 - (int)priority2;
32254 /* V1 and V2 point to function versions with different priorities
32255 based on the target ISA. This function compares their priorities. */
32257 static int
32258 feature_compare (const void *v1, const void *v2)
32260 typedef struct _function_version_info
32262 tree version_decl;
32263 tree predicate_chain;
32264 unsigned int dispatch_priority;
32265 } function_version_info;
32267 const function_version_info c1 = *(const function_version_info *)v1;
32268 const function_version_info c2 = *(const function_version_info *)v2;
32269 return (c2.dispatch_priority - c1.dispatch_priority);
32272 /* This function generates the dispatch function for
32273 multi-versioned functions. DISPATCH_DECL is the function which will
32274 contain the dispatch logic. FNDECLS holds the function choices for
32275 dispatch and is passed as a vector of decls. EMPTY_BB is the basic block pointer
32276 in DISPATCH_DECL in which the dispatch code is generated. */
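/* The generated dispatch logic behaves roughly like this sketch (symbol
   names are illustrative; the real statements are built as GIMPLE in
   *EMPTY_BB):

     void *
     foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("haswell"))
         return (void *) &foo_haswell;
       if (__builtin_cpu_supports ("sse4.2"))
         return (void *) &foo_sse42;
       return (void *) &foo_default;
     }
*/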
32278 static int
32279 dispatch_function_versions (tree dispatch_decl,
32280 void *fndecls_p,
32281 basic_block *empty_bb)
32283 tree default_decl;
32284 gimple *ifunc_cpu_init_stmt;
32285 gimple_seq gseq;
32286 int ix;
32287 tree ele;
32288 vec<tree> *fndecls;
32289 unsigned int num_versions = 0;
32290 unsigned int actual_versions = 0;
32291 unsigned int i;
32293 struct _function_version_info
32295 tree version_decl;
32296 tree predicate_chain;
32297 unsigned int dispatch_priority;
32298 }*function_version_info;
32300 gcc_assert (dispatch_decl != NULL
32301 && fndecls_p != NULL
32302 && empty_bb != NULL);
32304 /* fndecls_p is actually a vector. */
32305 fndecls = static_cast<vec<tree> *> (fndecls_p);
32307 /* At least one more version other than the default. */
32308 num_versions = fndecls->length ();
32309 gcc_assert (num_versions >= 2);
32311 function_version_info = (struct _function_version_info *)
32312 XNEWVEC (struct _function_version_info, (num_versions - 1));
32314 /* The first version in the vector is the default decl. */
32315 default_decl = (*fndecls)[0];
32317 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32319 gseq = bb_seq (*empty_bb);
32320 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32321 constructors, so explicitly call __builtin_cpu_init here. */
32322 ifunc_cpu_init_stmt = gimple_build_call_vec (
32323 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32324 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32325 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32326 set_bb_seq (*empty_bb, gseq);
32328 pop_cfun ();
32331 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32333 tree version_decl = ele;
32334 tree predicate_chain = NULL_TREE;
32335 unsigned int priority;
32336 /* Get attribute string, parse it and find the right predicate decl.
32337 The predicate function could be a lengthy combination of many
32338 features, like arch-type and various isa-variants. */
32339 priority = get_builtin_code_for_version (version_decl,
32340 &predicate_chain);
32342 if (predicate_chain == NULL_TREE)
32343 continue;
32345 function_version_info [actual_versions].version_decl = version_decl;
32346 function_version_info [actual_versions].predicate_chain
32347 = predicate_chain;
32348 function_version_info [actual_versions].dispatch_priority = priority;
32349 actual_versions++;
32352 /* Sort the versions according to descending order of dispatch priority. The
32353 priority is based on the ISA. This is not a perfect solution. There
32354 could still be ambiguity. If more than one function version is suitable
32355 to execute, which one should be dispatched? In future, allow the user
32356 to specify a dispatch priority next to the version. */
32357 qsort (function_version_info, actual_versions,
32358 sizeof (struct _function_version_info), feature_compare);
32360 for (i = 0; i < actual_versions; ++i)
32361 *empty_bb = add_condition_to_bb (dispatch_decl,
32362 function_version_info[i].version_decl,
32363 function_version_info[i].predicate_chain,
32364 *empty_bb);
32366 /* Dispatch the default version at the end. */
32367 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32368 NULL, *empty_bb);
32370 free (function_version_info);
32371 return 0;
32374 /* This function changes the assembler name for functions that are
32375 versions. If DECL is a function version and has a "target"
32376 attribute, it appends the attribute string to its assembler name. */
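/* For example (hypothetical symbols): with

     __attribute__ ((target ("sse4.2"))) int foo (void);

   a version whose plain assembler name is "_Z3foov" is renamed to
   "_Z3foov.sse4.2", while the "default" version keeps its original
   name.  */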
32378 static tree
32379 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32381 tree version_attr;
32382 const char *orig_name, *version_string;
32383 char *attr_str, *assembler_name;
32385 if (DECL_DECLARED_INLINE_P (decl)
32386 && lookup_attribute ("gnu_inline",
32387 DECL_ATTRIBUTES (decl)))
32388 error_at (DECL_SOURCE_LOCATION (decl),
32389 "Function versions cannot be marked as gnu_inline,"
32390 " bodies have to be generated");
32392 if (DECL_VIRTUAL_P (decl)
32393 || DECL_VINDEX (decl))
32394 sorry ("Virtual function multiversioning not supported");
32396 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32398 /* target attribute string cannot be NULL. */
32399 gcc_assert (version_attr != NULL_TREE);
32401 orig_name = IDENTIFIER_POINTER (id);
32402 version_string
32403 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32405 if (strcmp (version_string, "default") == 0)
32406 return id;
32408 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32409 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32411 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32413 /* Allow assembler name to be modified if already set. */
32414 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32415 SET_DECL_RTL (decl, NULL);
32417 tree ret = get_identifier (assembler_name);
32418 XDELETEVEC (attr_str);
32419 XDELETEVEC (assembler_name);
32420 return ret;
32424 static tree
32425 ix86_mangle_decl_assembler_name (tree decl, tree id)
32427 /* For function version, add the target suffix to the assembler name. */
32428 if (TREE_CODE (decl) == FUNCTION_DECL
32429 && DECL_FUNCTION_VERSIONED (decl))
32430 id = ix86_mangle_function_version_assembler_name (decl, id);
32431 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32432 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32433 #endif
32435 return id;
32438 /* Make a dispatcher declaration for the multi-versioned function DECL.
32439 Calls to DECL function will be replaced with calls to the dispatcher
32440 by the front-end. Returns the decl of the dispatcher function. */
32442 static tree
32443 ix86_get_function_versions_dispatcher (void *decl)
32445 tree fn = (tree) decl;
32446 struct cgraph_node *node = NULL;
32447 struct cgraph_node *default_node = NULL;
32448 struct cgraph_function_version_info *node_v = NULL;
32449 struct cgraph_function_version_info *first_v = NULL;
32451 tree dispatch_decl = NULL;
32453 struct cgraph_function_version_info *default_version_info = NULL;
32455 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32457 node = cgraph_node::get (fn);
32458 gcc_assert (node != NULL);
32460 node_v = node->function_version ();
32461 gcc_assert (node_v != NULL);
32463 if (node_v->dispatcher_resolver != NULL)
32464 return node_v->dispatcher_resolver;
32466 /* Find the default version and make it the first node. */
32467 first_v = node_v;
32468 /* Go to the beginning of the chain. */
32469 while (first_v->prev != NULL)
32470 first_v = first_v->prev;
32471 default_version_info = first_v;
32472 while (default_version_info != NULL)
32474 if (is_function_default_version
32475 (default_version_info->this_node->decl))
32476 break;
32477 default_version_info = default_version_info->next;
32480 /* If there is no default node, just return NULL. */
32481 if (default_version_info == NULL)
32482 return NULL;
32484 /* Make default info the first node. */
32485 if (first_v != default_version_info)
32487 default_version_info->prev->next = default_version_info->next;
32488 if (default_version_info->next)
32489 default_version_info->next->prev = default_version_info->prev;
32490 first_v->prev = default_version_info;
32491 default_version_info->next = first_v;
32492 default_version_info->prev = NULL;
32495 default_node = default_version_info->this_node;
32497 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32498 if (targetm.has_ifunc_p ())
32500 struct cgraph_function_version_info *it_v = NULL;
32501 struct cgraph_node *dispatcher_node = NULL;
32502 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32504 /* Right now, the dispatching is done via ifunc. */
32505 dispatch_decl = make_dispatcher_decl (default_node->decl);
32507 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32508 gcc_assert (dispatcher_node != NULL);
32509 dispatcher_node->dispatcher_function = 1;
32510 dispatcher_version_info
32511 = dispatcher_node->insert_new_function_version ();
32512 dispatcher_version_info->next = default_version_info;
32513 dispatcher_node->definition = 1;
32515 /* Set the dispatcher for all the versions. */
32516 it_v = default_version_info;
32517 while (it_v != NULL)
32519 it_v->dispatcher_resolver = dispatch_decl;
32520 it_v = it_v->next;
32523 else
32524 #endif
32526 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32527 "multiversioning needs ifunc which is not supported "
32528 "on this target");
32531 return dispatch_decl;
32534 /* Make the resolver function decl to dispatch the versions of
32535 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
32536 ifunc alias that will point to the created resolver. Create an
32537 empty basic block in the resolver and store the pointer in
32538 EMPTY_BB. Return the decl of the resolver function. */
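/* Roughly, for a default version foo the generated resolver behaves
   like

     void *foo.resolver (void)
     {
       __builtin_cpu_init ();
       if (<predicate for foo.avx2>) return foo.avx2;
       ...
       return foo;
     }

   This is only a sketch with illustrative names; the actual body is
   built in GIMPLE by dispatch_function_versions above.  */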
32540 static tree
32541 make_resolver_func (const tree default_decl,
32542 const tree ifunc_alias_decl,
32543 basic_block *empty_bb)
32545 char *resolver_name;
32546 tree decl, type, decl_name, t;
32548 /* IFUNCs have to be globally visible. So, if the default_decl is
32549 not, then the name of the IFUNC should be made unique. */
32550 if (TREE_PUBLIC (default_decl) == 0)
32552 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
32553 symtab->change_decl_assembler_name (ifunc_alias_decl,
32554 get_identifier (ifunc_name));
32555 XDELETEVEC (ifunc_name);
32558 resolver_name = make_unique_name (default_decl, "resolver", false);
32560 /* The resolver function should return a (void *). */
32561 type = build_function_type_list (ptr_type_node, NULL_TREE);
32563 decl = build_fn_decl (resolver_name, type);
32564 decl_name = get_identifier (resolver_name);
32565 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32567 DECL_NAME (decl) = decl_name;
32568 TREE_USED (decl) = 1;
32569 DECL_ARTIFICIAL (decl) = 1;
32570 DECL_IGNORED_P (decl) = 1;
32571 TREE_PUBLIC (decl) = 0;
32572 DECL_UNINLINABLE (decl) = 1;
32574 /* Resolver is not external, body is generated. */
32575 DECL_EXTERNAL (decl) = 0;
32576 DECL_EXTERNAL (ifunc_alias_decl) = 0;
32578 DECL_CONTEXT (decl) = NULL_TREE;
32579 DECL_INITIAL (decl) = make_node (BLOCK);
32580 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32582 if (DECL_COMDAT_GROUP (default_decl)
32583 || TREE_PUBLIC (default_decl))
32585 /* In this case, each translation unit with a call to this
32586 versioned function will put out a resolver. Ensure it
32587 is comdat to keep just one copy. */
32588 DECL_COMDAT (decl) = 1;
32589 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32591 /* Build result decl and add to function_decl. */
32592 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32593 DECL_ARTIFICIAL (t) = 1;
32594 DECL_IGNORED_P (t) = 1;
32595 DECL_RESULT (decl) = t;
32597 gimplify_function_tree (decl);
32598 push_cfun (DECL_STRUCT_FUNCTION (decl));
32599 *empty_bb = init_lowered_empty_function (decl, false,
32600 profile_count::uninitialized ());
32602 cgraph_node::add_new_function (decl, true);
32603 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32605 pop_cfun ();
32607 gcc_assert (ifunc_alias_decl != NULL);
32608 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
32609 DECL_ATTRIBUTES (ifunc_alias_decl)
32610 = make_attribute ("ifunc", resolver_name,
32611 DECL_ATTRIBUTES (ifunc_alias_decl));
32613 /* Create the alias for dispatch to resolver here. */
32614 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
32615 XDELETEVEC (resolver_name);
32616 return decl;
32619 /* Generate the dispatching code body to dispatch multi-versioned function
32620 DECL. The target hook is called to process the "target" attributes and
32621 provide the code to dispatch the right function at run-time. NODE points
32622 to the dispatcher decl whose body will be created. */
32624 static tree
32625 ix86_generate_version_dispatcher_body (void *node_p)
32627 tree resolver_decl;
32628 basic_block empty_bb;
32629 tree default_ver_decl;
32630 struct cgraph_node *versn;
32631 struct cgraph_node *node;
32633 struct cgraph_function_version_info *node_version_info = NULL;
32634 struct cgraph_function_version_info *versn_info = NULL;
32636 node = (cgraph_node *)node_p;
32638 node_version_info = node->function_version ();
32639 gcc_assert (node->dispatcher_function
32640 && node_version_info != NULL);
32642 if (node_version_info->dispatcher_resolver)
32643 return node_version_info->dispatcher_resolver;
32645 /* The first version in the chain corresponds to the default version. */
32646 default_ver_decl = node_version_info->next->this_node->decl;
32648 /* node is going to be an alias, so remove the finalized bit. */
32649 node->definition = false;
32651 resolver_decl = make_resolver_func (default_ver_decl,
32652 node->decl, &empty_bb);
32654 node_version_info->dispatcher_resolver = resolver_decl;
32656 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32658 auto_vec<tree, 2> fn_ver_vec;
32660 for (versn_info = node_version_info->next; versn_info;
32661 versn_info = versn_info->next)
32663 versn = versn_info->this_node;
32664 /* Check for virtual functions here again, as by this time it should
32665 have been determined if this function needs a vtable index or
32666 not. This happens for methods in derived classes that override
32667 virtual methods in base classes but are not explicitly marked as
32668 virtual. */
32669 if (DECL_VINDEX (versn->decl))
32670 sorry ("virtual function multiversioning not supported");
32672 fn_ver_vec.safe_push (versn->decl);
32675 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32676 cgraph_edge::rebuild_edges ();
32677 pop_cfun ();
32678 return resolver_decl;
32680 /* This builds the processor_model struct type defined in
32681 libgcc/config/i386/cpuinfo.c. */
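/* That is, the type built below mirrors the declaration in libgcc:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */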
32683 static tree
32684 build_processor_model_struct (void)
32686 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32687 "__cpu_features"};
32688 tree field = NULL_TREE, field_chain = NULL_TREE;
32689 int i;
32690 tree type = make_node (RECORD_TYPE);
32692 /* The first 3 fields are unsigned int. */
32693 for (i = 0; i < 3; ++i)
32695 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32696 get_identifier (field_name[i]), unsigned_type_node);
32697 if (field_chain != NULL_TREE)
32698 DECL_CHAIN (field) = field_chain;
32699 field_chain = field;
32702 /* The last field is an array of unsigned integers of size one. */
32703 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32704 get_identifier (field_name[3]),
32705 build_array_type (unsigned_type_node,
32706 build_index_type (size_one_node)));
32707 if (field_chain != NULL_TREE)
32708 DECL_CHAIN (field) = field_chain;
32709 field_chain = field;
32711 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
32712 return type;
32715 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
32717 static tree
32718 make_var_decl (tree type, const char *name)
32720 tree new_decl;
32722 new_decl = build_decl (UNKNOWN_LOCATION,
32723 VAR_DECL,
32724 get_identifier(name),
32725 type);
32727 DECL_EXTERNAL (new_decl) = 1;
32728 TREE_STATIC (new_decl) = 1;
32729 TREE_PUBLIC (new_decl) = 1;
32730 DECL_INITIAL (new_decl) = 0;
32731 DECL_ARTIFICIAL (new_decl) = 0;
32732 DECL_PRESERVE_P (new_decl) = 1;
32734 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
32735 assemble_variable (new_decl, 0, 0, 0);
32737 return new_decl;
32740 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
32741 into an integer defined in libgcc/config/i386/cpuinfo.c. */
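/* Roughly, __builtin_cpu_is ("corei7") folds to
     (int) (__cpu_model.__cpu_type == M_INTEL_COREI7 - M_CPU_TYPE_START)
   and __builtin_cpu_supports ("avx2") folds to
     (int) (__cpu_model.__cpu_features[0] & (1 << F_AVX2)).  */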
32743 static tree
32744 fold_builtin_cpu (tree fndecl, tree *args)
32746 unsigned int i;
32747 enum ix86_builtins fn_code = (enum ix86_builtins)
32748 DECL_FUNCTION_CODE (fndecl);
32749 tree param_string_cst = NULL;
32751 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
32752 enum processor_features
32754 F_CMOV = 0,
32755 F_MMX,
32756 F_POPCNT,
32757 F_SSE,
32758 F_SSE2,
32759 F_SSE3,
32760 F_SSSE3,
32761 F_SSE4_1,
32762 F_SSE4_2,
32763 F_AVX,
32764 F_AVX2,
32765 F_SSE4_A,
32766 F_FMA4,
32767 F_XOP,
32768 F_FMA,
32769 F_AVX512F,
32770 F_BMI,
32771 F_BMI2,
32772 F_AES,
32773 F_PCLMUL,
32774 F_AVX512VL,
32775 F_AVX512BW,
32776 F_AVX512DQ,
32777 F_AVX512CD,
32778 F_AVX512ER,
32779 F_AVX512PF,
32780 F_AVX512VBMI,
32781 F_AVX512IFMA,
32782 F_AVX5124VNNIW,
32783 F_AVX5124FMAPS,
32784 F_AVX512VPOPCNTDQ,
32785 F_MAX
32788 /* These are the values for vendor types, CPU types and subtypes
32789 in cpuinfo.c. The corresponding start value must be subtracted
32790 from CPU types and subtypes. */
32791 enum processor_model
32793 M_INTEL = 1,
32794 M_AMD,
32795 M_CPU_TYPE_START,
32796 M_INTEL_BONNELL,
32797 M_INTEL_CORE2,
32798 M_INTEL_COREI7,
32799 M_AMDFAM10H,
32800 M_AMDFAM15H,
32801 M_INTEL_SILVERMONT,
32802 M_INTEL_KNL,
32803 M_AMD_BTVER1,
32804 M_AMD_BTVER2,
32805 M_AMDFAM17H,
32806 M_INTEL_KNM,
32807 M_CPU_SUBTYPE_START,
32808 M_INTEL_COREI7_NEHALEM,
32809 M_INTEL_COREI7_WESTMERE,
32810 M_INTEL_COREI7_SANDYBRIDGE,
32811 M_AMDFAM10H_BARCELONA,
32812 M_AMDFAM10H_SHANGHAI,
32813 M_AMDFAM10H_ISTANBUL,
32814 M_AMDFAM15H_BDVER1,
32815 M_AMDFAM15H_BDVER2,
32816 M_AMDFAM15H_BDVER3,
32817 M_AMDFAM15H_BDVER4,
32818 M_AMDFAM17H_ZNVER1,
32819 M_INTEL_COREI7_IVYBRIDGE,
32820 M_INTEL_COREI7_HASWELL,
32821 M_INTEL_COREI7_BROADWELL,
32822 M_INTEL_COREI7_SKYLAKE,
32823 M_INTEL_COREI7_SKYLAKE_AVX512,
32824 M_INTEL_COREI7_CANNONLAKE
32827 static struct _arch_names_table
32829 const char *const name;
32830 const enum processor_model model;
32832 const arch_names_table[] =
32834 {"amd", M_AMD},
32835 {"intel", M_INTEL},
32836 {"atom", M_INTEL_BONNELL},
32837 {"slm", M_INTEL_SILVERMONT},
32838 {"core2", M_INTEL_CORE2},
32839 {"corei7", M_INTEL_COREI7},
32840 {"nehalem", M_INTEL_COREI7_NEHALEM},
32841 {"westmere", M_INTEL_COREI7_WESTMERE},
32842 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
32843 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
32844 {"haswell", M_INTEL_COREI7_HASWELL},
32845 {"broadwell", M_INTEL_COREI7_BROADWELL},
32846 {"skylake", M_INTEL_COREI7_SKYLAKE},
32847 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
32848 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
32849 {"bonnell", M_INTEL_BONNELL},
32850 {"silvermont", M_INTEL_SILVERMONT},
32851 {"knl", M_INTEL_KNL},
32852 {"knm", M_INTEL_KNM},
32853 {"amdfam10h", M_AMDFAM10H},
32854 {"barcelona", M_AMDFAM10H_BARCELONA},
32855 {"shanghai", M_AMDFAM10H_SHANGHAI},
32856 {"istanbul", M_AMDFAM10H_ISTANBUL},
32857 {"btver1", M_AMD_BTVER1},
32858 {"amdfam15h", M_AMDFAM15H},
32859 {"bdver1", M_AMDFAM15H_BDVER1},
32860 {"bdver2", M_AMDFAM15H_BDVER2},
32861 {"bdver3", M_AMDFAM15H_BDVER3},
32862 {"bdver4", M_AMDFAM15H_BDVER4},
32863 {"btver2", M_AMD_BTVER2},
32864 {"amdfam17h", M_AMDFAM17H},
32865 {"znver1", M_AMDFAM17H_ZNVER1},
32868 static struct _isa_names_table
32870 const char *const name;
32871 const enum processor_features feature;
32873 const isa_names_table[] =
32875 {"cmov", F_CMOV},
32876 {"mmx", F_MMX},
32877 {"popcnt", F_POPCNT},
32878 {"sse", F_SSE},
32879 {"sse2", F_SSE2},
32880 {"sse3", F_SSE3},
32881 {"ssse3", F_SSSE3},
32882 {"sse4a", F_SSE4_A},
32883 {"sse4.1", F_SSE4_1},
32884 {"sse4.2", F_SSE4_2},
32885 {"avx", F_AVX},
32886 {"fma4", F_FMA4},
32887 {"xop", F_XOP},
32888 {"fma", F_FMA},
32889 {"avx2", F_AVX2},
32890 {"avx512f", F_AVX512F},
32891 {"bmi", F_BMI},
32892 {"bmi2", F_BMI2},
32893 {"aes", F_AES},
32894 {"pclmul", F_PCLMUL},
32895 {"avx512vl", F_AVX512VL},
32896 {"avx512bw", F_AVX512BW},
32897 {"avx512dq", F_AVX512DQ},
32898 {"avx512cd", F_AVX512CD},
32899 {"avx512er", F_AVX512ER},
32900 {"avx512pf", F_AVX512PF},
32901 {"avx512vbmi", F_AVX512VBMI},
32902 {"avx512ifma", F_AVX512IFMA},
32903 {"avx5124vnniw", F_AVX5124VNNIW},
32904 {"avx5124fmaps", F_AVX5124FMAPS},
32905 {"avx512vpopcntdq", F_AVX512VPOPCNTDQ}
32908 tree __processor_model_type = build_processor_model_struct ();
32909 tree __cpu_model_var = make_var_decl (__processor_model_type,
32910 "__cpu_model");
32913 varpool_node::add (__cpu_model_var);
32915 gcc_assert ((args != NULL) && (*args != NULL));
32917 param_string_cst = *args;
32918 while (param_string_cst
32919 && TREE_CODE (param_string_cst) != STRING_CST)
32921 /* *args must be an expr that can contain other EXPRs leading to a
32922 STRING_CST. */
32923 if (!EXPR_P (param_string_cst))
32925 error ("parameter to builtin must be a string constant or literal");
32926 return integer_zero_node;
32928 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
32931 gcc_assert (param_string_cst);
32933 if (fn_code == IX86_BUILTIN_CPU_IS)
32935 tree ref;
32936 tree field;
32937 tree final;
32939 unsigned int field_val = 0;
32940 unsigned int NUM_ARCH_NAMES
32941 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
32943 for (i = 0; i < NUM_ARCH_NAMES; i++)
32944 if (strcmp (arch_names_table[i].name,
32945 TREE_STRING_POINTER (param_string_cst)) == 0)
32946 break;
32948 if (i == NUM_ARCH_NAMES)
32950 error ("parameter to builtin not valid: %s",
32951 TREE_STRING_POINTER (param_string_cst));
32952 return integer_zero_node;
32955 field = TYPE_FIELDS (__processor_model_type);
32956 field_val = arch_names_table[i].model;
32958 /* CPU types are stored in the next field, __cpu_type. */
32959 if (field_val > M_CPU_TYPE_START
32960 && field_val < M_CPU_SUBTYPE_START)
32962 field = DECL_CHAIN (field);
32963 field_val -= M_CPU_TYPE_START;
32966 /* CPU subtypes are stored two fields on, in __cpu_subtype. */
32967 if (field_val > M_CPU_SUBTYPE_START)
32969 field = DECL_CHAIN (DECL_CHAIN (field));
32970 field_val -= M_CPU_SUBTYPE_START;
32973 /* Get the appropriate field in __cpu_model. */
32974 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32975 field, NULL_TREE);
32977 /* Check the value. */
32978 final = build2 (EQ_EXPR, unsigned_type_node, ref,
32979 build_int_cstu (unsigned_type_node, field_val));
32980 return build1 (CONVERT_EXPR, integer_type_node, final);
32982 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
32984 tree ref;
32985 tree array_elt;
32986 tree field;
32987 tree final;
32989 unsigned int field_val = 0;
32990 unsigned int NUM_ISA_NAMES
32991 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
32993 for (i = 0; i < NUM_ISA_NAMES; i++)
32994 if (strcmp (isa_names_table[i].name,
32995 TREE_STRING_POINTER (param_string_cst)) == 0)
32996 break;
32998 if (i == NUM_ISA_NAMES)
33000 error ("parameter to builtin not valid: %s",
33001 TREE_STRING_POINTER (param_string_cst));
33002 return integer_zero_node;
33005 field = TYPE_FIELDS (__processor_model_type);
33006 /* Get the last field, which is __cpu_features. */
33007 while (DECL_CHAIN (field))
33008 field = DECL_CHAIN (field);
33010 /* Get the appropriate field: __cpu_model.__cpu_features */
33011 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33012 field, NULL_TREE);
33014 /* Access the 0th element of __cpu_features array. */
33015 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33016 integer_zero_node, NULL_TREE, NULL_TREE);
33018 field_val = (1 << isa_names_table[i].feature);
33019 /* Return __cpu_model.__cpu_features[0] & field_val */
33020 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33021 build_int_cstu (unsigned_type_node, field_val));
33022 return build1 (CONVERT_EXPR, integer_type_node, final);
33024 gcc_unreachable ();
33027 static tree
33028 ix86_fold_builtin (tree fndecl, int n_args,
33029 tree *args, bool ignore ATTRIBUTE_UNUSED)
33031 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33033 enum ix86_builtins fn_code = (enum ix86_builtins)
33034 DECL_FUNCTION_CODE (fndecl);
33035 switch (fn_code)
33037 case IX86_BUILTIN_CPU_IS:
33038 case IX86_BUILTIN_CPU_SUPPORTS:
33039 gcc_assert (n_args == 1);
33040 return fold_builtin_cpu (fndecl, args);
33042 case IX86_BUILTIN_NANQ:
33043 case IX86_BUILTIN_NANSQ:
33045 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33046 const char *str = c_getstr (*args);
33047 int quiet = fn_code == IX86_BUILTIN_NANQ;
33048 REAL_VALUE_TYPE real;
33050 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33051 return build_real (type, real);
33052 return NULL_TREE;
33055 case IX86_BUILTIN_INFQ:
33056 case IX86_BUILTIN_HUGE_VALQ:
33058 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33059 REAL_VALUE_TYPE inf;
33060 real_inf (&inf);
33061 return build_real (type, inf);
33064 case IX86_BUILTIN_TZCNT16:
33065 case IX86_BUILTIN_CTZS:
33066 case IX86_BUILTIN_TZCNT32:
33067 case IX86_BUILTIN_TZCNT64:
33068 gcc_assert (n_args == 1);
33069 if (TREE_CODE (args[0]) == INTEGER_CST)
33071 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33072 tree arg = args[0];
33073 if (fn_code == IX86_BUILTIN_TZCNT16
33074 || fn_code == IX86_BUILTIN_CTZS)
33075 arg = fold_convert (short_unsigned_type_node, arg);
33076 if (integer_zerop (arg))
33077 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33078 else
33079 return fold_const_call (CFN_CTZ, type, arg);
33081 break;
33083 case IX86_BUILTIN_LZCNT16:
33084 case IX86_BUILTIN_CLZS:
33085 case IX86_BUILTIN_LZCNT32:
33086 case IX86_BUILTIN_LZCNT64:
33087 gcc_assert (n_args == 1);
33088 if (TREE_CODE (args[0]) == INTEGER_CST)
33090 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33091 tree arg = args[0];
33092 if (fn_code == IX86_BUILTIN_LZCNT16
33093 || fn_code == IX86_BUILTIN_CLZS)
33094 arg = fold_convert (short_unsigned_type_node, arg);
33095 if (integer_zerop (arg))
33096 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33097 else
33098 return fold_const_call (CFN_CLZ, type, arg);
33100 break;
33102 case IX86_BUILTIN_BEXTR32:
33103 case IX86_BUILTIN_BEXTR64:
33104 case IX86_BUILTIN_BEXTRI32:
33105 case IX86_BUILTIN_BEXTRI64:
33106 gcc_assert (n_args == 2);
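/* The second argument packs the control word: bits [7:0] give the
   start bit and bits [15:8] the number of bits to extract.  */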
33107 if (tree_fits_uhwi_p (args[1]))
33109 unsigned HOST_WIDE_INT res = 0;
33110 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33111 unsigned int start = tree_to_uhwi (args[1]);
33112 unsigned int len = (start & 0xff00) >> 8;
33113 start &= 0xff;
33114 if (start >= prec || len == 0)
33115 res = 0;
33116 else if (!tree_fits_uhwi_p (args[0]))
33117 break;
33118 else
33119 res = tree_to_uhwi (args[0]) >> start;
33120 if (len > prec)
33121 len = prec;
33122 if (len < HOST_BITS_PER_WIDE_INT)
33123 res &= (HOST_WIDE_INT_1U << len) - 1;
33124 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33126 break;
33128 case IX86_BUILTIN_BZHI32:
33129 case IX86_BUILTIN_BZHI64:
33130 gcc_assert (n_args == 2);
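/* BZHI zeroes all bits of the first argument from the bit index
   given by the low byte of the second argument upwards.  */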
33131 if (tree_fits_uhwi_p (args[1]))
33133 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33134 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33135 return args[0];
33136 if (!tree_fits_uhwi_p (args[0]))
33137 break;
33138 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33139 res &= ~(HOST_WIDE_INT_M1U << idx);
33140 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33142 break;
33144 case IX86_BUILTIN_PDEP32:
33145 case IX86_BUILTIN_PDEP64:
33146 gcc_assert (n_args == 2);
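/* PDEP deposits the contiguous low bits of the first argument at the
   bit positions set in the mask (second argument).  */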
33147 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33149 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33150 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33151 unsigned HOST_WIDE_INT res = 0;
33152 unsigned HOST_WIDE_INT m, k = 1;
33153 for (m = 1; m; m <<= 1)
33154 if ((mask & m) != 0)
33156 if ((src & k) != 0)
33157 res |= m;
33158 k <<= 1;
33160 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33162 break;
33164 case IX86_BUILTIN_PEXT32:
33165 case IX86_BUILTIN_PEXT64:
33166 gcc_assert (n_args == 2);
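/* PEXT gathers the bits of the first argument selected by the mask
   (second argument) into the contiguous low bits of the result.  */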
33167 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33169 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33170 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33171 unsigned HOST_WIDE_INT res = 0;
33172 unsigned HOST_WIDE_INT m, k = 1;
33173 for (m = 1; m; m <<= 1)
33174 if ((mask & m) != 0)
33176 if ((src & m) != 0)
33177 res |= k;
33178 k <<= 1;
33180 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33182 break;
33184 default:
33185 break;
33189 #ifdef SUBTARGET_FOLD_BUILTIN
33190 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33191 #endif
33193 return NULL_TREE;
33196 /* Fold a MD builtin (use ix86_fold_builtin for folding into
33197 constant) in GIMPLE. */
33199 bool
33200 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33202 gimple *stmt = gsi_stmt (*gsi);
33203 tree fndecl = gimple_call_fndecl (stmt);
33204 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33205 int n_args = gimple_call_num_args (stmt);
33206 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33207 tree decl = NULL_TREE;
33208 tree arg0, arg1;
33210 switch (fn_code)
33212 case IX86_BUILTIN_TZCNT32:
33213 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33214 goto fold_tzcnt_lzcnt;
33216 case IX86_BUILTIN_TZCNT64:
33217 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33218 goto fold_tzcnt_lzcnt;
33220 case IX86_BUILTIN_LZCNT32:
33221 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33222 goto fold_tzcnt_lzcnt;
33224 case IX86_BUILTIN_LZCNT64:
33225 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33226 goto fold_tzcnt_lzcnt;
33228 fold_tzcnt_lzcnt:
33229 gcc_assert (n_args == 1);
33230 arg0 = gimple_call_arg (stmt, 0);
33231 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33233 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33234 /* If arg0 is provably non-zero, optimize into the generic
33235 __builtin_c[tl]z{,ll} function, which the middle-end handles
33236 better. */
33237 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33238 return false;
33240 location_t loc = gimple_location (stmt);
33241 gimple *g = gimple_build_call (decl, 1, arg0);
33242 gimple_set_location (g, loc);
33243 tree lhs = make_ssa_name (integer_type_node);
33244 gimple_call_set_lhs (g, lhs);
33245 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33246 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33247 gimple_set_location (g, loc);
33248 gsi_replace (gsi, g, false);
33249 return true;
33251 break;
33253 case IX86_BUILTIN_BZHI32:
33254 case IX86_BUILTIN_BZHI64:
33255 gcc_assert (n_args == 2);
33256 arg1 = gimple_call_arg (stmt, 1);
33257 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33259 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33260 arg0 = gimple_call_arg (stmt, 0);
33261 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33262 break;
33263 location_t loc = gimple_location (stmt);
33264 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33265 gimple_set_location (g, loc);
33266 gsi_replace (gsi, g, false);
33267 return true;
33269 break;
33271 case IX86_BUILTIN_PDEP32:
33272 case IX86_BUILTIN_PDEP64:
33273 case IX86_BUILTIN_PEXT32:
33274 case IX86_BUILTIN_PEXT64:
33275 gcc_assert (n_args == 2);
33276 arg1 = gimple_call_arg (stmt, 1);
33277 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33279 location_t loc = gimple_location (stmt);
33280 arg0 = gimple_call_arg (stmt, 0);
33281 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33282 gimple_set_location (g, loc);
33283 gsi_replace (gsi, g, false);
33284 return true;
33286 break;
33288 default:
33289 break;
33292 return false;
33295 /* Make builtins to detect cpu type and features supported. NAME is
33296 the builtin name, CODE is the builtin code, and FTYPE is the function
33297 type of the builtin. */
33299 static void
33300 make_cpu_type_builtin (const char* name, int code,
33301 enum ix86_builtin_func_type ftype, bool is_const)
33303 tree decl;
33304 tree type;
33306 type = ix86_get_builtin_func_type (ftype);
33307 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33308 NULL, NULL_TREE);
33309 gcc_assert (decl != NULL_TREE);
33310 ix86_builtins[(int) code] = decl;
33311 TREE_READONLY (decl) = is_const;
33314 /* Make builtins to get CPU type and features supported. The created
33315 builtins are:
33317 __builtin_cpu_init (), to detect cpu type and features,
33318 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33319 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
33322 static void
33323 ix86_init_platform_type_builtins (void)
33325 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33326 INT_FTYPE_VOID, false);
33327 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33328 INT_FTYPE_PCCHAR, true);
33329 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33330 INT_FTYPE_PCCHAR, true);
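/* For example (illustrative user code only), the builtins created
   above can be used as

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("corei7"))
       ...
     else if (__builtin_cpu_supports ("sse4.2"))
       ...  */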
33333 /* Internal method for ix86_init_builtins. */
33335 static void
33336 ix86_init_builtins_va_builtins_abi (void)
33338 tree ms_va_ref, sysv_va_ref;
33339 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33340 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33341 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33342 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33344 if (!TARGET_64BIT)
33345 return;
33346 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33347 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33348 ms_va_ref = build_reference_type (ms_va_list_type_node);
33349 sysv_va_ref =
33350 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33352 fnvoid_va_end_ms =
33353 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33354 fnvoid_va_start_ms =
33355 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33356 fnvoid_va_end_sysv =
33357 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33358 fnvoid_va_start_sysv =
33359 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33360 NULL_TREE);
33361 fnvoid_va_copy_ms =
33362 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33363 NULL_TREE);
33364 fnvoid_va_copy_sysv =
33365 build_function_type_list (void_type_node, sysv_va_ref,
33366 sysv_va_ref, NULL_TREE);
33368 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33369 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33370 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33371 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33372 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33373 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33374 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33375 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33376 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33377 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33378 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33379 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33382 static void
33383 ix86_init_builtin_types (void)
33385 tree float80_type_node, const_string_type_node;
33387 /* The __float80 type. */
33388 float80_type_node = long_double_type_node;
33389 if (TYPE_MODE (float80_type_node) != XFmode)
33391 if (float64x_type_node != NULL_TREE
33392 && TYPE_MODE (float64x_type_node) == XFmode)
33393 float80_type_node = float64x_type_node;
33394 else
33396 /* The __float80 type. */
33397 float80_type_node = make_node (REAL_TYPE);
33399 TYPE_PRECISION (float80_type_node) = 80;
33400 layout_type (float80_type_node);
33403 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
33405 /* The __float128 type. The node has already been created as
33406 _Float128, so we only need to register the __float128 name for
33407 it. */
33408 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
33410 const_string_type_node
33411 = build_pointer_type (build_qualified_type
33412 (char_type_node, TYPE_QUAL_CONST));
33414 /* This macro is built by i386-builtin-types.awk. */
33415 DEFINE_BUILTIN_PRIMITIVE_TYPES;
33418 static void
33419 ix86_init_builtins (void)
33421 tree ftype, decl;
33423 ix86_init_builtin_types ();
33425 /* Builtins to get CPU type and features. */
33426 ix86_init_platform_type_builtins ();
33428 /* TFmode support builtins. */
33429 def_builtin_const (0, "__builtin_infq",
33430 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
33431 def_builtin_const (0, "__builtin_huge_valq",
33432 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
33434 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
33435 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
33436 BUILT_IN_MD, "nanq", NULL_TREE);
33437 TREE_READONLY (decl) = 1;
33438 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
33440 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
33441 BUILT_IN_MD, "nansq", NULL_TREE);
33442 TREE_READONLY (decl) = 1;
33443 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
33445 /* We will expand them to a normal call if SSE isn't available since
33446 they are used by libgcc. */
33447 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
33448 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
33449 BUILT_IN_MD, "__fabstf2", NULL_TREE);
33450 TREE_READONLY (decl) = 1;
33451 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
33453 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
33454 decl = add_builtin_function ("__builtin_copysignq", ftype,
33455 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
33456 "__copysigntf3", NULL_TREE);
33457 TREE_READONLY (decl) = 1;
33458 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
33460 ix86_init_tm_builtins ();
33461 ix86_init_mmx_sse_builtins ();
33462 ix86_init_mpx_builtins ();
33464 if (TARGET_LP64)
33465 ix86_init_builtins_va_builtins_abi ();
33467 #ifdef SUBTARGET_INIT_BUILTINS
33468 SUBTARGET_INIT_BUILTINS;
33469 #endif
33472 /* Return the ix86 builtin for CODE. */
33474 static tree
33475 ix86_builtin_decl (unsigned code, bool)
33477 if (code >= IX86_BUILTIN_MAX)
33478 return error_mark_node;
33480 return ix86_builtins[code];
33483 /* Errors in the source file can cause expand_expr to return const0_rtx
33484 where we expect a vector. To avoid crashing, use one of the vector
33485 clear instructions. */
33486 static rtx
33487 safe_vector_operand (rtx x, machine_mode mode)
33489 if (x == const0_rtx)
33490 x = CONST0_RTX (mode);
33491 return x;
33494 /* Fix up modeless constants to fit the required mode. */
33495 static rtx
33496 fixup_modeless_constant (rtx x, machine_mode mode)
33498 if (GET_MODE (x) == VOIDmode)
33499 x = convert_to_mode (mode, x, 1);
33500 return x;
33503 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
33505 static rtx
33506 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
33508 rtx pat;
33509 tree arg0 = CALL_EXPR_ARG (exp, 0);
33510 tree arg1 = CALL_EXPR_ARG (exp, 1);
33511 rtx op0 = expand_normal (arg0);
33512 rtx op1 = expand_normal (arg1);
33513 machine_mode tmode = insn_data[icode].operand[0].mode;
33514 machine_mode mode0 = insn_data[icode].operand[1].mode;
33515 machine_mode mode1 = insn_data[icode].operand[2].mode;
33517 if (VECTOR_MODE_P (mode0))
33518 op0 = safe_vector_operand (op0, mode0);
33519 if (VECTOR_MODE_P (mode1))
33520 op1 = safe_vector_operand (op1, mode1);
33522 if (optimize || !target
33523 || GET_MODE (target) != tmode
33524 || !insn_data[icode].operand[0].predicate (target, tmode))
33525 target = gen_reg_rtx (tmode);
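/* If the insn wants a TImode operand but the argument was expanded
   as an SImode value, load it into a vector register and use its
   TImode lowpart.  */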
33527 if (GET_MODE (op1) == SImode && mode1 == TImode)
33529 rtx x = gen_reg_rtx (V4SImode);
33530 emit_insn (gen_sse2_loadd (x, op1));
33531 op1 = gen_lowpart (TImode, x);
33534 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33535 op0 = copy_to_mode_reg (mode0, op0);
33536 if (!insn_data[icode].operand[2].predicate (op1, mode1))
33537 op1 = copy_to_mode_reg (mode1, op1);
33539 pat = GEN_FCN (icode) (target, op0, op1);
33540 if (! pat)
33541 return 0;
33543 emit_insn (pat);
33545 return target;
33548 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
33550 static rtx
33551 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
33552 enum ix86_builtin_func_type m_type,
33553 enum rtx_code sub_code)
33555 rtx pat;
33556 int i;
33557 int nargs;
33558 bool comparison_p = false;
33559 bool tf_p = false;
33560 bool last_arg_constant = false;
33561 int num_memory = 0;
33562 struct {
33563 rtx op;
33564 machine_mode mode;
33565 } args[4];
33567 machine_mode tmode = insn_data[icode].operand[0].mode;
33569 switch (m_type)
33571 case MULTI_ARG_4_DF2_DI_I:
33572 case MULTI_ARG_4_DF2_DI_I1:
33573 case MULTI_ARG_4_SF2_SI_I:
33574 case MULTI_ARG_4_SF2_SI_I1:
33575 nargs = 4;
33576 last_arg_constant = true;
33577 break;
33579 case MULTI_ARG_3_SF:
33580 case MULTI_ARG_3_DF:
33581 case MULTI_ARG_3_SF2:
33582 case MULTI_ARG_3_DF2:
33583 case MULTI_ARG_3_DI:
33584 case MULTI_ARG_3_SI:
33585 case MULTI_ARG_3_SI_DI:
33586 case MULTI_ARG_3_HI:
33587 case MULTI_ARG_3_HI_SI:
33588 case MULTI_ARG_3_QI:
33589 case MULTI_ARG_3_DI2:
33590 case MULTI_ARG_3_SI2:
33591 case MULTI_ARG_3_HI2:
33592 case MULTI_ARG_3_QI2:
33593 nargs = 3;
33594 break;
33596 case MULTI_ARG_2_SF:
33597 case MULTI_ARG_2_DF:
33598 case MULTI_ARG_2_DI:
33599 case MULTI_ARG_2_SI:
33600 case MULTI_ARG_2_HI:
33601 case MULTI_ARG_2_QI:
33602 nargs = 2;
33603 break;
33605 case MULTI_ARG_2_DI_IMM:
33606 case MULTI_ARG_2_SI_IMM:
33607 case MULTI_ARG_2_HI_IMM:
33608 case MULTI_ARG_2_QI_IMM:
33609 nargs = 2;
33610 last_arg_constant = true;
33611 break;
33613 case MULTI_ARG_1_SF:
33614 case MULTI_ARG_1_DF:
33615 case MULTI_ARG_1_SF2:
33616 case MULTI_ARG_1_DF2:
33617 case MULTI_ARG_1_DI:
33618 case MULTI_ARG_1_SI:
33619 case MULTI_ARG_1_HI:
33620 case MULTI_ARG_1_QI:
33621 case MULTI_ARG_1_SI_DI:
33622 case MULTI_ARG_1_HI_DI:
33623 case MULTI_ARG_1_HI_SI:
33624 case MULTI_ARG_1_QI_DI:
33625 case MULTI_ARG_1_QI_SI:
33626 case MULTI_ARG_1_QI_HI:
33627 nargs = 1;
33628 break;
33630 case MULTI_ARG_2_DI_CMP:
33631 case MULTI_ARG_2_SI_CMP:
33632 case MULTI_ARG_2_HI_CMP:
33633 case MULTI_ARG_2_QI_CMP:
33634 nargs = 2;
33635 comparison_p = true;
33636 break;
33638 case MULTI_ARG_2_SF_TF:
33639 case MULTI_ARG_2_DF_TF:
33640 case MULTI_ARG_2_DI_TF:
33641 case MULTI_ARG_2_SI_TF:
33642 case MULTI_ARG_2_HI_TF:
33643 case MULTI_ARG_2_QI_TF:
33644 nargs = 2;
33645 tf_p = true;
33646 break;
33648 default:
33649 gcc_unreachable ();
33652 if (optimize || !target
33653 || GET_MODE (target) != tmode
33654 || !insn_data[icode].operand[0].predicate (target, tmode))
33655 target = gen_reg_rtx (tmode);
33656 else if (memory_operand (target, tmode))
33657 num_memory++;
33659 gcc_assert (nargs <= 4);
33661 for (i = 0; i < nargs; i++)
33663 tree arg = CALL_EXPR_ARG (exp, i);
33664 rtx op = expand_normal (arg);
33665 int adjust = (comparison_p) ? 1 : 0;
33666 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
33668 if (last_arg_constant && i == nargs - 1)
33670 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
33672 enum insn_code new_icode = icode;
33673 switch (icode)
33675 case CODE_FOR_xop_vpermil2v2df3:
33676 case CODE_FOR_xop_vpermil2v4sf3:
33677 case CODE_FOR_xop_vpermil2v4df3:
33678 case CODE_FOR_xop_vpermil2v8sf3:
33679 error ("the last argument must be a 2-bit immediate");
33680 return gen_reg_rtx (tmode);
33681 case CODE_FOR_xop_rotlv2di3:
33682 new_icode = CODE_FOR_rotlv2di3;
33683 goto xop_rotl;
33684 case CODE_FOR_xop_rotlv4si3:
33685 new_icode = CODE_FOR_rotlv4si3;
33686 goto xop_rotl;
33687 case CODE_FOR_xop_rotlv8hi3:
33688 new_icode = CODE_FOR_rotlv8hi3;
33689 goto xop_rotl;
33690 case CODE_FOR_xop_rotlv16qi3:
33691 new_icode = CODE_FOR_rotlv16qi3;
33692 xop_rotl:
33693 if (CONST_INT_P (op))
33695 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
33696 op = GEN_INT (INTVAL (op) & mask);
33697 gcc_checking_assert
33698 (insn_data[icode].operand[i + 1].predicate (op, mode));
33700 else
33702 gcc_checking_assert
33703 (nargs == 2
33704 && insn_data[new_icode].operand[0].mode == tmode
33705 && insn_data[new_icode].operand[1].mode == tmode
33706 && insn_data[new_icode].operand[2].mode == mode
33707 && insn_data[new_icode].operand[0].predicate
33708 == insn_data[icode].operand[0].predicate
33709 && insn_data[new_icode].operand[1].predicate
33710 == insn_data[icode].operand[1].predicate);
33711 icode = new_icode;
33712 goto non_constant;
33714 break;
33715 default:
33716 gcc_unreachable ();
33720 else
33722 non_constant:
33723 if (VECTOR_MODE_P (mode))
33724 op = safe_vector_operand (op, mode);
33726 /* If we aren't optimizing, only allow one memory operand to be
33727 generated. */
33728 if (memory_operand (op, mode))
33729 num_memory++;
33731 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
33733 if (optimize
33734 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
33735 || num_memory > 1)
33736 op = force_reg (mode, op);
33739 args[i].op = op;
33740 args[i].mode = mode;
33743 switch (nargs)
33745 case 1:
33746 pat = GEN_FCN (icode) (target, args[0].op);
33747 break;
33749 case 2:
33750 if (tf_p)
33751 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
33752 GEN_INT ((int)sub_code));
33753 else if (! comparison_p)
33754 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
33755 else
33757 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
33758 args[0].op,
33759 args[1].op);
33761 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
33763 break;
33765 case 3:
33766 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
33767 break;
33769 case 4:
33770 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
33771 break;
33773 default:
33774 gcc_unreachable ();
33777 if (! pat)
33778 return 0;
33780 emit_insn (pat);
33781 return target;
33784 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
33785 insns with vec_merge. */
33787 static rtx
33788 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
33789 rtx target)
33791 rtx pat;
33792 tree arg0 = CALL_EXPR_ARG (exp, 0);
33793 rtx op1, op0 = expand_normal (arg0);
33794 machine_mode tmode = insn_data[icode].operand[0].mode;
33795 machine_mode mode0 = insn_data[icode].operand[1].mode;
33797 if (optimize || !target
33798 || GET_MODE (target) != tmode
33799 || !insn_data[icode].operand[0].predicate (target, tmode))
33800 target = gen_reg_rtx (tmode);
33802 if (VECTOR_MODE_P (mode0))
33803 op0 = safe_vector_operand (op0, mode0);
33805 if ((optimize && !register_operand (op0, mode0))
33806 || !insn_data[icode].operand[1].predicate (op0, mode0))
33807 op0 = copy_to_mode_reg (mode0, op0);
33809 op1 = op0;
33810 if (!insn_data[icode].operand[2].predicate (op1, mode0))
33811 op1 = copy_to_mode_reg (mode0, op1);
33813 pat = GEN_FCN (icode) (target, op0, op1);
33814 if (! pat)
33815 return 0;
33816 emit_insn (pat);
33817 return target;
33820 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
33822 static rtx
33823 ix86_expand_sse_compare (const struct builtin_description *d,
33824 tree exp, rtx target, bool swap)
33826 rtx pat;
33827 tree arg0 = CALL_EXPR_ARG (exp, 0);
33828 tree arg1 = CALL_EXPR_ARG (exp, 1);
33829 rtx op0 = expand_normal (arg0);
33830 rtx op1 = expand_normal (arg1);
33831 rtx op2;
33832 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33833 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33834 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33835 enum rtx_code comparison = d->comparison;
33837 if (VECTOR_MODE_P (mode0))
33838 op0 = safe_vector_operand (op0, mode0);
33839 if (VECTOR_MODE_P (mode1))
33840 op1 = safe_vector_operand (op1, mode1);
33842 /* Swap operands if we have a comparison that isn't available in
33843 hardware. */
33844 if (swap)
33845 std::swap (op0, op1);
33847 if (optimize || !target
33848 || GET_MODE (target) != tmode
33849 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33850 target = gen_reg_rtx (tmode);
33852 if ((optimize && !register_operand (op0, mode0))
33853 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
33854 op0 = copy_to_mode_reg (mode0, op0);
33855 if ((optimize && !register_operand (op1, mode1))
33856 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
33857 op1 = copy_to_mode_reg (mode1, op1);
33859 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33860 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33861 if (! pat)
33862 return 0;
33863 emit_insn (pat);
33864 return target;
33867 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
33869 static rtx
33870 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33871 rtx target)
33873 rtx pat;
33874 tree arg0 = CALL_EXPR_ARG (exp, 0);
33875 tree arg1 = CALL_EXPR_ARG (exp, 1);
33876 rtx op0 = expand_normal (arg0);
33877 rtx op1 = expand_normal (arg1);
33878 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33879 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33880 enum rtx_code comparison = d->comparison;
33882 if (VECTOR_MODE_P (mode0))
33883 op0 = safe_vector_operand (op0, mode0);
33884 if (VECTOR_MODE_P (mode1))
33885 op1 = safe_vector_operand (op1, mode1);
33887 /* Swap operands if we have a comparison that isn't available in
33888 hardware. */
33889 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33890 std::swap (op0, op1);
33892 target = gen_reg_rtx (SImode);
33893 emit_move_insn (target, const0_rtx);
33894 target = gen_rtx_SUBREG (QImode, target, 0);
33896 if ((optimize && !register_operand (op0, mode0))
33897 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33898 op0 = copy_to_mode_reg (mode0, op0);
33899 if ((optimize && !register_operand (op1, mode1))
33900 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33901 op1 = copy_to_mode_reg (mode1, op1);
33903 pat = GEN_FCN (d->icode) (op0, op1);
33904 if (! pat)
33905 return 0;
33906 emit_insn (pat);
33907 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33908 gen_rtx_fmt_ee (comparison, QImode,
33909 SET_DEST (pat),
33910 const0_rtx)));
33912 return SUBREG_REG (target);
33915 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
33917 static rtx
33918 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33919 rtx target)
33921 rtx pat;
33922 tree arg0 = CALL_EXPR_ARG (exp, 0);
33923 rtx op1, op0 = expand_normal (arg0);
33924 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33925 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33927 if (optimize || target == 0
33928 || GET_MODE (target) != tmode
33929 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33930 target = gen_reg_rtx (tmode);
33932 if (VECTOR_MODE_P (mode0))
33933 op0 = safe_vector_operand (op0, mode0);
33935 if ((optimize && !register_operand (op0, mode0))
33936 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33937 op0 = copy_to_mode_reg (mode0, op0);
33939 op1 = GEN_INT (d->comparison);
33941 pat = GEN_FCN (d->icode) (target, op0, op1);
33942 if (! pat)
33943 return 0;
33944 emit_insn (pat);
33945 return target;
33948 static rtx
33949 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
33950 tree exp, rtx target)
33952 rtx pat;
33953 tree arg0 = CALL_EXPR_ARG (exp, 0);
33954 tree arg1 = CALL_EXPR_ARG (exp, 1);
33955 rtx op0 = expand_normal (arg0);
33956 rtx op1 = expand_normal (arg1);
33957 rtx op2;
33958 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33959 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33960 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33962 if (optimize || target == 0
33963 || GET_MODE (target) != tmode
33964 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33965 target = gen_reg_rtx (tmode);
33967 op0 = safe_vector_operand (op0, mode0);
33968 op1 = safe_vector_operand (op1, mode1);
33970 if ((optimize && !register_operand (op0, mode0))
33971 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33972 op0 = copy_to_mode_reg (mode0, op0);
33973 if ((optimize && !register_operand (op1, mode1))
33974 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33975 op1 = copy_to_mode_reg (mode1, op1);
33977 op2 = GEN_INT (d->comparison);
33979 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33980 if (! pat)
33981 return 0;
33982 emit_insn (pat);
33983 return target;
33986 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
33988 static rtx
33989 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
33990 rtx target)
33992 rtx pat;
33993 tree arg0 = CALL_EXPR_ARG (exp, 0);
33994 tree arg1 = CALL_EXPR_ARG (exp, 1);
33995 rtx op0 = expand_normal (arg0);
33996 rtx op1 = expand_normal (arg1);
33997 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33998 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33999 enum rtx_code comparison = d->comparison;
34001 if (VECTOR_MODE_P (mode0))
34002 op0 = safe_vector_operand (op0, mode0);
34003 if (VECTOR_MODE_P (mode1))
34004 op1 = safe_vector_operand (op1, mode1);
34006 target = gen_reg_rtx (SImode);
34007 emit_move_insn (target, const0_rtx);
34008 target = gen_rtx_SUBREG (QImode, target, 0);
34010 if ((optimize && !register_operand (op0, mode0))
34011 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34012 op0 = copy_to_mode_reg (mode0, op0);
34013 if ((optimize && !register_operand (op1, mode1))
34014 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34015 op1 = copy_to_mode_reg (mode1, op1);
34017 pat = GEN_FCN (d->icode) (op0, op1);
34018 if (! pat)
34019 return 0;
34020 emit_insn (pat);
34021 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34022 gen_rtx_fmt_ee (comparison, QImode,
34023 SET_DEST (pat),
34024 const0_rtx)));
34026 return SUBREG_REG (target);
34029 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34031 static rtx
34032 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34033 tree exp, rtx target)
34035 rtx pat;
34036 tree arg0 = CALL_EXPR_ARG (exp, 0);
34037 tree arg1 = CALL_EXPR_ARG (exp, 1);
34038 tree arg2 = CALL_EXPR_ARG (exp, 2);
34039 tree arg3 = CALL_EXPR_ARG (exp, 3);
34040 tree arg4 = CALL_EXPR_ARG (exp, 4);
34041 rtx scratch0, scratch1;
34042 rtx op0 = expand_normal (arg0);
34043 rtx op1 = expand_normal (arg1);
34044 rtx op2 = expand_normal (arg2);
34045 rtx op3 = expand_normal (arg3);
34046 rtx op4 = expand_normal (arg4);
34047 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34049 tmode0 = insn_data[d->icode].operand[0].mode;
34050 tmode1 = insn_data[d->icode].operand[1].mode;
34051 modev2 = insn_data[d->icode].operand[2].mode;
34052 modei3 = insn_data[d->icode].operand[3].mode;
34053 modev4 = insn_data[d->icode].operand[4].mode;
34054 modei5 = insn_data[d->icode].operand[5].mode;
34055 modeimm = insn_data[d->icode].operand[6].mode;
34057 if (VECTOR_MODE_P (modev2))
34058 op0 = safe_vector_operand (op0, modev2);
34059 if (VECTOR_MODE_P (modev4))
34060 op2 = safe_vector_operand (op2, modev4);
34062 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34063 op0 = copy_to_mode_reg (modev2, op0);
34064 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34065 op1 = copy_to_mode_reg (modei3, op1);
34066 if ((optimize && !register_operand (op2, modev4))
34067 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34068 op2 = copy_to_mode_reg (modev4, op2);
34069 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34070 op3 = copy_to_mode_reg (modei5, op3);
34072 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34074 error ("the fifth argument must be an 8-bit immediate");
34075 return const0_rtx;
34078 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34080 if (optimize || !target
34081 || GET_MODE (target) != tmode0
34082 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34083 target = gen_reg_rtx (tmode0);
34085 scratch1 = gen_reg_rtx (tmode1);
34087 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34089 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34091 if (optimize || !target
34092 || GET_MODE (target) != tmode1
34093 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34094 target = gen_reg_rtx (tmode1);
34096 scratch0 = gen_reg_rtx (tmode0);
34098 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34100 else
34102 gcc_assert (d->flag);
34104 scratch0 = gen_reg_rtx (tmode0);
34105 scratch1 = gen_reg_rtx (tmode1);
34107 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34110 if (! pat)
34111 return 0;
34113 emit_insn (pat);
34115 if (d->flag)
34117 target = gen_reg_rtx (SImode);
34118 emit_move_insn (target, const0_rtx);
34119 target = gen_rtx_SUBREG (QImode, target, 0);
34121 emit_insn
34122 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34123 gen_rtx_fmt_ee (EQ, QImode,
34124 gen_rtx_REG ((machine_mode) d->flag,
34125 FLAGS_REG),
34126 const0_rtx)));
34127 return SUBREG_REG (target);
34129 else
34130 return target;
34134 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34136 static rtx
34137 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34138 tree exp, rtx target)
34140 rtx pat;
34141 tree arg0 = CALL_EXPR_ARG (exp, 0);
34142 tree arg1 = CALL_EXPR_ARG (exp, 1);
34143 tree arg2 = CALL_EXPR_ARG (exp, 2);
34144 rtx scratch0, scratch1;
34145 rtx op0 = expand_normal (arg0);
34146 rtx op1 = expand_normal (arg1);
34147 rtx op2 = expand_normal (arg2);
34148 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34150 tmode0 = insn_data[d->icode].operand[0].mode;
34151 tmode1 = insn_data[d->icode].operand[1].mode;
34152 modev2 = insn_data[d->icode].operand[2].mode;
34153 modev3 = insn_data[d->icode].operand[3].mode;
34154 modeimm = insn_data[d->icode].operand[4].mode;
34156 if (VECTOR_MODE_P (modev2))
34157 op0 = safe_vector_operand (op0, modev2);
34158 if (VECTOR_MODE_P (modev3))
34159 op1 = safe_vector_operand (op1, modev3);
34161 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34162 op0 = copy_to_mode_reg (modev2, op0);
34163 if ((optimize && !register_operand (op1, modev3))
34164 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34165 op1 = copy_to_mode_reg (modev3, op1);
34167 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34169 error ("the third argument must be an 8-bit immediate");
34170 return const0_rtx;
34173 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34175 if (optimize || !target
34176 || GET_MODE (target) != tmode0
34177 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34178 target = gen_reg_rtx (tmode0);
34180 scratch1 = gen_reg_rtx (tmode1);
34182 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34184 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34186 if (optimize || !target
34187 || GET_MODE (target) != tmode1
34188 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34189 target = gen_reg_rtx (tmode1);
34191 scratch0 = gen_reg_rtx (tmode0);
34193 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34195 else
34197 gcc_assert (d->flag);
34199 scratch0 = gen_reg_rtx (tmode0);
34200 scratch1 = gen_reg_rtx (tmode1);
34202 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34205 if (! pat)
34206 return 0;
34208 emit_insn (pat);
34210 if (d->flag)
34212 target = gen_reg_rtx (SImode);
34213 emit_move_insn (target, const0_rtx);
34214 target = gen_rtx_SUBREG (QImode, target, 0);
34216 emit_insn
34217 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34218 gen_rtx_fmt_ee (EQ, QImode,
34219 gen_rtx_REG ((machine_mode) d->flag,
34220 FLAGS_REG),
34221 const0_rtx)));
34222 return SUBREG_REG (target);
34224 else
34225 return target;
34228 /* Subroutine of ix86_expand_builtin to take care of insns with
34229 variable number of operands. */
34231 static rtx
34232 ix86_expand_args_builtin (const struct builtin_description *d,
34233 tree exp, rtx target)
34235 rtx pat, real_target;
34236 unsigned int i, nargs;
34237 unsigned int nargs_constant = 0;
34238 unsigned int mask_pos = 0;
34239 int num_memory = 0;
34240 struct
34242 rtx op;
34243 machine_mode mode;
34244 } args[6];
34245 bool second_arg_count = false;
34246 enum insn_code icode = d->icode;
34247 const struct insn_data_d *insn_p = &insn_data[icode];
34248 machine_mode tmode = insn_p->operand[0].mode;
34249 machine_mode rmode = VOIDmode;
34250 bool swap = false;
34251 enum rtx_code comparison = d->comparison;
34253 switch ((enum ix86_builtin_func_type) d->flag)
34255 case V2DF_FTYPE_V2DF_ROUND:
34256 case V4DF_FTYPE_V4DF_ROUND:
34257 case V8DF_FTYPE_V8DF_ROUND:
34258 case V4SF_FTYPE_V4SF_ROUND:
34259 case V8SF_FTYPE_V8SF_ROUND:
34260 case V16SF_FTYPE_V16SF_ROUND:
34261 case V4SI_FTYPE_V4SF_ROUND:
34262 case V8SI_FTYPE_V8SF_ROUND:
34263 case V16SI_FTYPE_V16SF_ROUND:
34264 return ix86_expand_sse_round (d, exp, target);
34265 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34266 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34267 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34268 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34269 case INT_FTYPE_V8SF_V8SF_PTEST:
34270 case INT_FTYPE_V4DI_V4DI_PTEST:
34271 case INT_FTYPE_V4DF_V4DF_PTEST:
34272 case INT_FTYPE_V4SF_V4SF_PTEST:
34273 case INT_FTYPE_V2DI_V2DI_PTEST:
34274 case INT_FTYPE_V2DF_V2DF_PTEST:
34275 return ix86_expand_sse_ptest (d, exp, target);
34276 case FLOAT128_FTYPE_FLOAT128:
34277 case FLOAT_FTYPE_FLOAT:
34278 case INT_FTYPE_INT:
34279 case UINT_FTYPE_UINT:
34280 case UINT16_FTYPE_UINT16:
34281 case UINT64_FTYPE_INT:
34282 case UINT64_FTYPE_UINT64:
34283 case INT64_FTYPE_INT64:
34284 case INT64_FTYPE_V4SF:
34285 case INT64_FTYPE_V2DF:
34286 case INT_FTYPE_V16QI:
34287 case INT_FTYPE_V8QI:
34288 case INT_FTYPE_V8SF:
34289 case INT_FTYPE_V4DF:
34290 case INT_FTYPE_V4SF:
34291 case INT_FTYPE_V2DF:
34292 case INT_FTYPE_V32QI:
34293 case V16QI_FTYPE_V16QI:
34294 case V8SI_FTYPE_V8SF:
34295 case V8SI_FTYPE_V4SI:
34296 case V8HI_FTYPE_V8HI:
34297 case V8HI_FTYPE_V16QI:
34298 case V8QI_FTYPE_V8QI:
34299 case V8SF_FTYPE_V8SF:
34300 case V8SF_FTYPE_V8SI:
34301 case V8SF_FTYPE_V4SF:
34302 case V8SF_FTYPE_V8HI:
34303 case V4SI_FTYPE_V4SI:
34304 case V4SI_FTYPE_V16QI:
34305 case V4SI_FTYPE_V4SF:
34306 case V4SI_FTYPE_V8SI:
34307 case V4SI_FTYPE_V8HI:
34308 case V4SI_FTYPE_V4DF:
34309 case V4SI_FTYPE_V2DF:
34310 case V4HI_FTYPE_V4HI:
34311 case V4DF_FTYPE_V4DF:
34312 case V4DF_FTYPE_V4SI:
34313 case V4DF_FTYPE_V4SF:
34314 case V4DF_FTYPE_V2DF:
34315 case V4SF_FTYPE_V4SF:
34316 case V4SF_FTYPE_V4SI:
34317 case V4SF_FTYPE_V8SF:
34318 case V4SF_FTYPE_V4DF:
34319 case V4SF_FTYPE_V8HI:
34320 case V4SF_FTYPE_V2DF:
34321 case V2DI_FTYPE_V2DI:
34322 case V2DI_FTYPE_V16QI:
34323 case V2DI_FTYPE_V8HI:
34324 case V2DI_FTYPE_V4SI:
34325 case V2DF_FTYPE_V2DF:
34326 case V2DF_FTYPE_V4SI:
34327 case V2DF_FTYPE_V4DF:
34328 case V2DF_FTYPE_V4SF:
34329 case V2DF_FTYPE_V2SI:
34330 case V2SI_FTYPE_V2SI:
34331 case V2SI_FTYPE_V4SF:
34332 case V2SI_FTYPE_V2SF:
34333 case V2SI_FTYPE_V2DF:
34334 case V2SF_FTYPE_V2SF:
34335 case V2SF_FTYPE_V2SI:
34336 case V32QI_FTYPE_V32QI:
34337 case V32QI_FTYPE_V16QI:
34338 case V16HI_FTYPE_V16HI:
34339 case V16HI_FTYPE_V8HI:
34340 case V8SI_FTYPE_V8SI:
34341 case V16HI_FTYPE_V16QI:
34342 case V8SI_FTYPE_V16QI:
34343 case V4DI_FTYPE_V16QI:
34344 case V8SI_FTYPE_V8HI:
34345 case V4DI_FTYPE_V8HI:
34346 case V4DI_FTYPE_V4SI:
34347 case V4DI_FTYPE_V2DI:
34348 case UQI_FTYPE_UQI:
34349 case UHI_FTYPE_UHI:
34350 case USI_FTYPE_USI:
34351 case USI_FTYPE_UQI:
34352 case USI_FTYPE_UHI:
34353 case UDI_FTYPE_UDI:
34354 case UHI_FTYPE_V16QI:
34355 case USI_FTYPE_V32QI:
34356 case UDI_FTYPE_V64QI:
34357 case V16QI_FTYPE_UHI:
34358 case V32QI_FTYPE_USI:
34359 case V64QI_FTYPE_UDI:
34360 case V8HI_FTYPE_UQI:
34361 case V16HI_FTYPE_UHI:
34362 case V32HI_FTYPE_USI:
34363 case V4SI_FTYPE_UQI:
34364 case V8SI_FTYPE_UQI:
34365 case V4SI_FTYPE_UHI:
34366 case V8SI_FTYPE_UHI:
34367 case UQI_FTYPE_V8HI:
34368 case UHI_FTYPE_V16HI:
34369 case USI_FTYPE_V32HI:
34370 case UQI_FTYPE_V4SI:
34371 case UQI_FTYPE_V8SI:
34372 case UHI_FTYPE_V16SI:
34373 case UQI_FTYPE_V2DI:
34374 case UQI_FTYPE_V4DI:
34375 case UQI_FTYPE_V8DI:
34376 case V16SI_FTYPE_UHI:
34377 case V2DI_FTYPE_UQI:
34378 case V4DI_FTYPE_UQI:
34379 case V16SI_FTYPE_INT:
34380 case V16SF_FTYPE_V8SF:
34381 case V16SI_FTYPE_V8SI:
34382 case V16SF_FTYPE_V4SF:
34383 case V16SI_FTYPE_V4SI:
34384 case V16SI_FTYPE_V16SF:
34385 case V16SI_FTYPE_V16SI:
34386 case V64QI_FTYPE_V64QI:
34387 case V32HI_FTYPE_V32HI:
34388 case V16SF_FTYPE_V16SF:
34389 case V8DI_FTYPE_UQI:
34390 case V8DI_FTYPE_V8DI:
34391 case V8DF_FTYPE_V4DF:
34392 case V8DF_FTYPE_V2DF:
34393 case V8DF_FTYPE_V8DF:
34394 case V4DI_FTYPE_V4DI:
34395 nargs = 1;
34396 break;
34397 case V4SF_FTYPE_V4SF_VEC_MERGE:
34398 case V2DF_FTYPE_V2DF_VEC_MERGE:
34399 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
34400 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
34401 case V16QI_FTYPE_V16QI_V16QI:
34402 case V16QI_FTYPE_V8HI_V8HI:
34403 case V16SF_FTYPE_V16SF_V16SF:
34404 case V8QI_FTYPE_V8QI_V8QI:
34405 case V8QI_FTYPE_V4HI_V4HI:
34406 case V8HI_FTYPE_V8HI_V8HI:
34407 case V8HI_FTYPE_V16QI_V16QI:
34408 case V8HI_FTYPE_V4SI_V4SI:
34409 case V8SF_FTYPE_V8SF_V8SF:
34410 case V8SF_FTYPE_V8SF_V8SI:
34411 case V8DF_FTYPE_V8DF_V8DF:
34412 case V4SI_FTYPE_V4SI_V4SI:
34413 case V4SI_FTYPE_V8HI_V8HI:
34414 case V4SI_FTYPE_V2DF_V2DF:
34415 case V4HI_FTYPE_V4HI_V4HI:
34416 case V4HI_FTYPE_V8QI_V8QI:
34417 case V4HI_FTYPE_V2SI_V2SI:
34418 case V4DF_FTYPE_V4DF_V4DF:
34419 case V4DF_FTYPE_V4DF_V4DI:
34420 case V4SF_FTYPE_V4SF_V4SF:
34421 case V4SF_FTYPE_V4SF_V4SI:
34422 case V4SF_FTYPE_V4SF_V2SI:
34423 case V4SF_FTYPE_V4SF_V2DF:
34424 case V4SF_FTYPE_V4SF_UINT:
34425 case V4SF_FTYPE_V4SF_DI:
34426 case V4SF_FTYPE_V4SF_SI:
34427 case V2DI_FTYPE_V2DI_V2DI:
34428 case V2DI_FTYPE_V16QI_V16QI:
34429 case V2DI_FTYPE_V4SI_V4SI:
34430 case V2DI_FTYPE_V2DI_V16QI:
34431 case V2SI_FTYPE_V2SI_V2SI:
34432 case V2SI_FTYPE_V4HI_V4HI:
34433 case V2SI_FTYPE_V2SF_V2SF:
34434 case V2DF_FTYPE_V2DF_V2DF:
34435 case V2DF_FTYPE_V2DF_V4SF:
34436 case V2DF_FTYPE_V2DF_V2DI:
34437 case V2DF_FTYPE_V2DF_DI:
34438 case V2DF_FTYPE_V2DF_SI:
34439 case V2DF_FTYPE_V2DF_UINT:
34440 case V2SF_FTYPE_V2SF_V2SF:
34441 case V1DI_FTYPE_V1DI_V1DI:
34442 case V1DI_FTYPE_V8QI_V8QI:
34443 case V1DI_FTYPE_V2SI_V2SI:
34444 case V32QI_FTYPE_V16HI_V16HI:
34445 case V16HI_FTYPE_V8SI_V8SI:
34446 case V64QI_FTYPE_V64QI_V64QI:
34447 case V32QI_FTYPE_V32QI_V32QI:
34448 case V16HI_FTYPE_V32QI_V32QI:
34449 case V16HI_FTYPE_V16HI_V16HI:
34450 case V8SI_FTYPE_V4DF_V4DF:
34451 case V8SI_FTYPE_V8SI_V8SI:
34452 case V8SI_FTYPE_V16HI_V16HI:
34453 case V4DI_FTYPE_V4DI_V4DI:
34454 case V4DI_FTYPE_V8SI_V8SI:
34455 case V8DI_FTYPE_V64QI_V64QI:
34456 if (comparison == UNKNOWN)
34457 return ix86_expand_binop_builtin (icode, exp, target);
34458 nargs = 2;
34459 break;
34460 case V4SF_FTYPE_V4SF_V4SF_SWAP:
34461 case V2DF_FTYPE_V2DF_V2DF_SWAP:
34462 gcc_assert (comparison != UNKNOWN);
34463 nargs = 2;
34464 swap = true;
34465 break;
34466 case V16HI_FTYPE_V16HI_V8HI_COUNT:
34467 case V16HI_FTYPE_V16HI_SI_COUNT:
34468 case V8SI_FTYPE_V8SI_V4SI_COUNT:
34469 case V8SI_FTYPE_V8SI_SI_COUNT:
34470 case V4DI_FTYPE_V4DI_V2DI_COUNT:
34471 case V4DI_FTYPE_V4DI_INT_COUNT:
34472 case V8HI_FTYPE_V8HI_V8HI_COUNT:
34473 case V8HI_FTYPE_V8HI_SI_COUNT:
34474 case V4SI_FTYPE_V4SI_V4SI_COUNT:
34475 case V4SI_FTYPE_V4SI_SI_COUNT:
34476 case V4HI_FTYPE_V4HI_V4HI_COUNT:
34477 case V4HI_FTYPE_V4HI_SI_COUNT:
34478 case V2DI_FTYPE_V2DI_V2DI_COUNT:
34479 case V2DI_FTYPE_V2DI_SI_COUNT:
34480 case V2SI_FTYPE_V2SI_V2SI_COUNT:
34481 case V2SI_FTYPE_V2SI_SI_COUNT:
34482 case V1DI_FTYPE_V1DI_V1DI_COUNT:
34483 case V1DI_FTYPE_V1DI_SI_COUNT:
34484 nargs = 2;
34485 second_arg_count = true;
34486 break;
34487 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
34488 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
34489 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
34490 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
34491 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
34492 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
34493 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
34494 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
34495 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
34496 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
34497 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
34498 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
34499 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
34500 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
34501 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
34502 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
34503 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
34504 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
34505 nargs = 4;
34506 second_arg_count = true;
34507 break;
34508 case UINT64_FTYPE_UINT64_UINT64:
34509 case UINT_FTYPE_UINT_UINT:
34510 case UINT_FTYPE_UINT_USHORT:
34511 case UINT_FTYPE_UINT_UCHAR:
34512 case UINT16_FTYPE_UINT16_INT:
34513 case UINT8_FTYPE_UINT8_INT:
34514 case UQI_FTYPE_UQI_UQI:
34515 case UHI_FTYPE_UHI_UHI:
34516 case USI_FTYPE_USI_USI:
34517 case UDI_FTYPE_UDI_UDI:
34518 case V16SI_FTYPE_V8DF_V8DF:
34519 nargs = 2;
34520 break;
34521 case V2DI_FTYPE_V2DI_INT_CONVERT:
34522 nargs = 2;
34523 rmode = V1TImode;
34524 nargs_constant = 1;
34525 break;
34526 case V4DI_FTYPE_V4DI_INT_CONVERT:
34527 nargs = 2;
34528 rmode = V2TImode;
34529 nargs_constant = 1;
34530 break;
34531 case V8DI_FTYPE_V8DI_INT_CONVERT:
34532 nargs = 2;
34533 rmode = V4TImode;
34534 nargs_constant = 1;
34535 break;
34536 case V8HI_FTYPE_V8HI_INT:
34537 case V8HI_FTYPE_V8SF_INT:
34538 case V16HI_FTYPE_V16SF_INT:
34539 case V8HI_FTYPE_V4SF_INT:
34540 case V8SF_FTYPE_V8SF_INT:
34541 case V4SF_FTYPE_V16SF_INT:
34542 case V16SF_FTYPE_V16SF_INT:
34543 case V4SI_FTYPE_V4SI_INT:
34544 case V4SI_FTYPE_V8SI_INT:
34545 case V4HI_FTYPE_V4HI_INT:
34546 case V4DF_FTYPE_V4DF_INT:
34547 case V4DF_FTYPE_V8DF_INT:
34548 case V4SF_FTYPE_V4SF_INT:
34549 case V4SF_FTYPE_V8SF_INT:
34550 case V2DI_FTYPE_V2DI_INT:
34551 case V2DF_FTYPE_V2DF_INT:
34552 case V2DF_FTYPE_V4DF_INT:
34553 case V16HI_FTYPE_V16HI_INT:
34554 case V8SI_FTYPE_V8SI_INT:
34555 case V16SI_FTYPE_V16SI_INT:
34556 case V4SI_FTYPE_V16SI_INT:
34557 case V4DI_FTYPE_V4DI_INT:
34558 case V2DI_FTYPE_V4DI_INT:
34559 case V4DI_FTYPE_V8DI_INT:
34560 case QI_FTYPE_V4SF_INT:
34561 case QI_FTYPE_V2DF_INT:
34562 case UQI_FTYPE_UQI_UQI_CONST:
34563 case UHI_FTYPE_UHI_UQI:
34564 case USI_FTYPE_USI_UQI:
34565 case UDI_FTYPE_UDI_UQI:
34566 nargs = 2;
34567 nargs_constant = 1;
34568 break;
34569 case V16QI_FTYPE_V16QI_V16QI_V16QI:
34570 case V8SF_FTYPE_V8SF_V8SF_V8SF:
34571 case V4DF_FTYPE_V4DF_V4DF_V4DF:
34572 case V4SF_FTYPE_V4SF_V4SF_V4SF:
34573 case V2DF_FTYPE_V2DF_V2DF_V2DF:
34574 case V32QI_FTYPE_V32QI_V32QI_V32QI:
34575 case UHI_FTYPE_V16SI_V16SI_UHI:
34576 case UQI_FTYPE_V8DI_V8DI_UQI:
34577 case V16HI_FTYPE_V16SI_V16HI_UHI:
34578 case V16QI_FTYPE_V16SI_V16QI_UHI:
34579 case V16QI_FTYPE_V8DI_V16QI_UQI:
34580 case V16SF_FTYPE_V16SF_V16SF_UHI:
34581 case V16SF_FTYPE_V4SF_V16SF_UHI:
34582 case V16SI_FTYPE_SI_V16SI_UHI:
34583 case V16SI_FTYPE_V16HI_V16SI_UHI:
34584 case V16SI_FTYPE_V16QI_V16SI_UHI:
34585 case V8SF_FTYPE_V4SF_V8SF_UQI:
34586 case V4DF_FTYPE_V2DF_V4DF_UQI:
34587 case V8SI_FTYPE_V4SI_V8SI_UQI:
34588 case V8SI_FTYPE_SI_V8SI_UQI:
34589 case V4SI_FTYPE_V4SI_V4SI_UQI:
34590 case V4SI_FTYPE_SI_V4SI_UQI:
34591 case V4DI_FTYPE_V2DI_V4DI_UQI:
34592 case V4DI_FTYPE_DI_V4DI_UQI:
34593 case V2DI_FTYPE_V2DI_V2DI_UQI:
34594 case V2DI_FTYPE_DI_V2DI_UQI:
34595 case V64QI_FTYPE_V64QI_V64QI_UDI:
34596 case V64QI_FTYPE_V16QI_V64QI_UDI:
34597 case V64QI_FTYPE_QI_V64QI_UDI:
34598 case V32QI_FTYPE_V32QI_V32QI_USI:
34599 case V32QI_FTYPE_V16QI_V32QI_USI:
34600 case V32QI_FTYPE_QI_V32QI_USI:
34601 case V16QI_FTYPE_V16QI_V16QI_UHI:
34602 case V16QI_FTYPE_QI_V16QI_UHI:
34603 case V32HI_FTYPE_V8HI_V32HI_USI:
34604 case V32HI_FTYPE_HI_V32HI_USI:
34605 case V16HI_FTYPE_V8HI_V16HI_UHI:
34606 case V16HI_FTYPE_HI_V16HI_UHI:
34607 case V8HI_FTYPE_V8HI_V8HI_UQI:
34608 case V8HI_FTYPE_HI_V8HI_UQI:
34609 case V8SF_FTYPE_V8HI_V8SF_UQI:
34610 case V4SF_FTYPE_V8HI_V4SF_UQI:
34611 case V8SI_FTYPE_V8SF_V8SI_UQI:
34612 case V4SI_FTYPE_V4SF_V4SI_UQI:
34613 case V4DI_FTYPE_V4SF_V4DI_UQI:
34614 case V2DI_FTYPE_V4SF_V2DI_UQI:
34615 case V4SF_FTYPE_V4DI_V4SF_UQI:
34616 case V4SF_FTYPE_V2DI_V4SF_UQI:
34617 case V4DF_FTYPE_V4DI_V4DF_UQI:
34618 case V2DF_FTYPE_V2DI_V2DF_UQI:
34619 case V16QI_FTYPE_V8HI_V16QI_UQI:
34620 case V16QI_FTYPE_V16HI_V16QI_UHI:
34621 case V16QI_FTYPE_V4SI_V16QI_UQI:
34622 case V16QI_FTYPE_V8SI_V16QI_UQI:
34623 case V8HI_FTYPE_V4SI_V8HI_UQI:
34624 case V8HI_FTYPE_V8SI_V8HI_UQI:
34625 case V16QI_FTYPE_V2DI_V16QI_UQI:
34626 case V16QI_FTYPE_V4DI_V16QI_UQI:
34627 case V8HI_FTYPE_V2DI_V8HI_UQI:
34628 case V8HI_FTYPE_V4DI_V8HI_UQI:
34629 case V4SI_FTYPE_V2DI_V4SI_UQI:
34630 case V4SI_FTYPE_V4DI_V4SI_UQI:
34631 case V32QI_FTYPE_V32HI_V32QI_USI:
34632 case UHI_FTYPE_V16QI_V16QI_UHI:
34633 case USI_FTYPE_V32QI_V32QI_USI:
34634 case UDI_FTYPE_V64QI_V64QI_UDI:
34635 case UQI_FTYPE_V8HI_V8HI_UQI:
34636 case UHI_FTYPE_V16HI_V16HI_UHI:
34637 case USI_FTYPE_V32HI_V32HI_USI:
34638 case UQI_FTYPE_V4SI_V4SI_UQI:
34639 case UQI_FTYPE_V8SI_V8SI_UQI:
34640 case UQI_FTYPE_V2DI_V2DI_UQI:
34641 case UQI_FTYPE_V4DI_V4DI_UQI:
34642 case V4SF_FTYPE_V2DF_V4SF_UQI:
34643 case V4SF_FTYPE_V4DF_V4SF_UQI:
34644 case V16SI_FTYPE_V16SI_V16SI_UHI:
34645 case V16SI_FTYPE_V4SI_V16SI_UHI:
34646 case V2DI_FTYPE_V4SI_V2DI_UQI:
34647 case V2DI_FTYPE_V8HI_V2DI_UQI:
34648 case V2DI_FTYPE_V16QI_V2DI_UQI:
34649 case V4DI_FTYPE_V4DI_V4DI_UQI:
34650 case V4DI_FTYPE_V4SI_V4DI_UQI:
34651 case V4DI_FTYPE_V8HI_V4DI_UQI:
34652 case V4DI_FTYPE_V16QI_V4DI_UQI:
34653 case V4DI_FTYPE_V4DF_V4DI_UQI:
34654 case V2DI_FTYPE_V2DF_V2DI_UQI:
34655 case V4SI_FTYPE_V4DF_V4SI_UQI:
34656 case V4SI_FTYPE_V2DF_V4SI_UQI:
34657 case V4SI_FTYPE_V8HI_V4SI_UQI:
34658 case V4SI_FTYPE_V16QI_V4SI_UQI:
34659 case V4DI_FTYPE_V4DI_V4DI_V4DI:
34660 case V8DF_FTYPE_V2DF_V8DF_UQI:
34661 case V8DF_FTYPE_V4DF_V8DF_UQI:
34662 case V8DF_FTYPE_V8DF_V8DF_UQI:
34663 case V8SF_FTYPE_V8SF_V8SF_UQI:
34664 case V8SF_FTYPE_V8SI_V8SF_UQI:
34665 case V4DF_FTYPE_V4DF_V4DF_UQI:
34666 case V4SF_FTYPE_V4SF_V4SF_UQI:
34667 case V2DF_FTYPE_V2DF_V2DF_UQI:
34668 case V2DF_FTYPE_V4SF_V2DF_UQI:
34669 case V2DF_FTYPE_V4SI_V2DF_UQI:
34670 case V4SF_FTYPE_V4SI_V4SF_UQI:
34671 case V4DF_FTYPE_V4SF_V4DF_UQI:
34672 case V4DF_FTYPE_V4SI_V4DF_UQI:
34673 case V8SI_FTYPE_V8SI_V8SI_UQI:
34674 case V8SI_FTYPE_V8HI_V8SI_UQI:
34675 case V8SI_FTYPE_V16QI_V8SI_UQI:
34676 case V8DF_FTYPE_V8SI_V8DF_UQI:
34677 case V8DI_FTYPE_DI_V8DI_UQI:
34678 case V16SF_FTYPE_V8SF_V16SF_UHI:
34679 case V16SI_FTYPE_V8SI_V16SI_UHI:
34680 case V16HI_FTYPE_V16HI_V16HI_UHI:
34681 case V8HI_FTYPE_V16QI_V8HI_UQI:
34682 case V16HI_FTYPE_V16QI_V16HI_UHI:
34683 case V32HI_FTYPE_V32HI_V32HI_USI:
34684 case V32HI_FTYPE_V32QI_V32HI_USI:
34685 case V8DI_FTYPE_V16QI_V8DI_UQI:
34686 case V8DI_FTYPE_V2DI_V8DI_UQI:
34687 case V8DI_FTYPE_V4DI_V8DI_UQI:
34688 case V8DI_FTYPE_V8DI_V8DI_UQI:
34689 case V8DI_FTYPE_V8HI_V8DI_UQI:
34690 case V8DI_FTYPE_V8SI_V8DI_UQI:
34691 case V8HI_FTYPE_V8DI_V8HI_UQI:
34692 case V8SI_FTYPE_V8DI_V8SI_UQI:
34693 case V4SI_FTYPE_V4SI_V4SI_V4SI:
34694 case V16SI_FTYPE_V16SI_V16SI_V16SI:
34695 case V8DI_FTYPE_V8DI_V8DI_V8DI:
34696 case V32HI_FTYPE_V32HI_V32HI_V32HI:
34697 case V2DI_FTYPE_V2DI_V2DI_V2DI:
34698 case V16HI_FTYPE_V16HI_V16HI_V16HI:
34699 case V8SI_FTYPE_V8SI_V8SI_V8SI:
34700 case V8HI_FTYPE_V8HI_V8HI_V8HI:
34701 nargs = 3;
34702 break;
34703 case V32QI_FTYPE_V32QI_V32QI_INT:
34704 case V16HI_FTYPE_V16HI_V16HI_INT:
34705 case V16QI_FTYPE_V16QI_V16QI_INT:
34706 case V4DI_FTYPE_V4DI_V4DI_INT:
34707 case V8HI_FTYPE_V8HI_V8HI_INT:
34708 case V8SI_FTYPE_V8SI_V8SI_INT:
34709 case V8SI_FTYPE_V8SI_V4SI_INT:
34710 case V8SF_FTYPE_V8SF_V8SF_INT:
34711 case V8SF_FTYPE_V8SF_V4SF_INT:
34712 case V4SI_FTYPE_V4SI_V4SI_INT:
34713 case V4DF_FTYPE_V4DF_V4DF_INT:
34714 case V16SF_FTYPE_V16SF_V16SF_INT:
34715 case V16SF_FTYPE_V16SF_V4SF_INT:
34716 case V16SI_FTYPE_V16SI_V4SI_INT:
34717 case V4DF_FTYPE_V4DF_V2DF_INT:
34718 case V4SF_FTYPE_V4SF_V4SF_INT:
34719 case V2DI_FTYPE_V2DI_V2DI_INT:
34720 case V4DI_FTYPE_V4DI_V2DI_INT:
34721 case V2DF_FTYPE_V2DF_V2DF_INT:
34722 case UQI_FTYPE_V8DI_V8UDI_INT:
34723 case UQI_FTYPE_V8DF_V8DF_INT:
34724 case UQI_FTYPE_V2DF_V2DF_INT:
34725 case UQI_FTYPE_V4SF_V4SF_INT:
34726 case UHI_FTYPE_V16SI_V16SI_INT:
34727 case UHI_FTYPE_V16SF_V16SF_INT:
34728 case V64QI_FTYPE_V64QI_V64QI_INT:
34729 case V32HI_FTYPE_V32HI_V32HI_INT:
34730 case V16SI_FTYPE_V16SI_V16SI_INT:
34731 case V8DI_FTYPE_V8DI_V8DI_INT:
34732 nargs = 3;
34733 nargs_constant = 1;
34734 break;
34735 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
34736 nargs = 3;
34737 rmode = V4DImode;
34738 nargs_constant = 1;
34739 break;
34740 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
34741 nargs = 3;
34742 rmode = V2DImode;
34743 nargs_constant = 1;
34744 break;
34745 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
34746 nargs = 3;
34747 rmode = DImode;
34748 nargs_constant = 1;
34749 break;
34750 case V2DI_FTYPE_V2DI_UINT_UINT:
34751 nargs = 3;
34752 nargs_constant = 2;
34753 break;
34754 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
34755 nargs = 3;
34756 rmode = V8DImode;
34757 nargs_constant = 1;
34758 break;
34759 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
34760 nargs = 5;
34761 rmode = V8DImode;
34762 mask_pos = 2;
34763 nargs_constant = 1;
34764 break;
34765 case QI_FTYPE_V8DF_INT_UQI:
34766 case QI_FTYPE_V4DF_INT_UQI:
34767 case QI_FTYPE_V2DF_INT_UQI:
34768 case HI_FTYPE_V16SF_INT_UHI:
34769 case QI_FTYPE_V8SF_INT_UQI:
34770 case QI_FTYPE_V4SF_INT_UQI:
34771 case V4SI_FTYPE_V4SI_V4SI_UHI:
34772 case V8SI_FTYPE_V8SI_V8SI_UHI:
34773 nargs = 3;
34774 mask_pos = 1;
34775 nargs_constant = 1;
34776 break;
34777 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
34778 nargs = 5;
34779 rmode = V4DImode;
34780 mask_pos = 2;
34781 nargs_constant = 1;
34782 break;
34783 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
34784 nargs = 5;
34785 rmode = V2DImode;
34786 mask_pos = 2;
34787 nargs_constant = 1;
34788 break;
34789 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
34790 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
34791 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
34792 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
34793 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
34794 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
34795 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
34796 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
34797 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
34798 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
34799 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
34800 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
34801 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
34802 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
34803 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
34804 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
34805 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
34806 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
34807 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
34808 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
34809 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
34810 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
34811 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
34812 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
34813 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
34814 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
34815 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
34816 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
34817 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
34818 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
34819 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
34820 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
34821 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
34822 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
34823 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
34824 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
34825 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
34826 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
34827 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
34828 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
34829 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
34830 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
34831 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
34832 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
34833 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
34834 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
34835 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
34836 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
34837 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
34838 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
34839 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
34840 nargs = 4;
34841 break;
34842 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
34843 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
34844 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
34845 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
34846 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
34847 nargs = 4;
34848 nargs_constant = 1;
34849 break;
34850 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
34851 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
34852 case QI_FTYPE_V4DF_V4DF_INT_UQI:
34853 case QI_FTYPE_V8SF_V8SF_INT_UQI:
34854 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
34855 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
34856 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
34857 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
34858 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
34859 case USI_FTYPE_V32QI_V32QI_INT_USI:
34860 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
34861 case USI_FTYPE_V32HI_V32HI_INT_USI:
34862 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
34863 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
34864 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
34865 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
34866 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
34867 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
34868 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
34869 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
34870 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
34871 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
34872 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
34873 nargs = 4;
34874 mask_pos = 1;
34875 nargs_constant = 1;
34876 break;
34877 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34878 nargs = 4;
34879 nargs_constant = 2;
34880 break;
34881 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34882 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34883 nargs = 4;
34884 break;
34885 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34886 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34887 mask_pos = 1;
34888 nargs = 4;
34889 nargs_constant = 1;
34890 break;
34891 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34892 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34893 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34894 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34895 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34896 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34897 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34898 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34899 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34900 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34901 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34902 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34903 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34904 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34905 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34906 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34907 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34908 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34909 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34910 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34911 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34912 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34913 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34914 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34915 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34916 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34917 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34918 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34919 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34920 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34921 nargs = 4;
34922 mask_pos = 2;
34923 nargs_constant = 1;
34924 break;
34925 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34926 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34927 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34928 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34929 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34930 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34931 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34932 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34933 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34934 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34935 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34936 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34937 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34938 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34939 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34940 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34941 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34942 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34943 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34944 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34945 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34946 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34947 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34948 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34949 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34950 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34951 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34952 nargs = 5;
34953 mask_pos = 2;
34954 nargs_constant = 1;
34955 break;
34956 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34957 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34958 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34959 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34960 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34961 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34962 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34963 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34964 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34965 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34966 nargs = 5;
34967 mask_pos = 1;
34968 nargs_constant = 1;
34969 break;
34970 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
34971 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
34972 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
34973 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
34974 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
34975 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
34976 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
34977 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
34978 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
34979 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
34980 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
34981 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
34982 nargs = 5;
34983 mask_pos = 1;
34984 nargs_constant = 2;
34985 break;
34987 default:
34988 gcc_unreachable ();
34991 gcc_assert (nargs <= ARRAY_SIZE (args));
34993 if (comparison != UNKNOWN)
34995 gcc_assert (nargs == 2);
34996 return ix86_expand_sse_compare (d, exp, target, swap);
34999 if (rmode == VOIDmode || rmode == tmode)
35001 if (optimize
35002 || target == 0
35003 || GET_MODE (target) != tmode
35004 || !insn_p->operand[0].predicate (target, tmode))
35005 target = gen_reg_rtx (tmode);
35006 else if (memory_operand (target, tmode))
35007 num_memory++;
35008 real_target = target;
35010 else
35012 real_target = gen_reg_rtx (tmode);
35013 target = lowpart_subreg (rmode, real_target, tmode);
35016 for (i = 0; i < nargs; i++)
35018 tree arg = CALL_EXPR_ARG (exp, i);
35019 rtx op = expand_normal (arg);
35020 machine_mode mode = insn_p->operand[i + 1].mode;
35021 bool match = insn_p->operand[i + 1].predicate (op, mode);
35023 if (second_arg_count && i == 1)
35025 /* SIMD shift insns take either an 8-bit immediate or a
35026 register as the count.  But the builtin functions take an int
35027 as the count.  If the count doesn't match, put it in a register.
35028 The instructions use a 64-bit count; if op is only
35029 32-bit, zero-extend it, since negative shift counts
35030 are undefined behavior and zero-extension is more
35031 efficient. */
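/* An illustrative example (an assumption, not taken from this file): for a
   count builtin such as __builtin_ia32_psllwi128 behind _mm_slli_epi16, the
   int count argument may be expanded as a 32-bit register that fails the
   insn's count predicate; the convert_modes call below then zero-extends it
   to the wider count mode described above.  */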
35032 if (!match)
35034 if (SCALAR_INT_MODE_P (GET_MODE (op)))
35035 op = convert_modes (mode, GET_MODE (op), op, 1);
35036 else
35037 op = lowpart_subreg (mode, op, GET_MODE (op));
35038 if (!insn_p->operand[i + 1].predicate (op, mode))
35039 op = copy_to_reg (op);
35042 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35043 (!mask_pos && (nargs - i) <= nargs_constant))
35045 if (!match)
35046 switch (icode)
35048 case CODE_FOR_avx_vinsertf128v4di:
35049 case CODE_FOR_avx_vextractf128v4di:
35050 error ("the last argument must be a 1-bit immediate");
35051 return const0_rtx;
35053 case CODE_FOR_avx512f_cmpv8di3_mask:
35054 case CODE_FOR_avx512f_cmpv16si3_mask:
35055 case CODE_FOR_avx512f_ucmpv8di3_mask:
35056 case CODE_FOR_avx512f_ucmpv16si3_mask:
35057 case CODE_FOR_avx512vl_cmpv4di3_mask:
35058 case CODE_FOR_avx512vl_cmpv8si3_mask:
35059 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35060 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35061 case CODE_FOR_avx512vl_cmpv2di3_mask:
35062 case CODE_FOR_avx512vl_cmpv4si3_mask:
35063 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35064 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35065 error ("the last argument must be a 3-bit immediate");
35066 return const0_rtx;
35068 case CODE_FOR_sse4_1_roundsd:
35069 case CODE_FOR_sse4_1_roundss:
35071 case CODE_FOR_sse4_1_roundpd:
35072 case CODE_FOR_sse4_1_roundps:
35073 case CODE_FOR_avx_roundpd256:
35074 case CODE_FOR_avx_roundps256:
35076 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35077 case CODE_FOR_sse4_1_roundps_sfix:
35078 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35079 case CODE_FOR_avx_roundps_sfix256:
35081 case CODE_FOR_sse4_1_blendps:
35082 case CODE_FOR_avx_blendpd256:
35083 case CODE_FOR_avx_vpermilv4df:
35084 case CODE_FOR_avx_vpermilv4df_mask:
35085 case CODE_FOR_avx512f_getmantv8df_mask:
35086 case CODE_FOR_avx512f_getmantv16sf_mask:
35087 case CODE_FOR_avx512vl_getmantv8sf_mask:
35088 case CODE_FOR_avx512vl_getmantv4df_mask:
35089 case CODE_FOR_avx512vl_getmantv4sf_mask:
35090 case CODE_FOR_avx512vl_getmantv2df_mask:
35091 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35092 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35093 case CODE_FOR_avx512dq_rangepv4df_mask:
35094 case CODE_FOR_avx512dq_rangepv8sf_mask:
35095 case CODE_FOR_avx512dq_rangepv2df_mask:
35096 case CODE_FOR_avx512dq_rangepv4sf_mask:
35097 case CODE_FOR_avx_shufpd256_mask:
35098 error ("the last argument must be a 4-bit immediate");
35099 return const0_rtx;
35101 case CODE_FOR_sha1rnds4:
35102 case CODE_FOR_sse4_1_blendpd:
35103 case CODE_FOR_avx_vpermilv2df:
35104 case CODE_FOR_avx_vpermilv2df_mask:
35105 case CODE_FOR_xop_vpermil2v2df3:
35106 case CODE_FOR_xop_vpermil2v4sf3:
35107 case CODE_FOR_xop_vpermil2v4df3:
35108 case CODE_FOR_xop_vpermil2v8sf3:
35109 case CODE_FOR_avx512f_vinsertf32x4_mask:
35110 case CODE_FOR_avx512f_vinserti32x4_mask:
35111 case CODE_FOR_avx512f_vextractf32x4_mask:
35112 case CODE_FOR_avx512f_vextracti32x4_mask:
35113 case CODE_FOR_sse2_shufpd:
35114 case CODE_FOR_sse2_shufpd_mask:
35115 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35116 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35117 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35118 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35119 error ("the last argument must be a 2-bit immediate");
35120 return const0_rtx;
35122 case CODE_FOR_avx_vextractf128v4df:
35123 case CODE_FOR_avx_vextractf128v8sf:
35124 case CODE_FOR_avx_vextractf128v8si:
35125 case CODE_FOR_avx_vinsertf128v4df:
35126 case CODE_FOR_avx_vinsertf128v8sf:
35127 case CODE_FOR_avx_vinsertf128v8si:
35128 case CODE_FOR_avx512f_vinsertf64x4_mask:
35129 case CODE_FOR_avx512f_vinserti64x4_mask:
35130 case CODE_FOR_avx512f_vextractf64x4_mask:
35131 case CODE_FOR_avx512f_vextracti64x4_mask:
35132 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35133 case CODE_FOR_avx512dq_vinserti32x8_mask:
35134 case CODE_FOR_avx512vl_vinsertv4df:
35135 case CODE_FOR_avx512vl_vinsertv4di:
35136 case CODE_FOR_avx512vl_vinsertv8sf:
35137 case CODE_FOR_avx512vl_vinsertv8si:
35138 error ("the last argument must be a 1-bit immediate");
35139 return const0_rtx;
35141 case CODE_FOR_avx_vmcmpv2df3:
35142 case CODE_FOR_avx_vmcmpv4sf3:
35143 case CODE_FOR_avx_cmpv2df3:
35144 case CODE_FOR_avx_cmpv4sf3:
35145 case CODE_FOR_avx_cmpv4df3:
35146 case CODE_FOR_avx_cmpv8sf3:
35147 case CODE_FOR_avx512f_cmpv8df3_mask:
35148 case CODE_FOR_avx512f_cmpv16sf3_mask:
35149 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35150 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35151 error ("the last argument must be a 5-bit immediate");
35152 return const0_rtx;
35154 default:
35155 switch (nargs_constant)
35157 case 2:
35158 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35159 (!mask_pos && (nargs - i) == nargs_constant))
35161 error ("the next to last argument must be an 8-bit immediate");
35162 break;
35164 /* FALLTHRU */
35165 case 1:
35166 error ("the last argument must be an 8-bit immediate");
35167 break;
35168 default:
35169 gcc_unreachable ();
35171 return const0_rtx;
35174 else
35176 if (VECTOR_MODE_P (mode))
35177 op = safe_vector_operand (op, mode);
35179 /* If we aren't optimizing, only allow one memory operand to
35180 be generated. */
35181 if (memory_operand (op, mode))
35182 num_memory++;
35184 op = fixup_modeless_constant (op, mode);
35186 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35188 if (optimize || !match || num_memory > 1)
35189 op = copy_to_mode_reg (mode, op);
35191 else
35193 op = copy_to_reg (op);
35194 op = lowpart_subreg (mode, op, GET_MODE (op));
35198 args[i].op = op;
35199 args[i].mode = mode;
35202 switch (nargs)
35204 case 1:
35205 pat = GEN_FCN (icode) (real_target, args[0].op);
35206 break;
35207 case 2:
35208 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35209 break;
35210 case 3:
35211 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35212 args[2].op);
35213 break;
35214 case 4:
35215 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35216 args[2].op, args[3].op);
35217 break;
35218 case 5:
35219 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35220 args[2].op, args[3].op, args[4].op);
35221 break;
35222 case 6:
35223 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35224 args[2].op, args[3].op, args[4].op,
35225 args[5].op);
35226 break;
35227 default:
35228 gcc_unreachable ();
35231 if (! pat)
35232 return 0;
35234 emit_insn (pat);
35235 return target;
35238 /* Transform a pattern of the following layout:
35239 (set A
35240 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
35242 into:
35243 (set A B) */
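/* A schematic instance (illustration only; modes and operands are made up
   rather than copied from a real dump):

     (set (reg:V2DF 100)
	  (unspec:V2DF [(plus:V2DF (reg:V2DF 101) (reg:V2DF 102))
			(const_int 4)] UNSPEC_EMBEDDED_ROUNDING))

   becomes

     (set (reg:V2DF 100)
	  (plus:V2DF (reg:V2DF 101) (reg:V2DF 102)))  */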
35245 static rtx
35246 ix86_erase_embedded_rounding (rtx pat)
35248 if (GET_CODE (pat) == INSN)
35249 pat = PATTERN (pat);
35251 gcc_assert (GET_CODE (pat) == SET);
35252 rtx src = SET_SRC (pat);
35253 gcc_assert (XVECLEN (src, 0) == 2);
35254 rtx p0 = XVECEXP (src, 0, 0);
35255 gcc_assert (GET_CODE (src) == UNSPEC
35256 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
35257 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
35258 return res;
35261 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35262 with rounding. */
35263 static rtx
35264 ix86_expand_sse_comi_round (const struct builtin_description *d,
35265 tree exp, rtx target)
35267 rtx pat, set_dst;
35268 tree arg0 = CALL_EXPR_ARG (exp, 0);
35269 tree arg1 = CALL_EXPR_ARG (exp, 1);
35270 tree arg2 = CALL_EXPR_ARG (exp, 2);
35271 tree arg3 = CALL_EXPR_ARG (exp, 3);
35272 rtx op0 = expand_normal (arg0);
35273 rtx op1 = expand_normal (arg1);
35274 rtx op2 = expand_normal (arg2);
35275 rtx op3 = expand_normal (arg3);
35276 enum insn_code icode = d->icode;
35277 const struct insn_data_d *insn_p = &insn_data[icode];
35278 machine_mode mode0 = insn_p->operand[0].mode;
35279 machine_mode mode1 = insn_p->operand[1].mode;
35280 enum rtx_code comparison = UNEQ;
35281 bool need_ucomi = false;
35283 /* See avxintrin.h for values. */
35284 enum rtx_code comi_comparisons[32] =
35286 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35287 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35288 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35290 bool need_ucomi_values[32] =
35292 true, false, false, true, true, false, false, true,
35293 true, false, false, true, true, false, false, true,
35294 false, true, true, false, false, true, true, false,
35295 false, true, true, false, false, true, true, false
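/* As an illustration (the _CMP_* values come from avxintrin.h and are stated
   here as an assumption): predicate 0 (_CMP_EQ_OQ) selects UNEQ with the
   quiet ucomi form, while predicate 16 (_CMP_EQ_OS) also selects UNEQ but
   keeps the signalling comi form.  */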
35298 if (!CONST_INT_P (op2))
35300 error ("the third argument must be a comparison constant");
35301 return const0_rtx;
35303 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35305 error ("incorrect comparison mode");
35306 return const0_rtx;
35309 if (!insn_p->operand[2].predicate (op3, SImode))
35311 error ("incorrect rounding operand");
35312 return const0_rtx;
35315 comparison = comi_comparisons[INTVAL (op2)];
35316 need_ucomi = need_ucomi_values[INTVAL (op2)];
35318 if (VECTOR_MODE_P (mode0))
35319 op0 = safe_vector_operand (op0, mode0);
35320 if (VECTOR_MODE_P (mode1))
35321 op1 = safe_vector_operand (op1, mode1);
35323 target = gen_reg_rtx (SImode);
35324 emit_move_insn (target, const0_rtx);
35325 target = gen_rtx_SUBREG (QImode, target, 0);
35327 if ((optimize && !register_operand (op0, mode0))
35328 || !insn_p->operand[0].predicate (op0, mode0))
35329 op0 = copy_to_mode_reg (mode0, op0);
35330 if ((optimize && !register_operand (op1, mode1))
35331 || !insn_p->operand[1].predicate (op1, mode1))
35332 op1 = copy_to_mode_reg (mode1, op1);
35334 if (need_ucomi)
35335 icode = icode == CODE_FOR_sse_comi_round
35336 ? CODE_FOR_sse_ucomi_round
35337 : CODE_FOR_sse2_ucomi_round;
35339 pat = GEN_FCN (icode) (op0, op1, op3);
35340 if (! pat)
35341 return 0;
35343 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35344 if (INTVAL (op3) == NO_ROUND)
35346 pat = ix86_erase_embedded_rounding (pat);
35347 if (! pat)
35348 return 0;
35350 set_dst = SET_DEST (pat);
35352 else
35354 gcc_assert (GET_CODE (pat) == SET);
35355 set_dst = SET_DEST (pat);
35358 emit_insn (pat);
35359 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35360 gen_rtx_fmt_ee (comparison, QImode,
35361 set_dst,
35362 const0_rtx)));
35364 return SUBREG_REG (target);
35367 static rtx
35368 ix86_expand_round_builtin (const struct builtin_description *d,
35369 tree exp, rtx target)
35371 rtx pat;
35372 unsigned int i, nargs;
35373 struct
35375 rtx op;
35376 machine_mode mode;
35377 } args[6];
35378 enum insn_code icode = d->icode;
35379 const struct insn_data_d *insn_p = &insn_data[icode];
35380 machine_mode tmode = insn_p->operand[0].mode;
35381 unsigned int nargs_constant = 0;
35382 unsigned int redundant_embed_rnd = 0;
35384 switch ((enum ix86_builtin_func_type) d->flag)
35386 case UINT64_FTYPE_V2DF_INT:
35387 case UINT64_FTYPE_V4SF_INT:
35388 case UINT_FTYPE_V2DF_INT:
35389 case UINT_FTYPE_V4SF_INT:
35390 case INT64_FTYPE_V2DF_INT:
35391 case INT64_FTYPE_V4SF_INT:
35392 case INT_FTYPE_V2DF_INT:
35393 case INT_FTYPE_V4SF_INT:
35394 nargs = 2;
35395 break;
35396 case V4SF_FTYPE_V4SF_UINT_INT:
35397 case V4SF_FTYPE_V4SF_UINT64_INT:
35398 case V2DF_FTYPE_V2DF_UINT64_INT:
35399 case V4SF_FTYPE_V4SF_INT_INT:
35400 case V4SF_FTYPE_V4SF_INT64_INT:
35401 case V2DF_FTYPE_V2DF_INT64_INT:
35402 case V4SF_FTYPE_V4SF_V4SF_INT:
35403 case V2DF_FTYPE_V2DF_V2DF_INT:
35404 case V4SF_FTYPE_V4SF_V2DF_INT:
35405 case V2DF_FTYPE_V2DF_V4SF_INT:
35406 nargs = 3;
35407 break;
35408 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
35409 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
35410 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
35411 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
35412 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
35413 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
35414 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
35415 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
35416 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
35417 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
35418 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
35419 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
35420 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
35421 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
35422 nargs = 4;
35423 break;
35424 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
35425 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
35426 nargs_constant = 2;
35427 nargs = 4;
35428 break;
35429 case INT_FTYPE_V4SF_V4SF_INT_INT:
35430 case INT_FTYPE_V2DF_V2DF_INT_INT:
35431 return ix86_expand_sse_comi_round (d, exp, target);
35432 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
35433 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
35434 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
35435 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
35436 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
35437 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
35438 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
35439 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
35440 nargs = 5;
35441 break;
35442 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
35443 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
35444 nargs_constant = 4;
35445 nargs = 5;
35446 break;
35447 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
35448 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
35449 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
35450 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
35451 nargs_constant = 3;
35452 nargs = 5;
35453 break;
35454 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
35455 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
35456 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
35457 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
35458 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
35459 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
35460 nargs = 6;
35461 nargs_constant = 4;
35462 break;
35463 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
35464 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
35465 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
35466 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
35467 nargs = 6;
35468 nargs_constant = 3;
35469 break;
35470 default:
35471 gcc_unreachable ();
35473 gcc_assert (nargs <= ARRAY_SIZE (args));
35475 if (optimize
35476 || target == 0
35477 || GET_MODE (target) != tmode
35478 || !insn_p->operand[0].predicate (target, tmode))
35479 target = gen_reg_rtx (tmode);
35481 for (i = 0; i < nargs; i++)
35483 tree arg = CALL_EXPR_ARG (exp, i);
35484 rtx op = expand_normal (arg);
35485 machine_mode mode = insn_p->operand[i + 1].mode;
35486 bool match = insn_p->operand[i + 1].predicate (op, mode);
35488 if (i == nargs - nargs_constant)
35490 if (!match)
35492 switch (icode)
35494 case CODE_FOR_avx512f_getmantv8df_mask_round:
35495 case CODE_FOR_avx512f_getmantv16sf_mask_round:
35496 case CODE_FOR_avx512f_vgetmantv2df_round:
35497 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
35498 case CODE_FOR_avx512f_vgetmantv4sf_round:
35499 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
35500 error ("the immediate argument must be a 4-bit immediate");
35501 return const0_rtx;
35502 case CODE_FOR_avx512f_cmpv8df3_mask_round:
35503 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
35504 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
35505 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
35506 error ("the immediate argument must be a 5-bit immediate");
35507 return const0_rtx;
35508 default:
35509 error ("the immediate argument must be an 8-bit immediate");
35510 return const0_rtx;
35514 else if (i == nargs-1)
35516 if (!insn_p->operand[nargs].predicate (op, SImode))
35518 error ("incorrect rounding operand");
35519 return const0_rtx;
35522 /* If there is no rounding, use the normal version of the pattern. */
35523 if (INTVAL (op) == NO_ROUND)
35524 redundant_embed_rnd = 1;
35526 else
35528 if (VECTOR_MODE_P (mode))
35529 op = safe_vector_operand (op, mode);
35531 op = fixup_modeless_constant (op, mode);
35533 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35535 if (optimize || !match)
35536 op = copy_to_mode_reg (mode, op);
35538 else
35540 op = copy_to_reg (op);
35541 op = lowpart_subreg (mode, op, GET_MODE (op));
35545 args[i].op = op;
35546 args[i].mode = mode;
35549 switch (nargs)
35551 case 1:
35552 pat = GEN_FCN (icode) (target, args[0].op);
35553 break;
35554 case 2:
35555 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35556 break;
35557 case 3:
35558 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35559 args[2].op);
35560 break;
35561 case 4:
35562 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35563 args[2].op, args[3].op);
35564 break;
35565 case 5:
35566 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35567 args[2].op, args[3].op, args[4].op);
35568 break;
35569 case 6:
35570 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
35571 args[2].op, args[3].op, args[4].op,
35572 args[5].op);
35573 break;
35574 default:
35575 gcc_unreachable ();
35578 if (!pat)
35579 return 0;
35581 if (redundant_embed_rnd)
35582 pat = ix86_erase_embedded_rounding (pat);
35584 emit_insn (pat);
35585 return target;
35588 /* Subroutine of ix86_expand_builtin to take care of special insns
35589 with variable number of operands. */
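/* For example (illustrative): the non-temporal store builtins behind
   _mm_stream_ps and friends are expanded here as "store"-class cases, which
   is why the CODE_FOR_*movnt* icodes below mark the memory operand as
   needing strict mode alignment.  */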
35591 static rtx
35592 ix86_expand_special_args_builtin (const struct builtin_description *d,
35593 tree exp, rtx target)
35595 tree arg;
35596 rtx pat, op;
35597 unsigned int i, nargs, arg_adjust, memory;
35598 bool aligned_mem = false;
35599 struct
35601 rtx op;
35602 machine_mode mode;
35603 } args[3];
35604 enum insn_code icode = d->icode;
35605 bool last_arg_constant = false;
35606 const struct insn_data_d *insn_p = &insn_data[icode];
35607 machine_mode tmode = insn_p->operand[0].mode;
35608 enum { load, store } klass;
35610 switch ((enum ix86_builtin_func_type) d->flag)
35612 case VOID_FTYPE_VOID:
35613 emit_insn (GEN_FCN (icode) (target));
35614 return 0;
35615 case VOID_FTYPE_UINT64:
35616 case VOID_FTYPE_UNSIGNED:
35617 nargs = 0;
35618 klass = store;
35619 memory = 0;
35620 break;
35622 case INT_FTYPE_VOID:
35623 case USHORT_FTYPE_VOID:
35624 case UINT64_FTYPE_VOID:
35625 case UNSIGNED_FTYPE_VOID:
35626 nargs = 0;
35627 klass = load;
35628 memory = 0;
35629 break;
35630 case UINT64_FTYPE_PUNSIGNED:
35631 case V2DI_FTYPE_PV2DI:
35632 case V4DI_FTYPE_PV4DI:
35633 case V32QI_FTYPE_PCCHAR:
35634 case V16QI_FTYPE_PCCHAR:
35635 case V8SF_FTYPE_PCV4SF:
35636 case V8SF_FTYPE_PCFLOAT:
35637 case V4SF_FTYPE_PCFLOAT:
35638 case V4DF_FTYPE_PCV2DF:
35639 case V4DF_FTYPE_PCDOUBLE:
35640 case V2DF_FTYPE_PCDOUBLE:
35641 case VOID_FTYPE_PVOID:
35642 case V8DI_FTYPE_PV8DI:
35643 nargs = 1;
35644 klass = load;
35645 memory = 0;
35646 switch (icode)
35648 case CODE_FOR_sse4_1_movntdqa:
35649 case CODE_FOR_avx2_movntdqa:
35650 case CODE_FOR_avx512f_movntdqa:
35651 aligned_mem = true;
35652 break;
35653 default:
35654 break;
35656 break;
35657 case VOID_FTYPE_PV2SF_V4SF:
35658 case VOID_FTYPE_PV8DI_V8DI:
35659 case VOID_FTYPE_PV4DI_V4DI:
35660 case VOID_FTYPE_PV2DI_V2DI:
35661 case VOID_FTYPE_PCHAR_V32QI:
35662 case VOID_FTYPE_PCHAR_V16QI:
35663 case VOID_FTYPE_PFLOAT_V16SF:
35664 case VOID_FTYPE_PFLOAT_V8SF:
35665 case VOID_FTYPE_PFLOAT_V4SF:
35666 case VOID_FTYPE_PDOUBLE_V8DF:
35667 case VOID_FTYPE_PDOUBLE_V4DF:
35668 case VOID_FTYPE_PDOUBLE_V2DF:
35669 case VOID_FTYPE_PLONGLONG_LONGLONG:
35670 case VOID_FTYPE_PULONGLONG_ULONGLONG:
35671 case VOID_FTYPE_PINT_INT:
35672 nargs = 1;
35673 klass = store;
35674 /* Reserve memory operand for target. */
35675 memory = ARRAY_SIZE (args);
35676 switch (icode)
35678 /* These builtins and instructions require the memory
35679 to be properly aligned. */
35680 case CODE_FOR_avx_movntv4di:
35681 case CODE_FOR_sse2_movntv2di:
35682 case CODE_FOR_avx_movntv8sf:
35683 case CODE_FOR_sse_movntv4sf:
35684 case CODE_FOR_sse4a_vmmovntv4sf:
35685 case CODE_FOR_avx_movntv4df:
35686 case CODE_FOR_sse2_movntv2df:
35687 case CODE_FOR_sse4a_vmmovntv2df:
35688 case CODE_FOR_sse2_movntidi:
35689 case CODE_FOR_sse_movntq:
35690 case CODE_FOR_sse2_movntisi:
35691 case CODE_FOR_avx512f_movntv16sf:
35692 case CODE_FOR_avx512f_movntv8df:
35693 case CODE_FOR_avx512f_movntv8di:
35694 aligned_mem = true;
35695 break;
35696 default:
35697 break;
35699 break;
35700 case V4SF_FTYPE_V4SF_PCV2SF:
35701 case V2DF_FTYPE_V2DF_PCDOUBLE:
35702 nargs = 2;
35703 klass = load;
35704 memory = 1;
35705 break;
35706 case V8SF_FTYPE_PCV8SF_V8SI:
35707 case V4DF_FTYPE_PCV4DF_V4DI:
35708 case V4SF_FTYPE_PCV4SF_V4SI:
35709 case V2DF_FTYPE_PCV2DF_V2DI:
35710 case V8SI_FTYPE_PCV8SI_V8SI:
35711 case V4DI_FTYPE_PCV4DI_V4DI:
35712 case V4SI_FTYPE_PCV4SI_V4SI:
35713 case V2DI_FTYPE_PCV2DI_V2DI:
35714 case VOID_FTYPE_INT_INT64:
35715 nargs = 2;
35716 klass = load;
35717 memory = 0;
35718 break;
35719 case VOID_FTYPE_PV8DF_V8DF_UQI:
35720 case VOID_FTYPE_PV4DF_V4DF_UQI:
35721 case VOID_FTYPE_PV2DF_V2DF_UQI:
35722 case VOID_FTYPE_PV16SF_V16SF_UHI:
35723 case VOID_FTYPE_PV8SF_V8SF_UQI:
35724 case VOID_FTYPE_PV4SF_V4SF_UQI:
35725 case VOID_FTYPE_PV8DI_V8DI_UQI:
35726 case VOID_FTYPE_PV4DI_V4DI_UQI:
35727 case VOID_FTYPE_PV2DI_V2DI_UQI:
35728 case VOID_FTYPE_PV16SI_V16SI_UHI:
35729 case VOID_FTYPE_PV8SI_V8SI_UQI:
35730 case VOID_FTYPE_PV4SI_V4SI_UQI:
35731 case VOID_FTYPE_PV64QI_V64QI_UDI:
35732 case VOID_FTYPE_PV32HI_V32HI_USI:
35733 case VOID_FTYPE_PV32QI_V32QI_USI:
35734 case VOID_FTYPE_PV16QI_V16QI_UHI:
35735 case VOID_FTYPE_PV16HI_V16HI_UHI:
35736 case VOID_FTYPE_PV8HI_V8HI_UQI:
35737 switch (icode)
35739 /* These builtins and instructions require the memory
35740 to be properly aligned. */
35741 case CODE_FOR_avx512f_storev16sf_mask:
35742 case CODE_FOR_avx512f_storev16si_mask:
35743 case CODE_FOR_avx512f_storev8df_mask:
35744 case CODE_FOR_avx512f_storev8di_mask:
35745 case CODE_FOR_avx512vl_storev8sf_mask:
35746 case CODE_FOR_avx512vl_storev8si_mask:
35747 case CODE_FOR_avx512vl_storev4df_mask:
35748 case CODE_FOR_avx512vl_storev4di_mask:
35749 case CODE_FOR_avx512vl_storev4sf_mask:
35750 case CODE_FOR_avx512vl_storev4si_mask:
35751 case CODE_FOR_avx512vl_storev2df_mask:
35752 case CODE_FOR_avx512vl_storev2di_mask:
35753 aligned_mem = true;
35754 break;
35755 default:
35756 break;
35758 /* FALLTHRU */
35759 case VOID_FTYPE_PV8SF_V8SI_V8SF:
35760 case VOID_FTYPE_PV4DF_V4DI_V4DF:
35761 case VOID_FTYPE_PV4SF_V4SI_V4SF:
35762 case VOID_FTYPE_PV2DF_V2DI_V2DF:
35763 case VOID_FTYPE_PV8SI_V8SI_V8SI:
35764 case VOID_FTYPE_PV4DI_V4DI_V4DI:
35765 case VOID_FTYPE_PV4SI_V4SI_V4SI:
35766 case VOID_FTYPE_PV2DI_V2DI_V2DI:
35767 case VOID_FTYPE_PV8SI_V8DI_UQI:
35768 case VOID_FTYPE_PV8HI_V8DI_UQI:
35769 case VOID_FTYPE_PV16HI_V16SI_UHI:
35770 case VOID_FTYPE_PV16QI_V8DI_UQI:
35771 case VOID_FTYPE_PV16QI_V16SI_UHI:
35772 case VOID_FTYPE_PV4SI_V4DI_UQI:
35773 case VOID_FTYPE_PV4SI_V2DI_UQI:
35774 case VOID_FTYPE_PV8HI_V4DI_UQI:
35775 case VOID_FTYPE_PV8HI_V2DI_UQI:
35776 case VOID_FTYPE_PV8HI_V8SI_UQI:
35777 case VOID_FTYPE_PV8HI_V4SI_UQI:
35778 case VOID_FTYPE_PV16QI_V4DI_UQI:
35779 case VOID_FTYPE_PV16QI_V2DI_UQI:
35780 case VOID_FTYPE_PV16QI_V8SI_UQI:
35781 case VOID_FTYPE_PV16QI_V4SI_UQI:
35782 case VOID_FTYPE_PCHAR_V64QI_UDI:
35783 case VOID_FTYPE_PCHAR_V32QI_USI:
35784 case VOID_FTYPE_PCHAR_V16QI_UHI:
35785 case VOID_FTYPE_PSHORT_V32HI_USI:
35786 case VOID_FTYPE_PSHORT_V16HI_UHI:
35787 case VOID_FTYPE_PSHORT_V8HI_UQI:
35788 case VOID_FTYPE_PINT_V16SI_UHI:
35789 case VOID_FTYPE_PINT_V8SI_UQI:
35790 case VOID_FTYPE_PINT_V4SI_UQI:
35791 case VOID_FTYPE_PINT64_V8DI_UQI:
35792 case VOID_FTYPE_PINT64_V4DI_UQI:
35793 case VOID_FTYPE_PINT64_V2DI_UQI:
35794 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
35795 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
35796 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
35797 case VOID_FTYPE_PFLOAT_V16SF_UHI:
35798 case VOID_FTYPE_PFLOAT_V8SF_UQI:
35799 case VOID_FTYPE_PFLOAT_V4SF_UQI:
35800 case VOID_FTYPE_PV32QI_V32HI_USI:
35801 case VOID_FTYPE_PV16QI_V16HI_UHI:
35802 case VOID_FTYPE_PV8QI_V8HI_UQI:
35803 nargs = 2;
35804 klass = store;
35805 /* Reserve memory operand for target. */
35806 memory = ARRAY_SIZE (args);
35807 break;
35808 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
35809 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
35810 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
35811 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
35812 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
35813 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
35814 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
35815 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
35816 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
35817 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
35818 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
35819 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
35820 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
35821 case V32HI_FTYPE_PCV32HI_V32HI_USI:
35822 case V32QI_FTYPE_PCV32QI_V32QI_USI:
35823 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
35824 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
35825 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
35826 switch (icode)
35828 /* These builtins and instructions require the memory
35829 to be properly aligned. */
35830 case CODE_FOR_avx512f_loadv16sf_mask:
35831 case CODE_FOR_avx512f_loadv16si_mask:
35832 case CODE_FOR_avx512f_loadv8df_mask:
35833 case CODE_FOR_avx512f_loadv8di_mask:
35834 case CODE_FOR_avx512vl_loadv8sf_mask:
35835 case CODE_FOR_avx512vl_loadv8si_mask:
35836 case CODE_FOR_avx512vl_loadv4df_mask:
35837 case CODE_FOR_avx512vl_loadv4di_mask:
35838 case CODE_FOR_avx512vl_loadv4sf_mask:
35839 case CODE_FOR_avx512vl_loadv4si_mask:
35840 case CODE_FOR_avx512vl_loadv2df_mask:
35841 case CODE_FOR_avx512vl_loadv2di_mask:
35842 case CODE_FOR_avx512bw_loadv64qi_mask:
35843 case CODE_FOR_avx512vl_loadv32qi_mask:
35844 case CODE_FOR_avx512vl_loadv16qi_mask:
35845 case CODE_FOR_avx512bw_loadv32hi_mask:
35846 case CODE_FOR_avx512vl_loadv16hi_mask:
35847 case CODE_FOR_avx512vl_loadv8hi_mask:
35848 aligned_mem = true;
35849 break;
35850 default:
35851 break;
35853 /* FALLTHRU */
35854 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
35855 case V32QI_FTYPE_PCCHAR_V32QI_USI:
35856 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
35857 case V32HI_FTYPE_PCSHORT_V32HI_USI:
35858 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
35859 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
35860 case V16SI_FTYPE_PCINT_V16SI_UHI:
35861 case V8SI_FTYPE_PCINT_V8SI_UQI:
35862 case V4SI_FTYPE_PCINT_V4SI_UQI:
35863 case V8DI_FTYPE_PCINT64_V8DI_UQI:
35864 case V4DI_FTYPE_PCINT64_V4DI_UQI:
35865 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35866 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35867 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35868 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35869 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35870 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35871 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35872 nargs = 3;
35873 klass = load;
35874 memory = 0;
35875 break;
35876 case VOID_FTYPE_UINT_UINT_UINT:
35877 case VOID_FTYPE_UINT64_UINT_UINT:
35878 case UCHAR_FTYPE_UINT_UINT_UINT:
35879 case UCHAR_FTYPE_UINT64_UINT_UINT:
35880 nargs = 3;
35881 klass = load;
35882 memory = ARRAY_SIZE (args);
35883 last_arg_constant = true;
35884 break;
35885 default:
35886 gcc_unreachable ();
35889 gcc_assert (nargs <= ARRAY_SIZE (args));
35891 if (klass == store)
35893 arg = CALL_EXPR_ARG (exp, 0);
35894 op = expand_normal (arg);
35895 gcc_assert (target == 0);
35896 if (memory)
35898 op = ix86_zero_extend_to_Pmode (op);
35899 target = gen_rtx_MEM (tmode, op);
35900 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35901 on it. Try to improve it using get_pointer_alignment,
35902 and if the special builtin is one that requires strict
35903 mode alignment, also from its GET_MODE_ALIGNMENT.
35904 Failure to do so could lead to ix86_legitimate_combined_insn
35905 rejecting all changes to such insns. */
35906 unsigned int align = get_pointer_alignment (arg);
35907 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35908 align = GET_MODE_ALIGNMENT (tmode);
35909 if (MEM_ALIGN (target) < align)
35910 set_mem_align (target, align);
35912 else
35913 target = force_reg (tmode, op);
35914 arg_adjust = 1;
35916 else
35918 arg_adjust = 0;
35919 if (optimize
35920 || target == 0
35921 || !register_operand (target, tmode)
35922 || GET_MODE (target) != tmode)
35923 target = gen_reg_rtx (tmode);
35926 for (i = 0; i < nargs; i++)
35928 machine_mode mode = insn_p->operand[i + 1].mode;
35929 bool match;
35931 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35932 op = expand_normal (arg);
35933 match = insn_p->operand[i + 1].predicate (op, mode);
35935 if (last_arg_constant && (i + 1) == nargs)
35937 if (!match)
35939 if (icode == CODE_FOR_lwp_lwpvalsi3
35940 || icode == CODE_FOR_lwp_lwpinssi3
35941 || icode == CODE_FOR_lwp_lwpvaldi3
35942 || icode == CODE_FOR_lwp_lwpinsdi3)
35943 error ("the last argument must be a 32-bit immediate");
35944 else
35945 error ("the last argument must be an 8-bit immediate");
35946 return const0_rtx;
35949 else
35951 if (i == memory)
35953 /* This must be the memory operand. */
35954 op = ix86_zero_extend_to_Pmode (op);
35955 op = gen_rtx_MEM (mode, op);
35956 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35957 on it. Try to improve it using get_pointer_alignment,
35958 and if the special builtin is one that requires strict
35959 mode alignment, also from its GET_MODE_ALIGNMENT.
35960 Failure to do so could lead to ix86_legitimate_combined_insn
35961 rejecting all changes to such insns. */
35962 unsigned int align = get_pointer_alignment (arg);
35963 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35964 align = GET_MODE_ALIGNMENT (mode);
35965 if (MEM_ALIGN (op) < align)
35966 set_mem_align (op, align);
35968 else
35970 /* This must be a register. */
35971 if (VECTOR_MODE_P (mode))
35972 op = safe_vector_operand (op, mode);
35974 op = fixup_modeless_constant (op, mode);
35976 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35977 op = copy_to_mode_reg (mode, op);
35978 else
35980 op = copy_to_reg (op);
35981 op = lowpart_subreg (mode, op, GET_MODE (op));
35986 args[i].op = op;
35987 args[i].mode = mode;
35990 switch (nargs)
35992 case 0:
35993 pat = GEN_FCN (icode) (target);
35994 break;
35995 case 1:
35996 pat = GEN_FCN (icode) (target, args[0].op);
35997 break;
35998 case 2:
35999 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36000 break;
36001 case 3:
36002 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36003 break;
36004 default:
36005 gcc_unreachable ();
36008 if (! pat)
36009 return 0;
36010 emit_insn (pat);
36011 return klass == store ? 0 : target;
36014 /* Return the integer constant in ARG. Constrain it to be in the range
36015 of the subparts of VEC_TYPE; issue an error if not. */
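/* For example: with a 4-element vector type the valid selectors are 0..3;
   anything else (or a non-constant argument) is diagnosed and 0 is returned
   so that expansion can continue.  */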
36017 static int
36018 get_element_number (tree vec_type, tree arg)
36020 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36022 if (!tree_fits_uhwi_p (arg)
36023 || (elt = tree_to_uhwi (arg), elt > max))
36025 error ("selector must be an integer constant in the range 0..%wi", max);
36026 return 0;
36029 return elt;
36032 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36033 ix86_expand_vector_init. We DO have language-level syntax for this, in
36034 the form of (type){ init-list }. Except that since we can't place emms
36035 instructions from inside the compiler, we can't allow the use of MMX
36036 registers unless the user explicitly asks for it. So we do *not* define
36037 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36038 we have builtins invoked by mmintrin.h that give us license to emit
36039 these sorts of instructions. */
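/* As an illustration (an assumption about the intrinsic headers, not this
   file): mmintrin.h's _mm_set_pi32 funnels into one of these vec_init
   builtins, so the explicit expansion below is what licenses its use of MMX
   registers instead of the generic (type){ init-list } form.  */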
36041 static rtx
36042 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36044 machine_mode tmode = TYPE_MODE (type);
36045 machine_mode inner_mode = GET_MODE_INNER (tmode);
36046 int i, n_elt = GET_MODE_NUNITS (tmode);
36047 rtvec v = rtvec_alloc (n_elt);
36049 gcc_assert (VECTOR_MODE_P (tmode));
36050 gcc_assert (call_expr_nargs (exp) == n_elt);
36052 for (i = 0; i < n_elt; ++i)
36054 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36055 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36058 if (!target || !register_operand (target, tmode))
36059 target = gen_reg_rtx (tmode);
36061 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36062 return target;
36065 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36066 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36067 had a language-level syntax for referencing vector elements. */
36069 static rtx
36070 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36072 machine_mode tmode, mode0;
36073 tree arg0, arg1;
36074 int elt;
36075 rtx op0;
36077 arg0 = CALL_EXPR_ARG (exp, 0);
36078 arg1 = CALL_EXPR_ARG (exp, 1);
36080 op0 = expand_normal (arg0);
36081 elt = get_element_number (TREE_TYPE (arg0), arg1);
36083 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36084 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36085 gcc_assert (VECTOR_MODE_P (mode0));
36087 op0 = force_reg (mode0, op0);
36089 if (optimize || !target || !register_operand (target, tmode))
36090 target = gen_reg_rtx (tmode);
36092 ix86_expand_vector_extract (true, target, op0, elt);
36094 return target;
36097 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36098 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36099 a language-level syntax for referencing vector elements. */
36101 static rtx
36102 ix86_expand_vec_set_builtin (tree exp)
36104 machine_mode tmode, mode1;
36105 tree arg0, arg1, arg2;
36106 int elt;
36107 rtx op0, op1, target;
36109 arg0 = CALL_EXPR_ARG (exp, 0);
36110 arg1 = CALL_EXPR_ARG (exp, 1);
36111 arg2 = CALL_EXPR_ARG (exp, 2);
36113 tmode = TYPE_MODE (TREE_TYPE (arg0));
36114 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36115 gcc_assert (VECTOR_MODE_P (tmode));
36117 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36118 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36119 elt = get_element_number (TREE_TYPE (arg0), arg2);
36121 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36122 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36124 op0 = force_reg (tmode, op0);
36125 op1 = force_reg (mode1, op1);
36127 /* OP0 is the source of these builtin functions and shouldn't be
36128 modified. Create a copy, use it and return it as target. */
36129 target = gen_reg_rtx (tmode);
36130 emit_move_insn (target, op0);
36131 ix86_expand_vector_set (true, target, op1, elt);
36133 return target;
36136 /* Emit conditional move of SRC to DST with condition
36137 OP1 CODE OP2. */
36138 static void
36139 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
36141 rtx t;
36143 if (TARGET_CMOVE)
36145 t = ix86_expand_compare (code, op1, op2);
36146 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
36147 src, dst)));
36149 else
36151 rtx_code_label *nomove = gen_label_rtx ();
36152 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
36153 const0_rtx, GET_MODE (op1), 1, nomove);
36154 emit_move_insn (dst, src);
36155 emit_label (nomove);
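/* Schematically (an illustrative sketch, not literal output), the
   non-CMOV fallback above emits a branch around a plain move:

       compare op1, op2
       jump to .Lnomove if the reverse of CODE holds
       dst = src
     .Lnomove:

   whereas the TARGET_CMOVE path emits a single conditional move.  */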
36159 /* Choose the max of DST and SRC and put it in DST. */
36160 static void
36161 ix86_emit_move_max (rtx dst, rtx src)
36163 ix86_emit_cmove (dst, src, LTU, dst, src);
36166 /* Expand an expression EXP that calls a built-in function,
36167 with result going to TARGET if that's convenient
36168 (and in mode MODE if that's convenient).
36169 SUBTARGET may be used as the target for computing one of EXP's operands.
36170 IGNORE is nonzero if the value is to be ignored. */
36172 static rtx
36173 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36174 machine_mode mode, int ignore)
36176 size_t i;
36177 enum insn_code icode, icode2;
36178 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36179 tree arg0, arg1, arg2, arg3, arg4;
36180 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
36181 machine_mode mode0, mode1, mode2, mode3, mode4;
36182 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36184 /* For CPU builtins that can be folded, fold first and expand the fold. */
36185 switch (fcode)
36187 case IX86_BUILTIN_CPU_INIT:
36189 /* Make it call __cpu_indicator_init in libgcc. */
36190 tree call_expr, fndecl, type;
36191 type = build_function_type_list (integer_type_node, NULL_TREE);
36192 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36193 call_expr = build_call_expr (fndecl, 0);
36194 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36196 case IX86_BUILTIN_CPU_IS:
36197 case IX86_BUILTIN_CPU_SUPPORTS:
36199 tree arg0 = CALL_EXPR_ARG (exp, 0);
36200 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36201 gcc_assert (fold_expr != NULL_TREE);
36202 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36206 HOST_WIDE_INT isa = ix86_isa_flags;
36207 HOST_WIDE_INT isa2 = ix86_isa_flags2;
36208 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
36209 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
36210 /* The general case is that we require all the ISAs specified in bisa{,2}
36211 to be enabled.
36212 The exceptions are:
36213 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
36214 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
36215 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
36216 where for each such pair it is sufficient if either of the ISAs is
36217 enabled; if the pair is ORed with other options, those others are still required. */
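/* For example (illustrative only): a builtin whose bisa is
   OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 is accepted when the
   current ISA enables either of those two options; it does not need
   both.  */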
36218 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36219 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36220 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
36221 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
36222 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36223 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36224 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
36225 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
36226 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36227 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36228 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
36229 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
36230 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
36232 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
36233 (enum fpmath_unit) 0, false);
36234 if (!opts)
36235 error ("%qE needs unknown isa option", fndecl);
36236 else
36238 gcc_assert (opts != NULL);
36239 error ("%qE needs isa option %s", fndecl, opts);
36240 free (opts);
36242 return expand_call (exp, target, ignore);
36245 switch (fcode)
36247 case IX86_BUILTIN_BNDMK:
36248 if (!target
36249 || GET_MODE (target) != BNDmode
36250 || !register_operand (target, BNDmode))
36251 target = gen_reg_rtx (BNDmode);
36253 arg0 = CALL_EXPR_ARG (exp, 0);
36254 arg1 = CALL_EXPR_ARG (exp, 1);
36256 op0 = expand_normal (arg0);
36257 op1 = expand_normal (arg1);
36259 if (!register_operand (op0, Pmode))
36260 op0 = ix86_zero_extend_to_Pmode (op0);
36261 if (!register_operand (op1, Pmode))
36262 op1 = ix86_zero_extend_to_Pmode (op1);
36264 /* Builtin arg1 is the size of the block, but instruction op1 should
36265 be (size - 1). */
36266 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
36267 NULL_RTX, 1, OPTAB_DIRECT);
36269 emit_insn (BNDmode == BND64mode
36270 ? gen_bnd64_mk (target, op0, op1)
36271 : gen_bnd32_mk (target, op0, op1));
36272 return target;
36274 case IX86_BUILTIN_BNDSTX:
36275 arg0 = CALL_EXPR_ARG (exp, 0);
36276 arg1 = CALL_EXPR_ARG (exp, 1);
36277 arg2 = CALL_EXPR_ARG (exp, 2);
36279 op0 = expand_normal (arg0);
36280 op1 = expand_normal (arg1);
36281 op2 = expand_normal (arg2);
36283 if (!register_operand (op0, Pmode))
36284 op0 = ix86_zero_extend_to_Pmode (op0);
36285 if (!register_operand (op1, BNDmode))
36286 op1 = copy_to_mode_reg (BNDmode, op1);
36287 if (!register_operand (op2, Pmode))
36288 op2 = ix86_zero_extend_to_Pmode (op2);
36290 emit_insn (BNDmode == BND64mode
36291 ? gen_bnd64_stx (op2, op0, op1)
36292 : gen_bnd32_stx (op2, op0, op1));
36293 return 0;
36295 case IX86_BUILTIN_BNDLDX:
36296 if (!target
36297 || GET_MODE (target) != BNDmode
36298 || !register_operand (target, BNDmode))
36299 target = gen_reg_rtx (BNDmode);
36301 arg0 = CALL_EXPR_ARG (exp, 0);
36302 arg1 = CALL_EXPR_ARG (exp, 1);
36304 op0 = expand_normal (arg0);
36305 op1 = expand_normal (arg1);
36307 if (!register_operand (op0, Pmode))
36308 op0 = ix86_zero_extend_to_Pmode (op0);
36309 if (!register_operand (op1, Pmode))
36310 op1 = ix86_zero_extend_to_Pmode (op1);
36312 emit_insn (BNDmode == BND64mode
36313 ? gen_bnd64_ldx (target, op0, op1)
36314 : gen_bnd32_ldx (target, op0, op1));
36315 return target;
36317 case IX86_BUILTIN_BNDCL:
36318 arg0 = CALL_EXPR_ARG (exp, 0);
36319 arg1 = CALL_EXPR_ARG (exp, 1);
36321 op0 = expand_normal (arg0);
36322 op1 = expand_normal (arg1);
36324 if (!register_operand (op0, Pmode))
36325 op0 = ix86_zero_extend_to_Pmode (op0);
36326 if (!register_operand (op1, BNDmode))
36327 op1 = copy_to_mode_reg (BNDmode, op1);
36329 emit_insn (BNDmode == BND64mode
36330 ? gen_bnd64_cl (op1, op0)
36331 : gen_bnd32_cl (op1, op0));
36332 return 0;
36334 case IX86_BUILTIN_BNDCU:
36335 arg0 = CALL_EXPR_ARG (exp, 0);
36336 arg1 = CALL_EXPR_ARG (exp, 1);
36338 op0 = expand_normal (arg0);
36339 op1 = expand_normal (arg1);
36341 if (!register_operand (op0, Pmode))
36342 op0 = ix86_zero_extend_to_Pmode (op0);
36343 if (!register_operand (op1, BNDmode))
36344 op1 = copy_to_mode_reg (BNDmode, op1);
36346 emit_insn (BNDmode == BND64mode
36347 ? gen_bnd64_cu (op1, op0)
36348 : gen_bnd32_cu (op1, op0));
36349 return 0;
36351 case IX86_BUILTIN_BNDRET:
36352 arg0 = CALL_EXPR_ARG (exp, 0);
36353 target = chkp_get_rtl_bounds (arg0);
36355 /* If no bounds were specified for the returned value,
36356 then use INIT bounds. This usually happens when
36357 some built-in function is expanded. */
36358 if (!target)
36360 rtx t1 = gen_reg_rtx (Pmode);
36361 rtx t2 = gen_reg_rtx (Pmode);
36362 target = gen_reg_rtx (BNDmode);
36363 emit_move_insn (t1, const0_rtx);
36364 emit_move_insn (t2, constm1_rtx);
36365 emit_insn (BNDmode == BND64mode
36366 ? gen_bnd64_mk (target, t1, t2)
36367 : gen_bnd32_mk (target, t1, t2));
36370 gcc_assert (target && REG_P (target));
36371 return target;
36373 case IX86_BUILTIN_BNDNARROW:
36375 rtx m1, m1h1, m1h2, lb, ub, t1;
36377 /* Return value and lb. */
36378 arg0 = CALL_EXPR_ARG (exp, 0);
36379 /* Bounds. */
36380 arg1 = CALL_EXPR_ARG (exp, 1);
36381 /* Size. */
36382 arg2 = CALL_EXPR_ARG (exp, 2);
36384 lb = expand_normal (arg0);
36385 op1 = expand_normal (arg1);
36386 op2 = expand_normal (arg2);
36388 /* Size was passed but we need to use (size - 1) as for bndmk. */
36389 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
36390 NULL_RTX, 1, OPTAB_DIRECT);
36392 /* Add LB to size and invert to get UB. */
36393 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
36394 op2, 1, OPTAB_DIRECT);
36395 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
36397 if (!register_operand (lb, Pmode))
36398 lb = ix86_zero_extend_to_Pmode (lb);
36399 if (!register_operand (ub, Pmode))
36400 ub = ix86_zero_extend_to_Pmode (ub);
36402 /* We need to move bounds to memory before any computations. */
36403 if (MEM_P (op1))
36404 m1 = op1;
36405 else
36407 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
36408 emit_move_insn (m1, op1);
36411 /* Generate mem expression to be used for access to LB and UB. */
36412 m1h1 = adjust_address (m1, Pmode, 0);
36413 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
36415 t1 = gen_reg_rtx (Pmode);
36417 /* Compute LB. */
36418 emit_move_insn (t1, m1h1);
36419 ix86_emit_move_max (t1, lb);
36420 emit_move_insn (m1h1, t1);
36422 /* Compute UB. UB is stored in 1's complement form. Therefore
36423 we also use max here. */
36424 emit_move_insn (t1, m1h2);
36425 ix86_emit_move_max (t1, ub);
36426 emit_move_insn (m1h2, t1);
36428 op2 = gen_reg_rtx (BNDmode);
36429 emit_move_insn (op2, m1);
36431 return chkp_join_splitted_slot (lb, op2);
36434 case IX86_BUILTIN_BNDINT:
36436 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
36438 if (!target
36439 || GET_MODE (target) != BNDmode
36440 || !register_operand (target, BNDmode))
36441 target = gen_reg_rtx (BNDmode);
36443 arg0 = CALL_EXPR_ARG (exp, 0);
36444 arg1 = CALL_EXPR_ARG (exp, 1);
36446 op0 = expand_normal (arg0);
36447 op1 = expand_normal (arg1);
36449 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
36450 rh1 = adjust_address (res, Pmode, 0);
36451 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
36453 /* Put the first bounds into temporaries. */
36454 lb1 = gen_reg_rtx (Pmode);
36455 ub1 = gen_reg_rtx (Pmode);
36456 if (MEM_P (op0))
36458 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
36459 emit_move_insn (ub1, adjust_address (op0, Pmode,
36460 GET_MODE_SIZE (Pmode)));
36462 else
36464 emit_move_insn (res, op0);
36465 emit_move_insn (lb1, rh1);
36466 emit_move_insn (ub1, rh2);
36469 /* Put the second bounds into temporaries. */
36470 lb2 = gen_reg_rtx (Pmode);
36471 ub2 = gen_reg_rtx (Pmode);
36472 if (MEM_P (op1))
36474 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
36475 emit_move_insn (ub2, adjust_address (op1, Pmode,
36476 GET_MODE_SIZE (Pmode)));
36478 else
36480 emit_move_insn (res, op1);
36481 emit_move_insn (lb2, rh1);
36482 emit_move_insn (ub2, rh2);
36485 /* Compute LB. */
36486 ix86_emit_move_max (lb1, lb2);
36487 emit_move_insn (rh1, lb1);
36489 /* Compute UB. UB is stored in 1's complement form. Therefore
36490 we also use max here. */
36491 ix86_emit_move_max (ub1, ub2);
36492 emit_move_insn (rh2, ub1);
36494 emit_move_insn (target, res);
36496 return target;
36499 case IX86_BUILTIN_SIZEOF:
36501 tree name;
36502 rtx symbol;
36504 if (!target
36505 || GET_MODE (target) != Pmode
36506 || !register_operand (target, Pmode))
36507 target = gen_reg_rtx (Pmode);
36509 arg0 = CALL_EXPR_ARG (exp, 0);
36510 gcc_assert (VAR_P (arg0));
36512 name = DECL_ASSEMBLER_NAME (arg0);
36513 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
36515 emit_insn (Pmode == SImode
36516 ? gen_move_size_reloc_si (target, symbol)
36517 : gen_move_size_reloc_di (target, symbol));
36519 return target;
36522 case IX86_BUILTIN_BNDLOWER:
36524 rtx mem, hmem;
36526 if (!target
36527 || GET_MODE (target) != Pmode
36528 || !register_operand (target, Pmode))
36529 target = gen_reg_rtx (Pmode);
36531 arg0 = CALL_EXPR_ARG (exp, 0);
36532 op0 = expand_normal (arg0);
36534 /* We need to move bounds to memory first. */
36535 if (MEM_P (op0))
36536 mem = op0;
36537 else
36539 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36540 emit_move_insn (mem, op0);
36543 /* Generate mem expression to access LB and load it. */
36544 hmem = adjust_address (mem, Pmode, 0);
36545 emit_move_insn (target, hmem);
36547 return target;
36550 case IX86_BUILTIN_BNDUPPER:
36552 rtx mem, hmem, res;
36554 if (!target
36555 || GET_MODE (target) != Pmode
36556 || !register_operand (target, Pmode))
36557 target = gen_reg_rtx (Pmode);
36559 arg0 = CALL_EXPR_ARG (exp, 0);
36560 op0 = expand_normal (arg0);
36562 /* We need to move bounds to memory first. */
36563 if (MEM_P (op0))
36564 mem = op0;
36565 else
36567 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
36568 emit_move_insn (mem, op0);
36571 /* Generate mem expression to access UB. */
36572 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
36574 /* We need to invert all bits of UB. */
36575 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
36577 if (res != target)
36578 emit_move_insn (target, res);
36580 return target;
36583 case IX86_BUILTIN_MASKMOVQ:
36584 case IX86_BUILTIN_MASKMOVDQU:
36585 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36586 ? CODE_FOR_mmx_maskmovq
36587 : CODE_FOR_sse2_maskmovdqu);
36588 /* Note the arg order is different from the operand order. */
36589 arg1 = CALL_EXPR_ARG (exp, 0);
36590 arg2 = CALL_EXPR_ARG (exp, 1);
36591 arg0 = CALL_EXPR_ARG (exp, 2);
36592 op0 = expand_normal (arg0);
36593 op1 = expand_normal (arg1);
36594 op2 = expand_normal (arg2);
36595 mode0 = insn_data[icode].operand[0].mode;
36596 mode1 = insn_data[icode].operand[1].mode;
36597 mode2 = insn_data[icode].operand[2].mode;
36599 op0 = ix86_zero_extend_to_Pmode (op0);
36600 op0 = gen_rtx_MEM (mode1, op0);
36602 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36603 op0 = copy_to_mode_reg (mode0, op0);
36604 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36605 op1 = copy_to_mode_reg (mode1, op1);
36606 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36607 op2 = copy_to_mode_reg (mode2, op2);
36608 pat = GEN_FCN (icode) (op0, op1, op2);
36609 if (! pat)
36610 return 0;
36611 emit_insn (pat);
36612 return 0;
36614 case IX86_BUILTIN_LDMXCSR:
36615 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36616 target = assign_386_stack_local (SImode, SLOT_TEMP);
36617 emit_move_insn (target, op0);
36618 emit_insn (gen_sse_ldmxcsr (target));
36619 return 0;
36621 case IX86_BUILTIN_STMXCSR:
36622 target = assign_386_stack_local (SImode, SLOT_TEMP);
36623 emit_insn (gen_sse_stmxcsr (target));
36624 return copy_to_mode_reg (SImode, target);
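/* Illustrative mapping (a sketch; the wrappers live in xmmintrin.h):

     _mm_setcsr (w)  ==>  __builtin_ia32_ldmxcsr (w)   (LDMXCSR above)
     _mm_getcsr ()   ==>  __builtin_ia32_stmxcsr ()    (STMXCSR above)

   Both go through a SImode stack slot because the hardware instructions
   only take memory operands.  */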
36626 case IX86_BUILTIN_CLFLUSH:
36627 arg0 = CALL_EXPR_ARG (exp, 0);
36628 op0 = expand_normal (arg0);
36629 icode = CODE_FOR_sse2_clflush;
36630 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36631 op0 = ix86_zero_extend_to_Pmode (op0);
36633 emit_insn (gen_sse2_clflush (op0));
36634 return 0;
36636 case IX86_BUILTIN_CLWB:
36637 arg0 = CALL_EXPR_ARG (exp, 0);
36638 op0 = expand_normal (arg0);
36639 icode = CODE_FOR_clwb;
36640 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36641 op0 = ix86_zero_extend_to_Pmode (op0);
36643 emit_insn (gen_clwb (op0));
36644 return 0;
36646 case IX86_BUILTIN_CLFLUSHOPT:
36647 arg0 = CALL_EXPR_ARG (exp, 0);
36648 op0 = expand_normal (arg0);
36649 icode = CODE_FOR_clflushopt;
36650 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36651 op0 = ix86_zero_extend_to_Pmode (op0);
36653 emit_insn (gen_clflushopt (op0));
36654 return 0;
36656 case IX86_BUILTIN_MONITOR:
36657 case IX86_BUILTIN_MONITORX:
36658 arg0 = CALL_EXPR_ARG (exp, 0);
36659 arg1 = CALL_EXPR_ARG (exp, 1);
36660 arg2 = CALL_EXPR_ARG (exp, 2);
36661 op0 = expand_normal (arg0);
36662 op1 = expand_normal (arg1);
36663 op2 = expand_normal (arg2);
36664 if (!REG_P (op0))
36665 op0 = ix86_zero_extend_to_Pmode (op0);
36666 if (!REG_P (op1))
36667 op1 = copy_to_mode_reg (SImode, op1);
36668 if (!REG_P (op2))
36669 op2 = copy_to_mode_reg (SImode, op2);
36671 emit_insn (fcode == IX86_BUILTIN_MONITOR
36672 ? ix86_gen_monitor (op0, op1, op2)
36673 : ix86_gen_monitorx (op0, op1, op2));
36674 return 0;
36676 case IX86_BUILTIN_MWAIT:
36677 arg0 = CALL_EXPR_ARG (exp, 0);
36678 arg1 = CALL_EXPR_ARG (exp, 1);
36679 op0 = expand_normal (arg0);
36680 op1 = expand_normal (arg1);
36681 if (!REG_P (op0))
36682 op0 = copy_to_mode_reg (SImode, op0);
36683 if (!REG_P (op1))
36684 op1 = copy_to_mode_reg (SImode, op1);
36685 emit_insn (gen_sse3_mwait (op0, op1));
36686 return 0;
36688 case IX86_BUILTIN_MWAITX:
36689 arg0 = CALL_EXPR_ARG (exp, 0);
36690 arg1 = CALL_EXPR_ARG (exp, 1);
36691 arg2 = CALL_EXPR_ARG (exp, 2);
36692 op0 = expand_normal (arg0);
36693 op1 = expand_normal (arg1);
36694 op2 = expand_normal (arg2);
36695 if (!REG_P (op0))
36696 op0 = copy_to_mode_reg (SImode, op0);
36697 if (!REG_P (op1))
36698 op1 = copy_to_mode_reg (SImode, op1);
36699 if (!REG_P (op2))
36700 op2 = copy_to_mode_reg (SImode, op2);
36701 emit_insn (gen_mwaitx (op0, op1, op2));
36702 return 0;
36704 case IX86_BUILTIN_CLZERO:
36705 arg0 = CALL_EXPR_ARG (exp, 0);
36706 op0 = expand_normal (arg0);
36707 if (!REG_P (op0))
36708 op0 = ix86_zero_extend_to_Pmode (op0);
36709 emit_insn (ix86_gen_clzero (op0));
36710 return 0;
36712 case IX86_BUILTIN_VEC_INIT_V2SI:
36713 case IX86_BUILTIN_VEC_INIT_V4HI:
36714 case IX86_BUILTIN_VEC_INIT_V8QI:
36715 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
36717 case IX86_BUILTIN_VEC_EXT_V2DF:
36718 case IX86_BUILTIN_VEC_EXT_V2DI:
36719 case IX86_BUILTIN_VEC_EXT_V4SF:
36720 case IX86_BUILTIN_VEC_EXT_V4SI:
36721 case IX86_BUILTIN_VEC_EXT_V8HI:
36722 case IX86_BUILTIN_VEC_EXT_V2SI:
36723 case IX86_BUILTIN_VEC_EXT_V4HI:
36724 case IX86_BUILTIN_VEC_EXT_V16QI:
36725 return ix86_expand_vec_ext_builtin (exp, target);
36727 case IX86_BUILTIN_VEC_SET_V2DI:
36728 case IX86_BUILTIN_VEC_SET_V4SF:
36729 case IX86_BUILTIN_VEC_SET_V4SI:
36730 case IX86_BUILTIN_VEC_SET_V8HI:
36731 case IX86_BUILTIN_VEC_SET_V4HI:
36732 case IX86_BUILTIN_VEC_SET_V16QI:
36733 return ix86_expand_vec_set_builtin (exp);
36735 case IX86_BUILTIN_NANQ:
36736 case IX86_BUILTIN_NANSQ:
36737 return expand_call (exp, target, ignore);
36739 case IX86_BUILTIN_RDPMC:
36740 case IX86_BUILTIN_RDTSC:
36741 case IX86_BUILTIN_RDTSCP:
36742 case IX86_BUILTIN_XGETBV:
36744 op0 = gen_reg_rtx (DImode);
36745 op1 = gen_reg_rtx (DImode);
36747 if (fcode == IX86_BUILTIN_RDPMC)
36749 arg0 = CALL_EXPR_ARG (exp, 0);
36750 op2 = expand_normal (arg0);
36751 if (!register_operand (op2, SImode))
36752 op2 = copy_to_mode_reg (SImode, op2);
36754 insn = (TARGET_64BIT
36755 ? gen_rdpmc_rex64 (op0, op1, op2)
36756 : gen_rdpmc (op0, op2));
36757 emit_insn (insn);
36759 else if (fcode == IX86_BUILTIN_XGETBV)
36761 arg0 = CALL_EXPR_ARG (exp, 0);
36762 op2 = expand_normal (arg0);
36763 if (!register_operand (op2, SImode))
36764 op2 = copy_to_mode_reg (SImode, op2);
36766 insn = (TARGET_64BIT
36767 ? gen_xgetbv_rex64 (op0, op1, op2)
36768 : gen_xgetbv (op0, op2));
36769 emit_insn (insn);
36771 else if (fcode == IX86_BUILTIN_RDTSC)
36773 insn = (TARGET_64BIT
36774 ? gen_rdtsc_rex64 (op0, op1)
36775 : gen_rdtsc (op0));
36776 emit_insn (insn);
36778 else
36780 op2 = gen_reg_rtx (SImode);
36782 insn = (TARGET_64BIT
36783 ? gen_rdtscp_rex64 (op0, op1, op2)
36784 : gen_rdtscp (op0, op2));
36785 emit_insn (insn);
36787 arg0 = CALL_EXPR_ARG (exp, 0);
36788 op4 = expand_normal (arg0);
36789 if (!address_operand (op4, VOIDmode))
36791 op4 = convert_memory_address (Pmode, op4);
36792 op4 = copy_addr_to_reg (op4);
36794 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
36797 if (target == 0)
36799 /* mode is VOIDmode if __builtin_rd* has been called
36800 without an lhs. */
36801 if (mode == VOIDmode)
36802 return target;
36803 target = gen_reg_rtx (mode);
36806 if (TARGET_64BIT)
36808 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
36809 op1, 1, OPTAB_DIRECT);
36810 op0 = expand_simple_binop (DImode, IOR, op0, op1,
36811 op0, 1, OPTAB_DIRECT);
36814 emit_move_insn (target, op0);
36815 return target;
36817 case IX86_BUILTIN_FXSAVE:
36818 case IX86_BUILTIN_FXRSTOR:
36819 case IX86_BUILTIN_FXSAVE64:
36820 case IX86_BUILTIN_FXRSTOR64:
36821 case IX86_BUILTIN_FNSTENV:
36822 case IX86_BUILTIN_FLDENV:
36823 mode0 = BLKmode;
36824 switch (fcode)
36826 case IX86_BUILTIN_FXSAVE:
36827 icode = CODE_FOR_fxsave;
36828 break;
36829 case IX86_BUILTIN_FXRSTOR:
36830 icode = CODE_FOR_fxrstor;
36831 break;
36832 case IX86_BUILTIN_FXSAVE64:
36833 icode = CODE_FOR_fxsave64;
36834 break;
36835 case IX86_BUILTIN_FXRSTOR64:
36836 icode = CODE_FOR_fxrstor64;
36837 break;
36838 case IX86_BUILTIN_FNSTENV:
36839 icode = CODE_FOR_fnstenv;
36840 break;
36841 case IX86_BUILTIN_FLDENV:
36842 icode = CODE_FOR_fldenv;
36843 break;
36844 default:
36845 gcc_unreachable ();
36848 arg0 = CALL_EXPR_ARG (exp, 0);
36849 op0 = expand_normal (arg0);
36851 if (!address_operand (op0, VOIDmode))
36853 op0 = convert_memory_address (Pmode, op0);
36854 op0 = copy_addr_to_reg (op0);
36856 op0 = gen_rtx_MEM (mode0, op0);
36858 pat = GEN_FCN (icode) (op0);
36859 if (pat)
36860 emit_insn (pat);
36861 return 0;
36863 case IX86_BUILTIN_XSETBV:
36864 arg0 = CALL_EXPR_ARG (exp, 0);
36865 arg1 = CALL_EXPR_ARG (exp, 1);
36866 op0 = expand_normal (arg0);
36867 op1 = expand_normal (arg1);
36869 if (!REG_P (op0))
36870 op0 = copy_to_mode_reg (SImode, op0);
36872 if (TARGET_64BIT)
36874 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36875 NULL, 1, OPTAB_DIRECT);
36877 op2 = gen_lowpart (SImode, op2);
36878 op1 = gen_lowpart (SImode, op1);
36879 if (!REG_P (op1))
36880 op1 = copy_to_mode_reg (SImode, op1);
36881 if (!REG_P (op2))
36882 op2 = copy_to_mode_reg (SImode, op2);
36883 icode = CODE_FOR_xsetbv_rex64;
36884 pat = GEN_FCN (icode) (op0, op1, op2);
36886 else
36888 if (!REG_P (op1))
36889 op1 = copy_to_mode_reg (DImode, op1);
36890 icode = CODE_FOR_xsetbv;
36891 pat = GEN_FCN (icode) (op0, op1);
36893 if (pat)
36894 emit_insn (pat);
36895 return 0;
36897 case IX86_BUILTIN_XSAVE:
36898 case IX86_BUILTIN_XRSTOR:
36899 case IX86_BUILTIN_XSAVE64:
36900 case IX86_BUILTIN_XRSTOR64:
36901 case IX86_BUILTIN_XSAVEOPT:
36902 case IX86_BUILTIN_XSAVEOPT64:
36903 case IX86_BUILTIN_XSAVES:
36904 case IX86_BUILTIN_XRSTORS:
36905 case IX86_BUILTIN_XSAVES64:
36906 case IX86_BUILTIN_XRSTORS64:
36907 case IX86_BUILTIN_XSAVEC:
36908 case IX86_BUILTIN_XSAVEC64:
36909 arg0 = CALL_EXPR_ARG (exp, 0);
36910 arg1 = CALL_EXPR_ARG (exp, 1);
36911 op0 = expand_normal (arg0);
36912 op1 = expand_normal (arg1);
36914 if (!address_operand (op0, VOIDmode))
36916 op0 = convert_memory_address (Pmode, op0);
36917 op0 = copy_addr_to_reg (op0);
36919 op0 = gen_rtx_MEM (BLKmode, op0);
36921 op1 = force_reg (DImode, op1);
36923 if (TARGET_64BIT)
36925 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36926 NULL, 1, OPTAB_DIRECT);
36927 switch (fcode)
36929 case IX86_BUILTIN_XSAVE:
36930 icode = CODE_FOR_xsave_rex64;
36931 break;
36932 case IX86_BUILTIN_XRSTOR:
36933 icode = CODE_FOR_xrstor_rex64;
36934 break;
36935 case IX86_BUILTIN_XSAVE64:
36936 icode = CODE_FOR_xsave64;
36937 break;
36938 case IX86_BUILTIN_XRSTOR64:
36939 icode = CODE_FOR_xrstor64;
36940 break;
36941 case IX86_BUILTIN_XSAVEOPT:
36942 icode = CODE_FOR_xsaveopt_rex64;
36943 break;
36944 case IX86_BUILTIN_XSAVEOPT64:
36945 icode = CODE_FOR_xsaveopt64;
36946 break;
36947 case IX86_BUILTIN_XSAVES:
36948 icode = CODE_FOR_xsaves_rex64;
36949 break;
36950 case IX86_BUILTIN_XRSTORS:
36951 icode = CODE_FOR_xrstors_rex64;
36952 break;
36953 case IX86_BUILTIN_XSAVES64:
36954 icode = CODE_FOR_xsaves64;
36955 break;
36956 case IX86_BUILTIN_XRSTORS64:
36957 icode = CODE_FOR_xrstors64;
36958 break;
36959 case IX86_BUILTIN_XSAVEC:
36960 icode = CODE_FOR_xsavec_rex64;
36961 break;
36962 case IX86_BUILTIN_XSAVEC64:
36963 icode = CODE_FOR_xsavec64;
36964 break;
36965 default:
36966 gcc_unreachable ();
36969 op2 = gen_lowpart (SImode, op2);
36970 op1 = gen_lowpart (SImode, op1);
36971 pat = GEN_FCN (icode) (op0, op1, op2);
36973 else
36975 switch (fcode)
36977 case IX86_BUILTIN_XSAVE:
36978 icode = CODE_FOR_xsave;
36979 break;
36980 case IX86_BUILTIN_XRSTOR:
36981 icode = CODE_FOR_xrstor;
36982 break;
36983 case IX86_BUILTIN_XSAVEOPT:
36984 icode = CODE_FOR_xsaveopt;
36985 break;
36986 case IX86_BUILTIN_XSAVES:
36987 icode = CODE_FOR_xsaves;
36988 break;
36989 case IX86_BUILTIN_XRSTORS:
36990 icode = CODE_FOR_xrstors;
36991 break;
36992 case IX86_BUILTIN_XSAVEC:
36993 icode = CODE_FOR_xsavec;
36994 break;
36995 default:
36996 gcc_unreachable ();
36998 pat = GEN_FCN (icode) (op0, op1);
37001 if (pat)
37002 emit_insn (pat);
37003 return 0;
37005 case IX86_BUILTIN_LLWPCB:
37006 arg0 = CALL_EXPR_ARG (exp, 0);
37007 op0 = expand_normal (arg0);
37008 icode = CODE_FOR_lwp_llwpcb;
37009 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37010 op0 = ix86_zero_extend_to_Pmode (op0);
37011 emit_insn (gen_lwp_llwpcb (op0));
37012 return 0;
37014 case IX86_BUILTIN_SLWPCB:
37015 icode = CODE_FOR_lwp_slwpcb;
37016 if (!target
37017 || !insn_data[icode].operand[0].predicate (target, Pmode))
37018 target = gen_reg_rtx (Pmode);
37019 emit_insn (gen_lwp_slwpcb (target));
37020 return target;
37022 case IX86_BUILTIN_BEXTRI32:
37023 case IX86_BUILTIN_BEXTRI64:
37024 arg0 = CALL_EXPR_ARG (exp, 0);
37025 arg1 = CALL_EXPR_ARG (exp, 1);
37026 op0 = expand_normal (arg0);
37027 op1 = expand_normal (arg1);
37028 icode = (fcode == IX86_BUILTIN_BEXTRI32
37029 ? CODE_FOR_tbm_bextri_si
37030 : CODE_FOR_tbm_bextri_di);
37031 if (!CONST_INT_P (op1))
37033 error ("last argument must be an immediate");
37034 return const0_rtx;
37036 else
37038 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37039 unsigned char lsb_index = INTVAL (op1) & 0xFF;
37040 op1 = GEN_INT (length);
37041 op2 = GEN_INT (lsb_index);
37042 pat = GEN_FCN (icode) (target, op0, op1, op2);
37043 if (pat)
37044 emit_insn (pat);
37045 return target;
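/* Illustrative use (a sketch; the tbmintrin.h wrapper __bextri_u32 is a
   thin forwarder):

     unsigned int f = __builtin_ia32_bextri_u32 (x, 0x0804);

   0x0804 encodes length 8 in bits 15:8 and start bit 4 in bits 7:0,
   matching the decoding of op1 just above.  */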
37048 case IX86_BUILTIN_RDRAND16_STEP:
37049 icode = CODE_FOR_rdrandhi_1;
37050 mode0 = HImode;
37051 goto rdrand_step;
37053 case IX86_BUILTIN_RDRAND32_STEP:
37054 icode = CODE_FOR_rdrandsi_1;
37055 mode0 = SImode;
37056 goto rdrand_step;
37058 case IX86_BUILTIN_RDRAND64_STEP:
37059 icode = CODE_FOR_rdranddi_1;
37060 mode0 = DImode;
37062 rdrand_step:
37063 arg0 = CALL_EXPR_ARG (exp, 0);
37064 op1 = expand_normal (arg0);
37065 if (!address_operand (op1, VOIDmode))
37067 op1 = convert_memory_address (Pmode, op1);
37068 op1 = copy_addr_to_reg (op1);
37071 op0 = gen_reg_rtx (mode0);
37072 emit_insn (GEN_FCN (icode) (op0));
37074 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37076 op1 = gen_reg_rtx (SImode);
37077 emit_move_insn (op1, CONST1_RTX (SImode));
37079 /* Emit SImode conditional move. */
37080 if (mode0 == HImode)
37082 if (TARGET_ZERO_EXTEND_WITH_AND
37083 && optimize_function_for_speed_p (cfun))
37085 op2 = force_reg (SImode, const0_rtx);
37087 emit_insn (gen_movstricthi
37088 (gen_lowpart (HImode, op2), op0));
37090 else
37092 op2 = gen_reg_rtx (SImode);
37094 emit_insn (gen_zero_extendhisi2 (op2, op0));
37097 else if (mode0 == SImode)
37098 op2 = op0;
37099 else
37100 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37102 if (target == 0
37103 || !register_operand (target, SImode))
37104 target = gen_reg_rtx (SImode);
37106 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37107 const0_rtx);
37108 emit_insn (gen_rtx_SET (target,
37109 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37110 return target;
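/* Typical use of the *_STEP builtins handled above (a sketch; the
   immintrin.h wrapper _rdrand32_step is a thin forwarder):

     unsigned int val;
     if (__builtin_ia32_rdrand32_step (&val))
       { /* use val */ }

   The random value is stored through the pointer, and a nonzero value
   is returned exactly when the hardware reported success (CF set).  */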
37112 case IX86_BUILTIN_RDSEED16_STEP:
37113 icode = CODE_FOR_rdseedhi_1;
37114 mode0 = HImode;
37115 goto rdseed_step;
37117 case IX86_BUILTIN_RDSEED32_STEP:
37118 icode = CODE_FOR_rdseedsi_1;
37119 mode0 = SImode;
37120 goto rdseed_step;
37122 case IX86_BUILTIN_RDSEED64_STEP:
37123 icode = CODE_FOR_rdseeddi_1;
37124 mode0 = DImode;
37126 rdseed_step:
37127 arg0 = CALL_EXPR_ARG (exp, 0);
37128 op1 = expand_normal (arg0);
37129 if (!address_operand (op1, VOIDmode))
37131 op1 = convert_memory_address (Pmode, op1);
37132 op1 = copy_addr_to_reg (op1);
37135 op0 = gen_reg_rtx (mode0);
37136 emit_insn (GEN_FCN (icode) (op0));
37138 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37140 op2 = gen_reg_rtx (QImode);
37142 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37143 const0_rtx);
37144 emit_insn (gen_rtx_SET (op2, pat));
37146 if (target == 0
37147 || !register_operand (target, SImode))
37148 target = gen_reg_rtx (SImode);
37150 emit_insn (gen_zero_extendqisi2 (target, op2));
37151 return target;
37153 case IX86_BUILTIN_SBB32:
37154 icode = CODE_FOR_subborrowsi;
37155 icode2 = CODE_FOR_subborrowsi_0;
37156 mode0 = SImode;
37157 mode1 = DImode;
37158 mode2 = CCmode;
37159 goto handlecarry;
37161 case IX86_BUILTIN_SBB64:
37162 icode = CODE_FOR_subborrowdi;
37163 icode2 = CODE_FOR_subborrowdi_0;
37164 mode0 = DImode;
37165 mode1 = TImode;
37166 mode2 = CCmode;
37167 goto handlecarry;
37169 case IX86_BUILTIN_ADDCARRYX32:
37170 icode = CODE_FOR_addcarrysi;
37171 icode2 = CODE_FOR_addcarrysi_0;
37172 mode0 = SImode;
37173 mode1 = DImode;
37174 mode2 = CCCmode;
37175 goto handlecarry;
37177 case IX86_BUILTIN_ADDCARRYX64:
37178 icode = CODE_FOR_addcarrydi;
37179 icode2 = CODE_FOR_addcarrydi_0;
37180 mode0 = DImode;
37181 mode1 = TImode;
37182 mode2 = CCCmode;
37184 handlecarry:
37185 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37186 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37187 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37188 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
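/* Illustrative mapping (a sketch; the adxintrin.h wrapper _addcarryx_u32
   is a thin forwarder):

     unsigned int sum;
     unsigned char c_out = __builtin_ia32_addcarryx_u32 (c_in, a, b, &sum);

   c_in is arg0, a and b are arg1/arg2, and &sum is the arg3 address the
   result is stored through below.  */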
37190 op1 = expand_normal (arg0);
37191 if (!integer_zerop (arg0))
37192 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37194 op2 = expand_normal (arg1);
37195 if (!register_operand (op2, mode0))
37196 op2 = copy_to_mode_reg (mode0, op2);
37198 op3 = expand_normal (arg2);
37199 if (!register_operand (op3, mode0))
37200 op3 = copy_to_mode_reg (mode0, op3);
37202 op4 = expand_normal (arg3);
37203 if (!address_operand (op4, VOIDmode))
37205 op4 = convert_memory_address (Pmode, op4);
37206 op4 = copy_addr_to_reg (op4);
37209 op0 = gen_reg_rtx (mode0);
37210 if (integer_zerop (arg0))
37212 /* If arg0 is 0, optimize right away into an add or sub
37213 instruction that sets CCCmode flags. */
37214 op1 = gen_rtx_REG (mode2, FLAGS_REG);
37215 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
37217 else
37219 /* Generate CF from input operand. */
37220 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37222 /* Generate instruction that consumes CF. */
37223 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37224 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
37225 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
37226 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
37229 /* Return current CF value. */
37230 if (target == 0)
37231 target = gen_reg_rtx (QImode);
37233 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
37234 emit_insn (gen_rtx_SET (target, pat));
37236 /* Store the result. */
37237 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37239 return target;
37241 case IX86_BUILTIN_READ_FLAGS:
37242 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37244 if (optimize
37245 || target == NULL_RTX
37246 || !nonimmediate_operand (target, word_mode)
37247 || GET_MODE (target) != word_mode)
37248 target = gen_reg_rtx (word_mode);
37250 emit_insn (gen_pop (target));
37251 return target;
37253 case IX86_BUILTIN_WRITE_FLAGS:
37255 arg0 = CALL_EXPR_ARG (exp, 0);
37256 op0 = expand_normal (arg0);
37257 if (!general_no_elim_operand (op0, word_mode))
37258 op0 = copy_to_mode_reg (word_mode, op0);
37260 emit_insn (gen_push (op0));
37261 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37262 return 0;
37264 case IX86_BUILTIN_KTESTC8:
37265 icode = CODE_FOR_ktestqi;
37266 mode3 = CCCmode;
37267 goto kortest;
37269 case IX86_BUILTIN_KTESTZ8:
37270 icode = CODE_FOR_ktestqi;
37271 mode3 = CCZmode;
37272 goto kortest;
37274 case IX86_BUILTIN_KTESTC16:
37275 icode = CODE_FOR_ktesthi;
37276 mode3 = CCCmode;
37277 goto kortest;
37279 case IX86_BUILTIN_KTESTZ16:
37280 icode = CODE_FOR_ktesthi;
37281 mode3 = CCZmode;
37282 goto kortest;
37284 case IX86_BUILTIN_KTESTC32:
37285 icode = CODE_FOR_ktestsi;
37286 mode3 = CCCmode;
37287 goto kortest;
37289 case IX86_BUILTIN_KTESTZ32:
37290 icode = CODE_FOR_ktestsi;
37291 mode3 = CCZmode;
37292 goto kortest;
37294 case IX86_BUILTIN_KTESTC64:
37295 icode = CODE_FOR_ktestdi;
37296 mode3 = CCCmode;
37297 goto kortest;
37299 case IX86_BUILTIN_KTESTZ64:
37300 icode = CODE_FOR_ktestdi;
37301 mode3 = CCZmode;
37302 goto kortest;
37304 case IX86_BUILTIN_KORTESTC8:
37305 icode = CODE_FOR_kortestqi;
37306 mode3 = CCCmode;
37307 goto kortest;
37309 case IX86_BUILTIN_KORTESTZ8:
37310 icode = CODE_FOR_kortestqi;
37311 mode3 = CCZmode;
37312 goto kortest;
37314 case IX86_BUILTIN_KORTESTC16:
37315 icode = CODE_FOR_kortesthi;
37316 mode3 = CCCmode;
37317 goto kortest;
37319 case IX86_BUILTIN_KORTESTZ16:
37320 icode = CODE_FOR_kortesthi;
37321 mode3 = CCZmode;
37322 goto kortest;
37324 case IX86_BUILTIN_KORTESTC32:
37325 icode = CODE_FOR_kortestsi;
37326 mode3 = CCCmode;
37327 goto kortest;
37329 case IX86_BUILTIN_KORTESTZ32:
37330 icode = CODE_FOR_kortestsi;
37331 mode3 = CCZmode;
37332 goto kortest;
37334 case IX86_BUILTIN_KORTESTC64:
37335 icode = CODE_FOR_kortestdi;
37336 mode3 = CCCmode;
37337 goto kortest;
37339 case IX86_BUILTIN_KORTESTZ64:
37340 icode = CODE_FOR_kortestdi;
37341 mode3 = CCZmode;
37343 kortest:
37344 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37345 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37346 op0 = expand_normal (arg0);
37347 op1 = expand_normal (arg1);
37349 mode0 = insn_data[icode].operand[0].mode;
37350 mode1 = insn_data[icode].operand[1].mode;
37352 if (GET_MODE (op0) != VOIDmode)
37353 op0 = force_reg (GET_MODE (op0), op0);
37355 op0 = gen_lowpart (mode0, op0);
37357 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37358 op0 = copy_to_mode_reg (mode0, op0);
37360 if (GET_MODE (op1) != VOIDmode)
37361 op1 = force_reg (GET_MODE (op1), op1);
37363 op1 = gen_lowpart (mode1, op1);
37365 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37366 op1 = copy_to_mode_reg (mode1, op1);
37368 target = gen_reg_rtx (QImode);
37370 /* Emit kortest. */
37371 emit_insn (GEN_FCN (icode) (op0, op1));
37372 /* And use setcc to return result from flags. */
37373 ix86_expand_setcc (target, EQ,
37374 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
37375 return target;
37377 case IX86_BUILTIN_GATHERSIV2DF:
37378 icode = CODE_FOR_avx2_gathersiv2df;
37379 goto gather_gen;
37380 case IX86_BUILTIN_GATHERSIV4DF:
37381 icode = CODE_FOR_avx2_gathersiv4df;
37382 goto gather_gen;
37383 case IX86_BUILTIN_GATHERDIV2DF:
37384 icode = CODE_FOR_avx2_gatherdiv2df;
37385 goto gather_gen;
37386 case IX86_BUILTIN_GATHERDIV4DF:
37387 icode = CODE_FOR_avx2_gatherdiv4df;
37388 goto gather_gen;
37389 case IX86_BUILTIN_GATHERSIV4SF:
37390 icode = CODE_FOR_avx2_gathersiv4sf;
37391 goto gather_gen;
37392 case IX86_BUILTIN_GATHERSIV8SF:
37393 icode = CODE_FOR_avx2_gathersiv8sf;
37394 goto gather_gen;
37395 case IX86_BUILTIN_GATHERDIV4SF:
37396 icode = CODE_FOR_avx2_gatherdiv4sf;
37397 goto gather_gen;
37398 case IX86_BUILTIN_GATHERDIV8SF:
37399 icode = CODE_FOR_avx2_gatherdiv8sf;
37400 goto gather_gen;
37401 case IX86_BUILTIN_GATHERSIV2DI:
37402 icode = CODE_FOR_avx2_gathersiv2di;
37403 goto gather_gen;
37404 case IX86_BUILTIN_GATHERSIV4DI:
37405 icode = CODE_FOR_avx2_gathersiv4di;
37406 goto gather_gen;
37407 case IX86_BUILTIN_GATHERDIV2DI:
37408 icode = CODE_FOR_avx2_gatherdiv2di;
37409 goto gather_gen;
37410 case IX86_BUILTIN_GATHERDIV4DI:
37411 icode = CODE_FOR_avx2_gatherdiv4di;
37412 goto gather_gen;
37413 case IX86_BUILTIN_GATHERSIV4SI:
37414 icode = CODE_FOR_avx2_gathersiv4si;
37415 goto gather_gen;
37416 case IX86_BUILTIN_GATHERSIV8SI:
37417 icode = CODE_FOR_avx2_gathersiv8si;
37418 goto gather_gen;
37419 case IX86_BUILTIN_GATHERDIV4SI:
37420 icode = CODE_FOR_avx2_gatherdiv4si;
37421 goto gather_gen;
37422 case IX86_BUILTIN_GATHERDIV8SI:
37423 icode = CODE_FOR_avx2_gatherdiv8si;
37424 goto gather_gen;
37425 case IX86_BUILTIN_GATHERALTSIV4DF:
37426 icode = CODE_FOR_avx2_gathersiv4df;
37427 goto gather_gen;
37428 case IX86_BUILTIN_GATHERALTDIV8SF:
37429 icode = CODE_FOR_avx2_gatherdiv8sf;
37430 goto gather_gen;
37431 case IX86_BUILTIN_GATHERALTSIV4DI:
37432 icode = CODE_FOR_avx2_gathersiv4di;
37433 goto gather_gen;
37434 case IX86_BUILTIN_GATHERALTDIV8SI:
37435 icode = CODE_FOR_avx2_gatherdiv8si;
37436 goto gather_gen;
37437 case IX86_BUILTIN_GATHER3SIV16SF:
37438 icode = CODE_FOR_avx512f_gathersiv16sf;
37439 goto gather_gen;
37440 case IX86_BUILTIN_GATHER3SIV8DF:
37441 icode = CODE_FOR_avx512f_gathersiv8df;
37442 goto gather_gen;
37443 case IX86_BUILTIN_GATHER3DIV16SF:
37444 icode = CODE_FOR_avx512f_gatherdiv16sf;
37445 goto gather_gen;
37446 case IX86_BUILTIN_GATHER3DIV8DF:
37447 icode = CODE_FOR_avx512f_gatherdiv8df;
37448 goto gather_gen;
37449 case IX86_BUILTIN_GATHER3SIV16SI:
37450 icode = CODE_FOR_avx512f_gathersiv16si;
37451 goto gather_gen;
37452 case IX86_BUILTIN_GATHER3SIV8DI:
37453 icode = CODE_FOR_avx512f_gathersiv8di;
37454 goto gather_gen;
37455 case IX86_BUILTIN_GATHER3DIV16SI:
37456 icode = CODE_FOR_avx512f_gatherdiv16si;
37457 goto gather_gen;
37458 case IX86_BUILTIN_GATHER3DIV8DI:
37459 icode = CODE_FOR_avx512f_gatherdiv8di;
37460 goto gather_gen;
37461 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37462 icode = CODE_FOR_avx512f_gathersiv8df;
37463 goto gather_gen;
37464 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37465 icode = CODE_FOR_avx512f_gatherdiv16sf;
37466 goto gather_gen;
37467 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37468 icode = CODE_FOR_avx512f_gathersiv8di;
37469 goto gather_gen;
37470 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37471 icode = CODE_FOR_avx512f_gatherdiv16si;
37472 goto gather_gen;
37473 case IX86_BUILTIN_GATHER3SIV2DF:
37474 icode = CODE_FOR_avx512vl_gathersiv2df;
37475 goto gather_gen;
37476 case IX86_BUILTIN_GATHER3SIV4DF:
37477 icode = CODE_FOR_avx512vl_gathersiv4df;
37478 goto gather_gen;
37479 case IX86_BUILTIN_GATHER3DIV2DF:
37480 icode = CODE_FOR_avx512vl_gatherdiv2df;
37481 goto gather_gen;
37482 case IX86_BUILTIN_GATHER3DIV4DF:
37483 icode = CODE_FOR_avx512vl_gatherdiv4df;
37484 goto gather_gen;
37485 case IX86_BUILTIN_GATHER3SIV4SF:
37486 icode = CODE_FOR_avx512vl_gathersiv4sf;
37487 goto gather_gen;
37488 case IX86_BUILTIN_GATHER3SIV8SF:
37489 icode = CODE_FOR_avx512vl_gathersiv8sf;
37490 goto gather_gen;
37491 case IX86_BUILTIN_GATHER3DIV4SF:
37492 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37493 goto gather_gen;
37494 case IX86_BUILTIN_GATHER3DIV8SF:
37495 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37496 goto gather_gen;
37497 case IX86_BUILTIN_GATHER3SIV2DI:
37498 icode = CODE_FOR_avx512vl_gathersiv2di;
37499 goto gather_gen;
37500 case IX86_BUILTIN_GATHER3SIV4DI:
37501 icode = CODE_FOR_avx512vl_gathersiv4di;
37502 goto gather_gen;
37503 case IX86_BUILTIN_GATHER3DIV2DI:
37504 icode = CODE_FOR_avx512vl_gatherdiv2di;
37505 goto gather_gen;
37506 case IX86_BUILTIN_GATHER3DIV4DI:
37507 icode = CODE_FOR_avx512vl_gatherdiv4di;
37508 goto gather_gen;
37509 case IX86_BUILTIN_GATHER3SIV4SI:
37510 icode = CODE_FOR_avx512vl_gathersiv4si;
37511 goto gather_gen;
37512 case IX86_BUILTIN_GATHER3SIV8SI:
37513 icode = CODE_FOR_avx512vl_gathersiv8si;
37514 goto gather_gen;
37515 case IX86_BUILTIN_GATHER3DIV4SI:
37516 icode = CODE_FOR_avx512vl_gatherdiv4si;
37517 goto gather_gen;
37518 case IX86_BUILTIN_GATHER3DIV8SI:
37519 icode = CODE_FOR_avx512vl_gatherdiv8si;
37520 goto gather_gen;
37521 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37522 icode = CODE_FOR_avx512vl_gathersiv4df;
37523 goto gather_gen;
37524 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37525 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37526 goto gather_gen;
37527 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37528 icode = CODE_FOR_avx512vl_gathersiv4di;
37529 goto gather_gen;
37530 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37531 icode = CODE_FOR_avx512vl_gatherdiv8si;
37532 goto gather_gen;
37533 case IX86_BUILTIN_SCATTERSIV16SF:
37534 icode = CODE_FOR_avx512f_scattersiv16sf;
37535 goto scatter_gen;
37536 case IX86_BUILTIN_SCATTERSIV8DF:
37537 icode = CODE_FOR_avx512f_scattersiv8df;
37538 goto scatter_gen;
37539 case IX86_BUILTIN_SCATTERDIV16SF:
37540 icode = CODE_FOR_avx512f_scatterdiv16sf;
37541 goto scatter_gen;
37542 case IX86_BUILTIN_SCATTERDIV8DF:
37543 icode = CODE_FOR_avx512f_scatterdiv8df;
37544 goto scatter_gen;
37545 case IX86_BUILTIN_SCATTERSIV16SI:
37546 icode = CODE_FOR_avx512f_scattersiv16si;
37547 goto scatter_gen;
37548 case IX86_BUILTIN_SCATTERSIV8DI:
37549 icode = CODE_FOR_avx512f_scattersiv8di;
37550 goto scatter_gen;
37551 case IX86_BUILTIN_SCATTERDIV16SI:
37552 icode = CODE_FOR_avx512f_scatterdiv16si;
37553 goto scatter_gen;
37554 case IX86_BUILTIN_SCATTERDIV8DI:
37555 icode = CODE_FOR_avx512f_scatterdiv8di;
37556 goto scatter_gen;
37557 case IX86_BUILTIN_SCATTERSIV8SF:
37558 icode = CODE_FOR_avx512vl_scattersiv8sf;
37559 goto scatter_gen;
37560 case IX86_BUILTIN_SCATTERSIV4SF:
37561 icode = CODE_FOR_avx512vl_scattersiv4sf;
37562 goto scatter_gen;
37563 case IX86_BUILTIN_SCATTERSIV4DF:
37564 icode = CODE_FOR_avx512vl_scattersiv4df;
37565 goto scatter_gen;
37566 case IX86_BUILTIN_SCATTERSIV2DF:
37567 icode = CODE_FOR_avx512vl_scattersiv2df;
37568 goto scatter_gen;
37569 case IX86_BUILTIN_SCATTERDIV8SF:
37570 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37571 goto scatter_gen;
37572 case IX86_BUILTIN_SCATTERDIV4SF:
37573 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37574 goto scatter_gen;
37575 case IX86_BUILTIN_SCATTERDIV4DF:
37576 icode = CODE_FOR_avx512vl_scatterdiv4df;
37577 goto scatter_gen;
37578 case IX86_BUILTIN_SCATTERDIV2DF:
37579 icode = CODE_FOR_avx512vl_scatterdiv2df;
37580 goto scatter_gen;
37581 case IX86_BUILTIN_SCATTERSIV8SI:
37582 icode = CODE_FOR_avx512vl_scattersiv8si;
37583 goto scatter_gen;
37584 case IX86_BUILTIN_SCATTERSIV4SI:
37585 icode = CODE_FOR_avx512vl_scattersiv4si;
37586 goto scatter_gen;
37587 case IX86_BUILTIN_SCATTERSIV4DI:
37588 icode = CODE_FOR_avx512vl_scattersiv4di;
37589 goto scatter_gen;
37590 case IX86_BUILTIN_SCATTERSIV2DI:
37591 icode = CODE_FOR_avx512vl_scattersiv2di;
37592 goto scatter_gen;
37593 case IX86_BUILTIN_SCATTERDIV8SI:
37594 icode = CODE_FOR_avx512vl_scatterdiv8si;
37595 goto scatter_gen;
37596 case IX86_BUILTIN_SCATTERDIV4SI:
37597 icode = CODE_FOR_avx512vl_scatterdiv4si;
37598 goto scatter_gen;
37599 case IX86_BUILTIN_SCATTERDIV4DI:
37600 icode = CODE_FOR_avx512vl_scatterdiv4di;
37601 goto scatter_gen;
37602 case IX86_BUILTIN_SCATTERDIV2DI:
37603 icode = CODE_FOR_avx512vl_scatterdiv2di;
37604 goto scatter_gen;
37605 case IX86_BUILTIN_GATHERPFDPD:
37606 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37607 goto vec_prefetch_gen;
37608 case IX86_BUILTIN_SCATTERALTSIV8DF:
37609 icode = CODE_FOR_avx512f_scattersiv8df;
37610 goto scatter_gen;
37611 case IX86_BUILTIN_SCATTERALTDIV16SF:
37612 icode = CODE_FOR_avx512f_scatterdiv16sf;
37613 goto scatter_gen;
37614 case IX86_BUILTIN_SCATTERALTSIV8DI:
37615 icode = CODE_FOR_avx512f_scattersiv8di;
37616 goto scatter_gen;
37617 case IX86_BUILTIN_SCATTERALTDIV16SI:
37618 icode = CODE_FOR_avx512f_scatterdiv16si;
37619 goto scatter_gen;
37620 case IX86_BUILTIN_GATHERPFDPS:
37621 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
37622 goto vec_prefetch_gen;
37623 case IX86_BUILTIN_GATHERPFQPD:
37624 icode = CODE_FOR_avx512pf_gatherpfv8didf;
37625 goto vec_prefetch_gen;
37626 case IX86_BUILTIN_GATHERPFQPS:
37627 icode = CODE_FOR_avx512pf_gatherpfv8disf;
37628 goto vec_prefetch_gen;
37629 case IX86_BUILTIN_SCATTERPFDPD:
37630 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
37631 goto vec_prefetch_gen;
37632 case IX86_BUILTIN_SCATTERPFDPS:
37633 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
37634 goto vec_prefetch_gen;
37635 case IX86_BUILTIN_SCATTERPFQPD:
37636 icode = CODE_FOR_avx512pf_scatterpfv8didf;
37637 goto vec_prefetch_gen;
37638 case IX86_BUILTIN_SCATTERPFQPS:
37639 icode = CODE_FOR_avx512pf_scatterpfv8disf;
37640 goto vec_prefetch_gen;
37642 gather_gen:
37643 rtx half;
37644 rtx (*gen) (rtx, rtx);
37646 arg0 = CALL_EXPR_ARG (exp, 0);
37647 arg1 = CALL_EXPR_ARG (exp, 1);
37648 arg2 = CALL_EXPR_ARG (exp, 2);
37649 arg3 = CALL_EXPR_ARG (exp, 3);
37650 arg4 = CALL_EXPR_ARG (exp, 4);
37651 op0 = expand_normal (arg0);
37652 op1 = expand_normal (arg1);
37653 op2 = expand_normal (arg2);
37654 op3 = expand_normal (arg3);
37655 op4 = expand_normal (arg4);
37656 /* Note the arg order is different from the operand order. */
37657 mode0 = insn_data[icode].operand[1].mode;
37658 mode2 = insn_data[icode].operand[3].mode;
37659 mode3 = insn_data[icode].operand[4].mode;
37660 mode4 = insn_data[icode].operand[5].mode;
37662 if (target == NULL_RTX
37663 || GET_MODE (target) != insn_data[icode].operand[0].mode
37664 || !insn_data[icode].operand[0].predicate (target,
37665 GET_MODE (target)))
37666 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
37667 else
37668 subtarget = target;
37670 switch (fcode)
37672 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37673 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37674 half = gen_reg_rtx (V8SImode);
37675 if (!nonimmediate_operand (op2, V16SImode))
37676 op2 = copy_to_mode_reg (V16SImode, op2);
37677 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37678 op2 = half;
37679 break;
37680 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37681 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37682 case IX86_BUILTIN_GATHERALTSIV4DF:
37683 case IX86_BUILTIN_GATHERALTSIV4DI:
37684 half = gen_reg_rtx (V4SImode);
37685 if (!nonimmediate_operand (op2, V8SImode))
37686 op2 = copy_to_mode_reg (V8SImode, op2);
37687 emit_insn (gen_vec_extract_lo_v8si (half, op2));
37688 op2 = half;
37689 break;
37690 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37691 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37692 half = gen_reg_rtx (mode0);
37693 if (mode0 == V8SFmode)
37694 gen = gen_vec_extract_lo_v16sf;
37695 else
37696 gen = gen_vec_extract_lo_v16si;
37697 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37698 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37699 emit_insn (gen (half, op0));
37700 op0 = half;
37701 if (GET_MODE (op3) != VOIDmode)
37703 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37704 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37705 emit_insn (gen (half, op3));
37706 op3 = half;
37708 break;
37709 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37710 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37711 case IX86_BUILTIN_GATHERALTDIV8SF:
37712 case IX86_BUILTIN_GATHERALTDIV8SI:
37713 half = gen_reg_rtx (mode0);
37714 if (mode0 == V4SFmode)
37715 gen = gen_vec_extract_lo_v8sf;
37716 else
37717 gen = gen_vec_extract_lo_v8si;
37718 if (!nonimmediate_operand (op0, GET_MODE (op0)))
37719 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
37720 emit_insn (gen (half, op0));
37721 op0 = half;
37722 if (GET_MODE (op3) != VOIDmode)
37724 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37725 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37726 emit_insn (gen (half, op3));
37727 op3 = half;
37729 break;
37730 default:
37731 break;
37734 /* Force the memory operand to use only a base register here. We
37735 don't want to do that for the memory operands of other builtin
37736 functions. */
37737 op1 = ix86_zero_extend_to_Pmode (op1);
37739 if (!insn_data[icode].operand[1].predicate (op0, mode0))
37740 op0 = copy_to_mode_reg (mode0, op0);
37741 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
37742 op1 = copy_to_mode_reg (Pmode, op1);
37743 if (!insn_data[icode].operand[3].predicate (op2, mode2))
37744 op2 = copy_to_mode_reg (mode2, op2);
37746 op3 = fixup_modeless_constant (op3, mode3);
37748 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
37750 if (!insn_data[icode].operand[4].predicate (op3, mode3))
37751 op3 = copy_to_mode_reg (mode3, op3);
37753 else
37755 op3 = copy_to_reg (op3);
37756 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
37758 if (!insn_data[icode].operand[5].predicate (op4, mode4))
37760 error ("the last argument must be scale 1, 2, 4, 8");
37761 return const0_rtx;
37764 /* Optimize. If mask is known to have all high bits set,
37765 replace op0 with pc_rtx to signal that the instruction
37766 overwrites the whole destination and doesn't use its
37767 previous contents. */
37768 if (optimize)
37770 if (TREE_CODE (arg3) == INTEGER_CST)
37772 if (integer_all_onesp (arg3))
37773 op0 = pc_rtx;
37775 else if (TREE_CODE (arg3) == VECTOR_CST)
37777 unsigned int negative = 0;
37778 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
37780 tree cst = VECTOR_CST_ELT (arg3, i);
37781 if (TREE_CODE (cst) == INTEGER_CST
37782 && tree_int_cst_sign_bit (cst))
37783 negative++;
37784 else if (TREE_CODE (cst) == REAL_CST
37785 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
37786 negative++;
37788 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
37789 op0 = pc_rtx;
37791 else if (TREE_CODE (arg3) == SSA_NAME
37792 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
37794 /* Recognize also when mask is like:
37795 __v2df src = _mm_setzero_pd ();
37796 __v2df mask = _mm_cmpeq_pd (src, src);
37798 __v8sf src = _mm256_setzero_ps ();
37799 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
37800 as that is a cheaper way to load all ones into
37801 a register than having to load a constant from
37802 memory. */
37803 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
37804 if (is_gimple_call (def_stmt))
37806 tree fndecl = gimple_call_fndecl (def_stmt);
37807 if (fndecl
37808 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
37809 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
37811 case IX86_BUILTIN_CMPPD:
37812 case IX86_BUILTIN_CMPPS:
37813 case IX86_BUILTIN_CMPPD256:
37814 case IX86_BUILTIN_CMPPS256:
37815 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
37816 break;
37817 /* FALLTHRU */
37818 case IX86_BUILTIN_CMPEQPD:
37819 case IX86_BUILTIN_CMPEQPS:
37820 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
37821 && initializer_zerop (gimple_call_arg (def_stmt,
37822 1)))
37823 op0 = pc_rtx;
37824 break;
37825 default:
37826 break;
37832 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
37833 if (! pat)
37834 return const0_rtx;
37835 emit_insn (pat);
37837 switch (fcode)
37839 case IX86_BUILTIN_GATHER3DIV16SF:
37840 if (target == NULL_RTX)
37841 target = gen_reg_rtx (V8SFmode);
37842 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
37843 break;
37844 case IX86_BUILTIN_GATHER3DIV16SI:
37845 if (target == NULL_RTX)
37846 target = gen_reg_rtx (V8SImode);
37847 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
37848 break;
37849 case IX86_BUILTIN_GATHER3DIV8SF:
37850 case IX86_BUILTIN_GATHERDIV8SF:
37851 if (target == NULL_RTX)
37852 target = gen_reg_rtx (V4SFmode);
37853 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
37854 break;
37855 case IX86_BUILTIN_GATHER3DIV8SI:
37856 case IX86_BUILTIN_GATHERDIV8SI:
37857 if (target == NULL_RTX)
37858 target = gen_reg_rtx (V4SImode);
37859 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
37860 break;
37861 default:
37862 target = subtarget;
37863 break;
37865 return target;
37867 scatter_gen:
37868 arg0 = CALL_EXPR_ARG (exp, 0);
37869 arg1 = CALL_EXPR_ARG (exp, 1);
37870 arg2 = CALL_EXPR_ARG (exp, 2);
37871 arg3 = CALL_EXPR_ARG (exp, 3);
37872 arg4 = CALL_EXPR_ARG (exp, 4);
37873 op0 = expand_normal (arg0);
37874 op1 = expand_normal (arg1);
37875 op2 = expand_normal (arg2);
37876 op3 = expand_normal (arg3);
37877 op4 = expand_normal (arg4);
37878 mode1 = insn_data[icode].operand[1].mode;
37879 mode2 = insn_data[icode].operand[2].mode;
37880 mode3 = insn_data[icode].operand[3].mode;
37881 mode4 = insn_data[icode].operand[4].mode;
37883 /* Scatter instruction stores operand op3 to memory with
37884 indices from op2 and scale from op4 under writemask op1.
37885 If index operand op2 has more elements than source operand
37886 op3, only its low half needs to be used, and vice versa. */
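/* E.g. (as handled in the switch just below) IX86_BUILTIN_SCATTERALTSIV8DF
   pairs a V16SI index with a V8DF source, so only the low V8SI half of
   the index is extracted and used.  */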
37887 switch (fcode)
37889 case IX86_BUILTIN_SCATTERALTSIV8DF:
37890 case IX86_BUILTIN_SCATTERALTSIV8DI:
37891 half = gen_reg_rtx (V8SImode);
37892 if (!nonimmediate_operand (op2, V16SImode))
37893 op2 = copy_to_mode_reg (V16SImode, op2);
37894 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37895 op2 = half;
37896 break;
37897 case IX86_BUILTIN_SCATTERALTDIV16SF:
37898 case IX86_BUILTIN_SCATTERALTDIV16SI:
37899 half = gen_reg_rtx (mode3);
37900 if (mode3 == V8SFmode)
37901 gen = gen_vec_extract_lo_v16sf;
37902 else
37903 gen = gen_vec_extract_lo_v16si;
37904 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37905 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37906 emit_insn (gen (half, op3));
37907 op3 = half;
37908 break;
37909 default:
37910 break;
37913 /* Force the memory operand to use only a base register here. We
37914 don't want to do that for the memory operands of other builtin
37915 functions. */
37916 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37918 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37919 op0 = copy_to_mode_reg (Pmode, op0);
37921 op1 = fixup_modeless_constant (op1, mode1);
37923 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37925 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37926 op1 = copy_to_mode_reg (mode1, op1);
37928 else
37930 op1 = copy_to_reg (op1);
37931 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37934 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37935 op2 = copy_to_mode_reg (mode2, op2);
37937 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37938 op3 = copy_to_mode_reg (mode3, op3);
37940 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37942 error ("the last argument must be scale 1, 2, 4, 8");
37943 return const0_rtx;
37946 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37947 if (! pat)
37948 return const0_rtx;
37950 emit_insn (pat);
37951 return 0;
37953 vec_prefetch_gen:
37954 arg0 = CALL_EXPR_ARG (exp, 0);
37955 arg1 = CALL_EXPR_ARG (exp, 1);
37956 arg2 = CALL_EXPR_ARG (exp, 2);
37957 arg3 = CALL_EXPR_ARG (exp, 3);
37958 arg4 = CALL_EXPR_ARG (exp, 4);
37959 op0 = expand_normal (arg0);
37960 op1 = expand_normal (arg1);
37961 op2 = expand_normal (arg2);
37962 op3 = expand_normal (arg3);
37963 op4 = expand_normal (arg4);
37964 mode0 = insn_data[icode].operand[0].mode;
37965 mode1 = insn_data[icode].operand[1].mode;
37966 mode3 = insn_data[icode].operand[3].mode;
37967 mode4 = insn_data[icode].operand[4].mode;
37969 op0 = fixup_modeless_constant (op0, mode0);
37971 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37973 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37974 op0 = copy_to_mode_reg (mode0, op0);
37976 else
37978 op0 = copy_to_reg (op0);
37979 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37982 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37983 op1 = copy_to_mode_reg (mode1, op1);
37985 /* Force the memory operand to use only a base register here. We
37986 don't want to do that for the memory operands of other builtin
37987 functions. */
37988 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37990 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37991 op2 = copy_to_mode_reg (Pmode, op2);
37993 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37995 error ("the forth argument must be scale 1, 2, 4, 8");
37996 return const0_rtx;
37999 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38001 error ("incorrect hint operand");
38002 return const0_rtx;
38005 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38006 if (! pat)
38007 return const0_rtx;
38009 emit_insn (pat);
38011 return 0;
38013 case IX86_BUILTIN_XABORT:
38014 icode = CODE_FOR_xabort;
38015 arg0 = CALL_EXPR_ARG (exp, 0);
38016 op0 = expand_normal (arg0);
38017 mode0 = insn_data[icode].operand[0].mode;
38018 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38020 error ("the xabort's argument must be an 8-bit immediate");
38021 return const0_rtx;
38023 emit_insn (gen_xabort (op0));
38024 return 0;
38026 case IX86_BUILTIN_RSTORSSP:
38027 case IX86_BUILTIN_CLRSSBSY:
38028 arg0 = CALL_EXPR_ARG (exp, 0);
38029 op0 = expand_normal (arg0);
38030 icode = (fcode == IX86_BUILTIN_RSTORSSP
38031 ? CODE_FOR_rstorssp
38032 : CODE_FOR_clrssbsy);
38033 if (!address_operand (op0, VOIDmode))
38035 op1 = convert_memory_address (Pmode, op0);
38036 op0 = copy_addr_to_reg (op1);
38038 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
38039 return 0;
38041 case IX86_BUILTIN_WRSSD:
38042 case IX86_BUILTIN_WRSSQ:
38043 case IX86_BUILTIN_WRUSSD:
38044 case IX86_BUILTIN_WRUSSQ:
38045 arg0 = CALL_EXPR_ARG (exp, 0);
38046 op0 = expand_normal (arg0);
38047 arg1 = CALL_EXPR_ARG (exp, 1);
38048 op1 = expand_normal (arg1);
38049 switch (fcode)
38051 case IX86_BUILTIN_WRSSD:
38052 icode = CODE_FOR_wrsssi;
38053 mode = SImode;
38054 break;
38055 case IX86_BUILTIN_WRSSQ:
38056 icode = CODE_FOR_wrssdi;
38057 mode = DImode;
38058 break;
38059 case IX86_BUILTIN_WRUSSD:
38060 icode = CODE_FOR_wrusssi;
38061 mode = SImode;
38062 break;
38063 case IX86_BUILTIN_WRUSSQ:
38064 icode = CODE_FOR_wrussdi;
38065 mode = DImode;
38066 break;
38068 op0 = force_reg (mode, op0);
38069 if (!address_operand (op1, VOIDmode))
38071 op2 = convert_memory_address (Pmode, op1);
38072 op1 = copy_addr_to_reg (op2);
38074 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
38075 return 0;
38077 default:
38078 break;
38081 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38082 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38084 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38085 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38086 target);
38089 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38090 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38092 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38093 switch (fcode)
38095 case IX86_BUILTIN_FABSQ:
38096 case IX86_BUILTIN_COPYSIGNQ:
38097 if (!TARGET_SSE)
38098 /* Emit a normal call if SSE isn't available. */
38099 return expand_call (exp, target, ignore);
38100 /* FALLTHRU */
38101 default:
38102 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38106 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38107 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38109 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38110 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38111 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38112 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38113 int masked = 1;
38114 machine_mode mode, wide_mode, nar_mode;
38116 nar_mode = V4SFmode;
38117 mode = V16SFmode;
38118 wide_mode = V64SFmode;
38119 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
38120 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38122 switch (fcode)
38124 case IX86_BUILTIN_4FMAPS:
38125 fcn = gen_avx5124fmaddps_4fmaddps;
38126 masked = 0;
38127 goto v4fma_expand;
38129 case IX86_BUILTIN_4DPWSSD:
38130 nar_mode = V4SImode;
38131 mode = V16SImode;
38132 wide_mode = V64SImode;
38133 fcn = gen_avx5124vnniw_vp4dpwssd;
38134 masked = 0;
38135 goto v4fma_expand;
38137 case IX86_BUILTIN_4DPWSSDS:
38138 nar_mode = V4SImode;
38139 mode = V16SImode;
38140 wide_mode = V64SImode;
38141 fcn = gen_avx5124vnniw_vp4dpwssds;
38142 masked = 0;
38143 goto v4fma_expand;
38145 case IX86_BUILTIN_4FNMAPS:
38146 fcn = gen_avx5124fmaddps_4fnmaddps;
38147 masked = 0;
38148 goto v4fma_expand;
38150 case IX86_BUILTIN_4FNMAPS_MASK:
38151 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
38152 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38153 goto v4fma_expand;
38155 case IX86_BUILTIN_4DPWSSD_MASK:
38156 nar_mode = V4SImode;
38157 mode = V16SImode;
38158 wide_mode = V64SImode;
38159 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
38160 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38161 goto v4fma_expand;
38163 case IX86_BUILTIN_4DPWSSDS_MASK:
38164 nar_mode = V4SImode;
38165 mode = V16SImode;
38166 wide_mode = V64SImode;
38167 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
38168 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38169 goto v4fma_expand;
38171 case IX86_BUILTIN_4FMAPS_MASK:
38173 tree args[4];
38174 rtx ops[4];
38175 rtx wide_reg;
38176 rtx accum;
38177 rtx addr;
38178 rtx mem;
38180 v4fma_expand:
38181 wide_reg = gen_reg_rtx (wide_mode);
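/* The loop below packs the four narrow source operands into consecutive
   512-bit chunks of WIDE_REG (byte offsets 0, 64, 128 and 192).  */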
38182 for (i = 0; i < 4; i++)
38184 args[i] = CALL_EXPR_ARG (exp, i);
38185 ops[i] = expand_normal (args[i]);
38187 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38188 ops[i]);
38191 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38192 accum = force_reg (mode, accum);
38194 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38195 addr = force_reg (Pmode, addr);
38197 mem = gen_rtx_MEM (nar_mode, addr);
38199 target = gen_reg_rtx (mode);
38201 emit_move_insn (target, accum);
38203 if (! masked)
38204 emit_insn (fcn (target, accum, wide_reg, mem));
38205 else
38207 rtx merge, mask;
38208 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38210 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38212 if (CONST_INT_P (mask))
38213 mask = fixup_modeless_constant (mask, HImode);
38215 mask = force_reg (HImode, mask);
38217 if (GET_MODE (mask) != HImode)
38218 mask = gen_rtx_SUBREG (HImode, mask, 0);
38220 /* If merge is 0 then we're about to emit z-masked variant. */
38221 if (const0_operand (merge, mode))
38222 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38223 /* If merge is the same as accum then emit merge-masked variant. */
38224 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38226 merge = force_reg (mode, merge);
38227 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38229 /* Merging with something unknown can happen if we z-mask with -O0. */
38230 else
38232 target = gen_reg_rtx (mode);
38233 emit_move_insn (target, merge);
38234 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38237 return target;
38240 case IX86_BUILTIN_4FNMASS:
38241 fcn = gen_avx5124fmaddps_4fnmaddss;
38242 masked = 0;
38243 goto s4fma_expand;
38245 case IX86_BUILTIN_4FMASS:
38246 fcn = gen_avx5124fmaddps_4fmaddss;
38247 masked = 0;
38248 goto s4fma_expand;
38250 case IX86_BUILTIN_4FNMASS_MASK:
38251 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38252 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38253 goto s4fma_expand;
38255 case IX86_BUILTIN_4FMASS_MASK:
38257 tree args[4];
38258 rtx ops[4];
38259 rtx wide_reg;
38260 rtx accum;
38261 rtx addr;
38262 rtx mem;
38264 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38265 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38267 s4fma_expand:
38268 mode = V4SFmode;
38269 wide_reg = gen_reg_rtx (V64SFmode);
38270 for (i = 0; i < 4; i++)
38272 rtx tmp;
38273 args[i] = CALL_EXPR_ARG (exp, i);
38274 ops[i] = expand_normal (args[i]);
38276 tmp = gen_reg_rtx (SFmode);
38277 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38279 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38280 gen_rtx_SUBREG (V16SFmode, tmp, 0));
38283 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38284 accum = force_reg (V4SFmode, accum);
38286 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38287 addr = force_reg (Pmode, addr);
38289 mem = gen_rtx_MEM (V4SFmode, addr);
38291 target = gen_reg_rtx (V4SFmode);
38293 emit_move_insn (target, accum);
38295 if (! masked)
38296 emit_insn (fcn (target, accum, wide_reg, mem));
38297 else
38299 rtx merge, mask;
38300 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38302 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38304 if (CONST_INT_P (mask))
38305 mask = fixup_modeless_constant (mask, QImode);
38307 mask = force_reg (QImode, mask);
38309 if (GET_MODE (mask) != QImode)
38310 mask = gen_rtx_SUBREG (QImode, mask, 0);
38312 /* If merge is 0 then we're about to emit z-masked variant. */
38313 if (const0_operand (merge, mode))
38314 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38315 /* If merge is the same as accum then emit merge-masked
38316 variant. */
38317 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38319 merge = force_reg (mode, merge);
38320 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38322 /* Merging with something unknown can happen if we z-mask
38323 with -O0. */
38324 else
38326 target = gen_reg_rtx (mode);
38327 emit_move_insn (target, merge);
38328 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38331 return target;
38333 case IX86_BUILTIN_RDPID:
38334 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
38335 target);
38336 default:
38337 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38341 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38342 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38344 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38345 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38348 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38349 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38351 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38352 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38355 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38356 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38358 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38359 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38362 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38363 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38365 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38366 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38369 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38370 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38372 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38373 const struct builtin_description *d = bdesc_multi_arg + i;
38374 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38375 (enum ix86_builtin_func_type)
38376 d->flag, d->comparison);
38379 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
38380 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
38382 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
38383 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
38384 target);
38387 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
38388 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
38390 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
38391 return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
38392 target);
38395 gcc_unreachable ();
38398 /* This returns the target-specific builtin with code CODE if
38399 current_function_decl has visibility on this builtin, which is checked
38400 using isa flags. Returns NULL_TREE otherwise. */
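/* For example (illustrative): requesting an AVX512F-guarded builtin from a
   function whose target options do not enable AVX512F returns NULL_TREE.  */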
38402 static tree ix86_get_builtin (enum ix86_builtins code)
38404 struct cl_target_option *opts;
38405 tree target_tree = NULL_TREE;
38407 /* Determine the isa flags of current_function_decl. */
38409 if (current_function_decl)
38410 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38412 if (target_tree == NULL)
38413 target_tree = target_option_default_node;
38415 opts = TREE_TARGET_OPTION (target_tree);
38417 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38418 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38419 return ix86_builtin_decl (code, true);
38420 else
38421 return NULL_TREE;
38424 /* Return the function decl for the target-specific builtin
38425 for the given MPX builtin passed in FCODE. */
38426 static tree
38427 ix86_builtin_mpx_function (unsigned fcode)
38429 switch (fcode)
38431 case BUILT_IN_CHKP_BNDMK:
38432 return ix86_builtins[IX86_BUILTIN_BNDMK];
38434 case BUILT_IN_CHKP_BNDSTX:
38435 return ix86_builtins[IX86_BUILTIN_BNDSTX];
38437 case BUILT_IN_CHKP_BNDLDX:
38438 return ix86_builtins[IX86_BUILTIN_BNDLDX];
38440 case BUILT_IN_CHKP_BNDCL:
38441 return ix86_builtins[IX86_BUILTIN_BNDCL];
38443 case BUILT_IN_CHKP_BNDCU:
38444 return ix86_builtins[IX86_BUILTIN_BNDCU];
38446 case BUILT_IN_CHKP_BNDRET:
38447 return ix86_builtins[IX86_BUILTIN_BNDRET];
38449 case BUILT_IN_CHKP_INTERSECT:
38450 return ix86_builtins[IX86_BUILTIN_BNDINT];
38452 case BUILT_IN_CHKP_NARROW:
38453 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
38455 case BUILT_IN_CHKP_SIZEOF:
38456 return ix86_builtins[IX86_BUILTIN_SIZEOF];
38458 case BUILT_IN_CHKP_EXTRACT_LOWER:
38459 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
38461 case BUILT_IN_CHKP_EXTRACT_UPPER:
38462 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
38464 default:
38465 return NULL_TREE;
38468 gcc_unreachable ();
38471 /* Helper function for ix86_load_bounds and ix86_store_bounds.
38473 Return an address to be used to load/store bounds for pointer
38474 passed in SLOT.
38476 SLOT_NO is an integer constant holding the number of a target-
38477 dependent special slot to be used in case SLOT is not a memory.
38479 SPECIAL_BASE is a pointer to be used as a base of fake address
38480 to access special slots in Bounds Table. SPECIAL_BASE[-1],
38481 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
38483 static rtx
38484 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
38486 rtx addr = NULL;
38488 /* A NULL slot means we pass bounds for a pointer that is not passed
38489 to the function at all. A register slot means the pointer is passed
38490 in a register. In both cases the bounds are passed via the Bounds
38491 Table. Since there is no actual pointer stored in memory, we have
38492 to use fake addresses to access the Bounds Table. We start with
38493 (special_base - sizeof (void*)) and decrease this address by the
38494 pointer size to get addresses for other slots. */
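/* For example (illustrative): on a 64-bit target, slot 0 maps to
   special_base - 8, slot 1 to special_base - 16, and so on.  */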
38495 if (!slot || REG_P (slot))
38497 gcc_assert (CONST_INT_P (slot_no));
38498 addr = plus_constant (Pmode, special_base,
38499 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
38501 /* If pointer is passed in a memory then its address is used to
38502 access Bounds Table. */
38503 else if (MEM_P (slot))
38505 addr = XEXP (slot, 0);
38506 if (!register_operand (addr, Pmode))
38507 addr = copy_addr_to_reg (addr);
38509 else
38510 gcc_unreachable ();
38512 return addr;
38515 /* The expand pass uses this hook to load bounds for function parameter
38516 PTR passed in SLOT in case its bounds are not passed in a register.
38518 If SLOT is a memory, then bounds are loaded as for a regular pointer
38519 loaded from memory. PTR may be NULL when SLOT is a memory;
38520 in that case the value of PTR (if required) may be loaded from SLOT.
38522 If SLOT is NULL or a register, then SLOT_NO is an integer constant
38523 holding the number of the target-dependent special slot which should
38524 be used to obtain the bounds.
38526 Return the loaded bounds. */
38528 static rtx
38529 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
38531 rtx reg = gen_reg_rtx (BNDmode);
38532 rtx addr;
38534 /* Get the address to be used to access the Bounds Table. Special slots
38535 start at the location of the return address of the current function. */
38536 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
38538 /* Load pointer value from a memory if we don't have it. */
38539 if (!ptr)
38541 gcc_assert (MEM_P (slot));
38542 ptr = copy_addr_to_reg (slot);
38545 if (!register_operand (ptr, Pmode))
38546 ptr = ix86_zero_extend_to_Pmode (ptr);
38548 emit_insn (BNDmode == BND64mode
38549 ? gen_bnd64_ldx (reg, addr, ptr)
38550 : gen_bnd32_ldx (reg, addr, ptr));
38552 return reg;
38555 /* The expand pass uses this hook to store BOUNDS for call argument PTR
38556 passed in SLOT in case BOUNDS are not passed in a register.
38558 If SLOT is a memory, then BOUNDS are stored as for a regular pointer
38559 stored in memory. PTR may be NULL when SLOT is a memory;
38560 in that case the value of PTR (if required) may be loaded from SLOT.
38562 If SLOT is NULL or a register, then SLOT_NO is an integer constant
38563 holding the number of the target-dependent special slot which should
38564 be used to store BOUNDS. */
38566 static void
38567 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
38569 rtx addr;
38571 /* Get the address to be used to access the Bounds Table. Special slots
38572 start at the location of the return address of a called function. */
38573 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
38575 /* Load pointer value from a memory if we don't have it. */
38576 if (!ptr)
38578 gcc_assert (MEM_P (slot));
38579 ptr = copy_addr_to_reg (slot);
38582 if (!register_operand (ptr, Pmode))
38583 ptr = ix86_zero_extend_to_Pmode (ptr);
38585 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
38586 if (!register_operand (bounds, BNDmode))
38587 bounds = copy_to_mode_reg (BNDmode, bounds);
38589 emit_insn (BNDmode == BND64mode
38590 ? gen_bnd64_stx (addr, ptr, bounds)
38591 : gen_bnd32_stx (addr, ptr, bounds));
38594 /* Load and return bounds returned by function in SLOT. */
38596 static rtx
38597 ix86_load_returned_bounds (rtx slot)
38599 rtx res;
38601 gcc_assert (REG_P (slot));
38602 res = gen_reg_rtx (BNDmode);
38603 emit_move_insn (res, slot);
38605 return res;
38608 /* Store BOUNDS returned by function into SLOT. */
38610 static void
38611 ix86_store_returned_bounds (rtx slot, rtx bounds)
38613 gcc_assert (REG_P (slot));
38614 emit_move_insn (slot, bounds);
38617 /* Returns a function decl for a vectorized version of the combined function
38618 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38619 if it is not available. */
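/* For example (illustrative, following the cases below): with SSE4.1 and
   -fno-trapping-math, a floor over 4 DFmode elements maps to
   IX86_BUILTIN_FLOORPD256.  */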
38621 static tree
38622 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38623 tree type_in)
38625 machine_mode in_mode, out_mode;
38626 int in_n, out_n;
38628 if (TREE_CODE (type_out) != VECTOR_TYPE
38629 || TREE_CODE (type_in) != VECTOR_TYPE)
38630 return NULL_TREE;
38632 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38633 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38634 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38635 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38637 switch (fn)
38639 CASE_CFN_EXP2:
38640 if (out_mode == SFmode && in_mode == SFmode)
38642 if (out_n == 16 && in_n == 16)
38643 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38645 break;
38647 CASE_CFN_IFLOOR:
38648 CASE_CFN_LFLOOR:
38649 CASE_CFN_LLFLOOR:
38650 /* The round insn does not trap on denormals. */
38651 if (flag_trapping_math || !TARGET_SSE4_1)
38652 break;
38654 if (out_mode == SImode && in_mode == DFmode)
38656 if (out_n == 4 && in_n == 2)
38657 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38658 else if (out_n == 8 && in_n == 4)
38659 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
38660 else if (out_n == 16 && in_n == 8)
38661 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
38663 if (out_mode == SImode && in_mode == SFmode)
38665 if (out_n == 4 && in_n == 4)
38666 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
38667 else if (out_n == 8 && in_n == 8)
38668 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
38669 else if (out_n == 16 && in_n == 16)
38670 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
38672 break;
38674 CASE_CFN_ICEIL:
38675 CASE_CFN_LCEIL:
38676 CASE_CFN_LLCEIL:
38677 /* The round insn does not trap on denormals. */
38678 if (flag_trapping_math || !TARGET_SSE4_1)
38679 break;
38681 if (out_mode == SImode && in_mode == DFmode)
38683 if (out_n == 4 && in_n == 2)
38684 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
38685 else if (out_n == 8 && in_n == 4)
38686 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
38687 else if (out_n == 16 && in_n == 8)
38688 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
38690 if (out_mode == SImode && in_mode == SFmode)
38692 if (out_n == 4 && in_n == 4)
38693 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
38694 else if (out_n == 8 && in_n == 8)
38695 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
38696 else if (out_n == 16 && in_n == 16)
38697 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
38699 break;
38701 CASE_CFN_IRINT:
38702 CASE_CFN_LRINT:
38703 CASE_CFN_LLRINT:
38704 if (out_mode == SImode && in_mode == DFmode)
38706 if (out_n == 4 && in_n == 2)
38707 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
38708 else if (out_n == 8 && in_n == 4)
38709 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
38710 else if (out_n == 16 && in_n == 8)
38711 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
38713 if (out_mode == SImode && in_mode == SFmode)
38715 if (out_n == 4 && in_n == 4)
38716 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
38717 else if (out_n == 8 && in_n == 8)
38718 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
38719 else if (out_n == 16 && in_n == 16)
38720 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
38722 break;
38724 CASE_CFN_IROUND:
38725 CASE_CFN_LROUND:
38726 CASE_CFN_LLROUND:
38727 /* The round insn does not trap on denormals. */
38728 if (flag_trapping_math || !TARGET_SSE4_1)
38729 break;
38731 if (out_mode == SImode && in_mode == DFmode)
38733 if (out_n == 4 && in_n == 2)
38734 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
38735 else if (out_n == 8 && in_n == 4)
38736 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
38737 else if (out_n == 16 && in_n == 8)
38738 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
38740 if (out_mode == SImode && in_mode == SFmode)
38742 if (out_n == 4 && in_n == 4)
38743 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
38744 else if (out_n == 8 && in_n == 8)
38745 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
38746 else if (out_n == 16 && in_n == 16)
38747 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
38749 break;
38751 CASE_CFN_FLOOR:
38752 /* The round insn does not trap on denormals. */
38753 if (flag_trapping_math || !TARGET_SSE4_1)
38754 break;
38756 if (out_mode == DFmode && in_mode == DFmode)
38758 if (out_n == 2 && in_n == 2)
38759 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
38760 else if (out_n == 4 && in_n == 4)
38761 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
38762 else if (out_n == 8 && in_n == 8)
38763 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
38765 if (out_mode == SFmode && in_mode == SFmode)
38767 if (out_n == 4 && in_n == 4)
38768 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
38769 else if (out_n == 8 && in_n == 8)
38770 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
38771 else if (out_n == 16 && in_n == 16)
38772 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
38774 break;
38776 CASE_CFN_CEIL:
38777 /* The round insn does not trap on denormals. */
38778 if (flag_trapping_math || !TARGET_SSE4_1)
38779 break;
38781 if (out_mode == DFmode && in_mode == DFmode)
38783 if (out_n == 2 && in_n == 2)
38784 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
38785 else if (out_n == 4 && in_n == 4)
38786 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
38787 else if (out_n == 8 && in_n == 8)
38788 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
38790 if (out_mode == SFmode && in_mode == SFmode)
38792 if (out_n == 4 && in_n == 4)
38793 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
38794 else if (out_n == 8 && in_n == 8)
38795 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
38796 else if (out_n == 16 && in_n == 16)
38797 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
38799 break;
38801 CASE_CFN_TRUNC:
38802 /* The round insn does not trap on denormals. */
38803 if (flag_trapping_math || !TARGET_SSE4_1)
38804 break;
38806 if (out_mode == DFmode && in_mode == DFmode)
38808 if (out_n == 2 && in_n == 2)
38809 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
38810 else if (out_n == 4 && in_n == 4)
38811 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
38812 else if (out_n == 8 && in_n == 8)
38813 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
38815 if (out_mode == SFmode && in_mode == SFmode)
38817 if (out_n == 4 && in_n == 4)
38818 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
38819 else if (out_n == 8 && in_n == 8)
38820 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
38821 else if (out_n == 16 && in_n == 16)
38822 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
38824 break;
38826 CASE_CFN_RINT:
38827 /* The round insn does not trap on denormals. */
38828 if (flag_trapping_math || !TARGET_SSE4_1)
38829 break;
38831 if (out_mode == DFmode && in_mode == DFmode)
38833 if (out_n == 2 && in_n == 2)
38834 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
38835 else if (out_n == 4 && in_n == 4)
38836 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
38838 if (out_mode == SFmode && in_mode == SFmode)
38840 if (out_n == 4 && in_n == 4)
38841 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
38842 else if (out_n == 8 && in_n == 8)
38843 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
38845 break;
38847 CASE_CFN_FMA:
38848 if (out_mode == DFmode && in_mode == DFmode)
38850 if (out_n == 2 && in_n == 2)
38851 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
38852 if (out_n == 4 && in_n == 4)
38853 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
38855 if (out_mode == SFmode && in_mode == SFmode)
38857 if (out_n == 4 && in_n == 4)
38858 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38859 if (out_n == 8 && in_n == 8)
38860 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38862 break;
38864 default:
38865 break;
38868 /* Dispatch to a handler for a vectorization library. */
38869 if (ix86_veclib_handler)
38870 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38872 return NULL_TREE;
38875 /* Handler for an SVML-style interface to
38876 a library with vectorized intrinsics. */
38878 static tree
38879 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38881 char name[20];
38882 tree fntype, new_fndecl, args;
38883 unsigned arity;
38884 const char *bname;
38885 machine_mode el_mode, in_mode;
38886 int n, in_n;
38888 /* SVML is suitable only for unsafe math. */
38889 if (!flag_unsafe_math_optimizations)
38890 return NULL_TREE;
38892 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38893 n = TYPE_VECTOR_SUBPARTS (type_out);
38894 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38895 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38896 if (el_mode != in_mode
38897 || n != in_n)
38898 return NULL_TREE;
38900 switch (fn)
38902 CASE_CFN_EXP:
38903 CASE_CFN_LOG:
38904 CASE_CFN_LOG10:
38905 CASE_CFN_POW:
38906 CASE_CFN_TANH:
38907 CASE_CFN_TAN:
38908 CASE_CFN_ATAN:
38909 CASE_CFN_ATAN2:
38910 CASE_CFN_ATANH:
38911 CASE_CFN_CBRT:
38912 CASE_CFN_SINH:
38913 CASE_CFN_SIN:
38914 CASE_CFN_ASINH:
38915 CASE_CFN_ASIN:
38916 CASE_CFN_COSH:
38917 CASE_CFN_COS:
38918 CASE_CFN_ACOSH:
38919 CASE_CFN_ACOS:
38920 if ((el_mode != DFmode || n != 2)
38921 && (el_mode != SFmode || n != 4))
38922 return NULL_TREE;
38923 break;
38925 default:
38926 return NULL_TREE;
38929 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38930 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38932 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38933 strcpy (name, "vmlsLn4");
38934 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38935 strcpy (name, "vmldLn2");
38936 else if (n == 4)
38938 sprintf (name, "vmls%s", bname+10);
38939 name[strlen (name)-1] = '4';
38941 else
38942 sprintf (name, "vmld%s2", bname+10);
38944 /* Convert to uppercase. */
38945 name[4] &= ~0x20;
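/* For instance (illustrative): for BUILT_IN_SINF with n == 4, bname is
   "__builtin_sinf", so the code above builds "vmlssin4" and the
   uppercasing yields "vmlsSin4"; the DFmode variant of sin becomes
   "vmldSin2".  */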
38947 arity = 0;
38948 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38949 arity++;
38951 if (arity == 1)
38952 fntype = build_function_type_list (type_out, type_in, NULL);
38953 else
38954 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38956 /* Build a function declaration for the vectorized function. */
38957 new_fndecl = build_decl (BUILTINS_LOCATION,
38958 FUNCTION_DECL, get_identifier (name), fntype);
38959 TREE_PUBLIC (new_fndecl) = 1;
38960 DECL_EXTERNAL (new_fndecl) = 1;
38961 DECL_IS_NOVOPS (new_fndecl) = 1;
38962 TREE_READONLY (new_fndecl) = 1;
38964 return new_fndecl;
38967 /* Handler for an ACML-style interface to
38968 a library with vectorized intrinsics. */
38970 static tree
38971 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38973 char name[20] = "__vr.._";
38974 tree fntype, new_fndecl, args;
38975 unsigned arity;
38976 const char *bname;
38977 machine_mode el_mode, in_mode;
38978 int n, in_n;
38980 /* ACML is 64-bit only and suitable only for unsafe math, as it
38981 does not correctly support parts of IEEE arithmetic with the
38982 required precision, such as denormals. */
38983 if (!TARGET_64BIT
38984 || !flag_unsafe_math_optimizations)
38985 return NULL_TREE;
38987 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38988 n = TYPE_VECTOR_SUBPARTS (type_out);
38989 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38990 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38991 if (el_mode != in_mode
38992 || n != in_n)
38993 return NULL_TREE;
38995 switch (fn)
38997 CASE_CFN_SIN:
38998 CASE_CFN_COS:
38999 CASE_CFN_EXP:
39000 CASE_CFN_LOG:
39001 CASE_CFN_LOG2:
39002 CASE_CFN_LOG10:
39003 if (el_mode == DFmode && n == 2)
39005 name[4] = 'd';
39006 name[5] = '2';
39008 else if (el_mode == SFmode && n == 4)
39010 name[4] = 's';
39011 name[5] = '4';
39013 else
39014 return NULL_TREE;
39015 break;
39017 default:
39018 return NULL_TREE;
39021 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39022 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39023 sprintf (name + 7, "%s", bname+10);
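/* For instance (illustrative): for DFmode sin with n == 2 the template
   becomes "__vrd2_" and appending bname+10 gives "__vrd2_sin"; the SFmode
   4-wide variant of sinf becomes "__vrs4_sinf".  */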
39025 arity = 0;
39026 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39027 arity++;
39029 if (arity == 1)
39030 fntype = build_function_type_list (type_out, type_in, NULL);
39031 else
39032 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39034 /* Build a function declaration for the vectorized function. */
39035 new_fndecl = build_decl (BUILTINS_LOCATION,
39036 FUNCTION_DECL, get_identifier (name), fntype);
39037 TREE_PUBLIC (new_fndecl) = 1;
39038 DECL_EXTERNAL (new_fndecl) = 1;
39039 DECL_IS_NOVOPS (new_fndecl) = 1;
39040 TREE_READONLY (new_fndecl) = 1;
39042 return new_fndecl;
39045 /* Returns a decl of a function that implements gather load with
39046 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
39047 Return NULL_TREE if it is not available. */
39049 static tree
39050 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39051 const_tree index_type, int scale)
39053 bool si;
39054 enum ix86_builtins code;
39056 if (! TARGET_AVX2 || !TARGET_USE_GATHER)
39057 return NULL_TREE;
39059 if ((TREE_CODE (index_type) != INTEGER_TYPE
39060 && !POINTER_TYPE_P (index_type))
39061 || (TYPE_MODE (index_type) != SImode
39062 && TYPE_MODE (index_type) != DImode))
39063 return NULL_TREE;
39065 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39066 return NULL_TREE;
39068 /* v*gather* insn sign extends index to pointer mode. */
39069 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39070 && TYPE_UNSIGNED (index_type))
39071 return NULL_TREE;
39073 if (scale <= 0
39074 || scale > 8
39075 || (scale & (scale - 1)) != 0)
39076 return NULL_TREE;
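/* For example (illustrative): a V2DFmode gather with a signed SImode index
   selects IX86_BUILTIN_GATHER3SIV2DF when AVX512VL is enabled and
   IX86_BUILTIN_GATHERSIV2DF otherwise.  */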
39078 si = TYPE_MODE (index_type) == SImode;
39079 switch (TYPE_MODE (mem_vectype))
39081 case E_V2DFmode:
39082 if (TARGET_AVX512VL)
39083 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39084 else
39085 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39086 break;
39087 case E_V4DFmode:
39088 if (TARGET_AVX512VL)
39089 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39090 else
39091 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39092 break;
39093 case E_V2DImode:
39094 if (TARGET_AVX512VL)
39095 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39096 else
39097 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39098 break;
39099 case E_V4DImode:
39100 if (TARGET_AVX512VL)
39101 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39102 else
39103 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39104 break;
39105 case E_V4SFmode:
39106 if (TARGET_AVX512VL)
39107 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39108 else
39109 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39110 break;
39111 case E_V8SFmode:
39112 if (TARGET_AVX512VL)
39113 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39114 else
39115 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39116 break;
39117 case E_V4SImode:
39118 if (TARGET_AVX512VL)
39119 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39120 else
39121 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39122 break;
39123 case E_V8SImode:
39124 if (TARGET_AVX512VL)
39125 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39126 else
39127 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39128 break;
39129 case E_V8DFmode:
39130 if (TARGET_AVX512F)
39131 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39132 else
39133 return NULL_TREE;
39134 break;
39135 case E_V8DImode:
39136 if (TARGET_AVX512F)
39137 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39138 else
39139 return NULL_TREE;
39140 break;
39141 case E_V16SFmode:
39142 if (TARGET_AVX512F)
39143 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39144 else
39145 return NULL_TREE;
39146 break;
39147 case E_V16SImode:
39148 if (TARGET_AVX512F)
39149 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39150 else
39151 return NULL_TREE;
39152 break;
39153 default:
39154 return NULL_TREE;
39157 return ix86_get_builtin (code);
39160 /* Returns a decl of a function that implements scatter store with
39161 register type VECTYPE and index type INDEX_TYPE and SCALE.
39162 Return NULL_TREE if it is not available. */
39164 static tree
39165 ix86_vectorize_builtin_scatter (const_tree vectype,
39166 const_tree index_type, int scale)
39168 bool si;
39169 enum ix86_builtins code;
39171 if (!TARGET_AVX512F)
39172 return NULL_TREE;
39174 if ((TREE_CODE (index_type) != INTEGER_TYPE
39175 && !POINTER_TYPE_P (index_type))
39176 || (TYPE_MODE (index_type) != SImode
39177 && TYPE_MODE (index_type) != DImode))
39178 return NULL_TREE;
39180 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39181 return NULL_TREE;
39183 /* v*scatter* insn sign extends index to pointer mode. */
39184 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39185 && TYPE_UNSIGNED (index_type))
39186 return NULL_TREE;
39188 /* Scale can be 1, 2, 4 or 8. */
39189 if (scale <= 0
39190 || scale > 8
39191 || (scale & (scale - 1)) != 0)
39192 return NULL_TREE;
39194 si = TYPE_MODE (index_type) == SImode;
39195 switch (TYPE_MODE (vectype))
39197 case E_V8DFmode:
39198 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39199 break;
39200 case E_V8DImode:
39201 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39202 break;
39203 case E_V16SFmode:
39204 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39205 break;
39206 case E_V16SImode:
39207 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39208 break;
39209 default:
39210 return NULL_TREE;
39213 return ix86_builtins[code];
39216 /* Return true if it is safe to use the rsqrt optabs to optimize
39217 1.0/sqrt. */
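/* (Illustrative note: the flag_* conditions below are all implied by
   -ffast-math, so the rsqrt rewrite is normally only considered under
   fast-math-style options.)  */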
39219 static bool
39220 use_rsqrt_p ()
39222 return (TARGET_SSE_MATH
39223 && flag_finite_math_only
39224 && !flag_trapping_math
39225 && flag_unsafe_math_optimizations);
39228 /* Returns the decl of a target-specific builtin that implements
39229 the reciprocal of the function, or NULL_TREE if not available. */
39231 static tree
39232 ix86_builtin_reciprocal (tree fndecl)
39234 switch (DECL_FUNCTION_CODE (fndecl))
39236 /* Vectorized version of sqrt to rsqrt conversion. */
39237 case IX86_BUILTIN_SQRTPS_NR:
39238 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39240 case IX86_BUILTIN_SQRTPS_NR256:
39241 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39243 default:
39244 return NULL_TREE;
39248 /* Helper for avx_vpermilps256_operand et al. This is also used by
39249 the expansion functions to turn the parallel back into a mask.
39250 The return value is 0 for no match and the imm8+1 for a match. */
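/* For example (illustrative): for V4SFmode, a parallel of (2 3 0 1) encodes
   two bits per element, giving imm8 0x4E, so this returns 0x4F.  */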
39253 avx_vpermilp_parallel (rtx par, machine_mode mode)
39255 unsigned i, nelt = GET_MODE_NUNITS (mode);
39256 unsigned mask = 0;
39257 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
39259 if (XVECLEN (par, 0) != (int) nelt)
39260 return 0;
39262 /* Validate that all of the elements are constants, and not totally
39263 out of range. Copy the data into an integral array to make the
39264 subsequent checks easier. */
39265 for (i = 0; i < nelt; ++i)
39267 rtx er = XVECEXP (par, 0, i);
39268 unsigned HOST_WIDE_INT ei;
39270 if (!CONST_INT_P (er))
39271 return 0;
39272 ei = INTVAL (er);
39273 if (ei >= nelt)
39274 return 0;
39275 ipar[i] = ei;
39278 switch (mode)
39280 case E_V8DFmode:
39281 /* In the 512-bit DFmode case, we can only move elements within
39282 a 128-bit lane. First fill the second part of the mask,
39283 then fallthru. */
39284 for (i = 4; i < 6; ++i)
39286 if (ipar[i] < 4 || ipar[i] >= 6)
39287 return 0;
39288 mask |= (ipar[i] - 4) << i;
39290 for (i = 6; i < 8; ++i)
39292 if (ipar[i] < 6)
39293 return 0;
39294 mask |= (ipar[i] - 6) << i;
39296 /* FALLTHRU */
39298 case E_V4DFmode:
39299 /* In the 256-bit DFmode case, we can only move elements within
39300 a 128-bit lane. */
39301 for (i = 0; i < 2; ++i)
39303 if (ipar[i] >= 2)
39304 return 0;
39305 mask |= ipar[i] << i;
39307 for (i = 2; i < 4; ++i)
39309 if (ipar[i] < 2)
39310 return 0;
39311 mask |= (ipar[i] - 2) << i;
39313 break;
39315 case E_V16SFmode:
39316 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
39317 must mirror the permutation in the lower 256 bits. */
39318 for (i = 0; i < 8; ++i)
39319 if (ipar[i] + 8 != ipar[i + 8])
39320 return 0;
39321 /* FALLTHRU */
39323 case E_V8SFmode:
39324 /* In the 256-bit SFmode case, we have full freedom of
39325 movement within the low 128-bit lane, but the high 128-bit
39326 lane must mirror the exact same pattern. */
39327 for (i = 0; i < 4; ++i)
39328 if (ipar[i] + 4 != ipar[i + 4])
39329 return 0;
39330 nelt = 4;
39331 /* FALLTHRU */
39333 case E_V2DFmode:
39334 case E_V4SFmode:
39335 /* In the 128-bit case, we have full freedom in the placement of
39336 the elements from the source operand. */
39337 for (i = 0; i < nelt; ++i)
39338 mask |= ipar[i] << (i * (nelt / 2));
39339 break;
39341 default:
39342 gcc_unreachable ();
39345 /* Make sure success has a non-zero value by adding one. */
39346 return mask + 1;
39349 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39350 the expansion functions to turn the parallel back into a mask.
39351 The return value is 0 for no match and the imm8+1 for a match. */
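/* For example (illustrative): for V8SFmode, a parallel of
   (4 5 6 7 12 13 14 15) selects the high 128-bit halves of both operands,
   encodes as imm8 0x31, so this returns 0x32.  */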
39354 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39356 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39357 unsigned mask = 0;
39358 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39360 if (XVECLEN (par, 0) != (int) nelt)
39361 return 0;
39363 /* Validate that all of the elements are constants, and not totally
39364 out of range. Copy the data into an integral array to make the
39365 subsequent checks easier. */
39366 for (i = 0; i < nelt; ++i)
39368 rtx er = XVECEXP (par, 0, i);
39369 unsigned HOST_WIDE_INT ei;
39371 if (!CONST_INT_P (er))
39372 return 0;
39373 ei = INTVAL (er);
39374 if (ei >= 2 * nelt)
39375 return 0;
39376 ipar[i] = ei;
39379 /* Validate that each half of the permute consists of consecutive elements. */
39380 for (i = 0; i < nelt2 - 1; ++i)
39381 if (ipar[i] + 1 != ipar[i + 1])
39382 return 0;
39383 for (i = nelt2; i < nelt - 1; ++i)
39384 if (ipar[i] + 1 != ipar[i + 1])
39385 return 0;
39387 /* Reconstruct the mask. */
39388 for (i = 0; i < 2; ++i)
39390 unsigned e = ipar[i * nelt2];
39391 if (e % nelt2)
39392 return 0;
39393 e /= nelt2;
39394 mask |= e << (i * 4);
39397 /* Make sure success has a non-zero value by adding one. */
39398 return mask + 1;
39401 /* Return a register priority for hard reg REGNO. */
39402 static int
39403 ix86_register_priority (int hard_regno)
39405 /* ebp and r13 as the base always want a displacement, and r12 as the
39406 base always wants an index, so discourage their use in an
39407 address. */
39408 if (hard_regno == R12_REG || hard_regno == R13_REG)
39409 return 0;
39410 if (hard_regno == BP_REG)
39411 return 1;
39412 /* New x86-64 int registers result in bigger code size. Discourage
39413 them. */
39414 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39415 return 2;
39416 /* New x86-64 SSE registers result in bigger code size. Discourage
39417 them. */
39418 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39419 return 2;
39420 /* Usage of AX register results in smaller code. Prefer it. */
39421 if (hard_regno == AX_REG)
39422 return 4;
39423 return 3;
39426 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39428 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39429 QImode must go into class Q_REGS.
39430 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
39431 movdf to do mem-to-mem moves through integer regs. */
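/* For example (illustrative): a nonzero V4SF constant requested in an SSE
   class yields NO_REGS below, forcing it into the constant pool, while
   CONST0_RTX of any mode is allowed in every class.  */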
39433 static reg_class_t
39434 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39436 machine_mode mode = GET_MODE (x);
39438 /* We're only allowed to return a subclass of CLASS. Many of the
39439 following checks fail for NO_REGS, so eliminate that early. */
39440 if (regclass == NO_REGS)
39441 return NO_REGS;
39443 /* All classes can load zeros. */
39444 if (x == CONST0_RTX (mode))
39445 return regclass;
39447 /* Force constants into memory if we are loading a (nonzero) constant into
39448 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39449 instructions to load from a constant. */
39450 if (CONSTANT_P (x)
39451 && (MAYBE_MMX_CLASS_P (regclass)
39452 || MAYBE_SSE_CLASS_P (regclass)
39453 || MAYBE_MASK_CLASS_P (regclass)))
39454 return NO_REGS;
39456 /* Floating-point constants need more complex checks. */
39457 if (CONST_DOUBLE_P (x))
39459 /* General regs can load everything. */
39460 if (INTEGER_CLASS_P (regclass))
39461 return regclass;
39463 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39464 zero above. We only want to wind up preferring 80387 registers if
39465 we plan on doing computation with them. */
39466 if (IS_STACK_MODE (mode)
39467 && standard_80387_constant_p (x) > 0)
39469 /* Limit class to FP regs. */
39470 if (FLOAT_CLASS_P (regclass))
39471 return FLOAT_REGS;
39472 else if (regclass == FP_TOP_SSE_REGS)
39473 return FP_TOP_REG;
39474 else if (regclass == FP_SECOND_SSE_REGS)
39475 return FP_SECOND_REG;
39478 return NO_REGS;
39481 /* Prefer SSE regs only, if we can use them for math. */
39482 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39483 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39485 /* Generally when we see PLUS here, it's the function invariant
39486 (plus soft-fp const_int), which can only be computed into general
39487 regs. */
39488 if (GET_CODE (x) == PLUS)
39489 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39491 /* QImode constants are easy to load, but non-constant QImode data
39492 must go into Q_REGS. */
39493 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39495 if (Q_CLASS_P (regclass))
39496 return regclass;
39497 else if (reg_class_subset_p (Q_REGS, regclass))
39498 return Q_REGS;
39499 else
39500 return NO_REGS;
39503 return regclass;
39506 /* Discourage putting floating-point values in SSE registers unless
39507 SSE math is being used, and likewise for the 387 registers. */
39508 static reg_class_t
39509 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39511 machine_mode mode = GET_MODE (x);
39513 /* Restrict the output reload class to the register bank that we are doing
39514 math on. If we would like not to return a subset of CLASS, reject this
39515 alternative: if reload cannot do this, it will still use its choice. */
39516 mode = GET_MODE (x);
39517 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39518 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39520 if (IS_STACK_MODE (mode))
39522 if (regclass == FP_TOP_SSE_REGS)
39523 return FP_TOP_REG;
39524 else if (regclass == FP_SECOND_SSE_REGS)
39525 return FP_SECOND_REG;
39526 else
39527 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39530 return regclass;
39533 static reg_class_t
39534 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39535 machine_mode mode, secondary_reload_info *sri)
39537 /* Double-word spills from general registers to non-offsettable memory
39538 references (zero-extended addresses) require special handling. */
39539 if (TARGET_64BIT
39540 && MEM_P (x)
39541 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39542 && INTEGER_CLASS_P (rclass)
39543 && !offsettable_memref_p (x))
39545 sri->icode = (in_p
39546 ? CODE_FOR_reload_noff_load
39547 : CODE_FOR_reload_noff_store);
39548 /* Add the cost of moving address to a temporary. */
39549 sri->extra_cost = 1;
39551 return NO_REGS;
39554 /* QImode spills from non-QI registers require an
39555 intermediate register on 32-bit targets. */
39556 if (mode == QImode
39557 && ((!TARGET_64BIT && !in_p
39558 && INTEGER_CLASS_P (rclass)
39559 && MAYBE_NON_Q_CLASS_P (rclass))
39560 || (!TARGET_AVX512DQ
39561 && MAYBE_MASK_CLASS_P (rclass))))
39563 int regno = true_regnum (x);
39565 /* Return Q_REGS if the operand is in memory. */
39566 if (regno == -1)
39567 return Q_REGS;
39569 return NO_REGS;
39572 /* This condition handles corner case where an expression involving
39573 pointers gets vectorized. We're trying to use the address of a
39574 stack slot as a vector initializer.
39576 (set (reg:V2DI 74 [ vect_cst_.2 ])
39577 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39579 Eventually frame gets turned into sp+offset like this:
39581 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39582 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39583 (const_int 392 [0x188]))))
39585 That later gets turned into:
39587 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39588 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39589 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39591 We'll have the following reload recorded:
39593 Reload 0: reload_in (DI) =
39594 (plus:DI (reg/f:DI 7 sp)
39595 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39596 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39597 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39598 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39599 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39600 reload_reg_rtx: (reg:V2DI 22 xmm1)
39602 Which isn't going to work since SSE instructions can't handle scalar
39603 additions. Returning GENERAL_REGS forces the addition into integer
39604 register and reload can handle subsequent reloads without problems. */
39606 if (in_p && GET_CODE (x) == PLUS
39607 && SSE_CLASS_P (rclass)
39608 && SCALAR_INT_MODE_P (mode))
39609 return GENERAL_REGS;
39611 return NO_REGS;
39614 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39616 static bool
39617 ix86_class_likely_spilled_p (reg_class_t rclass)
39619 switch (rclass)
39621 case AREG:
39622 case DREG:
39623 case CREG:
39624 case BREG:
39625 case AD_REGS:
39626 case SIREG:
39627 case DIREG:
39628 case SSE_FIRST_REG:
39629 case FP_TOP_REG:
39630 case FP_SECOND_REG:
39631 case BND_REGS:
39632 return true;
39634 default:
39635 break;
39638 return false;
39641 /* If we are copying between registers from different register sets
39642 (e.g. FP and integer), we may need a memory location.
39644 The function can't work reliably when one of the CLASSES is a class
39645 containing registers from multiple sets. We avoid this by never combining
39646 different sets in a single alternative in the machine description.
39647 Ensure that this constraint holds to avoid unexpected surprises.
39649 When STRICT is false, we are being called from REGISTER_MOVE_COST,
39650 so do not enforce these sanity checks.
39652 To optimize register_move_cost performance, define an inline variant. */
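/* For example (illustrative): a TImode move between SSE_REGS and
   GENERAL_REGS needs secondary memory here, since GET_MODE_SIZE (TImode)
   exceeds UNITS_PER_WORD.  */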
39654 static inline bool
39655 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
39656 reg_class_t class2, int strict)
39658 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39659 return false;
39661 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
39662 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
39663 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
39664 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
39665 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
39666 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
39667 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
39668 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
39670 gcc_assert (!strict || lra_in_progress);
39671 return true;
39674 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
39675 return true;
39677 /* Between mask and general, we have moves no larger than word size. */
39678 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
39679 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
39680 return true;
39682 /* ??? This is a lie. We do have moves between mmx/general and between
39683 mmx/sse2, but by saying we need secondary memory we discourage the
39684 register allocator from using the mmx registers unless needed. */
39685 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
39686 return true;
39688 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39690 /* SSE1 doesn't have any direct moves from other classes. */
39691 if (!TARGET_SSE2)
39692 return true;
39694 /* If the target says that inter-unit moves are more expensive
39695 than moving through memory, then don't generate them. */
39696 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
39697 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
39698 return true;
39700 /* Between SSE and general, we have moves no larger than word size. */
39701 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39702 return true;
39705 return false;
39708 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
39710 static bool
39711 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
39712 reg_class_t class2)
39714 return inline_secondary_memory_needed (mode, class1, class2, true);
39717 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
39719 get_secondary_mem widens integral modes to BITS_PER_WORD.
39720 There is no need to emit a full 64-bit move on 64-bit targets
39721 for integral modes that can be moved using a 32-bit move. */
39723 static machine_mode
39724 ix86_secondary_memory_needed_mode (machine_mode mode)
39726 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
39727 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
39728 return mode;
39731 /* Implement the TARGET_CLASS_MAX_NREGS hook.
39733 On the 80386, this is the size of MODE in words,
39734 except in the FP regs, where a single reg is always enough. */
39736 static unsigned char
39737 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
39739 if (MAYBE_INTEGER_CLASS_P (rclass))
39741 if (mode == XFmode)
39742 return (TARGET_64BIT ? 2 : 3);
39743 else if (mode == XCmode)
39744 return (TARGET_64BIT ? 4 : 6);
39745 else
39746 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39748 else
39750 if (COMPLEX_MODE_P (mode))
39751 return 2;
39752 else
39753 return 1;
39757 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
39759 static bool
39760 ix86_can_change_mode_class (machine_mode from, machine_mode to,
39761 reg_class_t regclass)
39763 if (from == to)
39764 return true;
39766 /* x87 registers can't do subreg at all, as all values are reformatted
39767 to extended precision. */
39768 if (MAYBE_FLOAT_CLASS_P (regclass))
39769 return false;
39771 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
39773 /* Vector registers do not support QI or HImode loads. If we don't
39774 disallow a change to these modes, reload will assume it's ok to
39775 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
39776 the vec_dupv4hi pattern. */
39777 if (GET_MODE_SIZE (from) < 4)
39778 return false;
39781 return true;
39784 /* Return index of MODE in the sse load/store tables. */
39786 static inline int
39787 sse_store_index (machine_mode mode)
39789 switch (GET_MODE_SIZE (mode))
39791 case 4:
39792 return 0;
39793 case 8:
39794 return 1;
39795 case 16:
39796 return 2;
39797 case 32:
39798 return 3;
39799 case 64:
39800 return 4;
39801 default:
39802 return -1;
39806 /* Return the cost of moving data of mode M between a
39807 register and memory. A value of 2 is the default; this cost is
39808 relative to those in `REGISTER_MOVE_COST'.
39810 This function is used extensively by register_move_cost, which is used
39811 to build tables at startup. Make it inline in this case.
39812 When IN is 2, return the maximum of the in and out move costs.
39814 If moving between registers and memory is more expensive than
39815 between two registers, you should define this macro to express the
39816 relative cost.
39818 Also model the increased cost of moving QImode registers in
39819 non-Q_REGS classes. */
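/* For example (illustrative): with IN == 2 and an SFmode value in
   FLOAT_REGS, the code below returns MAX (ix86_cost->fp_load[0],
   ix86_cost->fp_store[0]).  */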
39821 static inline int
39822 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
39823 int in)
39825 int cost;
39826 if (FLOAT_CLASS_P (regclass))
39828 int index;
39829 switch (mode)
39831 case E_SFmode:
39832 index = 0;
39833 break;
39834 case E_DFmode:
39835 index = 1;
39836 break;
39837 case E_XFmode:
39838 index = 2;
39839 break;
39840 default:
39841 return 100;
39843 if (in == 2)
39844 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
39845 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
39847 if (SSE_CLASS_P (regclass))
39849 int index = sse_store_index (mode);
39850 if (index == -1)
39851 return 100;
39852 if (in == 2)
39853 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
39854 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
39856 if (MMX_CLASS_P (regclass))
39858 int index;
39859 switch (GET_MODE_SIZE (mode))
39861 case 4:
39862 index = 0;
39863 break;
39864 case 8:
39865 index = 1;
39866 break;
39867 default:
39868 return 100;
39870 if (in == 2)
39871 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39872 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39874 switch (GET_MODE_SIZE (mode))
39876 case 1:
39877 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39879 if (!in)
39880 return ix86_cost->int_store[0];
39881 if (TARGET_PARTIAL_REG_DEPENDENCY
39882 && optimize_function_for_speed_p (cfun))
39883 cost = ix86_cost->movzbl_load;
39884 else
39885 cost = ix86_cost->int_load[0];
39886 if (in == 2)
39887 return MAX (cost, ix86_cost->int_store[0]);
39888 return cost;
39890 else
39892 if (in == 2)
39893 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39894 if (in)
39895 return ix86_cost->movzbl_load;
39896 else
39897 return ix86_cost->int_store[0] + 4;
39899 break;
39900 case 2:
39901 if (in == 2)
39902 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39903 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39904 default:
39905 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
39906 if (mode == TFmode)
39907 mode = XFmode;
39908 if (in == 2)
39909 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39910 else if (in)
39911 cost = ix86_cost->int_load[2];
39912 else
39913 cost = ix86_cost->int_store[2];
39914 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
39918 static int
39919 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39920 bool in)
39922 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39926 /* Return the cost of moving data from a register in class CLASS1 to
39927 one in class CLASS2.
39929 It is not required that the cost always equal 2 when FROM is the same as TO;
39930 on some machines it is expensive to move between registers if they are not
39931 general registers. */
39933 static int
39934 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39935 reg_class_t class2_i)
39937 enum reg_class class1 = (enum reg_class) class1_i;
39938 enum reg_class class2 = (enum reg_class) class2_i;
39940 /* In case we require secondary memory, compute the cost of the store
39941 followed by the load. To avoid bad register allocation choices, we
39942 need this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39944 if (inline_secondary_memory_needed (mode, class1, class2, false))
39946 int cost = 1;
39948 cost += inline_memory_move_cost (mode, class1, 2);
39949 cost += inline_memory_move_cost (mode, class2, 2);
39951 /* In the case of copying from a general purpose register we may emit
39952 multiple stores followed by a single load, causing a memory size
39953 mismatch stall. Count this as an arbitrarily high cost of 20. */
39954 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
39955 && TARGET_MEMORY_MISMATCH_STALL
39956 && targetm.class_max_nregs (class1, mode)
39957 > targetm.class_max_nregs (class2, mode))
39958 cost += 20;
39960 /* In the case of FP/MMX moves, the registers actually overlap, and we
39961 have to switch modes in order to treat them differently. */
39962 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39963 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39964 cost += 20;
39966 return cost;
39969 /* Moves between SSE/MMX and integer unit are expensive. */
39970 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39971 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39973 /* ??? By keeping the returned value relatively high, we limit the number
39974 of moves between integer and MMX/SSE registers for all targets.
39975 Additionally, a high value prevents a problem with x86_modes_tieable_p (),
39976 where integer modes in MMX/SSE registers are not tieable
39977 because of missing QImode and HImode moves to, from or between
39978 MMX/SSE registers. */
39979 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
39980 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
39982 if (MAYBE_FLOAT_CLASS_P (class1))
39983 return ix86_cost->fp_move;
39984 if (MAYBE_SSE_CLASS_P (class1))
39986 if (GET_MODE_BITSIZE (mode) <= 128)
39987 return ix86_cost->xmm_move;
39988 if (GET_MODE_BITSIZE (mode) <= 256)
39989 return ix86_cost->ymm_move;
39990 return ix86_cost->zmm_move;
39992 if (MAYBE_MMX_CLASS_P (class1))
39993 return ix86_cost->mmx_move;
39994 return 2;
39997 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
39998 words of a value of mode MODE but can be less for certain modes in
39999 special long registers.
40001 Actually there are no two word move instructions for consecutive
40002 registers. And only registers 0-3 may have mov byte instructions
40003 applied to them. */
40005 static unsigned int
40006 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
40008 if (GENERAL_REGNO_P (regno))
40010 if (mode == XFmode)
40011 return TARGET_64BIT ? 2 : 3;
40012 if (mode == XCmode)
40013 return TARGET_64BIT ? 4 : 6;
40014 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40016 if (COMPLEX_MODE_P (mode))
40017 return 2;
40018 if (mode == V64SFmode || mode == V64SImode)
40019 return 4;
40020 return 1;
40023 /* Implement TARGET_HARD_REGNO_MODE_OK. */
40025 static bool
40026 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
40028 /* Flags and only flags can hold CCmode values. */
40029 if (CC_REGNO_P (regno))
40030 return GET_MODE_CLASS (mode) == MODE_CC;
40031 if (GET_MODE_CLASS (mode) == MODE_CC
40032 || GET_MODE_CLASS (mode) == MODE_RANDOM
40033 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40034 return false;
40035 if (STACK_REGNO_P (regno))
40036 return VALID_FP_MODE_P (mode);
40037 if (MASK_REGNO_P (regno))
40038 return (VALID_MASK_REG_MODE (mode)
40039 || (TARGET_AVX512BW
40040 && VALID_MASK_AVX512BW_MODE (mode)));
40041 if (BND_REGNO_P (regno))
40042 return VALID_BND_REG_MODE (mode);
40043 if (SSE_REGNO_P (regno))
40045 /* We implement the move patterns for all vector modes into and
40046 out of SSE registers, even when no operation instructions
40047 are available. */
40049 /* For AVX-512 we allow, regardless of regno:
40050 - XI mode
40051 - any of 512-bit wide vector mode
40052 - any scalar mode. */
40053 if (TARGET_AVX512F
40054 && (mode == XImode
40055 || VALID_AVX512F_REG_MODE (mode)
40056 || VALID_AVX512F_SCALAR_MODE (mode)))
40057 return true;
40059 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
40060 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40061 && MOD4_SSE_REGNO_P (regno)
40062 && mode == V64SFmode)
40063 return true;
40065 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
40066 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40067 && MOD4_SSE_REGNO_P (regno)
40068 && mode == V64SImode)
40069 return true;
40071 /* TODO check for QI/HI scalars. */
40072 /* AVX512VL allows SSE regs 16+ for 128/256 bit modes. */
40073 if (TARGET_AVX512VL
40074 && (mode == OImode
40075 || mode == TImode
40076 || VALID_AVX256_REG_MODE (mode)
40077 || VALID_AVX512VL_128_REG_MODE (mode)))
40078 return true;
40080 /* xmm16-xmm31 are only available for AVX-512. */
40081 if (EXT_REX_SSE_REGNO_P (regno))
40082 return false;
40084 /* OImode and AVX modes are available only when AVX is enabled. */
40085 return ((TARGET_AVX
40086 && VALID_AVX256_REG_OR_OI_MODE (mode))
40087 || VALID_SSE_REG_MODE (mode)
40088 || VALID_SSE2_REG_MODE (mode)
40089 || VALID_MMX_REG_MODE (mode)
40090 || VALID_MMX_REG_MODE_3DNOW (mode));
40092 if (MMX_REGNO_P (regno))
40094 /* We implement the move patterns for 3DNOW modes even in MMX mode,
40095 so if the register is available at all, then we can move data of
40096 the given mode into or out of it. */
40097 return (VALID_MMX_REG_MODE (mode)
40098 || VALID_MMX_REG_MODE_3DNOW (mode));
40101 if (mode == QImode)
40103 /* Take care for QImode values - they can be in non-QI regs,
40104 but then they do cause partial register stalls. */
40105 if (ANY_QI_REGNO_P (regno))
40106 return true;
40107 if (!TARGET_PARTIAL_REG_STALL)
40108 return true;
40109 /* LRA checks if the hard register is OK for the given mode.
40110 QImode values can live in non-QI regs, so we allow all
40111 registers here. */
40112 if (lra_in_progress)
40113 return true;
40114 return !can_create_pseudo_p ();
40116 /* We handle both integer and floats in the general purpose registers. */
40117 else if (VALID_INT_MODE_P (mode))
40118 return true;
40119 else if (VALID_FP_MODE_P (mode))
40120 return true;
40121 else if (VALID_DFP_MODE_P (mode))
40122 return true;
40123 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
40124 on to use that value in smaller contexts, this can easily force a
40125 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
40126 supporting DImode, allow it. */
40127 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40128 return true;
40130 return false;
40133 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
40134 saves SSE registers across calls is Win64 (thus no need to check the
40135 current ABI here), and with AVX enabled Win64 only guarantees that
40136 the low 16 bytes are saved. */
40138 static bool
40139 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
40141 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
40144 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
40145 tieable integer mode. */
40147 static bool
40148 ix86_tieable_integer_mode_p (machine_mode mode)
40150 switch (mode)
40152 case E_HImode:
40153 case E_SImode:
40154 return true;
40156 case E_QImode:
40157 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40159 case E_DImode:
40160 return TARGET_64BIT;
40162 default:
40163 return false;
40167 /* Implement TARGET_MODES_TIEABLE_P.
40169 Return true if MODE1 is accessible in a register that can hold MODE2
40170 without copying. That is, all register classes that can hold MODE2
40171 can also hold MODE1. */
40173 static bool
40174 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40176 if (mode1 == mode2)
40177 return true;
40179 if (ix86_tieable_integer_mode_p (mode1)
40180 && ix86_tieable_integer_mode_p (mode2))
40181 return true;
40183 /* MODE2 being XFmode implies fp stack or general regs, which means we
40184 can tie any smaller floating point modes to it. Note that we do not
40185 tie this with TFmode. */
40186 if (mode2 == XFmode)
40187 return mode1 == SFmode || mode1 == DFmode;
40189 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40190 that we can tie it with SFmode. */
40191 if (mode2 == DFmode)
40192 return mode1 == SFmode;
40194 /* If MODE2 is only appropriate for an SSE register, then tie with
40195 any other mode acceptable to SSE registers. */
40196 if (GET_MODE_SIZE (mode2) == 32
40197 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40198 return (GET_MODE_SIZE (mode1) == 32
40199 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40200 if (GET_MODE_SIZE (mode2) == 16
40201 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40202 return (GET_MODE_SIZE (mode1) == 16
40203 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40205 /* If MODE2 is appropriate for an MMX register, then tie
40206 with any other mode acceptable to MMX registers. */
40207 if (GET_MODE_SIZE (mode2) == 8
40208 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40209 return (GET_MODE_SIZE (mode1) == 8
40210 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40212 return false;
40215 /* Return the cost of moving between two registers of mode MODE. */
40217 static int
40218 ix86_set_reg_reg_cost (machine_mode mode)
40220 unsigned int units = UNITS_PER_WORD;
40222 switch (GET_MODE_CLASS (mode))
40224 default:
40225 break;
40227 case MODE_CC:
40228 units = GET_MODE_SIZE (CCmode);
40229 break;
40231 case MODE_FLOAT:
40232 if ((TARGET_SSE && mode == TFmode)
40233 || (TARGET_80387 && mode == XFmode)
40234 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40235 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40236 units = GET_MODE_SIZE (mode);
40237 break;
40239 case MODE_COMPLEX_FLOAT:
40240 if ((TARGET_SSE && mode == TCmode)
40241 || (TARGET_80387 && mode == XCmode)
40242 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40243 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40244 units = GET_MODE_SIZE (mode);
40245 break;
40247 case MODE_VECTOR_INT:
40248 case MODE_VECTOR_FLOAT:
40249 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40250 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40251 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40252 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40253 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40254 units = GET_MODE_SIZE (mode);
40257 /* Return the cost of moving between two registers of mode MODE,
40258 assuming that the move will be in pieces of at most UNITS bytes. */
40259 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
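/* For example, on a 64-bit target a TImode register-register set falls into
   the default case above, so UNITS stays at UNITS_PER_WORD (8 bytes) and the
   move is billed as CEIL (16, 8) = 2 pieces, i.e. COSTS_N_INSNS (2).  */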
40262 /* Return the cost of a vector operation in MODE given that the scalar
40263 version has cost COST. If PARALLEL is true, assume that the CPU has
40264 more than one unit performing the operation. */
40266 static int
40267 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
40269 if (!VECTOR_MODE_P (mode))
40270 return cost;
40272 if (!parallel)
40273 return cost * GET_MODE_NUNITS (mode);
40274 if (GET_MODE_BITSIZE (mode) == 128
40275 && TARGET_SSE_SPLIT_REGS)
40276 return cost * 2;
40277 if (GET_MODE_BITSIZE (mode) > 128
40278 && TARGET_AVX128_OPTIMAL)
40279 return cost * GET_MODE_BITSIZE (mode) / 128;
40280 return cost;
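/* For example, with a hypothetical scalar cost of 2, a 256-bit vector
   operation is billed as 2 on a generic AVX target, as 2 * 256 / 128 = 4
   when TARGET_AVX128_OPTIMAL splits it into two 128-bit halves, and as
   2 * GET_MODE_NUNITS (mode) when PARALLEL is false and the operation is
   emulated element by element.  */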
40283 /* Return cost of multiplication in MODE. */
40285 static int
40286 ix86_multiplication_cost (const struct processor_costs *cost,
40287 enum machine_mode mode)
40289 machine_mode inner_mode = mode;
40290 if (VECTOR_MODE_P (mode))
40291 inner_mode = GET_MODE_INNER (mode);
40293 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40294 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
40295 else if (X87_FLOAT_MODE_P (mode))
40296 return cost->fmul;
40297 else if (FLOAT_MODE_P (mode))
40298 return ix86_vec_cost (mode,
40299 inner_mode == DFmode
40300 ? cost->mulsd : cost->mulss, true);
40301 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40303 /* V*QImode is emulated with 7-13 insns. */
40304 if (mode == V16QImode || mode == V32QImode)
40306 int extra = 11;
40307 if (TARGET_XOP && mode == V16QImode)
40308 extra = 5;
40309 else if (TARGET_SSSE3)
40310 extra = 6;
40311 return ix86_vec_cost (mode,
40312 cost->mulss * 2 + cost->sse_op * extra,
40313 true);
40315 /* V*DImode is emulated with 5-8 insns. */
40316 else if (mode == V2DImode || mode == V4DImode)
40318 if (TARGET_XOP && mode == V2DImode)
40319 return ix86_vec_cost (mode,
40320 cost->mulss * 2 + cost->sse_op * 3,
40321 true);
40322 else
40323 return ix86_vec_cost (mode,
40324 cost->mulss * 3 + cost->sse_op * 5,
40325 true);
40327 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40328 insns, including two PMULUDQ. */
40329 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40330 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
40331 true);
40332 else
40333 return ix86_vec_cost (mode, cost->mulss, true);
40335 else
40336 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
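/* For example, a plain SImode multiply takes the scalar path above and is
   charged mult_init[MODE_INDEX (SImode)] + 7 * mult_bit; the MULT case of
   ix86_rtx_costs below refines the factor of 7 when the multiplier is a
   compile-time constant by counting its set bits.  */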
40339 /* Return cost of division in MODE. */
40341 static int
40342 ix86_division_cost (const struct processor_costs *cost,
40343 enum machine_mode mode)
40345 machine_mode inner_mode = mode;
40346 if (VECTOR_MODE_P (mode))
40347 inner_mode = GET_MODE_INNER (mode);
40349 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40350 return inner_mode == DFmode ? cost->divsd : cost->divss;
40351 else if (X87_FLOAT_MODE_P (mode))
40352 return cost->fdiv;
40353 else if (FLOAT_MODE_P (mode))
40354 return ix86_vec_cost (mode,
40355 inner_mode == DFmode ? cost->divsd : cost->divss,
40356 true);
40357 else
40358 return cost->divide[MODE_INDEX (mode)];
40361 /* Return the cost of a shift in MODE.
40362 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
40363 AND_IN_OP1 specifies whether op1 is the result of an AND, and
40364 SHIFT_AND_TRUNCATE whether op1 is a result of a subreg.
40366 SKIP_OP0/1 is set to true if the cost of OP0/1 should be ignored. */
40368 static int
40369 ix86_shift_rotate_cost (const struct processor_costs *cost,
40370 enum machine_mode mode, bool constant_op1,
40371 HOST_WIDE_INT op1_val,
40372 bool speed,
40373 bool and_in_op1,
40374 bool shift_and_truncate,
40375 bool *skip_op0, bool *skip_op1)
40377 if (skip_op0)
40378 *skip_op0 = *skip_op1 = false;
40379 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40381 /* V*QImode is emulated with 1-11 insns. */
40382 if (mode == V16QImode || mode == V32QImode)
40384 int count = 11;
40385 if (TARGET_XOP && mode == V16QImode)
40387 /* For XOP we use vpshab, which requires a broadcast of the
40388 value to the variable shift insn. For constants this
40389 means a V16Q const in mem; even when we can perform the
40390 shift with one insn set the cost to prefer paddb. */
40391 if (constant_op1)
40393 if (skip_op1)
40394 *skip_op1 = true;
40395 return ix86_vec_cost (mode,
40396 cost->sse_op
40397 + (speed
40399 : COSTS_N_BYTES
40400 (GET_MODE_UNIT_SIZE (mode))), true);
40402 count = 3;
40404 else if (TARGET_SSSE3)
40405 count = 7;
40406 return ix86_vec_cost (mode, cost->sse_op * count, true);
40408 else
40409 return ix86_vec_cost (mode, cost->sse_op, true);
40411 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40413 if (constant_op1)
40415 if (op1_val > 32)
40416 return cost->shift_const + COSTS_N_INSNS (2);
40417 else
40418 return cost->shift_const * 2;
40420 else
40422 if (and_in_op1)
40423 return cost->shift_var * 2;
40424 else
40425 return cost->shift_var * 6 + COSTS_N_INSNS (2);
40428 else
40430 if (constant_op1)
40431 return cost->shift_const;
40432 else if (shift_and_truncate)
40434 if (skip_op0)
40435 *skip_op0 = *skip_op1 = true;
40436 /* Return the cost after shift-and truncation. */
40437 return cost->shift_var;
40439 else
40440 return cost->shift_var;
40442 return cost->shift_const;
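/* For example, on a 32-bit target a DImode shift is wider than
   UNITS_PER_WORD, so a constant shift count of at most 32 is billed as
   shift_const * 2, a larger constant as shift_const + COSTS_N_INSNS (2),
   and a variable count that is not masked by an AND as
   shift_var * 6 + COSTS_N_INSNS (2).  */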
40445 /* Compute a (partial) cost for rtx X. Return true if the complete
40446 cost has been computed, and false if subexpressions should be
40447 scanned. In either case, *TOTAL contains the cost result. */
40449 static bool
40450 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40451 int *total, bool speed)
40453 rtx mask;
40454 enum rtx_code code = GET_CODE (x);
40455 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40456 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40457 int src_cost;
40459 switch (code)
40461 case SET:
40462 if (register_operand (SET_DEST (x), VOIDmode)
40463 && reg_or_0_operand (SET_SRC (x), VOIDmode))
40465 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40466 return true;
40469 if (register_operand (SET_SRC (x), VOIDmode))
40470 /* Avoid potentially incorrect high cost from rtx_costs
40471 for non-tieable SUBREGs. */
40472 src_cost = 0;
40473 else
40475 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40477 if (CONSTANT_P (SET_SRC (x)))
40478 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40479 a small value, possibly zero for cheap constants. */
40480 src_cost += COSTS_N_INSNS (1);
40483 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40484 return true;
40486 case CONST_INT:
40487 case CONST:
40488 case LABEL_REF:
40489 case SYMBOL_REF:
40490 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40491 *total = 3;
40492 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40493 *total = 2;
40494 else if (flag_pic && SYMBOLIC_CONST (x)
40495 && !(TARGET_64BIT
40496 && (GET_CODE (x) == LABEL_REF
40497 || (GET_CODE (x) == SYMBOL_REF
40498 && SYMBOL_REF_LOCAL_P (x))))
40499 /* Use 0 cost for CONST to improve its propagation. */
40500 && (TARGET_64BIT || GET_CODE (x) != CONST))
40501 *total = 1;
40502 else
40503 *total = 0;
40504 return true;
40506 case CONST_DOUBLE:
40507 if (IS_STACK_MODE (mode))
40508 switch (standard_80387_constant_p (x))
40510 case -1:
40511 case 0:
40512 break;
40513 case 1: /* 0.0 */
40514 *total = 1;
40515 return true;
40516 default: /* Other constants */
40517 *total = 2;
40518 return true;
40520 /* FALLTHRU */
40522 case CONST_VECTOR:
40523 switch (standard_sse_constant_p (x, mode))
40525 case 0:
40526 break;
40527 case 1: /* 0: xor eliminates false dependency */
40528 *total = 0;
40529 return true;
40530 default: /* -1: cmp contains false dependency */
40531 *total = 1;
40532 return true;
40534 /* FALLTHRU */
40536 case CONST_WIDE_INT:
40537 /* Fall back to (MEM (SYMBOL_REF)), since that's where
40538 it'll probably end up. Add a penalty for size. */
40539 *total = (COSTS_N_INSNS (1)
40540 + (!TARGET_64BIT && flag_pic)
40541 + (GET_MODE_SIZE (mode) <= 4
40542 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40543 return true;
40545 case ZERO_EXTEND:
40546 /* The zero extension is often completely free on x86_64, so make
40547 it as cheap as possible. */
40548 if (TARGET_64BIT && mode == DImode
40549 && GET_MODE (XEXP (x, 0)) == SImode)
40550 *total = 1;
40551 else if (TARGET_ZERO_EXTEND_WITH_AND)
40552 *total = cost->add;
40553 else
40554 *total = cost->movzx;
40555 return false;
40557 case SIGN_EXTEND:
40558 *total = cost->movsx;
40559 return false;
40561 case ASHIFT:
40562 if (SCALAR_INT_MODE_P (mode)
40563 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40564 && CONST_INT_P (XEXP (x, 1)))
40566 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40567 if (value == 1)
40569 *total = cost->add;
40570 return false;
40572 if ((value == 2 || value == 3)
40573 && cost->lea <= cost->shift_const)
40575 *total = cost->lea;
40576 return false;
40579 /* FALLTHRU */
40581 case ROTATE:
40582 case ASHIFTRT:
40583 case LSHIFTRT:
40584 case ROTATERT:
40585 bool skip_op0, skip_op1;
40586 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
40587 CONST_INT_P (XEXP (x, 1))
40588 ? INTVAL (XEXP (x, 1)) : -1,
40589 speed,
40590 GET_CODE (XEXP (x, 1)) == AND,
40591 SUBREG_P (XEXP (x, 1))
40592 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
40593 &skip_op0, &skip_op1);
40594 if (skip_op0 || skip_op1)
40596 if (!skip_op0)
40597 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
40598 if (!skip_op1)
40599 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
40600 return true;
40602 return false;
40604 case FMA:
40606 rtx sub;
40608 gcc_assert (FLOAT_MODE_P (mode));
40609 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40611 *total = ix86_vec_cost (mode,
40612 mode == SFmode ? cost->fmass : cost->fmasd,
40613 true);
40614 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40616 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
40617 sub = XEXP (x, 0);
40618 if (GET_CODE (sub) == NEG)
40619 sub = XEXP (sub, 0);
40620 *total += rtx_cost (sub, mode, FMA, 0, speed);
40622 sub = XEXP (x, 2);
40623 if (GET_CODE (sub) == NEG)
40624 sub = XEXP (sub, 0);
40625 *total += rtx_cost (sub, mode, FMA, 2, speed);
40626 return true;
40629 case MULT:
40630 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
40632 rtx op0 = XEXP (x, 0);
40633 rtx op1 = XEXP (x, 1);
40634 int nbits;
40635 if (CONST_INT_P (XEXP (x, 1)))
40637 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40638 for (nbits = 0; value != 0; value &= value - 1)
40639 nbits++;
40641 else
40642 /* This is arbitrary. */
40643 nbits = 7;
40645 /* Compute costs correctly for widening multiplication. */
40646 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40647 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40648 == GET_MODE_SIZE (mode))
40650 int is_mulwiden = 0;
40651 machine_mode inner_mode = GET_MODE (op0);
40653 if (GET_CODE (op0) == GET_CODE (op1))
40654 is_mulwiden = 1, op1 = XEXP (op1, 0);
40655 else if (CONST_INT_P (op1))
40657 if (GET_CODE (op0) == SIGN_EXTEND)
40658 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
40659 == INTVAL (op1);
40660 else
40661 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
40664 if (is_mulwiden)
40665 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
40668 *total = (cost->mult_init[MODE_INDEX (mode)]
40669 + nbits * cost->mult_bit
40670 + rtx_cost (op0, mode, outer_code, opno, speed)
40671 + rtx_cost (op1, mode, outer_code, opno, speed));
40673 return true;
40675 *total = ix86_multiplication_cost (cost, mode);
40676 return false;
40678 case DIV:
40679 case UDIV:
40680 case MOD:
40681 case UMOD:
40682 *total = ix86_division_cost (cost, mode);
40683 return false;
40685 case PLUS:
40686 if (GET_MODE_CLASS (mode) == MODE_INT
40687 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40689 if (GET_CODE (XEXP (x, 0)) == PLUS
40690 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40691 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40692 && CONSTANT_P (XEXP (x, 1)))
40694 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40695 if (val == 2 || val == 4 || val == 8)
40697 *total = cost->lea;
40698 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40699 outer_code, opno, speed);
40700 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40701 outer_code, opno, speed);
40702 *total += rtx_cost (XEXP (x, 1), mode,
40703 outer_code, opno, speed);
40704 return true;
40707 else if (GET_CODE (XEXP (x, 0)) == MULT
40708 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40710 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40711 if (val == 2 || val == 4 || val == 8)
40713 *total = cost->lea;
40714 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40715 outer_code, opno, speed);
40716 *total += rtx_cost (XEXP (x, 1), mode,
40717 outer_code, opno, speed);
40718 return true;
40721 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40723 /* Add with carry, ignore the cost of adding a carry flag. */
40724 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
40725 *total = cost->add;
40726 else
40728 *total = cost->lea;
40729 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40730 outer_code, opno, speed);
40733 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40734 outer_code, opno, speed);
40735 *total += rtx_cost (XEXP (x, 1), mode,
40736 outer_code, opno, speed);
40737 return true;
40740 /* FALLTHRU */
40742 case MINUS:
40743 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
40744 if (GET_MODE_CLASS (mode) == MODE_INT
40745 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
40746 && GET_CODE (XEXP (x, 0)) == MINUS
40747 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
40749 *total = cost->add;
40750 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40751 outer_code, opno, speed);
40752 *total += rtx_cost (XEXP (x, 1), mode,
40753 outer_code, opno, speed);
40754 return true;
40757 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40759 *total = cost->addss;
40760 return false;
40762 else if (X87_FLOAT_MODE_P (mode))
40764 *total = cost->fadd;
40765 return false;
40767 else if (FLOAT_MODE_P (mode))
40769 *total = ix86_vec_cost (mode, cost->addss, true);
40770 return false;
40772 /* FALLTHRU */
40774 case AND:
40775 case IOR:
40776 case XOR:
40777 if (GET_MODE_CLASS (mode) == MODE_INT
40778 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40780 *total = (cost->add * 2
40781 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
40782 << (GET_MODE (XEXP (x, 0)) != DImode))
40783 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
40784 << (GET_MODE (XEXP (x, 1)) != DImode)));
40785 return true;
40787 /* FALLTHRU */
40789 case NEG:
40790 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40792 *total = cost->sse_op;
40793 return false;
40795 else if (X87_FLOAT_MODE_P (mode))
40797 *total = cost->fchs;
40798 return false;
40800 else if (FLOAT_MODE_P (mode))
40802 *total = ix86_vec_cost (mode, cost->sse_op, true);
40803 return false;
40805 /* FALLTHRU */
40807 case NOT:
40808 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40809 *total = ix86_vec_cost (mode, cost->sse_op, true);
40810 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40811 *total = cost->add * 2;
40812 else
40813 *total = cost->add;
40814 return false;
40816 case COMPARE:
40817 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
40818 && XEXP (XEXP (x, 0), 1) == const1_rtx
40819 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
40820 && XEXP (x, 1) == const0_rtx)
40822 /* This kind of construct is implemented using test[bwl].
40823 Treat it as if we had an AND. */
40824 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
40825 *total = (cost->add
40826 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
40827 opno, speed)
40828 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
40829 return true;
40832 /* The embedded comparison operand is completely free. */
40833 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
40834 && XEXP (x, 1) == const0_rtx)
40835 *total = 0;
40837 return false;
40839 case FLOAT_EXTEND:
40840 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40841 *total = 0;
40842 else
40843 *total = ix86_vec_cost (mode, cost->addss, true);
40844 return false;
40846 case FLOAT_TRUNCATE:
40847 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
40848 *total = cost->fadd;
40849 else
40850 *total = ix86_vec_cost (mode, cost->addss, true);
40851 return false;
40853 case ABS:
40854 /* SSE requires a memory load for the constant operand. It may make
40855 sense to account for this. Of course the constant operand may or
40856 may not be reused. */
40857 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40858 *total = cost->sse_op;
40859 else if (X87_FLOAT_MODE_P (mode))
40860 *total = cost->fabs;
40861 else if (FLOAT_MODE_P (mode))
40862 *total = ix86_vec_cost (mode, cost->sse_op, true);
40863 return false;
40865 case SQRT:
40866 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40867 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
40868 else if (X87_FLOAT_MODE_P (mode))
40869 *total = cost->fsqrt;
40870 else if (FLOAT_MODE_P (mode))
40871 *total = ix86_vec_cost (mode,
40872 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
40873 true);
40874 return false;
40876 case UNSPEC:
40877 if (XINT (x, 1) == UNSPEC_TP)
40878 *total = 0;
40879 return false;
40881 case VEC_SELECT:
40882 case VEC_CONCAT:
40883 case VEC_DUPLICATE:
40884 /* ??? Assume all of these vector manipulation patterns are
40885 recognizable, in which case they all pretty much have the
40886 same cost. */
40887 *total = cost->sse_op;
40888 return true;
40889 case VEC_MERGE:
40890 mask = XEXP (x, 2);
40891 /* This is a masked instruction; assume the same cost
40892 as the non-masked variant. */
40893 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40894 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40895 else
40896 *total = cost->sse_op;
40897 return true;
40899 default:
40900 return false;
40904 #if TARGET_MACHO
40906 static int current_machopic_label_num;
40908 /* Given a symbol name and its associated stub, write out the
40909 definition of the stub. */
40911 void
40912 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40914 unsigned int length;
40915 char *binder_name, *symbol_name, lazy_ptr_name[32];
40916 int label = ++current_machopic_label_num;
40918 /* For 64-bit we shouldn't get here. */
40919 gcc_assert (!TARGET_64BIT);
40921 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40922 symb = targetm.strip_name_encoding (symb);
40924 length = strlen (stub);
40925 binder_name = XALLOCAVEC (char, length + 32);
40926 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40928 length = strlen (symb);
40929 symbol_name = XALLOCAVEC (char, length + 32);
40930 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40932 sprintf (lazy_ptr_name, "L%d$lz", label);
40934 if (MACHOPIC_ATT_STUB)
40935 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40936 else if (MACHOPIC_PURE)
40937 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40938 else
40939 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40941 fprintf (file, "%s:\n", stub);
40942 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40944 if (MACHOPIC_ATT_STUB)
40946 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40948 else if (MACHOPIC_PURE)
40950 /* PIC stub. */
40951 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40952 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40953 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40954 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40955 label, lazy_ptr_name, label);
40956 fprintf (file, "\tjmp\t*%%ecx\n");
40958 else
40959 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40961 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40962 it needs no stub-binding-helper. */
40963 if (MACHOPIC_ATT_STUB)
40964 return;
40966 fprintf (file, "%s:\n", binder_name);
40968 if (MACHOPIC_PURE)
40970 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40971 fprintf (file, "\tpushl\t%%ecx\n");
40973 else
40974 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40976 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40978 /* N.B. Keep the correspondence of these
40979 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40980 old-pic/new-pic/non-pic stubs; altering this will break
40981 compatibility with existing dylibs. */
40982 if (MACHOPIC_PURE)
40984 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40985 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40987 else
40988 /* 16-byte -mdynamic-no-pic stub. */
40989 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr3_section]);
40991 fprintf (file, "%s:\n", lazy_ptr_name);
40992 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40993 fprintf (file, ASM_LONG "%s\n", binder_name);
40995 #endif /* TARGET_MACHO */
40997 /* Order the registers for register allocator. */
40999 void
41000 x86_order_regs_for_local_alloc (void)
41002 int pos = 0;
41003 int i;
41005 /* First allocate the local general purpose registers. */
41006 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41007 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41008 reg_alloc_order [pos++] = i;
41010 /* Global general purpose registers. */
41011 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41012 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41013 reg_alloc_order [pos++] = i;
41015 /* x87 registers come first in case we are doing FP math
41016 using them. */
41017 if (!TARGET_SSE_MATH)
41018 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41019 reg_alloc_order [pos++] = i;
41021 /* SSE registers. */
41022 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41023 reg_alloc_order [pos++] = i;
41024 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41025 reg_alloc_order [pos++] = i;
41027 /* Extended REX SSE registers. */
41028 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41029 reg_alloc_order [pos++] = i;
41031 /* Mask registers. */
41032 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41033 reg_alloc_order [pos++] = i;
41035 /* MPX bound registers. */
41036 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
41037 reg_alloc_order [pos++] = i;
41039 /* x87 registers. */
41040 if (TARGET_SSE_MATH)
41041 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41042 reg_alloc_order [pos++] = i;
41044 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41045 reg_alloc_order [pos++] = i;
41047 /* Initialize the rest of the array, as we do not allocate some registers
41048 at all. */
41049 while (pos < FIRST_PSEUDO_REGISTER)
41050 reg_alloc_order [pos++] = 0;
41053 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41054 in struct attribute_spec handler. */
41055 static tree
41056 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
41057 bool *no_add_attrs)
41059 if (TREE_CODE (*node) != FUNCTION_TYPE
41060 && TREE_CODE (*node) != METHOD_TYPE
41061 && TREE_CODE (*node) != FIELD_DECL
41062 && TREE_CODE (*node) != TYPE_DECL)
41064 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41065 name);
41066 *no_add_attrs = true;
41067 return NULL_TREE;
41069 if (TARGET_64BIT)
41071 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41072 name);
41073 *no_add_attrs = true;
41074 return NULL_TREE;
41076 if (is_attribute_p ("callee_pop_aggregate_return", name))
41078 tree cst;
41080 cst = TREE_VALUE (args);
41081 if (TREE_CODE (cst) != INTEGER_CST)
41083 warning (OPT_Wattributes,
41084 "%qE attribute requires an integer constant argument",
41085 name);
41086 *no_add_attrs = true;
41088 else if (compare_tree_int (cst, 0) != 0
41089 && compare_tree_int (cst, 1) != 0)
41091 warning (OPT_Wattributes,
41092 "argument to %qE attribute is neither zero, nor one",
41093 name);
41094 *no_add_attrs = true;
41097 return NULL_TREE;
41100 return NULL_TREE;
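/* For illustration, a hypothetical 32-bit declaration this handler accepts;
   the argument selects whether the callee pops the hidden aggregate-return
   pointer (1) or leaves it to the caller (0):

     struct big { int x[8]; };
     struct big make_big (void)
       __attribute__ ((callee_pop_aggregate_return (1)));

   The type and function names are made up for the example.  */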
41103 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
41104 struct attribute_spec.handler. */
41105 static tree
41106 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41107 bool *no_add_attrs)
41109 if (TREE_CODE (*node) != FUNCTION_TYPE
41110 && TREE_CODE (*node) != METHOD_TYPE
41111 && TREE_CODE (*node) != FIELD_DECL
41112 && TREE_CODE (*node) != TYPE_DECL)
41114 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41115 name);
41116 *no_add_attrs = true;
41117 return NULL_TREE;
41120 /* Can combine regparm with all attributes but fastcall. */
41121 if (is_attribute_p ("ms_abi", name))
41123 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41125 error ("ms_abi and sysv_abi attributes are not compatible");
41128 return NULL_TREE;
41130 else if (is_attribute_p ("sysv_abi", name))
41132 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41134 error ("ms_abi and sysv_abi attributes are not compatible");
41137 return NULL_TREE;
41140 return NULL_TREE;
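/* For illustration, hypothetical prototypes using the two calling-convention
   attributes validated above; combining both on one type is an error:

     int ms_call (int) __attribute__ ((ms_abi));
     int sysv_call (int) __attribute__ ((sysv_abi));

   The function names are made up for the example.  */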
41143 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41144 struct attribute_spec.handler. */
41145 static tree
41146 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41147 bool *no_add_attrs)
41149 tree *type = NULL;
41150 if (DECL_P (*node))
41152 if (TREE_CODE (*node) == TYPE_DECL)
41153 type = &TREE_TYPE (*node);
41155 else
41156 type = node;
41158 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41160 warning (OPT_Wattributes, "%qE attribute ignored",
41161 name);
41162 *no_add_attrs = true;
41165 else if ((is_attribute_p ("ms_struct", name)
41166 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41167 || ((is_attribute_p ("gcc_struct", name)
41168 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41170 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41171 name);
41172 *no_add_attrs = true;
41175 return NULL_TREE;
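/* For illustration, hypothetical record types using the layout attributes
   validated above; "ms_struct" and "gcc_struct" on the same type are
   rejected as incompatible:

     struct __attribute__ ((ms_struct)) ms_layout { int a : 3; char b; };
     struct __attribute__ ((gcc_struct)) gcc_layout { int a : 3; char b; };

   The type names are made up for the example.  */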
41178 static tree
41179 ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int,
41180 bool *no_add_attrs)
41182 if (TREE_CODE (*node) != FUNCTION_DECL)
41184 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41185 name);
41186 *no_add_attrs = true;
41189 if (is_attribute_p ("indirect_branch", name))
41191 tree cst = TREE_VALUE (args);
41192 if (TREE_CODE (cst) != STRING_CST)
41194 warning (OPT_Wattributes,
41195 "%qE attribute requires a string constant argument",
41196 name);
41197 *no_add_attrs = true;
41199 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41200 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41201 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41202 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41204 warning (OPT_Wattributes,
41205 "argument to %qE attribute is not "
41206 "(keep|thunk|thunk-inline|thunk-extern)", name);
41207 *no_add_attrs = true;
41211 if (is_attribute_p ("function_return", name))
41213 tree cst = TREE_VALUE (args);
41214 if (TREE_CODE (cst) != STRING_CST)
41216 warning (OPT_Wattributes,
41217 "%qE attribute requires a string constant argument",
41218 name);
41219 *no_add_attrs = true;
41221 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41222 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41223 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41224 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41226 warning (OPT_Wattributes,
41227 "argument to %qE attribute is not "
41228 "(keep|thunk|thunk-inline|thunk-extern)", name);
41229 *no_add_attrs = true;
41233 return NULL_TREE;
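/* For illustration, hypothetical declarations using the string-valued
   attributes checked above; any value other than "keep", "thunk",
   "thunk-inline" or "thunk-extern" triggers the warnings:

     __attribute__ ((indirect_branch ("thunk")))
     void hardened_indirect_calls (void);

     __attribute__ ((function_return ("thunk-extern")))
     void hardened_return (void);

   The function names are made up for the example.  */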
41236 static tree
41237 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41238 int, bool *)
41240 return NULL_TREE;
41243 static tree
41244 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41246 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
41247 but the function type contains args and return type data. */
41248 tree func_type = *node;
41249 tree return_type = TREE_TYPE (func_type);
41251 int nargs = 0;
41252 tree current_arg_type = TYPE_ARG_TYPES (func_type);
41253 while (current_arg_type
41254 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41256 if (nargs == 0)
41258 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41259 error ("interrupt service routine should have a pointer "
41260 "as the first argument");
41262 else if (nargs == 1)
41264 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41265 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41266 error ("interrupt service routine should have unsigned %s"
41267 "int as the second argument",
41268 TARGET_64BIT
41269 ? (TARGET_X32 ? "long long " : "long ")
41270 : "");
41272 nargs++;
41273 current_arg_type = TREE_CHAIN (current_arg_type);
41275 if (!nargs || nargs > 2)
41276 error ("interrupt service routine can only have a pointer argument "
41277 "and an optional integer argument");
41278 if (! VOID_TYPE_P (return_type))
41279 error ("interrupt service routine can't have non-void return value");
41281 return NULL_TREE;
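/* For illustration, hypothetical interrupt service routines of the shape
   enforced above: a pointer as the first argument, an optional word-mode
   unsigned integer (e.g. unsigned long on LP64) as the second, and a void
   return type:

     struct interrupt_frame;

     __attribute__ ((interrupt))
     void isr (struct interrupt_frame *frame);

     __attribute__ ((interrupt))
     void fault_isr (struct interrupt_frame *frame, unsigned long error_code);

   The frame and function names are made up for the example.  */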
41284 static bool
41285 ix86_ms_bitfield_layout_p (const_tree record_type)
41287 return ((TARGET_MS_BITFIELD_LAYOUT
41288 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41289 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41292 /* Returns an expression indicating where the this parameter is
41293 located on entry to the FUNCTION. */
41295 static rtx
41296 x86_this_parameter (tree function)
41298 tree type = TREE_TYPE (function);
41299 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41300 int nregs;
41302 if (TARGET_64BIT)
41304 const int *parm_regs;
41306 if (ix86_function_type_abi (type) == MS_ABI)
41307 parm_regs = x86_64_ms_abi_int_parameter_registers;
41308 else
41309 parm_regs = x86_64_int_parameter_registers;
41310 return gen_rtx_REG (Pmode, parm_regs[aggr]);
41313 nregs = ix86_function_regparm (type, function);
41315 if (nregs > 0 && !stdarg_p (type))
41317 int regno;
41318 unsigned int ccvt = ix86_get_callcvt (type);
41320 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41321 regno = aggr ? DX_REG : CX_REG;
41322 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41324 regno = CX_REG;
41325 if (aggr)
41326 return gen_rtx_MEM (SImode,
41327 plus_constant (Pmode, stack_pointer_rtx, 4));
41329 else
41331 regno = AX_REG;
41332 if (aggr)
41334 regno = DX_REG;
41335 if (nregs == 1)
41336 return gen_rtx_MEM (SImode,
41337 plus_constant (Pmode,
41338 stack_pointer_rtx, 4));
41341 return gen_rtx_REG (SImode, regno);
41344 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41345 aggr ? 8 : 4));
41348 /* Determine whether x86_output_mi_thunk can succeed. */
41350 static bool
41351 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41352 const_tree function)
41354 /* 64-bit can handle anything. */
41355 if (TARGET_64BIT)
41356 return true;
41358 /* For 32-bit, everything's fine if we have one free register. */
41359 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41360 return true;
41362 /* Need a free register for vcall_offset. */
41363 if (vcall_offset)
41364 return false;
41366 /* Need a free register for GOT references. */
41367 if (flag_pic && !targetm.binds_local_p (function))
41368 return false;
41370 /* Otherwise ok. */
41371 return true;
41374 /* Output the assembler code for a thunk function. THUNK_DECL is the
41375 declaration for the thunk function itself, FUNCTION is the decl for
41376 the target function. DELTA is an immediate constant offset to be
41377 added to THIS. If VCALL_OFFSET is nonzero, the word at
41378 *(*this + vcall_offset) should be added to THIS. */
41380 static void
41381 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41382 HOST_WIDE_INT vcall_offset, tree function)
41384 rtx this_param = x86_this_parameter (function);
41385 rtx this_reg, tmp, fnaddr;
41386 unsigned int tmp_regno;
41387 rtx_insn *insn;
41389 if (TARGET_64BIT)
41390 tmp_regno = R10_REG;
41391 else
41393 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41394 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41395 tmp_regno = AX_REG;
41396 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41397 tmp_regno = DX_REG;
41398 else
41399 tmp_regno = CX_REG;
41402 emit_note (NOTE_INSN_PROLOGUE_END);
41404 /* If CET is enabled, insert an ENDBR instruction. */
41405 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
41406 emit_insn (gen_nop_endbr ());
41408 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
41409 pull it in now and let DELTA benefit. */
41410 if (REG_P (this_param))
41411 this_reg = this_param;
41412 else if (vcall_offset)
41414 /* Put the this parameter into %eax. */
41415 this_reg = gen_rtx_REG (Pmode, AX_REG);
41416 emit_move_insn (this_reg, this_param);
41418 else
41419 this_reg = NULL_RTX;
41421 /* Adjust the this parameter by a fixed constant. */
41422 if (delta)
41424 rtx delta_rtx = GEN_INT (delta);
41425 rtx delta_dst = this_reg ? this_reg : this_param;
41427 if (TARGET_64BIT)
41429 if (!x86_64_general_operand (delta_rtx, Pmode))
41431 tmp = gen_rtx_REG (Pmode, tmp_regno);
41432 emit_move_insn (tmp, delta_rtx);
41433 delta_rtx = tmp;
41437 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41440 /* Adjust the this parameter by a value stored in the vtable. */
41441 if (vcall_offset)
41443 rtx vcall_addr, vcall_mem, this_mem;
41445 tmp = gen_rtx_REG (Pmode, tmp_regno);
41447 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41448 if (Pmode != ptr_mode)
41449 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41450 emit_move_insn (tmp, this_mem);
41452 /* Adjust the this parameter. */
41453 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41454 if (TARGET_64BIT
41455 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41457 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41458 emit_move_insn (tmp2, GEN_INT (vcall_offset));
41459 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41462 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41463 if (Pmode != ptr_mode)
41464 emit_insn (gen_addsi_1_zext (this_reg,
41465 gen_rtx_REG (ptr_mode,
41466 REGNO (this_reg)),
41467 vcall_mem));
41468 else
41469 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41472 /* If necessary, drop THIS back to its stack slot. */
41473 if (this_reg && this_reg != this_param)
41474 emit_move_insn (this_param, this_reg);
41476 fnaddr = XEXP (DECL_RTL (function), 0);
41477 if (TARGET_64BIT)
41479 if (!flag_pic || targetm.binds_local_p (function)
41480 || TARGET_PECOFF)
41482 else
41484 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41485 tmp = gen_rtx_CONST (Pmode, tmp);
41486 fnaddr = gen_const_mem (Pmode, tmp);
41489 else
41491 if (!flag_pic || targetm.binds_local_p (function))
41493 #if TARGET_MACHO
41494 else if (TARGET_MACHO)
41496 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41497 fnaddr = XEXP (fnaddr, 0);
41499 #endif /* TARGET_MACHO */
41500 else
41502 tmp = gen_rtx_REG (Pmode, CX_REG);
41503 output_set_got (tmp, NULL_RTX);
41505 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41506 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41507 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41508 fnaddr = gen_const_mem (Pmode, fnaddr);
41512 /* Our sibling call patterns do not allow memories, because we have no
41513 predicate that can distinguish between frame and non-frame memory.
41514 For our purposes here, we can get away with (ab)using a jump pattern,
41515 because we're going to do no optimization. */
41516 if (MEM_P (fnaddr))
41518 if (sibcall_insn_operand (fnaddr, word_mode))
41520 fnaddr = XEXP (DECL_RTL (function), 0);
41521 tmp = gen_rtx_MEM (QImode, fnaddr);
41522 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41523 tmp = emit_call_insn (tmp);
41524 SIBLING_CALL_P (tmp) = 1;
41526 else
41527 emit_jump_insn (gen_indirect_jump (fnaddr));
41529 else
41531 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41533 // CM_LARGE_PIC always uses pseudo PIC register which is
41534 // uninitialized. Since FUNCTION is local and calling it
41535 // doesn't go through PLT, we use scratch register %r11 as
41536 // PIC register and initialize it here.
41537 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41538 ix86_init_large_pic_reg (tmp_regno);
41539 fnaddr = legitimize_pic_address (fnaddr,
41540 gen_rtx_REG (Pmode, tmp_regno));
41543 if (!sibcall_insn_operand (fnaddr, word_mode))
41545 tmp = gen_rtx_REG (word_mode, tmp_regno);
41546 if (GET_MODE (fnaddr) != word_mode)
41547 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41548 emit_move_insn (tmp, fnaddr);
41549 fnaddr = tmp;
41552 tmp = gen_rtx_MEM (QImode, fnaddr);
41553 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41554 tmp = emit_call_insn (tmp);
41555 SIBLING_CALL_P (tmp) = 1;
41557 emit_barrier ();
41559 /* Emit just enough of rest_of_compilation to get the insns emitted.
41560 Note that use_thunk calls assemble_start_function et al. */
41561 insn = get_insns ();
41562 shorten_branches (insn);
41563 final_start_function (insn, file, 1);
41564 final (insn, file, 1);
41565 final_end_function ();
41568 static void
41569 x86_file_start (void)
41571 default_file_start ();
41572 if (TARGET_16BIT)
41573 fputs ("\t.code16gcc\n", asm_out_file);
41574 #if TARGET_MACHO
41575 darwin_file_start ();
41576 #endif
41577 if (X86_FILE_START_VERSION_DIRECTIVE)
41578 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41579 if (X86_FILE_START_FLTUSED)
41580 fputs ("\t.global\t__fltused\n", asm_out_file);
41581 if (ix86_asm_dialect == ASM_INTEL)
41582 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
41586 x86_field_alignment (tree type, int computed)
41588 machine_mode mode;
41590 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41591 return computed;
41592 if (TARGET_IAMCU)
41593 return iamcu_alignment (type, computed);
41594 mode = TYPE_MODE (strip_array_types (type));
41595 if (mode == DFmode || mode == DCmode
41596 || GET_MODE_CLASS (mode) == MODE_INT
41597 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41598 return MIN (32, computed);
41599 return computed;
41602 /* Print call to TARGET to FILE. */
41604 static void
41605 x86_print_call_or_nop (FILE *file, const char *target)
41607 if (flag_nop_mcount)
41608 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
41609 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
41610 else
41611 fprintf (file, "1:\tcall\t%s\n", target);
41614 /* Output assembler code to FILE to increment profiler label # LABELNO
41615 for profiling a function entry. */
41616 void
41617 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41619 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41620 : MCOUNT_NAME);
41621 if (TARGET_64BIT)
41623 #ifndef NO_PROFILE_COUNTERS
41624 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41625 #endif
41627 if (!TARGET_PECOFF && flag_pic)
41628 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41629 else
41630 x86_print_call_or_nop (file, mcount_name);
41632 else if (flag_pic)
41634 #ifndef NO_PROFILE_COUNTERS
41635 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41636 LPREFIX, labelno);
41637 #endif
41638 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41640 else
41642 #ifndef NO_PROFILE_COUNTERS
41643 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41644 LPREFIX, labelno);
41645 #endif
41646 x86_print_call_or_nop (file, mcount_name);
41649 if (flag_record_mcount)
41651 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
41652 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
41653 fprintf (file, "\t.previous\n");
41657 /* We don't have exact information about the insn sizes, but we may assume
41658 quite safely that we are informed about all 1 byte insns and memory
41659 address sizes. This is enough to eliminate unnecessary padding in
41660 99% of cases. */
41663 ix86_min_insn_size (rtx_insn *insn)
41665 int l = 0, len;
41667 if (!INSN_P (insn) || !active_insn_p (insn))
41668 return 0;
41670 /* Discard alignments we've emitted and jump instructions. */
41671 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
41672 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
41673 return 0;
41675 /* Important case - calls are always 5 bytes.
41676 It is common to have many calls in a row. */
41677 if (CALL_P (insn)
41678 && symbolic_reference_mentioned_p (PATTERN (insn))
41679 && !SIBLING_CALL_P (insn))
41680 return 5;
41681 len = get_attr_length (insn);
41682 if (len <= 1)
41683 return 1;
41685 /* For normal instructions we rely on get_attr_length being exact,
41686 with a few exceptions. */
41687 if (!JUMP_P (insn))
41689 enum attr_type type = get_attr_type (insn);
41691 switch (type)
41693 case TYPE_MULTI:
41694 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
41695 || asm_noperands (PATTERN (insn)) >= 0)
41696 return 0;
41697 break;
41698 case TYPE_OTHER:
41699 case TYPE_FCMP:
41700 break;
41701 default:
41702 /* Otherwise trust get_attr_length. */
41703 return len;
41706 l = get_attr_length_address (insn);
41707 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
41708 l = 4;
41710 if (l)
41711 return 1 + l;
41712 else
41713 return 2;
41716 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41718 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte
41719 window. */
41721 static void
41722 ix86_avoid_jump_mispredicts (void)
41724 rtx_insn *insn, *start = get_insns ();
41725 int nbytes = 0, njumps = 0;
41726 bool isjump = false;
41728 /* Look for all minimal intervals of instructions containing 4 jumps.
41729 The intervals are bounded by START and INSN. NBYTES is the total
41730 size of instructions in the interval including INSN and not including
41731 START. When NBYTES is smaller than 16 bytes, it is possible
41732 that the end of START and INSN ends up in the same 16 byte page.
41734 The smallest offset in the page INSN can start is the case where START
41735 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
41736 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
41738 Don't consider an asm goto as a jump: while it can contain a jump, it
41739 doesn't have to, since control transfer to its label(s) can be performed
41740 through other means; also, we estimate the minimum length of all asm stmts as 0. */
41741 for (insn = start; insn; insn = NEXT_INSN (insn))
41743 int min_size;
41745 if (LABEL_P (insn))
41747 int align = label_to_alignment (insn);
41748 int max_skip = label_to_max_skip (insn);
41750 if (max_skip > 15)
41751 max_skip = 15;
41752 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
41753 already in the current 16 byte page, because otherwise
41754 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
41755 bytes to reach 16 byte boundary. */
41756 if (align <= 0
41757 || (align <= 3 && max_skip != (1 << align) - 1))
41758 max_skip = 0;
41759 if (dump_file)
41760 fprintf (dump_file, "Label %i with max_skip %i\n",
41761 INSN_UID (insn), max_skip);
41762 if (max_skip)
41764 while (nbytes + max_skip >= 16)
41766 start = NEXT_INSN (start);
41767 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41768 || CALL_P (start))
41769 njumps--, isjump = true;
41770 else
41771 isjump = false;
41772 nbytes -= ix86_min_insn_size (start);
41775 continue;
41778 min_size = ix86_min_insn_size (insn);
41779 nbytes += min_size;
41780 if (dump_file)
41781 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41782 INSN_UID (insn), min_size);
41783 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41784 || CALL_P (insn))
41785 njumps++;
41786 else
41787 continue;
41789 while (njumps > 3)
41791 start = NEXT_INSN (start);
41792 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41793 || CALL_P (start))
41794 njumps--, isjump = true;
41795 else
41796 isjump = false;
41797 nbytes -= ix86_min_insn_size (start);
41799 gcc_assert (njumps >= 0);
41800 if (dump_file)
41801 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41802 INSN_UID (start), INSN_UID (insn), nbytes);
41804 if (njumps == 3 && isjump && nbytes < 16)
41806 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
41808 if (dump_file)
41809 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
41810 INSN_UID (insn), padsize);
41811 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
41815 #endif
41817 /* AMD Athlon works faster
41818 when RET is not the destination of a conditional jump or directly preceded
41819 by another jump instruction. We avoid the penalty by inserting a NOP just
41820 before the RET instructions in such cases. */
41821 static void
41822 ix86_pad_returns (void)
41824 edge e;
41825 edge_iterator ei;
41827 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41829 basic_block bb = e->src;
41830 rtx_insn *ret = BB_END (bb);
41831 rtx_insn *prev;
41832 bool replace = false;
41834 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
41835 || optimize_bb_for_size_p (bb))
41836 continue;
41837 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
41838 if (active_insn_p (prev) || LABEL_P (prev))
41839 break;
41840 if (prev && LABEL_P (prev))
41842 edge e;
41843 edge_iterator ei;
41845 FOR_EACH_EDGE (e, ei, bb->preds)
41846 if (EDGE_FREQUENCY (e) && e->src->index >= 0
41847 && !(e->flags & EDGE_FALLTHRU))
41849 replace = true;
41850 break;
41853 if (!replace)
41855 prev = prev_active_insn (ret);
41856 if (prev
41857 && ((JUMP_P (prev) && any_condjump_p (prev))
41858 || CALL_P (prev)))
41859 replace = true;
41860 /* Empty functions get branch mispredict even when
41861 the jump destination is not visible to us. */
41862 if (!prev && !optimize_function_for_size_p (cfun))
41863 replace = true;
41865 if (replace)
41867 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
41868 delete_insn (ret);
41873 /* Count the minimum number of instructions in BB. Return 4 if the
41874 number of instructions >= 4. */
41876 static int
41877 ix86_count_insn_bb (basic_block bb)
41879 rtx_insn *insn;
41880 int insn_count = 0;
41882 /* Count number of instructions in this block. Return 4 if the number
41883 of instructions >= 4. */
41884 FOR_BB_INSNS (bb, insn)
41886 /* This only happens in exit blocks. */
41887 if (JUMP_P (insn)
41888 && ANY_RETURN_P (PATTERN (insn)))
41889 break;
41891 if (NONDEBUG_INSN_P (insn)
41892 && GET_CODE (PATTERN (insn)) != USE
41893 && GET_CODE (PATTERN (insn)) != CLOBBER)
41895 insn_count++;
41896 if (insn_count >= 4)
41897 return insn_count;
41901 return insn_count;
41905 /* Count the minimum number of instructions in code path in BB.
41906 Return 4 if the number of instructions >= 4. */
41908 static int
41909 ix86_count_insn (basic_block bb)
41911 edge e;
41912 edge_iterator ei;
41913 int min_prev_count;
41915 /* Only bother counting instructions along paths with no
41916 more than 2 basic blocks between entry and exit. Given
41917 that BB has an edge to exit, determine if a predecessor
41918 of BB has an edge from entry. If so, compute the number
41919 of instructions in the predecessor block. If there
41920 happen to be multiple such blocks, compute the minimum. */
41921 min_prev_count = 4;
41922 FOR_EACH_EDGE (e, ei, bb->preds)
41924 edge prev_e;
41925 edge_iterator prev_ei;
41927 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41929 min_prev_count = 0;
41930 break;
41932 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41934 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41936 int count = ix86_count_insn_bb (e->src);
41937 if (count < min_prev_count)
41938 min_prev_count = count;
41939 break;
41944 if (min_prev_count < 4)
41945 min_prev_count += ix86_count_insn_bb (bb);
41947 return min_prev_count;
41950 /* Pad short functions to 4 instructions. */
41952 static void
41953 ix86_pad_short_function (void)
41955 edge e;
41956 edge_iterator ei;
41958 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41960 rtx_insn *ret = BB_END (e->src);
41961 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41963 int insn_count = ix86_count_insn (e->src);
41965 /* Pad short function. */
41966 if (insn_count < 4)
41968 rtx_insn *insn = ret;
41970 /* Find epilogue. */
41971 while (insn
41972 && (!NOTE_P (insn)
41973 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41974 insn = PREV_INSN (insn);
41976 if (!insn)
41977 insn = ret;
41979 /* Two NOPs count as one instruction. */
41980 insn_count = 2 * (4 - insn_count);
41981 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41987 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41988 the epilogue, the Windows system unwinder will apply epilogue logic and
41989 produce incorrect offsets. This can be avoided by adding a nop between
41990 the last insn that can throw and the first insn of the epilogue. */
41992 static void
41993 ix86_seh_fixup_eh_fallthru (void)
41995 edge e;
41996 edge_iterator ei;
41998 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42000 rtx_insn *insn, *next;
42002 /* Find the beginning of the epilogue. */
42003 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42004 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42005 break;
42006 if (insn == NULL)
42007 continue;
42009 /* We only care about preceding insns that can throw. */
42010 insn = prev_active_insn (insn);
42011 if (insn == NULL || !can_throw_internal (insn))
42012 continue;
42014 /* Do not separate calls from their debug information. */
42015 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42016 if (NOTE_P (next)
42017 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
42018 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
42019 insn = next;
42020 else
42021 break;
42023 emit_insn_after (gen_nops (const1_rtx), insn);
42027 /* Given a register number BASE, the lowest of a group of registers, update
42028 regsets IN and OUT with the registers that should be avoided in input
42029 and output operands respectively when trying to avoid generating a modr/m
42030 byte for -mmitigate-rop. */
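/* Illustrative background (not part of the original comment): with mod == 11
   a modr/m byte is 0xC0 + reg * 8 + rm, so a reg field of 0 or 1 combined
   with an r/m field of 2 or 3 yields 0xC2, 0xC3, 0xCA or 0xCB -- bytes that
   also decode as RET/RETF opcodes and are therefore useful to ROP attacks.
   Hence BASE and BASE + 1 are marked risky as outputs (the reg field) and
   BASE + 2 and BASE + 3 as inputs (the r/m field).  */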
42032 static void
42033 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42035 SET_HARD_REG_BIT (out, base);
42036 SET_HARD_REG_BIT (out, base + 1);
42037 SET_HARD_REG_BIT (in, base + 2);
42038 SET_HARD_REG_BIT (in, base + 3);
42041 /* Called if -mmitigate-rop is in effect. Try to rewrite instructions so
42042 that certain encodings of modr/m bytes do not occur. */
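/* Sketch of the approach, as implemented below: first run the regrename
   analysis and try to rename entire def-use chains away from the risky
   registers collected above; then walk the remaining insns and, where a
   risky modr/m byte would still be emitted, copy the offending source
   operand into a safe register immediately before the insn.  */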
42043 static void
42044 ix86_mitigate_rop (void)
42046 HARD_REG_SET input_risky;
42047 HARD_REG_SET output_risky;
42048 HARD_REG_SET inout_risky;
42050 CLEAR_HARD_REG_SET (output_risky);
42051 CLEAR_HARD_REG_SET (input_risky);
42052 SET_HARD_REG_BIT (output_risky, AX_REG);
42053 SET_HARD_REG_BIT (output_risky, CX_REG);
42054 SET_HARD_REG_BIT (input_risky, BX_REG);
42055 SET_HARD_REG_BIT (input_risky, DX_REG);
42056 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42057 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42058 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42059 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42060 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42061 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
42062 COPY_HARD_REG_SET (inout_risky, input_risky);
42063 IOR_HARD_REG_SET (inout_risky, output_risky);
42065 df_note_add_problem ();
42066 /* Fix up what stack-regs did. */
42067 df_insn_rescan_all ();
42068 df_analyze ();
42070 regrename_init (true);
42071 regrename_analyze (NULL);
42073 auto_vec<du_head_p> cands;
42075 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42077 if (!NONDEBUG_INSN_P (insn))
42078 continue;
42080 if (GET_CODE (PATTERN (insn)) == USE
42081 || GET_CODE (PATTERN (insn)) == CLOBBER)
42082 continue;
42084 extract_insn (insn);
42086 int opno0, opno1;
42087 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42088 recog_data.n_operands, &opno0,
42089 &opno1);
42091 if (!ix86_rop_should_change_byte_p (modrm))
42092 continue;
42094 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42096 /* This happens when regrename has to fail a block. */
42097 if (!info->op_info)
42098 continue;
42100 if (info->op_info[opno0].n_chains != 0)
42102 gcc_assert (info->op_info[opno0].n_chains == 1);
42103 du_head_p op0c;
42104 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42105 if (op0c->target_data_1 + op0c->target_data_2 == 0
42106 && !op0c->cannot_rename)
42107 cands.safe_push (op0c);
42109 op0c->target_data_1++;
42111 if (info->op_info[opno1].n_chains != 0)
42113 gcc_assert (info->op_info[opno1].n_chains == 1);
42114 du_head_p op1c;
42115 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42116 if (op1c->target_data_1 + op1c->target_data_2 == 0
42117 && !op1c->cannot_rename)
42118 cands.safe_push (op1c);
42120 op1c->target_data_2++;
42124 int i;
42125 du_head_p head;
42126 FOR_EACH_VEC_ELT (cands, i, head)
42128 int old_reg, best_reg;
42129 HARD_REG_SET unavailable;
42131 CLEAR_HARD_REG_SET (unavailable);
42132 if (head->target_data_1)
42133 IOR_HARD_REG_SET (unavailable, output_risky);
42134 if (head->target_data_2)
42135 IOR_HARD_REG_SET (unavailable, input_risky);
42137 int n_uses;
42138 reg_class superclass = regrename_find_superclass (head, &n_uses,
42139 &unavailable);
42140 old_reg = head->regno;
42141 best_reg = find_rename_reg (head, superclass, &unavailable,
42142 old_reg, false);
42143 bool ok = regrename_do_replace (head, best_reg);
42144 gcc_assert (ok);
42145 if (dump_file)
42146 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42147 reg_names[best_reg], reg_class_names[superclass]);
42151 regrename_finish ();
42153 df_analyze ();
42155 basic_block bb;
42156 regset_head live;
42158 INIT_REG_SET (&live);
42160 FOR_EACH_BB_FN (bb, cfun)
42162 rtx_insn *insn;
42164 COPY_REG_SET (&live, DF_LR_OUT (bb));
42165 df_simulate_initialize_backwards (bb, &live);
42167 FOR_BB_INSNS_REVERSE (bb, insn)
42169 if (!NONDEBUG_INSN_P (insn))
42170 continue;
42172 df_simulate_one_insn_backwards (bb, insn, &live);
42174 if (GET_CODE (PATTERN (insn)) == USE
42175 || GET_CODE (PATTERN (insn)) == CLOBBER)
42176 continue;
42178 extract_insn (insn);
42179 constrain_operands_cached (insn, reload_completed);
42180 int opno0, opno1;
42181 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42182 recog_data.n_operands, &opno0,
42183 &opno1);
42184 if (modrm < 0
42185 || !ix86_rop_should_change_byte_p (modrm)
42186 || opno0 == opno1)
42187 continue;
42189 rtx oldreg = recog_data.operand[opno1];
42190 preprocess_constraints (insn);
42191 const operand_alternative *alt = which_op_alt ();
42193 int i;
42194 for (i = 0; i < recog_data.n_operands; i++)
42195 if (i != opno1
42196 && alt[i].earlyclobber
42197 && reg_overlap_mentioned_p (recog_data.operand[i],
42198 oldreg))
42199 break;
42201 if (i < recog_data.n_operands)
42202 continue;
42204 if (dump_file)
42205 fprintf (dump_file,
42206 "attempting to fix modrm byte in insn %d:"
42207 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42208 reg_class_names[alt[opno1].cl]);
42210 HARD_REG_SET unavailable;
42211 REG_SET_TO_HARD_REG_SET (unavailable, &live);
42212 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42213 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42214 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42215 IOR_HARD_REG_SET (unavailable, output_risky);
42216 IOR_COMPL_HARD_REG_SET (unavailable,
42217 reg_class_contents[alt[opno1].cl]);
42219 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42220 if (!TEST_HARD_REG_BIT (unavailable, i))
42221 break;
42222 if (i == FIRST_PSEUDO_REGISTER)
42224 if (dump_file)
42225 fprintf (dump_file, ", none available\n");
42226 continue;
42228 if (dump_file)
42229 fprintf (dump_file, " -> %d\n", i);
42230 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42231 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42232 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42237 /* Implement machine-specific optimizations.  We implement padding of returns
42238 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
42239 static void
42240 ix86_reorg (void)
42242 /* We are freeing block_for_insn in the toplev to keep compatibility
42243 with old MDEP_REORGS that are not CFG based. Recompute it now. */
42244 compute_bb_for_insn ();
42246 if (flag_mitigate_rop)
42247 ix86_mitigate_rop ();
42249 if (TARGET_SEH && current_function_has_exception_handlers ())
42250 ix86_seh_fixup_eh_fallthru ();
42252 if (optimize && optimize_function_for_speed_p (cfun))
42254 if (TARGET_PAD_SHORT_FUNCTION)
42255 ix86_pad_short_function ();
42256 else if (TARGET_PAD_RETURNS)
42257 ix86_pad_returns ();
42258 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42259 if (TARGET_FOUR_JUMP_LIMIT)
42260 ix86_avoid_jump_mispredicts ();
42261 #endif
42265 /* Return nonzero when a QImode register that must be represented via a REX
42266 prefix is used. */
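/* For example (x86-64 background, not from the original comment): %al..%dl
   have QImode encodings without a REX prefix, whereas %sil, %dil, %bpl, %spl
   and %r8b..%r15b can only be accessed with one; the check below flags
   general registers outside the QI_REGNO_P set.  */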
42267 bool
42268 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42270 int i;
42271 extract_insn_cached (insn);
42272 for (i = 0; i < recog_data.n_operands; i++)
42273 if (GENERAL_REG_P (recog_data.operand[i])
42274 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
42275 return true;
42276 return false;
42279 /* Return true when INSN mentions a register that must be encoded using a REX
42280 prefix. */
42281 bool
42282 x86_extended_reg_mentioned_p (rtx insn)
42284 subrtx_iterator::array_type array;
42285 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42287 const_rtx x = *iter;
42288 if (REG_P (x)
42289 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42290 return true;
42292 return false;
42295 /* If profitable, negate (without causing overflow) integer constant
42296 of mode MODE at location LOC. Return true in this case. */
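/* Illustrative examples: negating the constant lets the backend emit
   `subl $4, %eax' instead of `addl $-4, %eax', and `subl $-128, %eax'
   instead of `addl $128, %eax'.  The 128 / -128 special cases exist because
   a sign-extended 8-bit immediate covers -128 but not +128, so keeping the
   value that fits in imm8 gives the shorter encoding.  */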
42297 bool
42298 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42300 HOST_WIDE_INT val;
42302 if (!CONST_INT_P (*loc))
42303 return false;
42305 switch (mode)
42307 case E_DImode:
42308 /* DImode x86_64 constants must fit in 32 bits. */
42309 gcc_assert (x86_64_immediate_operand (*loc, mode));
42311 mode = SImode;
42312 break;
42314 case E_SImode:
42315 case E_HImode:
42316 case E_QImode:
42317 break;
42319 default:
42320 gcc_unreachable ();
42323 /* Avoid overflows. */
42324 if (mode_signbit_p (mode, *loc))
42325 return false;
42327 val = INTVAL (*loc);
42329 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
42330 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
42331 if ((val < 0 && val != -128)
42332 || val == 128)
42334 *loc = GEN_INT (-val);
42335 return true;
42338 return false;
42341 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
42342 optabs would emit if we didn't have TFmode patterns. */
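/* A scalar sketch of the sequence emitted below (illustrative only; it
   assumes a DImode input converted to double).  When the value looks
   negative as a signed number, it is halved with the low bit folded back in
   (round to odd), converted as signed, and then doubled:

     if ((long long) u >= 0)
       result = (double) (long long) u;
     else
       {
         unsigned long long half = (u >> 1) | (u & 1);
         result = (double) (long long) half;
         result = result + result;
       }
*/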
42344 void
42345 x86_emit_floatuns (rtx operands[2])
42347 rtx_code_label *neglab, *donelab;
42348 rtx i0, i1, f0, in, out;
42349 machine_mode mode, inmode;
42351 inmode = GET_MODE (operands[1]);
42352 gcc_assert (inmode == SImode || inmode == DImode);
42354 out = operands[0];
42355 in = force_reg (inmode, operands[1]);
42356 mode = GET_MODE (out);
42357 neglab = gen_label_rtx ();
42358 donelab = gen_label_rtx ();
42359 f0 = gen_reg_rtx (mode);
42361 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42363 expand_float (out, in, 0);
42365 emit_jump_insn (gen_jump (donelab));
42366 emit_barrier ();
42368 emit_label (neglab);
42370 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42371 1, OPTAB_DIRECT);
42372 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42373 1, OPTAB_DIRECT);
42374 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42376 expand_float (f0, i0, 0);
42378 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42380 emit_label (donelab);
42383 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42384 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42385 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42386 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42388 /* Get a vector mode of the same size as the original but with elements
42389 twice as wide. This is only guaranteed to apply to integral vectors. */
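/* For example, V16QImode maps to V8HImode and V8HImode to V4SImode: the
   16-byte width is unchanged while the number of units is halved.  */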
42391 static inline machine_mode
42392 get_mode_wider_vector (machine_mode o)
42394 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
42395 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
42396 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42397 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42398 return n;
42401 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
42402 fill target with val via vec_duplicate. */
42404 static bool
42405 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42407 bool ok;
42408 rtx_insn *insn;
42409 rtx dup;
42411 /* First attempt to recognize VAL as-is. */
42412 dup = gen_vec_duplicate (mode, val);
42413 insn = emit_insn (gen_rtx_SET (target, dup));
42414 if (recog_memoized (insn) < 0)
42416 rtx_insn *seq;
42417 machine_mode innermode = GET_MODE_INNER (mode);
42418 rtx reg;
42420 /* If that fails, force VAL into a register. */
42422 start_sequence ();
42423 reg = force_reg (innermode, val);
42424 if (GET_MODE (reg) != innermode)
42425 reg = gen_lowpart (innermode, reg);
42426 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
42427 seq = get_insns ();
42428 end_sequence ();
42429 if (seq)
42430 emit_insn_before (seq, insn);
42432 ok = recog_memoized (insn) >= 0;
42433 gcc_assert (ok);
42435 return true;
42438 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42439 with all elements equal to VAL.  Return true if successful. */
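/* As an illustration of the "widen" strategy used below: a QImode value
   destined for an MMX V8QImode vector is first replicated into a HImode
   value (val | (val << 8)) and the function recurses with V4HImode, halving
   the problem at each step until a directly supported broadcast is reached.  */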
42441 static bool
42442 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42443 rtx target, rtx val)
42445 bool ok;
42447 switch (mode)
42449 case E_V2SImode:
42450 case E_V2SFmode:
42451 if (!mmx_ok)
42452 return false;
42453 /* FALLTHRU */
42455 case E_V4DFmode:
42456 case E_V4DImode:
42457 case E_V8SFmode:
42458 case E_V8SImode:
42459 case E_V2DFmode:
42460 case E_V2DImode:
42461 case E_V4SFmode:
42462 case E_V4SImode:
42463 case E_V16SImode:
42464 case E_V8DImode:
42465 case E_V16SFmode:
42466 case E_V8DFmode:
42467 return ix86_vector_duplicate_value (mode, target, val);
42469 case E_V4HImode:
42470 if (!mmx_ok)
42471 return false;
42472 if (TARGET_SSE || TARGET_3DNOW_A)
42474 rtx x;
42476 val = gen_lowpart (SImode, val);
42477 x = gen_rtx_TRUNCATE (HImode, val);
42478 x = gen_rtx_VEC_DUPLICATE (mode, x);
42479 emit_insn (gen_rtx_SET (target, x));
42480 return true;
42482 goto widen;
42484 case E_V8QImode:
42485 if (!mmx_ok)
42486 return false;
42487 goto widen;
42489 case E_V8HImode:
42490 if (TARGET_AVX2)
42491 return ix86_vector_duplicate_value (mode, target, val);
42493 if (TARGET_SSE2)
42495 struct expand_vec_perm_d dperm;
42496 rtx tmp1, tmp2;
42498 permute:
42499 memset (&dperm, 0, sizeof (dperm));
42500 dperm.target = target;
42501 dperm.vmode = mode;
42502 dperm.nelt = GET_MODE_NUNITS (mode);
42503 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42504 dperm.one_operand_p = true;
42506 /* Extend to SImode using a paradoxical SUBREG. */
42507 tmp1 = gen_reg_rtx (SImode);
42508 emit_move_insn (tmp1, gen_lowpart (SImode, val));
42510 /* Insert the SImode value as low element of a V4SImode vector. */
42511 tmp2 = gen_reg_rtx (V4SImode);
42512 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42513 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42515 ok = (expand_vec_perm_1 (&dperm)
42516 || expand_vec_perm_broadcast_1 (&dperm));
42517 gcc_assert (ok);
42518 return ok;
42520 goto widen;
42522 case E_V16QImode:
42523 if (TARGET_AVX2)
42524 return ix86_vector_duplicate_value (mode, target, val);
42526 if (TARGET_SSE2)
42527 goto permute;
42528 goto widen;
42530 widen:
42531 /* Replicate the value once into the next wider mode and recurse. */
42533 machine_mode smode, wsmode, wvmode;
42534 rtx x;
42536 smode = GET_MODE_INNER (mode);
42537 wvmode = get_mode_wider_vector (mode);
42538 wsmode = GET_MODE_INNER (wvmode);
42540 val = convert_modes (wsmode, smode, val, true);
42541 x = expand_simple_binop (wsmode, ASHIFT, val,
42542 GEN_INT (GET_MODE_BITSIZE (smode)),
42543 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42544 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42546 x = gen_reg_rtx (wvmode);
42547 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42548 gcc_assert (ok);
42549 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42550 return ok;
42553 case E_V16HImode:
42554 case E_V32QImode:
42555 if (TARGET_AVX2)
42556 return ix86_vector_duplicate_value (mode, target, val);
42557 else
42559 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42560 rtx x = gen_reg_rtx (hvmode);
42562 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42563 gcc_assert (ok);
42565 x = gen_rtx_VEC_CONCAT (mode, x, x);
42566 emit_insn (gen_rtx_SET (target, x));
42568 return true;
42570 case E_V64QImode:
42571 case E_V32HImode:
42572 if (TARGET_AVX512BW)
42573 return ix86_vector_duplicate_value (mode, target, val);
42574 else
42576 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42577 rtx x = gen_reg_rtx (hvmode);
42579 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42580 gcc_assert (ok);
42582 x = gen_rtx_VEC_CONCAT (mode, x, x);
42583 emit_insn (gen_rtx_SET (target, x));
42585 return true;
42587 default:
42588 return false;
42592 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42593 whose ONE_VAR element is VAR, and other elements are zero. Return true
42594 if successful. */
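/* For example (illustrative), building { 0, 0, X, 0 } in V4SImode with SSE2
   but without SSE4.1: the code below first forms { X, 0, 0, 0 } with a
   vec_duplicate/vec_merge pair and then uses pshufd to move X into
   element 2.  */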
42596 static bool
42597 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42598 rtx target, rtx var, int one_var)
42600 machine_mode vsimode;
42601 rtx new_target;
42602 rtx x, tmp;
42603 bool use_vector_set = false;
42604 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
42606 switch (mode)
42608 case E_V2DImode:
42609 /* For SSE4.1, we normally use vector set. But if the second
42610 element is zero and inter-unit moves are OK, we use movq
42611 instead. */
42612 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42613 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42614 && one_var == 0));
42615 break;
42616 case E_V16QImode:
42617 case E_V4SImode:
42618 case E_V4SFmode:
42619 use_vector_set = TARGET_SSE4_1;
42620 break;
42621 case E_V8HImode:
42622 use_vector_set = TARGET_SSE2;
42623 break;
42624 case E_V4HImode:
42625 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42626 break;
42627 case E_V32QImode:
42628 case E_V16HImode:
42629 use_vector_set = TARGET_AVX;
42630 break;
42631 case E_V8SImode:
42632 use_vector_set = TARGET_AVX;
42633 gen_vec_set_0 = gen_vec_setv8si_0;
42634 break;
42635 case E_V8SFmode:
42636 use_vector_set = TARGET_AVX;
42637 gen_vec_set_0 = gen_vec_setv8sf_0;
42638 break;
42639 case E_V4DFmode:
42640 use_vector_set = TARGET_AVX;
42641 gen_vec_set_0 = gen_vec_setv4df_0;
42642 break;
42643 case E_V4DImode:
42644 /* Use ix86_expand_vector_set in 64bit mode only. */
42645 use_vector_set = TARGET_AVX && TARGET_64BIT;
42646 gen_vec_set_0 = gen_vec_setv4di_0;
42647 break;
42648 case E_V16SImode:
42649 use_vector_set = TARGET_AVX512F && one_var == 0;
42650 gen_vec_set_0 = gen_vec_setv16si_0;
42651 break;
42652 case E_V16SFmode:
42653 use_vector_set = TARGET_AVX512F && one_var == 0;
42654 gen_vec_set_0 = gen_vec_setv16sf_0;
42655 break;
42656 case E_V8DFmode:
42657 use_vector_set = TARGET_AVX512F && one_var == 0;
42658 gen_vec_set_0 = gen_vec_setv8df_0;
42659 break;
42660 case E_V8DImode:
42661 /* Use ix86_expand_vector_set in 64bit mode only. */
42662 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
42663 gen_vec_set_0 = gen_vec_setv8di_0;
42664 break;
42665 default:
42666 break;
42669 if (use_vector_set)
42671 if (gen_vec_set_0 && one_var == 0)
42673 var = force_reg (GET_MODE_INNER (mode), var);
42674 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
42675 return true;
42677 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
42678 var = force_reg (GET_MODE_INNER (mode), var);
42679 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42680 return true;
42683 switch (mode)
42685 case E_V2SFmode:
42686 case E_V2SImode:
42687 if (!mmx_ok)
42688 return false;
42689 /* FALLTHRU */
42691 case E_V2DFmode:
42692 case E_V2DImode:
42693 if (one_var != 0)
42694 return false;
42695 var = force_reg (GET_MODE_INNER (mode), var);
42696 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
42697 emit_insn (gen_rtx_SET (target, x));
42698 return true;
42700 case E_V4SFmode:
42701 case E_V4SImode:
42702 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
42703 new_target = gen_reg_rtx (mode);
42704 else
42705 new_target = target;
42706 var = force_reg (GET_MODE_INNER (mode), var);
42707 x = gen_rtx_VEC_DUPLICATE (mode, var);
42708 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
42709 emit_insn (gen_rtx_SET (new_target, x));
42710 if (one_var != 0)
42712 /* We need to shuffle the value to the correct position, so
42713 create a new pseudo to store the intermediate result. */
42715 /* With SSE2, we can use the integer shuffle insns. */
42716 if (mode != V4SFmode && TARGET_SSE2)
42718 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
42719 const1_rtx,
42720 GEN_INT (one_var == 1 ? 0 : 1),
42721 GEN_INT (one_var == 2 ? 0 : 1),
42722 GEN_INT (one_var == 3 ? 0 : 1)));
42723 if (target != new_target)
42724 emit_move_insn (target, new_target);
42725 return true;
42728 /* Otherwise convert the intermediate result to V4SFmode and
42729 use the SSE1 shuffle instructions. */
42730 if (mode != V4SFmode)
42732 tmp = gen_reg_rtx (V4SFmode);
42733 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
42735 else
42736 tmp = new_target;
42738 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
42739 const1_rtx,
42740 GEN_INT (one_var == 1 ? 0 : 1),
42741 GEN_INT (one_var == 2 ? 0+4 : 1+4),
42742 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
42744 if (mode != V4SFmode)
42745 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
42746 else if (tmp != target)
42747 emit_move_insn (target, tmp);
42749 else if (target != new_target)
42750 emit_move_insn (target, new_target);
42751 return true;
42753 case E_V8HImode:
42754 case E_V16QImode:
42755 vsimode = V4SImode;
42756 goto widen;
42757 case E_V4HImode:
42758 case E_V8QImode:
42759 if (!mmx_ok)
42760 return false;
42761 vsimode = V2SImode;
42762 goto widen;
42763 widen:
42764 if (one_var != 0)
42765 return false;
42767 /* Zero extend the variable element to SImode and recurse. */
42768 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
42770 x = gen_reg_rtx (vsimode);
42771 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
42772 var, one_var))
42773 gcc_unreachable ();
42775 emit_move_insn (target, gen_lowpart (mode, x));
42776 return true;
42778 default:
42779 return false;
42783 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42784 consisting of the values in VALS. It is known that all elements
42785 except ONE_VAR are constants. Return true if successful. */
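/* For example (illustrative), initializing { 1, X, 2, 3, ... } in V16QImode
   without SSE4.1 takes the QImode widen path below: X and its even neighbour
   are combined into the single HImode value (1 | (X << 8)), the rest of the
   vector is loaded as a V8HImode constant with that slot zeroed, and one
   HImode vec_set inserts the combined element.  */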
42787 static bool
42788 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
42789 rtx target, rtx vals, int one_var)
42791 rtx var = XVECEXP (vals, 0, one_var);
42792 machine_mode wmode;
42793 rtx const_vec, x;
42795 const_vec = copy_rtx (vals);
42796 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42797 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42799 switch (mode)
42801 case E_V2DFmode:
42802 case E_V2DImode:
42803 case E_V2SFmode:
42804 case E_V2SImode:
42805 /* For the two element vectors, it's just as easy to use
42806 the general case. */
42807 return false;
42809 case E_V4DImode:
42810 /* Use ix86_expand_vector_set in 64bit mode only. */
42811 if (!TARGET_64BIT)
42812 return false;
42813 /* FALLTHRU */
42814 case E_V4DFmode:
42815 case E_V8SFmode:
42816 case E_V8SImode:
42817 case E_V16HImode:
42818 case E_V32QImode:
42819 case E_V4SFmode:
42820 case E_V4SImode:
42821 case E_V8HImode:
42822 case E_V4HImode:
42823 break;
42825 case E_V16QImode:
42826 if (TARGET_SSE4_1)
42827 break;
42828 wmode = V8HImode;
42829 goto widen;
42830 case E_V8QImode:
42831 wmode = V4HImode;
42832 goto widen;
42833 widen:
42834 /* There's no way to set one QImode entry easily. Combine
42835 the variable value with its adjacent constant value, and
42836 promote to an HImode set. */
42837 x = XVECEXP (vals, 0, one_var ^ 1);
42838 if (one_var & 1)
42840 var = convert_modes (HImode, QImode, var, true);
42841 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
42842 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42843 x = GEN_INT (INTVAL (x) & 0xff);
42845 else
42847 var = convert_modes (HImode, QImode, var, true);
42848 x = gen_int_mode (INTVAL (x) << 8, HImode);
42850 if (x != const0_rtx)
42851 var = expand_simple_binop (HImode, IOR, var, x, var,
42852 1, OPTAB_LIB_WIDEN);
42854 x = gen_reg_rtx (wmode);
42855 emit_move_insn (x, gen_lowpart (wmode, const_vec));
42856 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
42858 emit_move_insn (target, gen_lowpart (mode, x));
42859 return true;
42861 default:
42862 return false;
42865 emit_move_insn (target, const_vec);
42866 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42867 return true;
42870 /* A subroutine of ix86_expand_vector_init_general. Use vector
42871 concatenate to handle the most general case: all values variable,
42872 and none identical. */
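/* For example (illustrative), an 8-element V8SFmode initializer is built
   bottom-up: adjacent scalar pairs are concatenated into four V2SFmode
   registers, those are concatenated pairwise into two V4SFmode registers,
   and a final VEC_CONCAT produces the V8SFmode target.  Inputs are processed
   backwards, per the FIXME below, to help register allocation.  */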
42874 static void
42875 ix86_expand_vector_init_concat (machine_mode mode,
42876 rtx target, rtx *ops, int n)
42878 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
42879 rtx first[16], second[8], third[4];
42880 rtvec v;
42881 int i, j;
42883 switch (n)
42885 case 2:
42886 switch (mode)
42888 case E_V16SImode:
42889 cmode = V8SImode;
42890 break;
42891 case E_V16SFmode:
42892 cmode = V8SFmode;
42893 break;
42894 case E_V8DImode:
42895 cmode = V4DImode;
42896 break;
42897 case E_V8DFmode:
42898 cmode = V4DFmode;
42899 break;
42900 case E_V8SImode:
42901 cmode = V4SImode;
42902 break;
42903 case E_V8SFmode:
42904 cmode = V4SFmode;
42905 break;
42906 case E_V4DImode:
42907 cmode = V2DImode;
42908 break;
42909 case E_V4DFmode:
42910 cmode = V2DFmode;
42911 break;
42912 case E_V4SImode:
42913 cmode = V2SImode;
42914 break;
42915 case E_V4SFmode:
42916 cmode = V2SFmode;
42917 break;
42918 case E_V2DImode:
42919 cmode = DImode;
42920 break;
42921 case E_V2SImode:
42922 cmode = SImode;
42923 break;
42924 case E_V2DFmode:
42925 cmode = DFmode;
42926 break;
42927 case E_V2SFmode:
42928 cmode = SFmode;
42929 break;
42930 default:
42931 gcc_unreachable ();
42934 if (!register_operand (ops[1], cmode))
42935 ops[1] = force_reg (cmode, ops[1]);
42936 if (!register_operand (ops[0], cmode))
42937 ops[0] = force_reg (cmode, ops[0]);
42938 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42939 ops[1])));
42940 break;
42942 case 4:
42943 switch (mode)
42945 case E_V4DImode:
42946 cmode = V2DImode;
42947 break;
42948 case E_V4DFmode:
42949 cmode = V2DFmode;
42950 break;
42951 case E_V4SImode:
42952 cmode = V2SImode;
42953 break;
42954 case E_V4SFmode:
42955 cmode = V2SFmode;
42956 break;
42957 default:
42958 gcc_unreachable ();
42960 goto half;
42962 case 8:
42963 switch (mode)
42965 case E_V8DImode:
42966 cmode = V2DImode;
42967 hmode = V4DImode;
42968 break;
42969 case E_V8DFmode:
42970 cmode = V2DFmode;
42971 hmode = V4DFmode;
42972 break;
42973 case E_V8SImode:
42974 cmode = V2SImode;
42975 hmode = V4SImode;
42976 break;
42977 case E_V8SFmode:
42978 cmode = V2SFmode;
42979 hmode = V4SFmode;
42980 break;
42981 default:
42982 gcc_unreachable ();
42984 goto half;
42986 case 16:
42987 switch (mode)
42989 case E_V16SImode:
42990 cmode = V2SImode;
42991 hmode = V4SImode;
42992 gmode = V8SImode;
42993 break;
42994 case E_V16SFmode:
42995 cmode = V2SFmode;
42996 hmode = V4SFmode;
42997 gmode = V8SFmode;
42998 break;
42999 default:
43000 gcc_unreachable ();
43002 goto half;
43004 half:
43005 /* FIXME: We process inputs backward to help RA. PR 36222. */
43006 i = n - 1;
43007 j = (n >> 1) - 1;
43008 for (; i > 0; i -= 2, j--)
43010 first[j] = gen_reg_rtx (cmode);
43011 v = gen_rtvec (2, ops[i - 1], ops[i]);
43012 ix86_expand_vector_init (false, first[j],
43013 gen_rtx_PARALLEL (cmode, v));
43016 n >>= 1;
43017 if (n > 4)
43019 gcc_assert (hmode != VOIDmode);
43020 gcc_assert (gmode != VOIDmode);
43021 for (i = j = 0; i < n; i += 2, j++)
43023 second[j] = gen_reg_rtx (hmode);
43024 ix86_expand_vector_init_concat (hmode, second [j],
43025 &first [i], 2);
43027 n >>= 1;
43028 for (i = j = 0; i < n; i += 2, j++)
43030 third[j] = gen_reg_rtx (gmode);
43031 ix86_expand_vector_init_concat (gmode, third[j],
43032 &second[i], 2);
43034 n >>= 1;
43035 ix86_expand_vector_init_concat (mode, target, third, n);
43037 else if (n > 2)
43039 gcc_assert (hmode != VOIDmode);
43040 for (i = j = 0; i < n; i += 2, j++)
43042 second[j] = gen_reg_rtx (hmode);
43043 ix86_expand_vector_init_concat (hmode, second [j],
43044 &first [i], 2);
43046 n >>= 1;
43047 ix86_expand_vector_init_concat (mode, target, second, n);
43049 else
43050 ix86_expand_vector_init_concat (mode, target, first, n);
43051 break;
43053 default:
43054 gcc_unreachable ();
43058 /* A subroutine of ix86_expand_vector_init_general. Use vector
43059 interleave to handle the most general case: all values variable,
43060 and none identical. */
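/* For example (illustrative), a V8HImode build goes through three stages:
   each pair of scalar values is first placed in elements 0 and 1 of its own
   vector, the resulting four vectors are merged with low interleaves at
   V4SImode width, and the two survivors are merged with a final low
   interleave at V2DImode width.  */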
43062 static void
43063 ix86_expand_vector_init_interleave (machine_mode mode,
43064 rtx target, rtx *ops, int n)
43066 machine_mode first_imode, second_imode, third_imode, inner_mode;
43067 int i, j;
43068 rtx op0, op1;
43069 rtx (*gen_load_even) (rtx, rtx, rtx);
43070 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43071 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43073 switch (mode)
43075 case E_V8HImode:
43076 gen_load_even = gen_vec_setv8hi;
43077 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43078 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43079 inner_mode = HImode;
43080 first_imode = V4SImode;
43081 second_imode = V2DImode;
43082 third_imode = VOIDmode;
43083 break;
43084 case E_V16QImode:
43085 gen_load_even = gen_vec_setv16qi;
43086 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43087 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43088 inner_mode = QImode;
43089 first_imode = V8HImode;
43090 second_imode = V4SImode;
43091 third_imode = V2DImode;
43092 break;
43093 default:
43094 gcc_unreachable ();
43097 for (i = 0; i < n; i++)
43099 /* Extend the odd element to SImode using a paradoxical SUBREG. */
43100 op0 = gen_reg_rtx (SImode);
43101 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43103 /* Insert the SImode value as low element of V4SImode vector. */
43104 op1 = gen_reg_rtx (V4SImode);
43105 op0 = gen_rtx_VEC_MERGE (V4SImode,
43106 gen_rtx_VEC_DUPLICATE (V4SImode,
43107 op0),
43108 CONST0_RTX (V4SImode),
43109 const1_rtx);
43110 emit_insn (gen_rtx_SET (op1, op0));
43112 /* Cast the V4SImode vector back to a vector in the original mode. */
43113 op0 = gen_reg_rtx (mode);
43114 emit_move_insn (op0, gen_lowpart (mode, op1));
43116 /* Load even elements into the second position. */
43117 emit_insn (gen_load_even (op0,
43118 force_reg (inner_mode,
43119 ops [i + i + 1]),
43120 const1_rtx));
43122 /* Cast vector to FIRST_IMODE vector. */
43123 ops[i] = gen_reg_rtx (first_imode);
43124 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43127 /* Interleave low FIRST_IMODE vectors. */
43128 for (i = j = 0; i < n; i += 2, j++)
43130 op0 = gen_reg_rtx (first_imode);
43131 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43133 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
43134 ops[j] = gen_reg_rtx (second_imode);
43135 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43138 /* Interleave low SECOND_IMODE vectors. */
43139 switch (second_imode)
43141 case E_V4SImode:
43142 for (i = j = 0; i < n / 2; i += 2, j++)
43144 op0 = gen_reg_rtx (second_imode);
43145 emit_insn (gen_interleave_second_low (op0, ops[i],
43146 ops[i + 1]));
43148 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43149 vector. */
43150 ops[j] = gen_reg_rtx (third_imode);
43151 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43153 second_imode = V2DImode;
43154 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43155 /* FALLTHRU */
43157 case E_V2DImode:
43158 op0 = gen_reg_rtx (second_imode);
43159 emit_insn (gen_interleave_second_low (op0, ops[0],
43160 ops[1]));
43162 /* Cast the SECOND_IMODE vector back to a vector in the original
43163 mode. */
43164 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
43165 break;
43167 default:
43168 gcc_unreachable ();
43172 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
43173 all values variable, and none identical. */
43175 static void
43176 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43177 rtx target, rtx vals)
43179 rtx ops[64], op0, op1, op2, op3, op4, op5;
43180 machine_mode half_mode = VOIDmode;
43181 machine_mode quarter_mode = VOIDmode;
43182 int n, i;
43184 switch (mode)
43186 case E_V2SFmode:
43187 case E_V2SImode:
43188 if (!mmx_ok && !TARGET_SSE)
43189 break;
43190 /* FALLTHRU */
43192 case E_V16SImode:
43193 case E_V16SFmode:
43194 case E_V8DFmode:
43195 case E_V8DImode:
43196 case E_V8SFmode:
43197 case E_V8SImode:
43198 case E_V4DFmode:
43199 case E_V4DImode:
43200 case E_V4SFmode:
43201 case E_V4SImode:
43202 case E_V2DFmode:
43203 case E_V2DImode:
43204 n = GET_MODE_NUNITS (mode);
43205 for (i = 0; i < n; i++)
43206 ops[i] = XVECEXP (vals, 0, i);
43207 ix86_expand_vector_init_concat (mode, target, ops, n);
43208 return;
43210 case E_V2TImode:
43211 for (i = 0; i < 2; i++)
43212 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43213 op0 = gen_reg_rtx (V4DImode);
43214 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
43215 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43216 return;
43218 case E_V4TImode:
43219 for (i = 0; i < 4; i++)
43220 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43221 ops[4] = gen_reg_rtx (V4DImode);
43222 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
43223 ops[5] = gen_reg_rtx (V4DImode);
43224 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
43225 op0 = gen_reg_rtx (V8DImode);
43226 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
43227 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43228 return;
43230 case E_V32QImode:
43231 half_mode = V16QImode;
43232 goto half;
43234 case E_V16HImode:
43235 half_mode = V8HImode;
43236 goto half;
43238 half:
43239 n = GET_MODE_NUNITS (mode);
43240 for (i = 0; i < n; i++)
43241 ops[i] = XVECEXP (vals, 0, i);
43242 op0 = gen_reg_rtx (half_mode);
43243 op1 = gen_reg_rtx (half_mode);
43244 ix86_expand_vector_init_interleave (half_mode, op0, ops,
43245 n >> 2);
43246 ix86_expand_vector_init_interleave (half_mode, op1,
43247 &ops [n >> 1], n >> 2);
43248 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43249 return;
43251 case E_V64QImode:
43252 quarter_mode = V16QImode;
43253 half_mode = V32QImode;
43254 goto quarter;
43256 case E_V32HImode:
43257 quarter_mode = V8HImode;
43258 half_mode = V16HImode;
43259 goto quarter;
43261 quarter:
43262 n = GET_MODE_NUNITS (mode);
43263 for (i = 0; i < n; i++)
43264 ops[i] = XVECEXP (vals, 0, i);
43265 op0 = gen_reg_rtx (quarter_mode);
43266 op1 = gen_reg_rtx (quarter_mode);
43267 op2 = gen_reg_rtx (quarter_mode);
43268 op3 = gen_reg_rtx (quarter_mode);
43269 op4 = gen_reg_rtx (half_mode);
43270 op5 = gen_reg_rtx (half_mode);
43271 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43272 n >> 3);
43273 ix86_expand_vector_init_interleave (quarter_mode, op1,
43274 &ops [n >> 2], n >> 3);
43275 ix86_expand_vector_init_interleave (quarter_mode, op2,
43276 &ops [n >> 1], n >> 3);
43277 ix86_expand_vector_init_interleave (quarter_mode, op3,
43278 &ops [(n >> 1) | (n >> 2)], n >> 3);
43279 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43280 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43281 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43282 return;
43284 case E_V16QImode:
43285 if (!TARGET_SSE4_1)
43286 break;
43287 /* FALLTHRU */
43289 case E_V8HImode:
43290 if (!TARGET_SSE2)
43291 break;
43293 /* Don't use ix86_expand_vector_init_interleave if we can't
43294 move from GPR to SSE register directly. */
43295 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43296 break;
43298 n = GET_MODE_NUNITS (mode);
43299 for (i = 0; i < n; i++)
43300 ops[i] = XVECEXP (vals, 0, i);
43301 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43302 return;
43304 case E_V4HImode:
43305 case E_V8QImode:
43306 break;
43308 default:
43309 gcc_unreachable ();
43313 int i, j, n_elts, n_words, n_elt_per_word;
43314 machine_mode inner_mode;
43315 rtx words[4], shift;
43317 inner_mode = GET_MODE_INNER (mode);
43318 n_elts = GET_MODE_NUNITS (mode);
43319 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43320 n_elt_per_word = n_elts / n_words;
43321 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
43323 for (i = 0; i < n_words; ++i)
43325 rtx word = NULL_RTX;
43327 for (j = 0; j < n_elt_per_word; ++j)
43329 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43330 elt = convert_modes (word_mode, inner_mode, elt, true);
43332 if (j == 0)
43333 word = elt;
43334 else
43336 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43337 word, 1, OPTAB_LIB_WIDEN);
43338 word = expand_simple_binop (word_mode, IOR, word, elt,
43339 word, 1, OPTAB_LIB_WIDEN);
43343 words[i] = word;
43346 if (n_words == 1)
43347 emit_move_insn (target, gen_lowpart (mode, words[0]));
43348 else if (n_words == 2)
43350 rtx tmp = gen_reg_rtx (mode);
43351 emit_clobber (tmp);
43352 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43353 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43354 emit_move_insn (target, tmp);
43356 else if (n_words == 4)
43358 rtx tmp = gen_reg_rtx (V4SImode);
43359 gcc_assert (word_mode == SImode);
43360 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43361 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43362 emit_move_insn (target, gen_lowpart (mode, tmp));
43364 else
43365 gcc_unreachable ();
43369 /* Initialize vector TARGET via VALS. Suppress the use of MMX
43370 instructions unless MMX_OK is true. */
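/* Rough order of the strategies tried below: an all-constant initializer is
   loaded from the constant pool; identical elements are broadcast via
   ix86_expand_vector_init_duplicate; a single variable element among
   constants goes through the *_one_nonzero / *_one_var helpers; everything
   else falls back to ix86_expand_vector_init_general.  */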
43372 void
43373 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43375 machine_mode mode = GET_MODE (target);
43376 machine_mode inner_mode = GET_MODE_INNER (mode);
43377 int n_elts = GET_MODE_NUNITS (mode);
43378 int n_var = 0, one_var = -1;
43379 bool all_same = true, all_const_zero = true;
43380 int i;
43381 rtx x;
43383 /* First handle initialization from vector elements. */
43384 if (n_elts != XVECLEN (vals, 0))
43386 rtx subtarget = target;
43387 x = XVECEXP (vals, 0, 0);
43388 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
43389 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
43391 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
43392 if (inner_mode == QImode || inner_mode == HImode)
43394 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
43395 mode = mode_for_vector (SImode, n_bits / 4).require ();
43396 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
43397 ops[0] = gen_lowpart (inner_mode, ops[0]);
43398 ops[1] = gen_lowpart (inner_mode, ops[1]);
43399 subtarget = gen_reg_rtx (mode);
43401 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
43402 if (subtarget != target)
43403 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
43404 return;
43406 gcc_unreachable ();
43409 for (i = 0; i < n_elts; ++i)
43411 x = XVECEXP (vals, 0, i);
43412 if (!(CONST_SCALAR_INT_P (x)
43413 || CONST_DOUBLE_P (x)
43414 || CONST_FIXED_P (x)))
43415 n_var++, one_var = i;
43416 else if (x != CONST0_RTX (inner_mode))
43417 all_const_zero = false;
43418 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43419 all_same = false;
43422 /* Constants are best loaded from the constant pool. */
43423 if (n_var == 0)
43425 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43426 return;
43429 /* If all values are identical, broadcast the value. */
43430 if (all_same
43431 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43432 XVECEXP (vals, 0, 0)))
43433 return;
43435 /* Values where only one field is non-constant are best loaded from
43436 the pool and overwritten via move later. */
43437 if (n_var == 1)
43439 if (all_const_zero
43440 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43441 XVECEXP (vals, 0, one_var),
43442 one_var))
43443 return;
43445 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43446 return;
43449 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43452 void
43453 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43455 machine_mode mode = GET_MODE (target);
43456 machine_mode inner_mode = GET_MODE_INNER (mode);
43457 machine_mode half_mode;
43458 bool use_vec_merge = false;
43459 rtx tmp;
43460 static rtx (*gen_extract[6][2]) (rtx, rtx)
43462 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43463 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43464 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43465 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43466 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43467 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43469 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43471 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43472 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43473 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43474 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43475 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43476 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43478 int i, j, n;
43479 machine_mode mmode = VOIDmode;
43480 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43482 switch (mode)
43484 case E_V2SFmode:
43485 case E_V2SImode:
43486 if (mmx_ok)
43488 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43489 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43490 if (elt == 0)
43491 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43492 else
43493 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43494 emit_insn (gen_rtx_SET (target, tmp));
43495 return;
43497 break;
43499 case E_V2DImode:
43500 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43501 if (use_vec_merge)
43502 break;
43504 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43505 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43506 if (elt == 0)
43507 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43508 else
43509 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43510 emit_insn (gen_rtx_SET (target, tmp));
43511 return;
43513 case E_V2DFmode:
43515 rtx op0, op1;
43517 /* For the two element vectors, we implement a VEC_CONCAT with
43518 the extraction of the other element. */
43520 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43521 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43523 if (elt == 0)
43524 op0 = val, op1 = tmp;
43525 else
43526 op0 = tmp, op1 = val;
43528 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43529 emit_insn (gen_rtx_SET (target, tmp));
43531 return;
43533 case E_V4SFmode:
43534 use_vec_merge = TARGET_SSE4_1;
43535 if (use_vec_merge)
43536 break;
43538 switch (elt)
43540 case 0:
43541 use_vec_merge = true;
43542 break;
43544 case 1:
43545 /* tmp = target = A B C D */
43546 tmp = copy_to_reg (target);
43547 /* target = A A B B */
43548 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43549 /* target = X A B B */
43550 ix86_expand_vector_set (false, target, val, 0);
43551 /* target = A X C D */
43552 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43553 const1_rtx, const0_rtx,
43554 GEN_INT (2+4), GEN_INT (3+4)));
43555 return;
43557 case 2:
43558 /* tmp = target = A B C D */
43559 tmp = copy_to_reg (target);
43560 /* tmp = X B C D */
43561 ix86_expand_vector_set (false, tmp, val, 0);
43562 /* target = A B X D */
43563 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43564 const0_rtx, const1_rtx,
43565 GEN_INT (0+4), GEN_INT (3+4)));
43566 return;
43568 case 3:
43569 /* tmp = target = A B C D */
43570 tmp = copy_to_reg (target);
43571 /* tmp = X B C D */
43572 ix86_expand_vector_set (false, tmp, val, 0);
43573 /* target = A B X D */
43574 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43575 const0_rtx, const1_rtx,
43576 GEN_INT (2+4), GEN_INT (0+4)));
43577 return;
43579 default:
43580 gcc_unreachable ();
43582 break;
43584 case E_V4SImode:
43585 use_vec_merge = TARGET_SSE4_1;
43586 if (use_vec_merge)
43587 break;
43589 /* Element 0 handled by vec_merge below. */
43590 if (elt == 0)
43592 use_vec_merge = true;
43593 break;
43596 if (TARGET_SSE2)
43598 /* With SSE2, use integer shuffles to swap element 0 and ELT,
43599 store into element 0, then shuffle them back. */
43601 rtx order[4];
43603 order[0] = GEN_INT (elt);
43604 order[1] = const1_rtx;
43605 order[2] = const2_rtx;
43606 order[3] = GEN_INT (3);
43607 order[elt] = const0_rtx;
43609 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43610 order[1], order[2], order[3]));
43612 ix86_expand_vector_set (false, target, val, 0);
43614 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43615 order[1], order[2], order[3]));
43617 else
43619 /* For SSE1, we have to reuse the V4SF code. */
43620 rtx t = gen_reg_rtx (V4SFmode);
43621 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43622 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43623 emit_move_insn (target, gen_lowpart (mode, t));
43625 return;
43627 case E_V8HImode:
43628 use_vec_merge = TARGET_SSE2;
43629 break;
43630 case E_V4HImode:
43631 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43632 break;
43634 case E_V16QImode:
43635 use_vec_merge = TARGET_SSE4_1;
43636 break;
43638 case E_V8QImode:
43639 break;
43641 case E_V32QImode:
43642 half_mode = V16QImode;
43643 j = 0;
43644 n = 16;
43645 goto half;
43647 case E_V16HImode:
43648 half_mode = V8HImode;
43649 j = 1;
43650 n = 8;
43651 goto half;
43653 case E_V8SImode:
43654 half_mode = V4SImode;
43655 j = 2;
43656 n = 4;
43657 goto half;
43659 case E_V4DImode:
43660 half_mode = V2DImode;
43661 j = 3;
43662 n = 2;
43663 goto half;
43665 case E_V8SFmode:
43666 half_mode = V4SFmode;
43667 j = 4;
43668 n = 4;
43669 goto half;
43671 case E_V4DFmode:
43672 half_mode = V2DFmode;
43673 j = 5;
43674 n = 2;
43675 goto half;
43677 half:
43678 /* Compute offset. */
43679 i = elt / n;
43680 elt %= n;
43682 gcc_assert (i <= 1);
43684 /* Extract the half. */
43685 tmp = gen_reg_rtx (half_mode);
43686 emit_insn (gen_extract[j][i] (tmp, target));
43688 /* Put val in tmp at elt. */
43689 ix86_expand_vector_set (false, tmp, val, elt);
43691 /* Put it back. */
43692 emit_insn (gen_insert[j][i] (target, target, tmp));
43693 return;
43695 case E_V8DFmode:
43696 if (TARGET_AVX512F)
43698 mmode = QImode;
43699 gen_blendm = gen_avx512f_blendmv8df;
43701 break;
43703 case E_V8DImode:
43704 if (TARGET_AVX512F)
43706 mmode = QImode;
43707 gen_blendm = gen_avx512f_blendmv8di;
43709 break;
43711 case E_V16SFmode:
43712 if (TARGET_AVX512F)
43714 mmode = HImode;
43715 gen_blendm = gen_avx512f_blendmv16sf;
43717 break;
43719 case E_V16SImode:
43720 if (TARGET_AVX512F)
43722 mmode = HImode;
43723 gen_blendm = gen_avx512f_blendmv16si;
43725 break;
43727 case E_V32HImode:
43728 if (TARGET_AVX512F && TARGET_AVX512BW)
43730 mmode = SImode;
43731 gen_blendm = gen_avx512bw_blendmv32hi;
43733 break;
43735 case E_V64QImode:
43736 if (TARGET_AVX512F && TARGET_AVX512BW)
43738 mmode = DImode;
43739 gen_blendm = gen_avx512bw_blendmv64qi;
43741 break;
43743 default:
43744 break;
43747 if (mmode != VOIDmode)
43749 tmp = gen_reg_rtx (mode);
43750 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
43751 /* The avx512*_blendm<mode> expanders have a different operand order
43752 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
43753 elements where the mask is set and the second input operand otherwise.
43754 In {sse,avx}*_*blend* the first input operand is used for elements
43755 where the mask is clear and the second input operand otherwise. */
43756 emit_insn (gen_blendm (target, target, tmp,
43757 force_reg (mmode,
43758 gen_int_mode (1 << elt, mmode))));
43760 else if (use_vec_merge)
43762 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
43763 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
43764 emit_insn (gen_rtx_SET (target, tmp));
43766 else
43768 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43770 emit_move_insn (mem, target);
43772 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43773 emit_move_insn (tmp, val);
43775 emit_move_insn (target, mem);
43779 void
43780 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
43782 machine_mode mode = GET_MODE (vec);
43783 machine_mode inner_mode = GET_MODE_INNER (mode);
43784 bool use_vec_extr = false;
43785 rtx tmp;
43787 switch (mode)
43789 case E_V2SImode:
43790 case E_V2SFmode:
43791 if (!mmx_ok)
43792 break;
43793 /* FALLTHRU */
43795 case E_V2DFmode:
43796 case E_V2DImode:
43797 case E_V2TImode:
43798 case E_V4TImode:
43799 use_vec_extr = true;
43800 break;
43802 case E_V4SFmode:
43803 use_vec_extr = TARGET_SSE4_1;
43804 if (use_vec_extr)
43805 break;
43807 switch (elt)
43809 case 0:
43810 tmp = vec;
43811 break;
43813 case 1:
43814 case 3:
43815 tmp = gen_reg_rtx (mode);
43816 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
43817 GEN_INT (elt), GEN_INT (elt),
43818 GEN_INT (elt+4), GEN_INT (elt+4)));
43819 break;
43821 case 2:
43822 tmp = gen_reg_rtx (mode);
43823 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
43824 break;
43826 default:
43827 gcc_unreachable ();
43829 vec = tmp;
43830 use_vec_extr = true;
43831 elt = 0;
43832 break;
43834 case E_V4SImode:
43835 use_vec_extr = TARGET_SSE4_1;
43836 if (use_vec_extr)
43837 break;
43839 if (TARGET_SSE2)
43841 switch (elt)
43843 case 0:
43844 tmp = vec;
43845 break;
43847 case 1:
43848 case 3:
43849 tmp = gen_reg_rtx (mode);
43850 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
43851 GEN_INT (elt), GEN_INT (elt),
43852 GEN_INT (elt), GEN_INT (elt)));
43853 break;
43855 case 2:
43856 tmp = gen_reg_rtx (mode);
43857 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
43858 break;
43860 default:
43861 gcc_unreachable ();
43863 vec = tmp;
43864 use_vec_extr = true;
43865 elt = 0;
43867 else
43869 /* For SSE1, we have to reuse the V4SF code. */
43870 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
43871 gen_lowpart (V4SFmode, vec), elt);
43872 return;
43874 break;
43876 case E_V8HImode:
43877 use_vec_extr = TARGET_SSE2;
43878 break;
43879 case E_V4HImode:
43880 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43881 break;
43883 case E_V16QImode:
43884 use_vec_extr = TARGET_SSE4_1;
43885 break;
43887 case E_V8SFmode:
43888 if (TARGET_AVX)
43890 tmp = gen_reg_rtx (V4SFmode);
43891 if (elt < 4)
43892 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
43893 else
43894 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
43895 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43896 return;
43898 break;
43900 case E_V4DFmode:
43901 if (TARGET_AVX)
43903 tmp = gen_reg_rtx (V2DFmode);
43904 if (elt < 2)
43905 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
43906 else
43907 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
43908 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43909 return;
43911 break;
43913 case E_V32QImode:
43914 if (TARGET_AVX)
43916 tmp = gen_reg_rtx (V16QImode);
43917 if (elt < 16)
43918 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
43919 else
43920 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
43921 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43922 return;
43924 break;
43926 case E_V16HImode:
43927 if (TARGET_AVX)
43929 tmp = gen_reg_rtx (V8HImode);
43930 if (elt < 8)
43931 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
43932 else
43933 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43934 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43935 return;
43937 break;
43939 case E_V8SImode:
43940 if (TARGET_AVX)
43942 tmp = gen_reg_rtx (V4SImode);
43943 if (elt < 4)
43944 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43945 else
43946 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43947 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43948 return;
43950 break;
43952 case E_V4DImode:
43953 if (TARGET_AVX)
43955 tmp = gen_reg_rtx (V2DImode);
43956 if (elt < 2)
43957 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43958 else
43959 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43960 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43961 return;
43963 break;
43965 case E_V32HImode:
43966 if (TARGET_AVX512BW)
43968 tmp = gen_reg_rtx (V16HImode);
43969 if (elt < 16)
43970 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43971 else
43972 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43973 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43974 return;
43976 break;
43978 case E_V64QImode:
43979 if (TARGET_AVX512BW)
43981 tmp = gen_reg_rtx (V32QImode);
43982 if (elt < 32)
43983 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43984 else
43985 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43986 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43987 return;
43989 break;
43991 case E_V16SFmode:
43992 tmp = gen_reg_rtx (V8SFmode);
43993 if (elt < 8)
43994 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43995 else
43996 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43997 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43998 return;
44000 case E_V8DFmode:
44001 tmp = gen_reg_rtx (V4DFmode);
44002 if (elt < 4)
44003 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44004 else
44005 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44006 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44007 return;
44009 case E_V16SImode:
44010 tmp = gen_reg_rtx (V8SImode);
44011 if (elt < 8)
44012 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44013 else
44014 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44015 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44016 return;
44018 case E_V8DImode:
44019 tmp = gen_reg_rtx (V4DImode);
44020 if (elt < 4)
44021 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44022 else
44023 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44024 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44025 return;
44027 case E_V8QImode:
44028 /* ??? Could extract the appropriate HImode element and shift. */
44029 default:
44030 break;
44033 if (use_vec_extr)
44035 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44036 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44038 /* Let the rtl optimizers know about the zero extension performed. */
44039 if (inner_mode == QImode || inner_mode == HImode)
44041 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44042 target = gen_lowpart (SImode, target);
44045 emit_insn (gen_rtx_SET (target, tmp));
44047 else
44049 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44051 emit_move_insn (mem, vec);
44053 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44054 emit_move_insn (target, tmp);
44058 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44059 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44060 The upper bits of DEST are undefined, though they shouldn't cause
44061 exceptions (some bits from src or all zeros are ok). */
44063 static void
44064 emit_reduc_half (rtx dest, rtx src, int i)
44066 rtx tem, d = dest;
44067 switch (GET_MODE (src))
44069 case E_V4SFmode:
44070 if (i == 128)
44071 tem = gen_sse_movhlps (dest, src, src);
44072 else
44073 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44074 GEN_INT (1 + 4), GEN_INT (1 + 4));
44075 break;
44076 case E_V2DFmode:
44077 tem = gen_vec_interleave_highv2df (dest, src, src);
44078 break;
44079 case E_V16QImode:
44080 case E_V8HImode:
44081 case E_V4SImode:
44082 case E_V2DImode:
44083 d = gen_reg_rtx (V1TImode);
44084 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44085 GEN_INT (i / 2));
44086 break;
44087 case E_V8SFmode:
44088 if (i == 256)
44089 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44090 else
44091 tem = gen_avx_shufps256 (dest, src, src,
44092 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44093 break;
44094 case E_V4DFmode:
44095 if (i == 256)
44096 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44097 else
44098 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44099 break;
44100 case E_V32QImode:
44101 case E_V16HImode:
44102 case E_V8SImode:
44103 case E_V4DImode:
44104 if (i == 256)
44106 if (GET_MODE (dest) != V4DImode)
44107 d = gen_reg_rtx (V4DImode);
44108 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44109 gen_lowpart (V4DImode, src),
44110 const1_rtx);
44112 else
44114 d = gen_reg_rtx (V2TImode);
44115 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44116 GEN_INT (i / 2));
44118 break;
44119 case E_V64QImode:
44120 case E_V32HImode:
44121 case E_V16SImode:
44122 case E_V16SFmode:
44123 case E_V8DImode:
44124 case E_V8DFmode:
44125 if (i > 128)
44126 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44127 gen_lowpart (V16SImode, src),
44128 gen_lowpart (V16SImode, src),
44129 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44130 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44131 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44132 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44133 GEN_INT (0xC), GEN_INT (0xD),
44134 GEN_INT (0xE), GEN_INT (0xF),
44135 GEN_INT (0x10), GEN_INT (0x11),
44136 GEN_INT (0x12), GEN_INT (0x13),
44137 GEN_INT (0x14), GEN_INT (0x15),
44138 GEN_INT (0x16), GEN_INT (0x17));
44139 else
44140 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44141 gen_lowpart (V16SImode, src),
44142 GEN_INT (i == 128 ? 0x2 : 0x1),
44143 GEN_INT (0x3),
44144 GEN_INT (0x3),
44145 GEN_INT (0x3),
44146 GEN_INT (i == 128 ? 0x6 : 0x5),
44147 GEN_INT (0x7),
44148 GEN_INT (0x7),
44149 GEN_INT (0x7),
44150 GEN_INT (i == 128 ? 0xA : 0x9),
44151 GEN_INT (0xB),
44152 GEN_INT (0xB),
44153 GEN_INT (0xB),
44154 GEN_INT (i == 128 ? 0xE : 0xD),
44155 GEN_INT (0xF),
44156 GEN_INT (0xF),
44157 GEN_INT (0xF));
44158 break;
44159 default:
44160 gcc_unreachable ();
44162 emit_insn (tem);
44163 if (d != dest)
44164 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44167 /* Expand a vector reduction. FN is the binary pattern to reduce;
44168 DEST is the destination; IN is the input vector. */
44170 void
44171 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44173 rtx half, dst, vec = in;
44174 machine_mode mode = GET_MODE (in);
44175 int i;
44177 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
44178 if (TARGET_SSE4_1
44179 && mode == V8HImode
44180 && fn == gen_uminv8hi3)
44182 emit_insn (gen_sse4_1_phminposuw (dest, in));
44183 return;
44186 for (i = GET_MODE_BITSIZE (mode);
44187 i > GET_MODE_UNIT_BITSIZE (mode);
44188 i >>= 1)
44190 half = gen_reg_rtx (mode);
44191 emit_reduc_half (half, vec, i);
44192 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44193 dst = dest;
44194 else
44195 dst = gen_reg_rtx (mode);
44196 emit_insn (fn (dst, half, vec));
44197 vec = dst;
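/* Illustrative sketch (not part of GCC): the reduction loop above keeps
   folding the upper half of the vector onto the lower half, log2(nelt)
   times, so element 0 of the final vector holds the full reduction.
   A plain C model of the same idea for an 8-element array and a binary
   operation: */
static int
reduce_sketch (int (*fn) (int, int), int v[8])
{
  for (int width = 8; width > 1; width >>= 1)
    for (int j = 0; j < width / 2; j++)
      v[j] = fn (v[j], v[j + width / 2]);   /* fold upper half onto lower */
  return v[0];
}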
44201 /* Target hook for scalar_mode_supported_p. */
44202 static bool
44203 ix86_scalar_mode_supported_p (scalar_mode mode)
44205 if (DECIMAL_FLOAT_MODE_P (mode))
44206 return default_decimal_float_supported_p ();
44207 else if (mode == TFmode)
44208 return true;
44209 else
44210 return default_scalar_mode_supported_p (mode);
44213 /* Implements target hook vector_mode_supported_p. */
44214 static bool
44215 ix86_vector_mode_supported_p (machine_mode mode)
44217 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44218 return true;
44219 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44220 return true;
44221 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44222 return true;
44223 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44224 return true;
44225 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44226 return true;
44227 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44228 return true;
44229 return false;
44232 /* Target hook for c_mode_for_suffix. */
44233 static machine_mode
44234 ix86_c_mode_for_suffix (char suffix)
44236 if (suffix == 'q')
44237 return TFmode;
44238 if (suffix == 'w')
44239 return XFmode;
44241 return VOIDmode;
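/* Example (added for illustration, not in the original file): these
   suffixes are what make literals such as
       __float128 x = 1.442695040888963407359924681001892137Q;
       __float80  y = 3.14159265358979323846W;
   parse with the intended type; 'q'/'Q' selects TFmode (__float128)
   and 'w'/'W' selects XFmode (__float80).  */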
44244 /* Worker function for TARGET_MD_ASM_ADJUST.
44246 We implement asm flag outputs, and maintain source compatibility
44247 with the old cc0-based compiler. */
44249 static rtx_insn *
44250 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44251 vec<const char *> &constraints,
44252 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44254 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44255 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44257 bool saw_asm_flag = false;
44259 start_sequence ();
44260 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44262 const char *con = constraints[i];
44263 if (strncmp (con, "=@cc", 4) != 0)
44264 continue;
44265 con += 4;
44266 if (strchr (con, ',') != NULL)
44268 error ("alternatives not allowed in asm flag output");
44269 continue;
44272 bool invert = false;
44273 if (con[0] == 'n')
44274 invert = true, con++;
44276 machine_mode mode = CCmode;
44277 rtx_code code = UNKNOWN;
44279 switch (con[0])
44281 case 'a':
44282 if (con[1] == 0)
44283 mode = CCAmode, code = EQ;
44284 else if (con[1] == 'e' && con[2] == 0)
44285 mode = CCCmode, code = NE;
44286 break;
44287 case 'b':
44288 if (con[1] == 0)
44289 mode = CCCmode, code = EQ;
44290 else if (con[1] == 'e' && con[2] == 0)
44291 mode = CCAmode, code = NE;
44292 break;
44293 case 'c':
44294 if (con[1] == 0)
44295 mode = CCCmode, code = EQ;
44296 break;
44297 case 'e':
44298 if (con[1] == 0)
44299 mode = CCZmode, code = EQ;
44300 break;
44301 case 'g':
44302 if (con[1] == 0)
44303 mode = CCGCmode, code = GT;
44304 else if (con[1] == 'e' && con[2] == 0)
44305 mode = CCGCmode, code = GE;
44306 break;
44307 case 'l':
44308 if (con[1] == 0)
44309 mode = CCGCmode, code = LT;
44310 else if (con[1] == 'e' && con[2] == 0)
44311 mode = CCGCmode, code = LE;
44312 break;
44313 case 'o':
44314 if (con[1] == 0)
44315 mode = CCOmode, code = EQ;
44316 break;
44317 case 'p':
44318 if (con[1] == 0)
44319 mode = CCPmode, code = EQ;
44320 break;
44321 case 's':
44322 if (con[1] == 0)
44323 mode = CCSmode, code = EQ;
44324 break;
44325 case 'z':
44326 if (con[1] == 0)
44327 mode = CCZmode, code = EQ;
44328 break;
44330 if (code == UNKNOWN)
44332 error ("unknown asm flag output %qs", constraints[i]);
44333 continue;
44335 if (invert)
44336 code = reverse_condition (code);
44338 rtx dest = outputs[i];
44339 if (!saw_asm_flag)
44341 /* This is the first asm flag output. Here we put the flags
44342 register in as the real output and adjust the condition to
44343 allow it. */
44344 constraints[i] = "=Bf";
44345 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44346 saw_asm_flag = true;
44348 else
44350 /* We don't need the flags register as output twice. */
44351 constraints[i] = "=X";
44352 outputs[i] = gen_rtx_SCRATCH (SImode);
44355 rtx x = gen_rtx_REG (mode, FLAGS_REG);
44356 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44358 machine_mode dest_mode = GET_MODE (dest);
44359 if (!SCALAR_INT_MODE_P (dest_mode))
44361 error ("invalid type for asm flag output");
44362 continue;
44365 if (dest_mode == DImode && !TARGET_64BIT)
44366 dest_mode = SImode;
44368 if (dest_mode != QImode)
44370 rtx destqi = gen_reg_rtx (QImode);
44371 emit_insn (gen_rtx_SET (destqi, x));
44373 if (TARGET_ZERO_EXTEND_WITH_AND
44374 && optimize_function_for_speed_p (cfun))
44376 x = force_reg (dest_mode, const0_rtx);
44378 emit_insn (gen_movstrictqi
44379 (gen_lowpart (QImode, x), destqi));
44381 else
44382 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44385 if (dest_mode != GET_MODE (dest))
44387 rtx tmp = gen_reg_rtx (SImode);
44389 emit_insn (gen_rtx_SET (tmp, x));
44390 emit_insn (gen_zero_extendsidi2 (dest, tmp));
44392 else
44393 emit_insn (gen_rtx_SET (dest, x));
44395 rtx_insn *seq = get_insns ();
44396 end_sequence ();
44398 if (saw_asm_flag)
44399 return seq;
44400 else
44402 /* If we had no asm flag outputs, clobber the flags. */
44403 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44404 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44405 return NULL;
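/* Illustrative example (not part of GCC): user code that exercises the
   "=@cc<cond>" flag outputs handled above.  The asm's only real output
   becomes the flags register; the boolean is then materialized with a
   setcc and, if needed, zero-extended to the operand's mode, exactly as
   the code above arranges.  */
static int
carry_of_compare (unsigned a, unsigned b)
{
  int carry;
  /* "c" condition <-> carry flag; result is 1 iff a < b as unsigned.  */
  __asm__ ("cmp %2, %1" : "=@ccc" (carry) : "r" (a), "r" (b));
  return carry;
}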
44409 /* Implements the target hook targetm.asm.encode_section_info. */
44411 static void ATTRIBUTE_UNUSED
44412 ix86_encode_section_info (tree decl, rtx rtl, int first)
44414 default_encode_section_info (decl, rtl, first);
44416 if (ix86_in_large_data_p (decl))
44417 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44420 /* Worker function for REVERSE_CONDITION. */
44422 enum rtx_code
44423 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44425 return (mode == CCFPmode
44426 ? reverse_condition_maybe_unordered (code)
44427 : reverse_condition (code));
44430 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44431 to OPERANDS[0]. */
44433 const char *
44434 output_387_reg_move (rtx_insn *insn, rtx *operands)
44436 if (REG_P (operands[0]))
44438 if (REG_P (operands[1])
44439 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44441 if (REGNO (operands[0]) == FIRST_STACK_REG)
44442 return output_387_ffreep (operands, 0);
44443 return "fstp\t%y0";
44445 if (STACK_TOP_P (operands[0]))
44446 return "fld%Z1\t%y1";
44447 return "fst\t%y0";
44449 else if (MEM_P (operands[0]))
44451 gcc_assert (REG_P (operands[1]));
44452 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44453 return "fstp%Z0\t%y0";
44454 else
44456 /* There is no non-popping store to memory for XFmode.
44457 So if we need one, follow the store with a load. */
44458 if (GET_MODE (operands[0]) == XFmode)
44459 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44460 else
44461 return "fst%Z0\t%y0";
44464 else
44465 gcc_unreachable ();
44468 /* Output code to perform a conditional jump to LABEL, if C2 flag in
44469 FP status register is set. */
44471 void
44472 ix86_emit_fp_unordered_jump (rtx label)
44474 rtx reg = gen_reg_rtx (HImode);
44475 rtx temp;
44477 emit_insn (gen_x86_fnstsw_1 (reg));
44479 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44481 emit_insn (gen_x86_sahf_1 (reg));
44483 temp = gen_rtx_REG (CCmode, FLAGS_REG);
44484 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44486 else
44488 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
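/* Added note: FNSTSW places the x87 condition codes in the high byte of
   the status word (C0 = bit 8, C1 = bit 9, C2 = bit 10, C3 = bit 14), so
   testing the high byte against 0x04 above checks C2.  */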
44490 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44491 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44494 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44495 gen_rtx_LABEL_REF (VOIDmode, label),
44496 pc_rtx);
44497 temp = gen_rtx_SET (pc_rtx, temp);
44499 emit_jump_insn (temp);
44500 predict_jump (REG_BR_PROB_BASE * 10 / 100);
44503 /* Output code to perform a log1p XFmode calculation. */
44505 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44507 rtx_code_label *label1 = gen_label_rtx ();
44508 rtx_code_label *label2 = gen_label_rtx ();
44510 rtx tmp = gen_reg_rtx (XFmode);
44511 rtx tmp2 = gen_reg_rtx (XFmode);
44512 rtx test;
44514 emit_insn (gen_absxf2 (tmp, op1));
44515 test = gen_rtx_GE (VOIDmode, tmp,
44516 const_double_from_real_value (
44517 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44518 XFmode));
44519 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
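/* Added note: FYL2XP1, used on the fall-through path below, is only
   specified for |x| < 1 - sqrt(2)/2 ~= 0.2928...; the constant tested
   above is exactly that bound, so larger inputs take the label1 path
   and use FYL2X on 1 + x instead.  */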
44521 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44522 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44523 emit_jump (label2);
44525 emit_label (label1);
44526 emit_move_insn (tmp, CONST1_RTX (XFmode));
44527 emit_insn (gen_addxf3 (tmp, op1, tmp));
44528 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44529 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44531 emit_label (label2);
44534 /* Emit code for round calculation. */
44535 void ix86_emit_i387_round (rtx op0, rtx op1)
44537 machine_mode inmode = GET_MODE (op1);
44538 machine_mode outmode = GET_MODE (op0);
44539 rtx e1, e2, res, tmp, tmp1, half;
44540 rtx scratch = gen_reg_rtx (HImode);
44541 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44542 rtx_code_label *jump_label = gen_label_rtx ();
44543 rtx insn;
44544 rtx (*gen_abs) (rtx, rtx);
44545 rtx (*gen_neg) (rtx, rtx);
44547 switch (inmode)
44549 case E_SFmode:
44550 gen_abs = gen_abssf2;
44551 break;
44552 case E_DFmode:
44553 gen_abs = gen_absdf2;
44554 break;
44555 case E_XFmode:
44556 gen_abs = gen_absxf2;
44557 break;
44558 default:
44559 gcc_unreachable ();
44562 switch (outmode)
44564 case E_SFmode:
44565 gen_neg = gen_negsf2;
44566 break;
44567 case E_DFmode:
44568 gen_neg = gen_negdf2;
44569 break;
44570 case E_XFmode:
44571 gen_neg = gen_negxf2;
44572 break;
44573 case E_HImode:
44574 gen_neg = gen_neghi2;
44575 break;
44576 case E_SImode:
44577 gen_neg = gen_negsi2;
44578 break;
44579 case E_DImode:
44580 gen_neg = gen_negdi2;
44581 break;
44582 default:
44583 gcc_unreachable ();
44586 e1 = gen_reg_rtx (inmode);
44587 e2 = gen_reg_rtx (inmode);
44588 res = gen_reg_rtx (outmode);
44590 half = const_double_from_real_value (dconsthalf, inmode);
44592 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
44594 /* scratch = fxam(op1) */
44595 emit_insn (gen_rtx_SET (scratch,
44596 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
44597 UNSPEC_FXAM)));
44598 /* e1 = fabs(op1) */
44599 emit_insn (gen_abs (e1, op1));
44601 /* e2 = e1 + 0.5 */
44602 half = force_reg (inmode, half);
44603 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
44605 /* res = floor(e2) */
44606 if (inmode != XFmode)
44608 tmp1 = gen_reg_rtx (XFmode);
44610 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
44612 else
44613 tmp1 = e2;
44615 switch (outmode)
44617 case E_SFmode:
44618 case E_DFmode:
44620 rtx tmp0 = gen_reg_rtx (XFmode);
44622 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
44624 emit_insn (gen_rtx_SET (res,
44625 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
44626 UNSPEC_TRUNC_NOOP)));
44628 break;
44629 case E_XFmode:
44630 emit_insn (gen_frndintxf2_floor (res, tmp1));
44631 break;
44632 case E_HImode:
44633 emit_insn (gen_lfloorxfhi2 (res, tmp1));
44634 break;
44635 case E_SImode:
44636 emit_insn (gen_lfloorxfsi2 (res, tmp1));
44637 break;
44638 case E_DImode:
44639 emit_insn (gen_lfloorxfdi2 (res, tmp1));
44640 break;
44641 default:
44642 gcc_unreachable ();
44645 /* flags = signbit(a) */
44646 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
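/* Added note: after FXAM, C1 (bit 9 of the status word) holds the sign
   of the operand; the high-byte test against 0x02 above reads exactly
   that bit, so the branch below skips the negation for non-negative
   inputs.  */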
44648 /* if (flags) then res = -res */
44649 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
44650 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
44651 gen_rtx_LABEL_REF (VOIDmode, jump_label),
44652 pc_rtx);
44653 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44654 predict_jump (REG_BR_PROB_BASE * 50 / 100);
44655 JUMP_LABEL (insn) = jump_label;
44657 emit_insn (gen_neg (res, res));
44659 emit_label (jump_label);
44660 LABEL_NUSES (jump_label) = 1;
44662 emit_move_insn (op0, res);
44665 /* Output code to perform a Newton-Raphson approximation of a single precision
44666 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
44668 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
44670 rtx x0, x1, e0, e1;
44672 x0 = gen_reg_rtx (mode);
44673 e0 = gen_reg_rtx (mode);
44674 e1 = gen_reg_rtx (mode);
44675 x1 = gen_reg_rtx (mode);
44677 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
44679 b = force_reg (mode, b);
44681 /* x0 = rcp(b) estimate */
44682 if (mode == V16SFmode || mode == V8DFmode)
44684 if (TARGET_AVX512ER)
44686 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44687 UNSPEC_RCP28)));
44688 /* res = a * x0 */
44689 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
44690 return;
44692 else
44693 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44694 UNSPEC_RCP14)));
44696 else
44697 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44698 UNSPEC_RCP)));
44700 /* e0 = x0 * b */
44701 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
44703 /* e0 = x0 * e0 */
44704 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
44706 /* e1 = x0 + x0 */
44707 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
44709 /* x1 = e1 - e0 */
44710 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
44712 /* res = a * x1 */
44713 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
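/* Illustrative sketch (not part of GCC): scalar shape of the sequence
   above.  With e = 1 - b*x0, x1 = 2*x0 - b*x0*x0 = x0*(1 + e), so one
   Newton-Raphson step squares the relative error of the hardware
   reciprocal estimate.  rcp_estimate_stub is a stand-in for
   RCPSS/RCP14/RCP28.  */
static float
rcp_estimate_stub (float b)
{
  return 1.0f / b;              /* the real insns give ~12-bit accuracy */
}
static float
nr_recip_div_sketch (float a, float b)
{
  float x0 = rcp_estimate_stub (b);
  float e0 = x0 * b;            /* e0 = x0 * b          */
  e0 = x0 * e0;                 /* e0 = b * x0 * x0     */
  float e1 = x0 + x0;           /* e1 = 2 * x0          */
  float x1 = e1 - e0;           /* x1 = x0 * (2 - b*x0) */
  return a * x1;                /* a/b ~= a * x1        */
}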
44716 /* Output code to perform a Newton-Raphson approximation of a
44717 single precision floating point [reciprocal] square root. */
44719 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
44721 rtx x0, e0, e1, e2, e3, mthree, mhalf;
44722 REAL_VALUE_TYPE r;
44723 int unspec;
44725 x0 = gen_reg_rtx (mode);
44726 e0 = gen_reg_rtx (mode);
44727 e1 = gen_reg_rtx (mode);
44728 e2 = gen_reg_rtx (mode);
44729 e3 = gen_reg_rtx (mode);
44731 if (TARGET_AVX512ER && mode == V16SFmode)
44733 if (recip)
44734 /* res = rsqrt28(a) estimate */
44735 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44736 UNSPEC_RSQRT28)));
44737 else
44739 /* x0 = rsqrt28(a) estimate */
44740 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44741 UNSPEC_RSQRT28)));
44742 /* res = rcp28(x0) estimate */
44743 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
44744 UNSPEC_RCP28)));
44746 return;
44749 real_from_integer (&r, VOIDmode, -3, SIGNED);
44750 mthree = const_double_from_real_value (r, SFmode);
44752 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
44753 mhalf = const_double_from_real_value (r, SFmode);
44754 unspec = UNSPEC_RSQRT;
44756 if (VECTOR_MODE_P (mode))
44758 mthree = ix86_build_const_vector (mode, true, mthree);
44759 mhalf = ix86_build_const_vector (mode, true, mhalf);
44760 /* There is no 512-bit rsqrt. There is however rsqrt14. */
44761 if (GET_MODE_SIZE (mode) == 64)
44762 unspec = UNSPEC_RSQRT14;
44765 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
44766 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
44768 a = force_reg (mode, a);
44770 /* x0 = rsqrt(a) estimate */
44771 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44772 unspec)));
44774 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
44775 if (!recip)
44777 rtx zero = force_reg (mode, CONST0_RTX(mode));
44778 rtx mask;
44780 /* Handle masked compare. */
44781 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
44783 mask = gen_reg_rtx (HImode);
44784 /* Imm value 0x4 corresponds to not-equal comparison. */
44785 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
44786 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
44788 else
44790 mask = gen_reg_rtx (mode);
44791 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
44792 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
44796 /* e0 = x0 * a */
44797 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
44798 /* e1 = e0 * x0 */
44799 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
44801 /* e2 = e1 - 3. */
44802 mthree = force_reg (mode, mthree);
44803 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
44805 mhalf = force_reg (mode, mhalf);
44806 if (recip)
44807 /* e3 = -.5 * x0 */
44808 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
44809 else
44810 /* e3 = -.5 * e0 */
44811 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
44812 /* ret = e2 * e3 */
44813 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
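/* Illustrative sketch (not part of GCC): scalar shape of the sequence
   above.  One Newton-Raphson step for 1/sqrt(a) is
   x1 = x0*(3 - a*x0*x0)/2, rewritten as -0.5*x0*(a*x0*x0 - 3); the sqrt
   variant multiplies by a*x0 (~ sqrt(a)) instead of x0.
   rsqrt_estimate_stub stands in for RSQRTSS/RSQRT14.  */
static float
rsqrt_estimate_stub (float a)
{
  return 1.0f / __builtin_sqrtf (a);   /* the real insns give ~12-bit accuracy */
}
static float
nr_rsqrt_sketch (float a, int recip)
{
  float x0 = rsqrt_estimate_stub (a);
  float e0 = x0 * a;                   /* e0 = a * x0          */
  float e1 = e0 * x0;                  /* e1 = a * x0 * x0     */
  float e2 = e1 + -3.0f;               /* e2 = a*x0*x0 - 3     */
  float e3 = (recip ? x0 : e0) * -0.5f;
  return e2 * e3;                      /* rsqrt(a) or sqrt(a)  */
}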
44816 #ifdef TARGET_SOLARIS
44817 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
44819 static void
44820 i386_solaris_elf_named_section (const char *name, unsigned int flags,
44821 tree decl)
44823 /* With Binutils 2.15, the "@unwind" marker must be specified on
44824 every occurrence of the ".eh_frame" section, not just the first
44825 one. */
44826 if (TARGET_64BIT
44827 && strcmp (name, ".eh_frame") == 0)
44829 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
44830 flags & SECTION_WRITE ? "aw" : "a");
44831 return;
44834 #ifndef USE_GAS
44835 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
44837 solaris_elf_asm_comdat_section (name, flags, decl);
44838 return;
44840 #endif
44842 default_elf_asm_named_section (name, flags, decl);
44844 #endif /* TARGET_SOLARIS */
44846 /* Return the mangling of TYPE if it is an extended fundamental type. */
44848 static const char *
44849 ix86_mangle_type (const_tree type)
44851 type = TYPE_MAIN_VARIANT (type);
44853 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
44854 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
44855 return NULL;
44857 switch (TYPE_MODE (type))
44859 case E_TFmode:
44860 /* __float128 is "g". */
44861 return "g";
44862 case E_XFmode:
44863 /* "long double" or __float80 is "e". */
44864 return "e";
44865 default:
44866 return NULL;
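/* Example (added for illustration, not in the original file): with this
   hook, void f(__float128) mangles as _Z1fg and void g(__float80) as
   _Z1ge, the same "e" that the generic Itanium ABI rules use for
   long double.  */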
44870 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
44872 static tree
44873 ix86_stack_protect_guard (void)
44875 if (TARGET_SSP_TLS_GUARD)
44877 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
44878 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
44879 tree type = build_qualified_type (type_node, qual);
44880 tree t;
44882 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
44884 t = ix86_tls_stack_chk_guard_decl;
44886 if (t == NULL)
44888 rtx x;
44890 t = build_decl
44891 (UNKNOWN_LOCATION, VAR_DECL,
44892 get_identifier (ix86_stack_protector_guard_symbol_str),
44893 type);
44894 TREE_STATIC (t) = 1;
44895 TREE_PUBLIC (t) = 1;
44896 DECL_EXTERNAL (t) = 1;
44897 TREE_USED (t) = 1;
44898 TREE_THIS_VOLATILE (t) = 1;
44899 DECL_ARTIFICIAL (t) = 1;
44900 DECL_IGNORED_P (t) = 1;
44902 /* Do not share RTL as the declaration is visible outside of
44903 current function. */
44904 x = DECL_RTL (t);
44905 RTX_FLAG (x, used) = 1;
44907 ix86_tls_stack_chk_guard_decl = t;
44910 else
44912 tree asptrtype = build_pointer_type (type);
44914 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
44915 t = build2 (MEM_REF, asptrtype, t,
44916 build_int_cst (asptrtype, 0));
44919 return t;
44922 return default_stack_protect_guard ();
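/* Example (added for illustration, not in the original file): with the
   glibc defaults the TLS guard is read as %fs:0x28 on 64-bit and
   %gs:0x14 on 32-bit; the symbol path above corresponds to an explicit
   -mstack-protector-guard-symbol=<sym>, combined with
   -mstack-protector-guard-reg= and -mstack-protector-guard-offset=.  */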
44925 /* For 32-bit code we can save PIC register setup by using
44926 the __stack_chk_fail_local hidden function instead of calling
44927 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
44928 register, so it is better to call __stack_chk_fail directly. */
44930 static tree ATTRIBUTE_UNUSED
44931 ix86_stack_protect_fail (void)
44933 return TARGET_64BIT
44934 ? default_external_stack_protect_fail ()
44935 : default_hidden_stack_protect_fail ();
44938 /* Select a format to encode pointers in exception handling data. CODE
44939 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44940 true if the symbol may be affected by dynamic relocations.
44942 ??? All x86 object file formats are capable of representing this.
44943 After all, the relocation needed is the same as for the call insn.
44944 Whether or not a particular assembler allows us to enter such, I
44945 guess we'll have to see. */
44946 int
44947 asm_preferred_eh_data_format (int code, int global)
44949 if (flag_pic)
44951 int type = DW_EH_PE_sdata8;
44952 if (!TARGET_64BIT
44953 || ix86_cmodel == CM_SMALL_PIC
44954 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44955 type = DW_EH_PE_sdata4;
44956 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44958 if (ix86_cmodel == CM_SMALL
44959 || (ix86_cmodel == CM_MEDIUM && code))
44960 return DW_EH_PE_udata4;
44961 return DW_EH_PE_absptr;
44964 /* Expand copysign from SIGN to the positive value ABS_VALUE
44965 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44966 the sign-bit. */
44967 static void
44968 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44970 machine_mode mode = GET_MODE (sign);
44971 rtx sgn = gen_reg_rtx (mode);
44972 if (mask == NULL_RTX)
44974 machine_mode vmode;
44976 if (mode == SFmode)
44977 vmode = V4SFmode;
44978 else if (mode == DFmode)
44979 vmode = V2DFmode;
44980 else
44981 vmode = mode;
44983 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44984 if (!VECTOR_MODE_P (mode))
44986 /* We need to generate a scalar mode mask in this case. */
44987 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44988 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44989 mask = gen_reg_rtx (mode);
44990 emit_insn (gen_rtx_SET (mask, tmp));
44993 else
44994 mask = gen_rtx_NOT (mode, mask);
44995 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44996 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44999 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45000 mask for masking out the sign-bit is stored in *SMASK, if that is
45001 non-null. */
45002 static rtx
45003 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45005 machine_mode vmode, mode = GET_MODE (op0);
45006 rtx xa, mask;
45008 xa = gen_reg_rtx (mode);
45009 if (mode == SFmode)
45010 vmode = V4SFmode;
45011 else if (mode == DFmode)
45012 vmode = V2DFmode;
45013 else
45014 vmode = mode;
45015 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45016 if (!VECTOR_MODE_P (mode))
45018 /* We need to generate a scalar mode mask in this case. */
45019 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45020 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45021 mask = gen_reg_rtx (mode);
45022 emit_insn (gen_rtx_SET (mask, tmp));
45024 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45026 if (smask)
45027 *smask = mask;
45029 return xa;
45032 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45033 swapping the operands if SWAP_OPERANDS is true. The expanded
45034 code is a forward jump to a newly created label in case the
45035 comparison is true. The generated label rtx is returned. */
45036 static rtx_code_label *
45037 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45038 bool swap_operands)
45040 bool unordered_compare = ix86_unordered_fp_compare (code);
45041 rtx_code_label *label;
45042 rtx tmp, reg;
45044 if (swap_operands)
45045 std::swap (op0, op1);
45047 label = gen_label_rtx ();
45048 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
45049 if (unordered_compare)
45050 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
45051 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
45052 emit_insn (gen_rtx_SET (reg, tmp));
45053 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
45054 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45055 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45056 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45057 JUMP_LABEL (tmp) = label;
45059 return label;
45062 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45063 using comparison code CODE. Operands are swapped for the comparison if
45064 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45065 static rtx
45066 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45067 bool swap_operands)
45069 rtx (*insn)(rtx, rtx, rtx, rtx);
45070 machine_mode mode = GET_MODE (op0);
45071 rtx mask = gen_reg_rtx (mode);
45073 if (swap_operands)
45074 std::swap (op0, op1);
45076 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45078 emit_insn (insn (mask, op0, op1,
45079 gen_rtx_fmt_ee (code, mode, op0, op1)));
45080 return mask;
45083 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45084 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
45085 static rtx
45086 ix86_gen_TWO52 (machine_mode mode)
45088 REAL_VALUE_TYPE TWO52r;
45089 rtx TWO52;
45091 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45092 TWO52 = const_double_from_real_value (TWO52r, mode);
45093 TWO52 = force_reg (mode, TWO52);
45095 return TWO52;
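/* Illustrative sketch (not part of GCC): the constant built above drives
   the classic add/subtract trick used by the expanders below.  For a
   double with 0 <= x < 2**52, adding 2**52 pushes all fraction bits out
   of the mantissa, so (x + 2**52) - 2**52 is x rounded to an integer in
   the current rounding mode.  */
static double
round_via_two52_sketch (double x)    /* assumes 0 <= x < 2**52 */
{
  volatile double t = x + 4503599627370496.0;   /* 2**52 */
  return t - 4503599627370496.0;
}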
45098 /* Expand SSE sequence for computing lround from OP1 storing
45099 into OP0. */
45100 void
45101 ix86_expand_lround (rtx op0, rtx op1)
45103 /* C code for the stuff we're doing below:
45104 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45105 return (long)tmp;
45107 machine_mode mode = GET_MODE (op1);
45108 const struct real_format *fmt;
45109 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45110 rtx adj;
45112 /* load nextafter (0.5, 0.0) */
45113 fmt = REAL_MODE_FORMAT (mode);
45114 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45115 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45117 /* adj = copysign (0.5, op1) */
45118 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45119 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45121 /* adj = op1 + adj */
45122 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45124 /* op0 = (imode)adj */
45125 expand_fix (op0, adj, 0);
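/* Added note: the "nextafter (0.5, 0.0)" adjustment above (the largest
   representable value below 0.5) matters for inputs just below 0.5.
   For the largest double below 0.5, adding a full 0.5 would round up to
   1.0 and lround would return 1; adding pred(0.5) keeps the sum below
   1.0, so the truncating conversion correctly yields 0.  */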
45128 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
45129 into OPERAND0. */
45130 void
45131 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45133 /* C code for the stuff we're doing below (for do_floor):
45134 xi = (long)op1;
45135 xi -= (double)xi > op1 ? 1 : 0;
45136 return xi;
45138 machine_mode fmode = GET_MODE (op1);
45139 machine_mode imode = GET_MODE (op0);
45140 rtx ireg, freg, tmp;
45141 rtx_code_label *label;
45143 /* reg = (long)op1 */
45144 ireg = gen_reg_rtx (imode);
45145 expand_fix (ireg, op1, 0);
45147 /* freg = (double)reg */
45148 freg = gen_reg_rtx (fmode);
45149 expand_float (freg, ireg, 0);
45151 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45152 label = ix86_expand_sse_compare_and_jump (UNLE,
45153 freg, op1, !do_floor);
45154 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45155 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45156 emit_move_insn (ireg, tmp);
45158 emit_label (label);
45159 LABEL_NUSES (label) = 1;
45161 emit_move_insn (op0, ireg);
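/* Worked example (added for illustration): lfloor (-1.5) first truncates
   to xi = -1; since (double) -1 > -1.5 the branch above (taken when
   freg <= op1) is not taken and 1 is subtracted, giving the correct -2.
   For lceil the comparison operands are swapped and 1 is added instead.  */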
45164 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
45165 void
45166 ix86_expand_rint (rtx operand0, rtx operand1)
45168 /* C code for the stuff we're doing below:
45169 xa = fabs (operand1);
45170 if (!isless (xa, 2**52))
45171 return operand1;
45172 two52 = 2**52;
45173 if (flag_rounding_math)
45175 two52 = copysign (two52, operand1);
45176 xa = operand1;
45178 xa = xa + two52 - two52;
45179 return copysign (xa, operand1);
45181 machine_mode mode = GET_MODE (operand0);
45182 rtx res, xa, TWO52, two52, mask;
45183 rtx_code_label *label;
45185 res = gen_reg_rtx (mode);
45186 emit_move_insn (res, operand1);
45188 /* xa = abs (operand1) */
45189 xa = ix86_expand_sse_fabs (res, &mask);
45191 /* if (!isless (xa, TWO52)) goto label; */
45192 TWO52 = ix86_gen_TWO52 (mode);
45193 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45195 two52 = TWO52;
45196 if (flag_rounding_math)
45198 two52 = gen_reg_rtx (mode);
45199 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
45200 xa = res;
45203 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
45204 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
45206 ix86_sse_copysign_to_positive (res, xa, res, mask);
45208 emit_label (label);
45209 LABEL_NUSES (label) = 1;
45211 emit_move_insn (operand0, res);
45214 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45215 into OPERAND0. */
45216 void
45217 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45219 /* C code for the stuff we expand below.
45220 double xa = fabs (x), x2;
45221 if (!isless (xa, TWO52))
45222 return x;
45223 xa = xa + TWO52 - TWO52;
45224 x2 = copysign (xa, x);
45225 Compensate. Floor:
45226 if (x2 > x)
45227 x2 -= 1;
45228 Compensate. Ceil:
45229 if (x2 < x)
45230 x2 -= -1;
45231 return x2;
45233 machine_mode mode = GET_MODE (operand0);
45234 rtx xa, TWO52, tmp, one, res, mask;
45235 rtx_code_label *label;
45237 TWO52 = ix86_gen_TWO52 (mode);
45239 /* Temporary for holding the result, initialized to the input
45240 operand to ease control flow. */
45241 res = gen_reg_rtx (mode);
45242 emit_move_insn (res, operand1);
45244 /* xa = abs (operand1) */
45245 xa = ix86_expand_sse_fabs (res, &mask);
45247 /* if (!isless (xa, TWO52)) goto label; */
45248 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45250 /* xa = xa + TWO52 - TWO52; */
45251 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45252 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45254 /* xa = copysign (xa, operand1) */
45255 ix86_sse_copysign_to_positive (xa, xa, res, mask);
45257 /* generate 1.0 or -1.0 */
45258 one = force_reg (mode,
45259 const_double_from_real_value (do_floor
45260 ? dconst1 : dconstm1, mode));
45262 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45263 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45264 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45265 /* We always need to subtract here to preserve signed zero. */
45266 tmp = expand_simple_binop (mode, MINUS,
45267 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45268 emit_move_insn (res, tmp);
45270 emit_label (label);
45271 LABEL_NUSES (label) = 1;
45273 emit_move_insn (operand0, res);
45276 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45277 into OPERAND0. */
45278 void
45279 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45281 /* C code for the stuff we expand below.
45282 double xa = fabs (x), x2;
45283 if (!isless (xa, TWO52))
45284 return x;
45285 x2 = (double)(long)x;
45286 Compensate. Floor:
45287 if (x2 > x)
45288 x2 -= 1;
45289 Compensate. Ceil:
45290 if (x2 < x)
45291 x2 += 1;
45292 if (HONOR_SIGNED_ZEROS (mode))
45293 return copysign (x2, x);
45294 return x2;
45296 machine_mode mode = GET_MODE (operand0);
45297 rtx xa, xi, TWO52, tmp, one, res, mask;
45298 rtx_code_label *label;
45300 TWO52 = ix86_gen_TWO52 (mode);
45302 /* Temporary for holding the result, initialized to the input
45303 operand to ease control flow. */
45304 res = gen_reg_rtx (mode);
45305 emit_move_insn (res, operand1);
45307 /* xa = abs (operand1) */
45308 xa = ix86_expand_sse_fabs (res, &mask);
45310 /* if (!isless (xa, TWO52)) goto label; */
45311 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45313 /* xa = (double)(long)x */
45314 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45315 expand_fix (xi, res, 0);
45316 expand_float (xa, xi, 0);
45318 /* generate 1.0 */
45319 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45321 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45322 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45323 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45324 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45325 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45326 emit_move_insn (res, tmp);
45328 if (HONOR_SIGNED_ZEROS (mode))
45329 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45331 emit_label (label);
45332 LABEL_NUSES (label) = 1;
45334 emit_move_insn (operand0, res);
45337 /* Expand SSE sequence for computing round from OPERAND1 storing
45338 into OPERAND0. Sequence that works without relying on DImode truncation
45339 via cvttsd2siq that is only available on 64bit targets. */
45340 void
45341 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45343 /* C code for the stuff we expand below.
45344 double xa = fabs (x), xa2, x2;
45345 if (!isless (xa, TWO52))
45346 return x;
45347 Using the absolute value and copying back sign makes
45348 -0.0 -> -0.0 correct.
45349 xa2 = xa + TWO52 - TWO52;
45350 Compensate.
45351 dxa = xa2 - xa;
45352 if (dxa <= -0.5)
45353 xa2 += 1;
45354 else if (dxa > 0.5)
45355 xa2 -= 1;
45356 x2 = copysign (xa2, x);
45357 return x2;
45359 machine_mode mode = GET_MODE (operand0);
45360 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45361 rtx_code_label *label;
45363 TWO52 = ix86_gen_TWO52 (mode);
45365 /* Temporary for holding the result, initialized to the input
45366 operand to ease control flow. */
45367 res = gen_reg_rtx (mode);
45368 emit_move_insn (res, operand1);
45370 /* xa = abs (operand1) */
45371 xa = ix86_expand_sse_fabs (res, &mask);
45373 /* if (!isless (xa, TWO52)) goto label; */
45374 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45376 /* xa2 = xa + TWO52 - TWO52; */
45377 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45378 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45380 /* dxa = xa2 - xa; */
45381 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45383 /* generate 0.5, 1.0 and -0.5 */
45384 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45385 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45386 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45387 0, OPTAB_DIRECT);
45389 /* Compensate. */
45390 tmp = gen_reg_rtx (mode);
45391 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45392 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45393 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45394 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45395 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45396 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45397 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45398 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45400 /* res = copysign (xa2, operand1) */
45401 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45403 emit_label (label);
45404 LABEL_NUSES (label) = 1;
45406 emit_move_insn (operand0, res);
45409 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45410 into OPERAND0. */
45411 void
45412 ix86_expand_trunc (rtx operand0, rtx operand1)
45414 /* C code for SSE variant we expand below.
45415 double xa = fabs (x), x2;
45416 if (!isless (xa, TWO52))
45417 return x;
45418 x2 = (double)(long)x;
45419 if (HONOR_SIGNED_ZEROS (mode))
45420 return copysign (x2, x);
45421 return x2;
45423 machine_mode mode = GET_MODE (operand0);
45424 rtx xa, xi, TWO52, res, mask;
45425 rtx_code_label *label;
45427 TWO52 = ix86_gen_TWO52 (mode);
45429 /* Temporary for holding the result, initialized to the input
45430 operand to ease control flow. */
45431 res = gen_reg_rtx (mode);
45432 emit_move_insn (res, operand1);
45434 /* xa = abs (operand1) */
45435 xa = ix86_expand_sse_fabs (res, &mask);
45437 /* if (!isless (xa, TWO52)) goto label; */
45438 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45440 /* x = (double)(long)x */
45441 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45442 expand_fix (xi, res, 0);
45443 expand_float (res, xi, 0);
45445 if (HONOR_SIGNED_ZEROS (mode))
45446 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45448 emit_label (label);
45449 LABEL_NUSES (label) = 1;
45451 emit_move_insn (operand0, res);
45454 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
45455 OPERAND0, without relying on DImode truncation (cvttsd2siq needs 64-bit). */
45456 void
45457 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45459 machine_mode mode = GET_MODE (operand0);
45460 rtx xa, mask, TWO52, one, res, smask, tmp;
45461 rtx_code_label *label;
45463 /* C code for SSE variant we expand below.
45464 double xa = fabs (x), x2;
45465 if (!isless (xa, TWO52))
45466 return x;
45467 xa2 = xa + TWO52 - TWO52;
45468 Compensate:
45469 if (xa2 > xa)
45470 xa2 -= 1.0;
45471 x2 = copysign (xa2, x);
45472 return x2;
45475 TWO52 = ix86_gen_TWO52 (mode);
45477 /* Temporary for holding the result, initialized to the input
45478 operand to ease control flow. */
45479 res = gen_reg_rtx (mode);
45480 emit_move_insn (res, operand1);
45482 /* xa = abs (operand1) */
45483 xa = ix86_expand_sse_fabs (res, &smask);
45485 /* if (!isless (xa, TWO52)) goto label; */
45486 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45488 /* res = xa + TWO52 - TWO52; */
45489 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45490 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45491 emit_move_insn (res, tmp);
45493 /* generate 1.0 */
45494 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45496 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
45497 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45498 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45499 tmp = expand_simple_binop (mode, MINUS,
45500 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45501 emit_move_insn (res, tmp);
45503 /* res = copysign (res, operand1) */
45504 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45506 emit_label (label);
45507 LABEL_NUSES (label) = 1;
45509 emit_move_insn (operand0, res);
45512 /* Expand SSE sequence for computing round from OPERAND1 storing
45513 into OPERAND0. */
45514 void
45515 ix86_expand_round (rtx operand0, rtx operand1)
45517 /* C code for the stuff we're doing below:
45518 double xa = fabs (x);
45519 if (!isless (xa, TWO52))
45520 return x;
45521 xa = (double)(long)(xa + nextafter (0.5, 0.0));
45522 return copysign (xa, x);
45524 machine_mode mode = GET_MODE (operand0);
45525 rtx res, TWO52, xa, xi, half, mask;
45526 rtx_code_label *label;
45527 const struct real_format *fmt;
45528 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45530 /* Temporary for holding the result, initialized to the input
45531 operand to ease control flow. */
45532 res = gen_reg_rtx (mode);
45533 emit_move_insn (res, operand1);
45535 TWO52 = ix86_gen_TWO52 (mode);
45536 xa = ix86_expand_sse_fabs (res, &mask);
45537 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45539 /* load nextafter (0.5, 0.0) */
45540 fmt = REAL_MODE_FORMAT (mode);
45541 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45542 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45544 /* xa = xa + 0.5 */
45545 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45546 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45548 /* xa = (double)(int64_t)xa */
45549 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45550 expand_fix (xi, xa, 0);
45551 expand_float (xa, xi, 0);
45553 /* res = copysign (xa, operand1) */
45554 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45556 emit_label (label);
45557 LABEL_NUSES (label) = 1;
45559 emit_move_insn (operand0, res);
45562 /* Expand SSE sequence for computing round
45563 from OP1 storing into OP0 using sse4 round insn. */
45564 void
45565 ix86_expand_round_sse4 (rtx op0, rtx op1)
45567 machine_mode mode = GET_MODE (op0);
45568 rtx e1, e2, res, half;
45569 const struct real_format *fmt;
45570 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45571 rtx (*gen_copysign) (rtx, rtx, rtx);
45572 rtx (*gen_round) (rtx, rtx, rtx);
45574 switch (mode)
45576 case E_SFmode:
45577 gen_copysign = gen_copysignsf3;
45578 gen_round = gen_sse4_1_roundsf2;
45579 break;
45580 case E_DFmode:
45581 gen_copysign = gen_copysigndf3;
45582 gen_round = gen_sse4_1_rounddf2;
45583 break;
45584 default:
45585 gcc_unreachable ();
45588 /* round (a) = trunc (a + copysign (0.5, a)) */
45590 /* load nextafter (0.5, 0.0) */
45591 fmt = REAL_MODE_FORMAT (mode);
45592 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45593 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45594 half = const_double_from_real_value (pred_half, mode);
45596 /* e1 = copysign (0.5, op1) */
45597 e1 = gen_reg_rtx (mode);
45598 emit_insn (gen_copysign (e1, half, op1));
45600 /* e2 = op1 + e1 */
45601 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
45603 /* res = trunc (e2) */
45604 res = gen_reg_rtx (mode);
45605 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
45607 emit_move_insn (op0, res);
45611 /* Table of valid machine attributes. */
45612 static const struct attribute_spec ix86_attribute_table[] =
45614 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
45615 affects_type_identity, handler, exclude } */
45616 /* Stdcall attribute says callee is responsible for popping arguments
45617 if they are not variable. */
45618 { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45619 NULL },
45620 /* Fastcall attribute says callee is responsible for popping arguments
45621 if they are not variable. */
45622 { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45623 NULL },
45624 /* Thiscall attribute says callee is responsible for popping arguments
45625 if they are not variable. */
45626 { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45627 NULL },
45628 /* Cdecl attribute says the callee is a normal C declaration */
45629 { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45630 NULL },
45631 /* Regparm attribute specifies how many integer arguments are to be
45632 passed in registers. */
45633 { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
45634 NULL },
45635 /* Sseregparm attribute says we are using x86_64 calling conventions
45636 for FP arguments. */
45637 { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45638 NULL },
45639 /* The transactional memory builtins are implicitly regparm or fastcall
45640 depending on the ABI. Override the generic do-nothing attribute that
45641 these builtins were declared with. */
45642 { "*tm regparm", 0, 0, false, true, true, true,
45643 ix86_handle_tm_regparm_attribute, NULL },
45644 /* force_align_arg_pointer says this function realigns the stack at entry. */
45645 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
45646 false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
45647 NULL },
45648 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
45649 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
45650 NULL },
45651 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
45652 NULL },
45653 { "shared", 0, 0, true, false, false, false,
45654 ix86_handle_shared_attribute, NULL },
45655 #endif
45656 { "ms_struct", 0, 0, false, false, false, false,
45657 ix86_handle_struct_attribute, NULL },
45658 { "gcc_struct", 0, 0, false, false, false, false,
45659 ix86_handle_struct_attribute, NULL },
45660 #ifdef SUBTARGET_ATTRIBUTE_TABLE
45661 SUBTARGET_ATTRIBUTE_TABLE,
45662 #endif
45663 /* ms_abi and sysv_abi calling convention function attributes. */
45664 { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
45665 { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
45666 NULL },
45667 { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
45668 { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
45669 { "ms_hook_prologue", 0, 0, true, false, false, false,
45670 ix86_handle_fndecl_attribute, NULL },
45671 { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
45672 ix86_handle_callee_pop_aggregate_return, NULL },
45673 { "interrupt", 0, 0, false, true, true, false,
45674 ix86_handle_interrupt_attribute, NULL },
45675 { "no_caller_saved_registers", 0, 0, false, true, true, false,
45676 ix86_handle_no_caller_saved_registers_attribute, NULL },
45677 { "naked", 0, 0, true, false, false, false,
45678 ix86_handle_fndecl_attribute, NULL },
45679 { "indirect_branch", 1, 1, true, false, false, false,
45680 ix86_handle_fndecl_attribute, NULL },
45681 { "function_return", 1, 1, true, false, false, false,
45682 ix86_handle_fndecl_attribute, NULL },
45684 /* End element. */
45685 { NULL, 0, 0, false, false, false, false, NULL, NULL }
45688 /* Implement targetm.vectorize.builtin_vectorization_cost. */
45689 static int
45690 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
45691 tree vectype, int)
45693 bool fp = false;
45694 machine_mode mode = TImode;
45695 int index;
45696 if (vectype != NULL)
45698 fp = FLOAT_TYPE_P (vectype);
45699 mode = TYPE_MODE (vectype);
45702 switch (type_of_cost)
45704 case scalar_stmt:
45705 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
45707 case scalar_load:
45708 /* load/store costs are relative to register move which is 2. Recompute
45709 it to COSTS_N_INSNS so everything has the same base. */
45710 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
45711 : ix86_cost->int_load [2]) / 2;
45713 case scalar_store:
45714 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
45715 : ix86_cost->int_store [2]) / 2;
45717 case vector_stmt:
45718 return ix86_vec_cost (mode,
45719 fp ? ix86_cost->addss : ix86_cost->sse_op,
45720 true);
45722 case vector_load:
45723 index = sse_store_index (mode);
45724 /* See PR82713 - we may end up being called on non-vector type. */
45725 if (index < 0)
45726 index = 2;
45727 return ix86_vec_cost (mode,
45728 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
45729 true);
45731 case vector_store:
45732 index = sse_store_index (mode);
45733 /* See PR82713 - we may end up being called on non-vector type. */
45734 if (index < 0)
45735 index = 2;
45736 return ix86_vec_cost (mode,
45737 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
45738 true);
45740 case vec_to_scalar:
45741 case scalar_to_vec:
45742 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
45744 /* We should have separate costs for unaligned loads and gather/scatter.
45745 Do that incrementally. */
45746 case unaligned_load:
45747 index = sse_store_index (mode);
45748 /* See PR82713 - we may end up being called on non-vector type. */
45749 if (index < 0)
45750 index = 2;
45751 return ix86_vec_cost (mode,
45752 COSTS_N_INSNS
45753 (ix86_cost->sse_unaligned_load[index]) / 2,
45754 true);
45756 case unaligned_store:
45757 index = sse_store_index (mode);
45758 /* See PR82713 - we may end up being called on non-vector type. */
45759 if (index < 0)
45760 index = 2;
45761 return ix86_vec_cost (mode,
45762 COSTS_N_INSNS
45763 (ix86_cost->sse_unaligned_store[index]) / 2,
45764 true);
45766 case vector_gather_load:
45767 return ix86_vec_cost (mode,
45768 COSTS_N_INSNS
45769 (ix86_cost->gather_static
45770 + ix86_cost->gather_per_elt
45771 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
45772 true);
45774 case vector_scatter_store:
45775 return ix86_vec_cost (mode,
45776 COSTS_N_INSNS
45777 (ix86_cost->scatter_static
45778 + ix86_cost->scatter_per_elt
45779 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
45780 true);
45782 case cond_branch_taken:
45783 return ix86_cost->cond_taken_branch_cost;
45785 case cond_branch_not_taken:
45786 return ix86_cost->cond_not_taken_branch_cost;
45788 case vec_perm:
45789 case vec_promote_demote:
45790 return ix86_vec_cost (mode,
45791 ix86_cost->sse_op, true);
45793 case vec_construct:
45794 return ix86_vec_cost (mode, ix86_cost->sse_op, false);
45796 default:
45797 gcc_unreachable ();
45801 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
45802 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
45803 insn every time. */
45805 static GTY(()) rtx_insn *vselect_insn;
45807 /* Initialize vselect_insn. */
45809 static void
45810 init_vselect_insn (void)
45812 unsigned i;
45813 rtx x;
45815 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
45816 for (i = 0; i < MAX_VECT_LEN; ++i)
45817 XVECEXP (x, 0, i) = const0_rtx;
45818 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
45819 const0_rtx), x);
45820 x = gen_rtx_SET (const0_rtx, x);
45821 start_sequence ();
45822 vselect_insn = emit_insn (x);
45823 end_sequence ();
45826 /* Construct (set target (vec_select op0 (parallel perm))) and
45827 return true if that's a valid instruction in the active ISA. */
45829 static bool
45830 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
45831 unsigned nelt, bool testing_p)
45833 unsigned int i;
45834 rtx x, save_vconcat;
45835 int icode;
45837 if (vselect_insn == NULL_RTX)
45838 init_vselect_insn ();
45840 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
45841 PUT_NUM_ELEM (XVEC (x, 0), nelt);
45842 for (i = 0; i < nelt; ++i)
45843 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
45844 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45845 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
45846 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
45847 SET_DEST (PATTERN (vselect_insn)) = target;
45848 icode = recog_memoized (vselect_insn);
45850 if (icode >= 0 && !testing_p)
45851 emit_insn (copy_rtx (PATTERN (vselect_insn)));
45853 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
45854 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
45855 INSN_CODE (vselect_insn) = -1;
45857 return icode >= 0;
45860 /* Similar, but generate a vec_concat from op0 and op1 as well. */
45862 static bool
45863 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
45864 const unsigned char *perm, unsigned nelt,
45865 bool testing_p)
45867 machine_mode v2mode;
45868 rtx x;
45869 bool ok;
45871 if (vselect_insn == NULL_RTX)
45872 init_vselect_insn ();
45874 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
45875 return false;
45876 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
45877 PUT_MODE (x, v2mode);
45878 XEXP (x, 0) = op0;
45879 XEXP (x, 1) = op1;
45880 ok = expand_vselect (target, x, perm, nelt, testing_p);
45881 XEXP (x, 0) = const0_rtx;
45882 XEXP (x, 1) = const0_rtx;
45883 return ok;
45886 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45887 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
45889 static bool
45890 expand_vec_perm_blend (struct expand_vec_perm_d *d)
45892 machine_mode mmode, vmode = d->vmode;
45893 unsigned i, mask, nelt = d->nelt;
45894 rtx target, op0, op1, maskop, x;
45895 rtx rperm[32], vperm;
45897 if (d->one_operand_p)
45898 return false;
45899 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
45900 && (TARGET_AVX512BW
45901 || GET_MODE_UNIT_SIZE (vmode) >= 4))
45903 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45905 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45907 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45909 else
45910 return false;
45912 /* This is a blend, not a permute. Elements must stay in their
45913 respective lanes. */
45914 for (i = 0; i < nelt; ++i)
45916 unsigned e = d->perm[i];
45917 if (!(e == i || e == i + nelt))
45918 return false;
45921 if (d->testing_p)
45922 return true;
45924 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
45925 decision should be extracted elsewhere, so that we only try that
45926 sequence once all budget==3 options have been tried. */
45927 target = d->target;
45928 op0 = d->op0;
45929 op1 = d->op1;
45930 mask = 0;
45932 switch (vmode)
45934 case E_V8DFmode:
45935 case E_V16SFmode:
45936 case E_V4DFmode:
45937 case E_V8SFmode:
45938 case E_V2DFmode:
45939 case E_V4SFmode:
45940 case E_V8HImode:
45941 case E_V8SImode:
45942 case E_V32HImode:
45943 case E_V64QImode:
45944 case E_V16SImode:
45945 case E_V8DImode:
45946 for (i = 0; i < nelt; ++i)
45947 mask |= (d->perm[i] >= nelt) << i;
45948 break;
45950 case E_V2DImode:
45951 for (i = 0; i < 2; ++i)
45952 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
45953 vmode = V8HImode;
45954 goto do_subreg;
45956 case E_V4SImode:
45957 for (i = 0; i < 4; ++i)
45958 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45959 vmode = V8HImode;
45960 goto do_subreg;
45962 case E_V16QImode:
45963 /* See if bytes move in pairs so we can use pblendw with
45964 an immediate argument, rather than pblendvb with a vector
45965 argument. */
45966 for (i = 0; i < 16; i += 2)
45967 if (d->perm[i] + 1 != d->perm[i + 1])
45969 use_pblendvb:
45970 for (i = 0; i < nelt; ++i)
45971 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45973 finish_pblendvb:
45974 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45975 vperm = force_reg (vmode, vperm);
45977 if (GET_MODE_SIZE (vmode) == 16)
45978 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45979 else
45980 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45981 if (target != d->target)
45982 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45983 return true;
45986 for (i = 0; i < 8; ++i)
45987 mask |= (d->perm[i * 2] >= 16) << i;
45988 vmode = V8HImode;
45989 /* FALLTHRU */
45991 do_subreg:
45992 target = gen_reg_rtx (vmode);
45993 op0 = gen_lowpart (vmode, op0);
45994 op1 = gen_lowpart (vmode, op1);
45995 break;
45997 case E_V32QImode:
45998 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45999 for (i = 0; i < 32; i += 2)
46000 if (d->perm[i] + 1 != d->perm[i + 1])
46001 goto use_pblendvb;
46002 /* See if bytes move in quadruplets. If yes, vpblendd
46003 with immediate can be used. */
46004 for (i = 0; i < 32; i += 4)
46005 if (d->perm[i] + 2 != d->perm[i + 2])
46006 break;
46007 if (i < 32)
46009 /* See if bytes move the same in both lanes. If yes,
46010 vpblendw with immediate can be used. */
46011 for (i = 0; i < 16; i += 2)
46012 if (d->perm[i] + 16 != d->perm[i + 16])
46013 goto use_pblendvb;
46015 /* Use vpblendw. */
46016 for (i = 0; i < 16; ++i)
46017 mask |= (d->perm[i * 2] >= 32) << i;
46018 vmode = V16HImode;
46019 goto do_subreg;
46022 /* Use vpblendd. */
46023 for (i = 0; i < 8; ++i)
46024 mask |= (d->perm[i * 4] >= 32) << i;
46025 vmode = V8SImode;
46026 goto do_subreg;
46028 case E_V16HImode:
46029 /* See if words move in pairs. If yes, vpblendd can be used. */
46030 for (i = 0; i < 16; i += 2)
46031 if (d->perm[i] + 1 != d->perm[i + 1])
46032 break;
46033 if (i < 16)
46035 /* See if words move the same in both lanes. If not,
46036 vpblendvb must be used. */
46037 for (i = 0; i < 8; i++)
46038 if (d->perm[i] + 8 != d->perm[i + 8])
46040 /* Use vpblendvb. */
46041 for (i = 0; i < 32; ++i)
46042 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46044 vmode = V32QImode;
46045 nelt = 32;
46046 target = gen_reg_rtx (vmode);
46047 op0 = gen_lowpart (vmode, op0);
46048 op1 = gen_lowpart (vmode, op1);
46049 goto finish_pblendvb;
46052 /* Use vpblendw. */
46053 for (i = 0; i < 16; ++i)
46054 mask |= (d->perm[i] >= 16) << i;
46055 break;
46058 /* Use vpblendd. */
46059 for (i = 0; i < 8; ++i)
46060 mask |= (d->perm[i * 2] >= 16) << i;
46061 vmode = V8SImode;
46062 goto do_subreg;
46064 case E_V4DImode:
46065 /* Use vpblendd. */
46066 for (i = 0; i < 4; ++i)
46067 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46068 vmode = V8SImode;
46069 goto do_subreg;
46071 default:
46072 gcc_unreachable ();
46075 switch (vmode)
46077 case E_V8DFmode:
46078 case E_V8DImode:
46079 mmode = QImode;
46080 break;
46081 case E_V16SFmode:
46082 case E_V16SImode:
46083 mmode = HImode;
46084 break;
46085 case E_V32HImode:
46086 mmode = SImode;
46087 break;
46088 case E_V64QImode:
46089 mmode = DImode;
46090 break;
46091 default:
46092 mmode = VOIDmode;
46095 if (mmode != VOIDmode)
46096 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46097 else
46098 maskop = GEN_INT (mask);
46100 /* This matches five different patterns with the different modes. */
46101 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46102 x = gen_rtx_SET (target, x);
46103 emit_insn (x);
46104 if (target != d->target)
46105 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46107 return true;
46110 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46111 in terms of the variable form of vpermilps.
46113 Note that we will have already failed the immediate input vpermilps,
46114 which requires that the high and low part shuffle be identical; the
46115 variable form doesn't require that. */
46117 static bool
46118 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46120 rtx rperm[8], vperm;
46121 unsigned i;
46123 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46124 return false;
46126 /* We can only permute within the 128-bit lane. */
46127 for (i = 0; i < 8; ++i)
46129 unsigned e = d->perm[i];
46130 if (i < 4 ? e >= 4 : e < 4)
46131 return false;
46134 if (d->testing_p)
46135 return true;
46137 for (i = 0; i < 8; ++i)
46139 unsigned e = d->perm[i];
46141 /* Within each 128-bit lane, the elements of op0 are numbered
46142 from 0 and the elements of op1 are numbered from 4. */
46143 if (e >= 8 + 4)
46144 e -= 8;
46145 else if (e >= 4)
46146 e -= 4;
46148 rperm[i] = GEN_INT (e);
46151 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46152 vperm = force_reg (V8SImode, vperm);
46153 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46155 return true;
46158 /* Return true if permutation D can be performed as VMODE permutation
46159 instead. */
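/* E.g. a V16QImode permutation that moves bytes only in aligned pairs,
   such as { 2, 3, 0, 1, 6, 7, 4, 5, ... }, is also a valid V8HImode
   permutation ({ 1, 0, 3, 2, ... }), while one that splits a pair is not.  */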
46161 static bool
46162 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46164 unsigned int i, j, chunk;
46166 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46167 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46168 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46169 return false;
46171 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46172 return true;
46174 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46175 for (i = 0; i < d->nelt; i += chunk)
46176 if (d->perm[i] & (chunk - 1))
46177 return false;
46178 else
46179 for (j = 1; j < chunk; ++j)
46180 if (d->perm[i] + j != d->perm[i + j])
46181 return false;
46183 return true;
46186 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46187 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46189 static bool
46190 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46192 unsigned i, nelt, eltsz, mask;
46193 unsigned char perm[64];
46194 machine_mode vmode = V16QImode;
46195 rtx rperm[64], vperm, target, op0, op1;
46197 nelt = d->nelt;
46199 if (!d->one_operand_p)
46201 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46203 if (TARGET_AVX2
46204 && valid_perm_using_mode_p (V2TImode, d))
46206 if (d->testing_p)
46207 return true;
46209 /* Use vperm2i128 insn. The pattern uses
46210 V4DImode instead of V2TImode. */
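/* The immediate is built from d->perm[0] and d->perm[nelt/2]: dividing each
   by nelt/2 gives the index (0-3) of the 128-bit chunk of the concatenated
   operands that supplies the corresponding half of the result; e.g. op0's
   high lane followed by op1's low lane yields 1 | (2 << 4) = 0x21.  */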
46211 target = d->target;
46212 if (d->vmode != V4DImode)
46213 target = gen_reg_rtx (V4DImode);
46214 op0 = gen_lowpart (V4DImode, d->op0);
46215 op1 = gen_lowpart (V4DImode, d->op1);
46216 rperm[0]
46217 = GEN_INT ((d->perm[0] / (nelt / 2))
46218 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46219 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46220 if (target != d->target)
46221 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46222 return true;
46224 return false;
46227 else
46229 if (GET_MODE_SIZE (d->vmode) == 16)
46231 if (!TARGET_SSSE3)
46232 return false;
46234 else if (GET_MODE_SIZE (d->vmode) == 32)
46236 if (!TARGET_AVX2)
46237 return false;
46239 /* V4DImode should already be handled through
46240 expand_vselect by the vpermq instruction. */
46241 gcc_assert (d->vmode != V4DImode);
46243 vmode = V32QImode;
46244 if (d->vmode == V8SImode
46245 || d->vmode == V16HImode
46246 || d->vmode == V32QImode)
46248 /* First see if vpermq can be used for
46249 V8SImode/V16HImode/V32QImode. */
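/* E.g. the V8SImode permutation { 2, 3, 0, 1, 6, 7, 4, 5 } moves whole
   64-bit chunks and becomes the V4DImode permutation { 1, 0, 3, 2 }.  */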
46250 if (valid_perm_using_mode_p (V4DImode, d))
46252 for (i = 0; i < 4; i++)
46253 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46254 if (d->testing_p)
46255 return true;
46256 target = gen_reg_rtx (V4DImode);
46257 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46258 perm, 4, false))
46260 emit_move_insn (d->target,
46261 gen_lowpart (d->vmode, target));
46262 return true;
46264 return false;
46267 /* Next see if vpermd can be used. */
46268 if (valid_perm_using_mode_p (V8SImode, d))
46269 vmode = V8SImode;
46271 /* Or if vpermps can be used. */
46272 else if (d->vmode == V8SFmode)
46273 vmode = V8SImode;
46275 if (vmode == V32QImode)
46277 /* vpshufb only works within 128-bit lanes; it cannot
46278 shuffle bytes between the lanes. */
46279 for (i = 0; i < nelt; ++i)
46280 if ((d->perm[i] ^ i) & (nelt / 2))
46281 return false;
46284 else if (GET_MODE_SIZE (d->vmode) == 64)
46286 if (!TARGET_AVX512BW)
46287 return false;
46289 /* If vpermq didn't work, vpshufb won't work either. */
46290 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46291 return false;
46293 vmode = V64QImode;
46294 if (d->vmode == V16SImode
46295 || d->vmode == V32HImode
46296 || d->vmode == V64QImode)
46298 /* First see if vpermq can be used for
46299 V16SImode/V32HImode/V64QImode. */
46300 if (valid_perm_using_mode_p (V8DImode, d))
46302 for (i = 0; i < 8; i++)
46303 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46304 if (d->testing_p)
46305 return true;
46306 target = gen_reg_rtx (V8DImode);
46307 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46308 perm, 8, false))
46310 emit_move_insn (d->target,
46311 gen_lowpart (d->vmode, target));
46312 return true;
46314 return false;
46317 /* Next see if vpermd can be used. */
46318 if (valid_perm_using_mode_p (V16SImode, d))
46319 vmode = V16SImode;
46321 /* Or if vpermps can be used. */
46322 else if (d->vmode == V16SFmode)
46323 vmode = V16SImode;
46324 if (vmode == V64QImode)
46326 /* vpshufb only works within 128-bit lanes; it cannot
46327 shuffle bytes between the lanes. */
46328 for (i = 0; i < nelt; ++i)
46329 if ((d->perm[i] ^ i) & (nelt / 4))
46330 return false;
46333 else
46334 return false;
46337 if (d->testing_p)
46338 return true;
46340 if (vmode == V8SImode)
46341 for (i = 0; i < 8; ++i)
46342 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46343 else if (vmode == V16SImode)
46344 for (i = 0; i < 16; ++i)
46345 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46346 else
46348 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46349 if (!d->one_operand_p)
46350 mask = 2 * nelt - 1;
46351 else if (vmode == V16QImode)
46352 mask = nelt - 1;
46353 else if (vmode == V64QImode)
46354 mask = nelt / 4 - 1;
46355 else
46356 mask = nelt / 2 - 1;
46358 for (i = 0; i < nelt; ++i)
46360 unsigned j, e = d->perm[i] & mask;
46361 for (j = 0; j < eltsz; ++j)
46362 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46366 vperm = gen_rtx_CONST_VECTOR (vmode,
46367 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46368 vperm = force_reg (vmode, vperm);
46370 target = d->target;
46371 if (d->vmode != vmode)
46372 target = gen_reg_rtx (vmode);
46373 op0 = gen_lowpart (vmode, d->op0);
46374 if (d->one_operand_p)
46376 if (vmode == V16QImode)
46377 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46378 else if (vmode == V32QImode)
46379 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46380 else if (vmode == V64QImode)
46381 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46382 else if (vmode == V8SFmode)
46383 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46384 else if (vmode == V8SImode)
46385 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46386 else if (vmode == V16SFmode)
46387 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46388 else if (vmode == V16SImode)
46389 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46390 else
46391 gcc_unreachable ();
46393 else
46395 op1 = gen_lowpart (vmode, d->op1);
46396 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46398 if (target != d->target)
46399 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46401 return true;
46404 /* For V*[QHS]Imode permutations, check whether the same permutation
46405 can be performed in a 2x, 4x or 8x wider inner mode; if so, rewrite D into ND using that mode. */
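/* E.g. a V16QImode permutation { 2, 3, 0, 1, 6, 7, 4, 5, ... } becomes the
   V8HImode permutation { 1, 0, 3, 2, ... }; the recursion below may widen
   it further until the inner mode reaches DImode.  */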
46407 static bool
46408 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46409 struct expand_vec_perm_d *nd)
46411 int i;
46412 machine_mode mode = VOIDmode;
46414 switch (d->vmode)
46416 case E_V16QImode: mode = V8HImode; break;
46417 case E_V32QImode: mode = V16HImode; break;
46418 case E_V64QImode: mode = V32HImode; break;
46419 case E_V8HImode: mode = V4SImode; break;
46420 case E_V16HImode: mode = V8SImode; break;
46421 case E_V32HImode: mode = V16SImode; break;
46422 case E_V4SImode: mode = V2DImode; break;
46423 case E_V8SImode: mode = V4DImode; break;
46424 case E_V16SImode: mode = V8DImode; break;
46425 default: return false;
46427 for (i = 0; i < d->nelt; i += 2)
46428 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46429 return false;
46430 nd->vmode = mode;
46431 nd->nelt = d->nelt / 2;
46432 for (i = 0; i < nd->nelt; i++)
46433 nd->perm[i] = d->perm[2 * i] / 2;
46434 if (GET_MODE_INNER (mode) != DImode)
46435 canonicalize_vector_int_perm (nd, nd);
46436 if (nd != d)
46438 nd->one_operand_p = d->one_operand_p;
46439 nd->testing_p = d->testing_p;
46440 if (d->op0 == d->op1)
46441 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46442 else
46444 nd->op0 = gen_lowpart (nd->vmode, d->op0);
46445 nd->op1 = gen_lowpart (nd->vmode, d->op1);
46447 if (d->testing_p)
46448 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46449 else
46450 nd->target = gen_reg_rtx (nd->vmode);
46452 return true;
46455 /* Try to expand one-operand permutation with constant mask. */
46457 static bool
46458 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46460 machine_mode mode = GET_MODE (d->op0);
46461 machine_mode maskmode = mode;
46462 rtx (*gen) (rtx, rtx, rtx) = NULL;
46463 rtx target, op0, mask;
46464 rtx vec[64];
46466 if (!rtx_equal_p (d->op0, d->op1))
46467 return false;
46469 if (!TARGET_AVX512F)
46470 return false;
46472 switch (mode)
46474 case E_V16SImode:
46475 gen = gen_avx512f_permvarv16si;
46476 break;
46477 case E_V16SFmode:
46478 gen = gen_avx512f_permvarv16sf;
46479 maskmode = V16SImode;
46480 break;
46481 case E_V8DImode:
46482 gen = gen_avx512f_permvarv8di;
46483 break;
46484 case E_V8DFmode:
46485 gen = gen_avx512f_permvarv8df;
46486 maskmode = V8DImode;
46487 break;
46488 default:
46489 return false;
46492 target = d->target;
46493 op0 = d->op0;
46494 for (int i = 0; i < d->nelt; ++i)
46495 vec[i] = GEN_INT (d->perm[i]);
46496 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46497 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46498 return true;
46501 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
46502 in a single instruction. */
46504 static bool
46505 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46507 unsigned i, nelt = d->nelt;
46508 struct expand_vec_perm_d nd;
46510 /* Check plain VEC_SELECT first, because AVX has instructions that could
46511 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46512 input where SEL+CONCAT may not. */
46513 if (d->one_operand_p)
46515 int mask = nelt - 1;
46516 bool identity_perm = true;
46517 bool broadcast_perm = true;
46519 for (i = 0; i < nelt; i++)
46521 nd.perm[i] = d->perm[i] & mask;
46522 if (nd.perm[i] != i)
46523 identity_perm = false;
46524 if (nd.perm[i])
46525 broadcast_perm = false;
46528 if (identity_perm)
46530 if (!d->testing_p)
46531 emit_move_insn (d->target, d->op0);
46532 return true;
46534 else if (broadcast_perm && TARGET_AVX2)
46536 /* Use vpbroadcast{b,w,d}. */
46537 rtx (*gen) (rtx, rtx) = NULL;
46538 switch (d->vmode)
46540 case E_V64QImode:
46541 if (TARGET_AVX512BW)
46542 gen = gen_avx512bw_vec_dupv64qi_1;
46543 break;
46544 case E_V32QImode:
46545 gen = gen_avx2_pbroadcastv32qi_1;
46546 break;
46547 case E_V32HImode:
46548 if (TARGET_AVX512BW)
46549 gen = gen_avx512bw_vec_dupv32hi_1;
46550 break;
46551 case E_V16HImode:
46552 gen = gen_avx2_pbroadcastv16hi_1;
46553 break;
46554 case E_V16SImode:
46555 if (TARGET_AVX512F)
46556 gen = gen_avx512f_vec_dupv16si_1;
46557 break;
46558 case E_V8SImode:
46559 gen = gen_avx2_pbroadcastv8si_1;
46560 break;
46561 case E_V16QImode:
46562 gen = gen_avx2_pbroadcastv16qi;
46563 break;
46564 case E_V8HImode:
46565 gen = gen_avx2_pbroadcastv8hi;
46566 break;
46567 case E_V16SFmode:
46568 if (TARGET_AVX512F)
46569 gen = gen_avx512f_vec_dupv16sf_1;
46570 break;
46571 case E_V8SFmode:
46572 gen = gen_avx2_vec_dupv8sf_1;
46573 break;
46574 case E_V8DFmode:
46575 if (TARGET_AVX512F)
46576 gen = gen_avx512f_vec_dupv8df_1;
46577 break;
46578 case E_V8DImode:
46579 if (TARGET_AVX512F)
46580 gen = gen_avx512f_vec_dupv8di_1;
46581 break;
46582 /* For other modes, prefer the other shuffles this function creates. */
46583 default: break;
46585 if (gen != NULL)
46587 if (!d->testing_p)
46588 emit_insn (gen (d->target, d->op0));
46589 return true;
46593 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
46594 return true;
46596 /* There are plenty of patterns in sse.md that are written for
46597 SEL+CONCAT and are not replicated for a single op. Perhaps
46598 that should be changed, to avoid the nastiness here. */
46600 /* Recognize interleave style patterns, which means incrementing
46601 every other permutation operand. */
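/* E.g. the one-operand V4SImode permutation { 0, 0, 1, 1 } is rewritten as
   { 0, 4, 1, 5 }, which matches punpckldq of op0 with itself.  */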
46602 for (i = 0; i < nelt; i += 2)
46604 nd.perm[i] = d->perm[i] & mask;
46605 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
46607 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46608 d->testing_p))
46609 return true;
46611 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
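/* E.g. the one-operand V4SFmode permutation { 1, 3, 2, 0 } is rewritten as
   { 1, 3, 6, 4 }, i.e. a shufps selecting two elements from each copy of
   op0.  */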
46612 if (nelt >= 4)
46614 for (i = 0; i < nelt; i += 4)
46616 nd.perm[i + 0] = d->perm[i + 0] & mask;
46617 nd.perm[i + 1] = d->perm[i + 1] & mask;
46618 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
46619 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
46622 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46623 d->testing_p))
46624 return true;
46628 /* Finally, try the fully general two operand permute. */
46629 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
46630 d->testing_p))
46631 return true;
46633 /* Recognize interleave style patterns with reversed operands. */
46634 if (!d->one_operand_p)
46636 for (i = 0; i < nelt; ++i)
46638 unsigned e = d->perm[i];
46639 if (e >= nelt)
46640 e -= nelt;
46641 else
46642 e += nelt;
46643 nd.perm[i] = e;
46646 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
46647 d->testing_p))
46648 return true;
46651 /* Try the SSE4.1 blend variable merge instructions. */
46652 if (expand_vec_perm_blend (d))
46653 return true;
46655 /* Try one of the AVX vpermil variable permutations. */
46656 if (expand_vec_perm_vpermil (d))
46657 return true;
46659 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
46660 vpshufb, vpermd, vpermps or vpermq variable permutation. */
46661 if (expand_vec_perm_pshufb (d))
46662 return true;
46664 /* Try the AVX2 vpalignr instruction. */
46665 if (expand_vec_perm_palignr (d, true))
46666 return true;
46668 /* Try the AVX512F vperm{s,d} instructions. */
46669 if (ix86_expand_vec_one_operand_perm_avx512 (d))
46670 return true;
46672 /* Try the AVX512F vpermt2/vpermi2 instructions. */
46673 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
46674 return true;
46676 /* See if we can get the same permutation in different vector integer
46677 mode. */
46678 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
46680 if (!d->testing_p)
46681 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
46682 return true;
46684 return false;
46687 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46688 in terms of a pair of pshuflw + pshufhw instructions. */
46690 static bool
46691 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
46693 unsigned char perm2[MAX_VECT_LEN];
46694 unsigned i;
46695 bool ok;
46697 if (d->vmode != V8HImode || !d->one_operand_p)
46698 return false;
46700 /* The two permutations only operate in 64-bit lanes. */
46701 for (i = 0; i < 4; ++i)
46702 if (d->perm[i] >= 4)
46703 return false;
46704 for (i = 4; i < 8; ++i)
46705 if (d->perm[i] < 4)
46706 return false;
46708 if (d->testing_p)
46709 return true;
46711 /* Emit the pshuflw. */
46712 memcpy (perm2, d->perm, 4);
46713 for (i = 4; i < 8; ++i)
46714 perm2[i] = i;
46715 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
46716 gcc_assert (ok);
46718 /* Emit the pshufhw. */
46719 memcpy (perm2 + 4, d->perm + 4, 4);
46720 for (i = 0; i < 4; ++i)
46721 perm2[i] = i;
46722 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
46723 gcc_assert (ok);
46725 return true;
46728 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46729 the permutation using the SSSE3 palignr instruction. This succeeds
46730 when all of the elements in PERM fit within one vector and we merely
46731 need to shift them down so that a single vector permutation has a
46732 chance to succeed. If SINGLE_INSN_ONLY_P, succeed only if
46733 the vpalignr instruction itself can perform the requested permutation. */
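/* E.g. for two V16QImode operands, a permutation selecting bytes 3 ... 18
   of the concatenated pair spans a single 16-byte window, so a palignr by
   3 bytes brings everything into one vector; in that particular case the
   result is already in order and no further permutation is needed.  */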
46735 static bool
46736 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
46738 unsigned i, nelt = d->nelt;
46739 unsigned min, max, minswap, maxswap;
46740 bool in_order, ok, swap = false;
46741 rtx shift, target;
46742 struct expand_vec_perm_d dcopy;
46744 /* Even with AVX, palignr only operates on 128-bit vectors;
46745 with AVX2, palignr operates on both 128-bit lanes. */
46746 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46747 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
46748 return false;
46750 min = 2 * nelt;
46751 max = 0;
46752 minswap = 2 * nelt;
46753 maxswap = 0;
46754 for (i = 0; i < nelt; ++i)
46756 unsigned e = d->perm[i];
46757 unsigned eswap = d->perm[i] ^ nelt;
46758 if (GET_MODE_SIZE (d->vmode) == 32)
46760 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
46761 eswap = e ^ (nelt / 2);
46763 if (e < min)
46764 min = e;
46765 if (e > max)
46766 max = e;
46767 if (eswap < minswap)
46768 minswap = eswap;
46769 if (eswap > maxswap)
46770 maxswap = eswap;
46772 if (min == 0
46773 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
46775 if (d->one_operand_p
46776 || minswap == 0
46777 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
46778 ? nelt / 2 : nelt))
46779 return false;
46780 swap = true;
46781 min = minswap;
46782 max = maxswap;
46785 /* Given that we have SSSE3, we know we'll be able to implement the
46786 single operand permutation after the palignr with pshufb for
46787 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
46788 first. */
46789 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
46790 return true;
46792 dcopy = *d;
46793 if (swap)
46795 dcopy.op0 = d->op1;
46796 dcopy.op1 = d->op0;
46797 for (i = 0; i < nelt; ++i)
46798 dcopy.perm[i] ^= nelt;
46801 in_order = true;
46802 for (i = 0; i < nelt; ++i)
46804 unsigned e = dcopy.perm[i];
46805 if (GET_MODE_SIZE (d->vmode) == 32
46806 && e >= nelt
46807 && (e & (nelt / 2 - 1)) < min)
46808 e = e - min - (nelt / 2);
46809 else
46810 e = e - min;
46811 if (e != i)
46812 in_order = false;
46813 dcopy.perm[i] = e;
46815 dcopy.one_operand_p = true;
46817 if (single_insn_only_p && !in_order)
46818 return false;
46820 /* For AVX2, test whether we can permute the result in one instruction. */
46821 if (d->testing_p)
46823 if (in_order)
46824 return true;
46825 dcopy.op1 = dcopy.op0;
46826 return expand_vec_perm_1 (&dcopy);
46829 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
46830 if (GET_MODE_SIZE (d->vmode) == 16)
46832 target = gen_reg_rtx (TImode);
46833 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
46834 gen_lowpart (TImode, dcopy.op0), shift));
46836 else
46838 target = gen_reg_rtx (V2TImode);
46839 emit_insn (gen_avx2_palignrv2ti (target,
46840 gen_lowpart (V2TImode, dcopy.op1),
46841 gen_lowpart (V2TImode, dcopy.op0),
46842 shift));
46845 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
46847 /* Test for the degenerate case where the alignment by itself
46848 produces the desired permutation. */
46849 if (in_order)
46851 emit_move_insn (d->target, dcopy.op0);
46852 return true;
46855 ok = expand_vec_perm_1 (&dcopy);
46856 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
46858 return ok;
46861 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
46862 the permutation using the SSE4_1 pblendv instruction. Potentially
46863 reduces the permutation from 2 pshufb insns and an or to 1 pshufb and 1 pblendv. */
46865 static bool
46866 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
46868 unsigned i, which, nelt = d->nelt;
46869 struct expand_vec_perm_d dcopy, dcopy1;
46870 machine_mode vmode = d->vmode;
46871 bool ok;
46873 /* Use the same checks as in expand_vec_perm_blend. */
46874 if (d->one_operand_p)
46875 return false;
46876 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46878 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46880 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46882 else
46883 return false;
46885 /* Figure out which permutation elements do not stay in their
46886 respective lanes. */
46887 for (i = 0, which = 0; i < nelt; ++i)
46889 unsigned e = d->perm[i];
46890 if (e != i)
46891 which |= (e < nelt ? 1 : 2);
46893 /* We can pblend the part where elements do not stay in their
46894 respective lanes only when these displaced elements all come
46895 from the same half of the permutation.
46896 {0 1 8 3 4 5 9 7} is ok: the displaced elements are 8 and 9,
46897 and both are >= 8.
46898 {0 1 8 3 4 5 2 7} is not ok: the displaced elements are 2 and 8,
46899 and 8 >= 8 but 2 is not. */
46900 if (which != 1 && which != 2)
46901 return false;
46902 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
46903 return true;
46905 /* First we apply a one-operand permutation to the part whose
46906 elements do not stay in their respective lanes. */
46907 dcopy = *d;
46908 if (which == 2)
46909 dcopy.op0 = dcopy.op1 = d->op1;
46910 else
46911 dcopy.op0 = dcopy.op1 = d->op0;
46912 if (!d->testing_p)
46913 dcopy.target = gen_reg_rtx (vmode);
46914 dcopy.one_operand_p = true;
46916 for (i = 0; i < nelt; ++i)
46917 dcopy.perm[i] = d->perm[i] & (nelt - 1);
46919 ok = expand_vec_perm_1 (&dcopy);
46920 if (GET_MODE_SIZE (vmode) != 16 && !ok)
46921 return false;
46922 else
46923 gcc_assert (ok);
46924 if (d->testing_p)
46925 return true;
46927 /* Next we put permuted elements into their positions. */
46928 dcopy1 = *d;
46929 if (which == 2)
46930 dcopy1.op1 = dcopy.target;
46931 else
46932 dcopy1.op0 = dcopy.target;
46934 for (i = 0; i < nelt; ++i)
46935 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
46937 ok = expand_vec_perm_blend (&dcopy1);
46938 gcc_assert (ok);
46940 return true;
46943 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
46945 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46946 a two vector permutation into a single vector permutation by using
46947 an interleave operation to merge the vectors. */
46949 static bool
46950 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
46952 struct expand_vec_perm_d dremap, dfinal;
46953 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46954 unsigned HOST_WIDE_INT contents;
46955 unsigned char remap[2 * MAX_VECT_LEN];
46956 rtx_insn *seq;
46957 bool ok, same_halves = false;
46959 if (GET_MODE_SIZE (d->vmode) == 16)
46961 if (d->one_operand_p)
46962 return false;
46964 else if (GET_MODE_SIZE (d->vmode) == 32)
46966 if (!TARGET_AVX)
46967 return false;
46968 /* For 32-byte modes allow even d->one_operand_p.
46969 The lack of cross-lane shuffling in some instructions
46970 might prevent a single insn shuffle. */
46971 dfinal = *d;
46972 dfinal.testing_p = true;
46973 /* If expand_vec_perm_interleave3 can expand this into
46974 a 3 insn sequence, give up and let it be expanded as
46975 a 3 insn sequence. While that is one insn longer,
46976 it doesn't need a memory operand, and in the common
46977 case that both the interleave-low and interleave-high
46978 permutations with the same operands are adjacent it
46979 needs 4 insns for both after CSE. */
46980 if (expand_vec_perm_interleave3 (&dfinal))
46981 return false;
46983 else
46984 return false;
46986 /* Examine from whence the elements come. */
46987 contents = 0;
46988 for (i = 0; i < nelt; ++i)
46989 contents |= HOST_WIDE_INT_1U << d->perm[i];
46991 memset (remap, 0xff, sizeof (remap));
46992 dremap = *d;
46994 if (GET_MODE_SIZE (d->vmode) == 16)
46996 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46998 /* Split the two input vectors into 4 halves. */
46999 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47000 h2 = h1 << nelt2;
47001 h3 = h2 << nelt2;
47002 h4 = h3 << nelt2;
47004 /* If the elements all come from the low halves, use interleave low,
47005 and similarly interleave high for the high halves. If the elements
47006 come from mismatched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
47007 if ((contents & (h1 | h3)) == contents)
47009 /* punpckl* */
47010 for (i = 0; i < nelt2; ++i)
47012 remap[i] = i * 2;
47013 remap[i + nelt] = i * 2 + 1;
47014 dremap.perm[i * 2] = i;
47015 dremap.perm[i * 2 + 1] = i + nelt;
47017 if (!TARGET_SSE2 && d->vmode == V4SImode)
47018 dremap.vmode = V4SFmode;
47020 else if ((contents & (h2 | h4)) == contents)
47022 /* punpckh* */
47023 for (i = 0; i < nelt2; ++i)
47025 remap[i + nelt2] = i * 2;
47026 remap[i + nelt + nelt2] = i * 2 + 1;
47027 dremap.perm[i * 2] = i + nelt2;
47028 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47030 if (!TARGET_SSE2 && d->vmode == V4SImode)
47031 dremap.vmode = V4SFmode;
47033 else if ((contents & (h1 | h4)) == contents)
47035 /* shufps */
47036 for (i = 0; i < nelt2; ++i)
47038 remap[i] = i;
47039 remap[i + nelt + nelt2] = i + nelt2;
47040 dremap.perm[i] = i;
47041 dremap.perm[i + nelt2] = i + nelt + nelt2;
47043 if (nelt != 4)
47045 /* shufpd */
47046 dremap.vmode = V2DImode;
47047 dremap.nelt = 2;
47048 dremap.perm[0] = 0;
47049 dremap.perm[1] = 3;
47052 else if ((contents & (h2 | h3)) == contents)
47054 /* shufps */
47055 for (i = 0; i < nelt2; ++i)
47057 remap[i + nelt2] = i;
47058 remap[i + nelt] = i + nelt2;
47059 dremap.perm[i] = i + nelt2;
47060 dremap.perm[i + nelt2] = i + nelt;
47062 if (nelt != 4)
47064 /* shufpd */
47065 dremap.vmode = V2DImode;
47066 dremap.nelt = 2;
47067 dremap.perm[0] = 1;
47068 dremap.perm[1] = 2;
47071 else
47072 return false;
47074 else
47076 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47077 unsigned HOST_WIDE_INT q[8];
47078 unsigned int nonzero_halves[4];
47080 /* Split the two input vectors into 8 quarters. */
47081 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47082 for (i = 1; i < 8; ++i)
47083 q[i] = q[0] << (nelt4 * i);
47084 for (i = 0; i < 4; ++i)
47085 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47087 nonzero_halves[nzcnt] = i;
47088 ++nzcnt;
47091 if (nzcnt == 1)
47093 gcc_assert (d->one_operand_p);
47094 nonzero_halves[1] = nonzero_halves[0];
47095 same_halves = true;
47097 else if (d->one_operand_p)
47099 gcc_assert (nonzero_halves[0] == 0);
47100 gcc_assert (nonzero_halves[1] == 1);
47103 if (nzcnt <= 2)
47105 if (d->perm[0] / nelt2 == nonzero_halves[1])
47107 /* Attempt to increase the likelihood that dfinal
47108 shuffle will be intra-lane. */
47109 std::swap (nonzero_halves[0], nonzero_halves[1]);
47112 /* vperm2f128 or vperm2i128. */
47113 for (i = 0; i < nelt2; ++i)
47115 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47116 remap[i + nonzero_halves[0] * nelt2] = i;
47117 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47118 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47121 if (d->vmode != V8SFmode
47122 && d->vmode != V4DFmode
47123 && d->vmode != V8SImode)
47125 dremap.vmode = V8SImode;
47126 dremap.nelt = 8;
47127 for (i = 0; i < 4; ++i)
47129 dremap.perm[i] = i + nonzero_halves[0] * 4;
47130 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47134 else if (d->one_operand_p)
47135 return false;
47136 else if (TARGET_AVX2
47137 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47139 /* vpunpckl* */
47140 for (i = 0; i < nelt4; ++i)
47142 remap[i] = i * 2;
47143 remap[i + nelt] = i * 2 + 1;
47144 remap[i + nelt2] = i * 2 + nelt2;
47145 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47146 dremap.perm[i * 2] = i;
47147 dremap.perm[i * 2 + 1] = i + nelt;
47148 dremap.perm[i * 2 + nelt2] = i + nelt2;
47149 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47152 else if (TARGET_AVX2
47153 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47155 /* vpunpckh* */
47156 for (i = 0; i < nelt4; ++i)
47158 remap[i + nelt4] = i * 2;
47159 remap[i + nelt + nelt4] = i * 2 + 1;
47160 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47161 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47162 dremap.perm[i * 2] = i + nelt4;
47163 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47164 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47165 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47168 else
47169 return false;
47172 /* Use the remapping array set up above to move the elements from their
47173 swizzled locations into their final destinations. */
47174 dfinal = *d;
47175 for (i = 0; i < nelt; ++i)
47177 unsigned e = remap[d->perm[i]];
47178 gcc_assert (e < nelt);
47179 /* If same_halves is true, both halves of the remapped vector are the
47180 same. Avoid cross-lane accesses if possible. */
47181 if (same_halves && i >= nelt2)
47183 gcc_assert (e < nelt2);
47184 dfinal.perm[i] = e + nelt2;
47186 else
47187 dfinal.perm[i] = e;
47189 if (!d->testing_p)
47191 dremap.target = gen_reg_rtx (dremap.vmode);
47192 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47194 dfinal.op1 = dfinal.op0;
47195 dfinal.one_operand_p = true;
47197 /* Test if the final remap can be done with a single insn. For V4SFmode or
47198 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47199 start_sequence ();
47200 ok = expand_vec_perm_1 (&dfinal);
47201 seq = get_insns ();
47202 end_sequence ();
47204 if (!ok)
47205 return false;
47207 if (d->testing_p)
47208 return true;
47210 if (dremap.vmode != dfinal.vmode)
47212 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47213 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47216 ok = expand_vec_perm_1 (&dremap);
47217 gcc_assert (ok);
47219 emit_insn (seq);
47220 return true;
47223 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47224 a single vector cross-lane permutation into vpermq followed
47225 by any of the single insn permutations. */
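/* This works when each 128-bit half of the result draws from at most two
   of the four 64-bit quarters of the input: vpermq first gathers those
   quarters into the right lane, and an in-lane shuffle finishes the job.  */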
47227 static bool
47228 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47230 struct expand_vec_perm_d dremap, dfinal;
47231 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47232 unsigned contents[2];
47233 bool ok;
47235 if (!(TARGET_AVX2
47236 && (d->vmode == V32QImode || d->vmode == V16HImode)
47237 && d->one_operand_p))
47238 return false;
47240 contents[0] = 0;
47241 contents[1] = 0;
47242 for (i = 0; i < nelt2; ++i)
47244 contents[0] |= 1u << (d->perm[i] / nelt4);
47245 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
47248 for (i = 0; i < 2; ++i)
47250 unsigned int cnt = 0;
47251 for (j = 0; j < 4; ++j)
47252 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47253 return false;
47256 if (d->testing_p)
47257 return true;
47259 dremap = *d;
47260 dremap.vmode = V4DImode;
47261 dremap.nelt = 4;
47262 dremap.target = gen_reg_rtx (V4DImode);
47263 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47264 dremap.op1 = dremap.op0;
47265 dremap.one_operand_p = true;
47266 for (i = 0; i < 2; ++i)
47268 unsigned int cnt = 0;
47269 for (j = 0; j < 4; ++j)
47270 if ((contents[i] & (1u << j)) != 0)
47271 dremap.perm[2 * i + cnt++] = j;
47272 for (; cnt < 2; ++cnt)
47273 dremap.perm[2 * i + cnt] = 0;
47276 dfinal = *d;
47277 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47278 dfinal.op1 = dfinal.op0;
47279 dfinal.one_operand_p = true;
47280 for (i = 0, j = 0; i < nelt; ++i)
47282 if (i == nelt2)
47283 j = 2;
47284 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47285 if ((d->perm[i] / nelt4) == dremap.perm[j])
47287 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47288 dfinal.perm[i] |= nelt4;
47289 else
47290 gcc_unreachable ();
47293 ok = expand_vec_perm_1 (&dremap);
47294 gcc_assert (ok);
47296 ok = expand_vec_perm_1 (&dfinal);
47297 gcc_assert (ok);
47299 return true;
47302 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
47303 a vector permutation using two instructions: vperm2f128 (or
47304 vperm2i128) followed by any single in-lane permutation. */
47306 static bool
47307 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47309 struct expand_vec_perm_d dfirst, dsecond;
47310 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47311 bool ok;
47313 if (!TARGET_AVX
47314 || GET_MODE_SIZE (d->vmode) != 32
47315 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47316 return false;
47318 dsecond = *d;
47319 dsecond.one_operand_p = false;
47320 dsecond.testing_p = true;
47322 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47323 immediate. For perm < 16 the second permutation uses
47324 d->op0 as first operand, for perm >= 16 it uses d->op1
47325 as first operand. The second operand is the result of
47326 vperm2[fi]128. */
47327 for (perm = 0; perm < 32; perm++)
47329 /* Ignore permutations which do not move anything cross-lane. */
47330 if (perm < 16)
47332 /* The second shuffle for e.g. V4DFmode has
47333 0123 and ABCD operands.
47334 Ignore AB23, as 23 is already in the second lane
47335 of the first operand. */
47336 if ((perm & 0xc) == (1 << 2)) continue;
47337 /* And 01CD, as 01 is in the first lane of the first
47338 operand. */
47339 if ((perm & 3) == 0) continue;
47340 /* And 4567, as then the vperm2[fi]128 doesn't change
47341 anything on the original 4567 second operand. */
47342 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47344 else
47346 /* The second shuffle for e.g. V4DFmode has
47347 4567 and ABCD operands.
47348 Ignore AB67, as 67 is already in the second lane
47349 of the first operand. */
47350 if ((perm & 0xc) == (3 << 2)) continue;
47351 /* And 45CD, as 45 is in the first lane of the first
47352 operand. */
47353 if ((perm & 3) == 2) continue;
47354 /* And 0123, as then the vperm2[fi]128 doesn't change
47355 anything on the original 0123 first operand. */
47356 if ((perm & 0xf) == (1 << 2)) continue;
47359 for (i = 0; i < nelt; i++)
47361 j = d->perm[i] / nelt2;
47362 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47363 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47364 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47365 dsecond.perm[i] = d->perm[i] & (nelt - 1);
47366 else
47367 break;
47370 if (i == nelt)
47372 start_sequence ();
47373 ok = expand_vec_perm_1 (&dsecond);
47374 end_sequence ();
47376 else
47377 ok = false;
47379 if (ok)
47381 if (d->testing_p)
47382 return true;
47384 /* Found a usable second shuffle. dfirst will be
47385 vperm2f128 on d->op0 and d->op1. */
47386 dsecond.testing_p = false;
47387 dfirst = *d;
47388 dfirst.target = gen_reg_rtx (d->vmode);
47389 for (i = 0; i < nelt; i++)
47390 dfirst.perm[i] = (i & (nelt2 - 1))
47391 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47393 canonicalize_perm (&dfirst);
47394 ok = expand_vec_perm_1 (&dfirst);
47395 gcc_assert (ok);
47397 /* And dsecond is some single insn shuffle, taking
47398 d->op0 and result of vperm2f128 (if perm < 16) or
47399 d->op1 and result of vperm2f128 (otherwise). */
47400 if (perm >= 16)
47401 dsecond.op0 = dsecond.op1;
47402 dsecond.op1 = dfirst.target;
47404 ok = expand_vec_perm_1 (&dsecond);
47405 gcc_assert (ok);
47407 return true;
47410 /* For one operand, the only useful vperm2f128 permutation is 0x01,
47411 i.e. a swap of the two 128-bit lanes. */
47412 if (d->one_operand_p)
47413 return false;
47416 return false;
47419 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47420 a two vector permutation using 2 intra-lane interleave insns
47421 and cross-lane shuffle for 32-byte vectors. */
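/* Only interleaving permutations qualify, i.e. { 0, nelt, 1, nelt + 1, ... }
   (interleave low) or { nelt/2, nelt + nelt/2, nelt/2 + 1, ... }
   (interleave high), as checked below.  */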
47423 static bool
47424 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47426 unsigned i, nelt;
47427 rtx (*gen) (rtx, rtx, rtx);
47429 if (d->one_operand_p)
47430 return false;
47431 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47433 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47435 else
47436 return false;
47438 nelt = d->nelt;
47439 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47440 return false;
47441 for (i = 0; i < nelt; i += 2)
47442 if (d->perm[i] != d->perm[0] + i / 2
47443 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47444 return false;
47446 if (d->testing_p)
47447 return true;
47449 switch (d->vmode)
47451 case E_V32QImode:
47452 if (d->perm[0])
47453 gen = gen_vec_interleave_highv32qi;
47454 else
47455 gen = gen_vec_interleave_lowv32qi;
47456 break;
47457 case E_V16HImode:
47458 if (d->perm[0])
47459 gen = gen_vec_interleave_highv16hi;
47460 else
47461 gen = gen_vec_interleave_lowv16hi;
47462 break;
47463 case E_V8SImode:
47464 if (d->perm[0])
47465 gen = gen_vec_interleave_highv8si;
47466 else
47467 gen = gen_vec_interleave_lowv8si;
47468 break;
47469 case E_V4DImode:
47470 if (d->perm[0])
47471 gen = gen_vec_interleave_highv4di;
47472 else
47473 gen = gen_vec_interleave_lowv4di;
47474 break;
47475 case E_V8SFmode:
47476 if (d->perm[0])
47477 gen = gen_vec_interleave_highv8sf;
47478 else
47479 gen = gen_vec_interleave_lowv8sf;
47480 break;
47481 case E_V4DFmode:
47482 if (d->perm[0])
47483 gen = gen_vec_interleave_highv4df;
47484 else
47485 gen = gen_vec_interleave_lowv4df;
47486 break;
47487 default:
47488 gcc_unreachable ();
47491 emit_insn (gen (d->target, d->op0, d->op1));
47492 return true;
47495 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
47496 a single vector permutation using a single intra-lane vector
47497 permutation, vperm2f128 swapping the lanes and vblend* insn blending
47498 the non-swapped and swapped vectors together. */
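/* E.g. the one-operand V4DFmode permutation { 2, 1, 0, 3 }: the intra-lane
   shuffle is the identity, the vperm2f128 lane swap yields { 2, 3, 0, 1 },
   and blendpd with mask 0x5 takes elements 0 and 2 from the swapped vector,
   giving { 2, 1, 0, 3 }.  */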
47500 static bool
47501 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47503 struct expand_vec_perm_d dfirst, dsecond;
47504 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47505 rtx_insn *seq;
47506 bool ok;
47507 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47509 if (!TARGET_AVX
47510 || TARGET_AVX2
47511 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47512 || !d->one_operand_p)
47513 return false;
47515 dfirst = *d;
47516 for (i = 0; i < nelt; i++)
47517 dfirst.perm[i] = 0xff;
47518 for (i = 0, msk = 0; i < nelt; i++)
47520 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
47521 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47522 return false;
47523 dfirst.perm[j] = d->perm[i];
47524 if (j != i)
47525 msk |= (1 << i);
47527 for (i = 0; i < nelt; i++)
47528 if (dfirst.perm[i] == 0xff)
47529 dfirst.perm[i] = i;
47531 if (!d->testing_p)
47532 dfirst.target = gen_reg_rtx (dfirst.vmode);
47534 start_sequence ();
47535 ok = expand_vec_perm_1 (&dfirst);
47536 seq = get_insns ();
47537 end_sequence ();
47539 if (!ok)
47540 return false;
47542 if (d->testing_p)
47543 return true;
47545 emit_insn (seq);
47547 dsecond = *d;
47548 dsecond.op0 = dfirst.target;
47549 dsecond.op1 = dfirst.target;
47550 dsecond.one_operand_p = true;
47551 dsecond.target = gen_reg_rtx (dsecond.vmode);
47552 for (i = 0; i < nelt; i++)
47553 dsecond.perm[i] = i ^ nelt2;
47555 ok = expand_vec_perm_1 (&dsecond);
47556 gcc_assert (ok);
47558 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47559 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47560 return true;
47563 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
47564 permutation using two vperm2f128, followed by a vshufpd insn blending
47565 the two vectors together. */
47567 static bool
47568 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47570 struct expand_vec_perm_d dfirst, dsecond, dthird;
47571 bool ok;
47573 if (!TARGET_AVX || (d->vmode != V4DFmode))
47574 return false;
47576 if (d->testing_p)
47577 return true;
47579 dfirst = *d;
47580 dsecond = *d;
47581 dthird = *d;
47583 dfirst.perm[0] = (d->perm[0] & ~1);
47584 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
47585 dfirst.perm[2] = (d->perm[2] & ~1);
47586 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
47587 dsecond.perm[0] = (d->perm[1] & ~1);
47588 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
47589 dsecond.perm[2] = (d->perm[3] & ~1);
47590 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
47591 dthird.perm[0] = (d->perm[0] % 2);
47592 dthird.perm[1] = (d->perm[1] % 2) + 4;
47593 dthird.perm[2] = (d->perm[2] % 2) + 2;
47594 dthird.perm[3] = (d->perm[3] % 2) + 6;
47596 dfirst.target = gen_reg_rtx (dfirst.vmode);
47597 dsecond.target = gen_reg_rtx (dsecond.vmode);
47598 dthird.op0 = dfirst.target;
47599 dthird.op1 = dsecond.target;
47600 dthird.one_operand_p = false;
47602 canonicalize_perm (&dfirst);
47603 canonicalize_perm (&dsecond);
47605 ok = expand_vec_perm_1 (&dfirst)
47606 && expand_vec_perm_1 (&dsecond)
47607 && expand_vec_perm_1 (&dthird);
47609 gcc_assert (ok);
47611 return true;
47614 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
47615 permutation with two pshufb insns and an ior. We should have already
47616 failed all two instruction sequences. */
47618 static bool
47619 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
47621 rtx rperm[2][16], vperm, l, h, op, m128;
47622 unsigned int i, nelt, eltsz;
47624 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47625 return false;
47626 gcc_assert (!d->one_operand_p);
47628 if (d->testing_p)
47629 return true;
47631 nelt = d->nelt;
47632 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47634 /* Generate two permutation masks. If the required element is within
47635 the given vector it is shuffled into the proper lane. If the required
47636 element is in the other vector, force a zero into the lane by setting
47637 bit 7 in the permutation mask. */
47638 m128 = GEN_INT (-128);
47639 for (i = 0; i < nelt; ++i)
47641 unsigned j, e = d->perm[i];
47642 unsigned which = (e >= nelt);
47643 if (e >= nelt)
47644 e -= nelt;
47646 for (j = 0; j < eltsz; ++j)
47648 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
47649 rperm[1-which][i*eltsz + j] = m128;
47653 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
47654 vperm = force_reg (V16QImode, vperm);
47656 l = gen_reg_rtx (V16QImode);
47657 op = gen_lowpart (V16QImode, d->op0);
47658 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
47660 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
47661 vperm = force_reg (V16QImode, vperm);
47663 h = gen_reg_rtx (V16QImode);
47664 op = gen_lowpart (V16QImode, d->op1);
47665 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
47667 op = d->target;
47668 if (d->vmode != V16QImode)
47669 op = gen_reg_rtx (V16QImode);
47670 emit_insn (gen_iorv16qi3 (op, l, h));
47671 if (op != d->target)
47672 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47674 return true;
47677 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
47678 with two vpshufb insns, vpermq and vpor. We should have already failed
47679 all two or three instruction sequences. */
47681 static bool
47682 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
47684 rtx rperm[2][32], vperm, l, h, hp, op, m128;
47685 unsigned int i, nelt, eltsz;
47687 if (!TARGET_AVX2
47688 || !d->one_operand_p
47689 || (d->vmode != V32QImode && d->vmode != V16HImode))
47690 return false;
47692 if (d->testing_p)
47693 return true;
47695 nelt = d->nelt;
47696 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47698 /* Generate two permutation masks. If the required element is within
47699 the same lane, it is shuffled in. If the required element comes from
47700 the other lane, force a zero by setting bit 7 in the permutation mask.
47701 The other mask has a non-negative element wherever an element is
47702 requested from the other lane, but that element is also placed in the
47703 other lane, so that the result of vpshufb can have its two V2TImode
47704 halves swapped. */
47705 m128 = GEN_INT (-128);
47706 for (i = 0; i < nelt; ++i)
47708 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47709 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47711 for (j = 0; j < eltsz; ++j)
47713 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
47714 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
47718 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47719 vperm = force_reg (V32QImode, vperm);
47721 h = gen_reg_rtx (V32QImode);
47722 op = gen_lowpart (V32QImode, d->op0);
47723 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47725 /* Swap the 128-bit lanes of h into hp. */
47726 hp = gen_reg_rtx (V4DImode);
47727 op = gen_lowpart (V4DImode, h);
47728 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
47729 const1_rtx));
47731 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47732 vperm = force_reg (V32QImode, vperm);
47734 l = gen_reg_rtx (V32QImode);
47735 op = gen_lowpart (V32QImode, d->op0);
47736 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47738 op = d->target;
47739 if (d->vmode != V32QImode)
47740 op = gen_reg_rtx (V32QImode);
47741 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
47742 if (op != d->target)
47743 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47745 return true;
47748 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47749 and extract-odd permutations of two V32QImode or V16HImode operands
47750 with two vpshufb insns, vpor and vpermq. We should have already
47751 failed all two or three instruction sequences. */
47753 static bool
47754 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
47756 rtx rperm[2][32], vperm, l, h, ior, op, m128;
47757 unsigned int i, nelt, eltsz;
47759 if (!TARGET_AVX2
47760 || d->one_operand_p
47761 || (d->vmode != V32QImode && d->vmode != V16HImode))
47762 return false;
47764 for (i = 0; i < d->nelt; ++i)
47765 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
47766 return false;
47768 if (d->testing_p)
47769 return true;
47771 nelt = d->nelt;
47772 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47774 /* Generate two permutation masks. In the first permutation mask
47775 the first quarter will contain indexes for the first half
47776 of the op0, the second quarter will contain bit 7 set, third quarter
47777 will contain indexes for the second half of the op0 and the
47778 last quarter bit 7 set. In the second permutation mask
47779 the first quarter will contain bit 7 set, the second quarter
47780 indexes for the first half of the op1, the third quarter bit 7 set
47781 and last quarter indexes for the second half of the op1.
47782 I.e. the first mask e.g. for V32QImode extract even will be:
47783 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
47784 (all values masked with 0xf except for -128) and second mask
47785 for extract even will be
47786 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
47787 m128 = GEN_INT (-128);
47788 for (i = 0; i < nelt; ++i)
47790 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47791 unsigned which = d->perm[i] >= nelt;
47792 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
47794 for (j = 0; j < eltsz; ++j)
47796 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
47797 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
47801 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47802 vperm = force_reg (V32QImode, vperm);
47804 l = gen_reg_rtx (V32QImode);
47805 op = gen_lowpart (V32QImode, d->op0);
47806 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
47808 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47809 vperm = force_reg (V32QImode, vperm);
47811 h = gen_reg_rtx (V32QImode);
47812 op = gen_lowpart (V32QImode, d->op1);
47813 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47815 ior = gen_reg_rtx (V32QImode);
47816 emit_insn (gen_iorv32qi3 (ior, l, h));
47818 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
47819 op = gen_reg_rtx (V4DImode);
47820 ior = gen_lowpart (V4DImode, ior);
47821 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
47822 const1_rtx, GEN_INT (3)));
47823 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47825 return true;
47828 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47829 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
47830 with two "and" and "pack" or two "shift" and "pack" insns. We should
47831 have already failed all two instruction sequences. */
47833 static bool
47834 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
47836 rtx op, dop0, dop1, t;
47837 unsigned i, odd, c, s, nelt = d->nelt;
47838 bool end_perm = false;
47839 machine_mode half_mode;
47840 rtx (*gen_and) (rtx, rtx, rtx);
47841 rtx (*gen_pack) (rtx, rtx, rtx);
47842 rtx (*gen_shift) (rtx, rtx, rtx);
47844 if (d->one_operand_p)
47845 return false;
47847 switch (d->vmode)
47849 case E_V8HImode:
47850 /* Required for "pack". */
47851 if (!TARGET_SSE4_1)
47852 return false;
47853 c = 0xffff;
47854 s = 16;
47855 half_mode = V4SImode;
47856 gen_and = gen_andv4si3;
47857 gen_pack = gen_sse4_1_packusdw;
47858 gen_shift = gen_lshrv4si3;
47859 break;
47860 case E_V16QImode:
47861 /* No check as all instructions are SSE2. */
47862 c = 0xff;
47863 s = 8;
47864 half_mode = V8HImode;
47865 gen_and = gen_andv8hi3;
47866 gen_pack = gen_sse2_packuswb;
47867 gen_shift = gen_lshrv8hi3;
47868 break;
47869 case E_V16HImode:
47870 if (!TARGET_AVX2)
47871 return false;
47872 c = 0xffff;
47873 s = 16;
47874 half_mode = V8SImode;
47875 gen_and = gen_andv8si3;
47876 gen_pack = gen_avx2_packusdw;
47877 gen_shift = gen_lshrv8si3;
47878 end_perm = true;
47879 break;
47880 case E_V32QImode:
47881 if (!TARGET_AVX2)
47882 return false;
47883 c = 0xff;
47884 s = 8;
47885 half_mode = V16HImode;
47886 gen_and = gen_andv16hi3;
47887 gen_pack = gen_avx2_packuswb;
47888 gen_shift = gen_lshrv16hi3;
47889 end_perm = true;
47890 break;
47891 default:
47892 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
47893 general shuffles. */
47894 return false;
47897 /* Check that permutation is even or odd. */
47898 odd = d->perm[0];
47899 if (odd > 1)
47900 return false;
47902 for (i = 1; i < nelt; ++i)
47903 if (d->perm[i] != 2 * i + odd)
47904 return false;
47906 if (d->testing_p)
47907 return true;
47909 dop0 = gen_reg_rtx (half_mode);
47910 dop1 = gen_reg_rtx (half_mode);
47911 if (odd == 0)
47913 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
47914 t = force_reg (half_mode, t);
47915 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
47916 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
47918 else
47920 emit_insn (gen_shift (dop0,
47921 gen_lowpart (half_mode, d->op0),
47922 GEN_INT (s)));
47923 emit_insn (gen_shift (dop1,
47924 gen_lowpart (half_mode, d->op1),
47925 GEN_INT (s)));
47927 /* For the AVX2 256-bit case we need to permute the pack result. */
47928 if (TARGET_AVX2 && end_perm)
47930 op = gen_reg_rtx (d->vmode);
47931 t = gen_reg_rtx (V4DImode);
47932 emit_insn (gen_pack (op, dop0, dop1));
47933 emit_insn (gen_avx2_permv4di_1 (t,
47934 gen_lowpart (V4DImode, op),
47935 const0_rtx,
47936 const2_rtx,
47937 const1_rtx,
47938 GEN_INT (3)));
47939 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
47941 else
47942 emit_insn (gen_pack (d->target, dop0, dop1));
47944 return true;
47947 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
47948 and extract-odd permutations of two V64QI operands
47949 with two "shifts", two "truncs" and one "concat" insns for "odd"
47950 and two "truncs" and one "concat" insn for "even".
47951 We should have already failed all two instruction sequences. */
47953 static bool
47954 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47956 rtx t1, t2, t3, t4;
47957 unsigned i, odd, nelt = d->nelt;
47959 if (!TARGET_AVX512BW
47960 || d->one_operand_p
47961 || d->vmode != V64QImode)
47962 return false;
47964 /* Check that permutation is even or odd. */
47965 odd = d->perm[0];
47966 if (odd > 1)
47967 return false;
47969 for (i = 1; i < nelt; ++i)
47970 if (d->perm[i] != 2 * i + odd)
47971 return false;
47973 if (d->testing_p)
47974 return true;
47977 if (odd)
47979 t1 = gen_reg_rtx (V32HImode);
47980 t2 = gen_reg_rtx (V32HImode);
47981 emit_insn (gen_lshrv32hi3 (t1,
47982 gen_lowpart (V32HImode, d->op0),
47983 GEN_INT (8)));
47984 emit_insn (gen_lshrv32hi3 (t2,
47985 gen_lowpart (V32HImode, d->op1),
47986 GEN_INT (8)));
47988 else
47990 t1 = gen_lowpart (V32HImode, d->op0);
47991 t2 = gen_lowpart (V32HImode, d->op1);
47994 t3 = gen_reg_rtx (V32QImode);
47995 t4 = gen_reg_rtx (V32QImode);
47996 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47997 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47998 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48000 return true;
48003 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48004 and extract-odd permutations. */
48006 static bool
48007 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48009 rtx t1, t2, t3, t4, t5;
48011 switch (d->vmode)
48013 case E_V4DFmode:
48014 if (d->testing_p)
48015 break;
48016 t1 = gen_reg_rtx (V4DFmode);
48017 t2 = gen_reg_rtx (V4DFmode);
48019 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48020 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48021 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48023 /* Now an unpck[lh]pd will produce the result required. */
48024 if (odd)
48025 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48026 else
48027 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48028 emit_insn (t3);
48029 break;
48031 case E_V8SFmode:
48033 int mask = odd ? 0xdd : 0x88;
48035 if (d->testing_p)
48036 break;
48037 t1 = gen_reg_rtx (V8SFmode);
48038 t2 = gen_reg_rtx (V8SFmode);
48039 t3 = gen_reg_rtx (V8SFmode);
48041 /* Shuffle within the 128-bit lanes to produce:
48042 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48043 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48044 GEN_INT (mask)));
48046 /* Shuffle the lanes around to produce:
48047 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48048 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48049 GEN_INT (0x3)));
48051 /* Shuffle within the 128-bit lanes to produce:
48052 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48053 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48055 /* Shuffle within the 128-bit lanes to produce:
48056 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48057 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48059 /* Shuffle the lanes around to produce:
48060 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48061 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48062 GEN_INT (0x20)));
48064 break;
48066 case E_V2DFmode:
48067 case E_V4SFmode:
48068 case E_V2DImode:
48069 case E_V4SImode:
48070 /* These are always directly implementable by expand_vec_perm_1. */
48071 gcc_unreachable ();
48073 case E_V8HImode:
48074 if (TARGET_SSE4_1)
48075 return expand_vec_perm_even_odd_pack (d);
48076 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48077 return expand_vec_perm_pshufb2 (d);
48078 else
48080 if (d->testing_p)
48081 break;
48082 /* We need 2*log2(N)-1 operations to achieve odd/even
48083 with interleave. */
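/* Added note (illustration): for V8HImode, N == 8, so 2*log2(8)-1 == 5
   interleave insns are needed, matching the five gen_vec_interleave_*
   calls emitted below.  */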
48084 t1 = gen_reg_rtx (V8HImode);
48085 t2 = gen_reg_rtx (V8HImode);
48086 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48087 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48088 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48089 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48090 if (odd)
48091 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48092 else
48093 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48094 emit_insn (t3);
48096 break;
48098 case E_V16QImode:
48099 return expand_vec_perm_even_odd_pack (d);
48101 case E_V16HImode:
48102 case E_V32QImode:
48103 return expand_vec_perm_even_odd_pack (d);
48105 case E_V64QImode:
48106 return expand_vec_perm_even_odd_trunc (d);
48108 case E_V4DImode:
48109 if (!TARGET_AVX2)
48111 struct expand_vec_perm_d d_copy = *d;
48112 d_copy.vmode = V4DFmode;
48113 if (d->testing_p)
48114 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48115 else
48116 d_copy.target = gen_reg_rtx (V4DFmode);
48117 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48118 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48119 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48121 if (!d->testing_p)
48122 emit_move_insn (d->target,
48123 gen_lowpart (V4DImode, d_copy.target));
48124 return true;
48126 return false;
48129 if (d->testing_p)
48130 break;
48132 t1 = gen_reg_rtx (V4DImode);
48133 t2 = gen_reg_rtx (V4DImode);
48135 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48136 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48137 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48139 /* Now a vpunpck[lh]qdq will produce the result required. */
48140 if (odd)
48141 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48142 else
48143 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48144 emit_insn (t3);
48145 break;
48147 case E_V8SImode:
48148 if (!TARGET_AVX2)
48150 struct expand_vec_perm_d d_copy = *d;
48151 d_copy.vmode = V8SFmode;
48152 if (d->testing_p)
48153 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48154 else
48155 d_copy.target = gen_reg_rtx (V8SFmode);
48156 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48157 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48158 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48160 if (!d->testing_p)
48161 emit_move_insn (d->target,
48162 gen_lowpart (V8SImode, d_copy.target));
48163 return true;
48165 return false;
48168 if (d->testing_p)
48169 break;
48171 t1 = gen_reg_rtx (V8SImode);
48172 t2 = gen_reg_rtx (V8SImode);
48173 t3 = gen_reg_rtx (V4DImode);
48174 t4 = gen_reg_rtx (V4DImode);
48175 t5 = gen_reg_rtx (V4DImode);
48177 /* Shuffle the lanes around into
48178 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48179 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48180 gen_lowpart (V4DImode, d->op1),
48181 GEN_INT (0x20)));
48182 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48183 gen_lowpart (V4DImode, d->op1),
48184 GEN_INT (0x31)));
48186 /* Swap the 2nd and 3rd position in each lane into
48187 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48188 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48189 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48190 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48191 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48193 /* Now a vpunpck[lh]qdq will produce
48194 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48195 if (odd)
48196 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48197 gen_lowpart (V4DImode, t2));
48198 else
48199 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48200 gen_lowpart (V4DImode, t2));
48201 emit_insn (t3);
48202 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48203 break;
48205 default:
48206 gcc_unreachable ();
48209 return true;
48212 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48213 extract-even and extract-odd permutations. */
48215 static bool
48216 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48218 unsigned i, odd, nelt = d->nelt;
48220 odd = d->perm[0];
48221 if (odd != 0 && odd != 1)
48222 return false;
48224 for (i = 1; i < nelt; ++i)
48225 if (d->perm[i] != 2 * i + odd)
48226 return false;
48228 return expand_vec_perm_even_odd_1 (d, odd);
48231 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
48232 permutations. We assume that expand_vec_perm_1 has already failed. */
48234 static bool
48235 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48237 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48238 machine_mode vmode = d->vmode;
48239 unsigned char perm2[4];
48240 rtx op0 = d->op0, dest;
48241 bool ok;
48243 switch (vmode)
48245 case E_V4DFmode:
48246 case E_V8SFmode:
48247 /* These are special-cased in sse.md so that we can optionally
48248 use the vbroadcast instruction. They expand to two insns
48249 if the input happens to be in a register. */
48250 gcc_unreachable ();
48252 case E_V2DFmode:
48253 case E_V2DImode:
48254 case E_V4SFmode:
48255 case E_V4SImode:
48256 /* These are always implementable using standard shuffle patterns. */
48257 gcc_unreachable ();
48259 case E_V8HImode:
48260 case E_V16QImode:
48261 /* These can be implemented via interleave. We save one insn by
48262 stopping once we have promoted to V4SImode and then use pshufd. */
48263 if (d->testing_p)
48264 return true;
48267 rtx dest;
48268 rtx (*gen) (rtx, rtx, rtx)
48269 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48270 : gen_vec_interleave_lowv8hi;
48272 if (elt >= nelt2)
48274 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48275 : gen_vec_interleave_highv8hi;
48276 elt -= nelt2;
48278 nelt2 /= 2;
48280 dest = gen_reg_rtx (vmode);
48281 emit_insn (gen (dest, op0, op0));
48282 vmode = get_mode_wider_vector (vmode);
48283 op0 = gen_lowpart (vmode, dest);
48285 while (vmode != V4SImode);
48287 memset (perm2, elt, 4);
48288 dest = gen_reg_rtx (V4SImode);
48289 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48290 gcc_assert (ok);
48291 if (!d->testing_p)
48292 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48293 return true;
48295 case E_V64QImode:
48296 case E_V32QImode:
48297 case E_V16HImode:
48298 case E_V8SImode:
48299 case E_V4DImode:
48300 /* For AVX2 broadcasts of the first element vpbroadcast* or
48301 vpermq should be used by expand_vec_perm_1. */
48302 gcc_assert (!TARGET_AVX2 || d->perm[0]);
48303 return false;
48305 default:
48306 gcc_unreachable ();
48310 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48311 broadcast permutations. */
48313 static bool
48314 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48316 unsigned i, elt, nelt = d->nelt;
48318 if (!d->one_operand_p)
48319 return false;
48321 elt = d->perm[0];
48322 for (i = 1; i < nelt; ++i)
48323 if (d->perm[i] != elt)
48324 return false;
48326 return expand_vec_perm_broadcast_1 (d);
48329 /* Implement arbitrary permutations of two V64QImode operands
48330 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
48331 static bool
48332 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
48334 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48335 return false;
48337 if (d->testing_p)
48338 return true;
48340 struct expand_vec_perm_d ds[2];
48341 rtx rperm[128], vperm, target0, target1;
48342 unsigned int i, nelt;
48343 machine_mode vmode;
48345 nelt = d->nelt;
48346 vmode = V64QImode;
48348 for (i = 0; i < 2; i++)
48350 ds[i] = *d;
48351 ds[i].vmode = V32HImode;
48352 ds[i].nelt = 32;
48353 ds[i].target = gen_reg_rtx (V32HImode);
48354 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48355 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48358 /* Prepare permutations such that the first one takes care of
48359 putting the even bytes into the right positions or one position
48360 higher (ds[0]) and the second one takes care of putting the odd
48361 bytes into the right positions or one position lower
48362 (ds[1]). */
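/* Illustrative example (added, hypothetical index): if d->perm[0] == 5,
   then ds[0].perm[0] == 2, so the word permutation moves the word holding
   bytes 4-5 into result word 0, and rperm[0] == 1 makes the vpshufb below
   pick byte 5 out of that word.  */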
48364 for (i = 0; i < nelt; i++)
48366 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48367 if (i & 1)
48369 rperm[i] = constm1_rtx;
48370 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48372 else
48374 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48375 rperm[i + 64] = constm1_rtx;
48379 bool ok = expand_vec_perm_1 (&ds[0]);
48380 gcc_assert (ok);
48381 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48383 ok = expand_vec_perm_1 (&ds[1]);
48384 gcc_assert (ok);
48385 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48387 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48388 vperm = force_reg (vmode, vperm);
48389 target0 = gen_reg_rtx (V64QImode);
48390 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48392 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48393 vperm = force_reg (vmode, vperm);
48394 target1 = gen_reg_rtx (V64QImode);
48395 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48397 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48398 return true;
48401 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
48402 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
48403 all the shorter instruction sequences. */
48405 static bool
48406 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48408 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48409 unsigned int i, nelt, eltsz;
48410 bool used[4];
48412 if (!TARGET_AVX2
48413 || d->one_operand_p
48414 || (d->vmode != V32QImode && d->vmode != V16HImode))
48415 return false;
48417 if (d->testing_p)
48418 return true;
48420 nelt = d->nelt;
48421 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48423 /* Generate 4 permutation masks. If the required element is within
48424 the same lane, it is shuffled in. If the required element is from the
48425 other lane, force a zero by setting bit 7 in the permutation mask.
48426 In the other mask, the element is non-negative if it is requested
48427 from the other lane, but it is also moved to the other lane,
48428 so that the result of vpshufb can have the two V2TImode halves
48429 swapped. */
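/* Worked example (added, hypothetical index): for V32QImode, if
   d->perm[0] == 20 the requested byte lives in the other 128-bit lane of
   op0, so e == 4, xlane == 16 and which == 1; the mask entry lands at
   position 0 ^ 16 == 16, vpshufb fetches lane-local byte 4 (element 20)
   there, and the later lane swap moves it to position 0.  */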
48430 m128 = GEN_INT (-128);
48431 for (i = 0; i < 32; ++i)
48433 rperm[0][i] = m128;
48434 rperm[1][i] = m128;
48435 rperm[2][i] = m128;
48436 rperm[3][i] = m128;
48438 used[0] = false;
48439 used[1] = false;
48440 used[2] = false;
48441 used[3] = false;
48442 for (i = 0; i < nelt; ++i)
48444 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48445 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48446 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48448 for (j = 0; j < eltsz; ++j)
48449 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48450 used[which] = true;
48453 for (i = 0; i < 2; ++i)
48455 if (!used[2 * i + 1])
48457 h[i] = NULL_RTX;
48458 continue;
48460 vperm = gen_rtx_CONST_VECTOR (V32QImode,
48461 gen_rtvec_v (32, rperm[2 * i + 1]));
48462 vperm = force_reg (V32QImode, vperm);
48463 h[i] = gen_reg_rtx (V32QImode);
48464 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48465 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48468 /* Swap the 128-bit lanes of h[X]. */
48469 for (i = 0; i < 2; ++i)
48471 if (h[i] == NULL_RTX)
48472 continue;
48473 op = gen_reg_rtx (V4DImode);
48474 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48475 const2_rtx, GEN_INT (3), const0_rtx,
48476 const1_rtx));
48477 h[i] = gen_lowpart (V32QImode, op);
48480 for (i = 0; i < 2; ++i)
48482 if (!used[2 * i])
48484 l[i] = NULL_RTX;
48485 continue;
48487 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48488 vperm = force_reg (V32QImode, vperm);
48489 l[i] = gen_reg_rtx (V32QImode);
48490 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48491 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48494 for (i = 0; i < 2; ++i)
48496 if (h[i] && l[i])
48498 op = gen_reg_rtx (V32QImode);
48499 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48500 l[i] = op;
48502 else if (h[i])
48503 l[i] = h[i];
48506 gcc_assert (l[0] && l[1]);
48507 op = d->target;
48508 if (d->vmode != V32QImode)
48509 op = gen_reg_rtx (V32QImode);
48510 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48511 if (op != d->target)
48512 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48513 return true;
48516 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
48517 taken care of, perform the expansion in D and return true on success. */
48519 static bool
48520 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48522 /* Try a single instruction expansion. */
48523 if (expand_vec_perm_1 (d))
48524 return true;
48526 /* Try sequences of two instructions. */
48528 if (expand_vec_perm_pshuflw_pshufhw (d))
48529 return true;
48531 if (expand_vec_perm_palignr (d, false))
48532 return true;
48534 if (expand_vec_perm_interleave2 (d))
48535 return true;
48537 if (expand_vec_perm_broadcast (d))
48538 return true;
48540 if (expand_vec_perm_vpermq_perm_1 (d))
48541 return true;
48543 if (expand_vec_perm_vperm2f128 (d))
48544 return true;
48546 if (expand_vec_perm_pblendv (d))
48547 return true;
48549 /* Try sequences of three instructions. */
48551 if (expand_vec_perm_even_odd_pack (d))
48552 return true;
48554 if (expand_vec_perm_2vperm2f128_vshuf (d))
48555 return true;
48557 if (expand_vec_perm_pshufb2 (d))
48558 return true;
48560 if (expand_vec_perm_interleave3 (d))
48561 return true;
48563 if (expand_vec_perm_vperm2f128_vblend (d))
48564 return true;
48566 /* Try sequences of four instructions. */
48568 if (expand_vec_perm_even_odd_trunc (d))
48569 return true;
48570 if (expand_vec_perm_vpshufb2_vpermq (d))
48571 return true;
48573 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48574 return true;
48576 if (expand_vec_perm_vpermt2_vpshub2 (d))
48577 return true;
48579 /* ??? Look for narrow permutations whose element orderings would
48580 allow the promotion to a wider mode. */
48582 /* ??? Look for sequences of interleave or a wider permute that place
48583 the data into the correct lanes for a half-vector shuffle like
48584 pshuf[lh]w or vpermilps. */
48586 /* ??? Look for sequences of interleave that produce the desired results.
48587 The combinatorics of punpck[lh] get pretty ugly... */
48589 if (expand_vec_perm_even_odd (d))
48590 return true;
48592 /* Even longer sequences. */
48593 if (expand_vec_perm_vpshufb4_vpermq2 (d))
48594 return true;
48596 /* See if we can get the same permutation in a different vector integer
48597 mode. */
48598 struct expand_vec_perm_d nd;
48599 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
48601 if (!d->testing_p)
48602 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
48603 return true;
48606 return false;
48609 /* If a permutation only uses one operand, make it clear. Returns true
48610 if the permutation references both operands. */
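/* Illustrative example (added): with nelt == 4, the selector { 4, 5, 6, 7 }
   only references the second operand (case 2 below); it is rewritten to
   { 0, 1, 2, 3 } and op0 is replaced by op1, so later matching only has to
   deal with a single input vector.  */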
48612 static bool
48613 canonicalize_perm (struct expand_vec_perm_d *d)
48615 int i, which, nelt = d->nelt;
48617 for (i = which = 0; i < nelt; ++i)
48618 which |= (d->perm[i] < nelt ? 1 : 2);
48620 d->one_operand_p = true;
48621 switch (which)
48623 default:
48624 gcc_unreachable();
48626 case 3:
48627 if (!rtx_equal_p (d->op0, d->op1))
48629 d->one_operand_p = false;
48630 break;
48632 /* The elements of PERM do not suggest that only the first operand
48633 is used, but both operands are identical. Allow easier matching
48634 of the permutation by folding the permutation into the single
48635 input vector. */
48636 /* FALLTHRU */
48638 case 2:
48639 for (i = 0; i < nelt; ++i)
48640 d->perm[i] &= nelt - 1;
48641 d->op0 = d->op1;
48642 break;
48644 case 1:
48645 d->op1 = d->op0;
48646 break;
48649 return (which == 3);
48652 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
48654 static bool
48655 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
48656 rtx op1, const vec_perm_indices &sel)
48658 struct expand_vec_perm_d d;
48659 unsigned char perm[MAX_VECT_LEN];
48660 unsigned int i, nelt, which;
48661 bool two_args;
48663 d.target = target;
48664 d.op0 = op0;
48665 d.op1 = op1;
48667 d.vmode = vmode;
48668 gcc_assert (VECTOR_MODE_P (d.vmode));
48669 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48670 d.testing_p = !target;
48672 gcc_assert (sel.length () == nelt);
48673 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
48675 /* Given sufficient ISA support we can just return true here
48676 for selected vector modes. */
48677 switch (d.vmode)
48679 case E_V16SFmode:
48680 case E_V16SImode:
48681 case E_V8DImode:
48682 case E_V8DFmode:
48683 if (!TARGET_AVX512F)
48684 return false;
48685 /* All implementable with a single vperm[it]2 insn. */
48686 if (d.testing_p)
48687 return true;
48688 break;
48689 case E_V32HImode:
48690 if (!TARGET_AVX512BW)
48691 return false;
48692 if (d.testing_p)
48693 /* All implementable with a single vperm[it]2 insn. */
48694 return true;
48695 break;
48696 case E_V64QImode:
48697 if (!TARGET_AVX512BW)
48698 return false;
48699 if (d.testing_p)
48700 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
48701 return true;
48702 break;
48703 case E_V8SImode:
48704 case E_V8SFmode:
48705 case E_V4DFmode:
48706 case E_V4DImode:
48707 if (!TARGET_AVX)
48708 return false;
48709 if (d.testing_p && TARGET_AVX512VL)
48710 /* All implementable with a single vperm[it]2 insn. */
48711 return true;
48712 break;
48713 case E_V16HImode:
48714 if (!TARGET_SSE2)
48715 return false;
48716 if (d.testing_p && TARGET_AVX2)
48717 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48718 return true;
48719 break;
48720 case E_V32QImode:
48721 if (!TARGET_SSE2)
48722 return false;
48723 if (d.testing_p && TARGET_AVX2)
48724 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48725 return true;
48726 break;
48727 case E_V8HImode:
48728 case E_V16QImode:
48729 if (!TARGET_SSE2)
48730 return false;
48731 /* Fall through. */
48732 case E_V4SImode:
48733 case E_V4SFmode:
48734 if (!TARGET_SSE)
48735 return false;
48736 /* All implementable with a single vpperm insn. */
48737 if (d.testing_p && TARGET_XOP)
48738 return true;
48739 /* All implementable with 2 pshufb + 1 ior. */
48740 if (d.testing_p && TARGET_SSSE3)
48741 return true;
48742 break;
48743 case E_V2DImode:
48744 case E_V2DFmode:
48745 if (!TARGET_SSE)
48746 return false;
48747 /* All implementable with shufpd or unpck[lh]pd. */
48748 if (d.testing_p)
48749 return true;
48750 break;
48751 default:
48752 return false;
48755 for (i = which = 0; i < nelt; ++i)
48757 unsigned char e = sel[i];
48758 gcc_assert (e < 2 * nelt);
48759 d.perm[i] = e;
48760 perm[i] = e;
48761 which |= (e < nelt ? 1 : 2);
48764 if (d.testing_p)
48766 /* For all elements from the second vector, fold the elements to the first. */
48767 if (which == 2)
48768 for (i = 0; i < nelt; ++i)
48769 d.perm[i] -= nelt;
48771 /* Check whether the mask can be applied to the vector type. */
48772 d.one_operand_p = (which != 3);
48774 /* Implementable with shufps or pshufd. */
48775 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
48776 return true;
48778 /* Otherwise we have to go through the motions and see if we can
48779 figure out how to generate the requested permutation. */
48780 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
48781 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
48782 if (!d.one_operand_p)
48783 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
48785 start_sequence ();
48786 bool ret = ix86_expand_vec_perm_const_1 (&d);
48787 end_sequence ();
48789 return ret;
48792 two_args = canonicalize_perm (&d);
48794 if (ix86_expand_vec_perm_const_1 (&d))
48795 return true;
48797 /* If the selector says both arguments are needed, but the operands are the
48798 same, the above tried to expand with one_operand_p and flattened selector.
48799 If that didn't work, retry without one_operand_p; we succeeded with that
48800 during testing. */
48801 if (two_args && d.one_operand_p)
48803 d.one_operand_p = false;
48804 memcpy (d.perm, perm, sizeof (perm));
48805 return ix86_expand_vec_perm_const_1 (&d);
48808 return false;
48811 void
48812 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
48814 struct expand_vec_perm_d d;
48815 unsigned i, nelt;
48817 d.target = targ;
48818 d.op0 = op0;
48819 d.op1 = op1;
48820 d.vmode = GET_MODE (targ);
48821 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48822 d.one_operand_p = false;
48823 d.testing_p = false;
48825 for (i = 0; i < nelt; ++i)
48826 d.perm[i] = i * 2 + odd;
48828 /* We'll either be able to implement the permutation directly... */
48829 if (expand_vec_perm_1 (&d))
48830 return;
48832 /* ... or we use the special-case patterns. */
48833 expand_vec_perm_even_odd_1 (&d, odd);
48836 static void
48837 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
48839 struct expand_vec_perm_d d;
48840 unsigned i, nelt, base;
48841 bool ok;
48843 d.target = targ;
48844 d.op0 = op0;
48845 d.op1 = op1;
48846 d.vmode = GET_MODE (targ);
48847 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48848 d.one_operand_p = false;
48849 d.testing_p = false;
48851 base = high_p ? nelt / 2 : 0;
48852 for (i = 0; i < nelt / 2; ++i)
48854 d.perm[i * 2] = i + base;
48855 d.perm[i * 2 + 1] = i + base + nelt;
48858 /* Note that for AVX this isn't one instruction. */
48859 ok = ix86_expand_vec_perm_const_1 (&d);
48860 gcc_assert (ok);
48864 /* Expand a vector operation CODE for a V*QImode in terms of the
48865 same operation on V*HImode. */
48867 void
48868 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
48870 machine_mode qimode = GET_MODE (dest);
48871 machine_mode himode;
48872 rtx (*gen_il) (rtx, rtx, rtx);
48873 rtx (*gen_ih) (rtx, rtx, rtx);
48874 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
48875 struct expand_vec_perm_d d;
48876 bool ok, full_interleave;
48877 bool uns_p = false;
48878 int i;
48880 switch (qimode)
48882 case E_V16QImode:
48883 himode = V8HImode;
48884 gen_il = gen_vec_interleave_lowv16qi;
48885 gen_ih = gen_vec_interleave_highv16qi;
48886 break;
48887 case E_V32QImode:
48888 himode = V16HImode;
48889 gen_il = gen_avx2_interleave_lowv32qi;
48890 gen_ih = gen_avx2_interleave_highv32qi;
48891 break;
48892 case E_V64QImode:
48893 himode = V32HImode;
48894 gen_il = gen_avx512bw_interleave_lowv64qi;
48895 gen_ih = gen_avx512bw_interleave_highv64qi;
48896 break;
48897 default:
48898 gcc_unreachable ();
48901 op2_l = op2_h = op2;
48902 switch (code)
48904 case MULT:
48905 /* Unpack data such that we've got a source byte in each low byte of
48906 each word. We don't care what goes into the high byte of each word.
48907 Rather than trying to get zero in there, the most convenient approach
48908 is to let it be a copy of the low byte. */
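/* Added note: each 16-bit word of the interleaved operands then holds
   x + (x << 8); the product of two such words has (a * b) & 0xff in its
   low byte because every cross term is a multiple of 256.  */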
48909 op2_l = gen_reg_rtx (qimode);
48910 op2_h = gen_reg_rtx (qimode);
48911 emit_insn (gen_il (op2_l, op2, op2));
48912 emit_insn (gen_ih (op2_h, op2, op2));
48914 op1_l = gen_reg_rtx (qimode);
48915 op1_h = gen_reg_rtx (qimode);
48916 emit_insn (gen_il (op1_l, op1, op1));
48917 emit_insn (gen_ih (op1_h, op1, op1));
48918 full_interleave = qimode == V16QImode;
48919 break;
48921 case ASHIFT:
48922 case LSHIFTRT:
48923 uns_p = true;
48924 /* FALLTHRU */
48925 case ASHIFTRT:
48926 op1_l = gen_reg_rtx (himode);
48927 op1_h = gen_reg_rtx (himode);
48928 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
48929 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
48930 full_interleave = true;
48931 break;
48932 default:
48933 gcc_unreachable ();
48936 /* Perform the operation. */
48937 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
48938 1, OPTAB_DIRECT);
48939 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
48940 1, OPTAB_DIRECT);
48941 gcc_assert (res_l && res_h);
48943 /* Merge the data back into the right place. */
48944 d.target = dest;
48945 d.op0 = gen_lowpart (qimode, res_l);
48946 d.op1 = gen_lowpart (qimode, res_h);
48947 d.vmode = qimode;
48948 d.nelt = GET_MODE_NUNITS (qimode);
48949 d.one_operand_p = false;
48950 d.testing_p = false;
48952 if (full_interleave)
48954 /* For SSE2, we used a full interleave, so the desired
48955 results are in the even elements. */
48956 for (i = 0; i < d.nelt; ++i)
48957 d.perm[i] = i * 2;
48959 else
48961 /* For AVX, the interleave used above was not cross-lane. So the
48962 extraction is evens but with the second and third quarter swapped.
48963 Happily, that is even one insn shorter than even extraction.
48964 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48965 always first from the first and then from the second source operand,
48966 the index bits above the low 4 bits remain the same.
48967 Thus, for d.nelt == 32 we want permutation
48968 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48969 and for d.nelt == 64 we want permutation
48970 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48971 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
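/* Spot check of the formula below (added): with d.nelt == 32, i == 8 yields
   ((16 & 14) == 0) + 32 + 0 == 32, and i == 16 yields 0 + 0 + 16 == 16,
   matching the permutation listed above.  */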
48972 for (i = 0; i < d.nelt; ++i)
48973 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48976 ok = ix86_expand_vec_perm_const_1 (&d);
48977 gcc_assert (ok);
48979 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48980 gen_rtx_fmt_ee (code, qimode, op1, op2));
48983 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48984 if op is CONST_VECTOR with all odd elements equal to their
48985 preceding element. */
48987 static bool
48988 const_vector_equal_evenodd_p (rtx op)
48990 machine_mode mode = GET_MODE (op);
48991 int i, nunits = GET_MODE_NUNITS (mode);
48992 if (GET_CODE (op) != CONST_VECTOR
48993 || nunits != CONST_VECTOR_NUNITS (op))
48994 return false;
48995 for (i = 0; i < nunits; i += 2)
48996 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48997 return false;
48998 return true;
49001 void
49002 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49003 bool uns_p, bool odd_p)
49005 machine_mode mode = GET_MODE (op1);
49006 machine_mode wmode = GET_MODE (dest);
49007 rtx x;
49008 rtx orig_op1 = op1, orig_op2 = op2;
49010 if (!nonimmediate_operand (op1, mode))
49011 op1 = force_reg (mode, op1);
49012 if (!nonimmediate_operand (op2, mode))
49013 op2 = force_reg (mode, op2);
49015 /* We only play even/odd games with vectors of SImode. */
49016 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49018 /* If we're looking for the odd results, shift those members down to
49019 the even slots. For some CPUs this is faster than a PSHUFD. */
49020 if (odd_p)
49022 /* For XOP use vpmacsdqh, but only for smult, as it is only
49023 signed. */
49024 if (TARGET_XOP && mode == V4SImode && !uns_p)
49026 x = force_reg (wmode, CONST0_RTX (wmode));
49027 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49028 return;
49031 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49032 if (!const_vector_equal_evenodd_p (orig_op1))
49033 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49034 x, NULL, 1, OPTAB_DIRECT);
49035 if (!const_vector_equal_evenodd_p (orig_op2))
49036 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49037 x, NULL, 1, OPTAB_DIRECT);
49038 op1 = gen_lowpart (mode, op1);
49039 op2 = gen_lowpart (mode, op2);
49042 if (mode == V16SImode)
49044 if (uns_p)
49045 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49046 else
49047 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49049 else if (mode == V8SImode)
49051 if (uns_p)
49052 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49053 else
49054 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49056 else if (uns_p)
49057 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49058 else if (TARGET_SSE4_1)
49059 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49060 else
49062 rtx s1, s2, t0, t1, t2;
49064 /* The easiest way to implement this without PMULDQ is to go through
49065 the motions as if we are performing a full 64-bit multiply. With
49066 the exception that we need to do less shuffling of the elements. */
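/* Added sketch of the identity used: treating each signed 32-bit element A
   as the 64-bit value HI(A)*2^32 + LO(A), where HI(A) is its sign mask,
   the product modulo 2^64 is
   LO(A)*LO(B) + ((HI(A)*LO(B) + HI(B)*LO(A)) << 32),
   which the three widening multiplies below compute.  */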
49068 /* Compute the sign-extension, aka highparts, of the two operands. */
49069 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49070 op1, pc_rtx, pc_rtx);
49071 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49072 op2, pc_rtx, pc_rtx);
49074 /* Multiply LO(A) * HI(B), and vice-versa. */
49075 t1 = gen_reg_rtx (wmode);
49076 t2 = gen_reg_rtx (wmode);
49077 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49078 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49080 /* Multiply LO(A) * LO(B). */
49081 t0 = gen_reg_rtx (wmode);
49082 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49084 /* Combine and shift the highparts into place. */
49085 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49086 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49087 1, OPTAB_DIRECT);
49089 /* Combine high and low parts. */
49090 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49091 return;
49093 emit_insn (x);
49096 void
49097 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49098 bool uns_p, bool high_p)
49100 machine_mode wmode = GET_MODE (dest);
49101 machine_mode mode = GET_MODE (op1);
49102 rtx t1, t2, t3, t4, mask;
49104 switch (mode)
49106 case E_V4SImode:
49107 t1 = gen_reg_rtx (mode);
49108 t2 = gen_reg_rtx (mode);
49109 if (TARGET_XOP && !uns_p)
49111 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49112 shuffle the elements once so that all elements are in the right
49113 place for immediate use: { A C B D }. */
49114 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49115 const1_rtx, GEN_INT (3)));
49116 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49117 const1_rtx, GEN_INT (3)));
49119 else
49121 /* Put the elements into place for the multiply. */
49122 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49123 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49124 high_p = false;
49126 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49127 break;
49129 case E_V8SImode:
49130 /* Shuffle the elements between the lanes. After this we
49131 have { A B E F | C D G H } for each operand. */
49132 t1 = gen_reg_rtx (V4DImode);
49133 t2 = gen_reg_rtx (V4DImode);
49134 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49135 const0_rtx, const2_rtx,
49136 const1_rtx, GEN_INT (3)));
49137 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49138 const0_rtx, const2_rtx,
49139 const1_rtx, GEN_INT (3)));
49141 /* Shuffle the elements within the lanes. After this we
49142 have { A A B B | C C D D } or { E E F F | G G H H }. */
49143 t3 = gen_reg_rtx (V8SImode);
49144 t4 = gen_reg_rtx (V8SImode);
49145 mask = GEN_INT (high_p
49146 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49147 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49148 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49149 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49151 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49152 break;
49154 case E_V8HImode:
49155 case E_V16HImode:
49156 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49157 uns_p, OPTAB_DIRECT);
49158 t2 = expand_binop (mode,
49159 uns_p ? umul_highpart_optab : smul_highpart_optab,
49160 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49161 gcc_assert (t1 && t2);
49163 t3 = gen_reg_rtx (mode);
49164 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49165 emit_move_insn (dest, gen_lowpart (wmode, t3));
49166 break;
49168 case E_V16QImode:
49169 case E_V32QImode:
49170 case E_V32HImode:
49171 case E_V16SImode:
49172 case E_V64QImode:
49173 t1 = gen_reg_rtx (wmode);
49174 t2 = gen_reg_rtx (wmode);
49175 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49176 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49178 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49179 break;
49181 default:
49182 gcc_unreachable ();
49186 void
49187 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49189 rtx res_1, res_2, res_3, res_4;
49191 res_1 = gen_reg_rtx (V4SImode);
49192 res_2 = gen_reg_rtx (V4SImode);
49193 res_3 = gen_reg_rtx (V2DImode);
49194 res_4 = gen_reg_rtx (V2DImode);
49195 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49196 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49198 /* Move the results in element 2 down to element 1; we don't care
49199 what goes in elements 2 and 3. Then we can merge the parts
49200 back together with an interleave.
49202 Note that two other sequences were tried:
49203 (1) Use interleaves at the start instead of psrldq, which allows
49204 us to use a single shufps to merge things back at the end.
49205 (2) Use shufps here to combine the two vectors, then pshufd to
49206 put the elements in the correct order.
49207 In both cases the cost of the reformatting stall was too high
49208 and the overall sequence slower. */
49210 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49211 const0_rtx, const2_rtx,
49212 const0_rtx, const0_rtx));
49213 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49214 const0_rtx, const2_rtx,
49215 const0_rtx, const0_rtx));
49216 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49218 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49221 void
49222 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49224 machine_mode mode = GET_MODE (op0);
49225 rtx t1, t2, t3, t4, t5, t6;
49227 if (TARGET_AVX512DQ && mode == V8DImode)
49228 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49229 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49230 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49231 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49232 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49233 else if (TARGET_XOP && mode == V2DImode)
49235 /* op1: A,B,C,D, op2: E,F,G,H */
49236 op1 = gen_lowpart (V4SImode, op1);
49237 op2 = gen_lowpart (V4SImode, op2);
49239 t1 = gen_reg_rtx (V4SImode);
49240 t2 = gen_reg_rtx (V4SImode);
49241 t3 = gen_reg_rtx (V2DImode);
49242 t4 = gen_reg_rtx (V2DImode);
49244 /* t1: B,A,D,C */
49245 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49246 GEN_INT (1),
49247 GEN_INT (0),
49248 GEN_INT (3),
49249 GEN_INT (2)));
49251 /* t2: (B*E),(A*F),(D*G),(C*H) */
49252 emit_insn (gen_mulv4si3 (t2, t1, op2));
49254 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49255 emit_insn (gen_xop_phadddq (t3, t2));
49257 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49258 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49260 /* Multiply lower parts and add all. */
49261 t5 = gen_reg_rtx (V2DImode);
49262 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49263 gen_lowpart (V4SImode, op1),
49264 gen_lowpart (V4SImode, op2)));
49265 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49268 else
49270 machine_mode nmode;
49271 rtx (*umul) (rtx, rtx, rtx);
49273 if (mode == V2DImode)
49275 umul = gen_vec_widen_umult_even_v4si;
49276 nmode = V4SImode;
49278 else if (mode == V4DImode)
49280 umul = gen_vec_widen_umult_even_v8si;
49281 nmode = V8SImode;
49283 else if (mode == V8DImode)
49285 umul = gen_vec_widen_umult_even_v16si;
49286 nmode = V16SImode;
49288 else
49289 gcc_unreachable ();
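/* Added note on the identity used below: writing each 64-bit element as
   A == 2^32*Ah + Al and B == 2^32*Bh + Bl,
   A*B mod 2^64 == Al*Bl + ((Ah*Bl + Bh*Al) << 32);
   the three widening multiplies compute exactly these terms.  */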
49292 /* Multiply low parts. */
49293 t1 = gen_reg_rtx (mode);
49294 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49296 /* Shift input vectors right 32 bits so we can multiply high parts. */
49297 t6 = GEN_INT (32);
49298 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49299 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49301 /* Multiply high parts by low parts. */
49302 t4 = gen_reg_rtx (mode);
49303 t5 = gen_reg_rtx (mode);
49304 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49305 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49307 /* Combine and shift the highparts back. */
49308 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49309 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49311 /* Combine high and low parts. */
49312 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49315 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49316 gen_rtx_MULT (mode, op1, op2));
49319 /* Return 1 if control transfer instruction INSN
49320 should be encoded with bnd prefix.
49321 If insn is NULL then return 1 when control
49322 transfer instructions should be prefixed with
49323 bnd by default for current function. */
49325 bool
49326 ix86_bnd_prefixed_insn_p (rtx insn)
49328 /* For call insns check special flag. */
49329 if (insn && CALL_P (insn))
49331 rtx call = get_call_rtx_from (insn);
49332 if (call)
49333 return CALL_EXPR_WITH_BOUNDS_P (call);
49336 /* All other insns are prefixed only if function is instrumented. */
49337 return chkp_function_instrumented_p (current_function_decl);
49340 /* Return 1 if control transfer instruction INSN
49341 should be encoded with notrack prefix. */
49343 static bool
49344 ix86_notrack_prefixed_insn_p (rtx insn)
49346 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
49347 return false;
49349 if (CALL_P (insn))
49351 rtx call = get_call_rtx_from (insn);
49352 gcc_assert (call != NULL_RTX);
49353 rtx addr = XEXP (call, 0);
49355 /* Do not emit 'notrack' if it's not an indirect call. */
49356 if (MEM_P (addr)
49357 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49358 return false;
49359 else
49360 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
49363 if (JUMP_P (insn) && !flag_cet_switch)
49365 rtx target = JUMP_LABEL (insn);
49366 if (target == NULL_RTX || ANY_RETURN_P (target))
49367 return false;
49369 /* Check whether the jump is a switch table jump. */
49370 rtx_insn *label = as_a<rtx_insn *> (target);
49371 rtx_insn *table = next_insn (label);
49372 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
49373 return false;
49374 else
49375 return true;
49377 return false;
49380 /* Calculate integer abs() using only SSE2 instructions. */
49382 void
49383 ix86_expand_sse2_abs (rtx target, rtx input)
49385 machine_mode mode = GET_MODE (target);
49386 rtx tmp0, tmp1, x;
49388 switch (mode)
49390 /* For 32-bit signed integer X, the best way to calculate the absolute
49391 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
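/* Worked example (added): for X == -5, X >> 31 == -1,
   (-1 ^ -5) == 4 and 4 - (-1) == 5.  */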
49392 case E_V4SImode:
49393 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49394 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49395 NULL, 0, OPTAB_DIRECT);
49396 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49397 NULL, 0, OPTAB_DIRECT);
49398 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49399 target, 0, OPTAB_DIRECT);
49400 break;
49402 /* For 16-bit signed integer X, the best way to calculate the absolute
49403 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
49404 case E_V8HImode:
49405 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49407 x = expand_simple_binop (mode, SMAX, tmp0, input,
49408 target, 0, OPTAB_DIRECT);
49409 break;
49411 /* For 8-bit signed integer X, the best way to calculate the absolute
49412 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49413 as SSE2 provides the PMINUB insn. */
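/* Added note: the unsigned minimum works because whichever of X and -X is
   non-negative compares as the smaller unsigned value; e.g. X == -3 gives
   min (253, 3) == 3.  */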
49414 case E_V16QImode:
49415 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49417 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49418 target, 0, OPTAB_DIRECT);
49419 break;
49421 default:
49422 gcc_unreachable ();
49425 if (x != target)
49426 emit_move_insn (target, x);
49429 /* Expand an extract from a vector register through pextr insn.
49430 Return true if successful. */
49432 bool
49433 ix86_expand_pextr (rtx *operands)
49435 rtx dst = operands[0];
49436 rtx src = operands[1];
49438 unsigned int size = INTVAL (operands[2]);
49439 unsigned int pos = INTVAL (operands[3]);
49441 if (SUBREG_P (dst))
49443 /* Reject non-lowpart subregs. */
49444 if (SUBREG_BYTE (dst) > 0)
49445 return false;
49446 dst = SUBREG_REG (dst);
49449 if (SUBREG_P (src))
49451 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49452 src = SUBREG_REG (src);
49455 switch (GET_MODE (src))
49457 case E_V16QImode:
49458 case E_V8HImode:
49459 case E_V4SImode:
49460 case E_V2DImode:
49461 case E_V1TImode:
49462 case E_TImode:
49464 machine_mode srcmode, dstmode;
49465 rtx d, pat;
49467 if (!int_mode_for_size (size, 0).exists (&dstmode))
49468 return false;
49470 switch (dstmode)
49472 case E_QImode:
49473 if (!TARGET_SSE4_1)
49474 return false;
49475 srcmode = V16QImode;
49476 break;
49478 case E_HImode:
49479 if (!TARGET_SSE2)
49480 return false;
49481 srcmode = V8HImode;
49482 break;
49484 case E_SImode:
49485 if (!TARGET_SSE4_1)
49486 return false;
49487 srcmode = V4SImode;
49488 break;
49490 case E_DImode:
49491 gcc_assert (TARGET_64BIT);
49492 if (!TARGET_SSE4_1)
49493 return false;
49494 srcmode = V2DImode;
49495 break;
49497 default:
49498 return false;
49501 /* Reject extractions from misaligned positions. */
49502 if (pos & (size-1))
49503 return false;
49505 if (GET_MODE (dst) == dstmode)
49506 d = dst;
49507 else
49508 d = gen_reg_rtx (dstmode);
49510 /* Construct insn pattern. */
49511 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49512 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
49514 /* Let the rtl optimizers know about the zero extension performed. */
49515 if (dstmode == QImode || dstmode == HImode)
49517 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49518 d = gen_lowpart (SImode, d);
49521 emit_insn (gen_rtx_SET (d, pat));
49523 if (d != dst)
49524 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49525 return true;
49528 default:
49529 return false;
49533 /* Expand an insert into a vector register through pinsr insn.
49534 Return true if successful. */
49536 bool
49537 ix86_expand_pinsr (rtx *operands)
49539 rtx dst = operands[0];
49540 rtx src = operands[3];
49542 unsigned int size = INTVAL (operands[1]);
49543 unsigned int pos = INTVAL (operands[2]);
49545 if (SUBREG_P (dst))
49547 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49548 dst = SUBREG_REG (dst);
49551 switch (GET_MODE (dst))
49553 case E_V16QImode:
49554 case E_V8HImode:
49555 case E_V4SImode:
49556 case E_V2DImode:
49557 case E_V1TImode:
49558 case E_TImode:
49560 machine_mode srcmode, dstmode;
49561 rtx (*pinsr)(rtx, rtx, rtx, rtx);
49562 rtx d;
49564 if (!int_mode_for_size (size, 0).exists (&srcmode))
49565 return false;
49567 switch (srcmode)
49569 case E_QImode:
49570 if (!TARGET_SSE4_1)
49571 return false;
49572 dstmode = V16QImode;
49573 pinsr = gen_sse4_1_pinsrb;
49574 break;
49576 case E_HImode:
49577 if (!TARGET_SSE2)
49578 return false;
49579 dstmode = V8HImode;
49580 pinsr = gen_sse2_pinsrw;
49581 break;
49583 case E_SImode:
49584 if (!TARGET_SSE4_1)
49585 return false;
49586 dstmode = V4SImode;
49587 pinsr = gen_sse4_1_pinsrd;
49588 break;
49590 case E_DImode:
49591 gcc_assert (TARGET_64BIT);
49592 if (!TARGET_SSE4_1)
49593 return false;
49594 dstmode = V2DImode;
49595 pinsr = gen_sse4_1_pinsrq;
49596 break;
49598 default:
49599 return false;
49602 /* Reject insertions to misaligned positions. */
49603 if (pos & (size-1))
49604 return false;
49606 if (SUBREG_P (src))
49608 unsigned int srcpos = SUBREG_BYTE (src);
49610 if (srcpos > 0)
49612 rtx extr_ops[4];
49614 extr_ops[0] = gen_reg_rtx (srcmode);
49615 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
49616 extr_ops[2] = GEN_INT (size);
49617 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
49619 if (!ix86_expand_pextr (extr_ops))
49620 return false;
49622 src = extr_ops[0];
49624 else
49625 src = gen_lowpart (srcmode, SUBREG_REG (src));
49628 if (GET_MODE (dst) == dstmode)
49629 d = dst;
49630 else
49631 d = gen_reg_rtx (dstmode);
49633 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
49634 gen_lowpart (srcmode, src),
49635 GEN_INT (1 << (pos / size))));
49636 if (d != dst)
49637 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49638 return true;
49641 default:
49642 return false;
49646 /* This function returns the calling-ABI-specific va_list type node.
49647 It returns the FNDECL-specific va_list type. */
49649 static tree
49650 ix86_fn_abi_va_list (tree fndecl)
49652 if (!TARGET_64BIT)
49653 return va_list_type_node;
49654 gcc_assert (fndecl != NULL_TREE);
49656 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
49657 return ms_va_list_type_node;
49658 else
49659 return sysv_va_list_type_node;
49662 /* Returns the canonical va_list type specified by TYPE. If there
49663 is no valid TYPE provided, it returns NULL_TREE. */
49665 static tree
49666 ix86_canonical_va_list_type (tree type)
49668 if (TARGET_64BIT)
49670 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
49671 return ms_va_list_type_node;
49673 if ((TREE_CODE (type) == ARRAY_TYPE
49674 && integer_zerop (array_type_nelts (type)))
49675 || POINTER_TYPE_P (type))
49677 tree elem_type = TREE_TYPE (type);
49678 if (TREE_CODE (elem_type) == RECORD_TYPE
49679 && lookup_attribute ("sysv_abi va_list",
49680 TYPE_ATTRIBUTES (elem_type)))
49681 return sysv_va_list_type_node;
49684 return NULL_TREE;
49687 return std_canonical_va_list_type (type);
49690 /* Iterate through the target-specific builtin types for va_list.
49691 IDX denotes the iterator, *PTREE is set to the result type of
49692 the va_list builtin, and *PNAME to its internal name.
49693 Returns zero if there is no element for this index, otherwise
49694 IDX should be increased upon the next call.
49695 Note, do not iterate a base builtin's name like __builtin_va_list.
49696 Used from c_common_nodes_and_builtins. */
49698 static int
49699 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
49701 if (TARGET_64BIT)
49703 switch (idx)
49705 default:
49706 break;
49708 case 0:
49709 *ptree = ms_va_list_type_node;
49710 *pname = "__builtin_ms_va_list";
49711 return 1;
49713 case 1:
49714 *ptree = sysv_va_list_type_node;
49715 *pname = "__builtin_sysv_va_list";
49716 return 1;
49720 return 0;
49723 #undef TARGET_SCHED_DISPATCH
49724 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
49725 #undef TARGET_SCHED_DISPATCH_DO
49726 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
49727 #undef TARGET_SCHED_REASSOCIATION_WIDTH
49728 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
49729 #undef TARGET_SCHED_REORDER
49730 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
49731 #undef TARGET_SCHED_ADJUST_PRIORITY
49732 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
49733 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
49734 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
49735 ix86_dependencies_evaluation_hook
49738 /* Implementation of reassociation_width target hook used by
49739 reassoc phase to identify parallelism level in reassociated
49740 tree. The statement's tree_code is passed in OP. The arguments' type
49741 is passed in MODE. */
49743 static int
49744 ix86_reassociation_width (unsigned int op, machine_mode mode)
49746 int width = 1;
49747 /* Vector part. */
49748 if (VECTOR_MODE_P (mode))
49750 int div = 1;
49751 if (INTEGRAL_MODE_P (mode))
49752 width = ix86_cost->reassoc_vec_int;
49753 else if (FLOAT_MODE_P (mode))
49754 width = ix86_cost->reassoc_vec_fp;
49756 if (width == 1)
49757 return 1;
49759 /* Integer vector instructions execute in the FP unit
49760 and can execute 3 additions and one multiplication per cycle. */
49761 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
49762 && op != PLUS && op != MINUS)
49763 return 1;
49765 /* Account for targets that split wide vectors into multiple parts. */
49766 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
49767 div = GET_MODE_BITSIZE (mode) / 128;
49768 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
49769 div = GET_MODE_BITSIZE (mode) / 64;
49770 width = (width + div - 1) / div;
49772 /* Scalar part. */
49773 else if (INTEGRAL_MODE_P (mode))
49774 width = ix86_cost->reassoc_int;
49775 else if (FLOAT_MODE_P (mode))
49776 width = ix86_cost->reassoc_fp;
49778 /* Avoid using too many registers in 32-bit mode. */
49779 if (!TARGET_64BIT && width > 2)
49780 width = 2;
49781 return width;
49784 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
49785 place emms and femms instructions. */
49787 static machine_mode
49788 ix86_preferred_simd_mode (scalar_mode mode)
49790 if (!TARGET_SSE)
49791 return word_mode;
49793 switch (mode)
49795 case E_QImode:
49796 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
49797 return V64QImode;
49798 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49799 return V32QImode;
49800 else
49801 return V16QImode;
49803 case E_HImode:
49804 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
49805 return V32HImode;
49806 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49807 return V16HImode;
49808 else
49809 return V8HImode;
49811 case E_SImode:
49812 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49813 return V16SImode;
49814 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49815 return V8SImode;
49816 else
49817 return V4SImode;
49819 case E_DImode:
49820 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49821 return V8DImode;
49822 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49823 return V4DImode;
49824 else
49825 return V2DImode;
49827 case E_SFmode:
49828 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49829 return V16SFmode;
49830 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49831 return V8SFmode;
49832 else
49833 return V4SFmode;
49835 case E_DFmode:
49836 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49837 return V8DFmode;
49838 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49839 return V4DFmode;
49840 else if (TARGET_SSE2)
49841 return V2DFmode;
49842 /* FALLTHRU */
49844 default:
49845 return word_mode;
49849 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
49850 of the upper half against the lower half down to SSE register size. */
49852 static machine_mode
49853 ix86_split_reduction (machine_mode mode)
49855 /* Reduce lowpart against highpart until we reach SSE reg width to
49856 avoid cross-lane operations. */
49857 switch (mode)
49859 case E_V8DImode:
49860 case E_V4DImode:
49861 return V2DImode;
49862 case E_V16SImode:
49863 case E_V8SImode:
49864 return V4SImode;
49865 case E_V32HImode:
49866 case E_V16HImode:
49867 return V8HImode;
49868 case E_V64QImode:
49869 case E_V32QImode:
49870 return V16QImode;
49871 case E_V16SFmode:
49872 case E_V8SFmode:
49873 return V4SFmode;
49874 case E_V8DFmode:
49875 case E_V4DFmode:
49876 return V2DFmode;
49877 default:
49878 return mode;
49882 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
49883 vectors. If AVX512F is enabled then try vectorizing with 512bit,
49884 256bit and 128bit vectors. */
49886 static void
49887 ix86_autovectorize_vector_sizes (vector_sizes *sizes)
49889 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
49891 sizes->safe_push (64);
49892 sizes->safe_push (32);
49893 sizes->safe_push (16);
49895 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
49897 sizes->safe_push (32);
49898 sizes->safe_push (16);
49902 /* Implementation of targetm.vectorize.get_mask_mode. */
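/* Illustrative examples (added): with AVX512BW a 64-byte V64QImode vector
   has nunits == 64, so the scalar mask mode below is DImode (one bit per
   element); without AVX512VL a 16-byte V4SFmode vector instead gets the
   V4SImode vector mask computed at the end.  */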
49904 static opt_machine_mode
49905 ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size)
49907 unsigned elem_size = vector_size / nunits;
49909 /* Scalar mask case. */
49910 if ((TARGET_AVX512F && vector_size == 64)
49911 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
49913 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
49914 return smallest_int_mode_for_size (nunits);
49917 scalar_int_mode elem_mode
49918 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
49920 gcc_assert (elem_size * nunits == vector_size);
49922 return mode_for_vector (elem_mode, nunits);
49927 /* Return class of registers which could be used for pseudo of MODE
49928 and of class RCLASS for spilling instead of memory. Return NO_REGS
49929 if it is not possible or non-profitable. */
49931 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
49933 static reg_class_t
49934 ix86_spill_class (reg_class_t rclass, machine_mode mode)
49936 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
49937 && TARGET_SSE2
49938 && TARGET_INTER_UNIT_MOVES_TO_VEC
49939 && TARGET_INTER_UNIT_MOVES_FROM_VEC
49940 && (mode == SImode || (TARGET_64BIT && mode == DImode))
49941 && INTEGER_CLASS_P (rclass))
49942 return ALL_SSE_REGS;
49943 return NO_REGS;
49946 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
49947 but returns a lower bound. */
49949 static unsigned int
49950 ix86_max_noce_ifcvt_seq_cost (edge e)
49952 bool predictable_p = predictable_edge_p (e);
49954 enum compiler_param param
49955 = (predictable_p
49956 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
49957 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
49959 /* If we have a parameter set, use that, otherwise take a guess using
49960 BRANCH_COST. */
49961 if (global_options_set.x_param_values[param])
49962 return PARAM_VALUE (param);
49963 else
49964 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
49967 /* Return true if SEQ is a good candidate as a replacement for the
49968 if-convertible sequence described in IF_INFO. */
49970 static bool
49971 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
49973 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
49975 int cmov_cnt = 0;
49976 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
49977 Maybe we should allow even more conditional moves as long as they
49978 are used far enough not to stall the CPU, or also consider
49979 IF_INFO->TEST_BB succ edge probabilities. */
49980 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
49982 rtx set = single_set (insn);
49983 if (!set)
49984 continue;
49985 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
49986 continue;
49987 rtx src = SET_SRC (set);
49988 machine_mode mode = GET_MODE (src);
49989 if (GET_MODE_CLASS (mode) != MODE_INT
49990 && GET_MODE_CLASS (mode) != MODE_FLOAT)
49991 continue;
49992 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
49993 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
49994 continue;
49995 /* insn is CMOV or FCMOV. */
49996 if (++cmov_cnt > 1)
49997 return false;
50000 return default_noce_conversion_profitable_p (seq, if_info);
50003 /* Implement targetm.vectorize.init_cost. */
50005 static void *
50006 ix86_init_cost (struct loop *)
50008 unsigned *cost = XNEWVEC (unsigned, 3);
50009 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50010 return cost;
50013 /* Implement targetm.vectorize.add_stmt_cost. */
50015 static unsigned
50016 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50017 struct _stmt_vec_info *stmt_info, int misalign,
50018 enum vect_cost_model_location where)
50020 unsigned *cost = (unsigned *) data;
50021 unsigned retval = 0;
50023 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50024 int stmt_cost = - 1;
50026 if ((kind == vector_stmt || kind == scalar_stmt)
50027 && stmt_info
50028 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
50030 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
50031 bool fp = false;
50032 machine_mode mode = TImode;
50034 if (vectype != NULL)
50036 fp = FLOAT_TYPE_P (vectype);
50037 mode = TYPE_MODE (vectype);
50039 /*machine_mode inner_mode = mode;
50040 if (VECTOR_MODE_P (mode))
50041 inner_mode = GET_MODE_INNER (mode);*/
50043 switch (subcode)
50045 case PLUS_EXPR:
50046 case POINTER_PLUS_EXPR:
50047 case MINUS_EXPR:
50048 if (kind == scalar_stmt)
50050 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50051 stmt_cost = ix86_cost->addss;
50052 else if (X87_FLOAT_MODE_P (mode))
50053 stmt_cost = ix86_cost->fadd;
50054 else
50055 stmt_cost = ix86_cost->add;
50057 else
50058 stmt_cost = ix86_vec_cost (mode,
50059 fp ? ix86_cost->addss
50060 : ix86_cost->sse_op,
50061 true);
50062 break;
50064 case MULT_EXPR:
50065 case WIDEN_MULT_EXPR:
50066 case MULT_HIGHPART_EXPR:
50067 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
50068 break;
50069 case FMA_EXPR:
50070 stmt_cost = ix86_vec_cost (mode,
50071 mode == SFmode ? ix86_cost->fmass
50072 : ix86_cost->fmasd,
50073 true);
50074 break;
50075 case NEGATE_EXPR:
50076 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50077 stmt_cost = ix86_cost->sse_op;
50078 else if (X87_FLOAT_MODE_P (mode))
50079 stmt_cost = ix86_cost->fchs;
50080 else if (VECTOR_MODE_P (mode))
50081 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50082 else
50083 stmt_cost = ix86_cost->add;
50084 break;
50085 case TRUNC_DIV_EXPR:
50086 case CEIL_DIV_EXPR:
50087 case FLOOR_DIV_EXPR:
50088 case ROUND_DIV_EXPR:
50089 case TRUNC_MOD_EXPR:
50090 case CEIL_MOD_EXPR:
50091 case FLOOR_MOD_EXPR:
50092 case RDIV_EXPR:
50093 case ROUND_MOD_EXPR:
50094 case EXACT_DIV_EXPR:
50095 stmt_cost = ix86_division_cost (ix86_cost, mode);
50096 break;
50098 case RSHIFT_EXPR:
50099 case LSHIFT_EXPR:
50100 case LROTATE_EXPR:
50101 case RROTATE_EXPR:
50103 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
50104 stmt_cost = ix86_shift_rotate_cost
50105 (ix86_cost, mode,
50106 TREE_CODE (op2) == INTEGER_CST,
50107 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
50108 true, false, false, NULL, NULL);
50110 break;
50111 case NOP_EXPR:
50112 stmt_cost = 0;
50113 break;
50115 case BIT_IOR_EXPR:
50116 case ABS_EXPR:
50117 case MIN_EXPR:
50118 case MAX_EXPR:
50119 case BIT_XOR_EXPR:
50120 case BIT_AND_EXPR:
50121 case BIT_NOT_EXPR:
50122 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50123 stmt_cost = ix86_cost->sse_op;
50124 else if (VECTOR_MODE_P (mode))
50125 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50126 else
50127 stmt_cost = ix86_cost->add;
50128 break;
50129 default:
50130 break;
50133 if (stmt_cost == -1)
50134 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50136 /* Penalize DFmode vector operations for Bonnell. */
50137 if (TARGET_BONNELL && kind == vector_stmt
50138 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50139 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
50141 /* Statements in an inner loop relative to the loop being
50142 vectorized are weighted more heavily. The value here is
50143 arbitrary and could potentially be improved with analysis. */
50144 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50145 count *= 50; /* FIXME. */
50147 retval = (unsigned) (count * stmt_cost);
 50149 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
 50150 for Silvermont, as it has an out-of-order integer pipeline and can execute
 50151 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
50152 if ((TARGET_SILVERMONT || TARGET_INTEL)
50153 && stmt_info && stmt_info->stmt)
50155 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50156 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
50157 retval = (retval * 17) / 10;
50160 cost[where] += retval;
50162 return retval;
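 /* For instance, a vector statement whose base cost comes out as 10 is
    accounted as (10 * 17) / 10 = 17 on Silvermont/Intel when the statement
    produces an integer result, reflecting the in-order SIMD pipeline noted
    above.  */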
50165 /* Implement targetm.vectorize.finish_cost. */
50167 static void
50168 ix86_finish_cost (void *data, unsigned *prologue_cost,
50169 unsigned *body_cost, unsigned *epilogue_cost)
50171 unsigned *cost = (unsigned *) data;
50172 *prologue_cost = cost[vect_prologue];
50173 *body_cost = cost[vect_body];
50174 *epilogue_cost = cost[vect_epilogue];
50177 /* Implement targetm.vectorize.destroy_cost_data. */
50179 static void
50180 ix86_destroy_cost_data (void *data)
50182 free (data);
50185 /* Validate target specific memory model bits in VAL. */
50187 static unsigned HOST_WIDE_INT
50188 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50190 enum memmodel model = memmodel_from_int (val);
50191 bool strong;
50193 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50194 |MEMMODEL_MASK)
50195 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50197 warning (OPT_Winvalid_memory_model,
50198 "unknown architecture specific memory model");
50199 return MEMMODEL_SEQ_CST;
50201 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50202 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50204 warning (OPT_Winvalid_memory_model,
50205 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50206 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50208 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50210 warning (OPT_Winvalid_memory_model,
50211 "HLE_RELEASE not used with RELEASE or stronger memory model");
50212 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50214 return val;
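 /* A sketch of the user-level view (hypothetical code, for illustration):
    lock elision is requested by OR-ing an HLE bit into the memory model, e.g.
        __atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE);
    combining __ATOMIC_HLE_ACQUIRE with a relaxed model instead would trigger
    the warning above and be rewritten as MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE.  */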
50217 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50218 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50219 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
 50220 or the number of vecsize_mangle variants that should be emitted. */
50222 static int
50223 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50224 struct cgraph_simd_clone *clonei,
50225 tree base_type, int num)
50227 int ret = 1;
50229 if (clonei->simdlen
50230 && (clonei->simdlen < 2
50231 || clonei->simdlen > 1024
50232 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50234 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50235 "unsupported simdlen %d", clonei->simdlen);
50236 return 0;
50239 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50240 if (TREE_CODE (ret_type) != VOID_TYPE)
50241 switch (TYPE_MODE (ret_type))
50243 case E_QImode:
50244 case E_HImode:
50245 case E_SImode:
50246 case E_DImode:
50247 case E_SFmode:
50248 case E_DFmode:
50249 /* case E_SCmode: */
50250 /* case E_DCmode: */
50251 break;
50252 default:
50253 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50254 "unsupported return type %qT for simd", ret_type);
50255 return 0;
50258 tree t;
50259 int i;
50261 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50262 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50263 switch (TYPE_MODE (TREE_TYPE (t)))
50265 case E_QImode:
50266 case E_HImode:
50267 case E_SImode:
50268 case E_DImode:
50269 case E_SFmode:
50270 case E_DFmode:
50271 /* case E_SCmode: */
50272 /* case E_DCmode: */
50273 break;
50274 default:
50275 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50276 "unsupported argument type %qT for simd", TREE_TYPE (t));
50277 return 0;
50280 if (!TREE_PUBLIC (node->decl))
50282 /* If the function isn't exported, we can pick up just one ISA
50283 for the clones. */
50284 if (TARGET_AVX512F)
50285 clonei->vecsize_mangle = 'e';
50286 else if (TARGET_AVX2)
50287 clonei->vecsize_mangle = 'd';
50288 else if (TARGET_AVX)
50289 clonei->vecsize_mangle = 'c';
50290 else
50291 clonei->vecsize_mangle = 'b';
50292 ret = 1;
50294 else
50296 clonei->vecsize_mangle = "bcde"[num];
50297 ret = 4;
50299 clonei->mask_mode = VOIDmode;
50300 switch (clonei->vecsize_mangle)
50302 case 'b':
50303 clonei->vecsize_int = 128;
50304 clonei->vecsize_float = 128;
50305 break;
50306 case 'c':
50307 clonei->vecsize_int = 128;
50308 clonei->vecsize_float = 256;
50309 break;
50310 case 'd':
50311 clonei->vecsize_int = 256;
50312 clonei->vecsize_float = 256;
50313 break;
50314 case 'e':
50315 clonei->vecsize_int = 512;
50316 clonei->vecsize_float = 512;
50317 if (TYPE_MODE (base_type) == QImode)
50318 clonei->mask_mode = DImode;
50319 else
50320 clonei->mask_mode = SImode;
50321 break;
50323 if (clonei->simdlen == 0)
50325 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50326 clonei->simdlen = clonei->vecsize_int;
50327 else
50328 clonei->simdlen = clonei->vecsize_float;
50329 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50331 else if (clonei->simdlen > 16)
 50333 /* For compatibility with ICC, use the same upper bounds
 50334 for simdlen. In particular, for CTYPE below, use the return type,
 50335 unless the function returns void, in which case use the characteristic
 50336 type. If it is possible for the given SIMDLEN to pass a CTYPE value
 50337 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
 50338 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
 50339 emit the corresponding clone. */
50340 tree ctype = ret_type;
50341 if (TREE_CODE (ret_type) == VOID_TYPE)
50342 ctype = base_type;
50343 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50344 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50345 cnt /= clonei->vecsize_int;
50346 else
50347 cnt /= clonei->vecsize_float;
50348 if (cnt > (TARGET_64BIT ? 16 : 8))
50350 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50351 "unsupported simdlen %d", clonei->simdlen);
50352 return 0;
50355 return ret;
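 /* Worked example of the ICC-compatible bound (assuming an AVX-512 clone,
    mangle 'e', of a function returning double): an explicit simdlen of 32
    gives cnt = 64 * 32 / 512 = 4 <= 16, so the clone is emitted for 64-bit
    code, while simdlen 256 gives cnt = 32 > 16 and is rejected with the
    warning above.  */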
50358 /* Add target attribute to SIMD clone NODE if needed. */
50360 static void
50361 ix86_simd_clone_adjust (struct cgraph_node *node)
50363 const char *str = NULL;
50364 gcc_assert (node->decl == cfun->decl);
50365 switch (node->simdclone->vecsize_mangle)
50367 case 'b':
50368 if (!TARGET_SSE2)
50369 str = "sse2";
50370 break;
50371 case 'c':
50372 if (!TARGET_AVX)
50373 str = "avx";
50374 break;
50375 case 'd':
50376 if (!TARGET_AVX2)
50377 str = "avx2";
50378 break;
50379 case 'e':
50380 if (!TARGET_AVX512F)
50381 str = "avx512f";
50382 break;
50383 default:
50384 gcc_unreachable ();
50386 if (str == NULL)
50387 return;
50388 push_cfun (NULL);
50389 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50390 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50391 gcc_assert (ok);
50392 pop_cfun ();
50393 ix86_reset_previous_fndecl ();
50394 ix86_set_current_function (node->decl);
 50397 /* If SIMD clone NODE can't be used in a vectorized loop
 50398 in the current function, return -1, otherwise return the badness of using it
 50399 (0 if it is the most desirable from the vecsize_mangle point of view, 1 if
 50400 slightly less desirable, etc.). */
50402 static int
50403 ix86_simd_clone_usable (struct cgraph_node *node)
50405 switch (node->simdclone->vecsize_mangle)
50407 case 'b':
50408 if (!TARGET_SSE2)
50409 return -1;
50410 if (!TARGET_AVX)
50411 return 0;
50412 return TARGET_AVX2 ? 2 : 1;
50413 case 'c':
50414 if (!TARGET_AVX)
50415 return -1;
50416 return TARGET_AVX2 ? 1 : 0;
50417 case 'd':
50418 if (!TARGET_AVX2)
50419 return -1;
50420 return 0;
50421 case 'e':
50422 if (!TARGET_AVX512F)
50423 return -1;
50424 return 0;
50425 default:
50426 gcc_unreachable ();
 50430 /* This function adjusts the unroll factor based on
 50431 the hardware capabilities. For example, bdver3 has
 50432 a loop buffer which makes unrolling of smaller
 50433 loops less important. This function decides the
 50434 unroll factor using the number of memory references
 50435 (the value 32 is used) as a heuristic. */
50437 static unsigned
50438 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50440 basic_block *bbs;
50441 rtx_insn *insn;
50442 unsigned i;
50443 unsigned mem_count = 0;
50445 if (!TARGET_ADJUST_UNROLL)
50446 return nunroll;
50448 /* Count the number of memory references within the loop body.
50449 This value determines the unrolling factor for bdver3 and bdver4
50450 architectures. */
50451 subrtx_iterator::array_type array;
50452 bbs = get_loop_body (loop);
50453 for (i = 0; i < loop->num_nodes; i++)
50454 FOR_BB_INSNS (bbs[i], insn)
50455 if (NONDEBUG_INSN_P (insn))
50456 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50457 if (const_rtx x = *iter)
50458 if (MEM_P (x))
50460 machine_mode mode = GET_MODE (x);
50461 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50462 if (n_words > 4)
50463 mem_count += 2;
50464 else
50465 mem_count += 1;
50467 free (bbs);
 50469 if (mem_count && mem_count <= 32)
 50470 return 32 / mem_count;
50472 return nunroll;
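 /* For example, a bdver3/bdver4 loop body containing eight word-sized memory
    references gets mem_count == 8 and an unroll factor of 32 / 8 = 4;
    references wider than four words are counted twice.  */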
50476 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50478 static bool
50479 ix86_float_exceptions_rounding_supported_p (void)
50481 /* For x87 floating point with standard excess precision handling,
50482 there is no adddf3 pattern (since x87 floating point only has
50483 XFmode operations) so the default hook implementation gets this
50484 wrong. */
50485 return TARGET_80387 || TARGET_SSE_MATH;
50488 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50490 static void
50491 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50493 if (!TARGET_80387 && !TARGET_SSE_MATH)
50494 return;
50495 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50496 if (TARGET_80387)
50498 tree fenv_index_type = build_index_type (size_int (6));
50499 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50500 tree fenv_var = create_tmp_var_raw (fenv_type);
50501 TREE_ADDRESSABLE (fenv_var) = 1;
50502 tree fenv_ptr = build_pointer_type (fenv_type);
50503 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50504 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50505 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50506 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50507 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50508 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50509 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50510 tree hold_fnclex = build_call_expr (fnclex, 0);
50511 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50512 NULL_TREE, NULL_TREE);
50513 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50514 hold_fnclex);
50515 *clear = build_call_expr (fnclex, 0);
50516 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50517 tree fnstsw_call = build_call_expr (fnstsw, 0);
50518 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50519 sw_var, fnstsw_call);
50520 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50521 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50522 exceptions_var, exceptions_x87);
50523 *update = build2 (COMPOUND_EXPR, integer_type_node,
50524 sw_mod, update_mod);
50525 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50526 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50528 if (TARGET_SSE_MATH)
50530 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50531 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50532 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50533 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50534 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50535 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50536 mxcsr_orig_var, stmxcsr_hold_call);
50537 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50538 mxcsr_orig_var,
50539 build_int_cst (unsigned_type_node, 0x1f80));
50540 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50541 build_int_cst (unsigned_type_node, 0xffffffc0));
50542 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50543 mxcsr_mod_var, hold_mod_val);
50544 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50545 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50546 hold_assign_orig, hold_assign_mod);
50547 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50548 ldmxcsr_hold_call);
50549 if (*hold)
50550 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50551 else
50552 *hold = hold_all;
50553 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50554 if (*clear)
50555 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50556 ldmxcsr_clear_call);
50557 else
50558 *clear = ldmxcsr_clear_call;
50559 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50560 tree exceptions_sse = fold_convert (integer_type_node,
50561 stxmcsr_update_call);
50562 if (*update)
50564 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50565 exceptions_var, exceptions_sse);
50566 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50567 exceptions_var, exceptions_mod);
50568 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50569 exceptions_assign);
50571 else
50572 *update = build2 (MODIFY_EXPR, integer_type_node,
50573 exceptions_var, exceptions_sse);
50574 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50575 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50576 ldmxcsr_update_call);
50578 tree atomic_feraiseexcept
50579 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50580 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50581 1, exceptions_var);
50582 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50583 atomic_feraiseexcept_call);
50586 /* Return mode to be used for bounds or VOIDmode
50587 if bounds are not supported. */
50589 static machine_mode
50590 ix86_mpx_bound_mode ()
 50592 /* Do not support the pointer checker if MPX
 50593 is not enabled. */
50594 if (!TARGET_MPX)
50596 if (flag_check_pointer_bounds)
50597 warning (0, "Pointer Checker requires MPX support on this target."
50598 " Use -mmpx options to enable MPX.");
50599 return VOIDmode;
50602 return BNDmode;
50605 /* Return constant used to statically initialize constant bounds.
50607 This function is used to create special bound values. For now
50608 only INIT bounds and NONE bounds are expected. More special
50609 values may be added later. */
50611 static tree
50612 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
50614 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
50615 : build_zero_cst (pointer_sized_int_node);
50616 tree high = ub ? build_zero_cst (pointer_sized_int_node)
50617 : build_minus_one_cst (pointer_sized_int_node);
50619 /* This function is supposed to be used to create INIT and
50620 NONE bounds only. */
50621 gcc_assert ((lb == 0 && ub == -1)
50622 || (lb == -1 && ub == 0));
50624 return build_complex (NULL, low, high);
50627 /* Generate a list of statements STMTS to initialize pointer bounds
50628 variable VAR with bounds LB and UB. Return the number of generated
50629 statements. */
50631 static int
50632 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
50634 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
50635 tree lhs, modify, var_p;
50637 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
50638 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
50640 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
50641 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
50642 append_to_statement_list (modify, stmts);
50644 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
50645 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
50646 TYPE_SIZE_UNIT (pointer_sized_int_node)));
50647 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
50648 append_to_statement_list (modify, stmts);
50650 return 2;
50653 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
 50654 /* For i386, a common symbol is local only for non-PIE binaries. For
 50655 x86-64, a common symbol is local only for non-PIE binaries or when the
 50656 linker supports copy relocations in PIE binaries. */
50658 static bool
50659 ix86_binds_local_p (const_tree exp)
50661 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
50662 (!flag_pic
50663 || (TARGET_64BIT
50664 && HAVE_LD_PIE_COPYRELOC != 0)));
50666 #endif
50668 /* If MEM is in the form of [base+offset], extract the two parts
50669 of address and set to BASE and OFFSET, otherwise return false. */
50671 static bool
50672 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
50674 rtx addr;
50676 gcc_assert (MEM_P (mem));
50678 addr = XEXP (mem, 0);
50680 if (GET_CODE (addr) == CONST)
50681 addr = XEXP (addr, 0);
50683 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
50685 *base = addr;
50686 *offset = const0_rtx;
50687 return true;
50690 if (GET_CODE (addr) == PLUS
50691 && (REG_P (XEXP (addr, 0))
50692 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
50693 && CONST_INT_P (XEXP (addr, 1)))
50695 *base = XEXP (addr, 0);
50696 *offset = XEXP (addr, 1);
50697 return true;
50700 return false;
50703 /* Given OPERANDS of consecutive load/store, check if we can merge
50704 them into move multiple. LOAD is true if they are load instructions.
50705 MODE is the mode of memory operands. */
50707 bool
50708 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
50709 machine_mode mode)
50711 HOST_WIDE_INT offval_1, offval_2, msize;
50712 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
50714 if (load)
50716 mem_1 = operands[1];
50717 mem_2 = operands[3];
50718 reg_1 = operands[0];
50719 reg_2 = operands[2];
50721 else
50723 mem_1 = operands[0];
50724 mem_2 = operands[2];
50725 reg_1 = operands[1];
50726 reg_2 = operands[3];
50729 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
50731 if (REGNO (reg_1) != REGNO (reg_2))
50732 return false;
50734 /* Check if the addresses are in the form of [base+offset]. */
50735 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
50736 return false;
50737 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
50738 return false;
50740 /* Check if the bases are the same. */
50741 if (!rtx_equal_p (base_1, base_2))
50742 return false;
50744 offval_1 = INTVAL (offset_1);
50745 offval_2 = INTVAL (offset_2);
50746 msize = GET_MODE_SIZE (mode);
50747 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
50748 if (offval_1 + msize != offval_2)
50749 return false;
50751 return true;
50754 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
50756 static bool
50757 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
50758 optimization_type opt_type)
50760 switch (op)
50762 case asin_optab:
50763 case acos_optab:
50764 case log1p_optab:
50765 case exp_optab:
50766 case exp10_optab:
50767 case exp2_optab:
50768 case expm1_optab:
50769 case ldexp_optab:
50770 case scalb_optab:
50771 case round_optab:
50772 return opt_type == OPTIMIZE_FOR_SPEED;
50774 case rint_optab:
50775 if (SSE_FLOAT_MODE_P (mode1)
50776 && TARGET_SSE_MATH
50777 && !flag_trapping_math
50778 && !TARGET_SSE4_1)
50779 return opt_type == OPTIMIZE_FOR_SPEED;
50780 return true;
50782 case floor_optab:
50783 case ceil_optab:
50784 case btrunc_optab:
50785 if (SSE_FLOAT_MODE_P (mode1)
50786 && TARGET_SSE_MATH
50787 && !flag_trapping_math
50788 && TARGET_SSE4_1)
50789 return true;
50790 return opt_type == OPTIMIZE_FOR_SPEED;
50792 case rsqrt_optab:
50793 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
50795 default:
50796 return true;
50800 /* Address space support.
50802 This is not "far pointers" in the 16-bit sense, but an easy way
50803 to use %fs and %gs segment prefixes. Therefore:
50805 (a) All address spaces have the same modes,
 50806 (b) All address spaces have the same address forms,
50807 (c) While %fs and %gs are technically subsets of the generic
50808 address space, they are probably not subsets of each other.
50809 (d) Since we have no access to the segment base register values
50810 without resorting to a system call, we cannot convert a
50811 non-default address space to a default address space.
50812 Therefore we do not claim %fs or %gs are subsets of generic.
50814 Therefore we can (mostly) use the default hooks. */
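 /* As a sketch of the C-level usage built on these hooks (illustrative only),
    the %fs/%gs spaces are exposed as the __seg_fs and __seg_gs qualifiers:

        int __seg_gs *p = (int __seg_gs *) 16;
        int v = *p;     -- a load emitted with a %gs segment prefix

    and, per ix86_addr_space_zero_address_valid below, address 0 within such
    a space is still treated as a valid address.  */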
50816 /* All use of segmentation is assumed to make address 0 valid. */
50818 static bool
50819 ix86_addr_space_zero_address_valid (addr_space_t as)
50821 return as != ADDR_SPACE_GENERIC;
50824 static void
50825 ix86_init_libfuncs (void)
50827 if (TARGET_64BIT)
50829 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
50830 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
50832 else
50834 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
50835 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
50838 #if TARGET_MACHO
50839 darwin_rename_builtins ();
50840 #endif
50843 /* Generate call to __divmoddi4. */
50845 static void
50846 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
50847 rtx op0, rtx op1,
50848 rtx *quot_p, rtx *rem_p)
50850 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
50852 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
50853 mode,
50854 op0, GET_MODE (op0),
50855 op1, GET_MODE (op1),
50856 XEXP (rem, 0), Pmode);
50857 *quot_p = quot;
50858 *rem_p = rem;
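 /* For reference (an assumption about the libgcc interface, roughly):
        long long __divmoddi4 (long long num, long long den, long long *rem);
    the quotient is the return value and the remainder is stored through the
    pointer, which is why a stack slot is passed by address above.  */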
50861 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
50862 FPU, assume that the fpcw is set to extended precision; when using
50863 only SSE, rounding is correct; when using both SSE and the FPU,
50864 the rounding precision is indeterminate, since either may be chosen
50865 apparently at random. */
50867 static enum flt_eval_method
50868 ix86_excess_precision (enum excess_precision_type type)
50870 switch (type)
50872 case EXCESS_PRECISION_TYPE_FAST:
50873 /* The fastest type to promote to will always be the native type,
50874 whether that occurs with implicit excess precision or
50875 otherwise. */
50876 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
50877 case EXCESS_PRECISION_TYPE_STANDARD:
50878 case EXCESS_PRECISION_TYPE_IMPLICIT:
50879 /* Otherwise, the excess precision we want when we are
50880 in a standards compliant mode, and the implicit precision we
50881 provide would be identical were it not for the unpredictable
50882 cases. */
50883 if (!TARGET_80387)
50884 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
50885 else if (!TARGET_MIX_SSE_I387)
50887 if (!TARGET_SSE_MATH)
50888 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
50889 else if (TARGET_SSE2)
50890 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
50893 /* If we are in standards compliant mode, but we know we will
50894 calculate in unpredictable precision, return
50895 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
50896 excess precision if the target can't guarantee it will honor
50897 it. */
50898 return (type == EXCESS_PRECISION_TYPE_STANDARD
50899 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
50900 : FLT_EVAL_METHOD_UNPREDICTABLE);
50901 default:
50902 gcc_unreachable ();
50905 return FLT_EVAL_METHOD_UNPREDICTABLE;
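 /* Concretely (a summary of the cases above, not new policy): pure x87 math
    in a standards-conforming mode yields FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE
    (FLT_EVAL_METHOD == 2 in <float.h>), SSE2 math yields
    FLT_EVAL_METHOD_PROMOTE_TO_FLOAT (0), while mixing x87 and SSE gives
    FLT_EVAL_METHOD_UNPREDICTABLE for implicit excess precision.  */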
 50908 /* Implement PUSH_ROUNDING. On the 386, we have a pushw instruction that
 50909 decrements by exactly 2 no matter what the position was; there is no pushb.
 50911 But as the CIE data alignment factor on this arch is -4 for 32-bit targets
 50912 and -8 for 64-bit targets, we need to make sure all stack pointer adjustments
 50913 are a multiple of 4 for 32-bit targets and 8 for 64-bit targets. */
50915 poly_int64
50916 ix86_push_rounding (poly_int64 bytes)
50918 return ROUND_UP (bytes, UNITS_PER_WORD);
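 /* E.g. pushing a 2-byte HImode value is accounted as a 4-byte stack
    adjustment for 32-bit targets (UNITS_PER_WORD == 4) and as an 8-byte
    adjustment for 64-bit targets.  */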
50921 /* Target-specific selftests. */
50923 #if CHECKING_P
50925 namespace selftest {
50927 /* Verify that hard regs are dumped as expected (in compact mode). */
50929 static void
50930 ix86_test_dumping_hard_regs ()
50932 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
50933 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
50936 /* Test dumping an insn with repeated references to the same SCRATCH,
50937 to verify the rtx_reuse code. */
50939 static void
50940 ix86_test_dumping_memory_blockage ()
50942 set_new_first_and_last_insn (NULL, NULL);
50944 rtx pat = gen_memory_blockage ();
50945 rtx_reuse_manager r;
50946 r.preprocess (pat);
 50948 /* Verify that the repeated references to the SCRATCH show use of
 50949 reuse IDs. The first should be prefixed with a reuse ID,
50950 and the second should be dumped as a "reuse_rtx" of that ID.
50951 The expected string assumes Pmode == DImode. */
50952 if (Pmode == DImode)
50953 ASSERT_RTL_DUMP_EQ_WITH_REUSE
50954 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
50955 " (unspec:BLK [\n"
50956 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
50957 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
50960 /* Verify loading an RTL dump; specifically a dump of copying
50961 a param on x86_64 from a hard reg into the frame.
50962 This test is target-specific since the dump contains target-specific
50963 hard reg names. */
50965 static void
50966 ix86_test_loading_dump_fragment_1 ()
50968 rtl_dump_test t (SELFTEST_LOCATION,
50969 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
50971 rtx_insn *insn = get_insn_by_uid (1);
50973 /* The block structure and indentation here is purely for
50974 readability; it mirrors the structure of the rtx. */
50975 tree mem_expr;
50977 rtx pat = PATTERN (insn);
50978 ASSERT_EQ (SET, GET_CODE (pat));
50980 rtx dest = SET_DEST (pat);
50981 ASSERT_EQ (MEM, GET_CODE (dest));
50982 /* Verify the "/c" was parsed. */
50983 ASSERT_TRUE (RTX_FLAG (dest, call));
50984 ASSERT_EQ (SImode, GET_MODE (dest));
50986 rtx addr = XEXP (dest, 0);
50987 ASSERT_EQ (PLUS, GET_CODE (addr));
50988 ASSERT_EQ (DImode, GET_MODE (addr));
50990 rtx lhs = XEXP (addr, 0);
50991 /* Verify that the "frame" REG was consolidated. */
50992 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
50995 rtx rhs = XEXP (addr, 1);
50996 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
50997 ASSERT_EQ (-4, INTVAL (rhs));
51000 /* Verify the "[1 i+0 S4 A32]" was parsed. */
51001 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51002 /* "i" should have been handled by synthesizing a global int
51003 variable named "i". */
51004 mem_expr = MEM_EXPR (dest);
51005 ASSERT_NE (mem_expr, NULL);
51006 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51007 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51008 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51009 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51010 /* "+0". */
51011 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51012 ASSERT_EQ (0, MEM_OFFSET (dest));
51013 /* "S4". */
51014 ASSERT_EQ (4, MEM_SIZE (dest));
51015 /* "A32. */
51016 ASSERT_EQ (32, MEM_ALIGN (dest));
51019 rtx src = SET_SRC (pat);
51020 ASSERT_EQ (REG, GET_CODE (src));
51021 ASSERT_EQ (SImode, GET_MODE (src));
51022 ASSERT_EQ (5, REGNO (src));
51023 tree reg_expr = REG_EXPR (src);
51024 /* "i" here should point to the same var as for the MEM_EXPR. */
51025 ASSERT_EQ (reg_expr, mem_expr);
51030 /* Verify that the RTL loader copes with a call_insn dump.
51031 This test is target-specific since the dump contains a target-specific
51032 hard reg name. */
51034 static void
51035 ix86_test_loading_call_insn ()
 51037 /* The test dump includes register "xmm0", which requires TARGET_SSE
51038 to exist. */
51039 if (!TARGET_SSE)
51040 return;
51042 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51044 rtx_insn *insn = get_insns ();
51045 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51047 /* "/j". */
51048 ASSERT_TRUE (RTX_FLAG (insn, jump));
51050 rtx pat = PATTERN (insn);
51051 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51053 /* Verify REG_NOTES. */
51055 /* "(expr_list:REG_CALL_DECL". */
51056 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51057 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51058 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51060 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
51061 rtx_expr_list *note1 = note0->next ();
51062 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51064 ASSERT_EQ (NULL, note1->next ());
51067 /* Verify CALL_INSN_FUNCTION_USAGE. */
51069 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
51070 rtx_expr_list *usage
51071 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51072 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51073 ASSERT_EQ (DFmode, GET_MODE (usage));
51074 ASSERT_EQ (USE, GET_CODE (usage->element ()));
51075 ASSERT_EQ (NULL, usage->next ());
 51079 /* Verify that the RTL loader copes with a dump from print_rtx_function.
51080 This test is target-specific since the dump contains target-specific
51081 hard reg names. */
51083 static void
51084 ix86_test_loading_full_dump ()
51086 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51088 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51090 rtx_insn *insn_1 = get_insn_by_uid (1);
51091 ASSERT_EQ (NOTE, GET_CODE (insn_1));
51093 rtx_insn *insn_7 = get_insn_by_uid (7);
51094 ASSERT_EQ (INSN, GET_CODE (insn_7));
51095 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51097 rtx_insn *insn_15 = get_insn_by_uid (15);
51098 ASSERT_EQ (INSN, GET_CODE (insn_15));
51099 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51101 /* Verify crtl->return_rtx. */
51102 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51103 ASSERT_EQ (0, REGNO (crtl->return_rtx));
51104 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51107 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51108 In particular, verify that it correctly loads the 2nd operand.
51109 This test is target-specific since these are machine-specific
51110 operands (and enums). */
51112 static void
51113 ix86_test_loading_unspec ()
51115 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51117 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51119 ASSERT_TRUE (cfun);
51121 /* Test of an UNSPEC. */
51122 rtx_insn *insn = get_insns ();
51123 ASSERT_EQ (INSN, GET_CODE (insn));
51124 rtx set = single_set (insn);
51125 ASSERT_NE (NULL, set);
51126 rtx dst = SET_DEST (set);
51127 ASSERT_EQ (MEM, GET_CODE (dst));
51128 rtx src = SET_SRC (set);
51129 ASSERT_EQ (UNSPEC, GET_CODE (src));
51130 ASSERT_EQ (BLKmode, GET_MODE (src));
51131 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51133 rtx v0 = XVECEXP (src, 0, 0);
51135 /* Verify that the two uses of the first SCRATCH have pointer
51136 equality. */
51137 rtx scratch_a = XEXP (dst, 0);
51138 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51140 rtx scratch_b = XEXP (v0, 0);
51141 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51143 ASSERT_EQ (scratch_a, scratch_b);
51145 /* Verify that the two mems are thus treated as equal. */
51146 ASSERT_TRUE (rtx_equal_p (dst, v0));
 51148 /* Verify that the insn is recognized. */
 51149 ASSERT_NE (-1, recog_memoized (insn));
51151 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
51152 insn = NEXT_INSN (insn);
51153 ASSERT_EQ (INSN, GET_CODE (insn));
51155 set = single_set (insn);
51156 ASSERT_NE (NULL, set);
51158 src = SET_SRC (set);
51159 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51160 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51163 /* Run all target-specific selftests. */
51165 static void
51166 ix86_run_selftests (void)
51168 ix86_test_dumping_hard_regs ();
51169 ix86_test_dumping_memory_blockage ();
51171 /* Various tests of loading RTL dumps, here because they contain
51172 ix86-isms (e.g. names of hard regs). */
51173 ix86_test_loading_dump_fragment_1 ();
51174 ix86_test_loading_call_insn ();
51175 ix86_test_loading_full_dump ();
51176 ix86_test_loading_unspec ();
51179 } // namespace selftest
51181 #endif /* CHECKING_P */
51183 /* Initialize the GCC target structure. */
51184 #undef TARGET_RETURN_IN_MEMORY
51185 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51187 #undef TARGET_LEGITIMIZE_ADDRESS
51188 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51190 #undef TARGET_ATTRIBUTE_TABLE
51191 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51192 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51193 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51194 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51195 # undef TARGET_MERGE_DECL_ATTRIBUTES
51196 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51197 #endif
51199 #undef TARGET_COMP_TYPE_ATTRIBUTES
51200 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51202 #undef TARGET_INIT_BUILTINS
51203 #define TARGET_INIT_BUILTINS ix86_init_builtins
51204 #undef TARGET_BUILTIN_DECL
51205 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51206 #undef TARGET_EXPAND_BUILTIN
51207 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51209 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51210 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51211 ix86_builtin_vectorized_function
51213 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51214 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51216 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51217 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51219 #undef TARGET_BUILTIN_RECIPROCAL
51220 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51222 #undef TARGET_ASM_FUNCTION_EPILOGUE
51223 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51225 #undef TARGET_ENCODE_SECTION_INFO
51226 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51227 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51228 #else
51229 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51230 #endif
51232 #undef TARGET_ASM_OPEN_PAREN
51233 #define TARGET_ASM_OPEN_PAREN ""
51234 #undef TARGET_ASM_CLOSE_PAREN
51235 #define TARGET_ASM_CLOSE_PAREN ""
51237 #undef TARGET_ASM_BYTE_OP
51238 #define TARGET_ASM_BYTE_OP ASM_BYTE
51240 #undef TARGET_ASM_ALIGNED_HI_OP
51241 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51242 #undef TARGET_ASM_ALIGNED_SI_OP
51243 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51244 #ifdef ASM_QUAD
51245 #undef TARGET_ASM_ALIGNED_DI_OP
51246 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51247 #endif
51249 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51250 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51252 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51253 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51255 #undef TARGET_ASM_UNALIGNED_HI_OP
51256 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51257 #undef TARGET_ASM_UNALIGNED_SI_OP
51258 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51259 #undef TARGET_ASM_UNALIGNED_DI_OP
51260 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51262 #undef TARGET_PRINT_OPERAND
51263 #define TARGET_PRINT_OPERAND ix86_print_operand
51264 #undef TARGET_PRINT_OPERAND_ADDRESS
51265 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51266 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51267 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51268 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51269 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
51271 #undef TARGET_SCHED_INIT_GLOBAL
51272 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
51273 #undef TARGET_SCHED_ADJUST_COST
51274 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
51275 #undef TARGET_SCHED_ISSUE_RATE
51276 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
51277 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
51278 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
51279 ia32_multipass_dfa_lookahead
51280 #undef TARGET_SCHED_MACRO_FUSION_P
51281 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
51282 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
51283 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
51285 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
51286 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
51288 #undef TARGET_MEMMODEL_CHECK
51289 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
51291 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
51292 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
51294 #ifdef HAVE_AS_TLS
51295 #undef TARGET_HAVE_TLS
51296 #define TARGET_HAVE_TLS true
51297 #endif
51298 #undef TARGET_CANNOT_FORCE_CONST_MEM
51299 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
51300 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
51301 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
51303 #undef TARGET_DELEGITIMIZE_ADDRESS
51304 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
51306 #undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
51307 #define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
51309 #undef TARGET_MS_BITFIELD_LAYOUT_P
51310 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
51312 #if TARGET_MACHO
51313 #undef TARGET_BINDS_LOCAL_P
51314 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
51315 #else
51316 #undef TARGET_BINDS_LOCAL_P
51317 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
51318 #endif
51319 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51320 #undef TARGET_BINDS_LOCAL_P
51321 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
51322 #endif
51324 #undef TARGET_ASM_OUTPUT_MI_THUNK
51325 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
51326 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
51327 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
51329 #undef TARGET_ASM_FILE_START
51330 #define TARGET_ASM_FILE_START x86_file_start
51332 #undef TARGET_OPTION_OVERRIDE
51333 #define TARGET_OPTION_OVERRIDE ix86_option_override
51335 #undef TARGET_REGISTER_MOVE_COST
51336 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
51337 #undef TARGET_MEMORY_MOVE_COST
51338 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
51339 #undef TARGET_RTX_COSTS
51340 #define TARGET_RTX_COSTS ix86_rtx_costs
51341 #undef TARGET_ADDRESS_COST
51342 #define TARGET_ADDRESS_COST ix86_address_cost
51344 #undef TARGET_FLAGS_REGNUM
51345 #define TARGET_FLAGS_REGNUM FLAGS_REG
51346 #undef TARGET_FIXED_CONDITION_CODE_REGS
51347 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
51348 #undef TARGET_CC_MODES_COMPATIBLE
51349 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
51351 #undef TARGET_MACHINE_DEPENDENT_REORG
51352 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
51354 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
51355 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
51357 #undef TARGET_BUILD_BUILTIN_VA_LIST
51358 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
51360 #undef TARGET_FOLD_BUILTIN
51361 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
51363 #undef TARGET_GIMPLE_FOLD_BUILTIN
51364 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
51366 #undef TARGET_COMPARE_VERSION_PRIORITY
51367 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
51369 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
51370 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
51371 ix86_generate_version_dispatcher_body
51373 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
51374 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
51375 ix86_get_function_versions_dispatcher
51377 #undef TARGET_ENUM_VA_LIST_P
51378 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
51380 #undef TARGET_FN_ABI_VA_LIST
51381 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
51383 #undef TARGET_CANONICAL_VA_LIST_TYPE
51384 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
51386 #undef TARGET_EXPAND_BUILTIN_VA_START
51387 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
51389 #undef TARGET_MD_ASM_ADJUST
51390 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
51392 #undef TARGET_C_EXCESS_PRECISION
51393 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
51394 #undef TARGET_PROMOTE_PROTOTYPES
51395 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
51396 #undef TARGET_SETUP_INCOMING_VARARGS
51397 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
51398 #undef TARGET_MUST_PASS_IN_STACK
51399 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
51400 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
51401 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
51402 #undef TARGET_FUNCTION_ARG_ADVANCE
51403 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
51404 #undef TARGET_FUNCTION_ARG
51405 #define TARGET_FUNCTION_ARG ix86_function_arg
51406 #undef TARGET_INIT_PIC_REG
51407 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
51408 #undef TARGET_USE_PSEUDO_PIC_REG
51409 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
51410 #undef TARGET_FUNCTION_ARG_BOUNDARY
51411 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
51412 #undef TARGET_PASS_BY_REFERENCE
51413 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
51414 #undef TARGET_INTERNAL_ARG_POINTER
51415 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
51416 #undef TARGET_UPDATE_STACK_BOUNDARY
51417 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
51418 #undef TARGET_GET_DRAP_RTX
51419 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
51420 #undef TARGET_STRICT_ARGUMENT_NAMING
51421 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
51422 #undef TARGET_STATIC_CHAIN
51423 #define TARGET_STATIC_CHAIN ix86_static_chain
51424 #undef TARGET_TRAMPOLINE_INIT
51425 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
51426 #undef TARGET_RETURN_POPS_ARGS
51427 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
51429 #undef TARGET_WARN_FUNC_RETURN
51430 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
51432 #undef TARGET_LEGITIMATE_COMBINED_INSN
51433 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
51435 #undef TARGET_ASAN_SHADOW_OFFSET
51436 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
51438 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
51439 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
51441 #undef TARGET_SCALAR_MODE_SUPPORTED_P
51442 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
51444 #undef TARGET_VECTOR_MODE_SUPPORTED_P
51445 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
51447 #undef TARGET_C_MODE_FOR_SUFFIX
51448 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
51450 #ifdef HAVE_AS_TLS
51451 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
51452 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
51453 #endif
51455 #ifdef SUBTARGET_INSERT_ATTRIBUTES
51456 #undef TARGET_INSERT_ATTRIBUTES
51457 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
51458 #endif
51460 #undef TARGET_MANGLE_TYPE
51461 #define TARGET_MANGLE_TYPE ix86_mangle_type
51463 #undef TARGET_STACK_PROTECT_GUARD
51464 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
51466 #if !TARGET_MACHO
51467 #undef TARGET_STACK_PROTECT_FAIL
51468 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
51469 #endif
51471 #undef TARGET_FUNCTION_VALUE
51472 #define TARGET_FUNCTION_VALUE ix86_function_value
51474 #undef TARGET_FUNCTION_VALUE_REGNO_P
51475 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
51477 #undef TARGET_PROMOTE_FUNCTION_MODE
51478 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
51480 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
51481 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
51483 #undef TARGET_MEMBER_TYPE_FORCES_BLK
51484 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
51486 #undef TARGET_INSTANTIATE_DECLS
51487 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
51489 #undef TARGET_SECONDARY_RELOAD
51490 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
51491 #undef TARGET_SECONDARY_MEMORY_NEEDED
51492 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
51493 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
51494 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
51496 #undef TARGET_CLASS_MAX_NREGS
51497 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
51499 #undef TARGET_PREFERRED_RELOAD_CLASS
51500 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
51501 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
51502 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
51503 #undef TARGET_CLASS_LIKELY_SPILLED_P
51504 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
51506 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
51507 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
51508 ix86_builtin_vectorization_cost
51509 #undef TARGET_VECTORIZE_VEC_PERM_CONST
51510 #define TARGET_VECTORIZE_VEC_PERM_CONST ix86_vectorize_vec_perm_const
51511 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
51512 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
51513 ix86_preferred_simd_mode
51514 #undef TARGET_VECTORIZE_SPLIT_REDUCTION
51515 #define TARGET_VECTORIZE_SPLIT_REDUCTION \
51516 ix86_split_reduction
51517 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
51518 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
51519 ix86_autovectorize_vector_sizes
51520 #undef TARGET_VECTORIZE_GET_MASK_MODE
51521 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
51522 #undef TARGET_VECTORIZE_INIT_COST
51523 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
51524 #undef TARGET_VECTORIZE_ADD_STMT_COST
51525 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
51526 #undef TARGET_VECTORIZE_FINISH_COST
51527 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
51528 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
51529 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
51531 #undef TARGET_SET_CURRENT_FUNCTION
51532 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
51534 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
51535 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
51537 #undef TARGET_OPTION_SAVE
51538 #define TARGET_OPTION_SAVE ix86_function_specific_save
51540 #undef TARGET_OPTION_RESTORE
51541 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
51543 #undef TARGET_OPTION_POST_STREAM_IN
51544 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
51546 #undef TARGET_OPTION_PRINT
51547 #define TARGET_OPTION_PRINT ix86_function_specific_print
51549 #undef TARGET_OPTION_FUNCTION_VERSIONS
51550 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
51552 #undef TARGET_CAN_INLINE_P
51553 #define TARGET_CAN_INLINE_P ix86_can_inline_p
51555 #undef TARGET_LEGITIMATE_ADDRESS_P
51556 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
51558 #undef TARGET_REGISTER_PRIORITY
51559 #define TARGET_REGISTER_PRIORITY ix86_register_priority
51561 #undef TARGET_REGISTER_USAGE_LEVELING_P
51562 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
51564 #undef TARGET_LEGITIMATE_CONSTANT_P
51565 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
51567 #undef TARGET_COMPUTE_FRAME_LAYOUT
51568 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
51570 #undef TARGET_FRAME_POINTER_REQUIRED
51571 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
51573 #undef TARGET_CAN_ELIMINATE
51574 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
51576 #undef TARGET_EXTRA_LIVE_ON_ENTRY
51577 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
51579 #undef TARGET_ASM_CODE_END
51580 #define TARGET_ASM_CODE_END ix86_code_end
51582 #undef TARGET_CONDITIONAL_REGISTER_USAGE
51583 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
51585 #undef TARGET_CANONICALIZE_COMPARISON
51586 #define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison
51588 #undef TARGET_LOOP_UNROLL_ADJUST
51589 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
51591 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
51592 #undef TARGET_SPILL_CLASS
51593 #define TARGET_SPILL_CLASS ix86_spill_class
51595 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
51596 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
51597 ix86_simd_clone_compute_vecsize_and_simdlen
51599 #undef TARGET_SIMD_CLONE_ADJUST
51600 #define TARGET_SIMD_CLONE_ADJUST \
51601 ix86_simd_clone_adjust
51603 #undef TARGET_SIMD_CLONE_USABLE
51604 #define TARGET_SIMD_CLONE_USABLE \
51605 ix86_simd_clone_usable
51607 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
51608 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
51609 ix86_float_exceptions_rounding_supported_p
51611 #undef TARGET_MODE_EMIT
51612 #define TARGET_MODE_EMIT ix86_emit_mode_set
51614 #undef TARGET_MODE_NEEDED
51615 #define TARGET_MODE_NEEDED ix86_mode_needed
51617 #undef TARGET_MODE_AFTER
51618 #define TARGET_MODE_AFTER ix86_mode_after
51620 #undef TARGET_MODE_ENTRY
51621 #define TARGET_MODE_ENTRY ix86_mode_entry
51623 #undef TARGET_MODE_EXIT
51624 #define TARGET_MODE_EXIT ix86_mode_exit
51626 #undef TARGET_MODE_PRIORITY
51627 #define TARGET_MODE_PRIORITY ix86_mode_priority
51629 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
51630 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
51632 #undef TARGET_LOAD_BOUNDS_FOR_ARG
51633 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
51635 #undef TARGET_STORE_BOUNDS_FOR_ARG
51636 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
51638 #undef TARGET_LOAD_RETURNED_BOUNDS
51639 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
51641 #undef TARGET_STORE_RETURNED_BOUNDS
51642 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
51644 #undef TARGET_CHKP_BOUND_MODE
51645 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
51647 #undef TARGET_BUILTIN_CHKP_FUNCTION
51648 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
51650 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
51651 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
51653 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
51654 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
51656 #undef TARGET_CHKP_INITIALIZE_BOUNDS
51657 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
51659 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
51660 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
51662 #undef TARGET_OFFLOAD_OPTIONS
51663 #define TARGET_OFFLOAD_OPTIONS \
51664 ix86_offload_options
51666 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
51667 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
51669 #undef TARGET_OPTAB_SUPPORTED_P
51670 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
51672 #undef TARGET_HARD_REGNO_SCRATCH_OK
51673 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
51675 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
51676 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
51678 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
51679 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
51681 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
51682 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
51684 #undef TARGET_INIT_LIBFUNCS
51685 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
51687 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
51688 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
51690 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
51691 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
51693 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
51694 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
51696 #undef TARGET_HARD_REGNO_NREGS
51697 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
51698 #undef TARGET_HARD_REGNO_MODE_OK
51699 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
51701 #undef TARGET_MODES_TIEABLE_P
51702 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
51704 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
51705 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
51706 ix86_hard_regno_call_part_clobbered
51708 #undef TARGET_CAN_CHANGE_MODE_CLASS
51709 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
51711 #undef TARGET_STATIC_RTX_ALIGNMENT
51712 #define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
51713 #undef TARGET_CONSTANT_ALIGNMENT
51714 #define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment
51716 #undef TARGET_EMPTY_RECORD_P
51717 #define TARGET_EMPTY_RECORD_P ix86_is_empty_record
51719 #undef TARGET_WARN_PARAMETER_PASSING_ABI
51720 #define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi
51722 #if CHECKING_P
51723 #undef TARGET_RUN_TARGET_SELFTESTS
51724 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
51725 #endif /* #if CHECKING_P */
51727 struct gcc_target targetm = TARGET_INITIALIZER;
51729 #include "gt-i386.h"