i386: Set cfun->machine->max_used_stack_alignment if needed
gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2018 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "memmodel.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "cgraph.h"
41 #include "diagnostic.h"
42 #include "cfgbuild.h"
43 #include "alias.h"
44 #include "fold-const.h"
45 #include "attribs.h"
46 #include "calls.h"
47 #include "stor-layout.h"
48 #include "varasm.h"
49 #include "output.h"
50 #include "insn-attr.h"
51 #include "flags.h"
52 #include "except.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "cfgrtl.h"
56 #include "common/common-target.h"
57 #include "langhooks.h"
58 #include "reload.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "tm-constrs.h"
62 #include "params.h"
63 #include "cselib.h"
64 #include "sched-int.h"
65 #include "opts.h"
66 #include "tree-pass.h"
67 #include "context.h"
68 #include "pass_manager.h"
69 #include "target-globals.h"
70 #include "gimple-iterator.h"
71 #include "tree-vectorizer.h"
72 #include "shrink-wrap.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tree-iterator.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88 #include "symbol-summary.h"
89 #include "ipa-prop.h"
90 #include "ipa-fnsummary.h"
91 #include "wide-int-bitmask.h"
92 #include "tree-vector-builder.h"
94 /* This file should be included last. */
95 #include "target-def.h"
97 #include "x86-tune-costs.h"
99 static rtx legitimize_dllimport_symbol (rtx, bool);
100 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
101 static rtx legitimize_pe_coff_symbol (rtx, bool);
102 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
103 static bool ix86_save_reg (unsigned int, bool, bool);
104 static bool ix86_function_naked (const_tree);
105 static bool ix86_notrack_prefixed_insn_p (rtx);
106 static void ix86_emit_restore_reg_using_pop (rtx);
109 #ifndef CHECK_STACK_LIMIT
110 #define CHECK_STACK_LIMIT (-1)
111 #endif
113 /* Return index of given mode in mult and division cost tables. */
114 #define MODE_INDEX(mode) \
115 ((mode) == QImode ? 0 \
116 : (mode) == HImode ? 1 \
117 : (mode) == SImode ? 2 \
118 : (mode) == DImode ? 3 \
119 : 4)
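/* A usage sketch (the consumer named here is an assumption about code
   elsewhere in this file, not shown in this excerpt): MODE_INDEX (SImode)
   yields 2, so a per-mode cost lookup would look something like
   ix86_cost->mult_init[MODE_INDEX (mode)] against one of the five-entry
   cost tables in struct processor_costs.  */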
122 /* Set by -mtune. */
123 const struct processor_costs *ix86_tune_cost = NULL;
125 /* Set by -mtune or -Os. */
126 const struct processor_costs *ix86_cost = NULL;
128 /* Processor feature/optimization bitmasks. */
129 #define m_386 (HOST_WIDE_INT_1U<<PROCESSOR_I386)
130 #define m_486 (HOST_WIDE_INT_1U<<PROCESSOR_I486)
131 #define m_PENT (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM)
132 #define m_LAKEMONT (HOST_WIDE_INT_1U<<PROCESSOR_LAKEMONT)
133 #define m_PPRO (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUMPRO)
134 #define m_PENT4 (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM4)
135 #define m_NOCONA (HOST_WIDE_INT_1U<<PROCESSOR_NOCONA)
136 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
137 #define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2)
138 #define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM)
139 #define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE)
140 #define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL)
141 #define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL)
142 #define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT)
143 #define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL)
144 #define m_KNM (HOST_WIDE_INT_1U<<PROCESSOR_KNM)
145 #define m_SKYLAKE (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE)
146 #define m_SKYLAKE_AVX512 (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512)
147 #define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE)
148 #define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT)
149 #define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)
150 #define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \
151 | m_ICELAKE_CLIENT | m_ICELAKE_SERVER)
152 #define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
153 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2)
154 #define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT)
155 #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
156 #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
157 #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
159 #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
160 #define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
161 #define m_K6_GEODE (m_K6 | m_GEODE)
162 #define m_K8 (HOST_WIDE_INT_1U<<PROCESSOR_K8)
163 #define m_ATHLON (HOST_WIDE_INT_1U<<PROCESSOR_ATHLON)
164 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
165 #define m_AMDFAM10 (HOST_WIDE_INT_1U<<PROCESSOR_AMDFAM10)
166 #define m_BDVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER1)
167 #define m_BDVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER2)
168 #define m_BDVER3 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER3)
169 #define m_BDVER4 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER4)
170 #define m_ZNVER1 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER1)
171 #define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1)
172 #define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2)
173 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
174 #define m_BTVER (m_BTVER1 | m_BTVER2)
175 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
176 | m_ZNVER1)
178 #define m_GENERIC (HOST_WIDE_INT_1U<<PROCESSOR_GENERIC)
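/* Illustration of how these masks are consumed: each DEF_TUNE entry in
   x86-tune.def (included just below) takes one of these bitmasks, or a
   combination such as m_CORE_ALL | m_GENERIC, as its SELECTOR argument
   to say which processors the tuning applies to.  */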
180 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
181 #undef DEF_TUNE
182 #define DEF_TUNE(tune, name, selector) name,
183 #include "x86-tune.def"
184 #undef DEF_TUNE
187 /* Feature tests against the various tunings. */
188 unsigned char ix86_tune_features[X86_TUNE_LAST];
190 /* Feature tests against the various tunings used to create ix86_tune_features
191 based on the processor mask. */
192 static unsigned HOST_WIDE_INT initial_ix86_tune_features[X86_TUNE_LAST] = {
193 #undef DEF_TUNE
194 #define DEF_TUNE(tune, name, selector) selector,
195 #include "x86-tune.def"
196 #undef DEF_TUNE
199 /* Feature tests against the various architecture variations. */
200 unsigned char ix86_arch_features[X86_ARCH_LAST];
202 /* Feature tests against the various architecture variations, used to create
203 ix86_arch_features based on the processor mask. */
204 static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = {
205 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
206 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
208 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
209 ~m_386,
211 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
212 ~(m_386 | m_486),
214 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
215 ~m_386,
217 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
218 ~m_386,
221 /* In case the average insn count for single function invocation is
222 lower than this constant, emit fast (but longer) prologue and
223 epilogue code. */
224 #define FAST_PROLOGUE_INSN_COUNT 20
226 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
227 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
228 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
229 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
231 /* Array of the smallest class containing reg number REGNO, indexed by
232 REGNO. Used by REGNO_REG_CLASS in i386.h. */
234 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
236 /* ax, dx, cx, bx */
237 AREG, DREG, CREG, BREG,
238 /* si, di, bp, sp */
239 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
240 /* FP registers */
241 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
242 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
243 /* arg pointer */
244 NON_Q_REGS,
245 /* flags, fpsr, fpcr, frame */
246 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
247 /* SSE registers */
248 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
249 SSE_REGS, SSE_REGS,
250 /* MMX registers */
251 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
252 MMX_REGS, MMX_REGS,
253 /* REX registers */
254 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
255 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
256 /* SSE REX registers */
257 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
258 SSE_REGS, SSE_REGS,
259 /* AVX-512 SSE registers */
260 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
261 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
262 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
263 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
264 /* Mask registers. */
265 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
266 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
269 /* The "default" register map used in 32bit mode. */
271 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
273 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
274 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
276 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
277 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
278 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
279 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
280 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
281 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
282 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
283 101, 102, 103, 104, /* bound registers */
286 /* The "default" register map used in 64bit mode. */
288 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
290 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
291 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
292 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
293 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
294 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
295 8,9,10,11,12,13,14,15, /* extended integer registers */
296 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
297 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
298 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
299 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
300 126, 127, 128, 129, /* bound registers */
303 /* Define the register numbers to be used in Dwarf debugging information.
304 The SVR4 reference port C compiler uses the following register numbers
305 in its Dwarf output code:
306 0 for %eax (gcc regno = 0)
307 1 for %ecx (gcc regno = 2)
308 2 for %edx (gcc regno = 1)
309 3 for %ebx (gcc regno = 3)
310 4 for %esp (gcc regno = 7)
311 5 for %ebp (gcc regno = 6)
312 6 for %esi (gcc regno = 4)
313 7 for %edi (gcc regno = 5)
314 The following three DWARF register numbers are never generated by
315 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
 316 believed these numbers to have these meanings.
317 8 for %eip (no gcc equivalent)
318 9 for %eflags (gcc regno = 17)
319 10 for %trapno (no gcc equivalent)
320 It is not at all clear how we should number the FP stack registers
321 for the x86 architecture. If the version of SDB on x86/svr4 were
322 a bit less brain dead with respect to floating-point then we would
323 have a precedent to follow with respect to DWARF register numbers
324 for x86 FP registers, but the SDB on x86/svr4 was so completely
325 broken with respect to FP registers that it is hardly worth thinking
326 of it as something to strive for compatibility with.
327 The version of x86/svr4 SDB I had does (partially)
328 seem to believe that DWARF register number 11 is associated with
329 the x86 register %st(0), but that's about all. Higher DWARF
330 register numbers don't seem to be associated with anything in
331 particular, and even for DWARF regno 11, SDB only seemed to under-
332 stand that it should say that a variable lives in %st(0) (when
333 asked via an `=' command) if we said it was in DWARF regno 11,
334 but SDB still printed garbage when asked for the value of the
335 variable in question (via a `/' command).
336 (Also note that the labels SDB printed for various FP stack regs
337 when doing an `x' command were all wrong.)
338 Note that these problems generally don't affect the native SVR4
339 C compiler because it doesn't allow the use of -O with -g and
340 because when it is *not* optimizing, it allocates a memory
341 location for each floating-point variable, and the memory
342 location is what gets described in the DWARF AT_location
343 attribute for the variable in question.
344 Regardless of the severe mental illness of the x86/svr4 SDB, we
345 do something sensible here and we use the following DWARF
346 register numbers. Note that these are all stack-top-relative
347 numbers.
348 11 for %st(0) (gcc regno = 8)
349 12 for %st(1) (gcc regno = 9)
350 13 for %st(2) (gcc regno = 10)
351 14 for %st(3) (gcc regno = 11)
352 15 for %st(4) (gcc regno = 12)
353 16 for %st(5) (gcc regno = 13)
354 17 for %st(6) (gcc regno = 14)
355 18 for %st(7) (gcc regno = 15)
357 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
359 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
360 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
361 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
362 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
363 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
364 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
365 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
366 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
367 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
368 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
369 101, 102, 103, 104, /* bound registers */
372 /* Define parameter passing and return registers. */
374 static int const x86_64_int_parameter_registers[6] =
376 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
379 static int const x86_64_ms_abi_int_parameter_registers[4] =
381 CX_REG, DX_REG, R8_REG, R9_REG
384 static int const x86_64_int_return_registers[4] =
386 AX_REG, DX_REG, DI_REG, SI_REG
389 /* Additional registers that are clobbered by SYSV calls. */
391 #define NUM_X86_64_MS_CLOBBERED_REGS 12
392 static int const x86_64_ms_sysv_extra_clobbered_registers
393 [NUM_X86_64_MS_CLOBBERED_REGS] =
395 SI_REG, DI_REG,
396 XMM6_REG, XMM7_REG,
397 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
398 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
401 enum xlogue_stub {
402 XLOGUE_STUB_SAVE,
403 XLOGUE_STUB_RESTORE,
404 XLOGUE_STUB_RESTORE_TAIL,
405 XLOGUE_STUB_SAVE_HFP,
406 XLOGUE_STUB_RESTORE_HFP,
407 XLOGUE_STUB_RESTORE_HFP_TAIL,
409 XLOGUE_STUB_COUNT
412 enum xlogue_stub_sets {
413 XLOGUE_SET_ALIGNED,
414 XLOGUE_SET_ALIGNED_PLUS_8,
415 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
416 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
418 XLOGUE_SET_COUNT
421 /* Register save/restore layout used by out-of-line stubs. */
422 class xlogue_layout {
423 public:
424 struct reginfo
426 unsigned regno;
427 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
428 rsi) to where each register is stored. */
431 unsigned get_nregs () const {return m_nregs;}
432 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
434 const reginfo &get_reginfo (unsigned reg) const
436 gcc_assert (reg < m_nregs);
437 return m_regs[reg];
440 static const char *get_stub_name (enum xlogue_stub stub,
441 unsigned n_extra_args);
443 /* Returns an rtx for the stub's symbol based upon
444 1.) the specified stub (save, restore or restore_ret) and
445 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
 446 3.) whether or not stack alignment is being performed. */
447 static rtx get_stub_rtx (enum xlogue_stub stub);
449 /* Returns the amount of stack space (including padding) that the stub
450 needs to store registers based upon data in the machine_function. */
451 HOST_WIDE_INT get_stack_space_used () const
453 const struct machine_function *m = cfun->machine;
454 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
456 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
457 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
460 /* Returns the offset for the base pointer used by the stub. */
461 HOST_WIDE_INT get_stub_ptr_offset () const
463 return STUB_INDEX_OFFSET + m_stack_align_off_in;
466 static const struct xlogue_layout &get_instance ();
467 static unsigned count_stub_managed_regs ();
468 static bool is_stub_managed_reg (unsigned regno, unsigned count);
470 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
471 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
472 static const unsigned MAX_REGS = 18;
473 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
474 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
475 static const unsigned STUB_NAME_MAX_LEN = 20;
476 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
477 static const unsigned REG_ORDER[MAX_REGS];
478 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
480 private:
481 xlogue_layout ();
482 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
483 xlogue_layout (const xlogue_layout &);
485 /* True if hard frame pointer is used. */
486 bool m_hfp;
 488 /* Max number of registers this layout manages. */
489 unsigned m_nregs;
491 /* Incoming offset from 16-byte alignment. */
492 HOST_WIDE_INT m_stack_align_off_in;
494 /* Register order and offsets. */
495 struct reginfo m_regs[MAX_REGS];
497 /* Lazy-inited cache of symbol names for stubs. */
498 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
499 [STUB_NAME_MAX_LEN];
501 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
504 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
505 "savms64",
506 "resms64",
507 "resms64x",
508 "savms64f",
509 "resms64f",
510 "resms64fx"
513 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
514 /* The below offset values are where each register is stored for the layout
515 relative to incoming stack pointer. The value of each m_regs[].offset will
516 be relative to the incoming base pointer (rax or rsi) used by the stub.
518 s_instances: 0 1 2 3
519 Offset: realigned or aligned + 8
520 Register aligned aligned + 8 aligned w/HFP w/HFP */
521 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
522 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
523 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
524 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
525 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
526 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
527 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
528 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
529 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
530 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
531 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
532 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
533 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
534 BP_REG, /* 0xc0 0xc8 N/A N/A */
535 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
536 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
537 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
538 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
541 /* Instantiate static const values. */
542 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
543 const unsigned xlogue_layout::MIN_REGS;
544 const unsigned xlogue_layout::MAX_REGS;
545 const unsigned xlogue_layout::MAX_EXTRA_REGS;
546 const unsigned xlogue_layout::VARIANT_COUNT;
547 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
549 /* Initialize xlogue_layout::s_stub_names to zero. */
550 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
551 [STUB_NAME_MAX_LEN];
553 /* Instantiates all xlogue_layout instances. */
554 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
555 xlogue_layout (0, false),
556 xlogue_layout (8, false),
557 xlogue_layout (0, true),
558 xlogue_layout (8, true)
561 /* Return an appropriate const instance of xlogue_layout based upon values
562 in cfun->machine and crtl. */
563 const struct xlogue_layout &
564 xlogue_layout::get_instance ()
566 enum xlogue_stub_sets stub_set;
567 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
569 if (stack_realign_fp)
570 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
571 else if (frame_pointer_needed)
572 stub_set = aligned_plus_8
573 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
574 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
575 else
576 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
578 return s_instances[stub_set];
581 /* Determine how many clobbered registers can be saved by the stub.
582 Returns the count of registers the stub will save and restore. */
583 unsigned
584 xlogue_layout::count_stub_managed_regs ()
586 bool hfp = frame_pointer_needed || stack_realign_fp;
587 unsigned i, count;
588 unsigned regno;
590 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
592 regno = REG_ORDER[i];
593 if (regno == BP_REG && hfp)
594 continue;
595 if (!ix86_save_reg (regno, false, false))
596 break;
597 ++count;
599 return count;
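/* Example (following the loop above): with a hard frame pointer in use,
   BP_REG is skipped, so if BX_REG and R12_REG must be saved but R13_REG
   need not be, the function returns MIN_REGS + 2, i.e. 14.  */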
602 /* Determine if register REGNO is a stub managed register given the
603 total COUNT of stub managed registers. */
604 bool
605 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
607 bool hfp = frame_pointer_needed || stack_realign_fp;
608 unsigned i;
610 for (i = 0; i < count; ++i)
612 gcc_assert (i < MAX_REGS);
613 if (REG_ORDER[i] == BP_REG && hfp)
614 ++count;
615 else if (REG_ORDER[i] == regno)
616 return true;
618 return false;
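/* Note on the BP_REG special case above: when a hard frame pointer is in
   use BP_REG is never stub-managed, so COUNT is extended by one to keep
   scanning the same number of genuinely managed entries in REG_ORDER.  */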
621 /* Constructor for xlogue_layout. */
622 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
623 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
624 m_stack_align_off_in (stack_align_off_in)
626 HOST_WIDE_INT offset = stack_align_off_in;
627 unsigned i, j;
629 for (i = j = 0; i < MAX_REGS; ++i)
631 unsigned regno = REG_ORDER[i];
633 if (regno == BP_REG && hfp)
634 continue;
635 if (SSE_REGNO_P (regno))
637 offset += 16;
638 /* Verify that SSE regs are always aligned. */
639 gcc_assert (!((stack_align_off_in + offset) & 15));
641 else
642 offset += 8;
644 m_regs[j].regno = regno;
645 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
647 gcc_assert (j == m_nregs);
650 const char *
651 xlogue_layout::get_stub_name (enum xlogue_stub stub,
652 unsigned n_extra_regs)
654 const int have_avx = TARGET_AVX;
655 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
657 /* Lazy init */
658 if (!*name)
660 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
661 (have_avx ? "avx" : "sse"),
662 STUB_BASE_NAMES[stub],
663 MIN_REGS + n_extra_regs);
664 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
667 return name;
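/* Sketch of the names produced by the format string above: with no extra
   registers the save stub is "__sse_savms64_12" (or "__avx_savms64_12"
   when AVX is enabled), and with two extra registers the tail-call
   restore stub would be "__avx_resms64x_14".  */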
670 /* Return rtx of a symbol ref for the entry point (based upon
671 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
673 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
675 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
676 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
677 gcc_assert (stub < XLOGUE_STUB_COUNT);
678 gcc_assert (crtl->stack_realign_finalized);
680 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
683 /* Define the structure for the machine field in struct function. */
685 struct GTY(()) stack_local_entry {
686 unsigned short mode;
687 unsigned short n;
688 rtx rtl;
689 struct stack_local_entry *next;
692 /* Which cpu are we scheduling for. */
693 enum attr_cpu ix86_schedule;
695 /* Which cpu are we optimizing for. */
696 enum processor_type ix86_tune;
698 /* Which instruction set architecture to use. */
699 enum processor_type ix86_arch;
701 /* True if processor has SSE prefetch instruction. */
702 unsigned char x86_prefetch_sse;
704 /* -mstackrealign option */
705 static const char ix86_force_align_arg_pointer_string[]
706 = "force_align_arg_pointer";
708 static rtx (*ix86_gen_leave) (void);
709 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
711 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
712 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
713 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
714 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
715 static rtx (*ix86_gen_clzero) (rtx);
716 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
717 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
718 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
719 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
720 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
721 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
723 /* Preferred alignment for stack boundary in bits. */
724 unsigned int ix86_preferred_stack_boundary;
726 /* Alignment for incoming stack boundary in bits specified at
727 command line. */
728 static unsigned int ix86_user_incoming_stack_boundary;
730 /* Default alignment for incoming stack boundary in bits. */
731 static unsigned int ix86_default_incoming_stack_boundary;
733 /* Alignment for incoming stack boundary in bits. */
734 unsigned int ix86_incoming_stack_boundary;
736 /* Calling abi specific va_list type nodes. */
737 static GTY(()) tree sysv_va_list_type_node;
738 static GTY(()) tree ms_va_list_type_node;
740 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
741 char internal_label_prefix[16];
742 int internal_label_prefix_len;
744 /* Fence to use after loop using movnt. */
745 tree x86_mfence;
 747 /* Register class used for passing a given 64bit part of the argument.
 748 These represent classes as documented by the PS ABI, with the exception
 749 of the SSESF and SSEDF classes, which are basically the SSE class, just that
 750 gcc will use SF or DFmode moves instead of DImode to avoid reformatting penalties.
752 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
753 whenever possible (upper half does contain padding). */
754 enum x86_64_reg_class
756 X86_64_NO_CLASS,
757 X86_64_INTEGER_CLASS,
758 X86_64_INTEGERSI_CLASS,
759 X86_64_SSE_CLASS,
760 X86_64_SSESF_CLASS,
761 X86_64_SSEDF_CLASS,
762 X86_64_SSEUP_CLASS,
763 X86_64_X87_CLASS,
764 X86_64_X87UP_CLASS,
765 X86_64_COMPLEX_X87_CLASS,
766 X86_64_MEMORY_CLASS
769 #define MAX_CLASSES 8
771 /* Table of constants used by fldpi, fldln2, etc.... */
772 static REAL_VALUE_TYPE ext_80387_constants_table [5];
773 static bool ext_80387_constants_init;
776 static struct machine_function * ix86_init_machine_status (void);
777 static rtx ix86_function_value (const_tree, const_tree, bool);
778 static bool ix86_function_value_regno_p (const unsigned int);
779 static unsigned int ix86_function_arg_boundary (machine_mode,
780 const_tree);
781 static rtx ix86_static_chain (const_tree, bool);
782 static int ix86_function_regparm (const_tree, const_tree);
783 static void ix86_compute_frame_layout (void);
784 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
785 rtx, rtx, int);
786 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
787 static tree ix86_canonical_va_list_type (tree);
788 static void predict_jump (int);
789 static unsigned int split_stack_prologue_scratch_regno (void);
790 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
792 enum ix86_function_specific_strings
794 IX86_FUNCTION_SPECIFIC_ARCH,
795 IX86_FUNCTION_SPECIFIC_TUNE,
796 IX86_FUNCTION_SPECIFIC_MAX
799 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
800 const char *, const char *, enum fpmath_unit,
801 bool);
802 static void ix86_function_specific_save (struct cl_target_option *,
803 struct gcc_options *opts);
804 static void ix86_function_specific_restore (struct gcc_options *opts,
805 struct cl_target_option *);
806 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
807 static void ix86_function_specific_print (FILE *, int,
808 struct cl_target_option *);
809 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
810 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
811 struct gcc_options *,
812 struct gcc_options *,
813 struct gcc_options *);
814 static bool ix86_can_inline_p (tree, tree);
815 static void ix86_set_current_function (tree);
816 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
818 static enum calling_abi ix86_function_abi (const_tree);
821 #ifndef SUBTARGET32_DEFAULT_CPU
822 #define SUBTARGET32_DEFAULT_CPU "i386"
823 #endif
825 /* Whether -mtune= or -march= were specified */
826 static int ix86_tune_defaulted;
827 static int ix86_arch_specified;
829 /* Vectorization library interface and handlers. */
830 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
832 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
833 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
835 /* Processor target table, indexed by processor number */
836 struct ptt
838 const char *const name; /* processor name */
839 const struct processor_costs *cost; /* Processor costs */
841 /* Default alignments. */
842 const char *const align_loop;
843 const char *const align_jump;
844 const char *const align_label;
845 const char *const align_func;
848 /* This table must be in sync with enum processor_type in i386.h. */
849 static const struct ptt processor_target_table[PROCESSOR_max] =
851 /* The "0:0:8" label alignment specified for some processors generates
852 secondary 8-byte alignment only for those label/jump/loop targets
853 which have primary alignment. */
855 {"generic", &generic_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
856 {"i386", &i386_cost, "4", "4", NULL, "4" },
857 {"i486", &i486_cost, "16", "16", "0:0:8", "16"},
858 {"pentium", &pentium_cost, "16:8:8", "16:8:8", "0:0:8", "16"},
859 {"lakemont", &lakemont_cost, "16:8:8", "16:8:8", "0:0:8", "16"},
860 {"pentiumpro", &pentiumpro_cost, "16", "16:11:8", "0:0:8", "16"},
861 {"pentium4", &pentium4_cost, NULL, NULL, NULL, NULL},
862 {"nocona", &nocona_cost, NULL, NULL, NULL, NULL},
863 {"core2", &core_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
864 {"nehalem", &core_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
865 {"sandybridge", &core_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
866 {"haswell", &core_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
867 {"bonnell", &atom_cost, "16", "16:8:8", "0:0:8", "16"},
868 {"silvermont", &slm_cost, "16", "16:8:8", "0:0:8", "16"},
869 {"goldmont", &slm_cost, "16", "16:8:8", "0:0:8", "16"},
870 {"goldmont-plus", &slm_cost, "16", "16:8:8", "0:0:8", "16"},
871 {"tremont", &slm_cost, "16", "16:8:8", "0:0:8", "16"},
872 {"knl", &slm_cost, "16", "16:8:8", "0:0:8", "16"},
873 {"knm", &slm_cost, "16", "16:8:8", "0:0:8", "16"},
874 {"skylake", &skylake_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
875 {"skylake-avx512", &skylake_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
876 {"cannonlake", &skylake_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
877 {"icelake-client", &skylake_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
878 {"icelake-server", &skylake_cost, "16:11:8", "16:11:8", "0:0:8", "16"},
879 {"intel", &intel_cost, "16", "16:8:8", "0:0:8", "16"},
880 {"geode", &geode_cost, NULL, NULL, NULL, NULL},
881 {"k6", &k6_cost, "32:8:8", "32:8:8", "0:0:8", "32"},
882 {"athlon", &athlon_cost, "16:8:8", "16:8:8", "0:0:8", "16"},
883 {"k8", &k8_cost, "16:8:8", "16:8:8", "0:0:8", "16"},
884 {"amdfam10", &amdfam10_cost, "32:25:8", "32:8:8", "0:0:8", "32"},
885 {"bdver1", &bdver1_cost, "16:11:8", "16:8:8", "0:0:8", "11"},
886 {"bdver2", &bdver2_cost, "16:11:8", "16:8:8", "0:0:8", "11"},
887 {"bdver3", &bdver3_cost, "16:11:8", "16:8:8", "0:0:8", "11"},
888 {"bdver4", &bdver4_cost, "16:11:8", "16:8:8", "0:0:8", "11"},
889 {"btver1", &btver1_cost, "16:11:8", "16:8:8", "0:0:8", "11"},
890 {"btver2", &btver2_cost, "16:11:8", "16:8:8", "0:0:8", "11"},
891 {"znver1", &znver1_cost, "16", "16", "0:0:8", "16"}
894 static unsigned int
895 rest_of_handle_insert_vzeroupper (void)
897 int i;
899 /* vzeroupper instructions are inserted immediately after reload to
900 account for possible spills from 256bit or 512bit registers. The pass
901 reuses mode switching infrastructure by re-running mode insertion
902 pass, so disable entities that have already been processed. */
903 for (i = 0; i < MAX_386_ENTITIES; i++)
904 ix86_optimize_mode_switching[i] = 0;
906 ix86_optimize_mode_switching[AVX_U128] = 1;
908 /* Call optimize_mode_switching. */
909 g->get_passes ()->execute_pass_mode_switching ();
910 return 0;
913 /* Return 1 if INSN uses or defines a hard register.
914 Hard register uses in a memory address are ignored.
915 Clobbers and flags definitions are ignored. */
917 static bool
918 has_non_address_hard_reg (rtx_insn *insn)
920 df_ref ref;
921 FOR_EACH_INSN_DEF (ref, insn)
922 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
923 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
924 && DF_REF_REGNO (ref) != FLAGS_REG)
925 return true;
927 FOR_EACH_INSN_USE (ref, insn)
928 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
929 return true;
931 return false;
934 /* Check if comparison INSN may be transformed
935 into vector comparison. Currently we transform
936 zero checks only which look like:
938 (set (reg:CCZ 17 flags)
939 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
940 (subreg:SI (reg:DI x) 0))
941 (const_int 0 [0]))) */
943 static bool
944 convertible_comparison_p (rtx_insn *insn)
946 if (!TARGET_SSE4_1)
947 return false;
949 rtx def_set = single_set (insn);
951 gcc_assert (def_set);
953 rtx src = SET_SRC (def_set);
954 rtx dst = SET_DEST (def_set);
956 gcc_assert (GET_CODE (src) == COMPARE);
958 if (GET_CODE (dst) != REG
959 || REGNO (dst) != FLAGS_REG
960 || GET_MODE (dst) != CCZmode)
961 return false;
963 rtx op1 = XEXP (src, 0);
964 rtx op2 = XEXP (src, 1);
966 if (op2 != CONST0_RTX (GET_MODE (op2)))
967 return false;
969 if (GET_CODE (op1) != IOR)
970 return false;
972 op2 = XEXP (op1, 1);
973 op1 = XEXP (op1, 0);
975 if (!SUBREG_P (op1)
976 || !SUBREG_P (op2)
977 || GET_MODE (op1) != SImode
978 || GET_MODE (op2) != SImode
979 || ((SUBREG_BYTE (op1) != 0
980 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
981 && (SUBREG_BYTE (op2) != 0
982 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
983 return false;
985 op1 = SUBREG_REG (op1);
986 op2 = SUBREG_REG (op2);
988 if (op1 != op2
989 || !REG_P (op1)
990 || GET_MODE (op1) != DImode)
991 return false;
993 return true;
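/* Summing up the checks above: the only comparison accepted is a zero
   test of a DImode register written as the IOR of its two SImode halves;
   the TARGET_SSE4_1 requirement is there because the STV pass then
   (presumably) rewrites it as a single vector zero test of the V2DI
   value instead of the scalar OR.  */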
996 /* The DImode version of scalar_to_vector_candidate_p. */
998 static bool
999 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
1001 rtx def_set = single_set (insn);
1003 if (!def_set)
1004 return false;
1006 if (has_non_address_hard_reg (insn))
1007 return false;
1009 rtx src = SET_SRC (def_set);
1010 rtx dst = SET_DEST (def_set);
1012 if (GET_CODE (src) == COMPARE)
1013 return convertible_comparison_p (insn);
1015 /* We are interested in DImode promotion only. */
1016 if ((GET_MODE (src) != DImode
1017 && !CONST_INT_P (src))
1018 || GET_MODE (dst) != DImode)
1019 return false;
1021 if (!REG_P (dst) && !MEM_P (dst))
1022 return false;
1024 switch (GET_CODE (src))
1026 case ASHIFTRT:
1027 if (!TARGET_AVX512VL)
1028 return false;
1029 /* FALLTHRU */
1031 case ASHIFT:
1032 case LSHIFTRT:
1033 if (!REG_P (XEXP (src, 1))
1034 && (!SUBREG_P (XEXP (src, 1))
1035 || SUBREG_BYTE (XEXP (src, 1)) != 0
1036 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1037 && (!CONST_INT_P (XEXP (src, 1))
1038 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1039 return false;
1041 if (GET_MODE (XEXP (src, 1)) != QImode
1042 && !CONST_INT_P (XEXP (src, 1)))
1043 return false;
1044 break;
1046 case PLUS:
1047 case MINUS:
1048 case IOR:
1049 case XOR:
1050 case AND:
1051 if (!REG_P (XEXP (src, 1))
1052 && !MEM_P (XEXP (src, 1))
1053 && !CONST_INT_P (XEXP (src, 1)))
1054 return false;
1056 if (GET_MODE (XEXP (src, 1)) != DImode
1057 && !CONST_INT_P (XEXP (src, 1)))
1058 return false;
1059 break;
1061 case NEG:
1062 case NOT:
1063 break;
1065 case REG:
1066 return true;
1068 case MEM:
1069 case CONST_INT:
1070 return REG_P (dst);
1072 default:
1073 return false;
1076 if (!REG_P (XEXP (src, 0))
1077 && !MEM_P (XEXP (src, 0))
1078 && !CONST_INT_P (XEXP (src, 0))
1079 /* Check for andnot case. */
1080 && (GET_CODE (src) != AND
1081 || GET_CODE (XEXP (src, 0)) != NOT
1082 || !REG_P (XEXP (XEXP (src, 0), 0))))
1083 return false;
1085 if (GET_MODE (XEXP (src, 0)) != DImode
1086 && !CONST_INT_P (XEXP (src, 0)))
1087 return false;
1089 return true;
1092 /* The TImode version of scalar_to_vector_candidate_p. */
1094 static bool
1095 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1097 rtx def_set = single_set (insn);
1099 if (!def_set)
1100 return false;
1102 if (has_non_address_hard_reg (insn))
1103 return false;
1105 rtx src = SET_SRC (def_set);
1106 rtx dst = SET_DEST (def_set);
1108 /* Only TImode load and store are allowed. */
1109 if (GET_MODE (dst) != TImode)
1110 return false;
1112 if (MEM_P (dst))
 1114 /* Check for store. Memory must be aligned, or the unaligned store
 1115 must be optimal. Only support store from register, standard SSE
1116 constant or CONST_WIDE_INT generated from piecewise store.
1118 ??? Verify performance impact before enabling CONST_INT for
1119 __int128 store. */
1120 if (misaligned_operand (dst, TImode)
1121 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1122 return false;
1124 switch (GET_CODE (src))
1126 default:
1127 return false;
1129 case REG:
1130 case CONST_WIDE_INT:
1131 return true;
1133 case CONST_INT:
1134 return standard_sse_constant_p (src, TImode);
1137 else if (MEM_P (src))
 1139 /* Check for load. Memory must be aligned, or the unaligned load
 1140 must be optimal. */
1141 return (REG_P (dst)
1142 && (!misaligned_operand (src, TImode)
1143 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1146 return false;
 1149 /* Return 1 if INSN may be converted into a vector
 1150 instruction. */
1152 static bool
1153 scalar_to_vector_candidate_p (rtx_insn *insn)
1155 if (TARGET_64BIT)
1156 return timode_scalar_to_vector_candidate_p (insn);
1157 else
1158 return dimode_scalar_to_vector_candidate_p (insn);
1161 /* The DImode version of remove_non_convertible_regs. */
1163 static void
1164 dimode_remove_non_convertible_regs (bitmap candidates)
1166 bitmap_iterator bi;
1167 unsigned id;
1168 bitmap regs = BITMAP_ALLOC (NULL);
1170 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1172 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1173 rtx reg = SET_DEST (def_set);
1175 if (!REG_P (reg)
1176 || bitmap_bit_p (regs, REGNO (reg))
1177 || HARD_REGISTER_P (reg))
1178 continue;
1180 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1181 def;
1182 def = DF_REF_NEXT_REG (def))
1184 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1186 if (dump_file)
1187 fprintf (dump_file,
1188 "r%d has non convertible definition in insn %d\n",
1189 REGNO (reg), DF_REF_INSN_UID (def));
1191 bitmap_set_bit (regs, REGNO (reg));
1192 break;
1197 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1199 for (df_ref def = DF_REG_DEF_CHAIN (id);
1200 def;
1201 def = DF_REF_NEXT_REG (def))
1202 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1204 if (dump_file)
1205 fprintf (dump_file, "Removing insn %d from candidates list\n",
1206 DF_REF_INSN_UID (def));
1208 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1212 BITMAP_FREE (regs);
1215 /* For a register REGNO, scan instructions for its defs and uses.
1216 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1218 static void
1219 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1220 unsigned int regno)
1222 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1223 def;
1224 def = DF_REF_NEXT_REG (def))
1226 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1228 if (dump_file)
1229 fprintf (dump_file,
1230 "r%d has non convertible def in insn %d\n",
1231 regno, DF_REF_INSN_UID (def));
1233 bitmap_set_bit (regs, regno);
1234 break;
1238 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1239 ref;
1240 ref = DF_REF_NEXT_REG (ref))
1242 /* Debug instructions are skipped. */
1243 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1244 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1246 if (dump_file)
1247 fprintf (dump_file,
1248 "r%d has non convertible use in insn %d\n",
1249 regno, DF_REF_INSN_UID (ref));
1251 bitmap_set_bit (regs, regno);
1252 break;
1257 /* The TImode version of remove_non_convertible_regs. */
1259 static void
1260 timode_remove_non_convertible_regs (bitmap candidates)
1262 bitmap_iterator bi;
1263 unsigned id;
1264 bitmap regs = BITMAP_ALLOC (NULL);
1266 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1268 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1269 rtx dest = SET_DEST (def_set);
1270 rtx src = SET_SRC (def_set);
1272 if ((!REG_P (dest)
1273 || bitmap_bit_p (regs, REGNO (dest))
1274 || HARD_REGISTER_P (dest))
1275 && (!REG_P (src)
1276 || bitmap_bit_p (regs, REGNO (src))
1277 || HARD_REGISTER_P (src)))
1278 continue;
1280 if (REG_P (dest))
1281 timode_check_non_convertible_regs (candidates, regs,
1282 REGNO (dest));
1284 if (REG_P (src))
1285 timode_check_non_convertible_regs (candidates, regs,
1286 REGNO (src));
1289 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1291 for (df_ref def = DF_REG_DEF_CHAIN (id);
1292 def;
1293 def = DF_REF_NEXT_REG (def))
1294 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1296 if (dump_file)
1297 fprintf (dump_file, "Removing insn %d from candidates list\n",
1298 DF_REF_INSN_UID (def));
1300 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1303 for (df_ref ref = DF_REG_USE_CHAIN (id);
1304 ref;
1305 ref = DF_REF_NEXT_REG (ref))
1306 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1308 if (dump_file)
1309 fprintf (dump_file, "Removing insn %d from candidates list\n",
1310 DF_REF_INSN_UID (ref));
1312 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1316 BITMAP_FREE (regs);
 1319 /* For a given bitmap of insn UIDs, scan all instructions and
 1320 remove an insn from CANDIDATES if it has both convertible
 1321 and non-convertible definitions.
1323 All insns in a bitmap are conversion candidates according to
1324 scalar_to_vector_candidate_p. Currently it implies all insns
1325 are single_set. */
1327 static void
1328 remove_non_convertible_regs (bitmap candidates)
1330 if (TARGET_64BIT)
1331 timode_remove_non_convertible_regs (candidates);
1332 else
1333 dimode_remove_non_convertible_regs (candidates);
1336 class scalar_chain
1338 public:
1339 scalar_chain ();
1340 virtual ~scalar_chain ();
1342 static unsigned max_id;
1344 /* ID of a chain. */
1345 unsigned int chain_id;
1346 /* A queue of instructions to be included into a chain. */
1347 bitmap queue;
1348 /* Instructions included into a chain. */
1349 bitmap insns;
1350 /* All registers defined by a chain. */
1351 bitmap defs;
 1352 /* Registers used in both vector and scalar modes. */
1353 bitmap defs_conv;
1355 void build (bitmap candidates, unsigned insn_uid);
1356 virtual int compute_convert_gain () = 0;
1357 int convert ();
1359 protected:
1360 void add_to_queue (unsigned insn_uid);
1361 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1363 private:
1364 void add_insn (bitmap candidates, unsigned insn_uid);
1365 void analyze_register_chain (bitmap candidates, df_ref ref);
1366 virtual void mark_dual_mode_def (df_ref def) = 0;
1367 virtual void convert_insn (rtx_insn *insn) = 0;
1368 virtual void convert_registers () = 0;
1371 class dimode_scalar_chain : public scalar_chain
1373 public:
1374 int compute_convert_gain ();
1375 private:
1376 void mark_dual_mode_def (df_ref def);
1377 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1378 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1379 void convert_insn (rtx_insn *insn);
1380 void convert_op (rtx *op, rtx_insn *insn);
1381 void convert_reg (unsigned regno);
1382 void make_vector_copies (unsigned regno);
1383 void convert_registers ();
1384 int vector_const_cost (rtx exp);
1387 class timode_scalar_chain : public scalar_chain
1389 public:
 1390 /* Conversion from TImode to V1TImode is always faster. */
1391 int compute_convert_gain () { return 1; }
1393 private:
1394 void mark_dual_mode_def (df_ref def);
1395 void fix_debug_reg_uses (rtx reg);
1396 void convert_insn (rtx_insn *insn);
 1397 /* We don't convert registers to a different size. */
1398 void convert_registers () {}
1401 unsigned scalar_chain::max_id = 0;
1403 /* Initialize new chain. */
1405 scalar_chain::scalar_chain ()
1407 chain_id = ++max_id;
1409 if (dump_file)
1410 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1412 bitmap_obstack_initialize (NULL);
1413 insns = BITMAP_ALLOC (NULL);
1414 defs = BITMAP_ALLOC (NULL);
1415 defs_conv = BITMAP_ALLOC (NULL);
1416 queue = NULL;
1419 /* Free chain's data. */
1421 scalar_chain::~scalar_chain ()
1423 BITMAP_FREE (insns);
1424 BITMAP_FREE (defs);
1425 BITMAP_FREE (defs_conv);
1426 bitmap_obstack_release (NULL);
1429 /* Add instruction into chains' queue. */
1431 void
1432 scalar_chain::add_to_queue (unsigned insn_uid)
1434 if (bitmap_bit_p (insns, insn_uid)
1435 || bitmap_bit_p (queue, insn_uid))
1436 return;
1438 if (dump_file)
1439 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1440 insn_uid, chain_id);
1441 bitmap_set_bit (queue, insn_uid);
1444 /* For DImode conversion, mark register defined by DEF as requiring
1445 conversion. */
1447 void
1448 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1450 gcc_assert (DF_REF_REG_DEF_P (def));
1452 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1453 return;
1455 if (dump_file)
1456 fprintf (dump_file,
1457 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1458 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1460 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1463 /* For TImode conversion, it is unused. */
1465 void
1466 timode_scalar_chain::mark_dual_mode_def (df_ref)
1468 gcc_unreachable ();
1471 /* Check REF's chain to add new insns into a queue
1472 and find registers requiring conversion. */
1474 void
1475 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1477 df_link *chain;
1479 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1480 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1481 add_to_queue (DF_REF_INSN_UID (ref));
1483 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1485 unsigned uid = DF_REF_INSN_UID (chain->ref);
1487 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1488 continue;
1490 if (!DF_REF_REG_MEM_P (chain->ref))
1492 if (bitmap_bit_p (insns, uid))
1493 continue;
1495 if (bitmap_bit_p (candidates, uid))
1497 add_to_queue (uid);
1498 continue;
1502 if (DF_REF_REG_DEF_P (chain->ref))
1504 if (dump_file)
1505 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1506 DF_REF_REGNO (chain->ref), uid);
1507 mark_dual_mode_def (chain->ref);
1509 else
1511 if (dump_file)
1512 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1513 DF_REF_REGNO (chain->ref), uid);
1514 mark_dual_mode_def (ref);
1519 /* Add instruction into a chain. */
1521 void
1522 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1524 if (bitmap_bit_p (insns, insn_uid))
1525 return;
1527 if (dump_file)
1528 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1530 bitmap_set_bit (insns, insn_uid);
1532 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1533 rtx def_set = single_set (insn);
1534 if (def_set && REG_P (SET_DEST (def_set))
1535 && !HARD_REGISTER_P (SET_DEST (def_set)))
1536 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1538 df_ref ref;
1539 df_ref def;
1540 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1541 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1542 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1543 def;
1544 def = DF_REF_NEXT_REG (def))
1545 analyze_register_chain (candidates, def);
1546 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1547 if (!DF_REF_REG_MEM_P (ref))
1548 analyze_register_chain (candidates, ref);
1551 /* Build new chain starting from insn INSN_UID recursively
1552 adding all dependent uses and definitions. */
1554 void
1555 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1557 queue = BITMAP_ALLOC (NULL);
1558 bitmap_set_bit (queue, insn_uid);
1560 if (dump_file)
1561 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1563 while (!bitmap_empty_p (queue))
1565 insn_uid = bitmap_first_set_bit (queue);
1566 bitmap_clear_bit (queue, insn_uid);
1567 bitmap_clear_bit (candidates, insn_uid);
1568 add_insn (candidates, insn_uid);
1571 if (dump_file)
1573 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1574 fprintf (dump_file, " insns: ");
1575 dump_bitmap (dump_file, insns);
1576 if (!bitmap_empty_p (defs_conv))
1578 bitmap_iterator bi;
1579 unsigned id;
1580 const char *comma = "";
1581 fprintf (dump_file, " defs to convert: ");
1582 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1584 fprintf (dump_file, "%sr%d", comma, id);
1585 comma = ", ";
1587 fprintf (dump_file, "\n");
1591 BITMAP_FREE (queue);
 1594 /* Return a cost of building a vector constant
1595 instead of using a scalar one. */
1598 dimode_scalar_chain::vector_const_cost (rtx exp)
1600 gcc_assert (CONST_INT_P (exp));
1602 if (standard_sse_constant_p (exp, V2DImode))
1603 return COSTS_N_INSNS (1);
1604 return ix86_cost->sse_load[1];
1607 /* Compute a gain for chain conversion. */
1610 dimode_scalar_chain::compute_convert_gain ()
1612 bitmap_iterator bi;
1613 unsigned insn_uid;
1614 int gain = 0;
1615 int cost = 0;
1617 if (dump_file)
1618 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1620 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1622 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1623 rtx def_set = single_set (insn);
1624 rtx src = SET_SRC (def_set);
1625 rtx dst = SET_DEST (def_set);
1627 if (REG_P (src) && REG_P (dst))
1628 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1629 else if (REG_P (src) && MEM_P (dst))
1630 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1631 else if (MEM_P (src) && REG_P (dst))
1632 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1633 else if (GET_CODE (src) == ASHIFT
1634 || GET_CODE (src) == ASHIFTRT
1635 || GET_CODE (src) == LSHIFTRT)
1637 if (CONST_INT_P (XEXP (src, 0)))
1638 gain -= vector_const_cost (XEXP (src, 0));
1639 if (CONST_INT_P (XEXP (src, 1)))
1641 gain += ix86_cost->shift_const;
1642 if (INTVAL (XEXP (src, 1)) >= 32)
1643 gain -= COSTS_N_INSNS (1);
1645 else
1646 /* Additional gain for omitting two CMOVs. */
1647 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1649 else if (GET_CODE (src) == PLUS
1650 || GET_CODE (src) == MINUS
1651 || GET_CODE (src) == IOR
1652 || GET_CODE (src) == XOR
1653 || GET_CODE (src) == AND)
1655 gain += ix86_cost->add;
1656 /* Additional gain for andnot for targets without BMI. */
1657 if (GET_CODE (XEXP (src, 0)) == NOT
1658 && !TARGET_BMI)
1659 gain += 2 * ix86_cost->add;
1661 if (CONST_INT_P (XEXP (src, 0)))
1662 gain -= vector_const_cost (XEXP (src, 0));
1663 if (CONST_INT_P (XEXP (src, 1)))
1664 gain -= vector_const_cost (XEXP (src, 1));
1666 else if (GET_CODE (src) == NEG
1667 || GET_CODE (src) == NOT)
1668 gain += ix86_cost->add - COSTS_N_INSNS (1);
1669 else if (GET_CODE (src) == COMPARE)
1671 /* Assume comparison cost is the same. */
1673 else if (CONST_INT_P (src))
1675 if (REG_P (dst))
1676 gain += COSTS_N_INSNS (2);
1677 else if (MEM_P (dst))
1678 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1679 gain -= vector_const_cost (src);
1681 else
1682 gcc_unreachable ();
1685 if (dump_file)
1686 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1688 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1689 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1691 if (dump_file)
1692 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1694 gain -= cost;
1696 if (dump_file)
1697 fprintf (dump_file, " Total gain: %d\n", gain);
1699 return gain;
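/* Worked example of the accounting above: for a DImode reg-to-reg move
   the scalar code needs two SImode moves, counted as COSTS_N_INSNS (2),
   while the vector form is one xmm move, so that insn adds
   COSTS_N_INSNS (2) - ix86_cost->xmm_move to GAIN; the total is then
   reduced by mmxsse_to_integer for every definition that must stay
   available in both scalar and vector form.  */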
1702 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1705 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1707 if (x == reg)
1708 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1710 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1711 int i, j;
1712 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1714 if (fmt[i] == 'e')
1715 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1716 else if (fmt[i] == 'E')
1717 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1718 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1719 reg, new_reg);
1722 return x;
1725 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1727 void
1728 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1729 rtx reg, rtx new_reg)
1731 replace_with_subreg (single_set (insn), reg, new_reg);
1734 /* Insert generated conversion instruction sequence INSNS
1735 after instruction AFTER. New BB may be required in case
1736 instruction has EH region attached. */
1738 void
1739 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1741 if (!control_flow_insn_p (after))
1743 emit_insn_after (insns, after);
1744 return;
1747 basic_block bb = BLOCK_FOR_INSN (after);
1748 edge e = find_fallthru_edge (bb->succs);
1749 gcc_assert (e);
1751 basic_block new_bb = split_edge (e);
1752 emit_insn_after (insns, BB_HEAD (new_bb));
1755 /* Make vector copies for all register REGNO definitions
1756 and replace its uses in a chain. */
1758 void
1759 dimode_scalar_chain::make_vector_copies (unsigned regno)
1761 rtx reg = regno_reg_rtx[regno];
1762 rtx vreg = gen_reg_rtx (DImode);
1763 bool count_reg = false;
1764 df_ref ref;
1766 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1767 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1769 df_ref use;
1771 /* Detect the count register of a shift instruction. */
1772 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1773 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1775 rtx_insn *insn = DF_REF_INSN (use);
1776 rtx def_set = single_set (insn);
1778 gcc_assert (def_set);
1780 rtx src = SET_SRC (def_set);
1782 if ((GET_CODE (src) == ASHIFT
1783 || GET_CODE (src) == ASHIFTRT
1784 || GET_CODE (src) == LSHIFTRT)
1785 && !CONST_INT_P (XEXP (src, 1))
1786 && reg_or_subregno (XEXP (src, 1)) == regno)
1787 count_reg = true;
1790 start_sequence ();
1791 if (count_reg)
1793 rtx qreg = gen_lowpart (QImode, reg);
1794 rtx tmp = gen_reg_rtx (SImode);
1796 if (TARGET_ZERO_EXTEND_WITH_AND
1797 && optimize_function_for_speed_p (cfun))
1799 emit_move_insn (tmp, const0_rtx);
1800 emit_insn (gen_movstrictqi
1801 (gen_lowpart (QImode, tmp), qreg));
1803 else
1804 emit_insn (gen_rtx_SET
1805 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1807 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1809 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1810 emit_move_insn (slot, tmp);
1811 tmp = copy_rtx (slot);
1814 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1816 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1818 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1819 emit_move_insn (adjust_address (tmp, SImode, 0),
1820 gen_rtx_SUBREG (SImode, reg, 0));
1821 emit_move_insn (adjust_address (tmp, SImode, 4),
1822 gen_rtx_SUBREG (SImode, reg, 4));
1823 emit_move_insn (vreg, tmp);
1825 else if (TARGET_SSE4_1)
1827 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1828 CONST0_RTX (V4SImode),
1829 gen_rtx_SUBREG (SImode, reg, 0)));
1830 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1831 gen_rtx_SUBREG (V4SImode, vreg, 0),
1832 gen_rtx_SUBREG (SImode, reg, 4),
1833 GEN_INT (2)));
1835 else
1837 rtx tmp = gen_reg_rtx (DImode);
1838 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1839 CONST0_RTX (V4SImode),
1840 gen_rtx_SUBREG (SImode, reg, 0)));
1841 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1842 CONST0_RTX (V4SImode),
1843 gen_rtx_SUBREG (SImode, reg, 4)));
1844 emit_insn (gen_vec_interleave_lowv4si
1845 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1846 gen_rtx_SUBREG (V4SImode, vreg, 0),
1847 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1849 rtx_insn *seq = get_insns ();
1850 end_sequence ();
1851 rtx_insn *insn = DF_REF_INSN (ref);
1852 emit_conversion_insns (seq, insn);
1854 if (dump_file)
1855 fprintf (dump_file,
1856 " Copied r%d to a vector register r%d for insn %d\n",
1857 regno, REGNO (vreg), INSN_UID (insn));
1860 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1861 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1863 rtx_insn *insn = DF_REF_INSN (ref);
1864 if (count_reg)
1866 rtx def_set = single_set (insn);
1867 gcc_assert (def_set);
1869 rtx src = SET_SRC (def_set);
1871 if ((GET_CODE (src) == ASHIFT
1872 || GET_CODE (src) == ASHIFTRT
1873 || GET_CODE (src) == LSHIFTRT)
1874 && !CONST_INT_P (XEXP (src, 1))
1875 && reg_or_subregno (XEXP (src, 1)) == regno)
1876 XEXP (src, 1) = vreg;
1878 else
1879 replace_with_subreg_in_insn (insn, reg, vreg);
1881 if (dump_file)
1882 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1883 regno, REGNO (vreg), INSN_UID (insn));
1887 /* Convert all definitions of register REGNO
1888 and fix its uses. Scalar copies may be created
1889 in case the register is used in a non-convertible insn. */
1891 void
1892 dimode_scalar_chain::convert_reg (unsigned regno)
1894 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1895 rtx reg = regno_reg_rtx[regno];
1896 rtx scopy = NULL_RTX;
1897 df_ref ref;
1898 bitmap conv;
1900 conv = BITMAP_ALLOC (NULL);
1901 bitmap_copy (conv, insns);
1903 if (scalar_copy)
1904 scopy = gen_reg_rtx (DImode);
1906 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1908 rtx_insn *insn = DF_REF_INSN (ref);
1909 rtx def_set = single_set (insn);
1910 rtx src = SET_SRC (def_set);
1911 rtx reg = DF_REF_REG (ref);
1913 if (!MEM_P (src))
1915 replace_with_subreg_in_insn (insn, reg, reg);
1916 bitmap_clear_bit (conv, INSN_UID (insn));
1919 if (scalar_copy)
1921 start_sequence ();
1922 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1924 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1925 emit_move_insn (tmp, reg);
1926 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1927 adjust_address (tmp, SImode, 0));
1928 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1929 adjust_address (tmp, SImode, 4));
1931 else if (TARGET_SSE4_1)
1933 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1934 emit_insn
1935 (gen_rtx_SET
1936 (gen_rtx_SUBREG (SImode, scopy, 0),
1937 gen_rtx_VEC_SELECT (SImode,
1938 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1940 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1941 emit_insn
1942 (gen_rtx_SET
1943 (gen_rtx_SUBREG (SImode, scopy, 4),
1944 gen_rtx_VEC_SELECT (SImode,
1945 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1947 else
1949 rtx vcopy = gen_reg_rtx (V2DImode);
1950 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1951 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1952 gen_rtx_SUBREG (SImode, vcopy, 0));
1953 emit_move_insn (vcopy,
1954 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1955 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1956 gen_rtx_SUBREG (SImode, vcopy, 0));
1958 rtx_insn *seq = get_insns ();
1959 end_sequence ();
1960 emit_conversion_insns (seq, insn);
1962 if (dump_file)
1963 fprintf (dump_file,
1964 " Copied r%d to a scalar register r%d for insn %d\n",
1965 regno, REGNO (scopy), INSN_UID (insn));
1969 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1970 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1972 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1974 rtx_insn *insn = DF_REF_INSN (ref);
1976 rtx def_set = single_set (insn);
1977 gcc_assert (def_set);
1979 rtx src = SET_SRC (def_set);
1980 rtx dst = SET_DEST (def_set);
1982 if ((GET_CODE (src) == ASHIFT
1983 || GET_CODE (src) == ASHIFTRT
1984 || GET_CODE (src) == LSHIFTRT)
1985 && !CONST_INT_P (XEXP (src, 1))
1986 && reg_or_subregno (XEXP (src, 1)) == regno)
1988 rtx tmp2 = gen_reg_rtx (V2DImode);
1990 start_sequence ();
1992 if (TARGET_SSE4_1)
1993 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1994 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1995 else
1997 rtx vec_cst
1998 = gen_rtx_CONST_VECTOR (V2DImode,
1999 gen_rtvec (2, GEN_INT (0xff),
2000 const0_rtx));
2001 vec_cst
2002 = validize_mem (force_const_mem (V2DImode, vec_cst));
2004 emit_insn (gen_rtx_SET
2005 (tmp2,
2006 gen_rtx_AND (V2DImode,
2007 gen_rtx_SUBREG (V2DImode, reg, 0),
2008 vec_cst)));
2010 rtx_insn *seq = get_insns ();
2011 end_sequence ();
2013 emit_insn_before (seq, insn);
2015 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
2017 else if (!MEM_P (dst) || !REG_P (src))
2018 replace_with_subreg_in_insn (insn, reg, reg);
2020 bitmap_clear_bit (conv, INSN_UID (insn));
2023 /* Skip debug insns and uninitialized uses. */
2024 else if (DF_REF_CHAIN (ref)
2025 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2027 gcc_assert (scopy);
2028 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2029 df_insn_rescan (DF_REF_INSN (ref));
2032 BITMAP_FREE (conv);
2035 /* Convert operand OP in INSN. We should handle
2036 memory operands and uninitialized registers.
2037 All other register uses are converted during
2038 register conversion. */
2040 void
2041 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2043 *op = copy_rtx_if_shared (*op);
2045 if (GET_CODE (*op) == NOT)
2047 convert_op (&XEXP (*op, 0), insn);
2048 PUT_MODE (*op, V2DImode);
2050 else if (MEM_P (*op))
2052 rtx tmp = gen_reg_rtx (DImode);
2054 emit_insn_before (gen_move_insn (tmp, *op), insn);
2055 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2057 if (dump_file)
2058 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2059 INSN_UID (insn), REGNO (tmp));
2061 else if (REG_P (*op))
2063 /* We may not have converted the register use in case
2064 this register has no definition. Otherwise it
2065 should have been converted in convert_reg. */
2066 df_ref ref;
2067 FOR_EACH_INSN_USE (ref, insn)
2068 if (DF_REF_REGNO (ref) == REGNO (*op))
2070 gcc_assert (!DF_REF_CHAIN (ref));
2071 break;
2073 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2075 else if (CONST_INT_P (*op))
2077 rtx vec_cst;
2078 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2080 /* Prefer an all-ones vector in case of -1. */
2081 if (constm1_operand (*op, GET_MODE (*op)))
2082 vec_cst = CONSTM1_RTX (V2DImode);
2083 else
2084 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2085 gen_rtvec (2, *op, const0_rtx));
2087 if (!standard_sse_constant_p (vec_cst, V2DImode))
2089 start_sequence ();
2090 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2091 rtx_insn *seq = get_insns ();
2092 end_sequence ();
2093 emit_insn_before (seq, insn);
2096 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2097 *op = tmp;
2099 else
2101 gcc_assert (SUBREG_P (*op));
2102 gcc_assert (GET_MODE (*op) == V2DImode);
2106 /* Convert INSN to vector mode. */
2108 void
2109 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2111 rtx def_set = single_set (insn);
2112 rtx src = SET_SRC (def_set);
2113 rtx dst = SET_DEST (def_set);
2114 rtx subreg;
2116 if (MEM_P (dst) && !REG_P (src))
2118 /* The converted instruction cannot store its result directly
2119 to memory, so a temporary register is required. */
2120 rtx tmp = gen_reg_rtx (DImode);
2121 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2122 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2125 switch (GET_CODE (src))
2127 case ASHIFT:
2128 case ASHIFTRT:
2129 case LSHIFTRT:
2130 convert_op (&XEXP (src, 0), insn);
2131 PUT_MODE (src, V2DImode);
2132 break;
2134 case PLUS:
2135 case MINUS:
2136 case IOR:
2137 case XOR:
2138 case AND:
2139 convert_op (&XEXP (src, 0), insn);
2140 convert_op (&XEXP (src, 1), insn);
2141 PUT_MODE (src, V2DImode);
2142 break;
2144 case NEG:
2145 src = XEXP (src, 0);
2146 convert_op (&src, insn);
2147 subreg = gen_reg_rtx (V2DImode);
2148 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2149 src = gen_rtx_MINUS (V2DImode, subreg, src);
2150 break;
2152 case NOT:
2153 src = XEXP (src, 0);
2154 convert_op (&src, insn);
2155 subreg = gen_reg_rtx (V2DImode);
2156 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2157 src = gen_rtx_XOR (V2DImode, src, subreg);
2158 break;
2160 case MEM:
2161 if (!REG_P (dst))
2162 convert_op (&src, insn);
2163 break;
2165 case REG:
2166 if (!MEM_P (dst))
2167 convert_op (&src, insn);
2168 break;
2170 case SUBREG:
2171 gcc_assert (GET_MODE (src) == V2DImode);
2172 break;
2174 case COMPARE:
2175 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2177 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2178 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2180 if (REG_P (src))
2181 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2182 else
2183 subreg = copy_rtx_if_shared (src);
2184 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2185 copy_rtx_if_shared (subreg),
2186 copy_rtx_if_shared (subreg)),
2187 insn);
2188 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2189 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2190 copy_rtx_if_shared (src)),
2191 UNSPEC_PTEST);
2192 break;
2194 case CONST_INT:
2195 convert_op (&src, insn);
2196 break;
2198 default:
2199 gcc_unreachable ();
2202 SET_SRC (def_set) = src;
2203 SET_DEST (def_set) = dst;
2205 /* Drop possible dead definitions. */
2206 PATTERN (insn) = def_set;
2208 INSN_CODE (insn) = -1;
2209 recog_memoized (insn);
2210 df_insn_rescan (insn);
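/* A minimal illustrative sketch (added example; register numbers are
   hypothetical): for a 64-bit AND that became part of a chain in 32-bit
   code, such as

     (set (reg:DI 90) (and:DI (reg:DI 91) (reg:DI 92)))

   the chain conversion (convert_reg together with convert_insn above)
   rewrites it into roughly

     (set (subreg:V2DI (reg:DI 90) 0)
          (and:V2DI (subreg:V2DI (reg:DI 91) 0)
                    (subreg:V2DI (reg:DI 92) 0)))

   which then matches an SSE "pand"-style pattern at recog time.  */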
2213 /* Fix uses of converted REG in debug insns. */
2215 void
2216 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2218 if (!flag_var_tracking)
2219 return;
2221 df_ref ref, next;
2222 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2224 rtx_insn *insn = DF_REF_INSN (ref);
2225 /* Make sure the next ref is for a different instruction,
2226 so that we're not affected by the rescan. */
2227 next = DF_REF_NEXT_REG (ref);
2228 while (next && DF_REF_INSN (next) == insn)
2229 next = DF_REF_NEXT_REG (next);
2231 if (DEBUG_INSN_P (insn))
2233 /* It may be a debug insn with a TImode variable in
2234 a register. */
2235 bool changed = false;
2236 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2238 rtx *loc = DF_REF_LOC (ref);
2239 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2241 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2242 changed = true;
2245 if (changed)
2246 df_insn_rescan (insn);
2251 /* Convert INSN from TImode to V1TImode. */
2253 void
2254 timode_scalar_chain::convert_insn (rtx_insn *insn)
2256 rtx def_set = single_set (insn);
2257 rtx src = SET_SRC (def_set);
2258 rtx dst = SET_DEST (def_set);
2260 switch (GET_CODE (dst))
2262 case REG:
2264 rtx tmp = find_reg_equal_equiv_note (insn);
2265 if (tmp)
2266 PUT_MODE (XEXP (tmp, 0), V1TImode);
2267 PUT_MODE (dst, V1TImode);
2268 fix_debug_reg_uses (dst);
2270 break;
2271 case MEM:
2272 PUT_MODE (dst, V1TImode);
2273 break;
2275 default:
2276 gcc_unreachable ();
2279 switch (GET_CODE (src))
2281 case REG:
2282 PUT_MODE (src, V1TImode);
2283 /* Call fix_debug_reg_uses only if SRC is never defined. */
2284 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2285 fix_debug_reg_uses (src);
2286 break;
2288 case MEM:
2289 PUT_MODE (src, V1TImode);
2290 break;
2292 case CONST_WIDE_INT:
2293 if (NONDEBUG_INSN_P (insn))
2295 /* Since there are no instructions to store a 128-bit constant,
2296 a temporary register is required. */
2297 rtx tmp = gen_reg_rtx (V1TImode);
2298 start_sequence ();
2299 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2300 src = validize_mem (force_const_mem (V1TImode, src));
2301 rtx_insn *seq = get_insns ();
2302 end_sequence ();
2303 if (seq)
2304 emit_insn_before (seq, insn);
2305 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2306 dst = tmp;
2308 break;
2310 case CONST_INT:
2311 switch (standard_sse_constant_p (src, TImode))
2313 case 1:
2314 src = CONST0_RTX (GET_MODE (dst));
2315 break;
2316 case 2:
2317 src = CONSTM1_RTX (GET_MODE (dst));
2318 break;
2319 default:
2320 gcc_unreachable ();
2322 if (NONDEBUG_INSN_P (insn))
2324 rtx tmp = gen_reg_rtx (V1TImode);
2325 /* Since there are no instructions to store a standard SSE
2326 constant, a temporary register is required. */
2327 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2328 dst = tmp;
2330 break;
2332 default:
2333 gcc_unreachable ();
2336 SET_SRC (def_set) = src;
2337 SET_DEST (def_set) = dst;
2339 /* Drop possible dead definitions. */
2340 PATTERN (insn) = def_set;
2342 INSN_CODE (insn) = -1;
2343 recog_memoized (insn);
2344 df_insn_rescan (insn);
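/* Illustrative sketch (added example; register numbers are hypothetical):
   a 128-bit move that was selected for conversion, e.g.

     (set (mem:TI (reg:DI 101)) (reg:TI 100))

   is simply re-moded in place to

     (set (mem:V1TI (reg:DI 101)) (reg:V1TI 100))

   so that it can be implemented with SSE loads/stores instead of two
   64-bit integer moves.  */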
2347 void
2348 dimode_scalar_chain::convert_registers ()
2350 bitmap_iterator bi;
2351 unsigned id;
2353 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2354 convert_reg (id);
2356 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2357 make_vector_copies (id);
2360 /* Convert the whole chain, creating the required register
2361 conversions and copies. */
2363 int
2364 scalar_chain::convert ()
2366 bitmap_iterator bi;
2367 unsigned id;
2368 int converted_insns = 0;
2370 if (!dbg_cnt (stv_conversion))
2371 return 0;
2373 if (dump_file)
2374 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2376 convert_registers ();
2378 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2380 convert_insn (DF_INSN_UID_GET (id)->insn);
2381 converted_insns++;
2384 return converted_insns;
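/* Hedged usage example for the conversion machinery above (added; not
   part of the original source): on a 32-bit target compiled with
   -O2 -msse2 -mstv, the 64-bit bitwise operation below is a typical
   candidate chain; instead of two 32-bit "and" instructions it can be
   implemented with a single vector "pand" after conversion.

     unsigned long long
     stv_example (unsigned long long a, unsigned long long b)
     {
       return a & b;
     }
*/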
2387 /* Main STV pass function. Find and convert scalar
2388 instructions into vector mode when profitable. */
2390 static unsigned int
2391 convert_scalars_to_vector ()
2393 basic_block bb;
2394 bitmap candidates;
2395 int converted_insns = 0;
2397 bitmap_obstack_initialize (NULL);
2398 candidates = BITMAP_ALLOC (NULL);
2400 calculate_dominance_info (CDI_DOMINATORS);
2401 df_set_flags (DF_DEFER_INSN_RESCAN);
2402 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2403 df_md_add_problem ();
2404 df_analyze ();
2406 /* Find all instructions we want to convert into vector mode. */
2407 if (dump_file)
2408 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2410 FOR_EACH_BB_FN (bb, cfun)
2412 rtx_insn *insn;
2413 FOR_BB_INSNS (bb, insn)
2414 if (scalar_to_vector_candidate_p (insn))
2416 if (dump_file)
2417 fprintf (dump_file, " insn %d is marked as a candidate\n",
2418 INSN_UID (insn));
2420 bitmap_set_bit (candidates, INSN_UID (insn));
2424 remove_non_convertible_regs (candidates);
2426 if (bitmap_empty_p (candidates))
2427 if (dump_file)
2428 fprintf (dump_file, "There are no candidates for optimization.\n");
2430 while (!bitmap_empty_p (candidates))
2432 unsigned uid = bitmap_first_set_bit (candidates);
2433 scalar_chain *chain;
2435 if (TARGET_64BIT)
2436 chain = new timode_scalar_chain;
2437 else
2438 chain = new dimode_scalar_chain;
2440 /* Find the instruction chain we want to convert to vector mode.
2441 Check all uses and definitions to estimate all required
2442 conversions. */
2443 chain->build (candidates, uid);
2445 if (chain->compute_convert_gain () > 0)
2446 converted_insns += chain->convert ();
2447 else
2448 if (dump_file)
2449 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2450 chain->chain_id);
2452 delete chain;
2455 if (dump_file)
2456 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2458 BITMAP_FREE (candidates);
2459 bitmap_obstack_release (NULL);
2460 df_process_deferred_rescans ();
2462 /* Conversion means we may have 128-bit register spills/fills
2463 which require an aligned stack. */
2464 if (converted_insns)
2466 if (crtl->stack_alignment_needed < 128)
2467 crtl->stack_alignment_needed = 128;
2468 if (crtl->stack_alignment_estimated < 128)
2469 crtl->stack_alignment_estimated = 128;
2470 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2471 if (TARGET_64BIT)
2472 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2473 parm; parm = DECL_CHAIN (parm))
2475 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2476 continue;
2477 if (DECL_RTL_SET_P (parm)
2478 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2480 rtx r = DECL_RTL (parm);
2481 if (REG_P (r))
2482 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2484 if (DECL_INCOMING_RTL (parm)
2485 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2487 rtx r = DECL_INCOMING_RTL (parm);
2488 if (REG_P (r))
2489 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2494 return 0;
2497 namespace {
2499 const pass_data pass_data_insert_vzeroupper =
2501 RTL_PASS, /* type */
2502 "vzeroupper", /* name */
2503 OPTGROUP_NONE, /* optinfo_flags */
2504 TV_MACH_DEP, /* tv_id */
2505 0, /* properties_required */
2506 0, /* properties_provided */
2507 0, /* properties_destroyed */
2508 0, /* todo_flags_start */
2509 TODO_df_finish, /* todo_flags_finish */
2512 class pass_insert_vzeroupper : public rtl_opt_pass
2514 public:
2515 pass_insert_vzeroupper (gcc::context *ctxt)
2516 : rtl_opt_pass (pass_data_insert_vzeroupper, ctxt)
2519 /* opt_pass methods: */
2520 virtual bool gate (function *)
2522 return TARGET_AVX
2523 && TARGET_VZEROUPPER && flag_expensive_optimizations
2524 && !optimize_size;
2527 virtual unsigned int execute (function *)
2529 return rest_of_handle_insert_vzeroupper ();
2532 }; // class pass_insert_vzeroupper
2534 const pass_data pass_data_stv =
2536 RTL_PASS, /* type */
2537 "stv", /* name */
2538 OPTGROUP_NONE, /* optinfo_flags */
2539 TV_MACH_DEP, /* tv_id */
2540 0, /* properties_required */
2541 0, /* properties_provided */
2542 0, /* properties_destroyed */
2543 0, /* todo_flags_start */
2544 TODO_df_finish, /* todo_flags_finish */
2547 class pass_stv : public rtl_opt_pass
2549 public:
2550 pass_stv (gcc::context *ctxt)
2551 : rtl_opt_pass (pass_data_stv, ctxt),
2552 timode_p (false)
2555 /* opt_pass methods: */
2556 virtual bool gate (function *)
2558 return (timode_p == !!TARGET_64BIT
2559 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2562 virtual unsigned int execute (function *)
2564 return convert_scalars_to_vector ();
2567 opt_pass *clone ()
2569 return new pass_stv (m_ctxt);
2572 void set_pass_param (unsigned int n, bool param)
2574 gcc_assert (n == 0);
2575 timode_p = param;
2578 private:
2579 bool timode_p;
2580 }; // class pass_stv
2582 } // anon namespace
2584 rtl_opt_pass *
2585 make_pass_insert_vzeroupper (gcc::context *ctxt)
2587 return new pass_insert_vzeroupper (ctxt);
2590 rtl_opt_pass *
2591 make_pass_stv (gcc::context *ctxt)
2593 return new pass_stv (ctxt);
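/* Illustrative example for the CET pass below (an added sketch, assuming
   an x86-64 target compiled with -fcf-protection=branch): every function
   that may be reached through an indirect call or jump gets an ENDBR at
   its entry, e.g.

     void (*fp) (void);
     void callee (void) { }   // address taken below, so it gets an ENDBR
     void caller (void) { fp = callee; fp (); }

   Functions marked with the 'nocf_check' attribute, or which the compiler
   can prove are only called directly, are skipped.  */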
2596 /* Inserting ENDBRANCH instructions. */
2598 static unsigned int
2599 rest_of_insert_endbranch (void)
2601 timevar_push (TV_MACH_DEP);
2603 rtx cet_eb;
2604 rtx_insn *insn;
2605 basic_block bb;
2607 /* Currently emit an ENDBR if this is a tracking function, i.e. 'nocf_check'
2608 is absent from the function attributes. Later an optimization will be
2609 introduced to analyze whether the address of a static function is
2610 taken. A static function whose address is not taken will get a
2611 nocf_check attribute. This will allow the number of ENDBRs to be reduced. */
2613 if (!lookup_attribute ("nocf_check",
2614 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2615 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2617 cet_eb = gen_nop_endbr ();
2619 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2620 insn = BB_HEAD (bb);
2621 emit_insn_before (cet_eb, insn);
2624 bb = 0;
2625 FOR_EACH_BB_FN (bb, cfun)
2627 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2628 insn = NEXT_INSN (insn))
2630 if (CALL_P (insn))
2632 bool need_endbr;
2633 need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
2634 if (!need_endbr && !SIBLING_CALL_P (insn))
2636 rtx call = get_call_rtx_from (insn);
2637 rtx fnaddr = XEXP (call, 0);
2638 tree fndecl = NULL_TREE;
2640 /* Also generate ENDBRANCH for a non-tail call which
2641 may return via an indirect branch. */
2642 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
2643 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
2644 if (fndecl == NULL_TREE)
2645 fndecl = MEM_EXPR (fnaddr);
2646 if (fndecl
2647 && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
2648 && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
2649 fndecl = NULL_TREE;
2650 if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
2652 tree fntype = TREE_TYPE (fndecl);
2653 if (lookup_attribute ("indirect_return",
2654 TYPE_ATTRIBUTES (fntype)))
2655 need_endbr = true;
2658 if (!need_endbr)
2659 continue;
2660 /* Generate ENDBRANCH after a CALL that can return more than
2661 once, i.e. setjmp-like functions. */
2663 cet_eb = gen_nop_endbr ();
2664 emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn));
2665 continue;
2668 if (JUMP_P (insn) && flag_cet_switch)
2670 rtx target = JUMP_LABEL (insn);
2671 if (target == NULL_RTX || ANY_RETURN_P (target))
2672 continue;
2675 /* Check whether the jump is a switch table jump. */
2675 rtx_insn *label = as_a<rtx_insn *> (target);
2676 rtx_insn *table = next_insn (label);
2677 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2678 continue;
2680 /* For the indirect jump, find all places it jumps to and insert
2681 ENDBRANCH there. This is done under a special flag that
2682 controls ENDBRANCH generation for switch statements. */
2683 edge_iterator ei;
2684 edge e;
2685 basic_block dest_blk;
2687 FOR_EACH_EDGE (e, ei, bb->succs)
2689 rtx_insn *insn;
2691 dest_blk = e->dest;
2692 insn = BB_HEAD (dest_blk);
2693 gcc_assert (LABEL_P (insn));
2694 cet_eb = gen_nop_endbr ();
2695 emit_insn_after (cet_eb, insn);
2697 continue;
2700 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2701 || (NOTE_P (insn)
2702 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2703 /* TODO. Check /s bit also. */
2705 cet_eb = gen_nop_endbr ();
2706 emit_insn_after (cet_eb, insn);
2707 continue;
2712 timevar_pop (TV_MACH_DEP);
2713 return 0;
2716 namespace {
2718 const pass_data pass_data_insert_endbranch =
2720 RTL_PASS, /* type. */
2721 "cet", /* name. */
2722 OPTGROUP_NONE, /* optinfo_flags. */
2723 TV_MACH_DEP, /* tv_id. */
2724 0, /* properties_required. */
2725 0, /* properties_provided. */
2726 0, /* properties_destroyed. */
2727 0, /* todo_flags_start. */
2728 0, /* todo_flags_finish. */
2731 class pass_insert_endbranch : public rtl_opt_pass
2733 public:
2734 pass_insert_endbranch (gcc::context *ctxt)
2735 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2738 /* opt_pass methods: */
2739 virtual bool gate (function *)
2741 return ((flag_cf_protection & CF_BRANCH));
2744 virtual unsigned int execute (function *)
2746 return rest_of_insert_endbranch ();
2749 }; // class pass_insert_endbranch
2751 } // anon namespace
2753 rtl_opt_pass *
2754 make_pass_insert_endbranch (gcc::context *ctxt)
2756 return new pass_insert_endbranch (ctxt);
2759 /* Return true if a red-zone is in use. We can't use the red-zone when
2760 there are local indirect jumps, like "indirect_jump" or "tablejump",
2761 which jump to another place in the function, since the "call" in the
2762 indirect thunk pushes the return address onto the stack, destroying
2763 the red-zone.
2765 TODO: If we can reserve the first 2 WORDs of the red-zone, one for
2766 PUSH and another for CALL, we can allow local indirect jumps with
2767 the indirect thunk. */
2769 bool
2770 ix86_using_red_zone (void)
2772 return (TARGET_RED_ZONE
2773 && !TARGET_64BIT_MS_ABI
2774 && (!cfun->machine->has_local_indirect_jump
2775 || cfun->machine->indirect_branch_type == indirect_branch_keep));
2778 /* Return a string that documents the current -m options. The caller is
2779 responsible for freeing the string. */
2781 static char *
2782 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2783 int flags, int flags2,
2784 const char *arch, const char *tune,
2785 enum fpmath_unit fpmath, bool add_nl_p)
2787 struct ix86_target_opts
2789 const char *option; /* option string */
2790 HOST_WIDE_INT mask; /* isa mask options */
2793 /* This table is ordered so that options like -msse4.2 that imply other
2794 ISAs come first. The target string will be displayed in the same order. */
2795 static struct ix86_target_opts isa2_opts[] =
2797 { "-mcx16", OPTION_MASK_ISA_CX16 },
2798 { "-mvaes", OPTION_MASK_ISA_VAES },
2799 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2800 { "-mpconfig", OPTION_MASK_ISA_PCONFIG },
2801 { "-mwbnoinvd", OPTION_MASK_ISA_WBNOINVD },
2802 { "-msgx", OPTION_MASK_ISA_SGX },
2803 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2804 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2805 { "-mhle", OPTION_MASK_ISA_HLE },
2806 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2807 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2808 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
2809 { "-mmovdir64b", OPTION_MASK_ISA_MOVDIR64B },
2810 { "-mwaitpkg", OPTION_MASK_ISA_WAITPKG },
2811 { "-mcldemote", OPTION_MASK_ISA_CLDEMOTE }
2813 static struct ix86_target_opts isa_opts[] =
2815 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2816 { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
2817 { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ },
2818 { "-mgfni", OPTION_MASK_ISA_GFNI },
2819 { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
2820 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2821 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2822 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2823 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2824 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2825 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2826 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2827 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2828 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2829 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2830 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2831 { "-mfma", OPTION_MASK_ISA_FMA },
2832 { "-mxop", OPTION_MASK_ISA_XOP },
2833 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2834 { "-mf16c", OPTION_MASK_ISA_F16C },
2835 { "-mavx", OPTION_MASK_ISA_AVX },
2836 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2837 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2838 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2839 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2840 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2841 { "-msse3", OPTION_MASK_ISA_SSE3 },
2842 { "-maes", OPTION_MASK_ISA_AES },
2843 { "-msha", OPTION_MASK_ISA_SHA },
2844 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2845 { "-msse2", OPTION_MASK_ISA_SSE2 },
2846 { "-msse", OPTION_MASK_ISA_SSE },
2847 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2848 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2849 { "-mmmx", OPTION_MASK_ISA_MMX },
2850 { "-mrtm", OPTION_MASK_ISA_RTM },
2851 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2852 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2853 { "-madx", OPTION_MASK_ISA_ADX },
2854 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2855 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2856 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2857 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2858 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2859 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2860 { "-mabm", OPTION_MASK_ISA_ABM },
2861 { "-mbmi", OPTION_MASK_ISA_BMI },
2862 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2863 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2864 { "-mtbm", OPTION_MASK_ISA_TBM },
2865 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2866 { "-msahf", OPTION_MASK_ISA_SAHF },
2867 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2868 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2869 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2870 { "-mpku", OPTION_MASK_ISA_PKU },
2871 { "-mlwp", OPTION_MASK_ISA_LWP },
2872 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2873 { "-mclwb", OPTION_MASK_ISA_CLWB },
2874 { "-mshstk", OPTION_MASK_ISA_SHSTK },
2875 { "-mmovdiri", OPTION_MASK_ISA_MOVDIRI }
2878 /* Flag options. */
2879 static struct ix86_target_opts flag_opts[] =
2881 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2882 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2883 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2884 { "-m80387", MASK_80387 },
2885 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2886 { "-malign-double", MASK_ALIGN_DOUBLE },
2887 { "-mcld", MASK_CLD },
2888 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2889 { "-mieee-fp", MASK_IEEE_FP },
2890 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2891 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2892 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2893 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2894 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2895 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2896 { "-mno-red-zone", MASK_NO_RED_ZONE },
2897 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2898 { "-mrecip", MASK_RECIP },
2899 { "-mrtd", MASK_RTD },
2900 { "-msseregparm", MASK_SSEREGPARM },
2901 { "-mstack-arg-probe", MASK_STACK_PROBE },
2902 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2903 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2904 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2905 { "-mvzeroupper", MASK_VZEROUPPER },
2906 { "-mstv", MASK_STV },
2907 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2908 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2909 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2912 /* Additional flag options. */
2913 static struct ix86_target_opts flag2_opts[] =
2915 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2918 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2919 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2921 char isa_other[40];
2922 char isa2_other[40];
2923 char flags_other[40];
2924 char flags2_other[40];
2925 unsigned num = 0;
2926 unsigned i, j;
2927 char *ret;
2928 char *ptr;
2929 size_t len;
2930 size_t line_len;
2931 size_t sep_len;
2932 const char *abi;
2934 memset (opts, '\0', sizeof (opts));
2936 /* Add -march= option. */
2937 if (arch)
2939 opts[num][0] = "-march=";
2940 opts[num++][1] = arch;
2943 /* Add -mtune= option. */
2944 if (tune)
2946 opts[num][0] = "-mtune=";
2947 opts[num++][1] = tune;
2950 /* Add -m32/-m64/-mx32. */
2951 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2953 if ((isa & OPTION_MASK_ABI_64) != 0)
2954 abi = "-m64";
2955 else
2956 abi = "-mx32";
2957 isa &= ~ (OPTION_MASK_ISA_64BIT
2958 | OPTION_MASK_ABI_64
2959 | OPTION_MASK_ABI_X32);
2961 else
2962 abi = "-m32";
2963 opts[num++][0] = abi;
2965 /* Pick out the options in isa2 options. */
2966 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2968 if ((isa2 & isa2_opts[i].mask) != 0)
2970 opts[num++][0] = isa2_opts[i].option;
2971 isa2 &= ~ isa2_opts[i].mask;
2975 if (isa2 && add_nl_p)
2977 opts[num++][0] = isa2_other;
2978 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2981 /* Pick out the options in isa options. */
2982 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2984 if ((isa & isa_opts[i].mask) != 0)
2986 opts[num++][0] = isa_opts[i].option;
2987 isa &= ~ isa_opts[i].mask;
2991 if (isa && add_nl_p)
2993 opts[num++][0] = isa_other;
2994 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2997 /* Add flag options. */
2998 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
3000 if ((flags & flag_opts[i].mask) != 0)
3002 opts[num++][0] = flag_opts[i].option;
3003 flags &= ~ flag_opts[i].mask;
3007 if (flags && add_nl_p)
3009 opts[num++][0] = flags_other;
3010 sprintf (flags_other, "(other flags: %#x)", flags);
3013 /* Add additional flag options. */
3014 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
3016 if ((flags2 & flag2_opts[i].mask) != 0)
3018 opts[num++][0] = flag2_opts[i].option;
3019 flags2 &= ~ flag2_opts[i].mask;
3023 if (flags2 && add_nl_p)
3025 opts[num++][0] = flags2_other;
3026 sprintf (flags2_other, "(other flags2: %#x)", flags2);
3029 /* Add -fpmath= option. */
3030 if (fpmath)
3032 opts[num][0] = "-mfpmath=";
3033 switch ((int) fpmath)
3035 case FPMATH_387:
3036 opts[num++][1] = "387";
3037 break;
3039 case FPMATH_SSE:
3040 opts[num++][1] = "sse";
3041 break;
3043 case FPMATH_387 | FPMATH_SSE:
3044 opts[num++][1] = "sse+387";
3045 break;
3047 default:
3048 gcc_unreachable ();
3052 /* Any options? */
3053 if (num == 0)
3054 return NULL;
3056 gcc_assert (num < ARRAY_SIZE (opts));
3058 /* Size the string. */
3059 len = 0;
3060 sep_len = (add_nl_p) ? 3 : 1;
3061 for (i = 0; i < num; i++)
3063 len += sep_len;
3064 for (j = 0; j < 2; j++)
3065 if (opts[i][j])
3066 len += strlen (opts[i][j]);
3069 /* Build the string. */
3070 ret = ptr = (char *) xmalloc (len);
3071 line_len = 0;
3073 for (i = 0; i < num; i++)
3075 size_t len2[2];
3077 for (j = 0; j < 2; j++)
3078 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3080 if (i != 0)
3082 *ptr++ = ' ';
3083 line_len++;
3085 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3087 *ptr++ = '\\';
3088 *ptr++ = '\n';
3089 line_len = 0;
3093 for (j = 0; j < 2; j++)
3094 if (opts[i][j])
3096 memcpy (ptr, opts[i][j], len2[j]);
3097 ptr += len2[j];
3098 line_len += len2[j];
3102 *ptr = '\0';
3103 gcc_assert (ret + len >= ptr);
3105 return ret;
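/* Hedged usage note (added example; the exact contents depend on the
   enabled ISA and flag masks): a call such as

     char *s = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
                                   target_flags, ix86_target_flags,
                                   "skylake", "generic", FPMATH_SSE, false);

   returns a heap-allocated string along the lines of
   "-march=skylake -mtune=generic -m64 -msse4.2 ... -mfpmath=sse",
   which the caller must free ().  */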
3108 /* Return true if profiling code should be emitted before the
3109 prologue, and false otherwise.
3110 Note: For x86 with "hotfix" a sorry () is issued. */
3111 static bool
3112 ix86_profile_before_prologue (void)
3114 return flag_fentry != 0;
3117 /* Function that is callable from the debugger to print the current
3118 options. */
3119 void ATTRIBUTE_UNUSED
3120 ix86_debug_options (void)
3122 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3123 target_flags, ix86_target_flags,
3124 ix86_arch_string, ix86_tune_string,
3125 ix86_fpmath, true);
3127 if (opts)
3129 fprintf (stderr, "%s\n\n", opts);
3130 free (opts);
3132 else
3133 fputs ("<no options>\n\n", stderr);
3135 return;
3138 /* Return true if T is one of the bytes we should avoid with
3139 -mmitigate-rop. */
3141 static bool
3142 ix86_rop_should_change_byte_p (int t)
3144 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3147 static const char *stringop_alg_names[] = {
3148 #define DEF_ENUM
3149 #define DEF_ALG(alg, name) #name,
3150 #include "stringop.def"
3151 #undef DEF_ENUM
3152 #undef DEF_ALG
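/* The array above is filled by an X-macro expansion: each
   DEF_ALG (alg, name) entry in stringop.def expands here to the string
   literal "name", so the strings stay in sync with the stringop_alg enum
   generated from the same file. For instance (an illustrative, simplified
   expansion):

     DEF_ALG (libcall, libcall)             ->  "libcall",
     DEF_ALG (rep_prefix_8_byte, rep_8byte) ->  "rep_8byte",
*/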
3155 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3156 The string is of the following form (or a comma-separated list of such entries):
3158 strategy_alg:max_size:[align|noalign]
3160 where the full size range for the strategy is either [0, max_size] or
3161 [min_size, max_size], in which min_size is the max_size + 1 of the
3162 preceding range. The last size range must have max_size == -1.
3164 Examples:
3167 -mmemcpy-strategy=libcall:-1:noalign
3169 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3173 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3175 This tells the compiler to use the following strategy for memset:
3176 1) when the expected size is between [1, 16], use the rep_8byte strategy;
3177 2) when the size is between [17, 2048], use vector_loop;
3178 3) when the size is > 2048, use libcall. */
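/* For instance (an added illustration of the parsing below), the second
   example above fills three stringop_size_range entries, roughly

     { 16,   rep_8byte,   true  }   -- covers [1, 16],    noalign
     { 2048, vector_loop, false }   -- covers [17, 2048], align
     { -1,   libcall,     true  }   -- covers the rest,   noalign

   which then override the default_algs table for memset.  */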
3180 struct stringop_size_range
3182 int max;
3183 stringop_alg alg;
3184 bool noalign;
3187 static void
3188 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3190 const struct stringop_algs *default_algs;
3191 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3192 char *curr_range_str, *next_range_str;
3193 const char *opt = is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=";
3194 int i = 0, n = 0;
3196 if (is_memset)
3197 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3198 else
3199 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3201 curr_range_str = strategy_str;
3205 int maxs;
3206 char alg_name[128];
3207 char align[16];
3208 next_range_str = strchr (curr_range_str, ',');
3209 if (next_range_str)
3210 *next_range_str++ = '\0';
3212 if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
3213 align) != 3)
3215 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3216 return;
3219 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3221 error ("size ranges of option %qs should be increasing", opt);
3222 return;
3225 for (i = 0; i < last_alg; i++)
3226 if (!strcmp (alg_name, stringop_alg_names[i]))
3227 break;
3229 if (i == last_alg)
3231 error ("wrong strategy name %qs specified for option %qs",
3232 alg_name, opt);
3234 auto_vec <const char *> candidates;
3235 for (i = 0; i < last_alg; i++)
3236 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3237 candidates.safe_push (stringop_alg_names[i]);
3239 char *s;
3240 const char *hint
3241 = candidates_list_and_hint (alg_name, s, candidates);
3242 if (hint)
3243 inform (input_location,
3244 "valid arguments to %qs are: %s; did you mean %qs?",
3245 opt, s, hint);
3246 else
3247 inform (input_location, "valid arguments to %qs are: %s",
3248 opt, s);
3249 XDELETEVEC (s);
3250 return;
3253 if ((stringop_alg) i == rep_prefix_8_byte
3254 && !TARGET_64BIT)
3256 /* rep; movq isn't available in 32-bit code. */
3257 error ("strategy name %qs specified for option %qs "
3258 "not supported for 32-bit code", alg_name, opt);
3259 return;
3262 input_ranges[n].max = maxs;
3263 input_ranges[n].alg = (stringop_alg) i;
3264 if (!strcmp (align, "align"))
3265 input_ranges[n].noalign = false;
3266 else if (!strcmp (align, "noalign"))
3267 input_ranges[n].noalign = true;
3268 else
3270 error ("unknown alignment %qs specified for option %qs", align, opt);
3271 return;
3273 n++;
3274 curr_range_str = next_range_str;
3276 while (curr_range_str);
3278 if (input_ranges[n - 1].max != -1)
3280 error ("the max value for the last size range should be -1"
3281 " for option %qs", opt);
3282 return;
3285 if (n > MAX_STRINGOP_ALGS)
3287 error ("too many size ranges specified in option %qs", opt);
3288 return;
3291 /* Now override the default algs array. */
3292 for (i = 0; i < n; i++)
3294 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3295 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3296 = input_ranges[i].alg;
3297 *const_cast<int *>(&default_algs->size[i].noalign)
3298 = input_ranges[i].noalign;
3303 /* Parse the -mtune-ctrl= option. When DUMP is true,
3304 print the features that are explicitly set. */
3306 static void
3307 parse_mtune_ctrl_str (bool dump)
3309 if (!ix86_tune_ctrl_string)
3310 return;
3312 char *next_feature_string = NULL;
3313 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3314 char *orig = curr_feature_string;
3315 int i;
3318 bool clear = false;
3320 next_feature_string = strchr (curr_feature_string, ',');
3321 if (next_feature_string)
3322 *next_feature_string++ = '\0';
3323 if (*curr_feature_string == '^')
3325 curr_feature_string++;
3326 clear = true;
3328 for (i = 0; i < X86_TUNE_LAST; i++)
3330 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3332 ix86_tune_features[i] = !clear;
3333 if (dump)
3334 fprintf (stderr, "Explicitly %s feature %s\n",
3335 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3336 break;
3339 if (i == X86_TUNE_LAST)
3340 error ("unknown parameter to option -mtune-ctrl: %s",
3341 clear ? curr_feature_string - 1 : curr_feature_string);
3342 curr_feature_string = next_feature_string;
3344 while (curr_feature_string);
3345 free (orig);
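/* A hedged usage example for the parser above (feature names come from
   x86-tune.def; the ones shown here are only placeholders):

     -mtune-ctrl=feature_a,^feature_b

   explicitly turns feature_a on and, because of the '^' prefix, turns
   feature_b off, overriding whatever set_ix86_tune_features chose from
   the processor's default tuning mask.  */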
3348 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3349 processor type. */
3351 static void
3352 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3354 unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune;
3355 int i;
3357 for (i = 0; i < X86_TUNE_LAST; ++i)
3359 if (ix86_tune_no_default)
3360 ix86_tune_features[i] = 0;
3361 else
3362 ix86_tune_features[i]
3363 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3366 if (dump)
3368 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3369 for (i = 0; i < X86_TUNE_LAST; i++)
3370 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3371 ix86_tune_features[i] ? "on" : "off");
3374 parse_mtune_ctrl_str (dump);
3378 /* Default align_* from the processor table. */
3380 static void
3381 ix86_default_align (struct gcc_options *opts)
3383 /* -falign-foo without argument: supply one. */
3384 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
3385 opts->x_str_align_loops = processor_target_table[ix86_tune].align_loop;
3386 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
3387 opts->x_str_align_jumps = processor_target_table[ix86_tune].align_jump;
3388 if (opts->x_flag_align_labels && !opts->x_str_align_labels)
3389 opts->x_str_align_labels = processor_target_table[ix86_tune].align_label;
3390 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
3391 opts->x_str_align_functions = processor_target_table[ix86_tune].align_func;
3394 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3396 static void
3397 ix86_override_options_after_change (void)
3399 ix86_default_align (&global_options);
3402 /* Override various settings based on options. If MAIN_ARGS_P, the
3403 options are from the command line, otherwise they are from
3404 attributes. Return true if there's an error related to the
3405 -march option. */
3407 static bool
3408 ix86_option_override_internal (bool main_args_p,
3409 struct gcc_options *opts,
3410 struct gcc_options *opts_set)
3412 int i;
3413 unsigned HOST_WIDE_INT ix86_arch_mask;
3414 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3416 const wide_int_bitmask PTA_3DNOW (HOST_WIDE_INT_1U << 0);
3417 const wide_int_bitmask PTA_3DNOW_A (HOST_WIDE_INT_1U << 1);
3418 const wide_int_bitmask PTA_64BIT (HOST_WIDE_INT_1U << 2);
3419 const wide_int_bitmask PTA_ABM (HOST_WIDE_INT_1U << 3);
3420 const wide_int_bitmask PTA_AES (HOST_WIDE_INT_1U << 4);
3421 const wide_int_bitmask PTA_AVX (HOST_WIDE_INT_1U << 5);
3422 const wide_int_bitmask PTA_BMI (HOST_WIDE_INT_1U << 6);
3423 const wide_int_bitmask PTA_CX16 (HOST_WIDE_INT_1U << 7);
3424 const wide_int_bitmask PTA_F16C (HOST_WIDE_INT_1U << 8);
3425 const wide_int_bitmask PTA_FMA (HOST_WIDE_INT_1U << 9);
3426 const wide_int_bitmask PTA_FMA4 (HOST_WIDE_INT_1U << 10);
3427 const wide_int_bitmask PTA_FSGSBASE (HOST_WIDE_INT_1U << 11);
3428 const wide_int_bitmask PTA_LWP (HOST_WIDE_INT_1U << 12);
3429 const wide_int_bitmask PTA_LZCNT (HOST_WIDE_INT_1U << 13);
3430 const wide_int_bitmask PTA_MMX (HOST_WIDE_INT_1U << 14);
3431 const wide_int_bitmask PTA_MOVBE (HOST_WIDE_INT_1U << 15);
3432 const wide_int_bitmask PTA_NO_SAHF (HOST_WIDE_INT_1U << 16);
3433 const wide_int_bitmask PTA_PCLMUL (HOST_WIDE_INT_1U << 17);
3434 const wide_int_bitmask PTA_POPCNT (HOST_WIDE_INT_1U << 18);
3435 const wide_int_bitmask PTA_PREFETCH_SSE (HOST_WIDE_INT_1U << 19);
3436 const wide_int_bitmask PTA_RDRND (HOST_WIDE_INT_1U << 20);
3437 const wide_int_bitmask PTA_SSE (HOST_WIDE_INT_1U << 21);
3438 const wide_int_bitmask PTA_SSE2 (HOST_WIDE_INT_1U << 22);
3439 const wide_int_bitmask PTA_SSE3 (HOST_WIDE_INT_1U << 23);
3440 const wide_int_bitmask PTA_SSE4_1 (HOST_WIDE_INT_1U << 24);
3441 const wide_int_bitmask PTA_SSE4_2 (HOST_WIDE_INT_1U << 25);
3442 const wide_int_bitmask PTA_SSE4A (HOST_WIDE_INT_1U << 26);
3443 const wide_int_bitmask PTA_SSSE3 (HOST_WIDE_INT_1U << 27);
3444 const wide_int_bitmask PTA_TBM (HOST_WIDE_INT_1U << 28);
3445 const wide_int_bitmask PTA_XOP (HOST_WIDE_INT_1U << 29);
3446 const wide_int_bitmask PTA_AVX2 (HOST_WIDE_INT_1U << 30);
3447 const wide_int_bitmask PTA_BMI2 (HOST_WIDE_INT_1U << 31);
3448 const wide_int_bitmask PTA_RTM (HOST_WIDE_INT_1U << 32);
3449 const wide_int_bitmask PTA_HLE (HOST_WIDE_INT_1U << 33);
3450 const wide_int_bitmask PTA_PRFCHW (HOST_WIDE_INT_1U << 34);
3451 const wide_int_bitmask PTA_RDSEED (HOST_WIDE_INT_1U << 35);
3452 const wide_int_bitmask PTA_ADX (HOST_WIDE_INT_1U << 36);
3453 const wide_int_bitmask PTA_FXSR (HOST_WIDE_INT_1U << 37);
3454 const wide_int_bitmask PTA_XSAVE (HOST_WIDE_INT_1U << 38);
3455 const wide_int_bitmask PTA_XSAVEOPT (HOST_WIDE_INT_1U << 39);
3456 const wide_int_bitmask PTA_AVX512F (HOST_WIDE_INT_1U << 40);
3457 const wide_int_bitmask PTA_AVX512ER (HOST_WIDE_INT_1U << 41);
3458 const wide_int_bitmask PTA_AVX512PF (HOST_WIDE_INT_1U << 42);
3459 const wide_int_bitmask PTA_AVX512CD (HOST_WIDE_INT_1U << 43);
3460 /* Hole after PTA_MPX was removed. */
3461 const wide_int_bitmask PTA_SHA (HOST_WIDE_INT_1U << 45);
3462 const wide_int_bitmask PTA_PREFETCHWT1 (HOST_WIDE_INT_1U << 46);
3463 const wide_int_bitmask PTA_CLFLUSHOPT (HOST_WIDE_INT_1U << 47);
3464 const wide_int_bitmask PTA_XSAVEC (HOST_WIDE_INT_1U << 48);
3465 const wide_int_bitmask PTA_XSAVES (HOST_WIDE_INT_1U << 49);
3466 const wide_int_bitmask PTA_AVX512DQ (HOST_WIDE_INT_1U << 50);
3467 const wide_int_bitmask PTA_AVX512BW (HOST_WIDE_INT_1U << 51);
3468 const wide_int_bitmask PTA_AVX512VL (HOST_WIDE_INT_1U << 52);
3469 const wide_int_bitmask PTA_AVX512IFMA (HOST_WIDE_INT_1U << 53);
3470 const wide_int_bitmask PTA_AVX512VBMI (HOST_WIDE_INT_1U << 54);
3471 const wide_int_bitmask PTA_CLWB (HOST_WIDE_INT_1U << 55);
3472 const wide_int_bitmask PTA_MWAITX (HOST_WIDE_INT_1U << 56);
3473 const wide_int_bitmask PTA_CLZERO (HOST_WIDE_INT_1U << 57);
3474 const wide_int_bitmask PTA_NO_80387 (HOST_WIDE_INT_1U << 58);
3475 const wide_int_bitmask PTA_PKU (HOST_WIDE_INT_1U << 59);
3476 const wide_int_bitmask PTA_AVX5124VNNIW (HOST_WIDE_INT_1U << 60);
3477 const wide_int_bitmask PTA_AVX5124FMAPS (HOST_WIDE_INT_1U << 61);
3478 const wide_int_bitmask PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1U << 62);
3479 const wide_int_bitmask PTA_SGX (HOST_WIDE_INT_1U << 63);
3480 const wide_int_bitmask PTA_AVX512VNNI (0, HOST_WIDE_INT_1U);
3481 const wide_int_bitmask PTA_GFNI (0, HOST_WIDE_INT_1U << 1);
3482 const wide_int_bitmask PTA_VAES (0, HOST_WIDE_INT_1U << 2);
3483 const wide_int_bitmask PTA_AVX512VBMI2 (0, HOST_WIDE_INT_1U << 3);
3484 const wide_int_bitmask PTA_VPCLMULQDQ (0, HOST_WIDE_INT_1U << 4);
3485 const wide_int_bitmask PTA_AVX512BITALG (0, HOST_WIDE_INT_1U << 5);
3486 const wide_int_bitmask PTA_RDPID (0, HOST_WIDE_INT_1U << 6);
3487 const wide_int_bitmask PTA_PCONFIG (0, HOST_WIDE_INT_1U << 7);
3488 const wide_int_bitmask PTA_WBNOINVD (0, HOST_WIDE_INT_1U << 8);
3489 const wide_int_bitmask PTA_WAITPKG (0, HOST_WIDE_INT_1U << 9);
3491 const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
3492 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR;
3493 const wide_int_bitmask PTA_NEHALEM = PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2
3494 | PTA_POPCNT;
3495 const wide_int_bitmask PTA_WESTMERE = PTA_NEHALEM | PTA_AES | PTA_PCLMUL;
3496 const wide_int_bitmask PTA_SANDYBRIDGE = PTA_WESTMERE | PTA_AVX | PTA_XSAVE
3497 | PTA_XSAVEOPT;
3498 const wide_int_bitmask PTA_IVYBRIDGE = PTA_SANDYBRIDGE | PTA_FSGSBASE
3499 | PTA_RDRND | PTA_F16C;
3500 const wide_int_bitmask PTA_HASWELL = PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI
3501 | PTA_BMI2 | PTA_LZCNT | PTA_FMA | PTA_MOVBE | PTA_HLE;
3502 const wide_int_bitmask PTA_BROADWELL = PTA_HASWELL | PTA_ADX | PTA_PRFCHW
3503 | PTA_RDSEED;
3504 const wide_int_bitmask PTA_SKYLAKE = PTA_BROADWELL | PTA_CLFLUSHOPT
3505 | PTA_XSAVEC | PTA_XSAVES | PTA_SGX;
3506 const wide_int_bitmask PTA_SKYLAKE_AVX512 = PTA_SKYLAKE | PTA_AVX512F
3507 | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3508 | PTA_CLWB;
3509 const wide_int_bitmask PTA_CANNONLAKE = PTA_SKYLAKE | PTA_AVX512F
3510 | PTA_AVX512CD | PTA_AVX512VL | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU
3511 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA;
3512 const wide_int_bitmask PTA_ICELAKE_CLIENT = PTA_CANNONLAKE | PTA_AVX512VNNI
3513 | PTA_GFNI | PTA_VAES | PTA_AVX512VBMI2 | PTA_VPCLMULQDQ | PTA_AVX512BITALG
3514 | PTA_RDPID | PTA_CLWB;
3515 const wide_int_bitmask PTA_ICELAKE_SERVER = PTA_ICELAKE_CLIENT | PTA_PCONFIG
3516 | PTA_WBNOINVD;
3517 const wide_int_bitmask PTA_KNL = PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER
3518 | PTA_AVX512F | PTA_AVX512CD;
3519 const wide_int_bitmask PTA_BONNELL = PTA_CORE2 | PTA_MOVBE;
3520 const wide_int_bitmask PTA_SILVERMONT = PTA_WESTMERE | PTA_MOVBE | PTA_RDRND;
3521 const wide_int_bitmask PTA_GOLDMONT = PTA_SILVERMONT | PTA_SHA | PTA_XSAVE
3522 | PTA_RDSEED | PTA_XSAVEC | PTA_XSAVES | PTA_CLFLUSHOPT | PTA_XSAVEOPT
3523 | PTA_FSGSBASE;
3524 const wide_int_bitmask PTA_GOLDMONT_PLUS = PTA_GOLDMONT | PTA_RDPID
3525 | PTA_SGX;
3526 const wide_int_bitmask PTA_TREMONT = PTA_GOLDMONT_PLUS | PTA_CLWB
3527 | PTA_GFNI;
3528 const wide_int_bitmask PTA_KNM = PTA_KNL | PTA_AVX5124VNNIW
3529 | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ;
3531 static struct pta
3533 const char *const name; /* processor name or nickname. */
3534 const enum processor_type processor;
3535 const enum attr_cpu schedule;
3536 const wide_int_bitmask flags;
3538 const processor_alias_table[] =
3540 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3541 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3542 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3543 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3544 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3545 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3546 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3547 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3548 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3549 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3550 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3551 PTA_MMX | PTA_SSE | PTA_FXSR},
3552 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3553 PTA_MMX | PTA_SSE | PTA_FXSR},
3554 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3555 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3556 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3557 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3558 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3559 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3560 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3561 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3562 PTA_MMX | PTA_SSE | PTA_FXSR},
3563 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3564 PTA_MMX | PTA_SSE | PTA_FXSR},
3565 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3566 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3567 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3568 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3569 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3570 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3571 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3572 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3573 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3574 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3575 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3576 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3577 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3578 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3579 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3580 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3581 PTA_SANDYBRIDGE},
3582 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3583 PTA_SANDYBRIDGE},
3584 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3585 PTA_IVYBRIDGE},
3586 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3587 PTA_IVYBRIDGE},
3588 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3589 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3590 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3591 {"skylake", PROCESSOR_SKYLAKE, CPU_HASWELL, PTA_SKYLAKE},
3592 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3593 PTA_SKYLAKE_AVX512},
3594 {"cannonlake", PROCESSOR_CANNONLAKE, CPU_HASWELL, PTA_CANNONLAKE},
3595 {"icelake-client", PROCESSOR_ICELAKE_CLIENT, CPU_HASWELL,
3596 PTA_ICELAKE_CLIENT},
3597 {"icelake-server", PROCESSOR_ICELAKE_SERVER, CPU_HASWELL,
3598 PTA_ICELAKE_SERVER},
3599 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3600 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3601 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3602 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3603 {"goldmont", PROCESSOR_GOLDMONT, CPU_GLM, PTA_GOLDMONT},
3604 {"goldmont-plus", PROCESSOR_GOLDMONT_PLUS, CPU_GLM, PTA_GOLDMONT_PLUS},
3605 {"tremont", PROCESSOR_TREMONT, CPU_GLM, PTA_TREMONT},
3606 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3607 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3608 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3609 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3610 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3611 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3612 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3613 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3614 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3615 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3616 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3617 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3618 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3619 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3620 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3621 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3622 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3623 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3624 {"x86-64", PROCESSOR_K8, CPU_K8,
3625 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3626 {"eden-x2", PROCESSOR_K8, CPU_K8,
3627 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3628 {"nano", PROCESSOR_K8, CPU_K8,
3629 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3630 | PTA_SSSE3 | PTA_FXSR},
3631 {"nano-1000", PROCESSOR_K8, CPU_K8,
3632 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3633 | PTA_SSSE3 | PTA_FXSR},
3634 {"nano-2000", PROCESSOR_K8, CPU_K8,
3635 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3636 | PTA_SSSE3 | PTA_FXSR},
3637 {"nano-3000", PROCESSOR_K8, CPU_K8,
3638 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3639 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3640 {"nano-x2", PROCESSOR_K8, CPU_K8,
3641 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3642 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3643 {"eden-x4", PROCESSOR_K8, CPU_K8,
3644 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3645 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3646 {"nano-x4", PROCESSOR_K8, CPU_K8,
3647 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3648 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3649 {"k8", PROCESSOR_K8, CPU_K8,
3650 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3651 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3652 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3653 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3654 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3655 {"opteron", PROCESSOR_K8, CPU_K8,
3656 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3657 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3658 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3659 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3660 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3661 {"athlon64", PROCESSOR_K8, CPU_K8,
3662 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3663 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3664 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3665 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3666 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3667 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3668 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3669 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3670 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3671 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3672 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3673 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3674 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3675 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3676 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3677 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3678 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3679 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3680 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3681 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3682 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3683 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3684 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3685 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3686 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3687 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3688 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3689 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3690 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3691 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3692 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3693 | PTA_XSAVEOPT | PTA_FSGSBASE},
3694 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3695 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3696 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3697 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3698 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3699 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3700 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3701 | PTA_MOVBE | PTA_MWAITX},
3702 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3703 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3704 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3705 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3706 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3707 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3708 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3709 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3710 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3711 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3712 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3713 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3714 | PTA_FXSR | PTA_XSAVE},
3715 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3716 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3717 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3718 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3719 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3720 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3722 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3723 PTA_64BIT
3724 | PTA_HLE /* flags are only used for -march switch. */ },
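/* Each entry above pairs an -march=/-mtune= name with its scheduling model,
   processor enum and the PTA_* ISA flags it implies; the matching loops
   further below translate those PTA_* bits into OPTION_MASK_ISA_* bits
   unless the user set the corresponding option explicitly.  */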
3727 /* -mrecip options. */
3728 static struct
3730 const char *string; /* option name */
3731 unsigned int mask; /* mask bits to set */
3733 const recip_options[] =
3735 { "all", RECIP_MASK_ALL },
3736 { "none", RECIP_MASK_NONE },
3737 { "div", RECIP_MASK_DIV },
3738 { "sqrt", RECIP_MASK_SQRT },
3739 { "vec-div", RECIP_MASK_VEC_DIV },
3740 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
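/* For example, -mrecip=all,!sqrt enables all of the reciprocal
   approximations except scalar square root; in the parsing loop below a
   leading '!' inverts the named mask and "default" behaves like "all".  */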
3743 int const pta_size = ARRAY_SIZE (processor_alias_table);
3745 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3746 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3747 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3748 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3749 #ifdef TARGET_BI_ARCH
3750 else
3752 #if TARGET_BI_ARCH == 1
3753 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3754 is on and OPTION_MASK_ABI_X32 is off. We turn off
3755 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3756 -mx32. */
3757 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3758 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3759 #else
3760 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3761 on and OPTION_MASK_ABI_64 is off. We turn off
3762 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3763 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3764 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3765 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3766 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3767 #endif
3768 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3769 && TARGET_IAMCU_P (opts->x_target_flags))
3770 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3771 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3773 #endif
3775 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3777 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3778 OPTION_MASK_ABI_64 for TARGET_X32. */
3779 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3780 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3782 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3783 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3784 | OPTION_MASK_ABI_X32
3785 | OPTION_MASK_ABI_64);
3786 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3788 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3789 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3790 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3791 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
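/* After the normalization above, at most one of OPTION_MASK_ABI_64 and
   OPTION_MASK_ABI_X32 remains set.  */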
3794 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3795 SUBTARGET_OVERRIDE_OPTIONS;
3796 #endif
3798 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3799 SUBSUBTARGET_OVERRIDE_OPTIONS;
3800 #endif
3802 /* -fPIC is the default for x86_64. */
3803 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3804 opts->x_flag_pic = 2;
3806 /* Need to check -mtune=generic first. */
3807 if (opts->x_ix86_tune_string)
3809 /* As special support for cross compilers we read -mtune=native
3810 as -mtune=generic. With native compilers we won't see
3811 -mtune=native, as the driver has already rewritten it. */
3812 if (!strcmp (opts->x_ix86_tune_string, "native"))
3814 opts->x_ix86_tune_string = "generic";
3816 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3817 warning (OPT_Wdeprecated,
3818 main_args_p
3819 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3820 "or %<-mtune=generic%> instead as appropriate")
3821 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3822 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3823 " instead as appropriate"));
3825 else
3827 if (opts->x_ix86_arch_string)
3828 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3829 if (!opts->x_ix86_tune_string)
3831 opts->x_ix86_tune_string
3832 = processor_target_table[TARGET_CPU_DEFAULT].name;
3833 ix86_tune_defaulted = 1;
3836 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3837 or defaulted. We need to use a sensible tune option. */
3838 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3840 opts->x_ix86_tune_string = "generic";
3844 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3845 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3847 /* rep; movq isn't available in 32-bit code. */
3848 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3849 opts->x_ix86_stringop_alg = no_stringop;
3852 if (!opts->x_ix86_arch_string)
3853 opts->x_ix86_arch_string
3854 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3855 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3856 else
3857 ix86_arch_specified = 1;
3859 if (opts_set->x_ix86_pmode)
3861 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3862 && opts->x_ix86_pmode == PMODE_SI)
3863 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3864 && opts->x_ix86_pmode == PMODE_DI))
3865 error ("address mode %qs not supported in the %s bit mode",
3866 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3867 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3869 else
3870 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3871 ? PMODE_DI : PMODE_SI;
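/* E.g. plain 64-bit (LP64) code defaults to PMODE_DI, while x32 and
   32-bit code default to PMODE_SI pointers.  */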
3873 if (!opts_set->x_ix86_abi)
3874 opts->x_ix86_abi = DEFAULT_ABI;
3876 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3877 error ("-mabi=ms not supported with X32 ABI");
3878 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3880 /* For targets using the MS ABI, enable ms-extensions unless it was
3881 explicitly turned off. For non-MS-ABI targets we turn this
3882 option off. */
3883 if (!opts_set->x_flag_ms_extensions)
3884 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3886 if (opts_set->x_ix86_cmodel)
3888 switch (opts->x_ix86_cmodel)
3890 case CM_SMALL:
3891 case CM_SMALL_PIC:
3892 if (opts->x_flag_pic)
3893 opts->x_ix86_cmodel = CM_SMALL_PIC;
3894 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3895 error ("code model %qs not supported in the %s bit mode",
3896 "small", "32");
3897 break;
3899 case CM_MEDIUM:
3900 case CM_MEDIUM_PIC:
3901 if (opts->x_flag_pic)
3902 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3903 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3904 error ("code model %qs not supported in the %s bit mode",
3905 "medium", "32");
3906 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3907 error ("code model %qs not supported in x32 mode",
3908 "medium");
3909 break;
3911 case CM_LARGE:
3912 case CM_LARGE_PIC:
3913 if (opts->x_flag_pic)
3914 opts->x_ix86_cmodel = CM_LARGE_PIC;
3915 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3916 error ("code model %qs not supported in the %s bit mode",
3917 "large", "32");
3918 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3919 error ("code model %qs not supported in x32 mode",
3920 "large");
3921 break;
3923 case CM_32:
3924 if (opts->x_flag_pic)
3925 error ("code model %s does not support PIC mode", "32");
3926 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3927 error ("code model %qs not supported in the %s bit mode",
3928 "32", "64");
3929 break;
3931 case CM_KERNEL:
3932 if (opts->x_flag_pic)
3934 error ("code model %s does not support PIC mode", "kernel");
3935 opts->x_ix86_cmodel = CM_32;
3937 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3938 error ("code model %qs not supported in the %s bit mode",
3939 "kernel", "32");
3940 break;
3942 default:
3943 gcc_unreachable ();
3946 else
3948 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3949 use of rip-relative addressing. This eliminates fixups that
3950 would otherwise be needed if this object is to be placed in a
3951 DLL, and is essentially just as efficient as direct addressing. */
3952 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3953 && (TARGET_RDOS || TARGET_PECOFF))
3954 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3955 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3956 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3957 else
3958 opts->x_ix86_cmodel = CM_32;
3960 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3962 error ("-masm=intel not supported in this configuration");
3963 opts->x_ix86_asm_dialect = ASM_ATT;
3965 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3966 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3967 sorry ("%i-bit mode not compiled in",
3968 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3970 for (i = 0; i < pta_size; i++)
3971 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3973 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3975 error (main_args_p
3976 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3977 "switch")
3978 : G_("%<generic%> CPU can be used only for "
3979 "%<target(\"tune=\")%> attribute"));
3980 return false;
3982 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3984 error (main_args_p
3985 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3986 "switch")
3987 : G_("%<intel%> CPU can be used only for "
3988 "%<target(\"tune=\")%> attribute"));
3989 return false;
3992 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3993 && !((processor_alias_table[i].flags & PTA_64BIT) != 0))
3995 error ("CPU you selected does not support x86-64 "
3996 "instruction set");
3997 return false;
4000 ix86_schedule = processor_alias_table[i].schedule;
4001 ix86_arch = processor_alias_table[i].processor;
4002 /* Default cpu tuning to the architecture. */
4003 ix86_tune = ix86_arch;
4005 if (((processor_alias_table[i].flags & PTA_MMX) != 0)
4006 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
4007 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
4008 if (((processor_alias_table[i].flags & PTA_3DNOW) != 0)
4009 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
4010 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
4011 if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0)
4012 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
4013 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
4014 if (((processor_alias_table[i].flags & PTA_SSE) != 0)
4015 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
4016 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
4017 if (((processor_alias_table[i].flags & PTA_SSE2) != 0)
4018 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
4019 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
4020 if (((processor_alias_table[i].flags & PTA_SSE3) != 0)
4021 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
4022 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
4023 if (((processor_alias_table[i].flags & PTA_SSSE3) != 0)
4024 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
4025 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
4026 if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0)
4027 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
4028 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
4029 if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0)
4030 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
4031 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
4032 if (((processor_alias_table[i].flags & PTA_AVX) != 0)
4033 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
4034 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
4035 if (((processor_alias_table[i].flags & PTA_AVX2) != 0)
4036 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
4037 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
4038 if (((processor_alias_table[i].flags & PTA_FMA) != 0)
4039 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
4040 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
4041 if (((processor_alias_table[i].flags & PTA_SSE4A) != 0)
4042 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
4043 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
4044 if (((processor_alias_table[i].flags & PTA_FMA4) != 0)
4045 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
4046 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
4047 if (((processor_alias_table[i].flags & PTA_XOP) != 0)
4048 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
4049 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
4050 if (((processor_alias_table[i].flags & PTA_LWP) != 0)
4051 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
4052 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
4053 if (((processor_alias_table[i].flags & PTA_ABM) != 0)
4054 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
4055 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
4056 if (((processor_alias_table[i].flags & PTA_BMI) != 0)
4057 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
4058 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
4059 if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0)
4060 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
4061 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
4062 if (((processor_alias_table[i].flags & PTA_TBM) != 0)
4063 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
4064 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
4065 if (((processor_alias_table[i].flags & PTA_BMI2) != 0)
4066 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
4067 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
4068 if (((processor_alias_table[i].flags & PTA_CX16) != 0)
4069 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
4070 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
4071 if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0)
4072 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
4073 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
4074 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
4075 && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0))
4076 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4077 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4078 if (((processor_alias_table[i].flags & PTA_MOVBE) != 0)
4079 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
4080 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
4081 if (((processor_alias_table[i].flags & PTA_AES) != 0)
4082 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4083 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
4084 if (((processor_alias_table[i].flags & PTA_SHA) != 0)
4085 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4086 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4087 if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0)
4088 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4089 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4090 if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0)
4091 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4092 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4093 if (((processor_alias_table[i].flags & PTA_RDRND) != 0)
4094 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4095 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4096 if (((processor_alias_table[i].flags & PTA_F16C) != 0)
4097 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4098 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4099 if (((processor_alias_table[i].flags & PTA_RTM) != 0)
4100 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4101 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4102 if (((processor_alias_table[i].flags & PTA_HLE) != 0)
4103 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
4104 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
4105 if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0)
4106 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4107 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4108 if (((processor_alias_table[i].flags & PTA_RDSEED) != 0)
4109 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4110 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4111 if (((processor_alias_table[i].flags & PTA_ADX) != 0)
4112 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4113 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4114 if (((processor_alias_table[i].flags & PTA_FXSR) != 0)
4115 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4116 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4117 if (((processor_alias_table[i].flags & PTA_XSAVE) != 0)
4118 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4119 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4120 if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0)
4121 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4122 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4123 if (((processor_alias_table[i].flags & PTA_AVX512F) != 0)
4124 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4125 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4126 if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0)
4127 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4128 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4129 if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0)
4130 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4131 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4132 if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0)
4133 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4134 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4135 if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0)
4136 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4137 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4138 if (((processor_alias_table[i].flags & PTA_CLWB) != 0)
4139 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4140 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4141 if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0)
4142 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4143 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4144 if (((processor_alias_table[i].flags & PTA_CLZERO) != 0)
4145 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
4146 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
4147 if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0)
4148 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4149 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4150 if (((processor_alias_table[i].flags & PTA_XSAVES) != 0)
4151 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4152 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4153 if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0)
4154 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4155 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4156 if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0)
4157 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4158 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4159 if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0)
4160 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4161 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4162 if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0)
4163 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4164 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4165 if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0)
4166 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4167 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4168 if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0)
4169 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI))
4170 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI;
4171 if (((processor_alias_table[i].flags & PTA_GFNI) != 0)
4172 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI))
4173 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI;
4174 if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0)
4175 && !(opts->x_ix86_isa_flags_explicit
4176 & OPTION_MASK_ISA_AVX512VBMI2))
4177 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2;
4178 if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0)
4179 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ))
4180 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ;
4181 if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0)
4182 && !(opts->x_ix86_isa_flags_explicit
4183 & OPTION_MASK_ISA_AVX512BITALG))
4184 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG;
4186 if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0)
4187 && !(opts->x_ix86_isa_flags2_explicit
4188 & OPTION_MASK_ISA_AVX5124VNNIW))
4189 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4190 if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0)
4191 && !(opts->x_ix86_isa_flags2_explicit
4192 & OPTION_MASK_ISA_AVX5124FMAPS))
4193 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4194 if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0)
4195 && !(opts->x_ix86_isa_flags_explicit
4196 & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4197 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4198 if (((processor_alias_table[i].flags & PTA_SGX) != 0)
4199 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4200 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4201 if (((processor_alias_table[i].flags & PTA_VAES) != 0)
4202 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES))
4203 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES;
4204 if (((processor_alias_table[i].flags & PTA_RDPID) != 0)
4205 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID))
4206 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID;
4207 if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0)
4208 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG))
4209 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG;
4210 if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0)
4211 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD))
4212 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD;
4214 if ((processor_alias_table[i].flags
4215 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)
4216 x86_prefetch_sse = true;
4217 if (((processor_alias_table[i].flags & PTA_MWAITX) != 0)
4218 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
4219 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
4220 if (((processor_alias_table[i].flags & PTA_PKU) != 0)
4221 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4222 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4224 /* Don't enable x87 instructions if only
4225 general registers are allowed. */
4226 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4227 && !(opts_set->x_target_flags & MASK_80387))
4229 if (((processor_alias_table[i].flags & PTA_NO_80387) != 0))
4230 opts->x_target_flags &= ~MASK_80387;
4231 else
4232 opts->x_target_flags |= MASK_80387;
4234 break;
4237 if (i == pta_size)
4239 error (main_args_p
4240 ? G_("bad value (%qs) for %<-march=%> switch")
4241 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4242 opts->x_ix86_arch_string);
4244 auto_vec <const char *> candidates;
4245 for (i = 0; i < pta_size; i++)
4246 if (strcmp (processor_alias_table[i].name, "generic")
4247 && strcmp (processor_alias_table[i].name, "intel")
4248 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4249 || ((processor_alias_table[i].flags & PTA_64BIT) != 0)))
4250 candidates.safe_push (processor_alias_table[i].name);
4252 #ifdef HAVE_LOCAL_CPU_DETECT
4253 /* Also add "native" as a possible value. */
4254 candidates.safe_push ("native");
4255 #endif
4257 char *s;
4258 const char *hint
4259 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4260 if (hint)
4261 inform (input_location,
4262 main_args_p
4263 ? G_("valid arguments to %<-march=%> switch are: "
4264 "%s; did you mean %qs?")
4265 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4266 "%s; did you mean %qs?"), s, hint);
4267 else
4268 inform (input_location,
4269 main_args_p
4270 ? G_("valid arguments to %<-march=%> switch are: %s")
4271 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4272 "are: %s"), s);
4273 XDELETEVEC (s);
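/* The main_args_p distinction in the diagnostics above matters because this
   routine also runs for the function-level form, e.g. (a hypothetical
   example)
     int foo (void) __attribute__ ((target ("arch=k8")));
   which goes through the same table lookup as -march=k8.  */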
4276 ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
4277 for (i = 0; i < X86_ARCH_LAST; ++i)
4278 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4280 for (i = 0; i < pta_size; i++)
4281 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4283 ix86_schedule = processor_alias_table[i].schedule;
4284 ix86_tune = processor_alias_table[i].processor;
4285 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4287 if (!((processor_alias_table[i].flags & PTA_64BIT) != 0))
4289 if (ix86_tune_defaulted)
4291 opts->x_ix86_tune_string = "x86-64";
4292 for (i = 0; i < pta_size; i++)
4293 if (! strcmp (opts->x_ix86_tune_string,
4294 processor_alias_table[i].name))
4295 break;
4296 ix86_schedule = processor_alias_table[i].schedule;
4297 ix86_tune = processor_alias_table[i].processor;
4299 else
4300 error ("CPU you selected does not support x86-64 "
4301 "instruction set");
4304 /* Intel CPUs have always interpreted SSE prefetch instructions as
4305 NOPs; so, we can enable SSE prefetch instructions even when
4306 -mtune (rather than -march) points us to a processor that has them.
4307 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4308 higher processors. */
4309 if (TARGET_CMOV
4310 && ((processor_alias_table[i].flags
4311 & (PTA_PREFETCH_SSE | PTA_SSE)) != 0))
4312 x86_prefetch_sse = true;
4313 break;
4316 if (ix86_tune_specified && i == pta_size)
4318 error (main_args_p
4319 ? G_("bad value (%qs) for %<-mtune=%> switch")
4320 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4321 opts->x_ix86_tune_string);
4323 auto_vec <const char *> candidates;
4324 for (i = 0; i < pta_size; i++)
4325 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4326 || ((processor_alias_table[i].flags & PTA_64BIT) != 0))
4327 candidates.safe_push (processor_alias_table[i].name);
4329 #ifdef HAVE_LOCAL_CPU_DETECT
4330 /* Also add "native" as a possible value. */
4331 candidates.safe_push ("native");
4332 #endif
4334 char *s;
4335 const char *hint
4336 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4337 if (hint)
4338 inform (input_location,
4339 main_args_p
4340 ? G_("valid arguments to %<-mtune=%> switch are: "
4341 "%s; did you mean %qs?")
4342 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4343 "%s; did you mean %qs?"), s, hint);
4344 else
4345 inform (input_location,
4346 main_args_p
4347 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4348 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4349 "are: %s"), s);
4350 XDELETEVEC (s);
4353 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4355 #ifndef USE_IX86_FRAME_POINTER
4356 #define USE_IX86_FRAME_POINTER 0
4357 #endif
4359 #ifndef USE_X86_64_FRAME_POINTER
4360 #define USE_X86_64_FRAME_POINTER 0
4361 #endif
4363 /* Set the default values for switches whose default depends on TARGET_64BIT
4364 in case they weren't overwritten by command line options. */
4365 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4367 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4368 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4369 if (opts->x_flag_asynchronous_unwind_tables
4370 && !opts_set->x_flag_unwind_tables
4371 && TARGET_64BIT_MS_ABI)
4372 opts->x_flag_unwind_tables = 1;
4373 if (opts->x_flag_asynchronous_unwind_tables == 2)
4374 opts->x_flag_unwind_tables
4375 = opts->x_flag_asynchronous_unwind_tables = 1;
4376 if (opts->x_flag_pcc_struct_return == 2)
4377 opts->x_flag_pcc_struct_return = 0;
4379 else
4381 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4382 opts->x_flag_omit_frame_pointer
4383 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4384 if (opts->x_flag_asynchronous_unwind_tables == 2)
4385 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4386 if (opts->x_flag_pcc_struct_return == 2)
4388 /* Intel MCU psABI specifies that -freg-struct-return should
4389 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4390 we check -miamcu so that -freg-struct-return is always
4391 turned on if -miamcu is used. */
4392 if (TARGET_IAMCU_P (opts->x_target_flags))
4393 opts->x_flag_pcc_struct_return = 0;
4394 else
4395 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4399 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4400 /* TODO: ix86_cost should be chosen at instruction or function granularity
4401 so that for cold code we use size_cost even in !optimize_size compilation. */
4402 if (opts->x_optimize_size)
4403 ix86_cost = &ix86_size_cost;
4404 else
4405 ix86_cost = ix86_tune_cost;
4407 /* Arrange to set up i386_stack_locals for all functions. */
4408 init_machine_status = ix86_init_machine_status;
4410 /* Validate -mregparm= value. */
4411 if (opts_set->x_ix86_regparm)
4413 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4414 warning (0, "-mregparm is ignored in 64-bit mode");
4415 else if (TARGET_IAMCU_P (opts->x_target_flags))
4416 warning (0, "-mregparm is ignored for Intel MCU psABI");
4417 if (opts->x_ix86_regparm > REGPARM_MAX)
4419 error ("-mregparm=%d is not between 0 and %d",
4420 opts->x_ix86_regparm, REGPARM_MAX);
4421 opts->x_ix86_regparm = 0;
4424 if (TARGET_IAMCU_P (opts->x_target_flags)
4425 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4426 opts->x_ix86_regparm = REGPARM_MAX;
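/* Those ABIs already pass the leading arguments in registers by default,
   so regparm is simply forced to its maximum for them.  */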
4428 /* Default align_* from the processor table. */
4429 ix86_default_align (opts);
4431 /* Provide default for -mbranch-cost= value. */
4432 if (!opts_set->x_ix86_branch_cost)
4433 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4435 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4437 opts->x_target_flags
4438 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4440 /* Enable by default the SSE and MMX builtins. Do allow the user to
4441 explicitly disable any of these. In particular, disabling SSE and
4442 MMX for kernel code is extremely useful. */
4443 if (!ix86_arch_specified)
4444 opts->x_ix86_isa_flags
4445 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4446 | TARGET_SUBTARGET64_ISA_DEFAULT)
4447 & ~opts->x_ix86_isa_flags_explicit);
4449 if (TARGET_RTD_P (opts->x_target_flags))
4450 warning (0,
4451 main_args_p
4452 ? G_("%<-mrtd%> is ignored in 64bit mode")
4453 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4455 else
4457 opts->x_target_flags
4458 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4460 if (!ix86_arch_specified)
4461 opts->x_ix86_isa_flags
4462 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4464 /* The i386 ABI does not specify a red zone. It still makes sense to use it
4465 when the programmer takes care to keep the stack from being destroyed. */
4466 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4467 opts->x_target_flags |= MASK_NO_RED_ZONE;
4470 /* Keep nonleaf frame pointers. */
4471 if (opts->x_flag_omit_frame_pointer)
4472 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4473 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4474 opts->x_flag_omit_frame_pointer = 1;
4476 /* If we're doing fast math, we don't care about comparison order
4477 wrt NaNs. This lets us use a shorter comparison sequence. */
4478 if (opts->x_flag_finite_math_only)
4479 opts->x_target_flags &= ~MASK_IEEE_FP;
4481 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4482 since the insns won't need emulation. */
4483 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4484 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4486 /* Likewise, if the target doesn't have a 387, or we've specified
4487 software floating point, don't use 387 inline intrinsics. */
4488 if (!TARGET_80387_P (opts->x_target_flags))
4489 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4491 /* Turn on MMX builtins for -msse. */
4492 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4493 opts->x_ix86_isa_flags
4494 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4496 /* Enable SSE prefetch. */
4497 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4498 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4499 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4500 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4501 x86_prefetch_sse = true;
4503 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4504 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4505 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4506 opts->x_ix86_isa_flags
4507 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4509 /* Enable lzcnt instruction for -mabm. */
4510 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
4511 opts->x_ix86_isa_flags
4512 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4514 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4515 if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
4516 opts->x_ix86_isa_flags
4517 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4518 & ~opts->x_ix86_isa_flags_explicit);
4520 /* Validate -mpreferred-stack-boundary= value or default it to
4521 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4522 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4523 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4525 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
4526 int max = TARGET_SEH ? 4 : 12;
4528 if (opts->x_ix86_preferred_stack_boundary_arg < min
4529 || opts->x_ix86_preferred_stack_boundary_arg > max)
4531 if (min == max)
4532 error ("-mpreferred-stack-boundary is not supported "
4533 "for this target");
4534 else
4535 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4536 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4538 else
4539 ix86_preferred_stack_boundary
4540 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
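/* The option argument is the log2 of the boundary in bytes, so e.g.
   -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT = 128 bits,
   i.e. a 16-byte aligned stack.  */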
4543 /* Set the default value for -mstackrealign. */
4544 if (!opts_set->x_ix86_force_align_arg_pointer)
4545 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4547 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4549 /* Validate -mincoming-stack-boundary= value or default it to
4550 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4551 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4552 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4554 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4556 if (opts->x_ix86_incoming_stack_boundary_arg < min
4557 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4558 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4559 opts->x_ix86_incoming_stack_boundary_arg, min);
4560 else
4562 ix86_user_incoming_stack_boundary
4563 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4564 ix86_incoming_stack_boundary
4565 = ix86_user_incoming_stack_boundary;
4569 #ifndef NO_PROFILE_COUNTERS
4570 if (flag_nop_mcount)
4571 error ("-mnop-mcount is not compatible with this target");
4572 #endif
4573 if (flag_nop_mcount && flag_pic)
4574 error ("-mnop-mcount is not implemented for -fPIC");
4576 /* Accept -msseregparm only if at least SSE support is enabled. */
4577 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4578 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4579 error (main_args_p
4580 ? G_("%<-msseregparm%> used without SSE enabled")
4581 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4583 if (opts_set->x_ix86_fpmath)
4585 if (opts->x_ix86_fpmath & FPMATH_SSE)
4587 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4589 if (TARGET_80387_P (opts->x_target_flags))
4591 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4592 opts->x_ix86_fpmath = FPMATH_387;
4595 else if ((opts->x_ix86_fpmath & FPMATH_387)
4596 && !TARGET_80387_P (opts->x_target_flags))
4598 warning (0, "387 instruction set disabled, using SSE arithmetics");
4599 opts->x_ix86_fpmath = FPMATH_SSE;
4603 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4604 -mfpmath=387. The latter is nevertheless the default on many targets,
4605 since the extra 80-bit precision of temporaries is considered part of
4606 the ABI. Overwrite the default at least for -ffast-math.
4607 TODO: -mfpmath=both seems to produce similarly performing code with
4608 slightly smaller binaries. It is however not clear whether register
4609 allocation is ready for this setting.
4610 Also -mfpmath=387 is overall a lot more compact (about 4-5%) than SSE
4611 codegen. We may switch to 387 with -ffast-math for size-optimized
4612 functions. */
4613 else if (fast_math_flags_set_p (&global_options)
4614 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4615 opts->x_ix86_fpmath = FPMATH_SSE;
4616 else
4617 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
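/* So with -ffast-math and an SSE2-capable -march the default effectively
   becomes -mfpmath=sse; otherwise TARGET_FPMATH_DEFAULT_P decides.  */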
4619 /* Use external vectorized library in vectorizing intrinsics. */
4620 if (opts_set->x_ix86_veclibabi_type)
4621 switch (opts->x_ix86_veclibabi_type)
4623 case ix86_veclibabi_type_svml:
4624 ix86_veclib_handler = ix86_veclibabi_svml;
4625 break;
4627 case ix86_veclibabi_type_acml:
4628 ix86_veclib_handler = ix86_veclibabi_acml;
4629 break;
4631 default:
4632 gcc_unreachable ();
4635 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4636 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4637 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4639 /* If stack probes are required, the space used for large function
4640 arguments on the stack must also be probed; enable
4641 -maccumulate-outgoing-args so that this happens in the prologue. */
4642 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4643 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4645 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4646 warning (0,
4647 main_args_p
4648 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4649 "for correctness")
4650 : G_("stack probing requires "
4651 "%<target(\"accumulate-outgoing-args\")%> for "
4652 "correctness"));
4653 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4656 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4657 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4658 if (fixed_regs[BP_REG]
4659 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4661 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4662 warning (0,
4663 main_args_p
4664 ? G_("fixed ebp register requires "
4665 "%<-maccumulate-outgoing-args%>")
4666 : G_("fixed ebp register requires "
4667 "%<target(\"accumulate-outgoing-args\")%>"));
4668 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4671 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4673 char *p;
4674 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4675 p = strchr (internal_label_prefix, 'X');
4676 internal_label_prefix_len = p - internal_label_prefix;
4677 *p = '\0';
4680 /* When no scheduling description is available, disable the scheduler pass
4681 so it won't slow down compilation or make x87 code slower. */
4682 if (!TARGET_SCHEDULE)
4683 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4685 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4686 ix86_tune_cost->simultaneous_prefetches,
4687 opts->x_param_values,
4688 opts_set->x_param_values);
4689 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4690 ix86_tune_cost->prefetch_block,
4691 opts->x_param_values,
4692 opts_set->x_param_values);
4693 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4694 ix86_tune_cost->l1_cache_size,
4695 opts->x_param_values,
4696 opts_set->x_param_values);
4697 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4698 ix86_tune_cost->l2_cache_size,
4699 opts->x_param_values,
4700 opts_set->x_param_values);
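/* These calls only seed the generic prefetch and cache-size params from the
   selected cost table; values given explicitly on the command line via
   --param are left untouched by maybe_set_param_value.  */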
4702 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4703 if (opts->x_flag_prefetch_loop_arrays < 0
4704 && HAVE_prefetch
4705 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4706 && !opts->x_optimize_size
4707 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4708 opts->x_flag_prefetch_loop_arrays = 1;
4710 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4711 can be optimized to ap = __builtin_next_arg (0). */
4712 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4713 targetm.expand_builtin_va_start = NULL;
4715 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4717 ix86_gen_leave = gen_leave_rex64;
4718 if (Pmode == DImode)
4720 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4721 ix86_gen_tls_local_dynamic_base_64
4722 = gen_tls_local_dynamic_base_64_di;
4724 else
4726 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4727 ix86_gen_tls_local_dynamic_base_64
4728 = gen_tls_local_dynamic_base_64_si;
4731 else
4732 ix86_gen_leave = gen_leave;
4734 if (Pmode == DImode)
4736 ix86_gen_add3 = gen_adddi3;
4737 ix86_gen_sub3 = gen_subdi3;
4738 ix86_gen_sub3_carry = gen_subdi3_carry;
4739 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4740 ix86_gen_andsp = gen_anddi3;
4741 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4742 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4743 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4744 ix86_gen_monitor = gen_sse3_monitor_di;
4745 ix86_gen_monitorx = gen_monitorx_di;
4746 ix86_gen_clzero = gen_clzero_di;
4748 else
4750 ix86_gen_add3 = gen_addsi3;
4751 ix86_gen_sub3 = gen_subsi3;
4752 ix86_gen_sub3_carry = gen_subsi3_carry;
4753 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4754 ix86_gen_andsp = gen_andsi3;
4755 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4756 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4757 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4758 ix86_gen_monitor = gen_sse3_monitor_si;
4759 ix86_gen_monitorx = gen_monitorx_si;
4760 ix86_gen_clzero = gen_clzero_si;
4763 #ifdef USE_IX86_CLD
4764 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4765 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4766 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4767 #endif
4769 /* Set the default value for -mfentry. */
4770 if (!opts_set->x_flag_fentry)
4771 opts->x_flag_fentry = TARGET_SEH;
4772 else
4774 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4775 && opts->x_flag_fentry)
4776 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4777 "with -fpic");
4778 else if (TARGET_SEH && !opts->x_flag_fentry)
4779 sorry ("-mno-fentry isn%'t compatible with SEH");
4782 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4783 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4785 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4786 && TARGET_EMIT_VZEROUPPER)
4787 opts->x_target_flags |= MASK_VZEROUPPER;
4788 if (!(opts_set->x_target_flags & MASK_STV))
4789 opts->x_target_flags |= MASK_STV;
4790 /* Disable STV if -mpreferred-stack-boundary={2,3},
4791 -mincoming-stack-boundary={2,3} or -mstackrealign is used - the needed
4792 stack realignment would be an extra cost the pass doesn't take into
4793 account, and the pass can't realign the stack. */
4794 if (ix86_preferred_stack_boundary < 128
4795 || ix86_incoming_stack_boundary < 128
4796 || opts->x_ix86_force_align_arg_pointer)
4797 opts->x_target_flags &= ~MASK_STV;
4798 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4799 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4800 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4801 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4802 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4803 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4805 /* Enable 128-bit AVX instruction generation
4806 for the auto-vectorizer. */
4807 if (TARGET_AVX128_OPTIMAL
4808 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4809 opts->x_prefer_vector_width_type = PVW_AVX128;
4811 /* Use 256-bit AVX instruction generation
4812 in the auto-vectorizer. */
4813 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4814 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4815 opts->x_prefer_vector_width_type = PVW_AVX256;
4817 if (opts->x_ix86_recip_name)
4819 char *p = ASTRDUP (opts->x_ix86_recip_name);
4820 char *q;
4821 unsigned int mask, i;
4822 bool invert;
4824 while ((q = strtok (p, ",")) != NULL)
4826 p = NULL;
4827 if (*q == '!')
4829 invert = true;
4830 q++;
4832 else
4833 invert = false;
4835 if (!strcmp (q, "default"))
4836 mask = RECIP_MASK_ALL;
4837 else
4839 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4840 if (!strcmp (q, recip_options[i].string))
4842 mask = recip_options[i].mask;
4843 break;
4846 if (i == ARRAY_SIZE (recip_options))
4848 error ("unknown option for -mrecip=%s", q);
4849 invert = false;
4850 mask = RECIP_MASK_NONE;
4854 opts->x_recip_mask_explicit |= mask;
4855 if (invert)
4856 opts->x_recip_mask &= ~mask;
4857 else
4858 opts->x_recip_mask |= mask;
4862 if (TARGET_RECIP_P (opts->x_target_flags))
4863 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4864 else if (opts_set->x_target_flags & MASK_RECIP)
4865 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4867 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4868 for 64-bit Bionic. Also default long double to 64-bit for Intel
4869 MCU psABI. */
4870 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4871 && !(opts_set->x_target_flags
4872 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4873 opts->x_target_flags |= (TARGET_64BIT
4874 ? MASK_LONG_DOUBLE_128
4875 : MASK_LONG_DOUBLE_64);
4877 /* Only one of MASK_LONG_DOUBLE_64 and MASK_LONG_DOUBLE_128 can be active. */
4878 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4879 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4881 /* Handle stack protector */
4882 if (!opts_set->x_ix86_stack_protector_guard)
4883 opts->x_ix86_stack_protector_guard
4884 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4886 #ifdef TARGET_THREAD_SSP_OFFSET
4887 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4888 #endif
4890 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4892 char *endp;
4893 const char *str = ix86_stack_protector_guard_offset_str;
4895 errno = 0;
4896 int64_t offset;
4898 #if defined(INT64_T_IS_LONG)
4899 offset = strtol (str, &endp, 0);
4900 #else
4901 offset = strtoll (str, &endp, 0);
4902 #endif
4904 if (!*str || *endp || errno)
4905 error ("%qs is not a valid number "
4906 "in -mstack-protector-guard-offset=", str);
4908 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4909 HOST_WIDE_INT_C (0x7fffffff)))
4910 error ("%qs is not a valid offset "
4911 "in -mstack-protector-guard-offset=", str);
4913 ix86_stack_protector_guard_offset = offset;
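/* Typical usage (an illustrative value, not a built-in default) would be
   something like -mstack-protector-guard-offset=0x28, matching the TLS
   stack-guard slot used by common glibc x86-64 configurations.  */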
4916 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4918 /* The kernel uses a different segment register for performance
4919 reasons: this way a system call does not have to trash the userspace
4920 segment register, which would be expensive. */
4921 if (ix86_cmodel == CM_KERNEL)
4922 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4924 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4926 const char *str = ix86_stack_protector_guard_reg_str;
4927 addr_space_t seg = ADDR_SPACE_GENERIC;
4929 /* Discard optional register prefix. */
4930 if (str[0] == '%')
4931 str++;
4933 if (strlen (str) == 2 && str[1] == 's')
4935 if (str[0] == 'f')
4936 seg = ADDR_SPACE_SEG_FS;
4937 else if (str[0] == 'g')
4938 seg = ADDR_SPACE_SEG_GS;
4941 if (seg == ADDR_SPACE_GENERIC)
4942 error ("%qs is not a valid base register "
4943 "in -mstack-protector-guard-reg=",
4944 ix86_stack_protector_guard_reg_str);
4946 ix86_stack_protector_guard_reg = seg;
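/* Only %fs and %gs are accepted here, with the leading '%' optional;
   e.g. -mstack-protector-guard-reg=gs selects the %gs-based guard.  */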
4949 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4950 if (opts->x_ix86_tune_memcpy_strategy)
4952 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4953 ix86_parse_stringop_strategy_string (str, false);
4954 free (str);
4957 if (opts->x_ix86_tune_memset_strategy)
4959 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4960 ix86_parse_stringop_strategy_string (str, true);
4961 free (str);
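/* Both strings are comma-separated lists of alg:max_size:dest_align
   triplets parsed by ix86_parse_stringop_strategy_string; the exact grammar
   is documented with the -mmemcpy-strategy= option.  */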
4964 /* Save the initial options in case the user uses function-specific
4965 options. */
4966 if (main_args_p)
4967 target_option_default_node = target_option_current_node
4968 = build_target_option_node (opts);
4970 if (opts->x_flag_cf_protection != CF_NONE)
4971 opts->x_flag_cf_protection =
4972 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4974 if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS])
4975 maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128,
4976 opts->x_param_values,
4977 opts_set->x_param_values);
4979 return true;
4982 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4984 static void
4985 ix86_option_override (void)
4987 ix86_option_override_internal (true, &global_options, &global_options_set);
4990 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4991 static char *
4992 ix86_offload_options (void)
4994 if (TARGET_LP64)
4995 return xstrdup ("-foffload-abi=lp64");
4996 return xstrdup ("-foffload-abi=ilp32");
4999 /* Update register usage after having seen the compiler flags. */
5001 static void
5002 ix86_conditional_register_usage (void)
5004 int i, c_mask;
5006 /* If there are no caller-saved registers, preserve all registers
5007 except fixed_regs and the registers used for the function return value,
5008 since aggregate_value_p checks call_used_regs[regno] on the return
5009 value. */
5010 if (cfun && cfun->machine->no_caller_saved_registers)
5011 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5012 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
5013 call_used_regs[i] = 0;
5015 /* For 32-bit targets, squash the REX registers. */
5016 if (! TARGET_64BIT)
5018 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
5019 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5020 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
5021 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5022 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5023 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5026 /* See the definition of CALL_USED_REGISTERS in i386.h. */
5027 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
5029 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
5031 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5033 /* Set/reset conditionally defined registers from
5034 CALL_USED_REGISTERS initializer. */
5035 if (call_used_regs[i] > 1)
5036 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
5038 /* Compute the CLOBBERED_REGS register set as the call-used
5039 registers from the GENERAL_REGS register set. */
5040 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
5041 && call_used_regs[i])
5042 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
5045 /* If MMX is disabled, squash the registers. */
5046 if (! TARGET_MMX)
5047 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5048 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
5049 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5051 /* If SSE is disabled, squash the registers. */
5052 if (! TARGET_SSE)
5053 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5054 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
5055 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5057 /* If the FPU is disabled, squash the registers. */
5058 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
5059 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
5060 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
5061 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5063 /* If AVX512F is disabled, squash the registers. */
5064 if (! TARGET_AVX512F)
5066 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
5067 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5069 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
5070 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
5074 /* Canonicalize a comparison from one we don't have to one we do have. */
5076 static void
5077 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
5078 bool op0_preserve_value)
5080 /* The order of operands in an x87 ficom compare is forced by combine in
5081 the simplify_comparison () function. The float operator is treated as
5082 RTX_OBJ with precedence over other operators and is always placed
5083 first. Swap the condition and operands to match the ficom instruction. */
5084 if (!op0_preserve_value
5085 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5087 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5089 /* We are called only for compares that are split to the SAHF instruction.
5090 Ensure that we have a setcc/jcc insn for the swapped condition. */
5091 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5093 std::swap (*op0, *op1);
5094 *code = (int) scode;
5099 /* Save the current options */
5101 static void
5102 ix86_function_specific_save (struct cl_target_option *ptr,
5103 struct gcc_options *opts)
5105 ptr->arch = ix86_arch;
5106 ptr->schedule = ix86_schedule;
5107 ptr->prefetch_sse = x86_prefetch_sse;
5108 ptr->tune = ix86_tune;
5109 ptr->branch_cost = ix86_branch_cost;
5110 ptr->tune_defaulted = ix86_tune_defaulted;
5111 ptr->arch_specified = ix86_arch_specified;
5112 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5113 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5114 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5115 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5116 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5117 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5118 ptr->x_ix86_abi = opts->x_ix86_abi;
5119 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5120 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5121 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5122 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5123 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5124 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5125 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5126 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5127 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5128 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5129 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5130 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5131 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5132 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5133 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5134 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5135 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5136 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5137 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5138 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5140 /* The fields are char but the variables are not; make sure the
5141 values fit in the fields. */
5142 gcc_assert (ptr->arch == ix86_arch);
5143 gcc_assert (ptr->schedule == ix86_schedule);
5144 gcc_assert (ptr->tune == ix86_tune);
5145 gcc_assert (ptr->branch_cost == ix86_branch_cost);
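/* Together with ix86_function_specific_restore below, this implements the
   option save/restore hooks used when the target attribute or pragma
   switches options on a per-function basis.  */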
5148 /* Restore the current options */
5150 static void
5151 ix86_function_specific_restore (struct gcc_options *opts,
5152 struct cl_target_option *ptr)
5154 enum processor_type old_tune = ix86_tune;
5155 enum processor_type old_arch = ix86_arch;
5156 unsigned HOST_WIDE_INT ix86_arch_mask;
5157 int i;
5159 /* We don't change -fPIC. */
5160 opts->x_flag_pic = flag_pic;
5162 ix86_arch = (enum processor_type) ptr->arch;
5163 ix86_schedule = (enum attr_cpu) ptr->schedule;
5164 ix86_tune = (enum processor_type) ptr->tune;
5165 x86_prefetch_sse = ptr->prefetch_sse;
5166 opts->x_ix86_branch_cost = ptr->branch_cost;
5167 ix86_tune_defaulted = ptr->tune_defaulted;
5168 ix86_arch_specified = ptr->arch_specified;
5169 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5170 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5171 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5172 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5173 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5174 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5175 opts->x_ix86_abi = ptr->x_ix86_abi;
5176 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5177 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5178 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5179 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5180 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5181 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5182 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5183 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5184 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5185 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5186 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5187 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5188 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5189 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5190 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5191 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5192 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5193 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5194 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5195 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5196 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5197 /* TODO: ix86_cost should be chosen at instruction or function granularity
5198 so that for cold code we use size_cost even in !optimize_size compilation. */
5199 if (opts->x_optimize_size)
5200 ix86_cost = &ix86_size_cost;
5201 else
5202 ix86_cost = ix86_tune_cost;
5204 /* Recreate the arch feature tests if the arch changed */
5205 if (old_arch != ix86_arch)
5207 ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
5208 for (i = 0; i < X86_ARCH_LAST; ++i)
5209 ix86_arch_features[i]
5210 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5213 /* Recreate the tune optimization tests */
5214 if (old_tune != ix86_tune)
5215 set_ix86_tune_features (ix86_tune, false);
5218 /* Adjust target options after streaming them in. This is mainly about
5219 reconciling them with global options. */
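/* For example, options streamed in with CM_SMALL while flag_pic is now set
   are remapped to CM_SMALL_PIC below, and CM_SMALL_PIC back to CM_SMALL
   when flag_pic is off.  */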
5221 static void
5222 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5224 /* flag_pic is a global option, but ix86_cmodel is a target-saved option
5225 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5226 for PIC, or error out. */
5227 if (flag_pic)
5228 switch (ptr->x_ix86_cmodel)
5230 case CM_SMALL:
5231 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5232 break;
5234 case CM_MEDIUM:
5235 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5236 break;
5238 case CM_LARGE:
5239 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5240 break;
5242 case CM_KERNEL:
5243 error ("code model %s does not support PIC mode", "kernel");
5244 break;
5246 default:
5247 break;
5249 else
5250 switch (ptr->x_ix86_cmodel)
5252 case CM_SMALL_PIC:
5253 ptr->x_ix86_cmodel = CM_SMALL;
5254 break;
5256 case CM_MEDIUM_PIC:
5257 ptr->x_ix86_cmodel = CM_MEDIUM;
5258 break;
5260 case CM_LARGE_PIC:
5261 ptr->x_ix86_cmodel = CM_LARGE;
5262 break;
5264 default:
5265 break;
5269 /* Print the current options */
5271 static void
5272 ix86_function_specific_print (FILE *file, int indent,
5273 struct cl_target_option *ptr)
5275 char *target_string
5276 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5277 ptr->x_target_flags, ptr->x_ix86_target_flags,
5278 NULL, NULL, ptr->x_ix86_fpmath, false);
5280 gcc_assert (ptr->arch < PROCESSOR_max);
5281 fprintf (file, "%*sarch = %d (%s)\n",
5282 indent, "",
5283 ptr->arch, processor_target_table[ptr->arch].name);
5285 gcc_assert (ptr->tune < PROCESSOR_max);
5286 fprintf (file, "%*stune = %d (%s)\n",
5287 indent, "",
5288 ptr->tune, processor_target_table[ptr->tune].name);
5290 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5292 if (target_string)
5294 fprintf (file, "%*s%s\n", indent, "", target_string);
5295 free (target_string);
5300 /* Inner function to process attribute((target(...))): take an argument and
5301 set the current options from it. If we are given a list, recursively go
5302 over the list. */
5304 static bool
5305 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5306 struct gcc_options *opts,
5307 struct gcc_options *opts_set,
5308 struct gcc_options *enum_opts_set)
5310 char *next_optstr;
5311 bool ret = true;
5313 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5314 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5315 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5316 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5317 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5319 enum ix86_opt_type
5321 ix86_opt_unknown,
5322 ix86_opt_yes,
5323 ix86_opt_no,
5324 ix86_opt_str,
5325 ix86_opt_enum,
5326 ix86_opt_isa
5329 static const struct
5331 const char *string;
5332 size_t len;
5333 enum ix86_opt_type type;
5334 int opt;
5335 int mask;
5336 } attrs[] = {
5337 /* isa options */
5338 IX86_ATTR_ISA ("pconfig", OPT_mpconfig),
5339 IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd),
5340 IX86_ATTR_ISA ("sgx", OPT_msgx),
5341 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5342 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5343 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5344 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5345 IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
5346 IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
5348 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5349 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5350 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5351 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5352 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5353 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5354 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5355 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5356 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5357 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5358 IX86_ATTR_ISA ("fma", OPT_mfma),
5359 IX86_ATTR_ISA ("xop", OPT_mxop),
5360 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5361 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5362 IX86_ATTR_ISA ("avx", OPT_mavx),
5363 IX86_ATTR_ISA ("sse4", OPT_msse4),
5364 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5365 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5366 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5367 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5368 IX86_ATTR_ISA ("sse3", OPT_msse3),
5369 IX86_ATTR_ISA ("aes", OPT_maes),
5370 IX86_ATTR_ISA ("sha", OPT_msha),
5371 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5372 IX86_ATTR_ISA ("sse2", OPT_msse2),
5373 IX86_ATTR_ISA ("sse", OPT_msse),
5374 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5375 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5376 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5377 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5378 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5379 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5380 IX86_ATTR_ISA ("adx", OPT_madx),
5381 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5382 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5383 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5384 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5385 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5386 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5387 IX86_ATTR_ISA ("abm", OPT_mabm),
5388 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5389 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5390 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5391 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5392 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5393 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5394 IX86_ATTR_ISA ("sahf", OPT_msahf),
5395 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5396 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5397 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5398 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5399 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5400 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5401 IX86_ATTR_ISA ("pku", OPT_mpku),
5402 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5403 IX86_ATTR_ISA ("hle", OPT_mhle),
5404 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5405 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5406 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5407 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5408 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5409 IX86_ATTR_ISA ("vaes", OPT_mvaes),
5410 IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
5411 IX86_ATTR_ISA ("movdiri", OPT_mmovdiri),
5412 IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b),
5413 IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg),
5414 IX86_ATTR_ISA ("cldemote", OPT_mcldemote),
5416 /* enum options */
5417 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5419 /* string options */
5420 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5421 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5423 /* flag options */
5424 IX86_ATTR_YES ("cld",
5425 OPT_mcld,
5426 MASK_CLD),
5428 IX86_ATTR_NO ("fancy-math-387",
5429 OPT_mfancy_math_387,
5430 MASK_NO_FANCY_MATH_387),
5432 IX86_ATTR_YES ("ieee-fp",
5433 OPT_mieee_fp,
5434 MASK_IEEE_FP),
5436 IX86_ATTR_YES ("inline-all-stringops",
5437 OPT_minline_all_stringops,
5438 MASK_INLINE_ALL_STRINGOPS),
5440 IX86_ATTR_YES ("inline-stringops-dynamically",
5441 OPT_minline_stringops_dynamically,
5442 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5444 IX86_ATTR_NO ("align-stringops",
5445 OPT_mno_align_stringops,
5446 MASK_NO_ALIGN_STRINGOPS),
5448 IX86_ATTR_YES ("recip",
5449 OPT_mrecip,
5450 MASK_RECIP),
5454 /* If this is a list, recurse to get the options. */
5455 if (TREE_CODE (args) == TREE_LIST)
5457 bool ret = true;
5459 for (; args; args = TREE_CHAIN (args))
5460 if (TREE_VALUE (args)
5461 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5462 p_strings, opts, opts_set,
5463 enum_opts_set))
5464 ret = false;
5466 return ret;
5469 else if (TREE_CODE (args) != STRING_CST)
5471 error ("attribute %<target%> argument not a string");
5472 return false;
5475 /* Handle multiple arguments separated by commas. */
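/* For example, target ("no-avx,arch=haswell,fpmath=sse") is split at the
   commas; each piece is looked up in the attrs[] table above, a leading
   "no-" clears the option, and arch=, tune= and fpmath= take string or
   enum arguments.  */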
5476 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5478 while (next_optstr && *next_optstr != '\0')
5480 char *p = next_optstr;
5481 char *orig_p = p;
5482 char *comma = strchr (next_optstr, ',');
5483 const char *opt_string;
5484 size_t len, opt_len;
5485 int opt;
5486 bool opt_set_p;
5487 char ch;
5488 unsigned i;
5489 enum ix86_opt_type type = ix86_opt_unknown;
5490 int mask = 0;
5492 if (comma)
5494 *comma = '\0';
5495 len = comma - next_optstr;
5496 next_optstr = comma + 1;
5498 else
5500 len = strlen (p);
5501 next_optstr = NULL;
5504 /* Recognize no-xxx. */
5505 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5507 opt_set_p = false;
5508 p += 3;
5509 len -= 3;
5511 else
5512 opt_set_p = true;
5514 /* Find the option. */
5515 ch = *p;
5516 opt = N_OPTS;
5517 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5519 type = attrs[i].type;
5520 opt_len = attrs[i].len;
5521 if (ch == attrs[i].string[0]
5522 && ((type != ix86_opt_str && type != ix86_opt_enum)
5523 ? len == opt_len
5524 : len > opt_len)
5525 && memcmp (p, attrs[i].string, opt_len) == 0)
5527 opt = attrs[i].opt;
5528 mask = attrs[i].mask;
5529 opt_string = attrs[i].string;
5530 break;
5534 /* Process the option. */
5535 if (opt == N_OPTS)
5537 error ("attribute(target(\"%s\")) is unknown", orig_p);
5538 ret = false;
5541 else if (type == ix86_opt_isa)
5543 struct cl_decoded_option decoded;
5545 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5546 ix86_handle_option (opts, opts_set,
5547 &decoded, input_location);
5550 else if (type == ix86_opt_yes || type == ix86_opt_no)
5552 if (type == ix86_opt_no)
5553 opt_set_p = !opt_set_p;
5555 if (opt_set_p)
5556 opts->x_target_flags |= mask;
5557 else
5558 opts->x_target_flags &= ~mask;
5561 else if (type == ix86_opt_str)
5563 if (p_strings[opt])
5565 error ("option(\"%s\") was already specified", opt_string);
5566 ret = false;
5568 else
5569 p_strings[opt] = xstrdup (p + opt_len);
5572 else if (type == ix86_opt_enum)
5574 bool arg_ok;
5575 int value;
5577 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5578 if (arg_ok)
5579 set_option (opts, enum_opts_set, opt, value,
5580 p + opt_len, DK_UNSPECIFIED, input_location,
5581 global_dc);
5582 else
5584 error ("attribute(target(\"%s\")) is unknown", orig_p);
5585 ret = false;
5589 else
5590 gcc_unreachable ();
5593 return ret;
5596 /* Release allocated strings. */
5597 static void
5598 release_options_strings (char **option_strings)
5600 /* Free up memory allocated to hold the strings */
5601 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5602 free (option_strings[i]);
5605 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5607 tree
5608 ix86_valid_target_attribute_tree (tree args,
5609 struct gcc_options *opts,
5610 struct gcc_options *opts_set)
5612 const char *orig_arch_string = opts->x_ix86_arch_string;
5613 const char *orig_tune_string = opts->x_ix86_tune_string;
5614 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5615 int orig_tune_defaulted = ix86_tune_defaulted;
5616 int orig_arch_specified = ix86_arch_specified;
5617 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5618 tree t = NULL_TREE;
5619 struct cl_target_option *def
5620 = TREE_TARGET_OPTION (target_option_default_node);
5621 struct gcc_options enum_opts_set;
5623 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5625 /* Process each of the options on the chain. */
5626 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5627 opts_set, &enum_opts_set))
5628 return error_mark_node;
5630 /* If the changed options are different from the default, rerun
5631 ix86_option_override_internal, and then save the options away.
5632 The string options are attribute options, and will be undone
5633 when we copy the save structure. */
5634 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5635 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5636 || opts->x_target_flags != def->x_target_flags
5637 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5638 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5639 || enum_opts_set.x_ix86_fpmath)
5641 /* If we are using the default tune= or arch=, undo the string assigned,
5642 and use the default. */
5643 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5645 opts->x_ix86_arch_string
5646 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5648 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5649 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5650 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5651 | OPTION_MASK_ABI_64
5652 | OPTION_MASK_ABI_X32
5653 | OPTION_MASK_CODE16);
5654 opts->x_ix86_isa_flags2 = 0;
5656 else if (!orig_arch_specified)
5657 opts->x_ix86_arch_string = NULL;
5659 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5660 opts->x_ix86_tune_string
5661 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5662 else if (orig_tune_defaulted)
5663 opts->x_ix86_tune_string = NULL;
5665 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5666 if (enum_opts_set.x_ix86_fpmath)
5667 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5669 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5670 bool r = ix86_option_override_internal (false, opts, opts_set);
5671 if (!r)
5673 release_options_strings (option_strings);
5674 return error_mark_node;
5677 /* Add any builtin functions with the new isa if any. */
5678 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5680 /* Save the current options unless we are validating options for
5681 #pragma. */
5682 t = build_target_option_node (opts);
5684 opts->x_ix86_arch_string = orig_arch_string;
5685 opts->x_ix86_tune_string = orig_tune_string;
5686 opts_set->x_ix86_fpmath = orig_fpmath_set;
5688 release_options_strings (option_strings);
5691 return t;
5694 /* Hook to validate attribute((target("string"))). */
5696 static bool
5697 ix86_valid_target_attribute_p (tree fndecl,
5698 tree ARG_UNUSED (name),
5699 tree args,
5700 int ARG_UNUSED (flags))
5702 struct gcc_options func_options;
5703 tree new_target, new_optimize;
5704 bool ret = true;
5706 /* attribute((target("default"))) does nothing, beyond
5707 affecting multi-versioning. */
5708 if (TREE_VALUE (args)
5709 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5710 && TREE_CHAIN (args) == NULL_TREE
5711 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5712 return true;
5714 tree old_optimize = build_optimization_node (&global_options);
5716 /* Get the optimization options of the current function. */
5717 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5719 if (!func_optimize)
5720 func_optimize = old_optimize;
5722 /* Init func_options. */
5723 memset (&func_options, 0, sizeof (func_options));
5724 init_options_struct (&func_options, NULL);
5725 lang_hooks.init_options_struct (&func_options);
5727 cl_optimization_restore (&func_options,
5728 TREE_OPTIMIZATION (func_optimize));
5730 /* Initialize func_options to the default before its target options can
5731 be set. */
5732 cl_target_option_restore (&func_options,
5733 TREE_TARGET_OPTION (target_option_default_node));
5735 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5736 &global_options_set);
5738 new_optimize = build_optimization_node (&func_options);
5740 if (new_target == error_mark_node)
5741 ret = false;
5743 else if (fndecl && new_target)
5745 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5747 if (old_optimize != new_optimize)
5748 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5751 finalize_options_struct (&func_options);
5753 return ret;
5757 /* Hook to determine if one function can safely inline another. */
5759 static bool
5760 ix86_can_inline_p (tree caller, tree callee)
5762 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5763 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5765 /* Changes to these flags can be tolerated for always_inline functions. Let's
5766 hope the user knows what they are doing. */
5767 const unsigned HOST_WIDE_INT always_inline_safe_mask
5768 = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS
5769 | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD
5770 | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD
5771 | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS
5772 | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE
5773 | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER
5774 | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER);
5777 if (!callee_tree)
5778 callee_tree = target_option_default_node;
5779 if (!caller_tree)
5780 caller_tree = target_option_default_node;
5781 if (callee_tree == caller_tree)
5782 return true;
5784 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5785 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5786 bool ret = false;
5787 bool always_inline =
5788 (DECL_DISREGARD_INLINE_LIMITS (callee)
5789 && lookup_attribute ("always_inline",
5790 DECL_ATTRIBUTES (callee)));
5792 cgraph_node *callee_node = cgraph_node::get (callee);
5793 /* The callee's isa options should be a subset of the caller's, i.e. an SSE4
5794 function can inline an SSE2 function but an SSE2 function can't inline
5795 an SSE4 function. */
5796 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5797 != callee_opts->x_ix86_isa_flags)
5798 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5799 != callee_opts->x_ix86_isa_flags2))
5800 ret = false;
5802 /* See if we have the same non-isa options. */
5803 else if ((!always_inline
5804 && caller_opts->x_target_flags != callee_opts->x_target_flags)
5805 || (caller_opts->x_target_flags & ~always_inline_safe_mask)
5806 != (callee_opts->x_target_flags & ~always_inline_safe_mask))
5807 ret = false;
5809 /* See if arch, tune, etc. are the same. */
5810 else if (caller_opts->arch != callee_opts->arch)
5811 ret = false;
5813 else if (!always_inline && caller_opts->tune != callee_opts->tune)
5814 ret = false;
5816 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5817 /* If the callee doesn't use FP expressions, differences in
5818 ix86_fpmath can be ignored. We are called from FEs
5819 for multi-versioning call optimization, so beware of
5820 ipa_fn_summaries not being available. */
5821 && (! ipa_fn_summaries
5822 || ipa_fn_summaries->get (callee_node) == NULL
5823 || ipa_fn_summaries->get (callee_node)->fp_expressions))
5824 ret = false;
5826 else if (!always_inline
5827 && caller_opts->branch_cost != callee_opts->branch_cost)
5828 ret = false;
5830 else
5831 ret = true;
5833 return ret;
5837 /* Remember the last target of ix86_set_current_function. */
5838 static GTY(()) tree ix86_previous_fndecl;
5840 /* Set targets globals to the default (or current #pragma GCC target
5841 if active). Invalidate ix86_previous_fndecl cache. */
5843 void
5844 ix86_reset_previous_fndecl (void)
5846 tree new_tree = target_option_current_node;
5847 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5848 if (TREE_TARGET_GLOBALS (new_tree))
5849 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5850 else if (new_tree == target_option_default_node)
5851 restore_target_globals (&default_target_globals);
5852 else
5853 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5854 ix86_previous_fndecl = NULL_TREE;
5857 /* Set the func_type field from the function FNDECL. */
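/* Functions carrying the "interrupt" attribute receive a pointer to the
   interrupt frame; if they also take an error-code argument (two arguments
   in total) they are classified as TYPE_EXCEPTION below, otherwise as
   TYPE_INTERRUPT.  */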
5859 static void
5860 ix86_set_func_type (tree fndecl)
5862 if (cfun->machine->func_type == TYPE_UNKNOWN)
5864 if (lookup_attribute ("interrupt",
5865 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5867 if (ix86_function_naked (fndecl))
5868 error_at (DECL_SOURCE_LOCATION (fndecl),
5869 "interrupt and naked attributes are not compatible");
5871 int nargs = 0;
5872 for (tree arg = DECL_ARGUMENTS (fndecl);
5873 arg;
5874 arg = TREE_CHAIN (arg))
5875 nargs++;
5876 cfun->machine->no_caller_saved_registers = true;
5877 cfun->machine->func_type
5878 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5880 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5882 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5883 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5884 sorry ("Only DWARF debug format is supported for interrupt "
5885 "service routine.");
5887 else
5889 cfun->machine->func_type = TYPE_NORMAL;
5890 if (lookup_attribute ("no_caller_saved_registers",
5891 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5892 cfun->machine->no_caller_saved_registers = true;
5897 /* Set the indirect_branch_type field from the function FNDECL. */
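/* Both the "indirect_branch" and "function_return" attributes accept "keep",
   "thunk", "thunk-inline" and "thunk-extern", overriding the
   -mindirect-branch= and -mfunction-return= options for this function.  */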
5899 static void
5900 ix86_set_indirect_branch_type (tree fndecl)
5902 if (cfun->machine->indirect_branch_type == indirect_branch_unset)
5904 tree attr = lookup_attribute ("indirect_branch",
5905 DECL_ATTRIBUTES (fndecl));
5906 if (attr != NULL)
5908 tree args = TREE_VALUE (attr);
5909 if (args == NULL)
5910 gcc_unreachable ();
5911 tree cst = TREE_VALUE (args);
5912 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5913 cfun->machine->indirect_branch_type = indirect_branch_keep;
5914 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5915 cfun->machine->indirect_branch_type = indirect_branch_thunk;
5916 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5917 cfun->machine->indirect_branch_type = indirect_branch_thunk_inline;
5918 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5919 cfun->machine->indirect_branch_type = indirect_branch_thunk_extern;
5920 else
5921 gcc_unreachable ();
5923 else
5924 cfun->machine->indirect_branch_type = ix86_indirect_branch;
5926 /* -mcmodel=large is not compatible with -mindirect-branch=thunk
5927 nor -mindirect-branch=thunk-extern. */
5928 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5929 && ((cfun->machine->indirect_branch_type
5930 == indirect_branch_thunk_extern)
5931 || (cfun->machine->indirect_branch_type
5932 == indirect_branch_thunk)))
5933 error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not "
5934 "compatible",
5935 ((cfun->machine->indirect_branch_type
5936 == indirect_branch_thunk_extern)
5937 ? "thunk-extern" : "thunk"));
5940 if (cfun->machine->function_return_type == indirect_branch_unset)
5942 tree attr = lookup_attribute ("function_return",
5943 DECL_ATTRIBUTES (fndecl));
5944 if (attr != NULL)
5946 tree args = TREE_VALUE (attr);
5947 if (args == NULL)
5948 gcc_unreachable ();
5949 tree cst = TREE_VALUE (args);
5950 if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
5951 cfun->machine->function_return_type = indirect_branch_keep;
5952 else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
5953 cfun->machine->function_return_type = indirect_branch_thunk;
5954 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
5955 cfun->machine->function_return_type = indirect_branch_thunk_inline;
5956 else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
5957 cfun->machine->function_return_type = indirect_branch_thunk_extern;
5958 else
5959 gcc_unreachable ();
5961 else
5962 cfun->machine->function_return_type = ix86_function_return;
5964 /* -mcmodel=large is not compatible with -mfunction-return=thunk
5965 nor -mfunction-return=thunk-extern. */
5966 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
5967 && ((cfun->machine->function_return_type
5968 == indirect_branch_thunk_extern)
5969 || (cfun->machine->function_return_type
5970 == indirect_branch_thunk)))
5971 error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not "
5972 "compatible",
5973 ((cfun->machine->function_return_type
5974 == indirect_branch_thunk_extern)
5975 ? "thunk-extern" : "thunk"));
5979 /* Establish appropriate back-end context for processing the function
5980 FNDECL. The argument might be NULL to indicate processing at top
5981 level, outside of any function scope. */
5982 static void
5983 ix86_set_current_function (tree fndecl)
5985 /* Only change the context if the function changes. This hook is called
5986 several times in the course of compiling a function, and we don't want to
5987 slow things down too much or call target_reinit when it isn't safe. */
5988 if (fndecl == ix86_previous_fndecl)
5990 /* There may be 2 function bodies for the same function FNDECL,
5991 one is extern inline and one isn't. Call ix86_set_func_type
5992 to set the func_type field. */
5993 if (fndecl != NULL_TREE)
5995 ix86_set_func_type (fndecl);
5996 ix86_set_indirect_branch_type (fndecl);
5998 return;
6001 tree old_tree;
6002 if (ix86_previous_fndecl == NULL_TREE)
6003 old_tree = target_option_current_node;
6004 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
6005 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
6006 else
6007 old_tree = target_option_default_node;
6009 if (fndecl == NULL_TREE)
6011 if (old_tree != target_option_current_node)
6012 ix86_reset_previous_fndecl ();
6013 return;
6016 ix86_set_func_type (fndecl);
6017 ix86_set_indirect_branch_type (fndecl);
6019 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
6020 if (new_tree == NULL_TREE)
6021 new_tree = target_option_default_node;
6023 if (old_tree != new_tree)
6025 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
6026 if (TREE_TARGET_GLOBALS (new_tree))
6027 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
6028 else if (new_tree == target_option_default_node)
6029 restore_target_globals (&default_target_globals);
6030 else
6031 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
6033 ix86_previous_fndecl = fndecl;
6035 static bool prev_no_caller_saved_registers;
6037 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
6038 Avoid expensive re-initialization of init_regs each time we switch
6039 function context. */
6040 if (TARGET_64BIT
6041 && (call_used_regs[SI_REG]
6042 == (cfun->machine->call_abi == MS_ABI)))
6043 reinit_regs ();
6044 /* Need to re-initialize init_regs if caller-saved registers are
6045 changed. */
6046 else if (prev_no_caller_saved_registers
6047 != cfun->machine->no_caller_saved_registers)
6048 reinit_regs ();
6050 if (cfun->machine->func_type != TYPE_NORMAL
6051 || cfun->machine->no_caller_saved_registers)
6053 /* Don't allow SSE, MMX nor x87 instructions since they
6054 may change processor state. */
6055 const char *isa;
6056 if (TARGET_SSE)
6057 isa = "SSE";
6058 else if (TARGET_MMX)
6059 isa = "MMX/3Dnow";
6060 else if (TARGET_80387)
6061 isa = "80387";
6062 else
6063 isa = NULL;
6064 if (isa != NULL)
6066 if (cfun->machine->func_type != TYPE_NORMAL)
6067 sorry ("%s instructions aren't allowed in %s service routine",
6068 isa, (cfun->machine->func_type == TYPE_EXCEPTION
6069 ? "exception" : "interrupt"));
6070 else
6071 sorry ("%s instructions aren't allowed in function with "
6072 "no_caller_saved_registers attribute", isa);
6073 /* Don't issue the same error twice. */
6074 cfun->machine->func_type = TYPE_NORMAL;
6075 cfun->machine->no_caller_saved_registers = false;
6079 prev_no_caller_saved_registers
6080 = cfun->machine->no_caller_saved_registers;
6084 /* Return true if this goes in large data/bss. */
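/* Large data/bss only matters for -mcmodel=medium: objects bigger than the
   -mlarge-data-threshold value (ix86_section_threshold), or explicitly placed
   in .ldata/.lbss, get the large-section treatment.  */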
6086 static bool
6087 ix86_in_large_data_p (tree exp)
6089 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
6090 return false;
6092 if (exp == NULL_TREE)
6093 return false;
6095 /* Functions are never large data. */
6096 if (TREE_CODE (exp) == FUNCTION_DECL)
6097 return false;
6099 /* Automatic variables are never large data. */
6100 if (VAR_P (exp) && !is_global_var (exp))
6101 return false;
6103 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
6105 const char *section = DECL_SECTION_NAME (exp);
6106 if (strcmp (section, ".ldata") == 0
6107 || strcmp (section, ".lbss") == 0)
6108 return true;
6109 return false;
6111 else
6113 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
6115 /* If this is an incomplete type with size 0, then we can't put it
6116 in data because it might be too big when completed. Also,
6117 int_size_in_bytes returns -1 if size can vary or is larger than
6118 an integer, in which case it is also safer to assume that it goes in
6119 large data. */
6120 if (size <= 0 || size > ix86_section_threshold)
6121 return true;
6124 return false;
6127 /* i386-specific section flag to mark large sections. */
6128 #define SECTION_LARGE SECTION_MACH_DEP
6130 /* Switch to the appropriate section for output of DECL.
6131 DECL is either a `VAR_DECL' node or a constant of some sort.
6132 RELOC indicates whether forming the initial value of DECL requires
6133 link-time relocations. */
6135 ATTRIBUTE_UNUSED static section *
6136 x86_64_elf_select_section (tree decl, int reloc,
6137 unsigned HOST_WIDE_INT align)
6139 if (ix86_in_large_data_p (decl))
6141 const char *sname = NULL;
6142 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
6143 switch (categorize_decl_for_section (decl, reloc))
6145 case SECCAT_DATA:
6146 sname = ".ldata";
6147 break;
6148 case SECCAT_DATA_REL:
6149 sname = ".ldata.rel";
6150 break;
6151 case SECCAT_DATA_REL_LOCAL:
6152 sname = ".ldata.rel.local";
6153 break;
6154 case SECCAT_DATA_REL_RO:
6155 sname = ".ldata.rel.ro";
6156 break;
6157 case SECCAT_DATA_REL_RO_LOCAL:
6158 sname = ".ldata.rel.ro.local";
6159 break;
6160 case SECCAT_BSS:
6161 sname = ".lbss";
6162 flags |= SECTION_BSS;
6163 break;
6164 case SECCAT_RODATA:
6165 case SECCAT_RODATA_MERGE_STR:
6166 case SECCAT_RODATA_MERGE_STR_INIT:
6167 case SECCAT_RODATA_MERGE_CONST:
6168 sname = ".lrodata";
6169 flags &= ~SECTION_WRITE;
6170 break;
6171 case SECCAT_SRODATA:
6172 case SECCAT_SDATA:
6173 case SECCAT_SBSS:
6174 gcc_unreachable ();
6175 case SECCAT_TEXT:
6176 case SECCAT_TDATA:
6177 case SECCAT_TBSS:
6178 /* We don't split these for the medium model. Place them into
6179 default sections and hope for the best. */
6180 break;
6182 if (sname)
6184 /* We might get called with string constants, but get_named_section
6185 doesn't like them as they are not DECLs. Also, we need to set
6186 flags in that case. */
6187 if (!DECL_P (decl))
6188 return get_section (sname, flags, NULL);
6189 return get_named_section (decl, sname, reloc);
6192 return default_elf_select_section (decl, reloc, align);
6195 /* Select a set of attributes for section NAME based on the properties
6196 of DECL and whether or not RELOC indicates that DECL's initializer
6197 might contain runtime relocations. */
6199 static unsigned int ATTRIBUTE_UNUSED
6200 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6202 unsigned int flags = default_section_type_flags (decl, name, reloc);
6204 if (ix86_in_large_data_p (decl))
6205 flags |= SECTION_LARGE;
6207 if (decl == NULL_TREE
6208 && (strcmp (name, ".ldata.rel.ro") == 0
6209 || strcmp (name, ".ldata.rel.ro.local") == 0))
6210 flags |= SECTION_RELRO;
6212 if (strcmp (name, ".lbss") == 0
6213 || strncmp (name, ".lbss.", 6) == 0
6214 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
6215 flags |= SECTION_BSS;
6217 return flags;
6220 /* Build up a unique section name, expressed as a
6221 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6222 RELOC indicates whether the initial value of EXP requires
6223 link-time relocations. */
6225 static void ATTRIBUTE_UNUSED
6226 x86_64_elf_unique_section (tree decl, int reloc)
6228 if (ix86_in_large_data_p (decl))
6230 const char *prefix = NULL;
6231 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6232 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6234 switch (categorize_decl_for_section (decl, reloc))
6236 case SECCAT_DATA:
6237 case SECCAT_DATA_REL:
6238 case SECCAT_DATA_REL_LOCAL:
6239 case SECCAT_DATA_REL_RO:
6240 case SECCAT_DATA_REL_RO_LOCAL:
6241 prefix = one_only ? ".ld" : ".ldata";
6242 break;
6243 case SECCAT_BSS:
6244 prefix = one_only ? ".lb" : ".lbss";
6245 break;
6246 case SECCAT_RODATA:
6247 case SECCAT_RODATA_MERGE_STR:
6248 case SECCAT_RODATA_MERGE_STR_INIT:
6249 case SECCAT_RODATA_MERGE_CONST:
6250 prefix = one_only ? ".lr" : ".lrodata";
6251 break;
6252 case SECCAT_SRODATA:
6253 case SECCAT_SDATA:
6254 case SECCAT_SBSS:
6255 gcc_unreachable ();
6256 case SECCAT_TEXT:
6257 case SECCAT_TDATA:
6258 case SECCAT_TBSS:
6259 /* We don't split these for the medium model. Place them into
6260 default sections and hope for the best. */
6261 break;
6263 if (prefix)
6265 const char *name, *linkonce;
6266 char *string;
6268 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6269 name = targetm.strip_name_encoding (name);
6271 /* If we're using one_only, then there needs to be a .gnu.linkonce
6272 prefix to the section name. */
6273 linkonce = one_only ? ".gnu.linkonce" : "";
6275 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6277 set_decl_section_name (decl, string);
6278 return;
6281 default_unique_section (decl, reloc);
6284 #ifdef COMMON_ASM_OP
6286 #ifndef LARGECOMM_SECTION_ASM_OP
6287 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6288 #endif
6290 /* This says how to output assembler code to declare an
6291 uninitialized external linkage data object.
6293 For the medium model on x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
6294 directive for large objects. */
6295 void
6296 x86_elf_aligned_decl_common (FILE *file, tree decl,
6297 const char *name, unsigned HOST_WIDE_INT size,
6298 int align)
6300 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6301 && size > (unsigned int)ix86_section_threshold)
6303 switch_to_section (get_named_section (decl, ".lbss", 0));
6304 fputs (LARGECOMM_SECTION_ASM_OP, file);
6306 else
6307 fputs (COMMON_ASM_OP, file);
6308 assemble_name (file, name);
6309 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6310 size, align / BITS_PER_UNIT);
6312 #endif
6314 /* Utility function for targets to use in implementing
6315 ASM_OUTPUT_ALIGNED_BSS. */
6317 void
6318 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6319 unsigned HOST_WIDE_INT size, int align)
6321 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6322 && size > (unsigned int)ix86_section_threshold)
6323 switch_to_section (get_named_section (decl, ".lbss", 0));
6324 else
6325 switch_to_section (bss_section);
6326 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6327 #ifdef ASM_DECLARE_OBJECT_NAME
6328 last_assemble_variable_decl = decl;
6329 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6330 #else
6331 /* Standard thing is just output label for the object. */
6332 ASM_OUTPUT_LABEL (file, name);
6333 #endif /* ASM_DECLARE_OBJECT_NAME */
6334 ASM_OUTPUT_SKIP (file, size ? size : 1);
6337 /* Decide whether we must probe the stack before any space allocation
6338 on this target. It's essentially TARGET_STACK_PROBE except when
6339 -fstack-check causes the stack to be already probed differently. */
6341 bool
6342 ix86_target_stack_probe (void)
6344 /* Do not probe the stack twice if static stack checking is enabled. */
6345 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6346 return false;
6348 return TARGET_STACK_PROBE;
6351 /* Decide whether we can make a sibling call to a function. DECL is the
6352 declaration of the function being targeted by the call and EXP is the
6353 CALL_EXPR representing the call. */
6355 static bool
6356 ix86_function_ok_for_sibcall (tree decl, tree exp)
6358 tree type, decl_or_type;
6359 rtx a, b;
6360 bool bind_global = decl && !targetm.binds_local_p (decl);
6362 if (ix86_function_naked (current_function_decl))
6363 return false;
6365 /* Sibling call isn't OK if there are no caller-saved registers
6366 since all registers must be preserved before return. */
6367 if (cfun->machine->no_caller_saved_registers)
6368 return false;
6370 /* If we are generating position-independent code, we cannot sibcall
6371 optimize direct calls to global functions, as the PLT requires
6372 %ebx be live. (Darwin does not have a PLT.) */
6373 if (!TARGET_MACHO
6374 && !TARGET_64BIT
6375 && flag_pic
6376 && flag_plt
6377 && bind_global)
6378 return false;
6380 /* If we need to align the outgoing stack, then sibcalling would
6381 unalign the stack, which may break the called function. */
6382 if (ix86_minimum_incoming_stack_boundary (true)
6383 < PREFERRED_STACK_BOUNDARY)
6384 return false;
6386 if (decl)
6388 decl_or_type = decl;
6389 type = TREE_TYPE (decl);
6391 else
6393 /* We're looking at the CALL_EXPR, we need the type of the function. */
6394 type = CALL_EXPR_FN (exp); /* pointer expression */
6395 type = TREE_TYPE (type); /* pointer type */
6396 type = TREE_TYPE (type); /* function type */
6397 decl_or_type = type;
6400 /* Check that the return value locations are the same. Like
6401 if we are returning floats on the 80387 register stack, we cannot
6402 make a sibcall from a function that doesn't return a float to a
6403 function that does or, conversely, from a function that does return
6404 a float to a function that doesn't; the necessary stack adjustment
6405 would not be executed. This is also the place we notice
6406 differences in the return value ABI. Note that it is ok for one
6407 of the functions to have void return type as long as the return
6408 value of the other is passed in a register. */
6409 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6410 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6411 cfun->decl, false);
6412 if (STACK_REG_P (a) || STACK_REG_P (b))
6414 if (!rtx_equal_p (a, b))
6415 return false;
6417 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6419 else if (!rtx_equal_p (a, b))
6420 return false;
6422 if (TARGET_64BIT)
6424 /* The SYSV ABI has more call-clobbered registers;
6425 disallow sibcalls from MS to SYSV. */
6426 if (cfun->machine->call_abi == MS_ABI
6427 && ix86_function_type_abi (type) == SYSV_ABI)
6428 return false;
6430 else
6432 /* If this call is indirect, we'll need to be able to use a
6433 call-clobbered register for the address of the target function.
6434 Make sure that all such registers are not used for passing
6435 parameters. Note that DLLIMPORT functions and call to global
6436 function via GOT slot are indirect. */
6437 if (!decl
6438 || (bind_global && flag_pic && !flag_plt)
6439 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl))
6440 || flag_force_indirect_call)
6442 /* Check if regparm >= 3 since arg_reg_available is set to
6443 false if regparm == 0. If regparm is 1 or 2, there is
6444 always a call-clobbered register available.
6446 ??? The symbol indirect call doesn't need a call-clobbered
6447 register. But we don't know if this is a symbol indirect
6448 call or not here. */
6449 if (ix86_function_regparm (type, decl) >= 3
6450 && !cfun->machine->arg_reg_available)
6451 return false;
6455 /* Otherwise okay. That also includes certain types of indirect calls. */
6456 return true;
6459 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6460 and "sseregparm" calling convention attributes;
6461 arguments as in struct attribute_spec.handler. */
6463 static tree
6464 ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
6465 bool *no_add_attrs)
6467 if (TREE_CODE (*node) != FUNCTION_TYPE
6468 && TREE_CODE (*node) != METHOD_TYPE
6469 && TREE_CODE (*node) != FIELD_DECL
6470 && TREE_CODE (*node) != TYPE_DECL)
6472 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6473 name);
6474 *no_add_attrs = true;
6475 return NULL_TREE;
6478 /* Can combine regparm with all attributes but fastcall and thiscall. */
6479 if (is_attribute_p ("regparm", name))
6481 tree cst;
6483 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6485 error ("fastcall and regparm attributes are not compatible");
6488 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6490 error ("regparam and thiscall attributes are not compatible");
6493 cst = TREE_VALUE (args);
6494 if (TREE_CODE (cst) != INTEGER_CST)
6496 warning (OPT_Wattributes,
6497 "%qE attribute requires an integer constant argument",
6498 name);
6499 *no_add_attrs = true;
6501 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6503 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6504 name, REGPARM_MAX);
6505 *no_add_attrs = true;
6508 return NULL_TREE;
6511 if (TARGET_64BIT)
6513 /* Do not warn when emulating the MS ABI. */
6514 if ((TREE_CODE (*node) != FUNCTION_TYPE
6515 && TREE_CODE (*node) != METHOD_TYPE)
6516 || ix86_function_type_abi (*node) != MS_ABI)
6517 warning (OPT_Wattributes, "%qE attribute ignored",
6518 name);
6519 *no_add_attrs = true;
6520 return NULL_TREE;
6523 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6524 if (is_attribute_p ("fastcall", name))
6526 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6528 error ("fastcall and cdecl attributes are not compatible");
6530 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6532 error ("fastcall and stdcall attributes are not compatible");
6534 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6536 error ("fastcall and regparm attributes are not compatible");
6538 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6540 error ("fastcall and thiscall attributes are not compatible");
6544 /* Can combine stdcall with fastcall (redundant), regparm and
6545 sseregparm. */
6546 else if (is_attribute_p ("stdcall", name))
6548 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6550 error ("stdcall and cdecl attributes are not compatible");
6552 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6554 error ("stdcall and fastcall attributes are not compatible");
6556 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6558 error ("stdcall and thiscall attributes are not compatible");
6562 /* Can combine cdecl with regparm and sseregparm. */
6563 else if (is_attribute_p ("cdecl", name))
6565 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6567 error ("stdcall and cdecl attributes are not compatible");
6569 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6571 error ("fastcall and cdecl attributes are not compatible");
6573 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6575 error ("cdecl and thiscall attributes are not compatible");
6578 else if (is_attribute_p ("thiscall", name))
6580 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6581 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6582 name);
6583 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6585 error ("stdcall and thiscall attributes are not compatible");
6587 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6589 error ("fastcall and thiscall attributes are not compatible");
6591 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6593 error ("cdecl and thiscall attributes are not compatible");
6597 /* Can combine sseregparm with all attributes. */
6599 return NULL_TREE;
6602 /* The transactional memory builtins are implicitly regparm or fastcall
6603 depending on the ABI. Override the generic do-nothing attribute that
6604 these builtins were declared with, and replace it with one of the two
6605 attributes that we expect elsewhere. */
6607 static tree
6608 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6609 int flags, bool *no_add_attrs)
6611 tree alt;
6613 /* In no case do we want to add the placeholder attribute. */
6614 *no_add_attrs = true;
6616 /* The 64-bit ABI is unchanged for transactional memory. */
6617 if (TARGET_64BIT)
6618 return NULL_TREE;
6620 /* ??? Is there a better way to validate 32-bit windows? We have
6621 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6622 if (CHECK_STACK_LIMIT > 0)
6623 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6624 else
6626 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6627 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6629 decl_attributes (node, alt, flags);
6631 return NULL_TREE;
6634 /* This function determines the calling convention from TYPE. */
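/* Note that with -mrtd, a non-variadic function without an explicit
   convention attribute is treated as stdcall below, i.e. the callee pops
   its own stack arguments.  */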
6636 unsigned int
6637 ix86_get_callcvt (const_tree type)
6639 unsigned int ret = 0;
6640 bool is_stdarg;
6641 tree attrs;
6643 if (TARGET_64BIT)
6644 return IX86_CALLCVT_CDECL;
6646 attrs = TYPE_ATTRIBUTES (type);
6647 if (attrs != NULL_TREE)
6649 if (lookup_attribute ("cdecl", attrs))
6650 ret |= IX86_CALLCVT_CDECL;
6651 else if (lookup_attribute ("stdcall", attrs))
6652 ret |= IX86_CALLCVT_STDCALL;
6653 else if (lookup_attribute ("fastcall", attrs))
6654 ret |= IX86_CALLCVT_FASTCALL;
6655 else if (lookup_attribute ("thiscall", attrs))
6656 ret |= IX86_CALLCVT_THISCALL;
6658 /* Regparm isn't allowed for thiscall and fastcall. */
6659 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6661 if (lookup_attribute ("regparm", attrs))
6662 ret |= IX86_CALLCVT_REGPARM;
6663 if (lookup_attribute ("sseregparm", attrs))
6664 ret |= IX86_CALLCVT_SSEREGPARM;
6667 if (IX86_BASE_CALLCVT(ret) != 0)
6668 return ret;
6671 is_stdarg = stdarg_p (type);
6672 if (TARGET_RTD && !is_stdarg)
6673 return IX86_CALLCVT_STDCALL | ret;
6675 if (ret != 0
6676 || is_stdarg
6677 || TREE_CODE (type) != METHOD_TYPE
6678 || ix86_function_type_abi (type) != MS_ABI)
6679 return IX86_CALLCVT_CDECL | ret;
6681 return IX86_CALLCVT_THISCALL;
6684 /* Return 0 if the attributes for two types are incompatible, 1 if they
6685 are compatible, and 2 if they are nearly compatible (which causes a
6686 warning to be generated). */
6688 static int
6689 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6691 unsigned int ccvt1, ccvt2;
6693 if (TREE_CODE (type1) != FUNCTION_TYPE
6694 && TREE_CODE (type1) != METHOD_TYPE)
6695 return 1;
6697 ccvt1 = ix86_get_callcvt (type1);
6698 ccvt2 = ix86_get_callcvt (type2);
6699 if (ccvt1 != ccvt2)
6700 return 0;
6701 if (ix86_function_regparm (type1, NULL)
6702 != ix86_function_regparm (type2, NULL))
6703 return 0;
6705 return 1;
6708 /* Return the regparm value for a function with the indicated TYPE and DECL.
6709 DECL may be NULL when calling the function indirectly
6710 or considering a libcall. */
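/* For example, __attribute__((regparm(3))) passes the first three integer
   arguments in EAX, EDX and ECX instead of on the stack.  */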
6712 static int
6713 ix86_function_regparm (const_tree type, const_tree decl)
6715 tree attr;
6716 int regparm;
6717 unsigned int ccvt;
6719 if (TARGET_64BIT)
6720 return (ix86_function_type_abi (type) == SYSV_ABI
6721 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6722 ccvt = ix86_get_callcvt (type);
6723 regparm = ix86_regparm;
6725 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6727 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6728 if (attr)
6730 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6731 return regparm;
6734 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6735 return 2;
6736 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6737 return 1;
6739 /* Use register calling convention for local functions when possible. */
6740 if (decl
6741 && TREE_CODE (decl) == FUNCTION_DECL)
6743 cgraph_node *target = cgraph_node::get (decl);
6744 if (target)
6745 target = target->function_symbol ();
6747 /* Caller and callee must agree on the calling convention, so checking
6748 just the global `optimize' flag here would mean that with
6749 __attribute__((optimize (...))) the caller could use the regparm convention
6750 while the callee does not, or vice versa. Instead look at whether the
6751 callee itself is optimized. */
6752 if (target && opt_for_fn (target->decl, optimize)
6753 && !(profile_flag && !flag_fentry))
6755 cgraph_local_info *i = &target->local;
6756 if (i && i->local && i->can_change_signature)
6758 int local_regparm, globals = 0, regno;
6760 /* Make sure no regparm register is taken by a
6761 fixed register variable. */
6762 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6763 local_regparm++)
6764 if (fixed_regs[local_regparm])
6765 break;
6767 /* We don't want to use regparm(3) for nested functions as
6768 these use a static chain pointer in the third argument. */
6769 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6770 local_regparm = 2;
6772 /* Save a register for the split stack. */
6773 if (flag_split_stack)
6775 if (local_regparm == 3)
6776 local_regparm = 2;
6777 else if (local_regparm == 2
6778 && DECL_STATIC_CHAIN (target->decl))
6779 local_regparm = 1;
6782 /* Each fixed register usage increases register pressure,
6783 so fewer registers should be used for argument passing.
6784 This can be overridden by an explicit
6785 regparm value. */
6786 for (regno = AX_REG; regno <= DI_REG; regno++)
6787 if (fixed_regs[regno])
6788 globals++;
6790 local_regparm
6791 = globals < local_regparm ? local_regparm - globals : 0;
6793 if (local_regparm > regparm)
6794 regparm = local_regparm;
6799 return regparm;
6802 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6803 DFmode (2) arguments in SSE registers for a function with the
6804 indicated TYPE and DECL. DECL may be NULL when calling the function
6805 indirectly or considering a libcall. Return -1 if any FP parameter
6806 should be rejected with an error. This is used in situations where we
6807 imply the SSE calling convention but the function is called from another
6808 function with SSE disabled. Otherwise return 0. */
6810 static int
6811 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6813 gcc_assert (!TARGET_64BIT);
6815 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6816 by the sseregparm attribute. */
6817 if (TARGET_SSEREGPARM
6818 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6820 if (!TARGET_SSE)
6822 if (warn)
6824 if (decl)
6825 error ("calling %qD with attribute sseregparm without "
6826 "SSE/SSE2 enabled", decl);
6827 else
6828 error ("calling %qT with attribute sseregparm without "
6829 "SSE/SSE2 enabled", type);
6831 return 0;
6834 return 2;
6837 if (!decl)
6838 return 0;
6840 cgraph_node *target = cgraph_node::get (decl);
6841 if (target)
6842 target = target->function_symbol ();
6844 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6845 (and DFmode for SSE2) arguments in SSE registers. */
6846 if (target
6847 /* TARGET_SSE_MATH */
6848 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6849 && opt_for_fn (target->decl, optimize)
6850 && !(profile_flag && !flag_fentry))
6852 cgraph_local_info *i = &target->local;
6853 if (i && i->local && i->can_change_signature)
6855 /* Refuse to produce wrong code when a local function with SSE enabled
6856 is called from an SSE-disabled function.
6857 FIXME: We need a way to detect these cases across ltrans partitions
6858 and avoid using SSE calling conventions on local functions called
6859 from functions with SSE disabled. For now at least delay the
6860 warning until we know we are going to produce wrong code.
6861 See PR66047. */
6862 if (!TARGET_SSE && warn)
6863 return -1;
6864 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6865 ->x_ix86_isa_flags) ? 2 : 1;
6869 return 0;
6872 /* Return true if EAX is live at the start of the function. Used by
6873 ix86_expand_prologue to determine if we need special help before
6874 calling allocate_stack_worker. */
6876 static bool
6877 ix86_eax_live_at_start_p (void)
6879 /* Cheat. Don't bother working forward from ix86_function_regparm
6880 to the function type to whether an actual argument is located in
6881 eax. Instead just look at cfg info, which is still close enough
6882 to correct at this point. This gives false positives for broken
6883 functions that might use uninitialized data that happens to be
6884 allocated in eax, but who cares? */
6885 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
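/* Return true if the hidden pointer to an aggregate return value should be
   left on the stack for the caller to pop, rather than being popped by the
   callee on return.  This is controlled by the callee_pop_aggregate_return
   attribute and defaults to true for the 32-bit MS ABI.  */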
6888 static bool
6889 ix86_keep_aggregate_return_pointer (tree fntype)
6891 tree attr;
6893 if (!TARGET_64BIT)
6895 attr = lookup_attribute ("callee_pop_aggregate_return",
6896 TYPE_ATTRIBUTES (fntype));
6897 if (attr)
6898 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6900 /* For the 32-bit MS ABI the default is to keep the aggregate
6901 return pointer. */
6902 if (ix86_function_type_abi (fntype) == MS_ABI)
6903 return true;
6905 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6908 /* Value is the number of bytes of arguments automatically
6909 popped when returning from a subroutine call.
6910 FUNDECL is the declaration node of the function (as a tree),
6911 FUNTYPE is the data type of the function (as a tree),
6912 or for a library call it is an identifier node for the subroutine name.
6913 SIZE is the number of bytes of arguments passed on the stack.
6915 On the 80386, the RTD insn may be used to pop them if the number
6916 of args is fixed, but if the number is variable then the caller
6917 must pop them all. RTD can't be used for library calls now
6918 because the library is compiled with the Unix compiler.
6919 Use of RTD is a selectable option, since it is incompatible with
6920 standard Unix calling sequences. If the option is not selected,
6921 the caller must always pop the args.
6923 The attribute stdcall is equivalent to RTD on a per module basis. */
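/* For example, a non-variadic stdcall function taking two int arguments
   returns with `ret 8', popping its 8 bytes of stack arguments, while a
   cdecl function uses a plain `ret' and pops nothing.  */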
6925 static poly_int64
6926 ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
6928 unsigned int ccvt;
6930 /* None of the 64-bit ABIs pop arguments. */
6931 if (TARGET_64BIT)
6932 return 0;
6934 ccvt = ix86_get_callcvt (funtype);
6936 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6937 | IX86_CALLCVT_THISCALL)) != 0
6938 && ! stdarg_p (funtype))
6939 return size;
6941 /* Lose any fake structure return argument if it is passed on the stack. */
6942 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6943 && !ix86_keep_aggregate_return_pointer (funtype))
6945 int nregs = ix86_function_regparm (funtype, fundecl);
6946 if (nregs == 0)
6947 return GET_MODE_SIZE (Pmode);
6950 return 0;
6953 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6955 static bool
6956 ix86_legitimate_combined_insn (rtx_insn *insn)
6958 int i;
6960 /* Check operand constraints in case hard registers were propagated
6961 into insn pattern. This check prevents combine pass from
6962 generating insn patterns with invalid hard register operands.
6963 These invalid insns can eventually confuse reload to error out
6964 with a spill failure. See also PRs 46829 and 46843. */
6966 gcc_assert (INSN_CODE (insn) >= 0);
6968 extract_insn (insn);
6969 preprocess_constraints (insn);
6971 int n_operands = recog_data.n_operands;
6972 int n_alternatives = recog_data.n_alternatives;
6973 for (i = 0; i < n_operands; i++)
6975 rtx op = recog_data.operand[i];
6976 machine_mode mode = GET_MODE (op);
6977 const operand_alternative *op_alt;
6978 int offset = 0;
6979 bool win;
6980 int j;
6982 /* A unary operator may be accepted by the predicate, but it
6983 is irrelevant for matching constraints. */
6984 if (UNARY_P (op))
6985 op = XEXP (op, 0);
6987 if (SUBREG_P (op))
6989 if (REG_P (SUBREG_REG (op))
6990 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6991 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6992 GET_MODE (SUBREG_REG (op)),
6993 SUBREG_BYTE (op),
6994 GET_MODE (op));
6995 op = SUBREG_REG (op);
6998 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6999 continue;
7001 op_alt = recog_op_alt;
7003 /* Operand has no constraints, anything is OK. */
7004 win = !n_alternatives;
7006 alternative_mask preferred = get_preferred_alternatives (insn);
7007 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
7009 if (!TEST_BIT (preferred, j))
7010 continue;
7011 if (op_alt[i].anything_ok
7012 || (op_alt[i].matches != -1
7013 && operands_match_p
7014 (recog_data.operand[i],
7015 recog_data.operand[op_alt[i].matches]))
7016 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
7018 win = true;
7019 break;
7023 if (!win)
7024 return false;
7027 return true;
7030 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
7032 static unsigned HOST_WIDE_INT
7033 ix86_asan_shadow_offset (void)
7035 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
7036 : HOST_WIDE_INT_C (0x7fff8000))
7037 : (HOST_WIDE_INT_1 << 29);
7040 /* Argument support functions. */
7042 /* Return true when register may be used to pass function parameters. */
7043 bool
7044 ix86_function_arg_regno_p (int regno)
7046 int i;
7047 enum calling_abi call_abi;
7048 const int *parm_regs;
7050 if (!TARGET_64BIT)
7052 if (TARGET_MACHO)
7053 return (regno < REGPARM_MAX
7054 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
7055 else
7056 return (regno < REGPARM_MAX
7057 || (TARGET_MMX && MMX_REGNO_P (regno)
7058 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
7059 || (TARGET_SSE && SSE_REGNO_P (regno)
7060 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
7063 if (TARGET_SSE && SSE_REGNO_P (regno)
7064 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
7065 return true;
7067 /* TODO: The function should depend on current function ABI but
7068 builtins.c would need updating then. Therefore we use the
7069 default ABI. */
7070 call_abi = ix86_cfun_abi ();
7072 /* RAX is used as hidden argument to va_arg functions. */
7073 if (call_abi == SYSV_ABI && regno == AX_REG)
7074 return true;
7076 if (call_abi == MS_ABI)
7077 parm_regs = x86_64_ms_abi_int_parameter_registers;
7078 else
7079 parm_regs = x86_64_int_parameter_registers;
7081 for (i = 0; i < (call_abi == MS_ABI
7082 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
7083 if (regno == parm_regs[i])
7084 return true;
7085 return false;
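/* Illustrative summary of the registers accepted above: for the 64-bit
   SysV ABI these are %rdi, %rsi, %rdx, %rcx, %r8, %r9, plus %rax as the
   hidden varargs argument and %xmm0-%xmm7; for the MS ABI the integer
   candidates shrink to %rcx, %rdx, %r8 and %r9.  */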
7088 /* Return true if we do not know how to pass TYPE solely in registers. */
7090 static bool
7091 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
7093 if (must_pass_in_stack_var_size_or_pad (mode, type))
7094 return true;
7096 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
7097 The layout_type routine is crafty and tries to trick us into passing
7098 currently unsupported vector types on the stack by using TImode. */
7099 return (!TARGET_64BIT && mode == TImode
7100 && type && TREE_CODE (type) != VECTOR_TYPE);
7103 /* Return the size, in bytes, of the area reserved for arguments passed
7104 in registers for the function represented by FNDECL, depending on the
7105 ABI format used. */
7106 int
7107 ix86_reg_parm_stack_space (const_tree fndecl)
7109 enum calling_abi call_abi = SYSV_ABI;
7110 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
7111 call_abi = ix86_function_abi (fndecl);
7112 else
7113 call_abi = ix86_function_type_abi (fndecl);
7114 if (TARGET_64BIT && call_abi == MS_ABI)
7115 return 32;
7116 return 0;
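/* Illustrative consequence (hypothetical user code): for a 64-bit MS-ABI
   callee such as

       void callee (int a);
       callee (1);

   the caller still reserves the fixed 32-byte home area for RCX, RDX, R8
   and R9 returned above, while a SysV caller reserves nothing.  */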
7119 /* We add this as a workaround in order to use libc_has_function
7120 hook in i386.md. */
7121 bool
7122 ix86_libc_has_function (enum function_class fn_class)
7124 return targetm.libc_has_function (fn_class);
7127 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the call
7128 ABI used. */
7129 enum calling_abi
7130 ix86_function_type_abi (const_tree fntype)
7132 enum calling_abi abi = ix86_abi;
7134 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
7135 return abi;
7137 if (abi == SYSV_ABI
7138 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
7140 static int warned;
7141 if (TARGET_X32 && !warned)
7143 error ("X32 does not support ms_abi attribute");
7144 warned = 1;
7147 abi = MS_ABI;
7149 else if (abi == MS_ABI
7150 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
7151 abi = SYSV_ABI;
7153 return abi;
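/* Usage sketch (hypothetical declaration, not part of GCC): a SysV
   translation unit can name a Windows-convention callee explicitly,

       int __attribute__ ((ms_abi)) win_callback (int a, int b);

   for which this function returns MS_ABI; the sysv_abi attribute works
   the other way around, and, as the error above shows, ms_abi is rejected
   on x32.  */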
7156 static enum calling_abi
7157 ix86_function_abi (const_tree fndecl)
7159 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
7162 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the call
7163 ABI used. */
7164 enum calling_abi
7165 ix86_cfun_abi (void)
7167 return cfun ? cfun->machine->call_abi : ix86_abi;
7170 static bool
7171 ix86_function_ms_hook_prologue (const_tree fn)
7173 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
7175 if (decl_function_context (fn) != NULL_TREE)
7176 error_at (DECL_SOURCE_LOCATION (fn),
7177 "ms_hook_prologue is not compatible with nested function");
7178 else
7179 return true;
7181 return false;
7184 static bool
7185 ix86_function_naked (const_tree fn)
7187 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
7188 return true;
7190 return false;
7193 /* Write the extra assembler code needed to declare a function properly. */
7195 void
7196 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7197 tree decl)
7199 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7201 if (is_ms_hook)
7203 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7204 unsigned int filler_cc = 0xcccccccc;
7206 for (i = 0; i < filler_count; i += 4)
7207 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7210 #ifdef SUBTARGET_ASM_UNWIND_INIT
7211 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7212 #endif
7214 ASM_OUTPUT_LABEL (asm_out_file, fname);
7216 /* Output magic byte marker, if hot-patch attribute is set. */
7217 if (is_ms_hook)
7219 if (TARGET_64BIT)
7221 /* leaq [%rsp + 0], %rsp */
7222 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7223 asm_out_file);
7225 else
7227 /* movl.s %edi, %edi
7228 push %ebp
7229 movl.s %esp, %ebp */
7230 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
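/* Illustrative effect (hypothetical user code): a function declared as

       void __attribute__ ((ms_hook_prologue)) hookable (void);

   is preceded by 0xCC filler bytes (16 of them on ia32, 32 on x86-64) and
   starts with the hot-patch marker emitted above, e.g. the ia32 bytes
   8b ff 55 8b ec, i.e. "movl.s %edi, %edi; push %ebp; movl.s %esp, %ebp".  */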
7235 /* Implementation of the call ABI switching target hook. The call
7236 register sets specific to FNDECL are applied. See also
7237 ix86_conditional_register_usage for more details. */
7238 void
7239 ix86_call_abi_override (const_tree fndecl)
7241 cfun->machine->call_abi = ix86_function_abi (fndecl);
7244 /* Return 1 if pseudo register should be created and used to hold
7245 GOT address for PIC code. */
7246 bool
7247 ix86_use_pseudo_pic_reg (void)
7249 if ((TARGET_64BIT
7250 && (ix86_cmodel == CM_SMALL_PIC
7251 || TARGET_PECOFF))
7252 || !flag_pic)
7253 return false;
7254 return true;
7257 /* Initialize large model PIC register. */
7259 static void
7260 ix86_init_large_pic_reg (unsigned int tmp_regno)
7262 rtx_code_label *label;
7263 rtx tmp_reg;
7265 gcc_assert (Pmode == DImode);
7266 label = gen_label_rtx ();
7267 emit_label (label);
7268 LABEL_PRESERVE_P (label) = 1;
7269 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7270 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7271 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7272 label));
7273 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7274 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7275 pic_offset_table_rtx, tmp_reg));
7276 const char *name = LABEL_NAME (label);
7277 PUT_CODE (label, NOTE);
7278 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7279 NOTE_DELETED_LABEL_NAME (label) = name;
7282 /* Create and initialize PIC register if required. */
7283 static void
7284 ix86_init_pic_reg (void)
7286 edge entry_edge;
7287 rtx_insn *seq;
7289 if (!ix86_use_pseudo_pic_reg ())
7290 return;
7292 start_sequence ();
7294 if (TARGET_64BIT)
7296 if (ix86_cmodel == CM_LARGE_PIC)
7297 ix86_init_large_pic_reg (R11_REG);
7298 else
7299 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7301 else
7303 /* If there is a future mcount call in the function, it is more profitable
7304 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7305 rtx reg = crtl->profile
7306 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7307 : pic_offset_table_rtx;
7308 rtx_insn *insn = emit_insn (gen_set_got (reg));
7309 RTX_FRAME_RELATED_P (insn) = 1;
7310 if (crtl->profile)
7311 emit_move_insn (pic_offset_table_rtx, reg);
7312 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7315 seq = get_insns ();
7316 end_sequence ();
7318 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7319 insert_insn_on_edge (seq, entry_edge);
7320 commit_one_edge_insertion (entry_edge);
7323 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7324 for a call to a function whose data type is FNTYPE.
7325 For a library call, FNTYPE is 0. */
7327 void
7328 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7329 tree fntype, /* tree ptr for function decl */
7330 rtx libname, /* SYMBOL_REF of library name or 0 */
7331 tree fndecl,
7332 int caller)
7334 struct cgraph_local_info *i = NULL;
7335 struct cgraph_node *target = NULL;
7337 memset (cum, 0, sizeof (*cum));
7339 if (fndecl)
7341 target = cgraph_node::get (fndecl);
7342 if (target)
7344 target = target->function_symbol ();
7345 i = cgraph_node::local_info (target->decl);
7346 cum->call_abi = ix86_function_abi (target->decl);
7348 else
7349 cum->call_abi = ix86_function_abi (fndecl);
7351 else
7352 cum->call_abi = ix86_function_type_abi (fntype);
7354 cum->caller = caller;
7356 /* Set up the number of registers to use for passing arguments. */
7357 cum->nregs = ix86_regparm;
7358 if (TARGET_64BIT)
7360 cum->nregs = (cum->call_abi == SYSV_ABI
7361 ? X86_64_REGPARM_MAX
7362 : X86_64_MS_REGPARM_MAX);
7364 if (TARGET_SSE)
7366 cum->sse_nregs = SSE_REGPARM_MAX;
7367 if (TARGET_64BIT)
7369 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7370 ? X86_64_SSE_REGPARM_MAX
7371 : X86_64_MS_SSE_REGPARM_MAX);
7374 if (TARGET_MMX)
7375 cum->mmx_nregs = MMX_REGPARM_MAX;
7376 cum->warn_avx512f = true;
7377 cum->warn_avx = true;
7378 cum->warn_sse = true;
7379 cum->warn_mmx = true;
7381 /* Because the type might mismatch between caller and callee, we need to
7382 use the actual type of the function for local calls.
7383 FIXME: cgraph_analyze can be told to actually record if a function uses
7384 va_start, so for local functions maybe_vaarg can be made aggressive,
7385 helping K&R code.
7386 FIXME: once the typesystem is fixed, we won't need this code anymore. */
7387 if (i && i->local && i->can_change_signature)
7388 fntype = TREE_TYPE (target->decl);
7389 cum->stdarg = stdarg_p (fntype);
7390 cum->maybe_vaarg = (fntype
7391 ? (!prototype_p (fntype) || stdarg_p (fntype))
7392 : !libname);
7394 cum->decl = fndecl;
7396 cum->warn_empty = !warn_abi || cum->stdarg;
7397 if (!cum->warn_empty && fntype)
7399 function_args_iterator iter;
7400 tree argtype;
7401 bool seen_empty_type = false;
7402 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7404 if (argtype == error_mark_node || VOID_TYPE_P (argtype))
7405 break;
7406 if (TYPE_EMPTY_P (argtype))
7407 seen_empty_type = true;
7408 else if (seen_empty_type)
7410 cum->warn_empty = true;
7411 break;
7416 if (!TARGET_64BIT)
7418 /* If there are variable arguments, then we won't pass anything
7419 in registers in 32-bit mode. */
7420 if (stdarg_p (fntype))
7422 cum->nregs = 0;
7423 /* Since in 32-bit mode variable arguments are always passed on
7424 the stack, there is a scratch register available for an indirect
7425 sibcall. */
7426 cfun->machine->arg_reg_available = true;
7427 cum->sse_nregs = 0;
7428 cum->mmx_nregs = 0;
7429 cum->warn_avx512f = false;
7430 cum->warn_avx = false;
7431 cum->warn_sse = false;
7432 cum->warn_mmx = false;
7433 return;
7436 /* Use ecx and edx registers if function has fastcall attribute,
7437 else look for regparm information. */
7438 if (fntype)
7440 unsigned int ccvt = ix86_get_callcvt (fntype);
7441 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7443 cum->nregs = 1;
7444 cum->fastcall = 1; /* Same first register as in fastcall. */
7446 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7448 cum->nregs = 2;
7449 cum->fastcall = 1;
7451 else
7452 cum->nregs = ix86_function_regparm (fntype, fndecl);
7455 /* Set up the number of SSE registers used for passing SFmode
7456 and DFmode arguments. Warn for mismatching ABI. */
7457 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7460 cfun->machine->arg_reg_available = (cum->nregs > 0);
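/* Illustrative 32-bit conventions resulting from the setup above
   (hypothetical declaration):

       int __attribute__ ((fastcall)) f (int a, int b, int c);

   gets cum->nregs = 2 with the fastcall flag, so a arrives in %ecx, b in
   %edx and c on the stack; thiscall uses only %ecx, and
   __attribute__ ((regparm (3))) uses %eax, %edx and %ecx.  */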
7463 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7464 But in the case of vector types, it is some vector mode.
7466 When we have only some of our vector isa extensions enabled, then there
7467 are some modes for which vector_mode_supported_p is false. For these
7468 modes, the generic vector support in gcc will choose some non-vector mode
7469 in order to implement the type. By computing the natural mode, we'll
7470 select the proper ABI location for the operand and not depend on whatever
7471 the middle-end decides to do with these vector types.
7473 The middle-end can't deal with vector types > 16 bytes. In this
7474 case, we return the original mode and warn about the ABI change if
7475 CUM isn't NULL.
7477 If IN_RETURN is true, warn about the ABI change if the vector mode
7478 isn't available for the function return value. */
7480 static machine_mode
7481 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7482 bool in_return)
7484 machine_mode mode = TYPE_MODE (type);
7486 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7488 HOST_WIDE_INT size = int_size_in_bytes (type);
7489 if ((size == 8 || size == 16 || size == 32 || size == 64)
7490 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7491 && TYPE_VECTOR_SUBPARTS (type) > 1)
7493 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7495 /* There are no XFmode vector modes. */
7496 if (innermode == XFmode)
7497 return mode;
7499 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7500 mode = MIN_MODE_VECTOR_FLOAT;
7501 else
7502 mode = MIN_MODE_VECTOR_INT;
7504 /* Get the mode which has this inner mode and number of units. */
7505 FOR_EACH_MODE_FROM (mode, mode)
7506 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7507 && GET_MODE_INNER (mode) == innermode)
7509 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7511 static bool warnedavx512f;
7512 static bool warnedavx512f_ret;
7514 if (cum && cum->warn_avx512f && !warnedavx512f)
7516 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7517 "without AVX512F enabled changes the ABI"))
7518 warnedavx512f = true;
7520 else if (in_return && !warnedavx512f_ret)
7522 if (warning (OPT_Wpsabi, "AVX512F vector return "
7523 "without AVX512F enabled changes the ABI"))
7524 warnedavx512f_ret = true;
7527 return TYPE_MODE (type);
7529 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7531 static bool warnedavx;
7532 static bool warnedavx_ret;
7534 if (cum && cum->warn_avx && !warnedavx)
7536 if (warning (OPT_Wpsabi, "AVX vector argument "
7537 "without AVX enabled changes the ABI"))
7538 warnedavx = true;
7540 else if (in_return && !warnedavx_ret)
7542 if (warning (OPT_Wpsabi, "AVX vector return "
7543 "without AVX enabled changes the ABI"))
7544 warnedavx_ret = true;
7547 return TYPE_MODE (type);
7549 else if (((size == 8 && TARGET_64BIT) || size == 16)
7550 && !TARGET_SSE
7551 && !TARGET_IAMCU)
7553 static bool warnedsse;
7554 static bool warnedsse_ret;
7556 if (cum && cum->warn_sse && !warnedsse)
7558 if (warning (OPT_Wpsabi, "SSE vector argument "
7559 "without SSE enabled changes the ABI"))
7560 warnedsse = true;
7562 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7564 if (warning (OPT_Wpsabi, "SSE vector return "
7565 "without SSE enabled changes the ABI"))
7566 warnedsse_ret = true;
7569 else if ((size == 8 && !TARGET_64BIT)
7570 && (!cfun
7571 || cfun->machine->func_type == TYPE_NORMAL)
7572 && !TARGET_MMX
7573 && !TARGET_IAMCU)
7575 static bool warnedmmx;
7576 static bool warnedmmx_ret;
7578 if (cum && cum->warn_mmx && !warnedmmx)
7580 if (warning (OPT_Wpsabi, "MMX vector argument "
7581 "without MMX enabled changes the ABI"))
7582 warnedmmx = true;
7584 else if (in_return && !warnedmmx_ret)
7586 if (warning (OPT_Wpsabi, "MMX vector return "
7587 "without MMX enabled changes the ABI"))
7588 warnedmmx_ret = true;
7591 return mode;
7594 gcc_unreachable ();
7598 return mode;
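/* Example of the warning path above (hypothetical user code): compiling

       typedef int v8si __attribute__ ((vector_size (32)));
       v8si add (v8si a, v8si b);

   without AVX enabled makes type_natural_mode fall back to TYPE_MODE and
   emit the -Wpsabi note "AVX vector argument without AVX enabled changes
   the ABI", since V8SImode is not usable in that configuration.  */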
7601 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7602 this may not agree with the mode that the type system has chosen for the
7603 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7604 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7606 static rtx
7607 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7608 unsigned int regno)
7610 rtx tmp;
7612 if (orig_mode != BLKmode)
7613 tmp = gen_rtx_REG (orig_mode, regno);
7614 else
7616 tmp = gen_rtx_REG (mode, regno);
7617 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7618 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7621 return tmp;
7624 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
7625 of this code is to classify each 8bytes of incoming argument by the register
7626 class and assign registers accordingly. */
7628 /* Return the union class of CLASS1 and CLASS2.
7629 See the x86-64 PS ABI for details. */
7631 static enum x86_64_reg_class
7632 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7634 /* Rule #1: If both classes are equal, this is the resulting class. */
7635 if (class1 == class2)
7636 return class1;
7638 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7639 the other class. */
7640 if (class1 == X86_64_NO_CLASS)
7641 return class2;
7642 if (class2 == X86_64_NO_CLASS)
7643 return class1;
7645 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7646 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7647 return X86_64_MEMORY_CLASS;
7649 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7650 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7651 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7652 return X86_64_INTEGERSI_CLASS;
7653 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7654 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7655 return X86_64_INTEGER_CLASS;
7657 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7658 MEMORY is used. */
7659 if (class1 == X86_64_X87_CLASS
7660 || class1 == X86_64_X87UP_CLASS
7661 || class1 == X86_64_COMPLEX_X87_CLASS
7662 || class2 == X86_64_X87_CLASS
7663 || class2 == X86_64_X87UP_CLASS
7664 || class2 == X86_64_COMPLEX_X87_CLASS)
7665 return X86_64_MEMORY_CLASS;
7667 /* Rule #6: Otherwise class SSE is used. */
7668 return X86_64_SSE_CLASS;
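/* Worked example of the merge rules (illustrative): in
   struct { float f; int i; } both members share the first eightbyte; F
   classifies as X86_64_SSESF_CLASS and I as X86_64_INTEGER_CLASS, rule #4
   picks X86_64_INTEGER_CLASS, and the whole struct is passed in a single
   integer register.  */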
7671 /* Classify the argument of type TYPE and mode MODE.
7672 CLASSES will be filled by the register class used to pass each word
7673 of the operand. The number of words is returned. In case the parameter
7674 should be passed in memory, 0 is returned. As a special case for zero
7675 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7677 BIT_OFFSET is used internally for handling records; it specifies the
7678 offset of the field in bits, modulo 512, to avoid overflow cases.
7680 See the x86-64 PS ABI for details. */
7683 static int
7684 classify_argument (machine_mode mode, const_tree type,
7685 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7687 HOST_WIDE_INT bytes =
7688 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7689 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7691 /* Variable sized entities are always passed/returned in memory. */
7692 if (bytes < 0)
7693 return 0;
7695 if (mode != VOIDmode
7696 && targetm.calls.must_pass_in_stack (mode, type))
7697 return 0;
7699 if (type && AGGREGATE_TYPE_P (type))
7701 int i;
7702 tree field;
7703 enum x86_64_reg_class subclasses[MAX_CLASSES];
7705 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7706 if (bytes > 64)
7707 return 0;
7709 for (i = 0; i < words; i++)
7710 classes[i] = X86_64_NO_CLASS;
7712 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
7713 signal the memory class, so handle it as a special case. */
7714 if (!words)
7716 classes[0] = X86_64_NO_CLASS;
7717 return 1;
7720 /* Classify each field of record and merge classes. */
7721 switch (TREE_CODE (type))
7723 case RECORD_TYPE:
7724 /* And now merge the fields of structure. */
7725 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7727 if (TREE_CODE (field) == FIELD_DECL)
7729 int num;
7731 if (TREE_TYPE (field) == error_mark_node)
7732 continue;
7734 /* Bitfields are always classified as integer. Handle them
7735 early, since later code would consider them to be
7736 misaligned integers. */
7737 if (DECL_BIT_FIELD (field))
7739 for (i = (int_bit_position (field)
7740 + (bit_offset % 64)) / 8 / 8;
7741 i < ((int_bit_position (field) + (bit_offset % 64))
7742 + tree_to_shwi (DECL_SIZE (field))
7743 + 63) / 8 / 8; i++)
7744 classes[i] =
7745 merge_classes (X86_64_INTEGER_CLASS,
7746 classes[i]);
7748 else
7750 int pos;
7752 type = TREE_TYPE (field);
7754 /* Flexible array member is ignored. */
7755 if (TYPE_MODE (type) == BLKmode
7756 && TREE_CODE (type) == ARRAY_TYPE
7757 && TYPE_SIZE (type) == NULL_TREE
7758 && TYPE_DOMAIN (type) != NULL_TREE
7759 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7760 == NULL_TREE))
7762 static bool warned;
7764 if (!warned && warn_psabi)
7766 warned = true;
7767 inform (input_location,
7768 "the ABI of passing struct with"
7769 " a flexible array member has"
7770 " changed in GCC 4.4");
7772 continue;
7774 num = classify_argument (TYPE_MODE (type), type,
7775 subclasses,
7776 (int_bit_position (field)
7777 + bit_offset) % 512);
7778 if (!num)
7779 return 0;
7780 pos = (int_bit_position (field)
7781 + (bit_offset % 64)) / 8 / 8;
7782 for (i = 0; i < num && (i + pos) < words; i++)
7783 classes[i + pos] =
7784 merge_classes (subclasses[i], classes[i + pos]);
7788 break;
7790 case ARRAY_TYPE:
7791 /* Arrays are handled as small records. */
7793 int num;
7794 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7795 TREE_TYPE (type), subclasses, bit_offset);
7796 if (!num)
7797 return 0;
7799 /* The partial classes are now full classes. */
7800 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7801 subclasses[0] = X86_64_SSE_CLASS;
7802 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7803 && !((bit_offset % 64) == 0 && bytes == 4))
7804 subclasses[0] = X86_64_INTEGER_CLASS;
7806 for (i = 0; i < words; i++)
7807 classes[i] = subclasses[i % num];
7809 break;
7811 case UNION_TYPE:
7812 case QUAL_UNION_TYPE:
7813 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
7815 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7817 if (TREE_CODE (field) == FIELD_DECL)
7819 int num;
7821 if (TREE_TYPE (field) == error_mark_node)
7822 continue;
7824 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7825 TREE_TYPE (field), subclasses,
7826 bit_offset);
7827 if (!num)
7828 return 0;
7829 for (i = 0; i < num && i < words; i++)
7830 classes[i] = merge_classes (subclasses[i], classes[i]);
7833 break;
7835 default:
7836 gcc_unreachable ();
7839 if (words > 2)
7841 /* When size > 16 bytes, if the first one isn't
7842 X86_64_SSE_CLASS or any other ones aren't
7843 X86_64_SSEUP_CLASS, everything should be passed in
7844 memory. */
7845 if (classes[0] != X86_64_SSE_CLASS)
7846 return 0;
7848 for (i = 1; i < words; i++)
7849 if (classes[i] != X86_64_SSEUP_CLASS)
7850 return 0;
7853 /* Final merger cleanup. */
7854 for (i = 0; i < words; i++)
7856 /* If one class is MEMORY, everything should be passed in
7857 memory. */
7858 if (classes[i] == X86_64_MEMORY_CLASS)
7859 return 0;
7861 /* The X86_64_SSEUP_CLASS should be always preceded by
7862 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7863 if (classes[i] == X86_64_SSEUP_CLASS
7864 && classes[i - 1] != X86_64_SSE_CLASS
7865 && classes[i - 1] != X86_64_SSEUP_CLASS)
7867 /* The first one should never be X86_64_SSEUP_CLASS. */
7868 gcc_assert (i != 0);
7869 classes[i] = X86_64_SSE_CLASS;
7872 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7873 everything should be passed in memory. */
7874 if (classes[i] == X86_64_X87UP_CLASS
7875 && (classes[i - 1] != X86_64_X87_CLASS))
7877 static bool warned;
7879 /* The first one should never be X86_64_X87UP_CLASS. */
7880 gcc_assert (i != 0);
7881 if (!warned && warn_psabi)
7883 warned = true;
7884 inform (input_location,
7885 "the ABI of passing union with long double"
7886 " has changed in GCC 4.4");
7888 return 0;
7891 return words;
7894 /* Compute the alignment needed. We align all types to natural boundaries
7895 with the exception of XFmode, which is aligned to 64 bits. */
7896 if (mode != VOIDmode && mode != BLKmode)
7898 int mode_alignment = GET_MODE_BITSIZE (mode);
7900 if (mode == XFmode)
7901 mode_alignment = 128;
7902 else if (mode == XCmode)
7903 mode_alignment = 256;
7904 if (COMPLEX_MODE_P (mode))
7905 mode_alignment /= 2;
7906 /* Misaligned fields are always returned in memory. */
7907 if (bit_offset % mode_alignment)
7908 return 0;
7911 /* for V1xx modes, just use the base mode */
7912 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7913 && GET_MODE_UNIT_SIZE (mode) == bytes)
7914 mode = GET_MODE_INNER (mode);
7916 /* Classification of atomic types. */
7917 switch (mode)
7919 case E_SDmode:
7920 case E_DDmode:
7921 classes[0] = X86_64_SSE_CLASS;
7922 return 1;
7923 case E_TDmode:
7924 classes[0] = X86_64_SSE_CLASS;
7925 classes[1] = X86_64_SSEUP_CLASS;
7926 return 2;
7927 case E_DImode:
7928 case E_SImode:
7929 case E_HImode:
7930 case E_QImode:
7931 case E_CSImode:
7932 case E_CHImode:
7933 case E_CQImode:
7935 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7937 /* Analyze last 128 bits only. */
7938 size = (size - 1) & 0x7f;
7940 if (size < 32)
7942 classes[0] = X86_64_INTEGERSI_CLASS;
7943 return 1;
7945 else if (size < 64)
7947 classes[0] = X86_64_INTEGER_CLASS;
7948 return 1;
7950 else if (size < 64+32)
7952 classes[0] = X86_64_INTEGER_CLASS;
7953 classes[1] = X86_64_INTEGERSI_CLASS;
7954 return 2;
7956 else if (size < 64+64)
7958 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7959 return 2;
7961 else
7962 gcc_unreachable ();
7964 case E_CDImode:
7965 case E_TImode:
7966 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7967 return 2;
7968 case E_COImode:
7969 case E_OImode:
7970 /* OImode shouldn't be used directly. */
7971 gcc_unreachable ();
7972 case E_CTImode:
7973 return 0;
7974 case E_SFmode:
7975 if (!(bit_offset % 64))
7976 classes[0] = X86_64_SSESF_CLASS;
7977 else
7978 classes[0] = X86_64_SSE_CLASS;
7979 return 1;
7980 case E_DFmode:
7981 classes[0] = X86_64_SSEDF_CLASS;
7982 return 1;
7983 case E_XFmode:
7984 classes[0] = X86_64_X87_CLASS;
7985 classes[1] = X86_64_X87UP_CLASS;
7986 return 2;
7987 case E_TFmode:
7988 classes[0] = X86_64_SSE_CLASS;
7989 classes[1] = X86_64_SSEUP_CLASS;
7990 return 2;
7991 case E_SCmode:
7992 classes[0] = X86_64_SSE_CLASS;
7993 if (!(bit_offset % 64))
7994 return 1;
7995 else
7997 static bool warned;
7999 if (!warned && warn_psabi)
8001 warned = true;
8002 inform (input_location,
8003 "the ABI of passing structure with complex float"
8004 " member has changed in GCC 4.4");
8006 classes[1] = X86_64_SSESF_CLASS;
8007 return 2;
8009 case E_DCmode:
8010 classes[0] = X86_64_SSEDF_CLASS;
8011 classes[1] = X86_64_SSEDF_CLASS;
8012 return 2;
8013 case E_XCmode:
8014 classes[0] = X86_64_COMPLEX_X87_CLASS;
8015 return 1;
8016 case E_TCmode:
8017 /* This mode is larger than 16 bytes. */
8018 return 0;
8019 case E_V8SFmode:
8020 case E_V8SImode:
8021 case E_V32QImode:
8022 case E_V16HImode:
8023 case E_V4DFmode:
8024 case E_V4DImode:
8025 classes[0] = X86_64_SSE_CLASS;
8026 classes[1] = X86_64_SSEUP_CLASS;
8027 classes[2] = X86_64_SSEUP_CLASS;
8028 classes[3] = X86_64_SSEUP_CLASS;
8029 return 4;
8030 case E_V8DFmode:
8031 case E_V16SFmode:
8032 case E_V8DImode:
8033 case E_V16SImode:
8034 case E_V32HImode:
8035 case E_V64QImode:
8036 classes[0] = X86_64_SSE_CLASS;
8037 classes[1] = X86_64_SSEUP_CLASS;
8038 classes[2] = X86_64_SSEUP_CLASS;
8039 classes[3] = X86_64_SSEUP_CLASS;
8040 classes[4] = X86_64_SSEUP_CLASS;
8041 classes[5] = X86_64_SSEUP_CLASS;
8042 classes[6] = X86_64_SSEUP_CLASS;
8043 classes[7] = X86_64_SSEUP_CLASS;
8044 return 8;
8045 case E_V4SFmode:
8046 case E_V4SImode:
8047 case E_V16QImode:
8048 case E_V8HImode:
8049 case E_V2DFmode:
8050 case E_V2DImode:
8051 classes[0] = X86_64_SSE_CLASS;
8052 classes[1] = X86_64_SSEUP_CLASS;
8053 return 2;
8054 case E_V1TImode:
8055 case E_V1DImode:
8056 case E_V2SFmode:
8057 case E_V2SImode:
8058 case E_V4HImode:
8059 case E_V8QImode:
8060 classes[0] = X86_64_SSE_CLASS;
8061 return 1;
8062 case E_BLKmode:
8063 case E_VOIDmode:
8064 return 0;
8065 default:
8066 gcc_assert (VECTOR_MODE_P (mode));
8068 if (bytes > 16)
8069 return 0;
8071 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
8073 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
8074 classes[0] = X86_64_INTEGERSI_CLASS;
8075 else
8076 classes[0] = X86_64_INTEGER_CLASS;
8077 classes[1] = X86_64_INTEGER_CLASS;
8078 return 1 + (bytes > 8);
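/* Worked examples (illustrative): struct { double d; long l; } spans two
   eightbytes and classifies as { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS },
   so it is passed in one SSE and one integer register; a struct containing
   a long double picks up X86_64_X87_CLASS/X86_64_X87UP_CLASS, which
   examine_argument below forces into memory when used as an argument.  */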
8082 /* Examine the argument and set the number of registers required in each
8083 class. Return true iff the parameter should be passed in memory. */
8085 static bool
8086 examine_argument (machine_mode mode, const_tree type, int in_return,
8087 int *int_nregs, int *sse_nregs)
8089 enum x86_64_reg_class regclass[MAX_CLASSES];
8090 int n = classify_argument (mode, type, regclass, 0);
8092 *int_nregs = 0;
8093 *sse_nregs = 0;
8095 if (!n)
8096 return true;
8097 for (n--; n >= 0; n--)
8098 switch (regclass[n])
8100 case X86_64_INTEGER_CLASS:
8101 case X86_64_INTEGERSI_CLASS:
8102 (*int_nregs)++;
8103 break;
8104 case X86_64_SSE_CLASS:
8105 case X86_64_SSESF_CLASS:
8106 case X86_64_SSEDF_CLASS:
8107 (*sse_nregs)++;
8108 break;
8109 case X86_64_NO_CLASS:
8110 case X86_64_SSEUP_CLASS:
8111 break;
8112 case X86_64_X87_CLASS:
8113 case X86_64_X87UP_CLASS:
8114 case X86_64_COMPLEX_X87_CLASS:
8115 if (!in_return)
8116 return true;
8117 break;
8118 case X86_64_MEMORY_CLASS:
8119 gcc_unreachable ();
8122 return false;
8125 /* Construct container for the argument used by GCC interface. See
8126 FUNCTION_ARG for the detailed description. */
8128 static rtx
8129 construct_container (machine_mode mode, machine_mode orig_mode,
8130 const_tree type, int in_return, int nintregs, int nsseregs,
8131 const int *intreg, int sse_regno)
8133 /* The following variables hold the static issued_error state. */
8134 static bool issued_sse_arg_error;
8135 static bool issued_sse_ret_error;
8136 static bool issued_x87_ret_error;
8138 machine_mode tmpmode;
8139 int bytes =
8140 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
8141 enum x86_64_reg_class regclass[MAX_CLASSES];
8142 int n;
8143 int i;
8144 int nexps = 0;
8145 int needed_sseregs, needed_intregs;
8146 rtx exp[MAX_CLASSES];
8147 rtx ret;
8149 n = classify_argument (mode, type, regclass, 0);
8150 if (!n)
8151 return NULL;
8152 if (examine_argument (mode, type, in_return, &needed_intregs,
8153 &needed_sseregs))
8154 return NULL;
8155 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
8156 return NULL;
8158 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
8159 some less clueful developer tries to use floating-point anyway. */
8160 if (needed_sseregs && !TARGET_SSE)
8162 if (in_return)
8164 if (!issued_sse_ret_error)
8166 error ("SSE register return with SSE disabled");
8167 issued_sse_ret_error = true;
8170 else if (!issued_sse_arg_error)
8172 error ("SSE register argument with SSE disabled");
8173 issued_sse_arg_error = true;
8175 return NULL;
8178 /* Likewise, error if the ABI requires us to return values in the
8179 x87 registers and the user specified -mno-80387. */
8180 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
8181 for (i = 0; i < n; i++)
8182 if (regclass[i] == X86_64_X87_CLASS
8183 || regclass[i] == X86_64_X87UP_CLASS
8184 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
8186 if (!issued_x87_ret_error)
8188 error ("x87 register return with x87 disabled");
8189 issued_x87_ret_error = true;
8191 return NULL;
8194 /* First construct simple cases. Avoid SCmode, since we want to use
8195 single register to pass this type. */
8196 if (n == 1 && mode != SCmode)
8197 switch (regclass[0])
8199 case X86_64_INTEGER_CLASS:
8200 case X86_64_INTEGERSI_CLASS:
8201 return gen_rtx_REG (mode, intreg[0]);
8202 case X86_64_SSE_CLASS:
8203 case X86_64_SSESF_CLASS:
8204 case X86_64_SSEDF_CLASS:
8205 if (mode != BLKmode)
8206 return gen_reg_or_parallel (mode, orig_mode,
8207 SSE_REGNO (sse_regno));
8208 break;
8209 case X86_64_X87_CLASS:
8210 case X86_64_COMPLEX_X87_CLASS:
8211 return gen_rtx_REG (mode, FIRST_STACK_REG);
8212 case X86_64_NO_CLASS:
8213 /* Zero sized array, struct or class. */
8214 return NULL;
8215 default:
8216 gcc_unreachable ();
8218 if (n == 2
8219 && regclass[0] == X86_64_SSE_CLASS
8220 && regclass[1] == X86_64_SSEUP_CLASS
8221 && mode != BLKmode)
8222 return gen_reg_or_parallel (mode, orig_mode,
8223 SSE_REGNO (sse_regno));
8224 if (n == 4
8225 && regclass[0] == X86_64_SSE_CLASS
8226 && regclass[1] == X86_64_SSEUP_CLASS
8227 && regclass[2] == X86_64_SSEUP_CLASS
8228 && regclass[3] == X86_64_SSEUP_CLASS
8229 && mode != BLKmode)
8230 return gen_reg_or_parallel (mode, orig_mode,
8231 SSE_REGNO (sse_regno));
8232 if (n == 8
8233 && regclass[0] == X86_64_SSE_CLASS
8234 && regclass[1] == X86_64_SSEUP_CLASS
8235 && regclass[2] == X86_64_SSEUP_CLASS
8236 && regclass[3] == X86_64_SSEUP_CLASS
8237 && regclass[4] == X86_64_SSEUP_CLASS
8238 && regclass[5] == X86_64_SSEUP_CLASS
8239 && regclass[6] == X86_64_SSEUP_CLASS
8240 && regclass[7] == X86_64_SSEUP_CLASS
8241 && mode != BLKmode)
8242 return gen_reg_or_parallel (mode, orig_mode,
8243 SSE_REGNO (sse_regno));
8244 if (n == 2
8245 && regclass[0] == X86_64_X87_CLASS
8246 && regclass[1] == X86_64_X87UP_CLASS)
8247 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8249 if (n == 2
8250 && regclass[0] == X86_64_INTEGER_CLASS
8251 && regclass[1] == X86_64_INTEGER_CLASS
8252 && (mode == CDImode || mode == TImode)
8253 && intreg[0] + 1 == intreg[1])
8254 return gen_rtx_REG (mode, intreg[0]);
8256 /* Otherwise figure out the entries of the PARALLEL. */
8257 for (i = 0; i < n; i++)
8259 int pos;
8261 switch (regclass[i])
8263 case X86_64_NO_CLASS:
8264 break;
8265 case X86_64_INTEGER_CLASS:
8266 case X86_64_INTEGERSI_CLASS:
8267 /* Merge TImodes on aligned occasions here too. */
8268 if (i * 8 + 8 > bytes)
8270 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8271 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8272 /* We've requested 24 bytes for which we
8273 don't have a mode. Use DImode. */
8274 tmpmode = DImode;
8276 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8277 tmpmode = SImode;
8278 else
8279 tmpmode = DImode;
8280 exp [nexps++]
8281 = gen_rtx_EXPR_LIST (VOIDmode,
8282 gen_rtx_REG (tmpmode, *intreg),
8283 GEN_INT (i*8));
8284 intreg++;
8285 break;
8286 case X86_64_SSESF_CLASS:
8287 exp [nexps++]
8288 = gen_rtx_EXPR_LIST (VOIDmode,
8289 gen_rtx_REG (SFmode,
8290 SSE_REGNO (sse_regno)),
8291 GEN_INT (i*8));
8292 sse_regno++;
8293 break;
8294 case X86_64_SSEDF_CLASS:
8295 exp [nexps++]
8296 = gen_rtx_EXPR_LIST (VOIDmode,
8297 gen_rtx_REG (DFmode,
8298 SSE_REGNO (sse_regno)),
8299 GEN_INT (i*8));
8300 sse_regno++;
8301 break;
8302 case X86_64_SSE_CLASS:
8303 pos = i;
8304 switch (n)
8306 case 1:
8307 tmpmode = DImode;
8308 break;
8309 case 2:
8310 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8312 tmpmode = TImode;
8313 i++;
8315 else
8316 tmpmode = DImode;
8317 break;
8318 case 4:
8319 gcc_assert (i == 0
8320 && regclass[1] == X86_64_SSEUP_CLASS
8321 && regclass[2] == X86_64_SSEUP_CLASS
8322 && regclass[3] == X86_64_SSEUP_CLASS);
8323 tmpmode = OImode;
8324 i += 3;
8325 break;
8326 case 8:
8327 gcc_assert (i == 0
8328 && regclass[1] == X86_64_SSEUP_CLASS
8329 && regclass[2] == X86_64_SSEUP_CLASS
8330 && regclass[3] == X86_64_SSEUP_CLASS
8331 && regclass[4] == X86_64_SSEUP_CLASS
8332 && regclass[5] == X86_64_SSEUP_CLASS
8333 && regclass[6] == X86_64_SSEUP_CLASS
8334 && regclass[7] == X86_64_SSEUP_CLASS);
8335 tmpmode = XImode;
8336 i += 7;
8337 break;
8338 default:
8339 gcc_unreachable ();
8341 exp [nexps++]
8342 = gen_rtx_EXPR_LIST (VOIDmode,
8343 gen_rtx_REG (tmpmode,
8344 SSE_REGNO (sse_regno)),
8345 GEN_INT (pos*8));
8346 sse_regno++;
8347 break;
8348 default:
8349 gcc_unreachable ();
8353 /* Empty aligned struct, union or class. */
8354 if (nexps == 0)
8355 return NULL;
8357 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8358 for (i = 0; i < nexps; i++)
8359 XVECEXP (ret, 0, i) = exp [i];
8360 return ret;
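/* Illustrative shape of the result (assumed RTL, not verbatim output): for
   struct { double d; long l; } passed as the first argument the loop above
   builds roughly

       (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                  (expr_list (reg:DI di) (const_int 8))])

   one EXPR_LIST per eightbyte with its byte offset, while a plain double
   takes the single-register shortcut earlier in the function.  */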
8363 /* Update the data in CUM to advance over an argument of mode MODE
8364 and data type TYPE. (TYPE is null for libcalls where that information
8365 may not be available.)
8367 Return the number of integer registers advanced over. */
8369 static int
8370 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8371 const_tree type, HOST_WIDE_INT bytes,
8372 HOST_WIDE_INT words)
8374 int res = 0;
8375 bool error_p = false;
8377 if (TARGET_IAMCU)
8379 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8380 bytes in registers. */
8381 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8382 goto pass_in_reg;
8383 return res;
8386 switch (mode)
8388 default:
8389 break;
8391 case E_BLKmode:
8392 if (bytes < 0)
8393 break;
8394 /* FALLTHRU */
8396 case E_DImode:
8397 case E_SImode:
8398 case E_HImode:
8399 case E_QImode:
8400 pass_in_reg:
8401 cum->words += words;
8402 cum->nregs -= words;
8403 cum->regno += words;
8404 if (cum->nregs >= 0)
8405 res = words;
8406 if (cum->nregs <= 0)
8408 cum->nregs = 0;
8409 cfun->machine->arg_reg_available = false;
8410 cum->regno = 0;
8412 break;
8414 case E_OImode:
8415 /* OImode shouldn't be used directly. */
8416 gcc_unreachable ();
8418 case E_DFmode:
8419 if (cum->float_in_sse == -1)
8420 error_p = true;
8421 if (cum->float_in_sse < 2)
8422 break;
8423 /* FALLTHRU */
8424 case E_SFmode:
8425 if (cum->float_in_sse == -1)
8426 error_p = true;
8427 if (cum->float_in_sse < 1)
8428 break;
8429 /* FALLTHRU */
8431 case E_V8SFmode:
8432 case E_V8SImode:
8433 case E_V64QImode:
8434 case E_V32HImode:
8435 case E_V16SImode:
8436 case E_V8DImode:
8437 case E_V16SFmode:
8438 case E_V8DFmode:
8439 case E_V32QImode:
8440 case E_V16HImode:
8441 case E_V4DFmode:
8442 case E_V4DImode:
8443 case E_TImode:
8444 case E_V16QImode:
8445 case E_V8HImode:
8446 case E_V4SImode:
8447 case E_V2DImode:
8448 case E_V4SFmode:
8449 case E_V2DFmode:
8450 if (!type || !AGGREGATE_TYPE_P (type))
8452 cum->sse_words += words;
8453 cum->sse_nregs -= 1;
8454 cum->sse_regno += 1;
8455 if (cum->sse_nregs <= 0)
8457 cum->sse_nregs = 0;
8458 cum->sse_regno = 0;
8461 break;
8463 case E_V8QImode:
8464 case E_V4HImode:
8465 case E_V2SImode:
8466 case E_V2SFmode:
8467 case E_V1TImode:
8468 case E_V1DImode:
8469 if (!type || !AGGREGATE_TYPE_P (type))
8471 cum->mmx_words += words;
8472 cum->mmx_nregs -= 1;
8473 cum->mmx_regno += 1;
8474 if (cum->mmx_nregs <= 0)
8476 cum->mmx_nregs = 0;
8477 cum->mmx_regno = 0;
8480 break;
8482 if (error_p)
8484 cum->float_in_sse = 0;
8485 error ("calling %qD with SSE calling convention without "
8486 "SSE/SSE2 enabled", cum->decl);
8487 sorry ("this is a GCC bug that can be worked around by adding "
8488 "attribute used to function called");
8491 return res;
8494 static int
8495 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8496 const_tree type, HOST_WIDE_INT words, bool named)
8498 int int_nregs, sse_nregs;
8500 /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
8501 if (!named && (VALID_AVX512F_REG_MODE (mode)
8502 || VALID_AVX256_REG_MODE (mode)))
8503 return 0;
8505 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8506 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8508 cum->nregs -= int_nregs;
8509 cum->sse_nregs -= sse_nregs;
8510 cum->regno += int_nregs;
8511 cum->sse_regno += sse_nregs;
8512 return int_nregs;
8514 else
8516 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8517 cum->words = ROUND_UP (cum->words, align);
8518 cum->words += words;
8519 return 0;
8523 static int
8524 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8525 HOST_WIDE_INT words)
8527 /* Otherwise, this should be passed indirect. */
8528 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8530 cum->words += words;
8531 if (cum->nregs > 0)
8533 cum->nregs -= 1;
8534 cum->regno += 1;
8535 return 1;
8537 return 0;
8540 /* Update the data in CUM to advance over an argument of mode MODE and
8541 data type TYPE. (TYPE is null for libcalls where that information
8542 may not be available.) */
8544 static void
8545 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8546 const_tree type, bool named)
8548 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8549 HOST_WIDE_INT bytes, words;
8550 int nregs;
8552 /* The argument of interrupt handler is a special case and is
8553 handled in ix86_function_arg. */
8554 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8555 return;
8557 if (mode == BLKmode)
8558 bytes = int_size_in_bytes (type);
8559 else
8560 bytes = GET_MODE_SIZE (mode);
8561 words = CEIL (bytes, UNITS_PER_WORD);
8563 if (type)
8564 mode = type_natural_mode (type, NULL, false);
8566 if (TARGET_64BIT)
8568 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8570 if (call_abi == MS_ABI)
8571 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8572 else
8573 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8575 else
8576 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8578 /* For pointers passed in memory we expect bounds passed in Bounds
8579 Table. */
8580 if (!nregs)
8582 /* Track if there are outgoing arguments on stack. */
8583 if (cum->caller)
8584 cfun->machine->outgoing_args_on_stack = true;
8588 /* Define where to put the arguments to a function.
8589 Value is zero to push the argument on the stack,
8590 or a hard register in which to store the argument.
8592 MODE is the argument's machine mode.
8593 TYPE is the data type of the argument (as a tree).
8594 This is null for libcalls where that information may
8595 not be available.
8596 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8597 the preceding args and about the function being called.
8598 NAMED is nonzero if this argument is a named parameter
8599 (otherwise it is an extra parameter matching an ellipsis). */
8601 static rtx
8602 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8603 machine_mode orig_mode, const_tree type,
8604 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8606 bool error_p = false;
8608 /* Avoid the AL settings for the Unix64 ABI. */
8609 if (mode == VOIDmode)
8610 return constm1_rtx;
8612 if (TARGET_IAMCU)
8614 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8615 bytes in registers. */
8616 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8617 goto pass_in_reg;
8618 return NULL_RTX;
8621 switch (mode)
8623 default:
8624 break;
8626 case E_BLKmode:
8627 if (bytes < 0)
8628 break;
8629 /* FALLTHRU */
8630 case E_DImode:
8631 case E_SImode:
8632 case E_HImode:
8633 case E_QImode:
8634 pass_in_reg:
8635 if (words <= cum->nregs)
8637 int regno = cum->regno;
8639 /* Fastcall allocates the first two DWORD (SImode) or
8640 smaller arguments to ECX and EDX if they are not
8641 aggregate types. */
8642 if (cum->fastcall)
8644 if (mode == BLKmode
8645 || mode == DImode
8646 || (type && AGGREGATE_TYPE_P (type)))
8647 break;
8649 /* ECX not EAX is the first allocated register. */
8650 if (regno == AX_REG)
8651 regno = CX_REG;
8653 return gen_rtx_REG (mode, regno);
8655 break;
8657 case E_DFmode:
8658 if (cum->float_in_sse == -1)
8659 error_p = true;
8660 if (cum->float_in_sse < 2)
8661 break;
8662 /* FALLTHRU */
8663 case E_SFmode:
8664 if (cum->float_in_sse == -1)
8665 error_p = true;
8666 if (cum->float_in_sse < 1)
8667 break;
8668 /* FALLTHRU */
8669 case E_TImode:
8670 /* In 32bit, we pass TImode in xmm registers. */
8671 case E_V16QImode:
8672 case E_V8HImode:
8673 case E_V4SImode:
8674 case E_V2DImode:
8675 case E_V4SFmode:
8676 case E_V2DFmode:
8677 if (!type || !AGGREGATE_TYPE_P (type))
8679 if (cum->sse_nregs)
8680 return gen_reg_or_parallel (mode, orig_mode,
8681 cum->sse_regno + FIRST_SSE_REG);
8683 break;
8685 case E_OImode:
8686 case E_XImode:
8687 /* OImode and XImode shouldn't be used directly. */
8688 gcc_unreachable ();
8690 case E_V64QImode:
8691 case E_V32HImode:
8692 case E_V16SImode:
8693 case E_V8DImode:
8694 case E_V16SFmode:
8695 case E_V8DFmode:
8696 case E_V8SFmode:
8697 case E_V8SImode:
8698 case E_V32QImode:
8699 case E_V16HImode:
8700 case E_V4DFmode:
8701 case E_V4DImode:
8702 if (!type || !AGGREGATE_TYPE_P (type))
8704 if (cum->sse_nregs)
8705 return gen_reg_or_parallel (mode, orig_mode,
8706 cum->sse_regno + FIRST_SSE_REG);
8708 break;
8710 case E_V8QImode:
8711 case E_V4HImode:
8712 case E_V2SImode:
8713 case E_V2SFmode:
8714 case E_V1TImode:
8715 case E_V1DImode:
8716 if (!type || !AGGREGATE_TYPE_P (type))
8718 if (cum->mmx_nregs)
8719 return gen_reg_or_parallel (mode, orig_mode,
8720 cum->mmx_regno + FIRST_MMX_REG);
8722 break;
8724 if (error_p)
8726 cum->float_in_sse = 0;
8727 error ("calling %qD with SSE calling convention without "
8728 "SSE/SSE2 enabled", cum->decl);
8729 sorry ("this is a GCC bug that can be worked around by adding "
8730 "attribute used to function called");
8733 return NULL_RTX;
8736 static rtx
8737 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8738 machine_mode orig_mode, const_tree type, bool named)
8740 /* Handle a hidden AL argument containing number of registers
8741 for varargs x86-64 functions. */
8742 if (mode == VOIDmode)
8743 return GEN_INT (cum->maybe_vaarg
8744 ? (cum->sse_nregs < 0
8745 ? X86_64_SSE_REGPARM_MAX
8746 : cum->sse_regno)
8747 : -1);
8749 switch (mode)
8751 default:
8752 break;
8754 case E_V8SFmode:
8755 case E_V8SImode:
8756 case E_V32QImode:
8757 case E_V16HImode:
8758 case E_V4DFmode:
8759 case E_V4DImode:
8760 case E_V16SFmode:
8761 case E_V16SImode:
8762 case E_V64QImode:
8763 case E_V32HImode:
8764 case E_V8DFmode:
8765 case E_V8DImode:
8766 /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
8767 if (!named)
8768 return NULL;
8769 break;
8772 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8773 cum->sse_nregs,
8774 &x86_64_int_parameter_registers [cum->regno],
8775 cum->sse_regno);
8778 static rtx
8779 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8780 machine_mode orig_mode, bool named,
8781 HOST_WIDE_INT bytes)
8783 unsigned int regno;
8785 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
8786 We use a value of -2 to specify that the current function call is MS_ABI. */
8787 if (mode == VOIDmode)
8788 return GEN_INT (-2);
8790 /* If we've run out of registers, it goes on the stack. */
8791 if (cum->nregs == 0)
8792 return NULL_RTX;
8794 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8796 /* Only floating point modes are passed in anything but integer regs. */
8797 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8799 if (named)
8800 regno = cum->regno + FIRST_SSE_REG;
8801 else
8803 rtx t1, t2;
8805 /* Unnamed floating parameters are passed in both the
8806 SSE and integer registers. */
8807 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8808 t2 = gen_rtx_REG (mode, regno);
8809 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8810 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8811 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8814 /* Handle aggregated types passed in register. */
8815 if (orig_mode == BLKmode)
8817 if (bytes > 0 && bytes <= 8)
8818 mode = (bytes > 4 ? DImode : SImode);
8819 if (mode == BLKmode)
8820 mode = DImode;
8823 return gen_reg_or_parallel (mode, orig_mode, regno);
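/* Illustrative example (hypothetical declaration): for a variadic MS-ABI
   callee such as

       int logf_like (const char *fmt, ...);

   a double passed as the unnamed second argument is supplied both in
   %xmm1 and in %rdx, which is what the two-element PARALLEL above
   expresses.  */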
8826 /* Return where to put the arguments to a function.
8827 Return zero to push the argument on the stack, or a hard register in
8828 which to store the argument.
8829 MODE is the argument's machine mode. TYPE is the data type of the
8830 argument. It is null for libcalls where that information may not be
8831 available. CUM gives information about the preceding args and about
8832 the function being called. NAMED is nonzero if this argument is a
8833 named parameter (otherwise it is an extra parameter matching an
8834 ellipsis). */
8836 static rtx
8837 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8838 const_tree type, bool named)
8840 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8841 machine_mode mode = omode;
8842 HOST_WIDE_INT bytes, words;
8843 rtx arg;
8845 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8847 gcc_assert (type != NULL_TREE);
8848 if (POINTER_TYPE_P (type))
8850 /* This is the pointer argument. */
8851 gcc_assert (TYPE_MODE (type) == Pmode);
8852 /* It is at -WORD(AP) in the current frame in interrupt and
8853 exception handlers. */
8854 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8856 else
8858 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8859 && TREE_CODE (type) == INTEGER_TYPE
8860 && TYPE_MODE (type) == word_mode);
8861 /* The error code is the word-mode integer argument at
8862 -2 * WORD(AP) in the current frame of the exception
8863 handler. */
8864 arg = gen_rtx_MEM (word_mode,
8865 plus_constant (Pmode,
8866 arg_pointer_rtx,
8867 -2 * UNITS_PER_WORD));
8869 return arg;
8872 if (mode == BLKmode)
8873 bytes = int_size_in_bytes (type);
8874 else
8875 bytes = GET_MODE_SIZE (mode);
8876 words = CEIL (bytes, UNITS_PER_WORD);
8878 /* To simplify the code below, represent vector types with a vector mode
8879 even if MMX/SSE are not active. */
8880 if (type && TREE_CODE (type) == VECTOR_TYPE)
8881 mode = type_natural_mode (type, cum, false);
8883 if (TARGET_64BIT)
8885 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8887 if (call_abi == MS_ABI)
8888 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8889 else
8890 arg = function_arg_64 (cum, mode, omode, type, named);
8892 else
8893 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8895 /* Track if there are outgoing arguments on stack. */
8896 if (arg == NULL_RTX && cum->caller)
8897 cfun->machine->outgoing_args_on_stack = true;
8899 return arg;
8902 /* A C expression that indicates when an argument must be passed by
8903 reference. If nonzero for an argument, a copy of that argument is
8904 made in memory and a pointer to the argument is passed instead of
8905 the argument itself. The pointer is passed in whatever way is
8906 appropriate for passing a pointer to that type. */
8908 static bool
8909 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8910 const_tree type, bool)
8912 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8914 if (TARGET_64BIT)
8916 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8918 /* See Windows x64 Software Convention. */
8919 if (call_abi == MS_ABI)
8921 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8923 if (type)
8925 /* Arrays are passed by reference. */
8926 if (TREE_CODE (type) == ARRAY_TYPE)
8927 return true;
8929 if (RECORD_OR_UNION_TYPE_P (type))
8931 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8932 are passed by reference. */
8933 msize = int_size_in_bytes (type);
8937 /* __m128 is passed by reference. */
8938 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8940 else if (type && int_size_in_bytes (type) == -1)
8941 return true;
8944 return false;
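/* Illustrative consequences of the Windows rule above (hypothetical types):
   under the MS x64 ABI a 3-byte struct { char c[3]; } and a 16-byte __m128
   are passed by reference, while an 8-byte struct { double d; } is passed
   by value in a register; on the SysV side only variable-sized objects
   take this path.  */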
8947 /* Return true when TYPE should be 128bit aligned for 32bit argument
8948 passing ABI. XXX: This function is obsolete and is only used for
8949 checking psABI compatibility with previous versions of GCC. */
8951 static bool
8952 ix86_compat_aligned_value_p (const_tree type)
8954 machine_mode mode = TYPE_MODE (type);
8955 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8956 || mode == TDmode
8957 || mode == TFmode
8958 || mode == TCmode)
8959 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8960 return true;
8961 if (TYPE_ALIGN (type) < 128)
8962 return false;
8964 if (AGGREGATE_TYPE_P (type))
8966 /* Walk the aggregates recursively. */
8967 switch (TREE_CODE (type))
8969 case RECORD_TYPE:
8970 case UNION_TYPE:
8971 case QUAL_UNION_TYPE:
8973 tree field;
8975 /* Walk all the structure fields. */
8976 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8978 if (TREE_CODE (field) == FIELD_DECL
8979 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8980 return true;
8982 break;
8985 case ARRAY_TYPE:
8986 /* Just in case some languages pass arrays by value. */
8987 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8988 return true;
8989 break;
8991 default:
8992 gcc_unreachable ();
8995 return false;
8998 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8999 XXX: This function is obsolete and is only used for checking psABI
9000 compatibility with previous versions of GCC. */
9002 static unsigned int
9003 ix86_compat_function_arg_boundary (machine_mode mode,
9004 const_tree type, unsigned int align)
9006 /* In 32bit, only _Decimal128 and __float128 are aligned to their
9007 natural boundaries. */
9008 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
9010 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
9011 make an exception for SSE modes since these require 128bit
9012 alignment.
9014 The handling here differs from field_alignment. ICC aligns MMX
9015 arguments to 4 byte boundaries, while structure fields are aligned
9016 to 8 byte boundaries. */
9017 if (!type)
9019 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
9020 align = PARM_BOUNDARY;
9022 else
9024 if (!ix86_compat_aligned_value_p (type))
9025 align = PARM_BOUNDARY;
9028 if (align > BIGGEST_ALIGNMENT)
9029 align = BIGGEST_ALIGNMENT;
9030 return align;
9033 /* Return true when TYPE should be 128bit aligned for 32bit argument
9034 passing ABI. */
9036 static bool
9037 ix86_contains_aligned_value_p (const_tree type)
9039 machine_mode mode = TYPE_MODE (type);
9041 if (mode == XFmode || mode == XCmode)
9042 return false;
9044 if (TYPE_ALIGN (type) < 128)
9045 return false;
9047 if (AGGREGATE_TYPE_P (type))
9049 /* Walk the aggregates recursively. */
9050 switch (TREE_CODE (type))
9052 case RECORD_TYPE:
9053 case UNION_TYPE:
9054 case QUAL_UNION_TYPE:
9056 tree field;
9058 /* Walk all the structure fields. */
9059 for (field = TYPE_FIELDS (type);
9060 field;
9061 field = DECL_CHAIN (field))
9063 if (TREE_CODE (field) == FIELD_DECL
9064 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
9065 return true;
9067 break;
9070 case ARRAY_TYPE:
9071 /* Just in case some languages pass arrays by value. */
9072 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
9073 return true;
9074 break;
9076 default:
9077 gcc_unreachable ();
9080 else
9081 return TYPE_ALIGN (type) >= 128;
9083 return false;
9086 /* Gives the alignment boundary, in bits, of an argument with the
9087 specified mode and type. */
9089 static unsigned int
9090 ix86_function_arg_boundary (machine_mode mode, const_tree type)
9092 unsigned int align;
9093 if (type)
9095 /* Since the main variant type is what is used for the call, convert
9096 TYPE to its main variant. */
9097 type = TYPE_MAIN_VARIANT (type);
9098 align = TYPE_ALIGN (type);
9099 if (TYPE_EMPTY_P (type))
9100 return PARM_BOUNDARY;
9102 else
9103 align = GET_MODE_ALIGNMENT (mode);
9104 if (align < PARM_BOUNDARY)
9105 align = PARM_BOUNDARY;
9106 else
9108 static bool warned;
9109 unsigned int saved_align = align;
9111 if (!TARGET_64BIT)
9113 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
9114 if (!type)
9116 if (mode == XFmode || mode == XCmode)
9117 align = PARM_BOUNDARY;
9119 else if (!ix86_contains_aligned_value_p (type))
9120 align = PARM_BOUNDARY;
9122 if (align < 128)
9123 align = PARM_BOUNDARY;
9126 if (warn_psabi
9127 && !warned
9128 && align != ix86_compat_function_arg_boundary (mode, type,
9129 saved_align))
9131 warned = true;
9132 inform (input_location,
9133 "The ABI for passing parameters with %d-byte"
9134 " alignment has changed in GCC 4.6",
9135 align / BITS_PER_UNIT);
9139 return align;
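/* Illustrative boundaries (assuming default options): on ia32 a plain
   double argument stays at the 32-bit PARM_BOUNDARY, whereas an __m128 or
   an aggregate containing one is aligned to 128 bits; the -Wpsabi note
   above fires when the result differs from the pre-GCC 4.6 rule.  */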
9142 /* Return true if N is a possible register number of function value. */
9144 static bool
9145 ix86_function_value_regno_p (const unsigned int regno)
9147 switch (regno)
9149 case AX_REG:
9150 return true;
9151 case DX_REG:
9152 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9153 case DI_REG:
9154 case SI_REG:
9155 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9157 /* Complex values are returned in %st(0)/%st(1) pair. */
9158 case ST0_REG:
9159 case ST1_REG:
9160 /* TODO: The function should depend on current function ABI but
9161 builtins.c would need updating then. Therefore we use the
9162 default ABI. */
9163 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9164 return false;
9165 return TARGET_FLOAT_RETURNS_IN_80387;
9167 /* Complex values are returned in %xmm0/%xmm1 pair. */
9168 case XMM0_REG:
9169 case XMM1_REG:
9170 return TARGET_SSE;
9172 case MM0_REG:
9173 if (TARGET_MACHO || TARGET_64BIT)
9174 return false;
9175 return TARGET_MMX;
9178 return false;
9181 /* Define how to find the value returned by a function.
9182 VALTYPE is the data type of the value (as a tree).
9183 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9184 otherwise, FUNC is 0. */
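/* Roughly, for the 32-bit cases handled below: integers come back in
   %eax, x87 floats in %st(0) (or %xmm0 for sseregparm/SSE-math
   functions), __m64 in %mm0, and 16/32/64-byte vectors in
   %xmm0/%ymm0/%zmm0.  This is only a summary of the code below, not
   the ABI specification.  */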
9186 static rtx
9187 function_value_32 (machine_mode orig_mode, machine_mode mode,
9188 const_tree fntype, const_tree fn)
9190 unsigned int regno;
9192 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9193 we normally prevent this case when mmx is not available. However
9194 some ABIs may require the result to be returned like DImode. */
9195 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9196 regno = FIRST_MMX_REG;
9198 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9199 we prevent this case when sse is not available. However some ABIs
9200 may require the result to be returned like integer TImode. */
9201 else if (mode == TImode
9202 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9203 regno = FIRST_SSE_REG;
9205 /* 32-byte vector modes in %ymm0. */
9206 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9207 regno = FIRST_SSE_REG;
9209 /* 64-byte vector modes in %zmm0. */
9210 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9211 regno = FIRST_SSE_REG;
9213 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9214 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9215 regno = FIRST_FLOAT_REG;
9216 else
9217 /* Most things go in %eax. */
9218 regno = AX_REG;
9220 /* Override FP return register with %xmm0 for local functions when
9221 SSE math is enabled or for functions with sseregparm attribute. */
9222 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9224 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9225 if (sse_level == -1)
9227 error ("calling %qD with SSE calling convention without "
9228 "SSE/SSE2 enabled", fn);
9229 sorry ("this is a GCC bug that can be worked around by adding "
9230 "attribute used to function called");
9232 else if ((sse_level >= 1 && mode == SFmode)
9233 || (sse_level == 2 && mode == DFmode))
9234 regno = FIRST_SSE_REG;
9237 /* OImode shouldn't be used directly. */
9238 gcc_assert (mode != OImode);
9240 return gen_rtx_REG (orig_mode, regno);
9243 static rtx
9244 function_value_64 (machine_mode orig_mode, machine_mode mode,
9245 const_tree valtype)
9247 rtx ret;
9249 /* Handle libcalls, which don't provide a type node. */
9250 if (valtype == NULL)
9252 unsigned int regno;
9254 switch (mode)
9256 case E_SFmode:
9257 case E_SCmode:
9258 case E_DFmode:
9259 case E_DCmode:
9260 case E_TFmode:
9261 case E_SDmode:
9262 case E_DDmode:
9263 case E_TDmode:
9264 regno = FIRST_SSE_REG;
9265 break;
9266 case E_XFmode:
9267 case E_XCmode:
9268 regno = FIRST_FLOAT_REG;
9269 break;
9270 case E_TCmode:
9271 return NULL;
9272 default:
9273 regno = AX_REG;
9276 return gen_rtx_REG (mode, regno);
9278 else if (POINTER_TYPE_P (valtype))
9280 /* Pointers are always returned in word_mode. */
9281 mode = word_mode;
9284 ret = construct_container (mode, orig_mode, valtype, 1,
9285 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9286 x86_64_int_return_registers, 0);
9288 /* For zero sized structures, construct_container returns NULL, but we
9289 need to keep the rest of the compiler happy by returning a meaningful value. */
9290 if (!ret)
9291 ret = gen_rtx_REG (orig_mode, AX_REG);
9293 return ret;
9296 static rtx
9297 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9298 const_tree valtype)
9300 unsigned int regno = AX_REG;
9302 if (TARGET_SSE)
9304 switch (GET_MODE_SIZE (mode))
9306 case 16:
9307 if (valtype != NULL_TREE
9308 && !VECTOR_INTEGER_TYPE_P (valtype)
9309 && !VECTOR_INTEGER_TYPE_P (valtype)
9310 && !INTEGRAL_TYPE_P (valtype)
9311 && !VECTOR_FLOAT_TYPE_P (valtype))
9312 break;
9313 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9314 && !COMPLEX_MODE_P (mode))
9315 regno = FIRST_SSE_REG;
9316 break;
9317 case 8:
9318 case 4:
9319 if (mode == SFmode || mode == DFmode)
9320 regno = FIRST_SSE_REG;
9321 break;
9322 default:
9323 break;
9326 return gen_rtx_REG (orig_mode, regno);
9329 static rtx
9330 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9331 machine_mode orig_mode, machine_mode mode)
9333 const_tree fn, fntype;
9335 fn = NULL_TREE;
9336 if (fntype_or_decl && DECL_P (fntype_or_decl))
9337 fn = fntype_or_decl;
9338 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9340 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9341 return function_value_ms_64 (orig_mode, mode, valtype);
9342 else if (TARGET_64BIT)
9343 return function_value_64 (orig_mode, mode, valtype);
9344 else
9345 return function_value_32 (orig_mode, mode, fntype, fn);
9348 static rtx
9349 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9351 machine_mode mode, orig_mode;
9353 orig_mode = TYPE_MODE (valtype);
9354 mode = type_natural_mode (valtype, NULL, true);
9355 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9358 /* Pointer function arguments and return values are promoted to
9359 word_mode for normal functions. */
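/* For example (illustrative): with -mx32 a 32-bit pointer argument or
   return value is widened to DImode (word_mode) and zero-extended,
   because POINTERS_EXTEND_UNSIGNED is in effect.  */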
9361 static machine_mode
9362 ix86_promote_function_mode (const_tree type, machine_mode mode,
9363 int *punsignedp, const_tree fntype,
9364 int for_return)
9366 if (cfun->machine->func_type == TYPE_NORMAL
9367 && type != NULL_TREE
9368 && POINTER_TYPE_P (type))
9370 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9371 return word_mode;
9373 return default_promote_function_mode (type, mode, punsignedp, fntype,
9374 for_return);
9377 /* Return true if a structure, union or array with MODE containing FIELD
9378 should be accessed using BLKmode. */
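/* For instance, a union such as "union u { long double x; int i; }"
   ends up in BLKmode, since the XFmode member leaves padding that a
   plain XFmode access would not cover.  */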
9380 static bool
9381 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9383 /* Union with XFmode must be in BLKmode. */
9384 return (mode == XFmode
9385 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9386 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9390 ix86_libcall_value (machine_mode mode)
9392 return ix86_function_value_1 (NULL, NULL, mode, mode);
9395 /* Return true iff type is returned in memory. */
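/* A couple of illustrative 32-bit cases from the logic below: a
   12-byte struct is returned in memory, while a __m128 value is
   returned in %xmm0 when SSE is enabled.  */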
9397 static bool
9398 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9400 #ifdef SUBTARGET_RETURN_IN_MEMORY
9401 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9402 #else
9403 const machine_mode mode = type_natural_mode (type, NULL, true);
9404 HOST_WIDE_INT size;
9406 if (TARGET_64BIT)
9408 if (ix86_function_type_abi (fntype) == MS_ABI)
9410 size = int_size_in_bytes (type);
9412 /* __m128 is returned in xmm0. */
9413 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9414 || INTEGRAL_TYPE_P (type)
9415 || VECTOR_FLOAT_TYPE_P (type))
9416 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9417 && !COMPLEX_MODE_P (mode)
9418 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9419 return false;
9421 /* Otherwise, the size must be exactly in [1248]. */
9422 return size != 1 && size != 2 && size != 4 && size != 8;
9424 else
9426 int needed_intregs, needed_sseregs;
9428 return examine_argument (mode, type, 1,
9429 &needed_intregs, &needed_sseregs);
9432 else
9434 size = int_size_in_bytes (type);
9436 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9437 bytes in registers. */
9438 if (TARGET_IAMCU)
9439 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9441 if (mode == BLKmode)
9442 return true;
9444 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9445 return false;
9447 if (VECTOR_MODE_P (mode) || mode == TImode)
9449 /* User-created vectors small enough to fit in EAX. */
9450 if (size < 8)
9451 return false;
9453 /* Unless the ABI prescribes otherwise,
9454 MMX/3dNow values are returned in MM0 if available. */
9456 if (size == 8)
9457 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9459 /* SSE values are returned in XMM0 if available. */
9460 if (size == 16)
9461 return !TARGET_SSE;
9463 /* AVX values are returned in YMM0 if available. */
9464 if (size == 32)
9465 return !TARGET_AVX;
9467 /* AVX512F values are returned in ZMM0 if available. */
9468 if (size == 64)
9469 return !TARGET_AVX512F;
9472 if (mode == XFmode)
9473 return false;
9475 if (size > 12)
9476 return true;
9478 /* OImode shouldn't be used directly. */
9479 gcc_assert (mode != OImode);
9481 return false;
9483 #endif
9487 /* Create the va_list data type. */
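/* For the 64-bit SysV ABI the type built below corresponds roughly to
   this C sketch (illustrative only; the real type is built as trees):

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];  */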
9489 static tree
9490 ix86_build_builtin_va_list_64 (void)
9492 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9494 record = lang_hooks.types.make_type (RECORD_TYPE);
9495 type_decl = build_decl (BUILTINS_LOCATION,
9496 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9498 f_gpr = build_decl (BUILTINS_LOCATION,
9499 FIELD_DECL, get_identifier ("gp_offset"),
9500 unsigned_type_node);
9501 f_fpr = build_decl (BUILTINS_LOCATION,
9502 FIELD_DECL, get_identifier ("fp_offset"),
9503 unsigned_type_node);
9504 f_ovf = build_decl (BUILTINS_LOCATION,
9505 FIELD_DECL, get_identifier ("overflow_arg_area"),
9506 ptr_type_node);
9507 f_sav = build_decl (BUILTINS_LOCATION,
9508 FIELD_DECL, get_identifier ("reg_save_area"),
9509 ptr_type_node);
9511 va_list_gpr_counter_field = f_gpr;
9512 va_list_fpr_counter_field = f_fpr;
9514 DECL_FIELD_CONTEXT (f_gpr) = record;
9515 DECL_FIELD_CONTEXT (f_fpr) = record;
9516 DECL_FIELD_CONTEXT (f_ovf) = record;
9517 DECL_FIELD_CONTEXT (f_sav) = record;
9519 TYPE_STUB_DECL (record) = type_decl;
9520 TYPE_NAME (record) = type_decl;
9521 TYPE_FIELDS (record) = f_gpr;
9522 DECL_CHAIN (f_gpr) = f_fpr;
9523 DECL_CHAIN (f_fpr) = f_ovf;
9524 DECL_CHAIN (f_ovf) = f_sav;
9526 layout_type (record);
9528 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9529 NULL_TREE, TYPE_ATTRIBUTES (record));
9531 /* The correct type is an array type of one element. */
9532 return build_array_type (record, build_index_type (size_zero_node));
9535 /* Set up the builtin va_list data type and, for 64-bit, the additional
9536 calling convention specific va_list data types. */
9538 static tree
9539 ix86_build_builtin_va_list (void)
9541 if (TARGET_64BIT)
9543 /* Initialize ABI specific va_list builtin types.
9545 In lto1, we can encounter two va_list types:
9546 - one as a result of the type-merge across TUs, and
9547 - the one constructed here.
9548 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9549 a type identity check in canonical_va_list_type based on
9550 TYPE_MAIN_VARIANT (which we used to have) will not work.
9551 Instead, we tag each va_list_type_node with its unique attribute, and
9552 look for the attribute in the type identity check in
9553 canonical_va_list_type.
9555 Tagging sysv_va_list_type_node directly with the attribute is
9556 problematic since it's an array of one record, which will degrade into a
9557 pointer to a record when used as a parameter (see build_va_arg comments for
9558 an example), dropping the attribute in the process. So we tag the
9559 record instead. */
9561 /* For SYSV_ABI we use an array of one record. */
9562 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9564 /* For MS_ABI we use plain pointer to argument area. */
9565 tree char_ptr_type = build_pointer_type (char_type_node);
9566 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9567 TYPE_ATTRIBUTES (char_ptr_type));
9568 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9570 return ((ix86_abi == MS_ABI)
9571 ? ms_va_list_type_node
9572 : sysv_va_list_type_node);
9574 else
9576 /* For i386 we use plain pointer to argument area. */
9577 return build_pointer_type (char_type_node);
9581 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
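/* With the default SysV limits used below this reserves 6 * 8 = 48
   bytes for the GPR part and 8 * 16 = 128 bytes for the XMM part of
   the register save area, when the function actually uses the
   corresponding va_list fields.  */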
9583 static void
9584 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9586 rtx save_area, mem;
9587 alias_set_type set;
9588 int i, max;
9590 /* GPR size of varargs save area. */
9591 if (cfun->va_list_gpr_size)
9592 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9593 else
9594 ix86_varargs_gpr_size = 0;
9596 /* FPR size of varargs save area. We don't need it if we don't pass
9597 anything in SSE registers. */
9598 if (TARGET_SSE && cfun->va_list_fpr_size)
9599 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9600 else
9601 ix86_varargs_fpr_size = 0;
9603 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9604 return;
9606 save_area = frame_pointer_rtx;
9607 set = get_varargs_alias_set ();
9609 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9610 if (max > X86_64_REGPARM_MAX)
9611 max = X86_64_REGPARM_MAX;
9613 for (i = cum->regno; i < max; i++)
9615 mem = gen_rtx_MEM (word_mode,
9616 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9617 MEM_NOTRAP_P (mem) = 1;
9618 set_mem_alias_set (mem, set);
9619 emit_move_insn (mem,
9620 gen_rtx_REG (word_mode,
9621 x86_64_int_parameter_registers[i]));
9624 if (ix86_varargs_fpr_size)
9626 machine_mode smode;
9627 rtx_code_label *label;
9628 rtx test;
9630 /* Now emit code to save SSE registers. The AX parameter contains the number
9631 of SSE parameter registers used to call this function, though all we
9632 actually check here is the zero/non-zero status. */
9634 label = gen_label_rtx ();
9635 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9636 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9637 label));
9639 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9640 we used movdqa (i.e. TImode) instead? Perhaps even better would
9641 be if we could determine the real mode of the data, via a hook
9642 into pass_stdarg. Ignore all that for now. */
9643 smode = V4SFmode;
9644 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9645 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9647 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9648 if (max > X86_64_SSE_REGPARM_MAX)
9649 max = X86_64_SSE_REGPARM_MAX;
9651 for (i = cum->sse_regno; i < max; ++i)
9653 mem = plus_constant (Pmode, save_area,
9654 i * 16 + ix86_varargs_gpr_size);
9655 mem = gen_rtx_MEM (smode, mem);
9656 MEM_NOTRAP_P (mem) = 1;
9657 set_mem_alias_set (mem, set);
9658 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9660 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9663 emit_label (label);
9667 static void
9668 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9670 alias_set_type set = get_varargs_alias_set ();
9671 int i;
9673 /* Reset to zero, as there might be a sysv va_arg used
9674 before. */
9675 ix86_varargs_gpr_size = 0;
9676 ix86_varargs_fpr_size = 0;
9678 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9680 rtx reg, mem;
9682 mem = gen_rtx_MEM (Pmode,
9683 plus_constant (Pmode, virtual_incoming_args_rtx,
9684 i * UNITS_PER_WORD));
9685 MEM_NOTRAP_P (mem) = 1;
9686 set_mem_alias_set (mem, set);
9688 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9689 emit_move_insn (mem, reg);
9693 static void
9694 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9695 tree type, int *, int no_rtl)
9697 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9698 CUMULATIVE_ARGS next_cum;
9699 tree fntype;
9701 /* This argument doesn't appear to be used anymore, which is good,
9702 because the old code here didn't suppress rtl generation. */
9703 gcc_assert (!no_rtl);
9705 if (!TARGET_64BIT)
9706 return;
9708 fntype = TREE_TYPE (current_function_decl);
9710 /* For varargs, we do not want to skip the dummy va_dcl argument.
9711 For stdargs, we do want to skip the last named argument. */
9712 next_cum = *cum;
9713 if (stdarg_p (fntype))
9714 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9715 true);
9717 if (cum->call_abi == MS_ABI)
9718 setup_incoming_varargs_ms_64 (&next_cum);
9719 else
9720 setup_incoming_varargs_64 (&next_cum);
9723 static void
9724 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9725 machine_mode mode,
9726 tree type,
9727 int *pretend_size ATTRIBUTE_UNUSED,
9728 int no_rtl)
9730 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9731 CUMULATIVE_ARGS next_cum;
9732 tree fntype;
9733 int max;
9735 gcc_assert (!no_rtl);
9737 /* Do nothing if we use plain pointer to argument area. */
9738 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9739 return;
9741 fntype = TREE_TYPE (current_function_decl);
9743 /* For varargs, we do not want to skip the dummy va_dcl argument.
9744 For stdargs, we do want to skip the last named argument. */
9745 next_cum = *cum;
9746 if (stdarg_p (fntype))
9747 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9748 true);
9750 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9751 if (max > X86_64_REGPARM_MAX)
9752 max = X86_64_REGPARM_MAX;
9756 /* Check whether TYPE is the char * variant of va_list. */
9758 static bool
9759 is_va_list_char_pointer (tree type)
9761 tree canonic;
9763 /* For 32-bit it is always true. */
9764 if (!TARGET_64BIT)
9765 return true;
9766 canonic = ix86_canonical_va_list_type (type);
9767 return (canonic == ms_va_list_type_node
9768 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9771 /* Implement va_start. */
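/* Sketch of what the expansion below stores, assuming the SysV x86-64
   layout: gp_offset = 8 * <named GPRs used>, fp_offset =
   48 + 16 * <named SSE regs used>, overflow_arg_area = address of the
   first stack argument, reg_save_area = base of the register save
   area.  */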
9773 static void
9774 ix86_va_start (tree valist, rtx nextarg)
9776 HOST_WIDE_INT words, n_gpr, n_fpr;
9777 tree f_gpr, f_fpr, f_ovf, f_sav;
9778 tree gpr, fpr, ovf, sav, t;
9779 tree type;
9780 rtx ovf_rtx;
9782 if (flag_split_stack
9783 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9785 unsigned int scratch_regno;
9787 /* When we are splitting the stack, we can't refer to the stack
9788 arguments using internal_arg_pointer, because they may be on
9789 the old stack. The split stack prologue will arrange to
9790 leave a pointer to the old stack arguments in a scratch
9791 register, which we here copy to a pseudo-register. The split
9792 stack prologue can't set the pseudo-register directly because
9793 it (the prologue) runs before any registers have been saved. */
9795 scratch_regno = split_stack_prologue_scratch_regno ();
9796 if (scratch_regno != INVALID_REGNUM)
9798 rtx reg;
9799 rtx_insn *seq;
9801 reg = gen_reg_rtx (Pmode);
9802 cfun->machine->split_stack_varargs_pointer = reg;
9804 start_sequence ();
9805 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9806 seq = get_insns ();
9807 end_sequence ();
9809 push_topmost_sequence ();
9810 emit_insn_after (seq, entry_of_function ());
9811 pop_topmost_sequence ();
9815 /* Only 64bit target needs something special. */
9816 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9818 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9819 std_expand_builtin_va_start (valist, nextarg);
9820 else
9822 rtx va_r, next;
9824 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9825 next = expand_binop (ptr_mode, add_optab,
9826 cfun->machine->split_stack_varargs_pointer,
9827 crtl->args.arg_offset_rtx,
9828 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9829 convert_move (va_r, next, 0);
9831 return;
9834 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9835 f_fpr = DECL_CHAIN (f_gpr);
9836 f_ovf = DECL_CHAIN (f_fpr);
9837 f_sav = DECL_CHAIN (f_ovf);
9839 valist = build_simple_mem_ref (valist);
9840 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9841 /* The following should be folded into the MEM_REF offset. */
9842 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9843 f_gpr, NULL_TREE);
9844 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9845 f_fpr, NULL_TREE);
9846 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9847 f_ovf, NULL_TREE);
9848 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9849 f_sav, NULL_TREE);
9851 /* Count number of gp and fp argument registers used. */
9852 words = crtl->args.info.words;
9853 n_gpr = crtl->args.info.regno;
9854 n_fpr = crtl->args.info.sse_regno;
9856 if (cfun->va_list_gpr_size)
9858 type = TREE_TYPE (gpr);
9859 t = build2 (MODIFY_EXPR, type,
9860 gpr, build_int_cst (type, n_gpr * 8));
9861 TREE_SIDE_EFFECTS (t) = 1;
9862 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9865 if (TARGET_SSE && cfun->va_list_fpr_size)
9867 type = TREE_TYPE (fpr);
9868 t = build2 (MODIFY_EXPR, type, fpr,
9869 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9870 TREE_SIDE_EFFECTS (t) = 1;
9871 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9874 /* Find the overflow area. */
9875 type = TREE_TYPE (ovf);
9876 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9877 ovf_rtx = crtl->args.internal_arg_pointer;
9878 else
9879 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9880 t = make_tree (type, ovf_rtx);
9881 if (words != 0)
9882 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9884 t = build2 (MODIFY_EXPR, type, ovf, t);
9885 TREE_SIDE_EFFECTS (t) = 1;
9886 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9888 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9890 /* Find the register save area.
9891 The function prologue saves it right above the stack frame. */
9892 type = TREE_TYPE (sav);
9893 t = make_tree (type, frame_pointer_rtx);
9894 if (!ix86_varargs_gpr_size)
9895 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
9897 t = build2 (MODIFY_EXPR, type, sav, t);
9898 TREE_SIDE_EFFECTS (t) = 1;
9899 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9903 /* Implement va_arg. */
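/* For a single integer argument the gimple emitted below behaves
   roughly like this sketch (SysV x86-64, illustrative only):

     if (ap->gp_offset >= 48)
       {
         addr = ap->overflow_arg_area;
         ap->overflow_arg_area += 8;
       }
     else
       {
         addr = ap->reg_save_area + ap->gp_offset;
         ap->gp_offset += 8;
       }
     result = *(type *) addr;  */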
9905 static tree
9906 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9907 gimple_seq *post_p)
9909 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
9910 tree f_gpr, f_fpr, f_ovf, f_sav;
9911 tree gpr, fpr, ovf, sav, t;
9912 int size, rsize;
9913 tree lab_false, lab_over = NULL_TREE;
9914 tree addr, t2;
9915 rtx container;
9916 int indirect_p = 0;
9917 tree ptrtype;
9918 machine_mode nat_mode;
9919 unsigned int arg_boundary;
9921 /* Only 64bit target needs something special. */
9922 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9923 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
9925 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9926 f_fpr = DECL_CHAIN (f_gpr);
9927 f_ovf = DECL_CHAIN (f_fpr);
9928 f_sav = DECL_CHAIN (f_ovf);
9930 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
9931 valist, f_gpr, NULL_TREE);
9933 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
9934 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
9935 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
9937 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9938 if (indirect_p)
9939 type = build_pointer_type (type);
9940 size = arg_int_size_in_bytes (type);
9941 rsize = CEIL (size, UNITS_PER_WORD);
9943 nat_mode = type_natural_mode (type, NULL, false);
9944 switch (nat_mode)
9946 case E_V8SFmode:
9947 case E_V8SImode:
9948 case E_V32QImode:
9949 case E_V16HImode:
9950 case E_V4DFmode:
9951 case E_V4DImode:
9952 case E_V16SFmode:
9953 case E_V16SImode:
9954 case E_V64QImode:
9955 case E_V32HImode:
9956 case E_V8DFmode:
9957 case E_V8DImode:
9958 /* Unnamed 256 and 512bit vector mode parameters are passed on the stack. */
9959 if (!TARGET_64BIT_MS_ABI)
9961 container = NULL;
9962 break;
9964 /* FALLTHRU */
9966 default:
9967 container = construct_container (nat_mode, TYPE_MODE (type),
9968 type, 0, X86_64_REGPARM_MAX,
9969 X86_64_SSE_REGPARM_MAX, intreg,
9971 break;
9974 /* Pull the value out of the saved registers. */
9976 addr = create_tmp_var (ptr_type_node, "addr");
9978 if (container)
9980 int needed_intregs, needed_sseregs;
9981 bool need_temp;
9982 tree int_addr, sse_addr;
9984 lab_false = create_artificial_label (UNKNOWN_LOCATION);
9985 lab_over = create_artificial_label (UNKNOWN_LOCATION);
9987 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
9989 need_temp = (!REG_P (container)
9990 && ((needed_intregs && TYPE_ALIGN (type) > 64)
9991 || TYPE_ALIGN (type) > 128));
9993 /* If we are passing a structure, verify that it is a consecutive block
9994 in the register save area. If not, we need to do moves. */
9995 if (!need_temp && !REG_P (container))
9997 /* Verify that all registers are strictly consecutive. */
9998 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
10000 int i;
10002 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10004 rtx slot = XVECEXP (container, 0, i);
10005 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
10006 || INTVAL (XEXP (slot, 1)) != i * 16)
10007 need_temp = true;
10010 else
10012 int i;
10014 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
10016 rtx slot = XVECEXP (container, 0, i);
10017 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
10018 || INTVAL (XEXP (slot, 1)) != i * 8)
10019 need_temp = true;
10023 if (!need_temp)
10025 int_addr = addr;
10026 sse_addr = addr;
10028 else
10030 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10031 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10034 /* First ensure that we fit completely in registers. */
10035 if (needed_intregs)
10037 t = build_int_cst (TREE_TYPE (gpr),
10038 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10039 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10040 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10041 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10042 gimplify_and_add (t, pre_p);
10044 if (needed_sseregs)
10046 t = build_int_cst (TREE_TYPE (fpr),
10047 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10048 + X86_64_REGPARM_MAX * 8);
10049 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10050 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10051 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10052 gimplify_and_add (t, pre_p);
10055 /* Compute index to start of area used for integer regs. */
10056 if (needed_intregs)
10058 /* int_addr = gpr + sav; */
10059 t = fold_build_pointer_plus (sav, gpr);
10060 gimplify_assign (int_addr, t, pre_p);
10062 if (needed_sseregs)
10064 /* sse_addr = fpr + sav; */
10065 t = fold_build_pointer_plus (sav, fpr);
10066 gimplify_assign (sse_addr, t, pre_p);
10068 if (need_temp)
10070 int i, prev_size = 0;
10071 tree temp = create_tmp_var (type, "va_arg_tmp");
10073 /* addr = &temp; */
10074 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10075 gimplify_assign (addr, t, pre_p);
10077 for (i = 0; i < XVECLEN (container, 0); i++)
10079 rtx slot = XVECEXP (container, 0, i);
10080 rtx reg = XEXP (slot, 0);
10081 machine_mode mode = GET_MODE (reg);
10082 tree piece_type;
10083 tree addr_type;
10084 tree daddr_type;
10085 tree src_addr, src;
10086 int src_offset;
10087 tree dest_addr, dest;
10088 int cur_size = GET_MODE_SIZE (mode);
10090 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10091 prev_size = INTVAL (XEXP (slot, 1));
10092 if (prev_size + cur_size > size)
10094 cur_size = size - prev_size;
10095 unsigned int nbits = cur_size * BITS_PER_UNIT;
10096 if (!int_mode_for_size (nbits, 1).exists (&mode))
10097 mode = QImode;
10099 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10100 if (mode == GET_MODE (reg))
10101 addr_type = build_pointer_type (piece_type);
10102 else
10103 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10104 true);
10105 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10106 true);
10108 if (SSE_REGNO_P (REGNO (reg)))
10110 src_addr = sse_addr;
10111 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10113 else
10115 src_addr = int_addr;
10116 src_offset = REGNO (reg) * 8;
10118 src_addr = fold_convert (addr_type, src_addr);
10119 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10121 dest_addr = fold_convert (daddr_type, addr);
10122 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10123 if (cur_size == GET_MODE_SIZE (mode))
10125 src = build_va_arg_indirect_ref (src_addr);
10126 dest = build_va_arg_indirect_ref (dest_addr);
10128 gimplify_assign (dest, src, pre_p);
10130 else
10132 tree copy
10133 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10134 3, dest_addr, src_addr,
10135 size_int (cur_size));
10136 gimplify_and_add (copy, pre_p);
10138 prev_size += cur_size;
10142 if (needed_intregs)
10144 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10145 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10146 gimplify_assign (gpr, t, pre_p);
10149 if (needed_sseregs)
10151 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10152 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10153 gimplify_assign (unshare_expr (fpr), t, pre_p);
10156 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10158 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10161 /* ... otherwise out of the overflow area. */
10163 /* When we align a parameter on the stack for the caller, if its
10164 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
10165 aligned to MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here
10166 with the caller. */
10167 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10168 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10169 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10171 /* Care for on-stack alignment if needed. */
10172 if (arg_boundary <= 64 || size == 0)
10173 t = ovf;
10174 else
10176 HOST_WIDE_INT align = arg_boundary / 8;
10177 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10178 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10179 build_int_cst (TREE_TYPE (t), -align));
10182 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10183 gimplify_assign (addr, t, pre_p);
10185 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10186 gimplify_assign (unshare_expr (ovf), t, pre_p);
10188 if (container)
10189 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10191 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10192 addr = fold_convert (ptrtype, addr);
10194 if (indirect_p)
10195 addr = build_va_arg_indirect_ref (addr);
10196 return build_va_arg_indirect_ref (addr);
10199 /* Return true if OPNUM's MEM should be matched
10200 in movabs* patterns. */
10202 bool
10203 ix86_check_movabs (rtx insn, int opnum)
10205 rtx set, mem;
10207 set = PATTERN (insn);
10208 if (GET_CODE (set) == PARALLEL)
10209 set = XVECEXP (set, 0, 0);
10210 gcc_assert (GET_CODE (set) == SET);
10211 mem = XEXP (set, opnum);
10212 while (SUBREG_P (mem))
10213 mem = SUBREG_REG (mem);
10214 gcc_assert (MEM_P (mem));
10215 return volatile_ok || !MEM_VOLATILE_P (mem);
10218 /* Return false if INSN contains a MEM with a non-default address space. */
10219 bool
10220 ix86_check_no_addr_space (rtx insn)
10222 subrtx_var_iterator::array_type array;
10223 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10225 rtx x = *iter;
10226 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10227 return false;
10229 return true;
10232 /* Initialize the table of extra 80387 mathematical constants. */
10234 static void
10235 init_ext_80387_constants (void)
10237 static const char * cst[5] =
10239 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10240 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10241 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10242 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10243 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10245 int i;
10247 for (i = 0; i < 5; i++)
10249 real_from_string (&ext_80387_constants_table[i], cst[i]);
10250 /* Ensure each constant is rounded to XFmode precision. */
10251 real_convert (&ext_80387_constants_table[i],
10252 XFmode, &ext_80387_constants_table[i]);
10255 ext_80387_constants_init = 1;
10258 /* Return non-zero if the constant is something that
10259 can be loaded with a special instruction. */
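/* The return values below map roughly as: 1 = 0.0 (fldz), 2 = 1.0
   (fld1), 3..7 = lg2/ln2/l2e/l2t/pi (fldlg2 .. fldpi), 8 = -0.0 and
   9 = -1.0 (loaded as fldz/fld1 followed by fchs).  */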
10262 standard_80387_constant_p (rtx x)
10264 machine_mode mode = GET_MODE (x);
10266 const REAL_VALUE_TYPE *r;
10268 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10269 return -1;
10271 if (x == CONST0_RTX (mode))
10272 return 1;
10273 if (x == CONST1_RTX (mode))
10274 return 2;
10276 r = CONST_DOUBLE_REAL_VALUE (x);
10278 /* For XFmode constants, try to find a special 80387 instruction when
10279 optimizing for size or on those CPUs that benefit from them. */
10280 if (mode == XFmode
10281 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10283 int i;
10285 if (! ext_80387_constants_init)
10286 init_ext_80387_constants ();
10288 for (i = 0; i < 5; i++)
10289 if (real_identical (r, &ext_80387_constants_table[i]))
10290 return i + 3;
10293 /* Load of the constant -0.0 or -1.0 will be split as
10294 a fldz;fchs or fld1;fchs sequence. */
10295 if (real_isnegzero (r))
10296 return 8;
10297 if (real_identical (r, &dconstm1))
10298 return 9;
10300 return 0;
10303 /* Return the opcode of the special instruction to be used to load
10304 the constant X. */
10306 const char *
10307 standard_80387_constant_opcode (rtx x)
10309 switch (standard_80387_constant_p (x))
10311 case 1:
10312 return "fldz";
10313 case 2:
10314 return "fld1";
10315 case 3:
10316 return "fldlg2";
10317 case 4:
10318 return "fldln2";
10319 case 5:
10320 return "fldl2e";
10321 case 6:
10322 return "fldl2t";
10323 case 7:
10324 return "fldpi";
10325 case 8:
10326 case 9:
10327 return "#";
10328 default:
10329 gcc_unreachable ();
10333 /* Return the CONST_DOUBLE representing the 80387 constant that is
10334 loaded by the specified special instruction. The argument IDX
10335 matches the return value from standard_80387_constant_p. */
10338 standard_80387_constant_rtx (int idx)
10340 int i;
10342 if (! ext_80387_constants_init)
10343 init_ext_80387_constants ();
10345 switch (idx)
10347 case 3:
10348 case 4:
10349 case 5:
10350 case 6:
10351 case 7:
10352 i = idx - 3;
10353 break;
10355 default:
10356 gcc_unreachable ();
10359 return const_double_from_real_value (ext_80387_constants_table[i],
10360 XFmode);
10363 /* Return 1 if X is all bits 0 and 2 if X is all bits 1
10364 in supported SSE/AVX vector mode. */
10367 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10369 machine_mode mode;
10371 if (!TARGET_SSE)
10372 return 0;
10374 mode = GET_MODE (x);
10376 if (x == const0_rtx || const0_operand (x, mode))
10377 return 1;
10379 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10381 /* VOIDmode integer constant, get mode from the predicate. */
10382 if (mode == VOIDmode)
10383 mode = pred_mode;
10385 switch (GET_MODE_SIZE (mode))
10387 case 64:
10388 if (TARGET_AVX512F)
10389 return 2;
10390 break;
10391 case 32:
10392 if (TARGET_AVX2)
10393 return 2;
10394 break;
10395 case 16:
10396 if (TARGET_SSE2)
10397 return 2;
10398 break;
10399 case 0:
10400 /* VOIDmode */
10401 gcc_unreachable ();
10402 default:
10403 break;
10407 return 0;
10410 /* Return the opcode of the special instruction to be used to load
10411 the constant operands[1] into operands[0]. */
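/* A few representative outputs of the code below (illustrative): an
   all-zeros V4SFmode constant in a regular XMM register becomes
   "%vxorps\t%0, %d0", while an all-ones vector becomes "pcmpeqd"
   ("vpcmpeqd" with AVX, "vpternlogd" for EVEX-only registers).  */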
10413 const char *
10414 standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
10416 machine_mode mode;
10417 rtx x = operands[1];
10419 gcc_assert (TARGET_SSE);
10421 mode = GET_MODE (x);
10423 if (x == const0_rtx || const0_operand (x, mode))
10425 switch (get_attr_mode (insn))
10427 case MODE_TI:
10428 if (!EXT_REX_SSE_REG_P (operands[0]))
10429 return "%vpxor\t%0, %d0";
10430 /* FALLTHRU */
10431 case MODE_XI:
10432 case MODE_OI:
10433 if (EXT_REX_SSE_REG_P (operands[0]))
10434 return (TARGET_AVX512VL
10435 ? "vpxord\t%x0, %x0, %x0"
10436 : "vpxord\t%g0, %g0, %g0");
10437 return "vpxor\t%x0, %x0, %x0";
10439 case MODE_V2DF:
10440 if (!EXT_REX_SSE_REG_P (operands[0]))
10441 return "%vxorpd\t%0, %d0";
10442 /* FALLTHRU */
10443 case MODE_V8DF:
10444 case MODE_V4DF:
10445 if (!EXT_REX_SSE_REG_P (operands[0]))
10446 return "vxorpd\t%x0, %x0, %x0";
10447 else if (TARGET_AVX512DQ)
10448 return (TARGET_AVX512VL
10449 ? "vxorpd\t%x0, %x0, %x0"
10450 : "vxorpd\t%g0, %g0, %g0");
10451 else
10452 return (TARGET_AVX512VL
10453 ? "vpxorq\t%x0, %x0, %x0"
10454 : "vpxorq\t%g0, %g0, %g0");
10456 case MODE_V4SF:
10457 if (!EXT_REX_SSE_REG_P (operands[0]))
10458 return "%vxorps\t%0, %d0";
10459 /* FALLTHRU */
10460 case MODE_V16SF:
10461 case MODE_V8SF:
10462 if (!EXT_REX_SSE_REG_P (operands[0]))
10463 return "vxorps\t%x0, %x0, %x0";
10464 else if (TARGET_AVX512DQ)
10465 return (TARGET_AVX512VL
10466 ? "vxorps\t%x0, %x0, %x0"
10467 : "vxorps\t%g0, %g0, %g0");
10468 else
10469 return (TARGET_AVX512VL
10470 ? "vpxord\t%x0, %x0, %x0"
10471 : "vpxord\t%g0, %g0, %g0");
10473 default:
10474 gcc_unreachable ();
10477 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10479 enum attr_mode insn_mode = get_attr_mode (insn);
10481 switch (insn_mode)
10483 case MODE_XI:
10484 case MODE_V8DF:
10485 case MODE_V16SF:
10486 gcc_assert (TARGET_AVX512F);
10487 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10489 case MODE_OI:
10490 case MODE_V4DF:
10491 case MODE_V8SF:
10492 gcc_assert (TARGET_AVX2);
10493 /* FALLTHRU */
10494 case MODE_TI:
10495 case MODE_V2DF:
10496 case MODE_V4SF:
10497 gcc_assert (TARGET_SSE2);
10498 if (!EXT_REX_SSE_REG_P (operands[0]))
10499 return (TARGET_AVX
10500 ? "vpcmpeqd\t%0, %0, %0"
10501 : "pcmpeqd\t%0, %0");
10502 else if (TARGET_AVX512VL)
10503 return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
10504 else
10505 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10507 default:
10508 gcc_unreachable ();
10512 gcc_unreachable ();
10515 /* Returns true if INSN can be transformed from a memory load
10516 to a supported FP constant load. */
10518 bool
10519 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10521 rtx src = find_constant_src (insn);
10523 gcc_assert (REG_P (dst));
10525 if (src == NULL
10526 || (SSE_REGNO_P (REGNO (dst))
10527 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10528 || (STACK_REGNO_P (REGNO (dst))
10529 && standard_80387_constant_p (src) < 1))
10530 return false;
10532 return true;
10535 /* Returns true if OP contains a symbol reference. */
10537 bool
10538 symbolic_reference_mentioned_p (rtx op)
10540 const char *fmt;
10541 int i;
10543 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10544 return true;
10546 fmt = GET_RTX_FORMAT (GET_CODE (op));
10547 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10549 if (fmt[i] == 'E')
10551 int j;
10553 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10554 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10555 return true;
10558 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10559 return true;
10562 return false;
10565 /* Return true if it is appropriate to emit `ret' instructions in the
10566 body of a function. Do this only if the epilogue is simple, needing a
10567 couple of insns. Prior to reloading, we can't tell how many registers
10568 must be saved, so return false then. Return false if there is no frame
10569 marker to de-allocate. */
10571 bool
10572 ix86_can_use_return_insn_p (void)
10574 if (ix86_function_naked (current_function_decl))
10575 return false;
10577 /* Don't use `ret' instruction in interrupt handler. */
10578 if (! reload_completed
10579 || frame_pointer_needed
10580 || cfun->machine->func_type != TYPE_NORMAL)
10581 return 0;
10583 /* Don't allow more than 32k pop, since that's all we can do
10584 with one instruction. */
10585 if (crtl->args.pops_args && crtl->args.size >= 32768)
10586 return 0;
10588 struct ix86_frame &frame = cfun->machine->frame;
10589 return (frame.stack_pointer_offset == UNITS_PER_WORD
10590 && (frame.nregs + frame.nsseregs) == 0);
10593 /* Value should be nonzero if functions must have frame pointers.
10594 Zero means the frame pointer need not be set up (and parms may
10595 be accessed via the stack pointer) in functions that seem suitable. */
10597 static bool
10598 ix86_frame_pointer_required (void)
10600 /* If we accessed previous frames, then the generated code expects
10601 to be able to access the saved ebp value in our frame. */
10602 if (cfun->machine->accesses_prev_frame)
10603 return true;
10605 /* Several x86 OSes need a frame pointer for other reasons,
10606 usually pertaining to setjmp. */
10607 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10608 return true;
10610 /* For older 32-bit runtimes setjmp requires a valid frame pointer. */
10611 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10612 return true;
10614 /* Win64 SEH: very large frames need a frame pointer, as the maximum
10615 stack allocation is 4GB. */
10616 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10617 return true;
10619 /* SSE saves require a frame pointer when the stack is misaligned. */
10620 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10621 return true;
10623 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10624 turns off the frame pointer by default. Turn it back on now if
10625 we've not got a leaf function. */
10626 if (TARGET_OMIT_LEAF_FRAME_POINTER
10627 && (!crtl->is_leaf
10628 || ix86_current_function_calls_tls_descriptor))
10629 return true;
10631 if (crtl->profile && !flag_fentry)
10632 return true;
10634 return false;
10637 /* Record that the current function accesses previous call frames. */
10639 void
10640 ix86_setup_frame_addresses (void)
10642 cfun->machine->accesses_prev_frame = 1;
10645 #ifndef USE_HIDDEN_LINKONCE
10646 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10647 # define USE_HIDDEN_LINKONCE 1
10648 # else
10649 # define USE_HIDDEN_LINKONCE 0
10650 # endif
10651 #endif
10653 /* Label count for call and return thunks. It is used to make unique
10654 labels in call and return thunks. */
10655 static int indirectlabelno;
10657 /* True if call thunk function is needed. */
10658 static bool indirect_thunk_needed = false;
10659 /* True if call thunk function with the BND prefix is needed. */
10660 static bool indirect_thunk_bnd_needed = false;
10662 /* Bit masks of integer registers, which contain the branch target, used
10663 by call thunk functions. */
10664 static int indirect_thunks_used;
10665 /* Bit masks of integer registers, which contain the branch target, used
10666 by call thunk functions with the BND prefix. */
10667 static int indirect_thunks_bnd_used;
10669 /* True if return thunk function is needed. */
10670 static bool indirect_return_needed = false;
10671 /* True if return thunk function with the BND prefix is needed. */
10672 static bool indirect_return_bnd_needed = false;
10674 /* True if return thunk function via CX is needed. */
10675 static bool indirect_return_via_cx;
10676 /* True if return thunk function via CX with the BND prefix is
10677 needed. */
10678 static bool indirect_return_via_cx_bnd;
10680 #ifndef INDIRECT_LABEL
10681 # define INDIRECT_LABEL "LIND"
10682 #endif
10684 /* Indicate what prefix is needed for an indirect branch. */
10685 enum indirect_thunk_prefix
10687 indirect_thunk_prefix_none,
10688 indirect_thunk_prefix_bnd,
10689 indirect_thunk_prefix_nt
10692 /* Return the prefix needed for an indirect branch INSN. */
10694 enum indirect_thunk_prefix
10695 indirect_thunk_need_prefix (rtx_insn *insn)
10697 enum indirect_thunk_prefix need_prefix;
10698 if ((cfun->machine->indirect_branch_type
10699 == indirect_branch_thunk_extern)
10700 && ix86_notrack_prefixed_insn_p (insn))
10702 /* NOTRACK prefix is only used with external thunk so that it
10703 can be properly updated to support CET at run-time. */
10704 need_prefix = indirect_thunk_prefix_nt;
10706 else
10707 need_prefix = indirect_thunk_prefix_none;
10708 return need_prefix;
10711 /* Fills in the label name that should be used for the indirect thunk. */
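/* With USE_HIDDEN_LINKONCE this produces names such as
   "__x86_indirect_thunk" (address on the stack),
   "__x86_indirect_thunk_rax" (address in %rax), or
   "__x86_return_thunk" for the return thunk.  */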
10713 static void
10714 indirect_thunk_name (char name[32], unsigned int regno,
10715 enum indirect_thunk_prefix need_prefix,
10716 bool ret_p)
10718 if (regno != INVALID_REGNUM && regno != CX_REG && ret_p)
10719 gcc_unreachable ();
10721 if (USE_HIDDEN_LINKONCE)
10723 const char *prefix;
10725 if (need_prefix == indirect_thunk_prefix_bnd)
10726 prefix = "_bnd";
10727 else if (need_prefix == indirect_thunk_prefix_nt
10728 && regno != INVALID_REGNUM)
10730 /* NOTRACK prefix is only used with external thunk via
10731 register so that NOTRACK prefix can be added to indirect
10732 branch via register to support CET at run-time. */
10733 prefix = "_nt";
10735 else
10736 prefix = "";
10738 const char *ret = ret_p ? "return" : "indirect";
10740 if (regno != INVALID_REGNUM)
10742 const char *reg_prefix;
10743 if (LEGACY_INT_REGNO_P (regno))
10744 reg_prefix = TARGET_64BIT ? "r" : "e";
10745 else
10746 reg_prefix = "";
10747 sprintf (name, "__x86_%s_thunk%s_%s%s",
10748 ret, prefix, reg_prefix, reg_names[regno]);
10750 else
10751 sprintf (name, "__x86_%s_thunk%s", ret, prefix);
10753 else
10755 if (regno != INVALID_REGNUM)
10757 if (need_prefix == indirect_thunk_prefix_bnd)
10758 ASM_GENERATE_INTERNAL_LABEL (name, "LITBR", regno);
10759 else
10760 ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno);
10762 else
10764 if (ret_p)
10766 if (need_prefix == indirect_thunk_prefix_bnd)
10767 ASM_GENERATE_INTERNAL_LABEL (name, "LRTB", 0);
10768 else
10769 ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0);
10771 else
10773 if (need_prefix == indirect_thunk_prefix_bnd)
10774 ASM_GENERATE_INTERNAL_LABEL (name, "LITB", 0);
10775 else
10776 ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0);
10782 /* Output a call and return thunk for indirect branch. If NEED_PREFIX
10783 is indirect_thunk_prefix_bnd, the BND prefix is needed. If REGNO is
10784 not INVALID_REGNUM, the function address is in REGNO and the thunk looks like:
10786 call L2
10787 L1:
10788 pause
10789 lfence
10790 jmp L1
10791 L2:
10792 mov %REG, (%sp)
10793 ret
10795 Otherwise, the function address is on the top of the stack and the
10796 call and return thunk looks like:
10798 call L2
10799 L1:
10800 pause
10801 lfence
10802 jmp L1
10803 L2:
10804 lea WORD_SIZE(%sp), %sp
10805 ret
10808 static void
10809 output_indirect_thunk (enum indirect_thunk_prefix need_prefix,
10810 unsigned int regno)
10812 char indirectlabel1[32];
10813 char indirectlabel2[32];
10815 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL,
10816 indirectlabelno++);
10817 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL,
10818 indirectlabelno++);
10820 /* Call */
10821 if (need_prefix == indirect_thunk_prefix_bnd)
10822 fputs ("\tbnd call\t", asm_out_file);
10823 else
10824 fputs ("\tcall\t", asm_out_file);
10825 assemble_name_raw (asm_out_file, indirectlabel2);
10826 fputc ('\n', asm_out_file);
10828 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
10830 /* AMD and Intel CPUs each prefer a different instruction as a loop filler.
10831 Using both pause + lfence is a compromise solution. */
10832 fprintf (asm_out_file, "\tpause\n\tlfence\n");
10834 /* Jump. */
10835 fputs ("\tjmp\t", asm_out_file);
10836 assemble_name_raw (asm_out_file, indirectlabel1);
10837 fputc ('\n', asm_out_file);
10839 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
10841 if (regno != INVALID_REGNUM)
10843 /* MOV. */
10844 rtx xops[2];
10845 xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx);
10846 xops[1] = gen_rtx_REG (word_mode, regno);
10847 output_asm_insn ("mov\t{%1, %0|%0, %1}", xops);
10849 else
10851 /* LEA. */
10852 rtx xops[2];
10853 xops[0] = stack_pointer_rtx;
10854 xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10855 output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops);
10858 if (need_prefix == indirect_thunk_prefix_bnd)
10859 fputs ("\tbnd ret\n", asm_out_file);
10860 else
10861 fputs ("\tret\n", asm_out_file);
10864 /* Output a function with a call and return thunk for indirect branch.
10865 If NEED_PREFIX is indirect_thunk_prefix_bnd, the BND prefix is needed.
10866 If REGNO != INVALID_REGNUM, the function address is in REGNO.
10867 Otherwise, the function address is on the top of the stack. The thunk
10868 is used for a function return if RET_P is true. */
10870 static void
10871 output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix,
10872 unsigned int regno, bool ret_p)
10874 char name[32];
10875 tree decl;
10877 /* Create __x86_indirect_thunk/__x86_indirect_thunk_bnd. */
10878 indirect_thunk_name (name, regno, need_prefix, ret_p);
10879 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10880 get_identifier (name),
10881 build_function_type_list (void_type_node, NULL_TREE));
10882 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10883 NULL_TREE, void_type_node);
10884 TREE_PUBLIC (decl) = 1;
10885 TREE_STATIC (decl) = 1;
10886 DECL_IGNORED_P (decl) = 1;
10888 #if TARGET_MACHO
10889 if (TARGET_MACHO)
10891 switch_to_section (darwin_sections[picbase_thunk_section]);
10892 fputs ("\t.weak_definition\t", asm_out_file);
10893 assemble_name (asm_out_file, name);
10894 fputs ("\n\t.private_extern\t", asm_out_file);
10895 assemble_name (asm_out_file, name);
10896 putc ('\n', asm_out_file);
10897 ASM_OUTPUT_LABEL (asm_out_file, name);
10898 DECL_WEAK (decl) = 1;
10900 else
10901 #endif
10902 if (USE_HIDDEN_LINKONCE)
10904 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10906 targetm.asm_out.unique_section (decl, 0);
10907 switch_to_section (get_named_section (decl, NULL, 0));
10909 targetm.asm_out.globalize_label (asm_out_file, name);
10910 fputs ("\t.hidden\t", asm_out_file);
10911 assemble_name (asm_out_file, name);
10912 putc ('\n', asm_out_file);
10913 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10915 else
10917 switch_to_section (text_section);
10918 ASM_OUTPUT_LABEL (asm_out_file, name);
10921 DECL_INITIAL (decl) = make_node (BLOCK);
10922 current_function_decl = decl;
10923 allocate_struct_function (decl, false);
10924 init_function_start (decl);
10925 /* We're about to hide the function body from callees of final_* by
10926 emitting it directly; tell them we're a thunk, if they care. */
10927 cfun->is_thunk = true;
10928 first_function_block_is_cold = false;
10929 /* Make sure unwind info is emitted for the thunk if needed. */
10930 final_start_function (emit_barrier (), asm_out_file, 1);
10932 output_indirect_thunk (need_prefix, regno);
10934 final_end_function ();
10935 init_insn_lengths ();
10936 free_after_compilation (cfun);
10937 set_cfun (NULL);
10938 current_function_decl = NULL;
10941 static int pic_labels_used;
10943 /* Fills in the label name that should be used for a pc thunk for
10944 the given register. */
10946 static void
10947 get_pc_thunk_name (char name[32], unsigned int regno)
10949 gcc_assert (!TARGET_64BIT);
10951 if (USE_HIDDEN_LINKONCE)
10952 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
10953 else
10954 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
10958 /* This function generates code for -fpic that loads %ebx with
10959 the return address of the caller and then returns. */
10961 static void
10962 ix86_code_end (void)
10964 rtx xops[2];
10965 unsigned int regno;
10967 if (indirect_return_needed)
10968 output_indirect_thunk_function (indirect_thunk_prefix_none,
10969 INVALID_REGNUM, true);
10970 if (indirect_return_bnd_needed)
10971 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
10972 INVALID_REGNUM, true);
10974 if (indirect_return_via_cx)
10975 output_indirect_thunk_function (indirect_thunk_prefix_none,
10976 CX_REG, true);
10977 if (indirect_return_via_cx_bnd)
10978 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
10979 CX_REG, true);
10981 if (indirect_thunk_needed)
10982 output_indirect_thunk_function (indirect_thunk_prefix_none,
10983 INVALID_REGNUM, false);
10984 if (indirect_thunk_bnd_needed)
10985 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
10986 INVALID_REGNUM, false);
10988 for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++)
10990 unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1;
10991 if ((indirect_thunks_used & (1 << i)))
10992 output_indirect_thunk_function (indirect_thunk_prefix_none,
10993 regno, false);
10995 if ((indirect_thunks_bnd_used & (1 << i)))
10996 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
10997 regno, false);
11000 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
11002 char name[32];
11003 tree decl;
11005 if ((indirect_thunks_used & (1 << regno)))
11006 output_indirect_thunk_function (indirect_thunk_prefix_none,
11007 regno, false);
11009 if ((indirect_thunks_bnd_used & (1 << regno)))
11010 output_indirect_thunk_function (indirect_thunk_prefix_bnd,
11011 regno, false);
11013 if (!(pic_labels_used & (1 << regno)))
11014 continue;
11016 get_pc_thunk_name (name, regno);
11018 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
11019 get_identifier (name),
11020 build_function_type_list (void_type_node, NULL_TREE));
11021 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
11022 NULL_TREE, void_type_node);
11023 TREE_PUBLIC (decl) = 1;
11024 TREE_STATIC (decl) = 1;
11025 DECL_IGNORED_P (decl) = 1;
11027 #if TARGET_MACHO
11028 if (TARGET_MACHO)
11030 switch_to_section (darwin_sections[picbase_thunk_section]);
11031 fputs ("\t.weak_definition\t", asm_out_file);
11032 assemble_name (asm_out_file, name);
11033 fputs ("\n\t.private_extern\t", asm_out_file);
11034 assemble_name (asm_out_file, name);
11035 putc ('\n', asm_out_file);
11036 ASM_OUTPUT_LABEL (asm_out_file, name);
11037 DECL_WEAK (decl) = 1;
11039 else
11040 #endif
11041 if (USE_HIDDEN_LINKONCE)
11043 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
11045 targetm.asm_out.unique_section (decl, 0);
11046 switch_to_section (get_named_section (decl, NULL, 0));
11048 targetm.asm_out.globalize_label (asm_out_file, name);
11049 fputs ("\t.hidden\t", asm_out_file);
11050 assemble_name (asm_out_file, name);
11051 putc ('\n', asm_out_file);
11052 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
11054 else
11056 switch_to_section (text_section);
11057 ASM_OUTPUT_LABEL (asm_out_file, name);
11060 DECL_INITIAL (decl) = make_node (BLOCK);
11061 current_function_decl = decl;
11062 allocate_struct_function (decl, false);
11063 init_function_start (decl);
11064 /* We're about to hide the function body from callees of final_* by
11065 emitting it directly; tell them we're a thunk, if they care. */
11066 cfun->is_thunk = true;
11067 first_function_block_is_cold = false;
11068 /* Make sure unwind info is emitted for the thunk if needed. */
11069 final_start_function (emit_barrier (), asm_out_file, 1);
11071 /* Pad stack IP move with 4 instructions (two NOPs count
11072 as one instruction). */
11073 if (TARGET_PAD_SHORT_FUNCTION)
11075 int i = 8;
11077 while (i--)
11078 fputs ("\tnop\n", asm_out_file);
11081 xops[0] = gen_rtx_REG (Pmode, regno);
11082 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
11083 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
11084 output_asm_insn ("%!ret", NULL);
11085 final_end_function ();
11086 init_insn_lengths ();
11087 free_after_compilation (cfun);
11088 set_cfun (NULL);
11089 current_function_decl = NULL;
11092 if (flag_split_stack)
11093 file_end_indicate_split_stack ();
11096 /* Emit code for the SET_GOT patterns. */
11098 const char *
11099 output_set_got (rtx dest, rtx label)
11101 rtx xops[3];
11103 xops[0] = dest;
11105 if (TARGET_VXWORKS_RTP && flag_pic)
11107 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
11108 xops[2] = gen_rtx_MEM (Pmode,
11109 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
11110 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
11112 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
11113 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
11114 an unadorned address. */
11115 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
11116 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
11117 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
11118 return "";
11121 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
11123 if (flag_pic)
11125 char name[32];
11126 get_pc_thunk_name (name, REGNO (dest));
11127 pic_labels_used |= 1 << REGNO (dest);
11129 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
11130 xops[2] = gen_rtx_MEM (QImode, xops[2]);
11131 output_asm_insn ("%!call\t%X2", xops);
11133 #if TARGET_MACHO
11134 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
11135 This is what will be referenced by the Mach-O PIC subsystem. */
11136 if (machopic_should_output_picbase_label () || !label)
11137 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
11139 /* When we are restoring the pic base at the site of a nonlocal label,
11140 and we decided to emit the pic base above, we will still output a
11141 local label used for calculating the correction offset (even though
11142 the offset will be 0 in that case). */
11143 if (label)
11144 targetm.asm_out.internal_label (asm_out_file, "L",
11145 CODE_LABEL_NUMBER (label));
11146 #endif
11148 else
11150 if (TARGET_MACHO)
11151 /* We don't need a pic base, we're not producing pic. */
11152 gcc_unreachable ();
11154 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
11155 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
11156 targetm.asm_out.internal_label (asm_out_file, "L",
11157 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
11160 if (!TARGET_MACHO)
11161 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
11163 return "";
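/* As a rough illustration only (the exact thunk name and relocation
   spelling depend on the object format), the 32-bit ELF PIC case above
   expands to something like:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   i.e. a call to the per-register PC thunk followed by adding the GOT
   displacement into DEST.  */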
11166 /* Generate a "push" pattern for input ARG. */
11168 static rtx
11169 gen_push (rtx arg)
11171 struct machine_function *m = cfun->machine;
11173 if (m->fs.cfa_reg == stack_pointer_rtx)
11174 m->fs.cfa_offset += UNITS_PER_WORD;
11175 m->fs.sp_offset += UNITS_PER_WORD;
11177 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11178 arg = gen_rtx_REG (word_mode, REGNO (arg));
11180 return gen_rtx_SET (gen_rtx_MEM (word_mode,
11181 gen_rtx_PRE_DEC (Pmode,
11182 stack_pointer_rtx)),
11183 arg);
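/* For illustration, on a 64-bit target (where both Pmode and word_mode
   are DImode) the pattern returned above for a register ARG is roughly:

     (set (mem:DI (pre_dec:DI (reg:DI sp)))
	  (reg:DI arg))

   i.e. a word-mode store through a pre-decremented stack pointer.  */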
11186 /* Generate a "pop" pattern for input ARG. */
11188 static rtx
11189 gen_pop (rtx arg)
11191 if (REG_P (arg) && GET_MODE (arg) != word_mode)
11192 arg = gen_rtx_REG (word_mode, REGNO (arg));
11194 return gen_rtx_SET (arg,
11195 gen_rtx_MEM (word_mode,
11196 gen_rtx_POST_INC (Pmode,
11197 stack_pointer_rtx)));
11200 /* Return >= 0 if there is an unused call-clobbered register available
11201 for the entire function. */
11203 static unsigned int
11204 ix86_select_alt_pic_regnum (void)
11206 if (ix86_use_pseudo_pic_reg ())
11207 return INVALID_REGNUM;
11209 if (crtl->is_leaf
11210 && !crtl->profile
11211 && !ix86_current_function_calls_tls_descriptor)
11213 int i, drap;
11214 /* Can't use the same register for both PIC and DRAP. */
11215 if (crtl->drap_reg)
11216 drap = REGNO (crtl->drap_reg);
11217 else
11218 drap = -1;
11219 for (i = 2; i >= 0; --i)
11220 if (i != drap && !df_regs_ever_live_p (i))
11221 return i;
11224 return INVALID_REGNUM;
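/* Note: hard registers 0, 1 and 2 scanned above correspond to the
   call-clobbered %eax, %edx and %ecx in this port's register numbering,
   so the PIC register can only be replaced by one of those when it is
   otherwise unused in a leaf function.  */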
11227 /* Return true if REGNO is used by the epilogue. */
11229 bool
11230 ix86_epilogue_uses (int regno)
11232 /* If there are no caller-saved registers, we preserve all registers,
11233 except for MMX and x87 registers which aren't supported when saving
11234 and restoring registers. Don't explicitly save SP register since
11235 it is always preserved. */
11236 return (epilogue_completed
11237 && cfun->machine->no_caller_saved_registers
11238 && !fixed_regs[regno]
11239 && !STACK_REGNO_P (regno)
11240 && !MMX_REGNO_P (regno));
11243 /* Return nonzero if register REGNO can be used as a scratch register
11244 in peephole2. */
11246 static bool
11247 ix86_hard_regno_scratch_ok (unsigned int regno)
11249 /* If there are no caller-saved registers, we can't use any register
11250 as a scratch register after the epilogue, and we use REGNO as a
11251 scratch register only if it has already been used, to avoid saving
11252 and restoring it. */
11253 return (!cfun->machine->no_caller_saved_registers
11254 || (!epilogue_completed
11255 && df_regs_ever_live_p (regno)));
11258 /* Return true if register class CL should be an additional allocno
11259 class. */
11261 static bool
11262 ix86_additional_allocno_class_p (reg_class_t cl)
11264 return cl == MOD4_SSE_REGS;
11267 /* Return TRUE if we need to save REGNO. */
11269 static bool
11270 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
11272 /* If there are no caller-saved registers, we preserve all registers,
11273 except for MMX and x87 registers which aren't supported when saving
11274 and restoring registers. Don't explicitly save SP register since
11275 it is always preserved. */
11276 if (cfun->machine->no_caller_saved_registers)
11278 /* Don't preserve registers used for function return value. */
11279 rtx reg = crtl->return_rtx;
11280 if (reg)
11282 unsigned int i = REGNO (reg);
11283 unsigned int nregs = REG_NREGS (reg);
11284 while (nregs-- > 0)
11285 if ((i + nregs) == regno)
11286 return false;
11288 reg = crtl->return_bnd;
11289 if (reg)
11291 i = REGNO (reg);
11292 nregs = REG_NREGS (reg);
11293 while (nregs-- > 0)
11294 if ((i + nregs) == regno)
11295 return false;
11299 return (df_regs_ever_live_p (regno)
11300 && !fixed_regs[regno]
11301 && !STACK_REGNO_P (regno)
11302 && !MMX_REGNO_P (regno)
11303 && (regno != HARD_FRAME_POINTER_REGNUM
11304 || !frame_pointer_needed));
11307 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
11308 && pic_offset_table_rtx)
11310 if (ix86_use_pseudo_pic_reg ())
11312 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
11313 _mcount in prologue. */
11314 if (!TARGET_64BIT && flag_pic && crtl->profile)
11315 return true;
11317 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
11318 || crtl->profile
11319 || crtl->calls_eh_return
11320 || crtl->uses_const_pool
11321 || cfun->has_nonlocal_label)
11322 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
11325 if (crtl->calls_eh_return && maybe_eh_return)
11327 unsigned i;
11328 for (i = 0; ; i++)
11330 unsigned test = EH_RETURN_DATA_REGNO (i);
11331 if (test == INVALID_REGNUM)
11332 break;
11333 if (test == regno)
11334 return true;
11338 if (ignore_outlined && cfun->machine->call_ms2sysv)
11340 unsigned count = cfun->machine->call_ms2sysv_extra_regs
11341 + xlogue_layout::MIN_REGS;
11342 if (xlogue_layout::is_stub_managed_reg (regno, count))
11343 return false;
11346 if (crtl->drap_reg
11347 && regno == REGNO (crtl->drap_reg)
11348 && !cfun->machine->no_drap_save_restore)
11349 return true;
11351 return (df_regs_ever_live_p (regno)
11352 && !call_used_regs[regno]
11353 && !fixed_regs[regno]
11354 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
11357 /* Return the number of saved general purpose registers. */
11359 static int
11360 ix86_nsaved_regs (void)
11362 int nregs = 0;
11363 int regno;
11365 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11366 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11367 nregs ++;
11368 return nregs;
11371 /* Return number of saved SSE registers. */
11373 static int
11374 ix86_nsaved_sseregs (void)
11376 int nregs = 0;
11377 int regno;
11379 if (!TARGET_64BIT_MS_ABI)
11380 return 0;
11381 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11382 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11383 nregs ++;
11384 return nregs;
11387 /* Given FROM and TO register numbers, say whether this elimination is
11388 allowed. If stack alignment is needed, we can only replace argument
11389 pointer with hard frame pointer, or replace frame pointer with stack
11390 pointer. Otherwise, frame pointer elimination is automatically
11391 handled and all other eliminations are valid. */
11393 static bool
11394 ix86_can_eliminate (const int from, const int to)
11396 if (stack_realign_fp)
11397 return ((from == ARG_POINTER_REGNUM
11398 && to == HARD_FRAME_POINTER_REGNUM)
11399 || (from == FRAME_POINTER_REGNUM
11400 && to == STACK_POINTER_REGNUM));
11401 else
11402 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
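/* Summarizing the logic above: when the frame is realigned off the hard
   frame pointer (stack_realign_fp), only "argument pointer -> hard frame
   pointer" and "soft frame pointer -> stack pointer" are allowed;
   otherwise any elimination is allowed, except that eliminating to the
   stack pointer requires that no frame pointer be needed.  */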
11405 /* Return the offset between two registers, one to be eliminated, and the other
11406 its replacement, at the start of a routine. */
11408 HOST_WIDE_INT
11409 ix86_initial_elimination_offset (int from, int to)
11411 struct ix86_frame &frame = cfun->machine->frame;
11413 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11414 return frame.hard_frame_pointer_offset;
11415 else if (from == FRAME_POINTER_REGNUM
11416 && to == HARD_FRAME_POINTER_REGNUM)
11417 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11418 else
11420 gcc_assert (to == STACK_POINTER_REGNUM);
11422 if (from == ARG_POINTER_REGNUM)
11423 return frame.stack_pointer_offset;
11425 gcc_assert (from == FRAME_POINTER_REGNUM);
11426 return frame.stack_pointer_offset - frame.frame_pointer_offset;
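/* Purely illustrative numbers: if frame.stack_pointer_offset were 96 and
   frame.frame_pointer_offset were 32, eliminating the argument pointer
   to the stack pointer would yield 96, while eliminating the soft frame
   pointer to the stack pointer would yield 96 - 32 = 64.  */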
11430 /* In a dynamically-aligned function, we can't know the offset from
11431 stack pointer to frame pointer, so we must ensure that setjmp
11432 eliminates fp against the hard fp (%ebp) rather than trying to
11433 index from %esp up to the top of the frame across a gap that is
11434 of unknown (at compile-time) size. */
11435 static rtx
11436 ix86_builtin_setjmp_frame_value (void)
11438 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11441 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11442 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11444 static bool warned_once = false;
11445 if (!warned_once)
11447 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11448 feature);
11449 warned_once = true;
11453 /* Return the probing interval for -fstack-clash-protection. */
11455 static HOST_WIDE_INT
11456 get_probe_interval (void)
11458 if (flag_stack_clash_protection)
11459 return (HOST_WIDE_INT_1U
11460 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
11461 else
11462 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
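/* With the default parameter settings both expressions above typically
   evaluate to 1 << 12, i.e. a 4096-byte probing interval; the value only
   differs if the stack-clash probe-interval parameter or the target's
   STACK_CHECK_PROBE_INTERVAL_EXP has been changed.  */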
11465 /* When using -fsplit-stack, the allocation routines set a field in
11466 the TCB to the bottom of the stack plus this much space, measured
11467 in bytes. */
11469 #define SPLIT_STACK_AVAILABLE 256
11471 /* Fill the ix86_frame structure describing the frame of the function currently being compiled. */
11473 static void
11474 ix86_compute_frame_layout (void)
11476 struct ix86_frame *frame = &cfun->machine->frame;
11477 struct machine_function *m = cfun->machine;
11478 unsigned HOST_WIDE_INT stack_alignment_needed;
11479 HOST_WIDE_INT offset;
11480 unsigned HOST_WIDE_INT preferred_alignment;
11481 HOST_WIDE_INT size = get_frame_size ();
11482 HOST_WIDE_INT to_allocate;
11484 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11485 * ms_abi functions that call a sysv function. We now need to prune away
11486 * cases where it should be disabled. */
11487 if (TARGET_64BIT && m->call_ms2sysv)
11489 gcc_assert (TARGET_64BIT_MS_ABI);
11490 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11491 gcc_assert (!TARGET_SEH);
11492 gcc_assert (TARGET_SSE);
11493 gcc_assert (!ix86_using_red_zone ());
11495 if (crtl->calls_eh_return)
11497 gcc_assert (!reload_completed);
11498 m->call_ms2sysv = false;
11499 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11502 else if (ix86_static_chain_on_stack)
11504 gcc_assert (!reload_completed);
11505 m->call_ms2sysv = false;
11506 warn_once_call_ms2sysv_xlogues ("static call chains");
11509 /* Finally, compute which registers the stub will manage. */
11510 else
11512 unsigned count = xlogue_layout::count_stub_managed_regs ();
11513 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11514 m->call_ms2sysv_pad_in = 0;
11518 frame->nregs = ix86_nsaved_regs ();
11519 frame->nsseregs = ix86_nsaved_sseregs ();
11521 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11522 except for function prologues, leaf functions and when the default
11523 incoming stack boundary is overridden at the command line or via
11524 force_align_arg_pointer attribute. */
11525 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11526 && (!crtl->is_leaf || cfun->calls_alloca != 0
11527 || ix86_current_function_calls_tls_descriptor
11528 || ix86_incoming_stack_boundary < 128))
11530 crtl->preferred_stack_boundary = 128;
11531 crtl->stack_alignment_needed = 128;
11534 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11535 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11537 gcc_assert (!size || stack_alignment_needed);
11538 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11539 gcc_assert (preferred_alignment <= stack_alignment_needed);
11541 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11542 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11543 if (TARGET_64BIT && m->call_ms2sysv)
11545 gcc_assert (stack_alignment_needed >= 16);
11546 gcc_assert (!frame->nsseregs);
11549 /* For SEH we have to limit the amount of code movement into the prologue.
11550 At present we do this via a BLOCKAGE, at which point there's very little
11551 scheduling that can be done, which means that there's very little point
11552 in doing anything except PUSHs. */
11553 if (TARGET_SEH)
11554 m->use_fast_prologue_epilogue = false;
11555 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11557 int count = frame->nregs;
11558 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11560 /* The fast prologue uses move instead of push to save registers. This
11561 is significantly longer, but also executes faster as modern hardware
11562 can execute the moves in parallel, but can't do that for push/pop.
11564 Be careful about choosing which prologue to emit: when the function
11565 takes many instructions to execute, we may as well use the slow
11566 version, and likewise when the function is known to be outside a hot
11567 spot (known only with profile feedback). Weight the size of the
11568 function by the number of registers to save, as it is cheap to use
11569 one or two push instructions but very slow to use many of them. */
11570 if (count)
11571 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11572 if (node->frequency < NODE_FREQUENCY_NORMAL
11573 || (flag_branch_probabilities
11574 && node->frequency < NODE_FREQUENCY_HOT))
11575 m->use_fast_prologue_epilogue = false;
11576 else
11577 m->use_fast_prologue_epilogue
11578 = !expensive_function_p (count);
11581 frame->save_regs_using_mov
11582 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11583 /* If static stack checking is enabled and done with probes,
11584 the registers need to be saved before allocating the frame. */
11585 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11587 /* Skip return address and error code in exception handler. */
11588 offset = INCOMING_FRAME_SP_OFFSET;
11590 /* Skip pushed static chain. */
11591 if (ix86_static_chain_on_stack)
11592 offset += UNITS_PER_WORD;
11594 /* Skip saved base pointer. */
11595 if (frame_pointer_needed)
11596 offset += UNITS_PER_WORD;
11597 frame->hfp_save_offset = offset;
11599 /* The traditional frame pointer location is at the top of the frame. */
11600 frame->hard_frame_pointer_offset = offset;
11602 /* Register save area */
11603 offset += frame->nregs * UNITS_PER_WORD;
11604 frame->reg_save_offset = offset;
11606 /* On SEH target, registers are pushed just before the frame pointer
11607 location. */
11608 if (TARGET_SEH)
11609 frame->hard_frame_pointer_offset = offset;
11611 /* Calculate the size of the va-arg area (not including padding, if any). */
11612 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11614 /* Also adjust stack_realign_offset for the largest alignment of
11615 stack slot actually used. */
11616 if (stack_realign_fp
11617 || (cfun->machine->max_used_stack_alignment != 0
11618 && (offset % cfun->machine->max_used_stack_alignment) != 0))
11620 /* We may need a 16-byte aligned stack for the remainder of the
11621 register save area, but the stack frame for the local function
11622 may require a greater alignment if using AVX/2/512. In order
11623 to avoid wasting space, we first calculate the space needed for
11624 the rest of the register saves, add that to the stack pointer,
11625 and then realign the stack to the boundary of the start of the
11626 frame for the local function. */
11627 HOST_WIDE_INT space_needed = 0;
11628 HOST_WIDE_INT sse_reg_space_needed = 0;
11630 if (TARGET_64BIT)
11632 if (m->call_ms2sysv)
11634 m->call_ms2sysv_pad_in = 0;
11635 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11638 else if (frame->nsseregs)
11639 /* The only ABI that has saved SSE registers (Win64) also has a
11640 16-byte aligned default stack. However, many programs violate
11641 the ABI, and Wine64 forces stack realignment to compensate. */
11642 space_needed = frame->nsseregs * 16;
11644 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11646 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11647 rounding to be pedantic. */
11648 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11650 else
11651 space_needed = frame->va_arg_size;
11653 /* Record the allocation size required prior to the realignment AND. */
11654 frame->stack_realign_allocate = space_needed;
11656 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11657 before this point are not directly comparable with values below
11658 this point. Use sp_valid_at to determine if the stack pointer is
11659 valid for a given offset, fp_valid_at for the frame pointer, or
11660 choose_baseaddr to have a base register chosen for you.
11662 Note that the result of (frame->stack_realign_offset
11663 & (stack_alignment_needed - 1)) may not equal zero. */
11664 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11665 frame->stack_realign_offset = offset - space_needed;
11666 frame->sse_reg_save_offset = frame->stack_realign_offset
11667 + sse_reg_space_needed;
11669 else
11671 frame->stack_realign_offset = offset;
11673 if (TARGET_64BIT && m->call_ms2sysv)
11675 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11676 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11679 /* Align and set SSE register save area. */
11680 else if (frame->nsseregs)
11682 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11683 required and the DRAP re-alignment boundary is at least 16 bytes,
11684 then we want the SSE register save area properly aligned. */
11685 if (ix86_incoming_stack_boundary >= 128
11686 || (stack_realign_drap && stack_alignment_needed >= 16))
11687 offset = ROUND_UP (offset, 16);
11688 offset += frame->nsseregs * 16;
11690 frame->sse_reg_save_offset = offset;
11691 offset += frame->va_arg_size;
11694 /* Align the start of the frame for the local function. When a function
11695 call is removed, it may become a leaf function. But if arguments may
11696 be passed on the stack, we need to align the stack when there is no
11697 tail call. */
11698 if (m->call_ms2sysv
11699 || frame->va_arg_size != 0
11700 || size != 0
11701 || !crtl->is_leaf
11702 || (!crtl->tail_call_emit
11703 && cfun->machine->outgoing_args_on_stack)
11704 || cfun->calls_alloca
11705 || ix86_current_function_calls_tls_descriptor)
11706 offset = ROUND_UP (offset, stack_alignment_needed);
11708 /* Frame pointer points here. */
11709 frame->frame_pointer_offset = offset;
11711 offset += size;
11713 /* Add outgoing arguments area. Can be skipped if we eliminated
11714 all the function calls as dead code.
11715 Skipping is however impossible when the function calls alloca, since
11716 the alloca expander assumes that the last crtl->outgoing_args_size
11717 bytes of the stack frame are unused. */
11718 if (ACCUMULATE_OUTGOING_ARGS
11719 && (!crtl->is_leaf || cfun->calls_alloca
11720 || ix86_current_function_calls_tls_descriptor))
11722 offset += crtl->outgoing_args_size;
11723 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11725 else
11726 frame->outgoing_arguments_size = 0;
11728 /* Align stack boundary. Only needed if we're calling another function
11729 or using alloca. */
11730 if (!crtl->is_leaf || cfun->calls_alloca
11731 || ix86_current_function_calls_tls_descriptor)
11732 offset = ROUND_UP (offset, preferred_alignment);
11734 /* We've reached end of stack frame. */
11735 frame->stack_pointer_offset = offset;
11737 /* Size prologue needs to allocate. */
11738 to_allocate = offset - frame->sse_reg_save_offset;
11740 if ((!to_allocate && frame->nregs <= 1)
11741 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
11742 /* If stack clash probing needs a loop, then it needs a
11743 scratch register. But the returned register is only guaranteed
11744 to be safe to use after register saves are complete. So if
11745 stack clash protections are enabled and the allocated frame is
11746 larger than the probe interval, then use pushes to save
11747 callee saved registers. */
11748 || (flag_stack_clash_protection && to_allocate > get_probe_interval ()))
11749 frame->save_regs_using_mov = false;
11751 if (ix86_using_red_zone ()
11752 && crtl->sp_is_unchanging
11753 && crtl->is_leaf
11754 && !ix86_pc_thunk_call_expanded
11755 && !ix86_current_function_calls_tls_descriptor)
11757 frame->red_zone_size = to_allocate;
11758 if (frame->save_regs_using_mov)
11759 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11760 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11761 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11763 else
11764 frame->red_zone_size = 0;
11765 frame->stack_pointer_offset -= frame->red_zone_size;
11767 /* The SEH frame pointer location is near the bottom of the frame.
11768 This is enforced by the fact that the difference between the
11769 stack pointer and the frame pointer is limited to 240 bytes in
11770 the unwind data structure. */
11771 if (TARGET_SEH)
11773 HOST_WIDE_INT diff;
11775 /* If we can leave the frame pointer where it is, do so. Also, returns
11776 the establisher frame for __builtin_frame_address (0). */
11777 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11778 if (diff <= SEH_MAX_FRAME_SIZE
11779 && (diff > 240 || (diff & 15) != 0)
11780 && !crtl->accesses_prior_frames)
11782 /* Ideally we'd determine what portion of the local stack frame
11783 (within the constraint of the lowest 240) is most heavily used.
11784 But without that complication, simply bias the frame pointer
11785 by 128 bytes so as to maximize the amount of the local stack
11786 frame that is addressable with 8-bit offsets. */
11787 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
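/* A rough sketch of the layout computed above, from higher to lower
   addresses (illustrative only; SEH, the red zone and stack realignment
   adjust some of these offsets):

	return address (plus error code for exception handlers)
	pushed static chain, if any
	saved frame pointer
	GP register save area		ends at reg_save_offset
	SSE register save area		ends at sse_reg_save_offset
	va_arg register save area
	local variables			start at frame_pointer_offset
	outgoing argument area
					stack_pointer_offset

   where each offset is a distance from the top of the frame, so larger
   values are closer to the final stack pointer.  */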
11792 /* This is semi-inlined memory_address_length, but simplified
11793 since we know that we're always dealing with reg+offset, and
11794 to avoid having to create and discard all that rtl. */
11796 static inline int
11797 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11799 int len = 4;
11801 if (offset == 0)
11803 /* EBP and R13 cannot be encoded without an offset. */
11804 len = (regno == BP_REG || regno == R13_REG);
11806 else if (IN_RANGE (offset, -128, 127))
11807 len = 1;
11809 /* ESP and R12 must be encoded with a SIB byte. */
11810 if (regno == SP_REG || regno == R12_REG)
11811 len++;
11813 return len;
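/* A few worked examples of the byte counts returned above (displacement
   plus SIB byte only, not the whole instruction):

     0(%rax)   -> 0   no displacement byte needed
     0(%rbp)   -> 1   %rbp/%r13 always need at least a disp8
     8(%rcx)   -> 1   fits in a signed 8-bit displacement
     8(%rsp)   -> 2   disp8 plus the SIB byte that %rsp/%r12 require
     512(%rcx) -> 4   needs a 32-bit displacement  */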
11816 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11817 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11819 static bool
11820 sp_valid_at (HOST_WIDE_INT cfa_offset)
11822 const struct machine_frame_state &fs = cfun->machine->fs;
11823 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11825 /* Validate that the cfa_offset isn't in a "no-man's land". */
11826 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11827 return false;
11829 return fs.sp_valid;
11832 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11833 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11835 static inline bool
11836 fp_valid_at (HOST_WIDE_INT cfa_offset)
11838 const struct machine_frame_state &fs = cfun->machine->fs;
11839 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11841 /* Validate that the cfa_offset isn't in a "no-man's land". */
11842 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11843 return false;
11845 return fs.fp_valid;
11848 /* Choose a base register based upon alignment requested, speed and/or
11849 size. */
11851 static void
11852 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11853 HOST_WIDE_INT &base_offset,
11854 unsigned int align_reqested, unsigned int *align)
11856 const struct machine_function *m = cfun->machine;
11857 unsigned int hfp_align;
11858 unsigned int drap_align;
11859 unsigned int sp_align;
11860 bool hfp_ok = fp_valid_at (cfa_offset);
11861 bool drap_ok = m->fs.drap_valid;
11862 bool sp_ok = sp_valid_at (cfa_offset);
11864 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11866 /* Filter out any registers that don't meet the requested alignment
11867 criteria. */
11868 if (align_reqested)
11870 if (m->fs.realigned)
11871 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11872 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11873 notes (which we would need to use a realigned stack pointer),
11874 so disable on SEH targets. */
11875 else if (m->fs.sp_realigned)
11876 sp_align = crtl->stack_alignment_needed;
11878 hfp_ok = hfp_ok && hfp_align >= align_reqested;
11879 drap_ok = drap_ok && drap_align >= align_reqested;
11880 sp_ok = sp_ok && sp_align >= align_reqested;
11883 if (m->use_fast_prologue_epilogue)
11885 /* Choose the base register most likely to allow the most scheduling
11886 opportunities. Generally FP is valid throughout the function,
11887 while DRAP must be reloaded within the epilogue. But choose either
11888 over the SP due to increased encoding size. */
11890 if (hfp_ok)
11892 base_reg = hard_frame_pointer_rtx;
11893 base_offset = m->fs.fp_offset - cfa_offset;
11895 else if (drap_ok)
11897 base_reg = crtl->drap_reg;
11898 base_offset = 0 - cfa_offset;
11900 else if (sp_ok)
11902 base_reg = stack_pointer_rtx;
11903 base_offset = m->fs.sp_offset - cfa_offset;
11906 else
11908 HOST_WIDE_INT toffset;
11909 int len = 16, tlen;
11911 /* Choose the base register with the smallest address encoding.
11912 With a tie, choose FP > DRAP > SP. */
11913 if (sp_ok)
11915 base_reg = stack_pointer_rtx;
11916 base_offset = m->fs.sp_offset - cfa_offset;
11917 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
11919 if (drap_ok)
11921 toffset = 0 - cfa_offset;
11922 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
11923 if (tlen <= len)
11925 base_reg = crtl->drap_reg;
11926 base_offset = toffset;
11927 len = tlen;
11930 if (hfp_ok)
11932 toffset = m->fs.fp_offset - cfa_offset;
11933 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
11934 if (tlen <= len)
11936 base_reg = hard_frame_pointer_rtx;
11937 base_offset = toffset;
11938 len = tlen;
11943 /* Set the align return value. */
11944 if (align)
11946 if (base_reg == stack_pointer_rtx)
11947 *align = sp_align;
11948 else if (base_reg == crtl->drap_reg)
11949 *align = drap_align;
11950 else if (base_reg == hard_frame_pointer_rtx)
11951 *align = hfp_align;
11955 /* Return an RTX that points to CFA_OFFSET within the stack frame and
11956 the alignment of address. If ALIGN is non-null, it should point to
11957 an alignment value (in bits) that is preferred or zero and will
11958 receive the alignment of the base register that was selected,
11959 irrespective of whether or not CFA_OFFSET is a multiple of that
11960 alignment value. If it is possible for the base register offset to be
11961 non-immediate then SCRATCH_REGNO should specify a scratch register to
11962 use.
11964 The valid base registers are taken from CFUN->MACHINE->FS. */
11966 static rtx
11967 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
11968 unsigned int scratch_regno = INVALID_REGNUM)
11970 rtx base_reg = NULL;
11971 HOST_WIDE_INT base_offset = 0;
11973 /* If a specific alignment is requested, try to get a base register
11974 with that alignment first. */
11975 if (align && *align)
11976 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
11978 if (!base_reg)
11979 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
11981 gcc_assert (base_reg != NULL);
11983 rtx base_offset_rtx = GEN_INT (base_offset);
11985 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
11987 gcc_assert (scratch_regno != INVALID_REGNUM);
11989 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11990 emit_move_insn (scratch_reg, base_offset_rtx);
11992 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
11995 return plus_constant (Pmode, base_reg, base_offset);
11998 /* Emit code to save registers in the prologue. */
12000 static void
12001 ix86_emit_save_regs (void)
12003 unsigned int regno;
12004 rtx_insn *insn;
12006 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
12007 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12009 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
12010 RTX_FRAME_RELATED_P (insn) = 1;
12014 /* Emit a single register save at CFA - CFA_OFFSET. */
12016 static void
12017 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
12018 HOST_WIDE_INT cfa_offset)
12020 struct machine_function *m = cfun->machine;
12021 rtx reg = gen_rtx_REG (mode, regno);
12022 rtx mem, addr, base, insn;
12023 unsigned int align = GET_MODE_ALIGNMENT (mode);
12025 addr = choose_baseaddr (cfa_offset, &align);
12026 mem = gen_frame_mem (mode, addr);
12028 /* The location alignment depends upon the base register. */
12029 align = MIN (GET_MODE_ALIGNMENT (mode), align);
12030 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
12031 set_mem_align (mem, align);
12033 insn = emit_insn (gen_rtx_SET (mem, reg));
12034 RTX_FRAME_RELATED_P (insn) = 1;
12036 base = addr;
12037 if (GET_CODE (base) == PLUS)
12038 base = XEXP (base, 0);
12039 gcc_checking_assert (REG_P (base));
12041 /* When saving registers into a re-aligned local stack frame, avoid
12042 any tricky guessing by dwarf2out. */
12043 if (m->fs.realigned)
12045 gcc_checking_assert (stack_realign_drap);
12047 if (regno == REGNO (crtl->drap_reg))
12049 /* A bit of a hack. We force the DRAP register to be saved in
12050 the re-aligned stack frame, which provides us with a copy
12051 of the CFA that will last past the prologue. Install it. */
12052 gcc_checking_assert (cfun->machine->fs.fp_valid);
12053 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12054 cfun->machine->fs.fp_offset - cfa_offset);
12055 mem = gen_rtx_MEM (mode, addr);
12056 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
12058 else
12060 /* The frame pointer is a stable reference within the
12061 aligned frame. Use it. */
12062 gcc_checking_assert (cfun->machine->fs.fp_valid);
12063 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
12064 cfun->machine->fs.fp_offset - cfa_offset);
12065 mem = gen_rtx_MEM (mode, addr);
12066 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12070 else if (base == stack_pointer_rtx && m->fs.sp_realigned
12071 && cfa_offset >= m->fs.sp_realigned_offset)
12073 gcc_checking_assert (stack_realign_fp);
12074 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
12077 /* The memory may not be relative to the current CFA register,
12078 which means that we may need to generate a new pattern for
12079 use by the unwind info. */
12080 else if (base != m->fs.cfa_reg)
12082 addr = plus_constant (Pmode, m->fs.cfa_reg,
12083 m->fs.cfa_offset - cfa_offset);
12084 mem = gen_rtx_MEM (mode, addr);
12085 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
12089 /* Emit code to save registers using MOV insns.
12090 First register is stored at CFA - CFA_OFFSET. */
12091 static void
12092 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
12094 unsigned int regno;
12096 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12097 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12099 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
12100 cfa_offset -= UNITS_PER_WORD;
12104 /* Emit code to save SSE registers using MOV insns.
12105 First register is stored at CFA - CFA_OFFSET. */
12106 static void
12107 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
12109 unsigned int regno;
12111 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
12112 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
12114 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
12115 cfa_offset -= GET_MODE_SIZE (V4SFmode);
12119 static GTY(()) rtx queued_cfa_restores;
12121 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
12122 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
12123 Don't add the note if the previously saved value will be left untouched
12124 within the stack red zone until return, as unwinders can find the same value
12125 in the register and on the stack. */
12127 static void
12128 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
12130 if (!crtl->shrink_wrapped
12131 && cfa_offset <= cfun->machine->fs.red_zone_offset)
12132 return;
12134 if (insn)
12136 add_reg_note (insn, REG_CFA_RESTORE, reg);
12137 RTX_FRAME_RELATED_P (insn) = 1;
12139 else
12140 queued_cfa_restores
12141 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
12144 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
12146 static void
12147 ix86_add_queued_cfa_restore_notes (rtx insn)
12149 rtx last;
12150 if (!queued_cfa_restores)
12151 return;
12152 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
12154 XEXP (last, 1) = REG_NOTES (insn);
12155 REG_NOTES (insn) = queued_cfa_restores;
12156 queued_cfa_restores = NULL_RTX;
12157 RTX_FRAME_RELATED_P (insn) = 1;
12160 /* Expand prologue or epilogue stack adjustment.
12161 The pattern exists to put a dependency on all ebp-based memory accesses.
12162 STYLE should be negative if instructions should be marked as frame related,
12163 zero if %r11 register is live and cannot be freely used and positive
12164 otherwise. */
12166 static rtx
12167 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
12168 int style, bool set_cfa)
12170 struct machine_function *m = cfun->machine;
12171 rtx insn;
12172 bool add_frame_related_expr = false;
12174 if (Pmode == SImode)
12175 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
12176 else if (x86_64_immediate_operand (offset, DImode))
12177 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
12178 else
12180 rtx tmp;
12181 /* r11 is used by indirect sibcall return as well, set before the
12182 epilogue and used after the epilogue. */
12183 if (style)
12184 tmp = gen_rtx_REG (DImode, R11_REG);
12185 else
12187 gcc_assert (src != hard_frame_pointer_rtx
12188 && dest != hard_frame_pointer_rtx);
12189 tmp = hard_frame_pointer_rtx;
12191 insn = emit_insn (gen_rtx_SET (tmp, offset));
12192 if (style < 0)
12193 add_frame_related_expr = true;
12195 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
12198 insn = emit_insn (insn);
12199 if (style >= 0)
12200 ix86_add_queued_cfa_restore_notes (insn);
12202 if (set_cfa)
12204 rtx r;
12206 gcc_assert (m->fs.cfa_reg == src);
12207 m->fs.cfa_offset += INTVAL (offset);
12208 m->fs.cfa_reg = dest;
12210 r = gen_rtx_PLUS (Pmode, src, offset);
12211 r = gen_rtx_SET (dest, r);
12212 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
12213 RTX_FRAME_RELATED_P (insn) = 1;
12215 else if (style < 0)
12217 RTX_FRAME_RELATED_P (insn) = 1;
12218 if (add_frame_related_expr)
12220 rtx r = gen_rtx_PLUS (Pmode, src, offset);
12221 r = gen_rtx_SET (dest, r);
12222 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
12226 if (dest == stack_pointer_rtx)
12228 HOST_WIDE_INT ooffset = m->fs.sp_offset;
12229 bool valid = m->fs.sp_valid;
12230 bool realigned = m->fs.sp_realigned;
12232 if (src == hard_frame_pointer_rtx)
12234 valid = m->fs.fp_valid;
12235 realigned = false;
12236 ooffset = m->fs.fp_offset;
12238 else if (src == crtl->drap_reg)
12240 valid = m->fs.drap_valid;
12241 realigned = false;
12242 ooffset = 0;
12244 else
12246 /* Else there are two possibilities: SP itself, which we set
12247 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
12248 taken care of by hand along the eh_return path. */
12249 gcc_checking_assert (src == stack_pointer_rtx
12250 || offset == const0_rtx);
12253 m->fs.sp_offset = ooffset - INTVAL (offset);
12254 m->fs.sp_valid = valid;
12255 m->fs.sp_realigned = realigned;
12257 return insn;
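/* Typical usage sketch (mirroring the calls further below): allocating
   SIZE bytes of frame in the prologue is done with

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				GEN_INT (-SIZE), -1,
				m->fs.cfa_reg == stack_pointer_rtx);

   i.e. SP = SP - SIZE, marked frame-related, updating the CFA tracking
   when the stack pointer is currently the CFA register.  */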
12260 /* Find an available register to be used as dynamic realign argument
12261 pointer register. Such a register will be written in the prologue and
12262 used at the beginning of the body, so it must not be
12263 1. parameter passing register.
12264 2. GOT pointer.
12265 We reuse static-chain register if it is available. Otherwise, we
12266 use DI for i386 and R13 for x86-64. We chose R13 since it has
12267 shorter encoding.
12269 Return: the regno of chosen register. */
12271 static unsigned int
12272 find_drap_reg (void)
12274 tree decl = cfun->decl;
12276 /* Always use callee-saved register if there are no caller-saved
12277 registers. */
12278 if (TARGET_64BIT)
12280 /* Use R13 for a nested function or a function that needs a static
12281 chain. Since a function with a tail call may use any caller-saved
12282 register in the epilogue, DRAP must not use a caller-saved
12283 register in that case. */
12284 if (DECL_STATIC_CHAIN (decl)
12285 || cfun->machine->no_caller_saved_registers
12286 || crtl->tail_call_emit)
12287 return R13_REG;
12289 return R10_REG;
12291 else
12293 /* Use DI for a nested function or a function that needs a static
12294 chain. Since a function with a tail call may use any caller-saved
12295 register in the epilogue, DRAP must not use a caller-saved
12296 register in that case. */
12297 if (DECL_STATIC_CHAIN (decl)
12298 || cfun->machine->no_caller_saved_registers
12299 || crtl->tail_call_emit)
12300 return DI_REG;
12302 /* Reuse static chain register if it isn't used for parameter
12303 passing. */
12304 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
12306 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
12307 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
12308 return CX_REG;
12310 return DI_REG;
12314 /* Handle a "force_align_arg_pointer" attribute. */
12316 static tree
12317 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
12318 tree, int, bool *no_add_attrs)
12320 if (TREE_CODE (*node) != FUNCTION_TYPE
12321 && TREE_CODE (*node) != METHOD_TYPE
12322 && TREE_CODE (*node) != FIELD_DECL
12323 && TREE_CODE (*node) != TYPE_DECL)
12325 warning (OPT_Wattributes, "%qE attribute only applies to functions",
12326 name);
12327 *no_add_attrs = true;
12330 return NULL_TREE;
12333 /* Return minimum incoming stack alignment. */
12335 static unsigned int
12336 ix86_minimum_incoming_stack_boundary (bool sibcall)
12338 unsigned int incoming_stack_boundary;
12340 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
12341 if (cfun->machine->func_type != TYPE_NORMAL)
12342 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
12343 /* Prefer the one specified at command line. */
12344 else if (ix86_user_incoming_stack_boundary)
12345 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
12346 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack
12347 boundary when -mstackrealign is used, this isn't a sibcall check,
12348 and the estimated stack alignment is 128 bits. */
12349 else if (!sibcall
12350 && ix86_force_align_arg_pointer
12351 && crtl->stack_alignment_estimated == 128)
12352 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12353 else
12354 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
12356 /* Incoming stack alignment can be changed on individual functions
12357 via force_align_arg_pointer attribute. We use the smallest
12358 incoming stack boundary. */
12359 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
12360 && lookup_attribute (ix86_force_align_arg_pointer_string,
12361 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
12362 incoming_stack_boundary = MIN_STACK_BOUNDARY;
12364 /* The incoming stack frame has to be aligned at least at
12365 parm_stack_boundary. */
12366 if (incoming_stack_boundary < crtl->parm_stack_boundary)
12367 incoming_stack_boundary = crtl->parm_stack_boundary;
12369 /* Stack at entrance of main is aligned by runtime. We use the
12370 smallest incoming stack boundary. */
12371 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
12372 && DECL_NAME (current_function_decl)
12373 && MAIN_NAME_P (DECL_NAME (current_function_decl))
12374 && DECL_FILE_SCOPE_P (current_function_decl))
12375 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
12377 return incoming_stack_boundary;
12380 /* Update incoming stack boundary and estimated stack alignment. */
12382 static void
12383 ix86_update_stack_boundary (void)
12385 ix86_incoming_stack_boundary
12386 = ix86_minimum_incoming_stack_boundary (false);
12388 /* x86_64 vararg needs 16byte stack alignment for register save
12389 area. */
12390 if (TARGET_64BIT
12391 && cfun->stdarg
12392 && crtl->stack_alignment_estimated < 128)
12393 crtl->stack_alignment_estimated = 128;
12395 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
12396 if (ix86_tls_descriptor_calls_expanded_in_cfun
12397 && crtl->preferred_stack_boundary < 128)
12398 crtl->preferred_stack_boundary = 128;
12401 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
12402 needed or an rtx for DRAP otherwise. */
12404 static rtx
12405 ix86_get_drap_rtx (void)
12407 /* We must use DRAP if there are outgoing arguments on stack and
12408 ACCUMULATE_OUTGOING_ARGS is false. */
12409 if (ix86_force_drap
12410 || (cfun->machine->outgoing_args_on_stack
12411 && !ACCUMULATE_OUTGOING_ARGS))
12412 crtl->need_drap = true;
12414 if (stack_realign_drap)
12416 /* Assign DRAP to vDRAP and return vDRAP. */
12417 unsigned int regno = find_drap_reg ();
12418 rtx drap_vreg;
12419 rtx arg_ptr;
12420 rtx_insn *seq, *insn;
12422 arg_ptr = gen_rtx_REG (Pmode, regno);
12423 crtl->drap_reg = arg_ptr;
12425 start_sequence ();
12426 drap_vreg = copy_to_reg (arg_ptr);
12427 seq = get_insns ();
12428 end_sequence ();
12430 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12431 if (!optimize)
12433 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12434 RTX_FRAME_RELATED_P (insn) = 1;
12436 return drap_vreg;
12438 else
12439 return NULL;
12442 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12444 static rtx
12445 ix86_internal_arg_pointer (void)
12447 return virtual_incoming_args_rtx;
12450 struct scratch_reg {
12451 rtx reg;
12452 bool saved;
12455 /* Return a short-lived scratch register for use on function entry.
12456 In 32-bit mode, it is valid only after the registers are saved
12457 in the prologue. This register must be released by means of
12458 release_scratch_register_on_entry once it is dead. */
12460 static void
12461 get_scratch_register_on_entry (struct scratch_reg *sr)
12463 int regno;
12465 sr->saved = false;
12467 if (TARGET_64BIT)
12469 /* We always use R11 in 64-bit mode. */
12470 regno = R11_REG;
12472 else
12474 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12475 bool fastcall_p
12476 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12477 bool thiscall_p
12478 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12479 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12480 int regparm = ix86_function_regparm (fntype, decl);
12481 int drap_regno
12482 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12484 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12485 for the static chain register. */
12486 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12487 && drap_regno != AX_REG)
12488 regno = AX_REG;
12489 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12490 for the static chain register. */
12491 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12492 regno = AX_REG;
12493 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12494 regno = DX_REG;
12495 /* ecx is the static chain register. */
12496 else if (regparm < 3 && !fastcall_p && !thiscall_p
12497 && !static_chain_p
12498 && drap_regno != CX_REG)
12499 regno = CX_REG;
12500 else if (ix86_save_reg (BX_REG, true, false))
12501 regno = BX_REG;
12502 /* esi is the static chain register. */
12503 else if (!(regparm == 3 && static_chain_p)
12504 && ix86_save_reg (SI_REG, true, false))
12505 regno = SI_REG;
12506 else if (ix86_save_reg (DI_REG, true, false))
12507 regno = DI_REG;
12508 else
12510 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12511 sr->saved = true;
12515 sr->reg = gen_rtx_REG (Pmode, regno);
12516 if (sr->saved)
12518 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12519 RTX_FRAME_RELATED_P (insn) = 1;
12523 /* Release a scratch register obtained from the preceding function.
12525 If RELEASE_VIA_POP is true, we just pop the register off the stack
12526 to release it. This is what non-Linux systems use with -fstack-check.
12528 Otherwise we use OFFSET to locate the saved register and the
12529 allocated stack space becomes part of the local frame and is
12530 deallocated by the epilogue. */
12532 static void
12533 release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
12534 bool release_via_pop)
12536 if (sr->saved)
12538 if (release_via_pop)
12540 struct machine_function *m = cfun->machine;
12541 rtx x, insn = emit_insn (gen_pop (sr->reg));
12543 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12544 RTX_FRAME_RELATED_P (insn) = 1;
12545 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12546 x = gen_rtx_SET (stack_pointer_rtx, x);
12547 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12548 m->fs.sp_offset -= UNITS_PER_WORD;
12550 else
12552 rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset));
12553 x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x));
12554 emit_insn (x);
12559 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12561 This differs from the next routine in that it tries hard to prevent
12562 attacks that jump the stack guard. Thus it is never allowed to allocate
12563 more than PROBE_INTERVAL bytes of stack space without a suitable
12564 probe.
12566 INT_REGISTERS_SAVED is true if integer registers have already been
12567 pushed on the stack. */
12569 static void
12570 ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
12571 const bool int_registers_saved)
12573 struct machine_function *m = cfun->machine;
12575 /* If this function does not statically allocate stack space, then
12576 no probes are needed. */
12577 if (!size)
12579 /* However, the allocation of space via pushes for register
12580 saves could be viewed as allocating space, but without the
12581 need to probe. */
12582 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12583 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12584 else
12585 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12586 return;
12589 /* If we are a noreturn function, then we have to consider the
12590 possibility that we're called via a jump rather than a call.
12592 Thus we don't have the implicit probe generated by saving the
12593 return address into the stack at the call. Thus, the stack
12594 pointer could be anywhere in the guard page. The safe thing
12595 to do is emit a probe now.
12597 The probe can be avoided if we have already emitted any callee
12598 register saves into the stack or have a frame pointer (which will
12599 have been saved as well). Those saves will function as implicit
12600 probes.
12602 ?!? This should be revamped to work like aarch64 and s390 where
12603 we track the offset from the most recent probe. Normally that
12604 offset would be zero. For a noreturn function we would reset
12605 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12606 we just probe when we cross PROBE_INTERVAL. */
12607 if (TREE_THIS_VOLATILE (cfun->decl)
12608 && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
12610 /* We can safely use any register here since we're just going to push
12611 its value and immediately pop it back. But we do try and avoid
12612 argument passing registers so as not to introduce dependencies in
12613 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12614 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12615 rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
12616 rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
12617 m->fs.sp_offset -= UNITS_PER_WORD;
12618 if (m->fs.cfa_reg == stack_pointer_rtx)
12620 m->fs.cfa_offset -= UNITS_PER_WORD;
12621 rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
12622 x = gen_rtx_SET (stack_pointer_rtx, x);
12623 add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
12624 RTX_FRAME_RELATED_P (insn_push) = 1;
12625 x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
12626 x = gen_rtx_SET (stack_pointer_rtx, x);
12627 add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
12628 RTX_FRAME_RELATED_P (insn_pop) = 1;
12630 emit_insn (gen_blockage ());
12633 /* If we allocate less than the size of the guard statically,
12634 then no probing is necessary, but we do need to allocate
12635 the stack. */
12636 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12638 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12639 GEN_INT (-size), -1,
12640 m->fs.cfa_reg == stack_pointer_rtx);
12641 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12642 return;
12645 /* We're allocating a large enough stack frame that we need to
12646 emit probes. Either emit them inline or in a loop depending
12647 on the size. */
12648 HOST_WIDE_INT probe_interval = get_probe_interval ();
12649 if (size <= 4 * probe_interval)
12651 HOST_WIDE_INT i;
12652 for (i = probe_interval; i <= size; i += probe_interval)
12654 /* Allocate PROBE_INTERVAL bytes. */
12655 rtx insn
12656 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12657 GEN_INT (-probe_interval), -1,
12658 m->fs.cfa_reg == stack_pointer_rtx);
12659 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12661 /* And probe at *sp. */
12662 emit_stack_probe (stack_pointer_rtx);
12663 emit_insn (gen_blockage ());
12666 /* We need to allocate space for the residual, but we do not need
12667 to probe the residual. */
12668 HOST_WIDE_INT residual = (i - probe_interval - size);
12669 if (residual)
12670 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12671 GEN_INT (residual), -1,
12672 m->fs.cfa_reg == stack_pointer_rtx);
12673 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12675 else
12677 /* We expect the GP registers to be saved when probes are used
12678 as the probing sequences might need a scratch register and
12679 the routine to allocate one assumes the integer registers
12680 have already been saved. */
12681 gcc_assert (int_registers_saved);
12683 struct scratch_reg sr;
12684 get_scratch_register_on_entry (&sr);
12686 /* If we needed to save a register, then account for any space
12687 that was pushed (we are not going to pop the register when
12688 we do the restore). */
12689 if (sr.saved)
12690 size -= UNITS_PER_WORD;
12692 /* Step 1: round SIZE down to a multiple of the interval. */
12693 HOST_WIDE_INT rounded_size = size & -probe_interval;
12695 /* Step 2: compute final value of the loop counter. Use lea if
12696 possible. */
12697 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12698 rtx insn;
12699 if (address_no_seg_operand (addr, Pmode))
12700 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12701 else
12703 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12704 insn = emit_insn (gen_rtx_SET (sr.reg,
12705 gen_rtx_PLUS (Pmode, sr.reg,
12706 stack_pointer_rtx)));
12708 if (m->fs.cfa_reg == stack_pointer_rtx)
12710 add_reg_note (insn, REG_CFA_DEF_CFA,
12711 plus_constant (Pmode, sr.reg,
12712 m->fs.cfa_offset + rounded_size));
12713 RTX_FRAME_RELATED_P (insn) = 1;
12716 /* Step 3: the loop. */
12717 rtx size_rtx = GEN_INT (rounded_size);
12718 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12719 size_rtx));
12720 if (m->fs.cfa_reg == stack_pointer_rtx)
12722 m->fs.cfa_offset += rounded_size;
12723 add_reg_note (insn, REG_CFA_DEF_CFA,
12724 plus_constant (Pmode, stack_pointer_rtx,
12725 m->fs.cfa_offset));
12726 RTX_FRAME_RELATED_P (insn) = 1;
12728 m->fs.sp_offset += rounded_size;
12729 emit_insn (gen_blockage ());
12731 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12732 is equal to ROUNDED_SIZE. */
12734 if (size != rounded_size)
12735 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12736 GEN_INT (rounded_size - size), -1,
12737 m->fs.cfa_reg == stack_pointer_rtx);
12738 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12740 /* This does not deallocate the space reserved for the scratch
12741 register. That will be deallocated in the epilogue. */
12742 release_scratch_register_on_entry (&sr, size, false);
12745 /* Make sure nothing is scheduled before we are done. */
12746 emit_insn (gen_blockage ());
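/* A worked example of the inline case above, assuming the usual 4096
   byte probe interval: for SIZE == 10240 the loop emits two probed
   4096-byte allocations (i = 4096 and i = 8192) and stops at i = 12288,
   and the residual adjustment is i - 4096 - 10240 = -2048, so the last
   2048 bytes are allocated without a probe.  Frames larger than four
   probe intervals use the run-time probing loop instead.  */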
12749 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12751 INT_REGISTERS_SAVED is true if integer registers have already been
12752 pushed on the stack. */
12754 static void
12755 ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
12756 const bool int_registers_saved)
12758 /* We skip the probe for the first interval + a small dope of 4 words and
12759 probe that many bytes past the specified size to maintain a protection
12760 area at the bottom of the stack. */
12761 const int dope = 4 * UNITS_PER_WORD;
12762 rtx size_rtx = GEN_INT (size), last;
12764 /* See if we have a constant small number of probes to generate. If so,
12765 that's the easy case. The run-time loop is made up of 9 insns in the
12766 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12767 for n # of intervals. */
12768 if (size <= 4 * get_probe_interval ())
12770 HOST_WIDE_INT i, adjust;
12771 bool first_probe = true;
12773 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12774 values of N from 1 until it exceeds SIZE. If only one probe is
12775 needed, this will not generate any code. Then adjust and probe
12776 to PROBE_INTERVAL + SIZE. */
12777 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12779 if (first_probe)
12781 adjust = 2 * get_probe_interval () + dope;
12782 first_probe = false;
12784 else
12785 adjust = get_probe_interval ();
12787 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12788 plus_constant (Pmode, stack_pointer_rtx,
12789 -adjust)));
12790 emit_stack_probe (stack_pointer_rtx);
12793 if (first_probe)
12794 adjust = size + get_probe_interval () + dope;
12795 else
12796 adjust = size + get_probe_interval () - i;
12798 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12799 plus_constant (Pmode, stack_pointer_rtx,
12800 -adjust)));
12801 emit_stack_probe (stack_pointer_rtx);
12803 /* Adjust back to account for the additional first interval. */
12804 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12805 plus_constant (Pmode, stack_pointer_rtx,
12806 (get_probe_interval ()
12807 + dope))));
12810 /* Otherwise, do the same as above, but in a loop. Note that we must be
12811 extra careful with variables wrapping around because we might be at
12812 the very top (or the very bottom) of the address space and we have
12813 to be able to handle this case properly; in particular, we use an
12814 equality test for the loop condition. */
12815 else
12817 /* We expect the GP registers to be saved when probes are used
12818 as the probing sequences might need a scratch register and
12819 the routine to allocate one assumes the integer registers
12820 have already been saved. */
12821 gcc_assert (int_registers_saved);
12823 HOST_WIDE_INT rounded_size;
12824 struct scratch_reg sr;
12826 get_scratch_register_on_entry (&sr);
12828 /* If we needed to save a register, then account for any space
12829 that was pushed (we are not going to pop the register when
12830 we do the restore). */
12831 if (sr.saved)
12832 size -= UNITS_PER_WORD;
12834 /* Step 1: round SIZE to the previous multiple of the interval. */
12836 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12839 /* Step 2: compute initial and final value of the loop counter. */
12841 /* SP = SP_0 + PROBE_INTERVAL. */
12842 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12843 plus_constant (Pmode, stack_pointer_rtx,
12844 - (get_probe_interval () + dope))));
12846 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12847 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12848 emit_insn (gen_rtx_SET (sr.reg,
12849 plus_constant (Pmode, stack_pointer_rtx,
12850 -rounded_size)));
12851 else
12853 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12854 emit_insn (gen_rtx_SET (sr.reg,
12855 gen_rtx_PLUS (Pmode, sr.reg,
12856 stack_pointer_rtx)));
12860 /* Step 3: the loop
12864 SP = SP + PROBE_INTERVAL
12865 probe at SP
12867 while (SP != LAST_ADDR)
12869 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12870 values of N from 1 until it is equal to ROUNDED_SIZE. */
12872 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12875 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12876 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12878 if (size != rounded_size)
12880 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12881 plus_constant (Pmode, stack_pointer_rtx,
12882 rounded_size - size)));
12883 emit_stack_probe (stack_pointer_rtx);
12886 /* Adjust back to account for the additional first interval. */
12887 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12888 plus_constant (Pmode, stack_pointer_rtx,
12889 (get_probe_interval ()
12890 + dope))));
12892 /* This does not deallocate the space reserved for the scratch
12893 register. That will be deallocated in the epilogue. */
12894 release_scratch_register_on_entry (&sr, size, false);
12897 /* Even if the stack pointer isn't the CFA register, we need to correctly
12898 describe the adjustments made to it, in particular differentiate the
12899 frame-related ones from the frame-unrelated ones. */
12900 if (size > 0)
12902 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12903 XVECEXP (expr, 0, 0)
12904 = gen_rtx_SET (stack_pointer_rtx,
12905 plus_constant (Pmode, stack_pointer_rtx, -size));
12906 XVECEXP (expr, 0, 1)
12907 = gen_rtx_SET (stack_pointer_rtx,
12908 plus_constant (Pmode, stack_pointer_rtx,
12909 get_probe_interval () + dope + size));
12910 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12911 RTX_FRAME_RELATED_P (last) = 1;
12913 cfun->machine->fs.sp_offset += size;
12916 /* Make sure nothing is scheduled before we are done. */
12917 emit_insn (gen_blockage ());
12920 /* Adjust the stack pointer up to REG while probing it. */
12922 const char *
12923 output_adjust_stack_and_probe (rtx reg)
12925 static int labelno = 0;
12926 char loop_lab[32];
12927 rtx xops[2];
12929 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12931 /* Loop. */
12932 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12934 /* SP = SP + PROBE_INTERVAL. */
12935 xops[0] = stack_pointer_rtx;
12936 xops[1] = GEN_INT (get_probe_interval ());
12937 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12939 /* Probe at SP. */
12940 xops[1] = const0_rtx;
12941 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
12943 /* Test if SP == LAST_ADDR. */
12944 xops[0] = stack_pointer_rtx;
12945 xops[1] = reg;
12946 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12948 /* Branch. */
12949 fputs ("\tjne\t", asm_out_file);
12950 assemble_name_raw (asm_out_file, loop_lab);
12951 fputc ('\n', asm_out_file);
12953 return "";
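/* As a rough sketch (64-bit mode, a 4 KiB probe interval and the scratch
   register name purely illustrative), the loop emitted above expands to
   something like:

	.LPSRL0:
	subq	$4096, %rsp
	orq	$0, (%rsp)
	cmpq	%r11, %rsp
	jne	.LPSRL0
*/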
12956 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
12957 inclusive. These are offsets from the current stack pointer.
12959 INT_REGISTERS_SAVED is true if integer registers have already been
12960 pushed on the stack. */
12962 static void
12963 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
12964 const bool int_registers_saved)
12966 /* See if we have a constant small number of probes to generate. If so,
12967 that's the easy case. The run-time loop is made up of 6 insns in the
12968 generic case while the compile-time loop is made up of N insns for N
12969 intervals. */
12970 if (size <= 6 * get_probe_interval ())
12972 HOST_WIDE_INT i;
12974 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
12975 it exceeds SIZE. If only one probe is needed, this will not
12976 generate any code. Then probe at FIRST + SIZE. */
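	 /* For instance (illustrative values only): with FIRST == 16 KiB,
	    SIZE == 10 KiB and a 4 KiB probe interval, this probes the stack
	    at SP - 20 KiB, SP - 24 KiB and finally SP - 26 KiB.  */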
12977 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12978 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12979 -(first + i)));
12981 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12982 -(first + size)));
12985 /* Otherwise, do the same as above, but in a loop. Note that we must be
12986 extra careful with variables wrapping around because we might be at
12987 the very top (or the very bottom) of the address space and we have
12988 to be able to handle this case properly; in particular, we use an
12989 equality test for the loop condition. */
12990 else
12992 /* We expect the GP registers to be saved when probes are used
12993 as the probing sequences might need a scratch register and
12994 the routine to allocate one assumes the integer registers
12995 have already been saved. */
12996 gcc_assert (int_registers_saved);
12998 HOST_WIDE_INT rounded_size, last;
12999 struct scratch_reg sr;
13001 get_scratch_register_on_entry (&sr);
13004 /* Step 1: round SIZE to the previous multiple of the interval. */
13006 rounded_size = ROUND_DOWN (size, get_probe_interval ());
13009 /* Step 2: compute initial and final value of the loop counter. */
13011 /* TEST_OFFSET = FIRST. */
13012 emit_move_insn (sr.reg, GEN_INT (-first));
13014 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
13015 last = first + rounded_size;
13018 /* Step 3: the loop
13022 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
13023 probe at TEST_ADDR
13025 while (TEST_ADDR != LAST_ADDR)
13027 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
13028 until it is equal to ROUNDED_SIZE. */
13030 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
13033 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
13034 that SIZE is equal to ROUNDED_SIZE. */
13036 if (size != rounded_size)
13037 emit_stack_probe (plus_constant (Pmode,
13038 gen_rtx_PLUS (Pmode,
13039 stack_pointer_rtx,
13040 sr.reg),
13041 rounded_size - size));
13043 release_scratch_register_on_entry (&sr, size, true);
13046 /* Make sure nothing is scheduled before we are done. */
13047 emit_insn (gen_blockage ());
13050 /* Probe a range of stack addresses from REG to END, inclusive. These are
13051 offsets from the current stack pointer. */
13053 const char *
13054 output_probe_stack_range (rtx reg, rtx end)
13056 static int labelno = 0;
13057 char loop_lab[32];
13058 rtx xops[3];
13060 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
13062 /* Loop. */
13063 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
13065 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
13066 xops[0] = reg;
13067 xops[1] = GEN_INT (get_probe_interval ());
13068 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
13070 /* Probe at TEST_ADDR. */
13071 xops[0] = stack_pointer_rtx;
13072 xops[1] = reg;
13073 xops[2] = const0_rtx;
13074 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
13076 /* Test if TEST_ADDR == LAST_ADDR. */
13077 xops[0] = reg;
13078 xops[1] = end;
13079 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
13081 /* Branch. */
13082 fputs ("\tjne\t", asm_out_file);
13083 assemble_name_raw (asm_out_file, loop_lab);
13084 fputc ('\n', asm_out_file);
13086 return "";
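/* Roughly (64-bit mode, a 4 KiB probe interval, register and operand names
   purely illustrative), the loop emitted above looks like:

	.LPSRL1:
	subq	$4096, %r11
	orq	$0, (%rsp,%r11)
	cmpq	$-LAST, %r11
	jne	.LPSRL1
*/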
13089 /* Return true if a stack frame is required. Update STACK_ALIGNMENT
13090 to the largest alignment, in bits, of any stack slot used, if a stack
13091 frame is required and CHECK_STACK_SLOT is true. */
13093 static bool
13094 ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
13095 bool check_stack_slot)
13097 HARD_REG_SET set_up_by_prologue, prologue_used;
13098 basic_block bb;
13100 CLEAR_HARD_REG_SET (prologue_used);
13101 CLEAR_HARD_REG_SET (set_up_by_prologue);
13102 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
13103 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
13104 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
13105 HARD_FRAME_POINTER_REGNUM);
13107 /* The preferred stack alignment is the minimum stack alignment. */
13108 if (stack_alignment > crtl->preferred_stack_boundary)
13109 stack_alignment = crtl->preferred_stack_boundary;
13111 bool require_stack_frame = false;
13113 FOR_EACH_BB_FN (bb, cfun)
13115 rtx_insn *insn;
13116 FOR_BB_INSNS (bb, insn)
13117 if (NONDEBUG_INSN_P (insn)
13118 && requires_stack_frame_p (insn, prologue_used,
13119 set_up_by_prologue))
13121 require_stack_frame = true;
13123 if (check_stack_slot)
13125 /* Find the maximum stack alignment. */
13126 subrtx_iterator::array_type array;
13127 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
13128 if (MEM_P (*iter)
13129 && (reg_mentioned_p (stack_pointer_rtx,
13130 *iter)
13131 || reg_mentioned_p (frame_pointer_rtx,
13132 *iter)))
13134 unsigned int alignment = MEM_ALIGN (*iter);
13135 if (alignment > stack_alignment)
13136 stack_alignment = alignment;
13142 return require_stack_frame;
13145 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
13146 will guide prologue/epilogue to be generated in correct form. */
13148 static void
13149 ix86_finalize_stack_frame_flags (void)
13151 /* Check if stack realignment is really needed after reload, and
13152 store the result in cfun. */
13153 unsigned int incoming_stack_boundary
13154 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
13155 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
13156 unsigned int stack_alignment
13157 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
13158 ? crtl->max_used_stack_slot_alignment
13159 : crtl->stack_alignment_needed);
13160 unsigned int stack_realign
13161 = (incoming_stack_boundary < stack_alignment);
13162 bool recompute_frame_layout_p = false;
13164 if (crtl->stack_realign_finalized)
13166 /* After stack_realign_needed is finalized, we can no longer
13167 change it. */
13168 gcc_assert (crtl->stack_realign_needed == stack_realign);
13169 return;
13172 /* If the only reason for frame_pointer_needed is that we conservatively
13173 assumed stack realignment might be needed or -fno-omit-frame-pointer
13174 is used, but in the end nothing that needed the stack alignment was
13175 spilled and there was no stack access, clear frame_pointer_needed and
13176 say we don't need stack realignment. */
13177 if ((stack_realign || (!flag_omit_frame_pointer && optimize))
13178 && frame_pointer_needed
13179 && crtl->is_leaf
13180 && crtl->sp_is_unchanging
13181 && !ix86_current_function_calls_tls_descriptor
13182 && !crtl->accesses_prior_frames
13183 && !cfun->calls_alloca
13184 && !crtl->calls_eh_return
13185 /* See ira_setup_eliminable_regset for the rationale. */
13186 && !(STACK_CHECK_MOVING_SP
13187 && flag_stack_check
13188 && flag_exceptions
13189 && cfun->can_throw_non_call_exceptions)
13190 && !ix86_frame_pointer_required ()
13191 && get_frame_size () == 0
13192 && ix86_nsaved_sseregs () == 0
13193 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
13195 if (ix86_find_max_used_stack_alignment (stack_alignment,
13196 stack_realign))
13198 /* A stack frame is required. If the stack alignment needed is not
13199 greater than the incoming stack boundary, don't realign the stack. */
13200 stack_realign = incoming_stack_boundary < stack_alignment;
13201 if (!stack_realign)
13203 crtl->max_used_stack_slot_alignment
13204 = incoming_stack_boundary;
13205 crtl->stack_alignment_needed
13206 = incoming_stack_boundary;
13207 /* Also update preferred_stack_boundary for leaf
13208 functions. */
13209 crtl->preferred_stack_boundary
13210 = incoming_stack_boundary;
13213 else
13215 /* If drap has been set, but it actually isn't live at the
13216 start of the function, there is no reason to set it up. */
13217 if (crtl->drap_reg)
13219 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13220 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
13221 REGNO (crtl->drap_reg)))
13223 crtl->drap_reg = NULL_RTX;
13224 crtl->need_drap = false;
13227 else
13228 cfun->machine->no_drap_save_restore = true;
13230 frame_pointer_needed = false;
13231 stack_realign = false;
13232 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
13233 crtl->stack_alignment_needed = incoming_stack_boundary;
13234 crtl->stack_alignment_estimated = incoming_stack_boundary;
13235 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
13236 crtl->preferred_stack_boundary = incoming_stack_boundary;
13237 df_finish_pass (true);
13238 df_scan_alloc (NULL);
13239 df_scan_blocks ();
13240 df_compute_regs_ever_live (true);
13241 df_analyze ();
13243 if (flag_var_tracking)
13245 /* Since frame pointer is no longer available, replace it with
13246 stack pointer - UNITS_PER_WORD in debug insns. */
13247 df_ref ref, next;
13248 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
13249 ref; ref = next)
13251 next = DF_REF_NEXT_REG (ref);
13252 if (!DF_REF_INSN_INFO (ref))
13253 continue;
13255 /* Make sure the next ref is for a different instruction,
13256 so that we're not affected by the rescan. */
13257 rtx_insn *insn = DF_REF_INSN (ref);
13258 while (next && DF_REF_INSN (next) == insn)
13259 next = DF_REF_NEXT_REG (next);
13261 if (DEBUG_INSN_P (insn))
13263 bool changed = false;
13264 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
13266 rtx *loc = DF_REF_LOC (ref);
13267 if (*loc == hard_frame_pointer_rtx)
13269 *loc = plus_constant (Pmode,
13270 stack_pointer_rtx,
13271 -UNITS_PER_WORD);
13272 changed = true;
13275 if (changed)
13276 df_insn_rescan (insn);
13281 recompute_frame_layout_p = true;
13284 else if (crtl->max_used_stack_slot_alignment >= 128)
13286 /* We don't need to realign the stack. max_used_stack_alignment is
13287 used to decide how the stack frame should be aligned. This is
13288 independent of any psABI and of 32-bit vs 64-bit. It is always
13289 safe to compute max_used_stack_alignment. We compute it only
13290 if a 128-bit aligned load/store may be generated on a misaligned
13291 stack slot, which would lead to a segfault. */
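	 /* E.g. if the largest stack-slot access found below is a 128-bit
	    aligned vector load/store, stack_alignment ends up as 128 bits
	    and max_used_stack_alignment is recorded as 16 bytes.  */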
13292 if (ix86_find_max_used_stack_alignment (stack_alignment, true))
13293 cfun->machine->max_used_stack_alignment
13294 = stack_alignment / BITS_PER_UNIT;
13297 if (crtl->stack_realign_needed != stack_realign)
13298 recompute_frame_layout_p = true;
13299 crtl->stack_realign_needed = stack_realign;
13300 crtl->stack_realign_finalized = true;
13301 if (recompute_frame_layout_p)
13302 ix86_compute_frame_layout ();
13305 /* Delete SET_GOT right after entry block if it is allocated to reg. */
13307 static void
13308 ix86_elim_entry_set_got (rtx reg)
13310 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
13311 rtx_insn *c_insn = BB_HEAD (bb);
13312 if (!NONDEBUG_INSN_P (c_insn))
13313 c_insn = next_nonnote_nondebug_insn (c_insn);
13314 if (c_insn && NONJUMP_INSN_P (c_insn))
13316 rtx pat = PATTERN (c_insn);
13317 if (GET_CODE (pat) == PARALLEL)
13319 rtx vec = XVECEXP (pat, 0, 0);
13320 if (GET_CODE (vec) == SET
13321 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
13322 && REGNO (XEXP (vec, 0)) == REGNO (reg))
13323 delete_insn (c_insn);
13328 static rtx
13329 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
13331 rtx addr, mem;
13333 if (offset)
13334 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
13335 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
13336 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
13339 static inline rtx
13340 gen_frame_load (rtx reg, rtx frame_reg, int offset)
13342 return gen_frame_set (reg, frame_reg, offset, false);
13345 static inline rtx
13346 gen_frame_store (rtx reg, rtx frame_reg, int offset)
13348 return gen_frame_set (reg, frame_reg, offset, true);
13351 static void
13352 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
13354 struct machine_function *m = cfun->machine;
13355 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13356 + m->call_ms2sysv_extra_regs;
13357 rtvec v = rtvec_alloc (ncregs + 1);
13358 unsigned int align, i, vi = 0;
13359 rtx_insn *insn;
13360 rtx sym, addr;
13361 rtx rax = gen_rtx_REG (word_mode, AX_REG);
13362 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13364 /* AL should only be live with sysv_abi. */
13365 gcc_assert (!ix86_eax_live_at_start_p ());
13366 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
13368 /* Set up RAX as the stub's base pointer. We use stack_realign_offset
13369 regardless of whether we've actually realigned the stack or not. */
13370 align = GET_MODE_ALIGNMENT (V4SFmode);
13371 addr = choose_baseaddr (frame.stack_realign_offset
13372 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
13373 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13375 emit_insn (gen_rtx_SET (rax, addr));
13377 /* Get the stub symbol. */
13378 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
13379 : XLOGUE_STUB_SAVE);
13380 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13382 for (i = 0; i < ncregs; ++i)
13384 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13385 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
13386 r.regno);
13387 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
13390 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
13392 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
13393 RTX_FRAME_RELATED_P (insn) = true;
13396 /* Expand the prologue into a bunch of separate insns. */
13398 void
13399 ix86_expand_prologue (void)
13401 struct machine_function *m = cfun->machine;
13402 rtx insn, t;
13403 HOST_WIDE_INT allocate;
13404 bool int_registers_saved;
13405 bool sse_registers_saved;
13406 bool save_stub_call_needed;
13407 rtx static_chain = NULL_RTX;
13409 if (ix86_function_naked (current_function_decl))
13410 return;
13412 ix86_finalize_stack_frame_flags ();
13414 /* DRAP should not coexist with stack_realign_fp */
13415 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
13417 memset (&m->fs, 0, sizeof (m->fs));
13419 /* Initialize CFA state for before the prologue. */
13420 m->fs.cfa_reg = stack_pointer_rtx;
13421 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
13423 /* Track the SP offset to the CFA. We continue tracking this after we've
13424 swapped the CFA register away from SP. In the case of re-alignment
13425 this is fudged; we're interested in offsets within the local frame. */
13426 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13427 m->fs.sp_valid = true;
13428 m->fs.sp_realigned = false;
13430 const struct ix86_frame &frame = cfun->machine->frame;
13432 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
13434 /* We should have already generated an error for any use of
13435 ms_hook on a nested function. */
13436 gcc_checking_assert (!ix86_static_chain_on_stack);
13438 /* Check if profiling is active and we shall use the
13439 profiling-before-prologue variant. If so, issue a sorry. */
13440 if (crtl->profile && flag_fentry != 0)
13441 sorry ("ms_hook_prologue attribute isn%'t compatible "
13442 "with -mfentry for 32-bit");
13444 /* In ix86_asm_output_function_label we emitted:
13445 8b ff movl.s %edi,%edi
13446 55 push %ebp
13447 8b ec movl.s %esp,%ebp
13449 This matches the hookable function prologue in Win32 API
13450 functions in Microsoft Windows XP Service Pack 2 and newer.
13451 Wine uses this to enable Windows apps to hook the Win32 API
13452 functions provided by Wine.
13454 What that means is that we've already set up the frame pointer. */
13456 if (frame_pointer_needed
13457 && !(crtl->drap_reg && crtl->stack_realign_needed))
13459 rtx push, mov;
13461 /* We've decided to use the frame pointer already set up.
13462 Describe this to the unwinder by pretending that both
13463 push and mov insns happen right here.
13465 Putting the unwind info here at the end of the ms_hook
13466 is done so that we can make absolutely certain we get
13467 the required byte sequence at the start of the function,
13468 rather than relying on an assembler that can produce
13469 the exact encoding required.
13471 However it does mean (in the unpatched case) that we have
13472 a 1 insn window where the asynchronous unwind info is
13473 incorrect. However, if we placed the unwind info at
13474 its correct location we would have incorrect unwind info
13475 in the patched case. Which is probably all moot since
13476 I don't expect Wine generates dwarf2 unwind info for the
13477 system libraries that use this feature. */
13479 insn = emit_insn (gen_blockage ());
13481 push = gen_push (hard_frame_pointer_rtx);
13482 mov = gen_rtx_SET (hard_frame_pointer_rtx,
13483 stack_pointer_rtx);
13484 RTX_FRAME_RELATED_P (push) = 1;
13485 RTX_FRAME_RELATED_P (mov) = 1;
13487 RTX_FRAME_RELATED_P (insn) = 1;
13488 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13489 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
13491 /* Note that gen_push incremented m->fs.cfa_offset, even
13492 though we didn't emit the push insn here. */
13493 m->fs.cfa_reg = hard_frame_pointer_rtx;
13494 m->fs.fp_offset = m->fs.cfa_offset;
13495 m->fs.fp_valid = true;
13497 else
13499 /* The frame pointer is not needed so pop %ebp again.
13500 This leaves us with a pristine state. */
13501 emit_insn (gen_pop (hard_frame_pointer_rtx));
13505 /* The first insn of a function that accepts its static chain on the
13506 stack is to push the register that would be filled in by a direct
13507 call. This insn will be skipped by the trampoline. */
13508 else if (ix86_static_chain_on_stack)
13510 static_chain = ix86_static_chain (cfun->decl, false);
13511 insn = emit_insn (gen_push (static_chain));
13512 emit_insn (gen_blockage ());
13514 /* We don't want to interpret this push insn as a register save,
13515 only as a stack adjustment. The real copy of the register as
13516 a save will be done later, if needed. */
13517 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13518 t = gen_rtx_SET (stack_pointer_rtx, t);
13519 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13520 RTX_FRAME_RELATED_P (insn) = 1;
13523 /* Emit prologue code to adjust the stack alignment and set up DRAP,
13524 in case DRAP is needed and stack realignment is really needed after reload. */
13525 if (stack_realign_drap)
13527 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13529 /* Can't use DRAP in interrupt function. */
13530 if (cfun->machine->func_type != TYPE_NORMAL)
13531 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13532 "in interrupt service routine. This may be worked "
13533 "around by avoiding functions with aggregate return.");
13535 /* Only need to push parameter pointer reg if it is caller saved. */
13536 if (!call_used_regs[REGNO (crtl->drap_reg)])
13538 /* Push arg pointer reg */
13539 insn = emit_insn (gen_push (crtl->drap_reg));
13540 RTX_FRAME_RELATED_P (insn) = 1;
13543 /* Grab the argument pointer. */
13544 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13545 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13546 RTX_FRAME_RELATED_P (insn) = 1;
13547 m->fs.cfa_reg = crtl->drap_reg;
13548 m->fs.cfa_offset = 0;
13550 /* Align the stack. */
13551 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13552 stack_pointer_rtx,
13553 GEN_INT (-align_bytes)));
13554 RTX_FRAME_RELATED_P (insn) = 1;
13556 /* Replicate the return address on the stack so that return
13557 address can be reached via (argp - 1) slot. This is needed
13558 to implement macro RETURN_ADDR_RTX and intrinsic function
13559 expand_builtin_return_addr etc. */
13560 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13561 t = gen_frame_mem (word_mode, t);
13562 insn = emit_insn (gen_push (t));
13563 RTX_FRAME_RELATED_P (insn) = 1;
13565 /* For the purposes of frame and register save area addressing,
13566 we've started over with a new frame. */
13567 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13568 m->fs.realigned = true;
13570 if (static_chain)
13572 /* Replicate static chain on the stack so that static chain
13573 can be reached via (argp - 2) slot. This is needed for
13574 nested function with stack realignment. */
13575 insn = emit_insn (gen_push (static_chain));
13576 RTX_FRAME_RELATED_P (insn) = 1;
13580 int_registers_saved = (frame.nregs == 0);
13581 sse_registers_saved = (frame.nsseregs == 0);
13582 save_stub_call_needed = (m->call_ms2sysv);
13583 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13585 if (frame_pointer_needed && !m->fs.fp_valid)
13587 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13588 slower on all targets. Also sdb didn't like it. */
13589 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13590 RTX_FRAME_RELATED_P (insn) = 1;
13592 /* Push registers now, before setting the frame pointer
13593 on SEH target. */
13594 if (!int_registers_saved
13595 && TARGET_SEH
13596 && !frame.save_regs_using_mov)
13598 ix86_emit_save_regs ();
13599 int_registers_saved = true;
13600 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13603 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13605 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13606 RTX_FRAME_RELATED_P (insn) = 1;
13608 if (m->fs.cfa_reg == stack_pointer_rtx)
13609 m->fs.cfa_reg = hard_frame_pointer_rtx;
13610 m->fs.fp_offset = m->fs.sp_offset;
13611 m->fs.fp_valid = true;
13615 if (!int_registers_saved)
13617 /* If saving registers via PUSH, do so now. */
13618 if (!frame.save_regs_using_mov)
13620 ix86_emit_save_regs ();
13621 int_registers_saved = true;
13622 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13625 /* When using the red zone we may start register saving before allocating
13626 the stack frame, saving one cycle of the prologue. However, avoid
13627 doing this if we have to probe the stack; at least on x86_64 the
13628 stack probe can turn into a call that clobbers a red zone location. */
13629 else if (ix86_using_red_zone ()
13630 && (! TARGET_STACK_PROBE
13631 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13633 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13634 int_registers_saved = true;
13638 if (stack_realign_fp)
13640 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13641 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13643 /* Record last valid frame pointer offset. */
13644 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13646 /* The computation of the size of the re-aligned stack frame means
13647 that we must allocate the size of the register save area before
13648 performing the actual alignment. Otherwise we cannot guarantee
13649 that there's enough storage above the realignment point. */
13650 allocate = frame.reg_save_offset - m->fs.sp_offset
13651 + frame.stack_realign_allocate;
13652 if (allocate)
13653 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13654 GEN_INT (-allocate), -1, false);
13656 /* Align the stack. */
13657 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13658 stack_pointer_rtx,
13659 GEN_INT (-align_bytes)));
13660 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13661 m->fs.sp_realigned_offset = m->fs.sp_offset
13662 - frame.stack_realign_allocate;
13663 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13664 Beyond this point, stack access should be done via choose_baseaddr or
13665 by using sp_valid_at and fp_valid_at to determine the correct base
13666 register. Henceforth, any CFA offset should be thought of as logical
13667 and not physical. */
13668 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13669 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13670 m->fs.sp_realigned = true;
13672 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13673 is needed to describe where a register is saved using a realigned
13674 stack pointer, so we need to invalidate the stack pointer for that
13675 target. */
13676 if (TARGET_SEH)
13677 m->fs.sp_valid = false;
13679 /* If SP offset is non-immediate after allocation of the stack frame,
13680 then emit SSE saves or stub call prior to allocating the rest of the
13681 stack frame. This is less efficient for the out-of-line stub because
13682 we can't combine allocations across the call barrier, but it's better
13683 than using a scratch register. */
13684 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13685 - m->fs.sp_realigned_offset),
13686 Pmode))
13688 if (!sse_registers_saved)
13690 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13691 sse_registers_saved = true;
13693 else if (save_stub_call_needed)
13695 ix86_emit_outlined_ms2sysv_save (frame);
13696 save_stub_call_needed = false;
13701 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13703 if (flag_stack_usage_info)
13705 /* We start to count from ARG_POINTER. */
13706 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13708 /* If it was realigned, take into account the fake frame. */
13709 if (stack_realign_drap)
13711 if (ix86_static_chain_on_stack)
13712 stack_size += UNITS_PER_WORD;
13714 if (!call_used_regs[REGNO (crtl->drap_reg)])
13715 stack_size += UNITS_PER_WORD;
13717 /* This over-estimates by 1 minimal-stack-alignment-unit but
13718 mitigates that by counting in the new return address slot. */
13719 current_function_dynamic_stack_size
13720 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13723 current_function_static_stack_size = stack_size;
13726 /* On SEH target with very large frame size, allocate an area to save
13727 SSE registers (as the very large allocation won't be described). */
13728 if (TARGET_SEH
13729 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13730 && !sse_registers_saved)
13732 HOST_WIDE_INT sse_size =
13733 frame.sse_reg_save_offset - frame.reg_save_offset;
13735 gcc_assert (int_registers_saved);
13737 /* No need to do stack checking as the area will be immediately
13738 written. */
13739 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13740 GEN_INT (-sse_size), -1,
13741 m->fs.cfa_reg == stack_pointer_rtx);
13742 allocate -= sse_size;
13743 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13744 sse_registers_saved = true;
13747 /* The stack has already been decremented by the instruction calling us
13748 so probe if the size is non-negative to preserve the protection area. */
13749 if (allocate >= 0
13750 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13751 || flag_stack_clash_protection))
13753 if (flag_stack_clash_protection)
13755 ix86_adjust_stack_and_probe_stack_clash (allocate,
13756 int_registers_saved);
13757 allocate = 0;
13759 else if (STACK_CHECK_MOVING_SP)
13761 if (!(crtl->is_leaf && !cfun->calls_alloca
13762 && allocate <= get_probe_interval ()))
13764 ix86_adjust_stack_and_probe (allocate, int_registers_saved);
13765 allocate = 0;
13768 else
13770 HOST_WIDE_INT size = allocate;
13772 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13773 size = 0x80000000 - get_stack_check_protect () - 1;
13775 if (TARGET_STACK_PROBE)
13777 if (crtl->is_leaf && !cfun->calls_alloca)
13779 if (size > get_probe_interval ())
13780 ix86_emit_probe_stack_range (0, size, int_registers_saved);
13782 else
13783 ix86_emit_probe_stack_range (0,
13784 size + get_stack_check_protect (),
13785 int_registers_saved);
13787 else
13789 if (crtl->is_leaf && !cfun->calls_alloca)
13791 if (size > get_probe_interval ()
13792 && size > get_stack_check_protect ())
13793 ix86_emit_probe_stack_range (get_stack_check_protect (),
13794 (size
13795 - get_stack_check_protect ()),
13796 int_registers_saved);
13798 else
13799 ix86_emit_probe_stack_range (get_stack_check_protect (), size,
13800 int_registers_saved);
13805 if (allocate == 0)
13807 else if (!ix86_target_stack_probe ()
13808 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13810 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13811 GEN_INT (-allocate), -1,
13812 m->fs.cfa_reg == stack_pointer_rtx);
13814 else
13816 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13817 rtx r10 = NULL;
13818 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13819 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13820 bool eax_live = ix86_eax_live_at_start_p ();
13821 bool r10_live = false;
13823 if (TARGET_64BIT)
13824 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13826 if (eax_live)
13828 insn = emit_insn (gen_push (eax));
13829 allocate -= UNITS_PER_WORD;
13830 /* Note that SEH directives need to continue tracking the stack
13831 pointer even after the frame pointer has been set up. */
13832 if (sp_is_cfa_reg || TARGET_SEH)
13834 if (sp_is_cfa_reg)
13835 m->fs.cfa_offset += UNITS_PER_WORD;
13836 RTX_FRAME_RELATED_P (insn) = 1;
13837 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13838 gen_rtx_SET (stack_pointer_rtx,
13839 plus_constant (Pmode, stack_pointer_rtx,
13840 -UNITS_PER_WORD)));
13844 if (r10_live)
13846 r10 = gen_rtx_REG (Pmode, R10_REG);
13847 insn = emit_insn (gen_push (r10));
13848 allocate -= UNITS_PER_WORD;
13849 if (sp_is_cfa_reg || TARGET_SEH)
13851 if (sp_is_cfa_reg)
13852 m->fs.cfa_offset += UNITS_PER_WORD;
13853 RTX_FRAME_RELATED_P (insn) = 1;
13854 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13855 gen_rtx_SET (stack_pointer_rtx,
13856 plus_constant (Pmode, stack_pointer_rtx,
13857 -UNITS_PER_WORD)));
13861 emit_move_insn (eax, GEN_INT (allocate));
13862 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13864 /* Use the fact that AX still contains ALLOCATE. */
13865 adjust_stack_insn = (Pmode == DImode
13866 ? gen_pro_epilogue_adjust_stack_di_sub
13867 : gen_pro_epilogue_adjust_stack_si_sub);
13869 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13870 stack_pointer_rtx, eax));
13872 if (sp_is_cfa_reg || TARGET_SEH)
13874 if (sp_is_cfa_reg)
13875 m->fs.cfa_offset += allocate;
13876 RTX_FRAME_RELATED_P (insn) = 1;
13877 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13878 gen_rtx_SET (stack_pointer_rtx,
13879 plus_constant (Pmode, stack_pointer_rtx,
13880 -allocate)));
13882 m->fs.sp_offset += allocate;
13884 /* Use stack_pointer_rtx for relative addressing so that code
13885 works for realigned stack, too. */
13886 if (r10_live && eax_live)
13888 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13889 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13890 gen_frame_mem (word_mode, t));
13891 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13892 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13893 gen_frame_mem (word_mode, t));
13895 else if (eax_live || r10_live)
13897 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13898 emit_move_insn (gen_rtx_REG (word_mode,
13899 (eax_live ? AX_REG : R10_REG)),
13900 gen_frame_mem (word_mode, t));
13903 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13905 /* If we haven't already set up the frame pointer, do so now. */
13906 if (frame_pointer_needed && !m->fs.fp_valid)
13908 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13909 GEN_INT (frame.stack_pointer_offset
13910 - frame.hard_frame_pointer_offset));
13911 insn = emit_insn (insn);
13912 RTX_FRAME_RELATED_P (insn) = 1;
13913 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13915 if (m->fs.cfa_reg == stack_pointer_rtx)
13916 m->fs.cfa_reg = hard_frame_pointer_rtx;
13917 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13918 m->fs.fp_valid = true;
13921 if (!int_registers_saved)
13922 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13923 if (!sse_registers_saved)
13924 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13925 else if (save_stub_call_needed)
13926 ix86_emit_outlined_ms2sysv_save (frame);
13928 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
13929 in PROLOGUE. */
13930 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13932 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13933 insn = emit_insn (gen_set_got (pic));
13934 RTX_FRAME_RELATED_P (insn) = 1;
13935 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13936 emit_insn (gen_prologue_use (pic));
13937 /* Delete an already emitted SET_GOT if it exists and is allocated to
13938 REAL_PIC_OFFSET_TABLE_REGNUM. */
13939 ix86_elim_entry_set_got (pic);
13942 if (crtl->drap_reg && !crtl->stack_realign_needed)
13944 /* vDRAP is set up, but after reload it turns out stack realignment
13945 isn't necessary; emit the prologue code to set up DRAP without
13946 the stack realignment adjustment. */
13947 t = choose_baseaddr (0, NULL);
13948 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13951 /* Prevent instructions from being scheduled into register save push
13952 sequence when access to the redzone area is done through frame pointer.
13953 The offset between the frame pointer and the stack pointer is calculated
13954 relative to the value of the stack pointer at the end of the function
13955 prologue, and moving instructions that access redzone area via frame
13956 pointer inside push sequence violates this assumption. */
13957 if (frame_pointer_needed && frame.red_zone_size)
13958 emit_insn (gen_memory_blockage ());
13960 /* SEH requires that the prologue end within 256 bytes of the start of
13961 the function. Prevent instruction schedules that would extend that.
13962 Further, prevent alloca modifications to the stack pointer from being
13963 combined with prologue modifications. */
13964 if (TARGET_SEH)
13965 emit_insn (gen_prologue_use (stack_pointer_rtx));
13968 /* Emit code to restore REG using a POP insn. */
13970 static void
13971 ix86_emit_restore_reg_using_pop (rtx reg)
13973 struct machine_function *m = cfun->machine;
13974 rtx_insn *insn = emit_insn (gen_pop (reg));
13976 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
13977 m->fs.sp_offset -= UNITS_PER_WORD;
13979 if (m->fs.cfa_reg == crtl->drap_reg
13980 && REGNO (reg) == REGNO (crtl->drap_reg))
13982 /* Previously we'd represented the CFA as an expression
13983 like *(%ebp - 8). We've just popped that value from
13984 the stack, which means we need to reset the CFA to
13985 the drap register. This will remain until we restore
13986 the stack pointer. */
13987 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13988 RTX_FRAME_RELATED_P (insn) = 1;
13990 /* This means that the DRAP register is valid for addressing too. */
13991 m->fs.drap_valid = true;
13992 return;
13995 if (m->fs.cfa_reg == stack_pointer_rtx)
13997 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
13998 x = gen_rtx_SET (stack_pointer_rtx, x);
13999 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14000 RTX_FRAME_RELATED_P (insn) = 1;
14002 m->fs.cfa_offset -= UNITS_PER_WORD;
14005 /* When the frame pointer is the CFA, and we pop it, we are
14006 swapping back to the stack pointer as the CFA. This happens
14007 for stack frames that don't allocate other data, so we assume
14008 the stack pointer is now pointing at the return address, i.e.
14009 the function entry state, which makes the offset one word. */
14010 if (reg == hard_frame_pointer_rtx)
14012 m->fs.fp_valid = false;
14013 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14015 m->fs.cfa_reg = stack_pointer_rtx;
14016 m->fs.cfa_offset -= UNITS_PER_WORD;
14018 add_reg_note (insn, REG_CFA_DEF_CFA,
14019 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14020 GEN_INT (m->fs.cfa_offset)));
14021 RTX_FRAME_RELATED_P (insn) = 1;
14026 /* Emit code to restore saved registers using POP insns. */
14028 static void
14029 ix86_emit_restore_regs_using_pop (void)
14031 unsigned int regno;
14033 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14034 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
14035 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
14038 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
14039 omits the emit and only attaches the notes. */
14041 static void
14042 ix86_emit_leave (rtx_insn *insn)
14044 struct machine_function *m = cfun->machine;
14045 if (!insn)
14046 insn = emit_insn (ix86_gen_leave ());
14048 ix86_add_queued_cfa_restore_notes (insn);
14050 gcc_assert (m->fs.fp_valid);
14051 m->fs.sp_valid = true;
14052 m->fs.sp_realigned = false;
14053 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
14054 m->fs.fp_valid = false;
14056 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
14058 m->fs.cfa_reg = stack_pointer_rtx;
14059 m->fs.cfa_offset = m->fs.sp_offset;
14061 add_reg_note (insn, REG_CFA_DEF_CFA,
14062 plus_constant (Pmode, stack_pointer_rtx,
14063 m->fs.sp_offset));
14064 RTX_FRAME_RELATED_P (insn) = 1;
14066 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
14067 m->fs.fp_offset);
14070 /* Emit code to restore saved registers using MOV insns.
14071 First register is restored from CFA - CFA_OFFSET. */
14072 static void
14073 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
14074 bool maybe_eh_return)
14076 struct machine_function *m = cfun->machine;
14077 unsigned int regno;
14079 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14080 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14082 rtx reg = gen_rtx_REG (word_mode, regno);
14083 rtx mem;
14084 rtx_insn *insn;
14086 mem = choose_baseaddr (cfa_offset, NULL);
14087 mem = gen_frame_mem (word_mode, mem);
14088 insn = emit_move_insn (reg, mem);
14090 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
14092 /* Previously we'd represented the CFA as an expression
14093 like *(%ebp - 8). We've just popped that value from
14094 the stack, which means we need to reset the CFA to
14095 the drap register. This will remain until we restore
14096 the stack pointer. */
14097 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
14098 RTX_FRAME_RELATED_P (insn) = 1;
14100 /* This means that the DRAP register is valid for addressing. */
14101 m->fs.drap_valid = true;
14103 else
14104 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14106 cfa_offset -= UNITS_PER_WORD;
14110 /* Emit code to restore saved registers using MOV insns.
14111 First register is restored from CFA - CFA_OFFSET. */
14112 static void
14113 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
14114 bool maybe_eh_return)
14116 unsigned int regno;
14118 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
14119 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
14121 rtx reg = gen_rtx_REG (V4SFmode, regno);
14122 rtx mem;
14123 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
14125 mem = choose_baseaddr (cfa_offset, &align);
14126 mem = gen_rtx_MEM (V4SFmode, mem);
14128 /* The location alignment depends upon the base register. */
14129 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
14130 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
14131 set_mem_align (mem, align);
14132 emit_insn (gen_rtx_SET (reg, mem));
14134 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
14136 cfa_offset -= GET_MODE_SIZE (V4SFmode);
14140 static void
14141 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
14142 bool use_call, int style)
14144 struct machine_function *m = cfun->machine;
14145 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
14146 + m->call_ms2sysv_extra_regs;
14147 rtvec v;
14148 unsigned int elems_needed, align, i, vi = 0;
14149 rtx_insn *insn;
14150 rtx sym, tmp;
14151 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
14152 rtx r10 = NULL_RTX;
14153 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
14154 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
14155 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
14156 rtx rsi_frame_load = NULL_RTX;
14157 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
14158 enum xlogue_stub stub;
14160 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
14162 /* If using a realigned stack, we should never start with padding. */
14163 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
14165 /* Setup RSI as the stub's base pointer. */
14166 align = GET_MODE_ALIGNMENT (V4SFmode);
14167 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
14168 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
14170 emit_insn (gen_rtx_SET (rsi, tmp));
14172 /* Get a symbol for the stub. */
14173 if (frame_pointer_needed)
14174 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
14175 : XLOGUE_STUB_RESTORE_HFP_TAIL;
14176 else
14177 stub = use_call ? XLOGUE_STUB_RESTORE
14178 : XLOGUE_STUB_RESTORE_TAIL;
14179 sym = xlogue.get_stub_rtx (stub);
14181 elems_needed = ncregs;
14182 if (use_call)
14183 elems_needed += 1;
14184 else
14185 elems_needed += frame_pointer_needed ? 5 : 3;
14186 v = rtvec_alloc (elems_needed);
14188 /* We call the epilogue stub when we need to pop incoming args or we are
14189 doing a sibling call as the tail. Otherwise, we will emit a jmp to the
14190 epilogue stub and it will be the tail call. */
14191 if (use_call)
14192 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14193 else
14195 RTVEC_ELT (v, vi++) = ret_rtx;
14196 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
14197 if (frame_pointer_needed)
14199 rtx rbp = gen_rtx_REG (DImode, BP_REG);
14200 gcc_assert (m->fs.fp_valid);
14201 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
14203 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
14204 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
14205 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
14206 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
14207 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
14209 else
14211 /* If no hard frame pointer, we set R10 to the SP restore value. */
14212 gcc_assert (!m->fs.fp_valid);
14213 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14214 gcc_assert (m->fs.sp_valid);
14216 r10 = gen_rtx_REG (DImode, R10_REG);
14217 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
14218 emit_insn (gen_rtx_SET (r10, tmp));
14220 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
14224 /* Generate frame load insns and restore notes. */
14225 for (i = 0; i < ncregs; ++i)
14227 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
14228 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
14229 rtx reg, frame_load;
14231 reg = gen_rtx_REG (mode, r.regno);
14232 frame_load = gen_frame_load (reg, rsi, r.offset);
14234 /* Save RSI frame load insn & note to add last. */
14235 if (r.regno == SI_REG)
14237 gcc_assert (!rsi_frame_load);
14238 rsi_frame_load = frame_load;
14239 rsi_restore_offset = r.offset;
14241 else
14243 RTVEC_ELT (v, vi++) = frame_load;
14244 ix86_add_cfa_restore_note (NULL, reg, r.offset);
14248 /* Add RSI frame load & restore note at the end. */
14249 gcc_assert (rsi_frame_load);
14250 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
14251 RTVEC_ELT (v, vi++) = rsi_frame_load;
14252 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
14253 rsi_restore_offset);
14255 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
14256 if (!use_call && !frame_pointer_needed)
14258 gcc_assert (m->fs.sp_valid);
14259 gcc_assert (!m->fs.sp_realigned);
14261 /* At this point, R10 should point to frame.stack_realign_offset. */
14262 if (m->fs.cfa_reg == stack_pointer_rtx)
14263 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
14264 m->fs.sp_offset = frame.stack_realign_offset;
14267 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
14268 tmp = gen_rtx_PARALLEL (VOIDmode, v);
14269 if (use_call)
14270 insn = emit_insn (tmp);
14271 else
14273 insn = emit_jump_insn (tmp);
14274 JUMP_LABEL (insn) = ret_rtx;
14276 if (frame_pointer_needed)
14277 ix86_emit_leave (insn);
14278 else
14280 /* Need CFA adjust note. */
14281 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
14282 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
14286 RTX_FRAME_RELATED_P (insn) = true;
14287 ix86_add_queued_cfa_restore_notes (insn);
14289 /* If we're not doing a tail-call, we need to adjust the stack. */
14290 if (use_call && m->fs.sp_valid)
14292 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
14293 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14294 GEN_INT (dealloc), style,
14295 m->fs.cfa_reg == stack_pointer_rtx);
14299 /* Restore function stack, frame, and registers. */
14301 void
14302 ix86_expand_epilogue (int style)
14304 struct machine_function *m = cfun->machine;
14305 struct machine_frame_state frame_state_save = m->fs;
14306 bool restore_regs_via_mov;
14307 bool using_drap;
14308 bool restore_stub_is_tail = false;
14310 if (ix86_function_naked (current_function_decl))
14312 /* The program should not reach this point. */
14313 emit_insn (gen_ud2 ());
14314 return;
14317 ix86_finalize_stack_frame_flags ();
14318 const struct ix86_frame &frame = cfun->machine->frame;
14320 m->fs.sp_realigned = stack_realign_fp;
14321 m->fs.sp_valid = stack_realign_fp
14322 || !frame_pointer_needed
14323 || crtl->sp_is_unchanging;
14324 gcc_assert (!m->fs.sp_valid
14325 || m->fs.sp_offset == frame.stack_pointer_offset);
14327 /* The FP must be valid if the frame pointer is present. */
14328 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
14329 gcc_assert (!m->fs.fp_valid
14330 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
14332 /* We must have *some* valid pointer to the stack frame. */
14333 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
14335 /* The DRAP is never valid at this point. */
14336 gcc_assert (!m->fs.drap_valid);
14338 /* See the comment about red zone and frame
14339 pointer usage in ix86_expand_prologue. */
14340 if (frame_pointer_needed && frame.red_zone_size)
14341 emit_insn (gen_memory_blockage ());
14343 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
14344 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
14346 /* Determine the CFA offset of the end of the red-zone. */
14347 m->fs.red_zone_offset = 0;
14348 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
14350 /* The red zone begins below the return address and the error code in
14351 an exception handler. */
14352 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
14354 /* When the register save area is in the aligned portion of
14355 the stack, determine the maximum runtime displacement that
14356 matches up with the aligned frame. */
14357 if (stack_realign_drap)
14358 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
14359 + UNITS_PER_WORD);
14362 HOST_WIDE_INT reg_save_offset = frame.reg_save_offset;
14364 /* Special care must be taken for the normal return case of a function
14365 using eh_return: the eax and edx registers are marked as saved, but
14366 not restored along this path. Adjust the save location to match. */
14367 if (crtl->calls_eh_return && style != 2)
14368 reg_save_offset -= 2 * UNITS_PER_WORD;
14370 /* EH_RETURN requires the use of moves to function properly. */
14371 if (crtl->calls_eh_return)
14372 restore_regs_via_mov = true;
14373 /* SEH requires the use of pops to identify the epilogue. */
14374 else if (TARGET_SEH)
14375 restore_regs_via_mov = false;
14376 /* If we're only restoring one register and sp cannot be used, then
14377 use a move instruction to restore the register, since it's
14378 less work than reloading sp and popping the register. */
14379 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
14380 restore_regs_via_mov = true;
14381 else if (TARGET_EPILOGUE_USING_MOVE
14382 && cfun->machine->use_fast_prologue_epilogue
14383 && (frame.nregs > 1
14384 || m->fs.sp_offset != reg_save_offset))
14385 restore_regs_via_mov = true;
14386 else if (frame_pointer_needed
14387 && !frame.nregs
14388 && m->fs.sp_offset != reg_save_offset)
14389 restore_regs_via_mov = true;
14390 else if (frame_pointer_needed
14391 && TARGET_USE_LEAVE
14392 && cfun->machine->use_fast_prologue_epilogue
14393 && frame.nregs == 1)
14394 restore_regs_via_mov = true;
14395 else
14396 restore_regs_via_mov = false;
14398 if (restore_regs_via_mov || frame.nsseregs)
14400 /* Ensure that the entire register save area is addressable via
14401 the stack pointer, if we will restore SSE regs via sp. */
14402 if (TARGET_64BIT
14403 && m->fs.sp_offset > 0x7fffffff
14404 && sp_valid_at (frame.stack_realign_offset + 1)
14405 && (frame.nsseregs + frame.nregs) != 0)
14407 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14408 GEN_INT (m->fs.sp_offset
14409 - frame.sse_reg_save_offset),
14410 style,
14411 m->fs.cfa_reg == stack_pointer_rtx);
14415 /* If there are any SSE registers to restore, then we have to do it
14416 via moves, since there's obviously no pop for SSE regs. */
14417 if (frame.nsseregs)
14418 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
14419 style == 2);
14421 if (m->call_ms2sysv)
14423 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
14425 /* We cannot use a tail-call for the stub if:
14426 1. We have to pop incoming args,
14427 2. We have additional int regs to restore, or
14428 3. A sibling call will be the tail-call, or
14429 4. We are emitting an eh_return_internal epilogue.
14431 TODO: Item 4 has not yet been tested!
14433 If any of the above are true, we will call the stub rather than
14434 jump to it. */
14435 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
14436 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
14439 /* If using out-of-line stub that is a tail-call, then...*/
14440 if (m->call_ms2sysv && restore_stub_is_tail)
14442 /* TODO: paranoid tests. (remove eventually) */
14443 gcc_assert (m->fs.sp_valid);
14444 gcc_assert (!m->fs.sp_realigned);
14445 gcc_assert (!m->fs.fp_valid);
14446 gcc_assert (!m->fs.realigned);
14447 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
14448 gcc_assert (!crtl->drap_reg);
14449 gcc_assert (!frame.nregs);
14451 else if (restore_regs_via_mov)
14453 rtx t;
14455 if (frame.nregs)
14456 ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2);
14458 /* eh_return epilogues need %ecx added to the stack pointer. */
14459 if (style == 2)
14461 rtx sa = EH_RETURN_STACKADJ_RTX;
14462 rtx_insn *insn;
14464 /* %ecx can't be used for both DRAP register and eh_return. */
14465 if (crtl->drap_reg)
14466 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
14468 /* regparm nested functions don't work with eh_return. */
14469 gcc_assert (!ix86_static_chain_on_stack);
14471 if (frame_pointer_needed)
14473 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
14474 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
14475 emit_insn (gen_rtx_SET (sa, t));
14477 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
14478 insn = emit_move_insn (hard_frame_pointer_rtx, t);
14480 /* Note that we use SA as a temporary CFA, as the return
14481 address is at the proper place relative to it. We
14482 pretend this happens at the FP restore insn because
14483 prior to this insn the FP would be stored at the wrong
14484 offset relative to SA, and after this insn we have no
14485 other reasonable register to use for the CFA. We don't
14486 bother resetting the CFA to the SP for the duration of
14487 the return insn, unless the control flow instrumentation
14488 is done. In this case the SP is used later and we have
14489 to reset CFA to SP. */
14490 add_reg_note (insn, REG_CFA_DEF_CFA,
14491 plus_constant (Pmode, sa, UNITS_PER_WORD));
14492 ix86_add_queued_cfa_restore_notes (insn);
14493 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
14494 RTX_FRAME_RELATED_P (insn) = 1;
14496 m->fs.cfa_reg = sa;
14497 m->fs.cfa_offset = UNITS_PER_WORD;
14498 m->fs.fp_valid = false;
14500 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14501 const0_rtx, style,
14502 flag_cf_protection);
14504 else
14506 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14507 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14508 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14509 ix86_add_queued_cfa_restore_notes (insn);
14511 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14512 if (m->fs.cfa_offset != UNITS_PER_WORD)
14514 m->fs.cfa_offset = UNITS_PER_WORD;
14515 add_reg_note (insn, REG_CFA_DEF_CFA,
14516 plus_constant (Pmode, stack_pointer_rtx,
14517 UNITS_PER_WORD));
14518 RTX_FRAME_RELATED_P (insn) = 1;
14521 m->fs.sp_offset = UNITS_PER_WORD;
14522 m->fs.sp_valid = true;
14523 m->fs.sp_realigned = false;
14526 else
14528 /* SEH requires that the function end with (1) a stack adjustment
14529 if necessary, (2) a sequence of pops, and (3) a return or
14530 jump instruction. Prevent insns from the function body from
14531 being scheduled into this sequence. */
14532 if (TARGET_SEH)
14534 /* Prevent a catch region from being adjacent to the standard
14535 epilogue sequence. Unfortunately neither crtl->uses_eh_lsda
14536 nor several other flags that would be interesting to test are
14537 set up yet. */
14538 if (flag_non_call_exceptions)
14539 emit_insn (gen_nops (const1_rtx));
14540 else
14541 emit_insn (gen_blockage ());
14544 /* First step is to deallocate the stack frame so that we can
14545 pop the registers. If the stack pointer was realigned, it needs
14546 to be restored now. Also do it on SEH target for very large
14547 frame as the emitted instructions aren't allowed by the ABI
14548 in epilogues. */
14549 if (!m->fs.sp_valid || m->fs.sp_realigned
14550 || (TARGET_SEH
14551 && (m->fs.sp_offset - reg_save_offset
14552 >= SEH_MAX_FRAME_SIZE)))
14554 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14555 GEN_INT (m->fs.fp_offset
14556 - reg_save_offset),
14557 style, false);
14559 else if (m->fs.sp_offset != reg_save_offset)
14561 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14562 GEN_INT (m->fs.sp_offset
14563 - reg_save_offset),
14564 style,
14565 m->fs.cfa_reg == stack_pointer_rtx);
14568 ix86_emit_restore_regs_using_pop ();
14571 /* If we used a frame pointer and haven't already got rid of it,
14572 then do so now. */
14573 if (m->fs.fp_valid)
14575 /* If the stack pointer is valid and pointing at the frame
14576 pointer store address, then we only need a pop. */
14577 if (sp_valid_at (frame.hfp_save_offset)
14578 && m->fs.sp_offset == frame.hfp_save_offset)
14579 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14580 /* Using 'leave' results in shorter dependency chains on CPUs that
14581 are able to grok it fast. */
14582 else if (TARGET_USE_LEAVE
14583 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14584 || !cfun->machine->use_fast_prologue_epilogue)
14585 ix86_emit_leave (NULL);
14586 else
14588 pro_epilogue_adjust_stack (stack_pointer_rtx,
14589 hard_frame_pointer_rtx,
14590 const0_rtx, style, !using_drap);
14591 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14595 if (using_drap)
14597 int param_ptr_offset = UNITS_PER_WORD;
14598 rtx_insn *insn;
14600 gcc_assert (stack_realign_drap);
14602 if (ix86_static_chain_on_stack)
14603 param_ptr_offset += UNITS_PER_WORD;
14604 if (!call_used_regs[REGNO (crtl->drap_reg)])
14605 param_ptr_offset += UNITS_PER_WORD;
14607 insn = emit_insn (gen_rtx_SET
14608 (stack_pointer_rtx,
14609 gen_rtx_PLUS (Pmode,
14610 crtl->drap_reg,
14611 GEN_INT (-param_ptr_offset))));
14612 m->fs.cfa_reg = stack_pointer_rtx;
14613 m->fs.cfa_offset = param_ptr_offset;
14614 m->fs.sp_offset = param_ptr_offset;
14615 m->fs.realigned = false;
14617 add_reg_note (insn, REG_CFA_DEF_CFA,
14618 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14619 GEN_INT (param_ptr_offset)));
14620 RTX_FRAME_RELATED_P (insn) = 1;
14622 if (!call_used_regs[REGNO (crtl->drap_reg)])
14623 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14626 /* At this point the stack pointer must be valid, and we must have
14627 restored all of the registers. We may not have deallocated the
14628 entire stack frame. We've delayed this until now because it may
14629 be possible to merge the local stack deallocation with the
14630 deallocation forced by ix86_static_chain_on_stack. */
14631 gcc_assert (m->fs.sp_valid);
14632 gcc_assert (!m->fs.sp_realigned);
14633 gcc_assert (!m->fs.fp_valid);
14634 gcc_assert (!m->fs.realigned);
14635 if (m->fs.sp_offset != UNITS_PER_WORD)
14637 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14638 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14639 style, true);
14641 else
14642 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14644 /* Sibcall epilogues don't want a return instruction. */
14645 if (style == 0)
14647 m->fs = frame_state_save;
14648 return;
14651 if (cfun->machine->func_type != TYPE_NORMAL)
14652 emit_jump_insn (gen_interrupt_return ());
14653 else if (crtl->args.pops_args && crtl->args.size)
14655 rtx popc = GEN_INT (crtl->args.pops_args);
14657 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
14658 address, do an explicit add, and jump indirectly to the caller. */
14660 if (crtl->args.pops_args >= 65536)
14662 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14663 rtx_insn *insn;
14665 /* There is no "pascal" calling convention in any 64bit ABI. */
14666 gcc_assert (!TARGET_64BIT);
14668 insn = emit_insn (gen_pop (ecx));
14669 m->fs.cfa_offset -= UNITS_PER_WORD;
14670 m->fs.sp_offset -= UNITS_PER_WORD;
14672 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14673 x = gen_rtx_SET (stack_pointer_rtx, x);
14674 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14675 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14676 RTX_FRAME_RELATED_P (insn) = 1;
14678 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14679 popc, -1, true);
14680 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14682 else
14683 emit_jump_insn (gen_simple_return_pop_internal (popc));
14685 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14687 /* When returning from an exception handler a simple return cannot
14688 be used, as the return address will be compared with a shadow
14689 stack return address. Use an indirect jump instead. */
14690 if (style == 2 && flag_cf_protection)
14692 /* Register used in indirect jump must be in word_mode. But
14693 Pmode may not be the same as word_mode for x32. */
14694 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14695 rtx_insn *insn;
14697 insn = emit_insn (gen_pop (ecx));
14698 m->fs.cfa_offset -= UNITS_PER_WORD;
14699 m->fs.sp_offset -= UNITS_PER_WORD;
14701 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14702 x = gen_rtx_SET (stack_pointer_rtx, x);
14703 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14704 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14705 RTX_FRAME_RELATED_P (insn) = 1;
14707 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14709 else
14710 emit_jump_insn (gen_simple_return_internal ());
14713 /* Restore the state back to the state from the prologue,
14714 so that it's correct for the next epilogue. */
14715 m->fs = frame_state_save;
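/* Editorial sketch, not part of the original sources: a minimal C
   function whose frame exercises the epilogue paths above.  The names
   below are made up.  With a frame pointer in use, the epilogue
   typically ends in either a frame-pointer pop or a 'leave',
   depending on TARGET_USE_LEAVE and the fast prologue/epilogue
   heuristic; the exact instructions depend on options and tuning.  */
#if 0
extern void use_buffer (char *);

int
needs_frame (void)
{
  char buf[64];			/* forces a non-trivial stack frame */
  use_buffer (buf);
  return 0;			/* frame is deallocated in the epilogue */
}
#endif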
14718 /* Reset from the function's potential modifications. */
14720 static void
14721 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14723 if (pic_offset_table_rtx
14724 && !ix86_use_pseudo_pic_reg ())
14725 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14727 if (TARGET_MACHO)
14729 rtx_insn *insn = get_last_insn ();
14730 rtx_insn *deleted_debug_label = NULL;
14732 /* Mach-O doesn't support labels at the end of objects, so if
14733 it looks like we might want one, take special action.
14734 First, collect any sequence of deleted debug labels. */
14735 while (insn
14736 && NOTE_P (insn)
14737 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14739 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
14740 a nop; instead set their CODE_LABEL_NUMBER to -1, otherwise
14741 there would be code generation differences
14742 between -g and -g0. */
14743 if (NOTE_P (insn) && NOTE_KIND (insn)
14744 == NOTE_INSN_DELETED_DEBUG_LABEL)
14745 deleted_debug_label = insn;
14746 insn = PREV_INSN (insn);
14749 /* If we have:
14750 label:
14751 barrier
14752 then this needs to be detected, so skip past the barrier. */
14754 if (insn && BARRIER_P (insn))
14755 insn = PREV_INSN (insn);
14757 /* Up to now we've only seen notes or barriers. */
14758 if (insn)
14760 if (LABEL_P (insn)
14761 || (NOTE_P (insn)
14762 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14763 /* Trailing label. */
14764 fputs ("\tnop\n", file);
14765 else if (cfun && ! cfun->is_thunk)
14767 /* See if we have a completely empty function body, skipping
14768 the special case of the picbase thunk emitted as asm. */
14769 while (insn && ! INSN_P (insn))
14770 insn = PREV_INSN (insn);
14771 /* If we don't find any insns, we've got an empty function body;
14772 i.e. completely empty - without a return or branch. This is
14773 taken as the case where a function body has been removed
14774 because it contains an inline __builtin_unreachable(). GCC
14775 declares that reaching __builtin_unreachable() means UB so
14776 we're not obliged to do anything special; however, we want
14777 non-zero-sized function bodies. To meet this, and help the
14778 user out, let's trap the case. */
14779 if (insn == NULL)
14780 fputs ("\tud2\n", file);
14783 else if (deleted_debug_label)
14784 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14785 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14786 CODE_LABEL_NUMBER (insn) = -1;
14790 /* Return a scratch register to use in the split stack prologue. The
14791 split stack prologue is used for -fsplit-stack. It is the first
14792 instructions in the function, even before the regular prologue.
14793 The scratch register can be any caller-saved register which is not
14794 used for parameters or for the static chain. */
14796 static unsigned int
14797 split_stack_prologue_scratch_regno (void)
14799 if (TARGET_64BIT)
14800 return R11_REG;
14801 else
14803 bool is_fastcall, is_thiscall;
14804 int regparm;
14806 is_fastcall = (lookup_attribute ("fastcall",
14807 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14808 != NULL);
14809 is_thiscall = (lookup_attribute ("thiscall",
14810 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14811 != NULL);
14812 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14814 if (is_fastcall)
14816 if (DECL_STATIC_CHAIN (cfun->decl))
14818 sorry ("-fsplit-stack does not support fastcall with "
14819 "nested function");
14820 return INVALID_REGNUM;
14822 return AX_REG;
14824 else if (is_thiscall)
14826 if (!DECL_STATIC_CHAIN (cfun->decl))
14827 return DX_REG;
14828 return AX_REG;
14830 else if (regparm < 3)
14832 if (!DECL_STATIC_CHAIN (cfun->decl))
14833 return CX_REG;
14834 else
14836 if (regparm >= 2)
14838 sorry ("-fsplit-stack does not support 2 register "
14839 "parameters for a nested function");
14840 return INVALID_REGNUM;
14842 return DX_REG;
14845 else
14847 /* FIXME: We could make this work by pushing a register
14848 around the addition and comparison. */
14849 sorry ("-fsplit-stack does not support 3 register parameters");
14850 return INVALID_REGNUM;
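/* Editorial sketch, not part of the original sources: 32-bit calling
   conventions that constrain the scratch register chosen above when
   compiling with -fsplit-stack.  The attribute spellings are the
   standard GCC ones; the function names are made up.  */
#if 0
/* Default convention, no register parameters: %ecx is normally free.  */
int plain_fn (int a, int b);

/* fastcall passes arguments in %ecx/%edx, leaving only %eax.  */
__attribute__ ((fastcall)) int fast_fn (int a, int b);

/* regparm(3) uses %eax, %ecx and %edx, so no scratch register is
   left and the function above reports a 'sorry'.  */
__attribute__ ((regparm (3))) int reg3_fn (int a, int b, int c);
#endif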
14855 /* A SYMBOL_REF for the function which allocates new stack space for
14856 -fsplit-stack. */
14858 static GTY(()) rtx split_stack_fn;
14860 /* A SYMBOL_REF for the __morestack function when using the large
14861 model. */
14863 static GTY(()) rtx split_stack_fn_large;
14865 /* Return location of the stack guard value in the TLS block. */
14867 static rtx
14868 ix86_split_stack_guard (void)
14870 int offset;
14871 addr_space_t as = DEFAULT_TLS_SEG_REG;
14872 rtx r;
14874 gcc_assert (flag_split_stack);
14876 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14877 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14878 #else
14879 gcc_unreachable ();
14880 #endif
14882 r = GEN_INT (offset);
14883 r = gen_const_mem (Pmode, r);
14884 set_mem_addr_space (r, as);
14886 return r;
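/* Editorial sketch, not part of the original sources: the check the
   split-stack prologue builds from the guard above, written as
   pseudo-C.  'guard' stands for the TLS slot returned by
   ix86_split_stack_guard; for small frames the stack pointer is
   compared directly, for larger ones a scratch register holds
   sp - frame_size.

	if ((unsigned) (sp - frame_size) >= guard)
	  goto enough_stack;	// the common, very likely path
	call __morestack;	// allocate a new stack segment
      enough_stack:
	... function body ...					*/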
14889 /* Handle -fsplit-stack. These are the first instructions in the
14890 function, even before the regular prologue. */
14892 void
14893 ix86_expand_split_stack_prologue (void)
14895 HOST_WIDE_INT allocate;
14896 unsigned HOST_WIDE_INT args_size;
14897 rtx_code_label *label;
14898 rtx limit, current, allocate_rtx, call_fusage;
14899 rtx_insn *call_insn;
14900 rtx scratch_reg = NULL_RTX;
14901 rtx_code_label *varargs_label = NULL;
14902 rtx fn;
14904 gcc_assert (flag_split_stack && reload_completed);
14906 ix86_finalize_stack_frame_flags ();
14907 struct ix86_frame &frame = cfun->machine->frame;
14908 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14910 /* This is the label we will branch to if we have enough stack
14911 space. We expect the basic block reordering pass to reverse this
14912 branch if optimizing, so that we branch in the unlikely case. */
14913 label = gen_label_rtx ();
14915 /* We need to compare the stack pointer minus the frame size with
14916 the stack boundary in the TCB. The stack boundary always gives
14917 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14918 can compare directly. Otherwise we need to do an addition. */
14920 limit = ix86_split_stack_guard ();
14922 if (allocate < SPLIT_STACK_AVAILABLE)
14923 current = stack_pointer_rtx;
14924 else
14926 unsigned int scratch_regno;
14927 rtx offset;
14929 /* We need a scratch register to hold the stack pointer minus
14930 the required frame size. Since this is the very start of the
14931 function, the scratch register can be any caller-saved
14932 register which is not used for parameters. */
14933 offset = GEN_INT (- allocate);
14934 scratch_regno = split_stack_prologue_scratch_regno ();
14935 if (scratch_regno == INVALID_REGNUM)
14936 return;
14937 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14938 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14940 /* We don't use ix86_gen_add3 in this case because it will
14941 want to split to lea, but when not optimizing the insn
14942 will not be split after this point. */
14943 emit_insn (gen_rtx_SET (scratch_reg,
14944 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14945 offset)));
14947 else
14949 emit_move_insn (scratch_reg, offset);
14950 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14951 stack_pointer_rtx));
14953 current = scratch_reg;
14956 ix86_expand_branch (GEU, current, limit, label);
14957 rtx_insn *jump_insn = get_last_insn ();
14958 JUMP_LABEL (jump_insn) = label;
14960 /* Mark the jump as very likely to be taken. */
14961 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
14963 if (split_stack_fn == NULL_RTX)
14965 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14966 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14968 fn = split_stack_fn;
14970 /* Get more stack space. We pass in the desired stack space and the
14971 size of the arguments to copy to the new stack. In 32-bit mode
14972 we push the parameters; __morestack will return on a new stack
14973 anyhow. In 64-bit mode we pass the parameters in r10 and
14974 r11. */
14975 allocate_rtx = GEN_INT (allocate);
14976 args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
14977 call_fusage = NULL_RTX;
14978 rtx pop = NULL_RTX;
14979 if (TARGET_64BIT)
14981 rtx reg10, reg11;
14983 reg10 = gen_rtx_REG (Pmode, R10_REG);
14984 reg11 = gen_rtx_REG (Pmode, R11_REG);
14986 /* If this function uses a static chain, it will be in %r10.
14987 Preserve it across the call to __morestack. */
14988 if (DECL_STATIC_CHAIN (cfun->decl))
14990 rtx rax;
14992 rax = gen_rtx_REG (word_mode, AX_REG);
14993 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14994 use_reg (&call_fusage, rax);
14997 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14998 && !TARGET_PECOFF)
15000 HOST_WIDE_INT argval;
15002 gcc_assert (Pmode == DImode);
15003 /* When using the large model we need to load the address
15004 into a register, and we've run out of registers. So we
15005 switch to a different calling convention, and we call a
15006 different function: __morestack_large. We pass the
15007 argument size in the upper 32 bits of r10 and pass the
15008 frame size in the lower 32 bits. */
15009 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
15010 gcc_assert ((args_size & 0xffffffff) == args_size);
15012 if (split_stack_fn_large == NULL_RTX)
15014 split_stack_fn_large =
15015 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
15016 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
15018 if (ix86_cmodel == CM_LARGE_PIC)
15020 rtx_code_label *label;
15021 rtx x;
15023 label = gen_label_rtx ();
15024 emit_label (label);
15025 LABEL_PRESERVE_P (label) = 1;
15026 emit_insn (gen_set_rip_rex64 (reg10, label));
15027 emit_insn (gen_set_got_offset_rex64 (reg11, label));
15028 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
15029 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
15030 UNSPEC_GOT);
15031 x = gen_rtx_CONST (Pmode, x);
15032 emit_move_insn (reg11, x);
15033 x = gen_rtx_PLUS (Pmode, reg10, reg11);
15034 x = gen_const_mem (Pmode, x);
15035 emit_move_insn (reg11, x);
15037 else
15038 emit_move_insn (reg11, split_stack_fn_large);
15040 fn = reg11;
15042 argval = ((args_size << 16) << 16) + allocate;
15043 emit_move_insn (reg10, GEN_INT (argval));
15045 else
15047 emit_move_insn (reg10, allocate_rtx);
15048 emit_move_insn (reg11, GEN_INT (args_size));
15049 use_reg (&call_fusage, reg11);
15052 use_reg (&call_fusage, reg10);
15054 else
15056 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
15057 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
15058 insn = emit_insn (gen_push (allocate_rtx));
15059 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
15060 pop = GEN_INT (2 * UNITS_PER_WORD);
15062 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
15063 GEN_INT (UNITS_PER_WORD), constm1_rtx,
15064 pop, false);
15065 add_function_usage_to (call_insn, call_fusage);
15066 if (!TARGET_64BIT)
15067 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
15068 /* Indicate that this function can't jump to non-local gotos. */
15069 make_reg_eh_region_note_nothrow_nononlocal (call_insn);
15071 /* In order to make call/return prediction work right, we now need
15072 to execute a return instruction. See
15073 libgcc/config/i386/morestack.S for the details on how this works.
15075 For flow purposes gcc must not see this as a return
15076 instruction--we need control flow to continue at the subsequent
15077 label. Therefore, we use an unspec. */
15078 gcc_assert (crtl->args.pops_args < 65536);
15079 rtx_insn *ret_insn
15080 = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
15082 if ((flag_cf_protection & CF_BRANCH))
15084 /* Insert ENDBR since __morestack will jump back here via indirect
15085 call. */
15086 rtx cet_eb = gen_nop_endbr ();
15087 emit_insn_after (cet_eb, ret_insn);
15090 /* If we are in 64-bit mode and this function uses a static chain,
15091 we saved %r10 in %rax before calling __morestack. */
15092 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
15093 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
15094 gen_rtx_REG (word_mode, AX_REG));
15096 /* If this function calls va_start, we need to store a pointer to
15097 the arguments on the old stack, because they may not have been
15098 all copied to the new stack. At this point the old stack can be
15099 found at the frame pointer value used by __morestack, because
15100 __morestack has set that up before calling back to us. Here we
15101 store that pointer in a scratch register, and in
15102 ix86_expand_prologue we store the scratch register in a stack
15103 slot. */
15104 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15106 unsigned int scratch_regno;
15107 rtx frame_reg;
15108 int words;
15110 scratch_regno = split_stack_prologue_scratch_regno ();
15111 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
15112 frame_reg = gen_rtx_REG (Pmode, BP_REG);
15114 /* 64-bit:
15115 fp -> old fp value
15116 return address within this function
15117 return address of caller of this function
15118 stack arguments
15119 So we add three words to get to the stack arguments.
15121 32-bit:
15122 fp -> old fp value
15123 return address within this function
15124 first argument to __morestack
15125 second argument to __morestack
15126 return address of caller of this function
15127 stack arguments
15128 So we add five words to get to the stack arguments.
15130 words = TARGET_64BIT ? 3 : 5;
15131 emit_insn (gen_rtx_SET (scratch_reg,
15132 gen_rtx_PLUS (Pmode, frame_reg,
15133 GEN_INT (words * UNITS_PER_WORD))));
15135 varargs_label = gen_label_rtx ();
15136 emit_jump_insn (gen_jump (varargs_label));
15137 JUMP_LABEL (get_last_insn ()) = varargs_label;
15139 emit_barrier ();
15142 emit_label (label);
15143 LABEL_NUSES (label) = 1;
15145 /* If this function calls va_start, we now have to set the scratch
15146 register for the case where we do not call __morestack. In this
15147 case we need to set it based on the stack pointer. */
15148 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15150 emit_insn (gen_rtx_SET (scratch_reg,
15151 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
15152 GEN_INT (UNITS_PER_WORD))));
15154 emit_label (varargs_label);
15155 LABEL_NUSES (varargs_label) = 1;
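/* Editorial sketch, not part of the original sources: a variadic
   function of the kind the split_stack_varargs_pointer handling above
   exists for.  When built with -fsplit-stack, va_start must see the
   caller's arguments on the old stack even if __morestack switched to
   a new segment.  The name is made up.  */
#if 0
#include <stdarg.h>

int
sum_ints (int count, ...)
{
  va_list ap;
  int i, sum = 0;

  va_start (ap, count);		/* relies on the saved old-stack pointer */
  for (i = 0; i < count; i++)
    sum += va_arg (ap, int);
  va_end (ap);
  return sum;
}
#endif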
15159 /* We may have to tell the dataflow pass that the split stack prologue
15160 is initializing a scratch register. */
15162 static void
15163 ix86_live_on_entry (bitmap regs)
15165 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
15167 gcc_assert (flag_split_stack);
15168 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
15172 /* Extract the parts of an RTL expression that is a valid memory address
15173 for an instruction. Return 0 if the structure of the address is
15174 grossly off. Return -1 if the address contains ASHIFT, so it is not
15175 strictly valid, but still used for computing the length of an lea instruction. */
15177 int
15178 ix86_decompose_address (rtx addr, struct ix86_address *out)
15180 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
15181 rtx base_reg, index_reg;
15182 HOST_WIDE_INT scale = 1;
15183 rtx scale_rtx = NULL_RTX;
15184 rtx tmp;
15185 int retval = 1;
15186 addr_space_t seg = ADDR_SPACE_GENERIC;
15188 /* Allow zero-extended SImode addresses;
15189 they will be emitted with the addr32 prefix. */
15190 if (TARGET_64BIT && GET_MODE (addr) == DImode)
15192 if (GET_CODE (addr) == ZERO_EXTEND
15193 && GET_MODE (XEXP (addr, 0)) == SImode)
15195 addr = XEXP (addr, 0);
15196 if (CONST_INT_P (addr))
15197 return 0;
15199 else if (GET_CODE (addr) == AND
15200 && const_32bit_mask (XEXP (addr, 1), DImode))
15202 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
15203 if (addr == NULL_RTX)
15204 return 0;
15206 if (CONST_INT_P (addr))
15207 return 0;
15211 /* Allow SImode subregs of DImode addresses;
15212 they will be emitted with the addr32 prefix. */
15213 if (TARGET_64BIT && GET_MODE (addr) == SImode)
15215 if (SUBREG_P (addr)
15216 && GET_MODE (SUBREG_REG (addr)) == DImode)
15218 addr = SUBREG_REG (addr);
15219 if (CONST_INT_P (addr))
15220 return 0;
15224 if (REG_P (addr))
15225 base = addr;
15226 else if (SUBREG_P (addr))
15228 if (REG_P (SUBREG_REG (addr)))
15229 base = addr;
15230 else
15231 return 0;
15233 else if (GET_CODE (addr) == PLUS)
15235 rtx addends[4], op;
15236 int n = 0, i;
15238 op = addr;
15241 if (n >= 4)
15242 return 0;
15243 addends[n++] = XEXP (op, 1);
15244 op = XEXP (op, 0);
15246 while (GET_CODE (op) == PLUS);
15247 if (n >= 4)
15248 return 0;
15249 addends[n] = op;
15251 for (i = n; i >= 0; --i)
15253 op = addends[i];
15254 switch (GET_CODE (op))
15256 case MULT:
15257 if (index)
15258 return 0;
15259 index = XEXP (op, 0);
15260 scale_rtx = XEXP (op, 1);
15261 break;
15263 case ASHIFT:
15264 if (index)
15265 return 0;
15266 index = XEXP (op, 0);
15267 tmp = XEXP (op, 1);
15268 if (!CONST_INT_P (tmp))
15269 return 0;
15270 scale = INTVAL (tmp);
15271 if ((unsigned HOST_WIDE_INT) scale > 3)
15272 return 0;
15273 scale = 1 << scale;
15274 break;
15276 case ZERO_EXTEND:
15277 op = XEXP (op, 0);
15278 if (GET_CODE (op) != UNSPEC)
15279 return 0;
15280 /* FALLTHRU */
15282 case UNSPEC:
15283 if (XINT (op, 1) == UNSPEC_TP
15284 && TARGET_TLS_DIRECT_SEG_REFS
15285 && seg == ADDR_SPACE_GENERIC)
15286 seg = DEFAULT_TLS_SEG_REG;
15287 else
15288 return 0;
15289 break;
15291 case SUBREG:
15292 if (!REG_P (SUBREG_REG (op)))
15293 return 0;
15294 /* FALLTHRU */
15296 case REG:
15297 if (!base)
15298 base = op;
15299 else if (!index)
15300 index = op;
15301 else
15302 return 0;
15303 break;
15305 case CONST:
15306 case CONST_INT:
15307 case SYMBOL_REF:
15308 case LABEL_REF:
15309 if (disp)
15310 return 0;
15311 disp = op;
15312 break;
15314 default:
15315 return 0;
15319 else if (GET_CODE (addr) == MULT)
15321 index = XEXP (addr, 0); /* index*scale */
15322 scale_rtx = XEXP (addr, 1);
15324 else if (GET_CODE (addr) == ASHIFT)
15326 /* We're called for lea too, which implements ashift on occasion. */
15327 index = XEXP (addr, 0);
15328 tmp = XEXP (addr, 1);
15329 if (!CONST_INT_P (tmp))
15330 return 0;
15331 scale = INTVAL (tmp);
15332 if ((unsigned HOST_WIDE_INT) scale > 3)
15333 return 0;
15334 scale = 1 << scale;
15335 retval = -1;
15337 else
15338 disp = addr; /* displacement */
15340 if (index)
15342 if (REG_P (index))
15344 else if (SUBREG_P (index)
15345 && REG_P (SUBREG_REG (index)))
15347 else
15348 return 0;
15351 /* Extract the integral value of scale. */
15352 if (scale_rtx)
15354 if (!CONST_INT_P (scale_rtx))
15355 return 0;
15356 scale = INTVAL (scale_rtx);
15359 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
15360 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
15362 /* Avoid useless 0 displacement. */
15363 if (disp == const0_rtx && (base || index))
15364 disp = NULL_RTX;
15366 /* Allow arg pointer and stack pointer as index if there is no scaling. */
15367 if (base_reg && index_reg && scale == 1
15368 && (REGNO (index_reg) == ARG_POINTER_REGNUM
15369 || REGNO (index_reg) == FRAME_POINTER_REGNUM
15370 || REGNO (index_reg) == SP_REG))
15372 std::swap (base, index);
15373 std::swap (base_reg, index_reg);
15376 /* Special case: %ebp cannot be encoded as a base without a displacement.
15377 Similarly %r13. */
15378 if (!disp && base_reg
15379 && (REGNO (base_reg) == ARG_POINTER_REGNUM
15380 || REGNO (base_reg) == FRAME_POINTER_REGNUM
15381 || REGNO (base_reg) == BP_REG
15382 || REGNO (base_reg) == R13_REG))
15383 disp = const0_rtx;
15385 /* Special case: on K6, [%esi] causes the instruction to be vector
15386 decoded. Avoid this by transforming it to [%esi+0].
15387 Reload calls address legitimization without cfun defined, so we need
15388 to test cfun for being non-NULL. */
15389 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
15390 && base_reg && !index_reg && !disp
15391 && REGNO (base_reg) == SI_REG)
15392 disp = const0_rtx;
15394 /* Special case: encode reg+reg instead of reg*2. */
15395 if (!base && index && scale == 2)
15396 base = index, base_reg = index_reg, scale = 1;
15398 /* Special case: scaling cannot be encoded without base or displacement. */
15399 if (!base && !disp && index && scale != 1)
15400 disp = const0_rtx;
15402 out->base = base;
15403 out->index = index;
15404 out->disp = disp;
15405 out->scale = scale;
15406 out->seg = seg;
15408 return retval;
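/* Editorial sketch, not part of the original sources: what the
   decomposition above produces for a typical scaled access.  For

	int get_elem (int *p, long i) { return p[i]; }

   the address is roughly (plus (reg p) (mult (reg i) (const_int 4)))
   and decomposes into base = p's register, index = i's register,
   scale = 4, no displacement, generic address space.  A bare
   (ashift (reg) (const_int 2)) address, as lea operands sometimes
   are, yields the same index and scale but a return value of -1.  */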
15411 /* Return cost of the memory address x.
15412 For i386, it is better to use a complex address than let gcc copy
15413 the address into a reg and make a new pseudo. But not if the address
15414 requires two regs - that would mean more pseudos with longer
15415 lifetimes. */
15416 static int
15417 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
15419 struct ix86_address parts;
15420 int cost = 1;
15421 int ok = ix86_decompose_address (x, &parts);
15423 gcc_assert (ok);
15425 if (parts.base && SUBREG_P (parts.base))
15426 parts.base = SUBREG_REG (parts.base);
15427 if (parts.index && SUBREG_P (parts.index))
15428 parts.index = SUBREG_REG (parts.index);
15430 /* Attempt to minimize number of registers in the address by increasing
15431 address cost for each used register. We don't increase address cost
15432 for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
15433 is not invariant itself it most likely means that base or index is not
15434 invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
15435 which is not profitable for x86. */
15436 if (parts.base
15437 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
15438 && (current_pass->type == GIMPLE_PASS
15439 || !pic_offset_table_rtx
15440 || !REG_P (parts.base)
15441 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
15442 cost++;
15444 if (parts.index
15445 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
15446 && (current_pass->type == GIMPLE_PASS
15447 || !pic_offset_table_rtx
15448 || !REG_P (parts.index)
15449 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
15450 cost++;
15452 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
15453 since its predecode logic can't detect the length of instructions
15454 and the instruction degenerates to vector decoding. Increase the cost
15455 of such addresses here. The penalty is at least 2 cycles. It may be worthwhile
15456 to split such addresses or even refuse such addresses at all.
15458 Following addressing modes are affected:
15459 [base+scale*index]
15460 [scale*index+disp]
15461 [base+index]
15463 The first and last case may be avoidable by explicitly coding the zero in
15464 the memory address, but I don't have an AMD-K6 machine handy to check this
15465 theory. */
15467 if (TARGET_K6
15468 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
15469 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
15470 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
15471 cost += 10;
15473 return cost;
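/* Editorial sketch, not part of the original sources: how the cost
   above ranks a few addresses before register allocation.  Starting
   from 1, roughly each pseudo register used as base or index (the PIC
   register excepted) adds 1:

	(reg P1)                          -> 2
	(plus (reg P1) (const_int 8))     -> 2
	(plus (reg P1) (reg P2))          -> 3

   plus the K6-only penalty of 10 for the ModR/M patterns listed
   above.  */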
15476 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
15477 this is used to form addresses to local data when -fPIC is in
15478 use. */
15480 static bool
15481 darwin_local_data_pic (rtx disp)
15483 return (GET_CODE (disp) == UNSPEC
15484 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
15487 /* True if operand X should be loaded from GOT. */
15489 bool
15490 ix86_force_load_from_GOT_p (rtx x)
15492 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
15493 && !TARGET_PECOFF && !TARGET_MACHO
15494 && !flag_plt && !flag_pic
15495 && ix86_cmodel != CM_LARGE
15496 && GET_CODE (x) == SYMBOL_REF
15497 && SYMBOL_REF_FUNCTION_P (x)
15498 && !SYMBOL_REF_LOCAL_P (x));
15501 /* Determine if a given RTX is a valid constant. We already know this
15502 satisfies CONSTANT_P. */
15504 static bool
15505 ix86_legitimate_constant_p (machine_mode mode, rtx x)
15507 /* Pointer bounds constants are not valid. */
15508 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15509 return false;
15511 switch (GET_CODE (x))
15513 case CONST:
15514 x = XEXP (x, 0);
15516 if (GET_CODE (x) == PLUS)
15518 if (!CONST_INT_P (XEXP (x, 1)))
15519 return false;
15520 x = XEXP (x, 0);
15523 if (TARGET_MACHO && darwin_local_data_pic (x))
15524 return true;
15526 /* Only some unspecs are valid as "constants". */
15527 if (GET_CODE (x) == UNSPEC)
15528 switch (XINT (x, 1))
15530 case UNSPEC_GOT:
15531 case UNSPEC_GOTOFF:
15532 case UNSPEC_PLTOFF:
15533 return TARGET_64BIT;
15534 case UNSPEC_TPOFF:
15535 case UNSPEC_NTPOFF:
15536 x = XVECEXP (x, 0, 0);
15537 return (GET_CODE (x) == SYMBOL_REF
15538 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15539 case UNSPEC_DTPOFF:
15540 x = XVECEXP (x, 0, 0);
15541 return (GET_CODE (x) == SYMBOL_REF
15542 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15543 default:
15544 return false;
15547 /* We must have drilled down to a symbol. */
15548 if (GET_CODE (x) == LABEL_REF)
15549 return true;
15550 if (GET_CODE (x) != SYMBOL_REF)
15551 return false;
15552 /* FALLTHRU */
15554 case SYMBOL_REF:
15555 /* TLS symbols are never valid. */
15556 if (SYMBOL_REF_TLS_MODEL (x))
15557 return false;
15559 /* DLLIMPORT symbols are never valid. */
15560 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15561 && SYMBOL_REF_DLLIMPORT_P (x))
15562 return false;
15564 #if TARGET_MACHO
15565 /* mdynamic-no-pic */
15566 if (MACHO_DYNAMIC_NO_PIC_P)
15567 return machopic_symbol_defined_p (x);
15568 #endif
15570 /* External function address should be loaded
15571 via the GOT slot to avoid PLT. */
15572 if (ix86_force_load_from_GOT_p (x))
15573 return false;
15575 break;
15577 CASE_CONST_SCALAR_INT:
15578 switch (mode)
15580 case E_TImode:
15581 if (TARGET_64BIT)
15582 return true;
15583 /* FALLTHRU */
15584 case E_OImode:
15585 case E_XImode:
15586 if (!standard_sse_constant_p (x, mode))
15587 return false;
15588 default:
15589 break;
15591 break;
15593 case CONST_VECTOR:
15594 if (!standard_sse_constant_p (x, mode))
15595 return false;
15597 default:
15598 break;
15601 /* Otherwise we handle everything else in the move patterns. */
15602 return true;
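/* Editorial sketch, not part of the original sources: source-level
   cases the checks above distinguish.  The address of an ordinary
   global is a legitimate constant, while the address of a TLS
   variable is not and has to be computed at run time:

	int g;
	__thread int t;

	int *pg = &g;			// constant address, accepted above
	int *pt (void) { return &t; }	// SYMBOL_REF_TLS_MODEL != 0,
					// rejected above and handled by
					// legitimize_tls_address	*/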
15605 /* Determine if it's legal to put X into the constant pool. This
15606 is not possible for the address of thread-local symbols, which
15607 is checked above. */
15609 static bool
15610 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15612 /* We can put any immediate constant in memory. */
15613 switch (GET_CODE (x))
15615 CASE_CONST_ANY:
15616 return false;
15618 default:
15619 break;
15622 return !ix86_legitimate_constant_p (mode, x);
15625 /* Return true if the symbol is marked as dllimport or as a stub
15626 variable, otherwise false. */
15628 static bool
15629 is_imported_p (rtx x)
15631 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15632 || GET_CODE (x) != SYMBOL_REF)
15633 return false;
15635 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15639 /* Nonzero if the constant value X is a legitimate general operand
15640 when generating PIC code. It is given that flag_pic is on and
15641 that X satisfies CONSTANT_P. */
15643 bool
15644 legitimate_pic_operand_p (rtx x)
15646 rtx inner;
15648 switch (GET_CODE (x))
15650 case CONST:
15651 inner = XEXP (x, 0);
15652 if (GET_CODE (inner) == PLUS
15653 && CONST_INT_P (XEXP (inner, 1)))
15654 inner = XEXP (inner, 0);
15656 /* Only some unspecs are valid as "constants". */
15657 if (GET_CODE (inner) == UNSPEC)
15658 switch (XINT (inner, 1))
15660 case UNSPEC_GOT:
15661 case UNSPEC_GOTOFF:
15662 case UNSPEC_PLTOFF:
15663 return TARGET_64BIT;
15664 case UNSPEC_TPOFF:
15665 x = XVECEXP (inner, 0, 0);
15666 return (GET_CODE (x) == SYMBOL_REF
15667 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15668 case UNSPEC_MACHOPIC_OFFSET:
15669 return legitimate_pic_address_disp_p (x);
15670 default:
15671 return false;
15673 /* FALLTHRU */
15675 case SYMBOL_REF:
15676 case LABEL_REF:
15677 return legitimate_pic_address_disp_p (x);
15679 default:
15680 return true;
15684 /* Determine if a given CONST RTX is a valid memory displacement
15685 in PIC mode. */
15687 bool
15688 legitimate_pic_address_disp_p (rtx disp)
15690 bool saw_plus;
15692 /* In 64bit mode we can allow direct addresses of symbols and labels
15693 when they are not dynamic symbols. */
15694 if (TARGET_64BIT)
15696 rtx op0 = disp, op1;
15698 switch (GET_CODE (disp))
15700 case LABEL_REF:
15701 return true;
15703 case CONST:
15704 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15705 break;
15706 op0 = XEXP (XEXP (disp, 0), 0);
15707 op1 = XEXP (XEXP (disp, 0), 1);
15708 if (!CONST_INT_P (op1))
15709 break;
15710 if (GET_CODE (op0) == UNSPEC
15711 && (XINT (op0, 1) == UNSPEC_DTPOFF
15712 || XINT (op0, 1) == UNSPEC_NTPOFF)
15713 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15714 return true;
15715 if (INTVAL (op1) >= 16*1024*1024
15716 || INTVAL (op1) < -16*1024*1024)
15717 break;
15718 if (GET_CODE (op0) == LABEL_REF)
15719 return true;
15720 if (GET_CODE (op0) == CONST
15721 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15722 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15723 return true;
15724 if (GET_CODE (op0) == UNSPEC
15725 && XINT (op0, 1) == UNSPEC_PCREL)
15726 return true;
15727 if (GET_CODE (op0) != SYMBOL_REF)
15728 break;
15729 /* FALLTHRU */
15731 case SYMBOL_REF:
15732 /* TLS references should always be enclosed in UNSPEC.
15733 A dllimported symbol always needs to be resolved. */
15734 if (SYMBOL_REF_TLS_MODEL (op0)
15735 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15736 return false;
15738 if (TARGET_PECOFF)
15740 if (is_imported_p (op0))
15741 return true;
15743 if (SYMBOL_REF_FAR_ADDR_P (op0)
15744 || !SYMBOL_REF_LOCAL_P (op0))
15745 break;
15747 /* Function symbols need to be resolved only for
15748 the large model.
15749 For the small model we don't need to resolve anything
15750 here. */
15751 if ((ix86_cmodel != CM_LARGE_PIC
15752 && SYMBOL_REF_FUNCTION_P (op0))
15753 || ix86_cmodel == CM_SMALL_PIC)
15754 return true;
15755 /* Non-external symbols don't need to be resolved for
15756 the large and medium models. */
15757 if ((ix86_cmodel == CM_LARGE_PIC
15758 || ix86_cmodel == CM_MEDIUM_PIC)
15759 && !SYMBOL_REF_EXTERNAL_P (op0))
15760 return true;
15762 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15763 && (SYMBOL_REF_LOCAL_P (op0)
15764 || (HAVE_LD_PIE_COPYRELOC
15765 && flag_pie
15766 && !SYMBOL_REF_WEAK (op0)
15767 && !SYMBOL_REF_FUNCTION_P (op0)))
15768 && ix86_cmodel != CM_LARGE_PIC)
15769 return true;
15770 break;
15772 default:
15773 break;
15776 if (GET_CODE (disp) != CONST)
15777 return false;
15778 disp = XEXP (disp, 0);
15780 if (TARGET_64BIT)
15782 /* It is unsafe to allow PLUS expressions; this limits the allowed
15783 distance of GOT references. We should not need these anyway. */
15784 if (GET_CODE (disp) != UNSPEC
15785 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15786 && XINT (disp, 1) != UNSPEC_GOTOFF
15787 && XINT (disp, 1) != UNSPEC_PCREL
15788 && XINT (disp, 1) != UNSPEC_PLTOFF))
15789 return false;
15791 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15792 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15793 return false;
15794 return true;
15797 saw_plus = false;
15798 if (GET_CODE (disp) == PLUS)
15800 if (!CONST_INT_P (XEXP (disp, 1)))
15801 return false;
15802 disp = XEXP (disp, 0);
15803 saw_plus = true;
15806 if (TARGET_MACHO && darwin_local_data_pic (disp))
15807 return true;
15809 if (GET_CODE (disp) != UNSPEC)
15810 return false;
15812 switch (XINT (disp, 1))
15814 case UNSPEC_GOT:
15815 if (saw_plus)
15816 return false;
15817 /* We need to check for both symbols and labels because VxWorks loads
15818 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15819 details. */
15820 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15821 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15822 case UNSPEC_GOTOFF:
15823 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15824 While the ABI also specifies a 32bit relocation, we don't produce it in
15825 the small PIC model at all. */
15826 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15827 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15828 && !TARGET_64BIT)
15829 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15830 return false;
15831 case UNSPEC_GOTTPOFF:
15832 case UNSPEC_GOTNTPOFF:
15833 case UNSPEC_INDNTPOFF:
15834 if (saw_plus)
15835 return false;
15836 disp = XVECEXP (disp, 0, 0);
15837 return (GET_CODE (disp) == SYMBOL_REF
15838 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15839 case UNSPEC_NTPOFF:
15840 disp = XVECEXP (disp, 0, 0);
15841 return (GET_CODE (disp) == SYMBOL_REF
15842 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15843 case UNSPEC_DTPOFF:
15844 disp = XVECEXP (disp, 0, 0);
15845 return (GET_CODE (disp) == SYMBOL_REF
15846 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15849 return false;
15852 /* Determine if op is a suitable RTX for an address register.
15853 Return the naked register if a register or a register subreg is
15854 found, otherwise return NULL_RTX. */
15856 static rtx
15857 ix86_validate_address_register (rtx op)
15859 machine_mode mode = GET_MODE (op);
15861 /* Only SImode or DImode registers can form the address. */
15862 if (mode != SImode && mode != DImode)
15863 return NULL_RTX;
15865 if (REG_P (op))
15866 return op;
15867 else if (SUBREG_P (op))
15869 rtx reg = SUBREG_REG (op);
15871 if (!REG_P (reg))
15872 return NULL_RTX;
15874 mode = GET_MODE (reg);
15876 /* Don't allow SUBREGs that span more than a word. It can
15877 lead to spill failures when the register is one word out
15878 of a two word structure. */
15879 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15880 return NULL_RTX;
15882 /* Allow only SUBREGs of non-eliminable hard registers. */
15883 if (register_no_elim_operand (reg, mode))
15884 return reg;
15887 /* Op is not a register. */
15888 return NULL_RTX;
15891 /* Recognizes RTL expressions that are valid memory addresses for an
15892 instruction. The MODE argument is the machine mode for the MEM
15893 expression that wants to use this address.
15895 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
15896 convert common non-canonical forms to canonical form so that they will
15897 be recognized. */
15899 static bool
15900 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15902 struct ix86_address parts;
15903 rtx base, index, disp;
15904 HOST_WIDE_INT scale;
15905 addr_space_t seg;
15907 if (ix86_decompose_address (addr, &parts) <= 0)
15908 /* Decomposition failed. */
15909 return false;
15911 base = parts.base;
15912 index = parts.index;
15913 disp = parts.disp;
15914 scale = parts.scale;
15915 seg = parts.seg;
15917 /* Validate base register. */
15918 if (base)
15920 rtx reg = ix86_validate_address_register (base);
15922 if (reg == NULL_RTX)
15923 return false;
15925 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15926 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15927 /* Base is not valid. */
15928 return false;
15931 /* Validate index register. */
15932 if (index)
15934 rtx reg = ix86_validate_address_register (index);
15936 if (reg == NULL_RTX)
15937 return false;
15939 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15940 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15941 /* Index is not valid. */
15942 return false;
15945 /* Index and base should have the same mode. */
15946 if (base && index
15947 && GET_MODE (base) != GET_MODE (index))
15948 return false;
15950 /* Address override works only on the (%reg) part of %fs:(%reg). */
15951 if (seg != ADDR_SPACE_GENERIC
15952 && ((base && GET_MODE (base) != word_mode)
15953 || (index && GET_MODE (index) != word_mode)))
15954 return false;
15956 /* Validate scale factor. */
15957 if (scale != 1)
15959 if (!index)
15960 /* Scale without index. */
15961 return false;
15963 if (scale != 2 && scale != 4 && scale != 8)
15964 /* Scale is not a valid multiplier. */
15965 return false;
15968 /* Validate displacement. */
15969 if (disp)
15971 if (GET_CODE (disp) == CONST
15972 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15973 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15974 switch (XINT (XEXP (disp, 0), 1))
15976 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15977 when used. While the ABI also specifies 32bit relocations, we
15978 don't produce them at all and use IP relative instead.
15979 Allow GOT in 32bit mode for both PIC and non-PIC if symbol
15980 should be loaded via GOT. */
15981 case UNSPEC_GOT:
15982 if (!TARGET_64BIT
15983 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15984 goto is_legitimate_pic;
15985 /* FALLTHRU */
15986 case UNSPEC_GOTOFF:
15987 gcc_assert (flag_pic);
15988 if (!TARGET_64BIT)
15989 goto is_legitimate_pic;
15991 /* 64bit address unspec. */
15992 return false;
15994 case UNSPEC_GOTPCREL:
15995 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15996 goto is_legitimate_pic;
15997 /* FALLTHRU */
15998 case UNSPEC_PCREL:
15999 gcc_assert (flag_pic);
16000 goto is_legitimate_pic;
16002 case UNSPEC_GOTTPOFF:
16003 case UNSPEC_GOTNTPOFF:
16004 case UNSPEC_INDNTPOFF:
16005 case UNSPEC_NTPOFF:
16006 case UNSPEC_DTPOFF:
16007 break;
16009 default:
16010 /* Invalid address unspec. */
16011 return false;
16014 else if (SYMBOLIC_CONST (disp)
16015 && (flag_pic
16016 || (TARGET_MACHO
16017 #if TARGET_MACHO
16018 && MACHOPIC_INDIRECT
16019 && !machopic_operand_p (disp)
16020 #endif
16024 is_legitimate_pic:
16025 if (TARGET_64BIT && (index || base))
16027 /* foo@dtpoff(%rX) is ok. */
16028 if (GET_CODE (disp) != CONST
16029 || GET_CODE (XEXP (disp, 0)) != PLUS
16030 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
16031 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
16032 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
16033 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
16034 /* Non-constant pic memory reference. */
16035 return false;
16037 else if ((!TARGET_MACHO || flag_pic)
16038 && ! legitimate_pic_address_disp_p (disp))
16039 /* Displacement is an invalid pic construct. */
16040 return false;
16041 #if TARGET_MACHO
16042 else if (MACHO_DYNAMIC_NO_PIC_P
16043 && !ix86_legitimate_constant_p (Pmode, disp))
16044 /* Displacement must be referenced via a non_lazy_pointer. */
16045 return false;
16046 #endif
16048 /* This code used to verify that a symbolic pic displacement
16049 includes the pic_offset_table_rtx register.
16051 While this is a good idea, unfortunately these constructs may
16052 be created by the "adds using lea" optimization for incorrect
16053 code like:
16055 int a;
16056 int foo(int i)
16058 return *(&a+i);
16061 This code is nonsensical, but it results in addressing the
16062 GOT table with pic_offset_table_rtx as the base. We can't
16063 just refuse it easily, since it gets matched by the
16064 "addsi3" pattern, which later gets split to lea when the
16065 output register differs from the input. While this
16066 could be handled by a separate addsi pattern for this case
16067 that never results in lea, disabling this test seems to be
16068 the easier and correct fix for the crash. */
16070 else if (GET_CODE (disp) != LABEL_REF
16071 && !CONST_INT_P (disp)
16072 && (GET_CODE (disp) != CONST
16073 || !ix86_legitimate_constant_p (Pmode, disp))
16074 && (GET_CODE (disp) != SYMBOL_REF
16075 || !ix86_legitimate_constant_p (Pmode, disp)))
16076 /* Displacement is not constant. */
16077 return false;
16078 else if (TARGET_64BIT
16079 && !x86_64_immediate_operand (disp, VOIDmode))
16080 /* Displacement is out of range. */
16081 return false;
16082 /* In x32 mode, constant addresses are sign extended to 64bit, so
16083 we have to prevent addresses from 0x80000000 to 0xffffffff. */
16084 else if (TARGET_X32 && !(index || base)
16085 && CONST_INT_P (disp)
16086 && val_signbit_known_set_p (SImode, INTVAL (disp)))
16087 return false;
16090 /* Everything looks valid. */
16091 return true;
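/* Editorial sketch, not part of the original sources: a few address
   shapes as the validation above sees them (64-bit, generic address
   space).

	(plus (reg) (mult (reg) (const_int 4)))   valid, scale 4
	(mult (reg) (const_int 4))                valid once the
						  decomposition has added
						  a zero displacement
	(plus (reg) (mult (reg) (const_int 3)))   invalid, scale 3
	(plus (reg:DI) (reg:SI))                  invalid, mode mismatch

   Strictness only changes which registers are acceptable, not which
   shapes are.  */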
16094 /* Determine if a given RTX is a valid constant address. */
16096 bool
16097 constant_address_p (rtx x)
16099 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
16102 /* Return a unique alias set for the GOT. */
16104 static alias_set_type
16105 ix86_GOT_alias_set (void)
16107 static alias_set_type set = -1;
16108 if (set == -1)
16109 set = new_alias_set ();
16110 return set;
16113 /* Return a legitimate reference for ORIG (an address) using the
16114 register REG. If REG is 0, a new pseudo is generated.
16116 There are two types of references that must be handled:
16118 1. Global data references must load the address from the GOT, via
16119 the PIC reg. An insn is emitted to do this load, and the reg is
16120 returned.
16122 2. Static data references, constant pool addresses, and code labels
16123 compute the address as an offset from the GOT, whose base is in
16124 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
16125 differentiate them from global data objects. The returned
16126 address is the PIC reg + an unspec constant.
16128 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
16129 reg also appears in the address. */
16131 static rtx
16132 legitimize_pic_address (rtx orig, rtx reg)
16134 rtx addr = orig;
16135 rtx new_rtx = orig;
16137 #if TARGET_MACHO
16138 if (TARGET_MACHO && !TARGET_64BIT)
16140 if (reg == 0)
16141 reg = gen_reg_rtx (Pmode);
16142 /* Use the generic Mach-O PIC machinery. */
16143 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
16145 #endif
16147 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16149 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16150 if (tmp)
16151 return tmp;
16154 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
16155 new_rtx = addr;
16156 else if ((!TARGET_64BIT
16157 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
16158 && !TARGET_PECOFF
16159 && gotoff_operand (addr, Pmode))
16161 /* This symbol may be referenced via a displacement
16162 from the PIC base address (@GOTOFF). */
16163 if (GET_CODE (addr) == CONST)
16164 addr = XEXP (addr, 0);
16166 if (GET_CODE (addr) == PLUS)
16168 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
16169 UNSPEC_GOTOFF);
16170 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
16172 else
16173 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
16175 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16177 if (TARGET_64BIT)
16178 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16180 if (reg != 0)
16182 gcc_assert (REG_P (reg));
16183 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
16184 new_rtx, reg, 1, OPTAB_DIRECT);
16186 else
16187 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16189 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
16190 /* We can't use @GOTOFF for text labels
16191 on VxWorks, see gotoff_operand. */
16192 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
16194 rtx tmp = legitimize_pe_coff_symbol (addr, true);
16195 if (tmp)
16196 return tmp;
16198 /* For x64 PE-COFF there is no GOT table,
16199 so we use the address directly. */
16200 if (TARGET_64BIT && TARGET_PECOFF)
16202 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
16203 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16205 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
16207 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
16208 UNSPEC_GOTPCREL);
16209 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16210 new_rtx = gen_const_mem (Pmode, new_rtx);
16211 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16213 else
16215 /* This symbol must be referenced via a load
16216 from the Global Offset Table (@GOT). */
16217 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
16218 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16219 if (TARGET_64BIT)
16220 new_rtx = force_reg (Pmode, new_rtx);
16221 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16222 new_rtx = gen_const_mem (Pmode, new_rtx);
16223 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
16226 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
16228 else
16230 if (CONST_INT_P (addr)
16231 && !x86_64_immediate_operand (addr, VOIDmode))
16232 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
16233 else if (GET_CODE (addr) == CONST)
16235 addr = XEXP (addr, 0);
16237 /* We must match stuff we generate before. Assume the only
16238 unspecs that can get here are ours. Not that we could do
16239 anything with them anyway.... */
16240 if (GET_CODE (addr) == UNSPEC
16241 || (GET_CODE (addr) == PLUS
16242 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
16243 return orig;
16244 gcc_assert (GET_CODE (addr) == PLUS);
16247 if (GET_CODE (addr) == PLUS)
16249 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
16251 /* Check first to see if this is a constant
16252 offset from a @GOTOFF symbol reference. */
16253 if (!TARGET_PECOFF
16254 && gotoff_operand (op0, Pmode)
16255 && CONST_INT_P (op1))
16257 if (!TARGET_64BIT)
16259 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
16260 UNSPEC_GOTOFF);
16261 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
16262 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
16264 if (reg != 0)
16266 gcc_assert (REG_P (reg));
16267 new_rtx = expand_simple_binop (Pmode, PLUS,
16268 pic_offset_table_rtx,
16269 new_rtx, reg, 1,
16270 OPTAB_DIRECT);
16272 else
16273 new_rtx
16274 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
16276 else
16278 if (INTVAL (op1) < -16*1024*1024
16279 || INTVAL (op1) >= 16*1024*1024)
16281 if (!x86_64_immediate_operand (op1, Pmode))
16282 op1 = force_reg (Pmode, op1);
16284 new_rtx
16285 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
16289 else
16291 rtx base = legitimize_pic_address (op0, reg);
16292 machine_mode mode = GET_MODE (base);
16293 new_rtx
16294 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
16296 if (CONST_INT_P (new_rtx))
16298 if (INTVAL (new_rtx) < -16*1024*1024
16299 || INTVAL (new_rtx) >= 16*1024*1024)
16301 if (!x86_64_immediate_operand (new_rtx, mode))
16302 new_rtx = force_reg (mode, new_rtx);
16304 new_rtx
16305 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
16307 else
16308 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
16310 else
16312 /* For %rip addressing, we have to use
16313 just disp32, neither base nor index. */
16314 if (TARGET_64BIT
16315 && (GET_CODE (base) == SYMBOL_REF
16316 || GET_CODE (base) == LABEL_REF))
16317 base = force_reg (mode, base);
16318 if (GET_CODE (new_rtx) == PLUS
16319 && CONSTANT_P (XEXP (new_rtx, 1)))
16321 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
16322 new_rtx = XEXP (new_rtx, 1);
16324 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
16329 return new_rtx;
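/* Editorial sketch, not part of the original sources: what the PIC
   legitimization above does for typical 32-bit -fPIC references.  A
   locally bound object is reached as an offset from the PIC base
   (@GOTOFF), while a preemptible global is loaded through the GOT:

	static int local_var;
	extern int global_var;

	int read_local (void)  { return local_var; }	// @GOTOFF form
	int read_global (void) { return global_var; }	// @GOT load

   On x86-64 the small-model cases use %rip-relative or @GOTPCREL
   addresses instead, as handled earlier in this function.  */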
16332 /* Load the thread pointer. If TO_REG is true, force it into a register. */
16334 static rtx
16335 get_thread_pointer (machine_mode tp_mode, bool to_reg)
16337 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
16339 if (GET_MODE (tp) != tp_mode)
16341 gcc_assert (GET_MODE (tp) == SImode);
16342 gcc_assert (tp_mode == DImode);
16344 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
16347 if (to_reg)
16348 tp = copy_to_mode_reg (tp_mode, tp);
16350 return tp;
16353 /* Construct the SYMBOL_REF for the tls_get_addr function. */
16355 static GTY(()) rtx ix86_tls_symbol;
16357 static rtx
16358 ix86_tls_get_addr (void)
16360 if (!ix86_tls_symbol)
16362 const char *sym
16363 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
16364 ? "___tls_get_addr" : "__tls_get_addr");
16366 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
16369 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
16371 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
16372 UNSPEC_PLTOFF);
16373 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
16374 gen_rtx_CONST (Pmode, unspec));
16377 return ix86_tls_symbol;
16380 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
16382 static GTY(()) rtx ix86_tls_module_base_symbol;
16384 static rtx
16385 ix86_tls_module_base (void)
16387 if (!ix86_tls_module_base_symbol)
16389 ix86_tls_module_base_symbol
16390 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
16392 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
16393 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
16396 return ix86_tls_module_base_symbol;
16399 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
16400 false if we expect this to be used for a memory address and true if
16401 we expect to load the address into a register. */
16403 static rtx
16404 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
16406 rtx dest, base, off;
16407 rtx pic = NULL_RTX, tp = NULL_RTX;
16408 machine_mode tp_mode = Pmode;
16409 int type;
16411 /* Fall back to global dynamic model if tool chain cannot support local
16412 dynamic. */
16413 if (TARGET_SUN_TLS && !TARGET_64BIT
16414 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
16415 && model == TLS_MODEL_LOCAL_DYNAMIC)
16416 model = TLS_MODEL_GLOBAL_DYNAMIC;
16418 switch (model)
16420 case TLS_MODEL_GLOBAL_DYNAMIC:
16421 dest = gen_reg_rtx (Pmode);
16423 if (!TARGET_64BIT)
16425 if (flag_pic && !TARGET_PECOFF)
16426 pic = pic_offset_table_rtx;
16427 else
16429 pic = gen_reg_rtx (Pmode);
16430 emit_insn (gen_set_got (pic));
16434 if (TARGET_GNU2_TLS)
16436 if (TARGET_64BIT)
16437 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
16438 else
16439 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
16441 tp = get_thread_pointer (Pmode, true);
16442 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
16444 if (GET_MODE (x) != Pmode)
16445 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16447 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16449 else
16451 rtx caddr = ix86_tls_get_addr ();
16453 if (TARGET_64BIT)
16455 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16456 rtx_insn *insns;
16458 start_sequence ();
16459 emit_call_insn
16460 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
16461 insns = get_insns ();
16462 end_sequence ();
16464 if (GET_MODE (x) != Pmode)
16465 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16467 RTL_CONST_CALL_P (insns) = 1;
16468 emit_libcall_block (insns, dest, rax, x);
16470 else
16471 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
16473 break;
16475 case TLS_MODEL_LOCAL_DYNAMIC:
16476 base = gen_reg_rtx (Pmode);
16478 if (!TARGET_64BIT)
16480 if (flag_pic)
16481 pic = pic_offset_table_rtx;
16482 else
16484 pic = gen_reg_rtx (Pmode);
16485 emit_insn (gen_set_got (pic));
16489 if (TARGET_GNU2_TLS)
16491 rtx tmp = ix86_tls_module_base ();
16493 if (TARGET_64BIT)
16494 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
16495 else
16496 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
16498 tp = get_thread_pointer (Pmode, true);
16499 set_unique_reg_note (get_last_insn (), REG_EQUAL,
16500 gen_rtx_MINUS (Pmode, tmp, tp));
16502 else
16504 rtx caddr = ix86_tls_get_addr ();
16506 if (TARGET_64BIT)
16508 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16509 rtx_insn *insns;
16510 rtx eqv;
16512 start_sequence ();
16513 emit_call_insn
16514 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16515 insns = get_insns ();
16516 end_sequence ();
16518 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16519 share the LD_BASE result with other LD model accesses. */
16520 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16521 UNSPEC_TLS_LD_BASE);
16523 RTL_CONST_CALL_P (insns) = 1;
16524 emit_libcall_block (insns, base, rax, eqv);
16526 else
16527 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16530 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16531 off = gen_rtx_CONST (Pmode, off);
16533 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16535 if (TARGET_GNU2_TLS)
16537 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16539 if (GET_MODE (x) != Pmode)
16540 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16542 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16544 break;
16546 case TLS_MODEL_INITIAL_EXEC:
16547 if (TARGET_64BIT)
16549 if (TARGET_SUN_TLS && !TARGET_X32)
16551 /* The Sun linker took the AMD64 TLS spec literally
16552 and can only handle %rax as destination of the
16553 initial executable code sequence. */
16555 dest = gen_reg_rtx (DImode);
16556 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16557 return dest;
16560 /* Generate DImode references to avoid %fs:(%reg32)
16561 problems and the linker IE->LE relaxation bug. */
16562 tp_mode = DImode;
16563 pic = NULL;
16564 type = UNSPEC_GOTNTPOFF;
16566 else if (flag_pic)
16568 pic = pic_offset_table_rtx;
16569 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16571 else if (!TARGET_ANY_GNU_TLS)
16573 pic = gen_reg_rtx (Pmode);
16574 emit_insn (gen_set_got (pic));
16575 type = UNSPEC_GOTTPOFF;
16577 else
16579 pic = NULL;
16580 type = UNSPEC_INDNTPOFF;
16583 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16584 off = gen_rtx_CONST (tp_mode, off);
16585 if (pic)
16586 off = gen_rtx_PLUS (tp_mode, pic, off);
16587 off = gen_const_mem (tp_mode, off);
16588 set_mem_alias_set (off, ix86_GOT_alias_set ());
16590 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16592 base = get_thread_pointer (tp_mode,
16593 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16594 off = force_reg (tp_mode, off);
16595 dest = gen_rtx_PLUS (tp_mode, base, off);
16596 if (tp_mode != Pmode)
16597 dest = convert_to_mode (Pmode, dest, 1);
16599 else
16601 base = get_thread_pointer (Pmode, true);
16602 dest = gen_reg_rtx (Pmode);
16603 emit_insn (ix86_gen_sub3 (dest, base, off));
16605 break;
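      /* Local exec: the offset from the thread pointer is a link-time
         constant (@tpoff / @ntpoff), so the address is simply the thread
         pointer plus that constant.  */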
16607 case TLS_MODEL_LOCAL_EXEC:
16608 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16609 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16610 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16611 off = gen_rtx_CONST (Pmode, off);
16613 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16615 base = get_thread_pointer (Pmode,
16616 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16617 return gen_rtx_PLUS (Pmode, base, off);
16619 else
16621 base = get_thread_pointer (Pmode, true);
16622 dest = gen_reg_rtx (Pmode);
16623 emit_insn (ix86_gen_sub3 (dest, base, off));
16625 break;
16627 default:
16628 gcc_unreachable ();
16631 return dest;
16634 /* Return true if OP refers to a TLS address. */
16635 bool
16636 ix86_tls_address_pattern_p (rtx op)
16638 subrtx_var_iterator::array_type array;
16639 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16641 rtx op = *iter;
16642 if (MEM_P (op))
16644 rtx *x = &XEXP (op, 0);
16645 while (GET_CODE (*x) == PLUS)
16647 int i;
16648 for (i = 0; i < 2; i++)
16650 rtx u = XEXP (*x, i);
16651 if (GET_CODE (u) == ZERO_EXTEND)
16652 u = XEXP (u, 0);
16653 if (GET_CODE (u) == UNSPEC
16654 && XINT (u, 1) == UNSPEC_TP)
16655 return true;
16657 x = &XEXP (*x, 0);
16660 iter.skip_subrtxes ();
16664 return false;
16667 /* Rewrite *LOC so that it refers to a default TLS address space. */
16668 void
16669 ix86_rewrite_tls_address_1 (rtx *loc)
16671 subrtx_ptr_iterator::array_type array;
16672 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16674 rtx *loc = *iter;
16675 if (MEM_P (*loc))
16677 rtx addr = XEXP (*loc, 0);
16678 rtx *x = &addr;
16679 while (GET_CODE (*x) == PLUS)
16681 int i;
16682 for (i = 0; i < 2; i++)
16684 rtx u = XEXP (*x, i);
16685 if (GET_CODE (u) == ZERO_EXTEND)
16686 u = XEXP (u, 0);
16687 if (GET_CODE (u) == UNSPEC
16688 && XINT (u, 1) == UNSPEC_TP)
16690 addr_space_t as = DEFAULT_TLS_SEG_REG;
16692 *x = XEXP (*x, 1 - i);
16694 *loc = replace_equiv_address_nv (*loc, addr, true);
16695 set_mem_addr_space (*loc, as);
16696 return;
16699 x = &XEXP (*x, 0);
16702 iter.skip_subrtxes ();
16707 /* Rewrite instruction pattern involving a TLS address
16708 so that it refers to a default TLS address space. */
16710 ix86_rewrite_tls_address (rtx pattern)
16712 pattern = copy_insn (pattern);
16713 ix86_rewrite_tls_address_1 (&pattern);
16714 return pattern;
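/* In effect, a memory reference whose address contains the thread pointer
   UNSPEC, roughly (plus (unspec UNSPEC_TP) disp), is rewritten into a
   reference to plain DISP in the %fs/%gs TLS address space, so it can be
   emitted as a seg:disp operand.  */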
16717 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16718 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16719 unique refptr-DECL symbol corresponding to symbol DECL. */
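/* For example, a reference to a dllimported variable FOO is redirected
   through the import-table slot __imp_FOO (or __imp__FOO when user labels
   carry an underscore prefix); the refptr-FOO variant plays the same role
   for external symbols under the medium and large PIC code models.  */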
16721 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16723 static inline hashval_t hash (tree_map *m) { return m->hash; }
16724 static inline bool
16725 equal (tree_map *a, tree_map *b)
16727 return a->base.from == b->base.from;
16730 static int
16731 keep_cache_entry (tree_map *&m)
16733 return ggc_marked_p (m->base.from);
16737 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16739 static tree
16740 get_dllimport_decl (tree decl, bool beimport)
16742 struct tree_map *h, in;
16743 const char *name;
16744 const char *prefix;
16745 size_t namelen, prefixlen;
16746 char *imp_name;
16747 tree to;
16748 rtx rtl;
16750 if (!dllimport_map)
16751 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16753 in.hash = htab_hash_pointer (decl);
16754 in.base.from = decl;
16755 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16756 h = *loc;
16757 if (h)
16758 return h->to;
16760 *loc = h = ggc_alloc<tree_map> ();
16761 h->hash = in.hash;
16762 h->base.from = decl;
16763 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16764 VAR_DECL, NULL, ptr_type_node);
16765 DECL_ARTIFICIAL (to) = 1;
16766 DECL_IGNORED_P (to) = 1;
16767 DECL_EXTERNAL (to) = 1;
16768 TREE_READONLY (to) = 1;
16770 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16771 name = targetm.strip_name_encoding (name);
16772 if (beimport)
16773 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16774 ? "*__imp_" : "*__imp__";
16775 else
16776 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16777 namelen = strlen (name);
16778 prefixlen = strlen (prefix);
16779 imp_name = (char *) alloca (namelen + prefixlen + 1);
16780 memcpy (imp_name, prefix, prefixlen);
16781 memcpy (imp_name + prefixlen, name, namelen + 1);
16783 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16784 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16785 SET_SYMBOL_REF_DECL (rtl, to);
16786 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16787 if (!beimport)
16789 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16790 #ifdef SUB_TARGET_RECORD_STUB
16791 SUB_TARGET_RECORD_STUB (name);
16792 #endif
16795 rtl = gen_const_mem (Pmode, rtl);
16796 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16798 SET_DECL_RTL (to, rtl);
16799 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16801 return to;
16804 /* Expand SYMBOL into its corresponding far-address symbol.
16805 WANT_REG is true if we require the result be a register. */
16807 static rtx
16808 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16810 tree imp_decl;
16811 rtx x;
16813 gcc_assert (SYMBOL_REF_DECL (symbol));
16814 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16816 x = DECL_RTL (imp_decl);
16817 if (want_reg)
16818 x = force_reg (Pmode, x);
16819 return x;
16822 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16823 true if we require the result be a register. */
16825 static rtx
16826 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16828 tree imp_decl;
16829 rtx x;
16831 gcc_assert (SYMBOL_REF_DECL (symbol));
16832 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16834 x = DECL_RTL (imp_decl);
16835 if (want_reg)
16836 x = force_reg (Pmode, x);
16837 return x;
16840 /* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
16841 is true if we require the result be a register. */
16843 static rtx
16844 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16846 if (!TARGET_PECOFF)
16847 return NULL_RTX;
16849 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16851 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16852 return legitimize_dllimport_symbol (addr, inreg);
16853 if (GET_CODE (addr) == CONST
16854 && GET_CODE (XEXP (addr, 0)) == PLUS
16855 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16856 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16858 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16859 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16863 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16864 return NULL_RTX;
16865 if (GET_CODE (addr) == SYMBOL_REF
16866 && !is_imported_p (addr)
16867 && SYMBOL_REF_EXTERNAL_P (addr)
16868 && SYMBOL_REF_DECL (addr))
16869 return legitimize_pe_coff_extern_decl (addr, inreg);
16871 if (GET_CODE (addr) == CONST
16872 && GET_CODE (XEXP (addr, 0)) == PLUS
16873 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16874 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16875 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16876 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16878 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16879 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16881 return NULL_RTX;
16884 /* Try machine-dependent ways of modifying an illegitimate address
16885 to be legitimate. If we find one, return the new, valid address.
16886 This macro is used in only one place: `memory_address' in explow.c.
16888 OLDX is the address as it was before break_out_memory_refs was called.
16889 In some cases it is useful to look at this to decide what needs to be done.
16891 It is always safe for this macro to do nothing. It exists to recognize
16892 opportunities to optimize the output.
16894 For the 80386, we handle X+REG by loading X into a register R and
16895 using R+REG. R will go in a general reg and indexing will be used.
16896 However, if REG is a broken-out memory address or multiplication,
16897 nothing needs to be done because REG can certainly go in a general reg.
16899 When -fpic is used, special handling is needed for symbolic references.
16900 See comments by legitimize_pic_address in i386.c for details. */
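/* For instance, an address such as (plus (ashift (reg) (const_int 2)) (reg))
   is first rewritten below as (plus (mult (reg) (const_int 4)) (reg)) so that
   it matches the base + index*scale addressing form.  */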
16902 static rtx
16903 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16905 bool changed = false;
16906 unsigned log;
16908 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16909 if (log)
16910 return legitimize_tls_address (x, (enum tls_model) log, false);
16911 if (GET_CODE (x) == CONST
16912 && GET_CODE (XEXP (x, 0)) == PLUS
16913 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16914 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16916 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16917 (enum tls_model) log, false);
16918 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16921 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16923 rtx tmp = legitimize_pe_coff_symbol (x, true);
16924 if (tmp)
16925 return tmp;
16928 if (flag_pic && SYMBOLIC_CONST (x))
16929 return legitimize_pic_address (x, 0);
16931 #if TARGET_MACHO
16932 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16933 return machopic_indirect_data_reference (x, 0);
16934 #endif
16936 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16937 if (GET_CODE (x) == ASHIFT
16938 && CONST_INT_P (XEXP (x, 1))
16939 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16941 changed = true;
16942 log = INTVAL (XEXP (x, 1));
16943 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16944 GEN_INT (1 << log));
16947 if (GET_CODE (x) == PLUS)
16949 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16951 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16952 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16953 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16955 changed = true;
16956 log = INTVAL (XEXP (XEXP (x, 0), 1));
16957 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16958 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16959 GEN_INT (1 << log));
16962 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16963 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16964 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16966 changed = true;
16967 log = INTVAL (XEXP (XEXP (x, 1), 1));
16968 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16969 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16970 GEN_INT (1 << log));
16973 /* Put multiply first if it isn't already. */
16974 if (GET_CODE (XEXP (x, 1)) == MULT)
16976 std::swap (XEXP (x, 0), XEXP (x, 1));
16977 changed = true;
16980 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16981 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16982 created by virtual register instantiation, register elimination, and
16983 similar optimizations. */
16984 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16986 changed = true;
16987 x = gen_rtx_PLUS (Pmode,
16988 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16989 XEXP (XEXP (x, 1), 0)),
16990 XEXP (XEXP (x, 1), 1));
16993 /* Canonicalize
16994 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16995 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16996 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16997 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16998 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16999 && CONSTANT_P (XEXP (x, 1)))
17001 rtx constant;
17002 rtx other = NULL_RTX;
17004 if (CONST_INT_P (XEXP (x, 1)))
17006 constant = XEXP (x, 1);
17007 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
17009 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
17011 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
17012 other = XEXP (x, 1);
17014 else
17015 constant = 0;
17017 if (constant)
17019 changed = true;
17020 x = gen_rtx_PLUS (Pmode,
17021 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
17022 XEXP (XEXP (XEXP (x, 0), 1), 0)),
17023 plus_constant (Pmode, other,
17024 INTVAL (constant)));
17028 if (changed && ix86_legitimate_address_p (mode, x, false))
17029 return x;
17031 if (GET_CODE (XEXP (x, 0)) == MULT)
17033 changed = true;
17034 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
17037 if (GET_CODE (XEXP (x, 1)) == MULT)
17039 changed = true;
17040 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
17043 if (changed
17044 && REG_P (XEXP (x, 1))
17045 && REG_P (XEXP (x, 0)))
17046 return x;
17048 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
17050 changed = true;
17051 x = legitimize_pic_address (x, 0);
17054 if (changed && ix86_legitimate_address_p (mode, x, false))
17055 return x;
17057 if (REG_P (XEXP (x, 0)))
17059 rtx temp = gen_reg_rtx (Pmode);
17060 rtx val = force_operand (XEXP (x, 1), temp);
17061 if (val != temp)
17063 val = convert_to_mode (Pmode, val, 1);
17064 emit_move_insn (temp, val);
17067 XEXP (x, 1) = temp;
17068 return x;
17071 else if (REG_P (XEXP (x, 1)))
17073 rtx temp = gen_reg_rtx (Pmode);
17074 rtx val = force_operand (XEXP (x, 0), temp);
17075 if (val != temp)
17077 val = convert_to_mode (Pmode, val, 1);
17078 emit_move_insn (temp, val);
17081 XEXP (x, 0) = temp;
17082 return x;
17086 return x;
17089 /* Print an integer constant expression in assembler syntax. Addition
17090 and subtraction are the only arithmetic operations that may appear in these
17091 expressions. FILE is the stdio stream to write to, X is the rtx, and
17092 CODE is the operand print code from the output string. */
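/* For example, (const (unspec [foo] UNSPEC_GOTOFF)) is printed as
   "foo@GOTOFF", and (const (unspec [bar] UNSPEC_GOTPCREL)) is printed as
   "bar@GOTPCREL(%rip)" in AT&T syntax.  */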
17094 static void
17095 output_pic_addr_const (FILE *file, rtx x, int code)
17097 char buf[256];
17099 switch (GET_CODE (x))
17101 case PC:
17102 gcc_assert (flag_pic);
17103 putc ('.', file);
17104 break;
17106 case SYMBOL_REF:
17107 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
17108 output_addr_const (file, x);
17109 else
17111 const char *name = XSTR (x, 0);
17113 /* Mark the decl as referenced so that cgraph will
17114 output the function. */
17115 if (SYMBOL_REF_DECL (x))
17116 mark_decl_referenced (SYMBOL_REF_DECL (x));
17118 #if TARGET_MACHO
17119 if (MACHOPIC_INDIRECT
17120 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
17121 name = machopic_indirection_name (x, /*stub_p=*/true);
17122 #endif
17123 assemble_name (file, name);
17125 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
17126 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
17127 fputs ("@PLT", file);
17128 break;
17130 case LABEL_REF:
17131 x = XEXP (x, 0);
17132 /* FALLTHRU */
17133 case CODE_LABEL:
17134 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
17135 assemble_name (asm_out_file, buf);
17136 break;
17138 case CONST_INT:
17139 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
17140 break;
17142 case CONST:
17143 /* This used to output parentheses around the expression,
17144 but that does not work on the 386 (either ATT or BSD assembler). */
17145 output_pic_addr_const (file, XEXP (x, 0), code);
17146 break;
17148 case CONST_DOUBLE:
17149 /* We can't handle floating point constants;
17150 TARGET_PRINT_OPERAND must handle them. */
17151 output_operand_lossage ("floating constant misused");
17152 break;
17154 case PLUS:
17155 /* Some assemblers need integer constants to appear first. */
17156 if (CONST_INT_P (XEXP (x, 0)))
17158 output_pic_addr_const (file, XEXP (x, 0), code);
17159 putc ('+', file);
17160 output_pic_addr_const (file, XEXP (x, 1), code);
17162 else
17164 gcc_assert (CONST_INT_P (XEXP (x, 1)));
17165 output_pic_addr_const (file, XEXP (x, 1), code);
17166 putc ('+', file);
17167 output_pic_addr_const (file, XEXP (x, 0), code);
17169 break;
17171 case MINUS:
17172 if (!TARGET_MACHO)
17173 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
17174 output_pic_addr_const (file, XEXP (x, 0), code);
17175 putc ('-', file);
17176 output_pic_addr_const (file, XEXP (x, 1), code);
17177 if (!TARGET_MACHO)
17178 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
17179 break;
17181 case UNSPEC:
17182 gcc_assert (XVECLEN (x, 0) == 1);
17183 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
17184 switch (XINT (x, 1))
17186 case UNSPEC_GOT:
17187 fputs ("@GOT", file);
17188 break;
17189 case UNSPEC_GOTOFF:
17190 fputs ("@GOTOFF", file);
17191 break;
17192 case UNSPEC_PLTOFF:
17193 fputs ("@PLTOFF", file);
17194 break;
17195 case UNSPEC_PCREL:
17196 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17197 "(%rip)" : "[rip]", file);
17198 break;
17199 case UNSPEC_GOTPCREL:
17200 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17201 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
17202 break;
17203 case UNSPEC_GOTTPOFF:
17204 /* FIXME: This might be @TPOFF in Sun ld too. */
17205 fputs ("@gottpoff", file);
17206 break;
17207 case UNSPEC_TPOFF:
17208 fputs ("@tpoff", file);
17209 break;
17210 case UNSPEC_NTPOFF:
17211 if (TARGET_64BIT)
17212 fputs ("@tpoff", file);
17213 else
17214 fputs ("@ntpoff", file);
17215 break;
17216 case UNSPEC_DTPOFF:
17217 fputs ("@dtpoff", file);
17218 break;
17219 case UNSPEC_GOTNTPOFF:
17220 if (TARGET_64BIT)
17221 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
17222 "@gottpoff(%rip)": "@gottpoff[rip]", file);
17223 else
17224 fputs ("@gotntpoff", file);
17225 break;
17226 case UNSPEC_INDNTPOFF:
17227 fputs ("@indntpoff", file);
17228 break;
17229 #if TARGET_MACHO
17230 case UNSPEC_MACHOPIC_OFFSET:
17231 putc ('-', file);
17232 machopic_output_function_base_name (file);
17233 break;
17234 #endif
17235 default:
17236 output_operand_lossage ("invalid UNSPEC as operand");
17237 break;
17239 break;
17241 default:
17242 output_operand_lossage ("invalid expression as operand");
17246 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
17247 We need to emit DTP-relative relocations. */
17249 static void ATTRIBUTE_UNUSED
17250 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
17252 fputs (ASM_LONG, file);
17253 output_addr_const (file, x);
17254 fputs ("@dtpoff", file);
17255 switch (size)
17257 case 4:
17258 break;
17259 case 8:
17260 fputs (", 0", file);
17261 break;
17262 default:
17263 gcc_unreachable ();
17267 /* Return true if X is a representation of the PIC register. This copes
17268 with calls from ix86_find_base_term, where the register might have
17269 been replaced by a cselib value. */
17271 static bool
17272 ix86_pic_register_p (rtx x)
17274 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
17275 return (pic_offset_table_rtx
17276 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
17277 else if (!REG_P (x))
17278 return false;
17279 else if (pic_offset_table_rtx)
17281 if (REGNO (x) == REGNO (pic_offset_table_rtx))
17282 return true;
17283 if (HARD_REGISTER_P (x)
17284 && !HARD_REGISTER_P (pic_offset_table_rtx)
17285 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
17286 return true;
17287 return false;
17289 else
17290 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
17293 /* Helper function for ix86_delegitimize_address.
17294 Attempt to delegitimize TLS local-exec accesses. */
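/* E.g. a local-exec access whose address decomposes to
   seg:base + index*scale + disp, with DISP wrapping (unspec [x] UNSPEC_NTPOFF),
   is rebuilt as the bare SYMBOL_REF x plus any base/index parts.  */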
17296 static rtx
17297 ix86_delegitimize_tls_address (rtx orig_x)
17299 rtx x = orig_x, unspec;
17300 struct ix86_address addr;
17302 if (!TARGET_TLS_DIRECT_SEG_REFS)
17303 return orig_x;
17304 if (MEM_P (x))
17305 x = XEXP (x, 0);
17306 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
17307 return orig_x;
17308 if (ix86_decompose_address (x, &addr) == 0
17309 || addr.seg != DEFAULT_TLS_SEG_REG
17310 || addr.disp == NULL_RTX
17311 || GET_CODE (addr.disp) != CONST)
17312 return orig_x;
17313 unspec = XEXP (addr.disp, 0);
17314 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
17315 unspec = XEXP (unspec, 0);
17316 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
17317 return orig_x;
17318 x = XVECEXP (unspec, 0, 0);
17319 gcc_assert (GET_CODE (x) == SYMBOL_REF);
17320 if (unspec != XEXP (addr.disp, 0))
17321 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
17322 if (addr.index)
17324 rtx idx = addr.index;
17325 if (addr.scale != 1)
17326 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
17327 x = gen_rtx_PLUS (Pmode, idx, x);
17329 if (addr.base)
17330 x = gen_rtx_PLUS (Pmode, addr.base, x);
17331 if (MEM_P (orig_x))
17332 x = replace_equiv_address_nv (orig_x, x);
17333 return x;
17336 /* In the name of slightly smaller debug output, and to cater to
17337 general assembler lossage, recognize PIC+GOTOFF and turn it back
17338 into a direct symbol reference.
17340 On Darwin, this is necessary to avoid a crash, because Darwin
17341 has a different PIC label for each routine but the DWARF debugging
17342 information is not associated with any particular routine, so it's
17343 necessary to remove references to the PIC label from RTL stored by
17344 the DWARF output code.
17346 This helper is used in the normal ix86_delegitimize_address
17347 entrypoint (e.g. used in the target delegitimization hook) and
17348 in ix86_find_base_term. As compile time memory optimization, we
17349 avoid allocating rtxes that will not change anything on the outcome
17350 of the callers (find_base_value and find_base_term). */
17352 static inline rtx
17353 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
17355 rtx orig_x = delegitimize_mem_from_attrs (x);
17356 /* addend is NULL or some rtx if x is something+GOTOFF where
17357 something doesn't include the PIC register. */
17358 rtx addend = NULL_RTX;
17359 /* reg_addend is NULL or a multiple of some register. */
17360 rtx reg_addend = NULL_RTX;
17361 /* const_addend is NULL or a const_int. */
17362 rtx const_addend = NULL_RTX;
17363 /* This is the result, or NULL. */
17364 rtx result = NULL_RTX;
17366 x = orig_x;
17368 if (MEM_P (x))
17369 x = XEXP (x, 0);
17371 if (TARGET_64BIT)
17373 if (GET_CODE (x) == CONST
17374 && GET_CODE (XEXP (x, 0)) == PLUS
17375 && GET_MODE (XEXP (x, 0)) == Pmode
17376 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
17377 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
17378 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
17380 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
17381 base. A CONST can't be arg_pointer_rtx based. */
17382 if (base_term_p && MEM_P (orig_x))
17383 return orig_x;
17384 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
17385 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
17386 if (MEM_P (orig_x))
17387 x = replace_equiv_address_nv (orig_x, x);
17388 return x;
17391 if (GET_CODE (x) == CONST
17392 && GET_CODE (XEXP (x, 0)) == UNSPEC
17393 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
17394 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
17395 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
17397 x = XVECEXP (XEXP (x, 0), 0, 0);
17398 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
17400 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
17401 if (x == NULL_RTX)
17402 return orig_x;
17404 return x;
17407 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
17408 return ix86_delegitimize_tls_address (orig_x);
17410 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
17411 and -mcmodel=medium -fpic. */
17414 if (GET_CODE (x) != PLUS
17415 || GET_CODE (XEXP (x, 1)) != CONST)
17416 return ix86_delegitimize_tls_address (orig_x);
17418 if (ix86_pic_register_p (XEXP (x, 0)))
17419 /* %ebx + GOT/GOTOFF */
17421 else if (GET_CODE (XEXP (x, 0)) == PLUS)
17423 /* %ebx + %reg * scale + GOT/GOTOFF */
17424 reg_addend = XEXP (x, 0);
17425 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
17426 reg_addend = XEXP (reg_addend, 1);
17427 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
17428 reg_addend = XEXP (reg_addend, 0);
17429 else
17431 reg_addend = NULL_RTX;
17432 addend = XEXP (x, 0);
17435 else
17436 addend = XEXP (x, 0);
17438 x = XEXP (XEXP (x, 1), 0);
17439 if (GET_CODE (x) == PLUS
17440 && CONST_INT_P (XEXP (x, 1)))
17442 const_addend = XEXP (x, 1);
17443 x = XEXP (x, 0);
17446 if (GET_CODE (x) == UNSPEC
17447 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
17448 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
17449 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
17450 && !MEM_P (orig_x) && !addend)))
17451 result = XVECEXP (x, 0, 0);
17453 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
17454 && !MEM_P (orig_x))
17455 result = XVECEXP (x, 0, 0);
17457 if (! result)
17458 return ix86_delegitimize_tls_address (orig_x);
17460 /* For (PLUS something CONST_INT) both find_base_{value,term} just
17461 recurse on the first operand. */
17462 if (const_addend && !base_term_p)
17463 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
17464 if (reg_addend)
17465 result = gen_rtx_PLUS (Pmode, reg_addend, result);
17466 if (addend)
17468 /* If the rest of original X doesn't involve the PIC register, add
17469 addend and subtract pic_offset_table_rtx. This can happen e.g.
17470 for code like:
17471 leal (%ebx, %ecx, 4), %ecx
17473 movl foo@GOTOFF(%ecx), %edx
17474 in which case we return (%ecx - %ebx) + foo
17475 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
17476 and reload has completed. Don't do the latter for debug,
17477 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
17478 if (pic_offset_table_rtx
17479 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
17480 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
17481 pic_offset_table_rtx),
17482 result);
17483 else if (base_term_p
17484 && pic_offset_table_rtx
17485 && !TARGET_MACHO
17486 && !TARGET_VXWORKS_RTP)
17488 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
17489 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
17490 result = gen_rtx_PLUS (Pmode, tmp, result);
17492 else
17493 return orig_x;
17495 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
17497 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
17498 if (result == NULL_RTX)
17499 return orig_x;
17501 return result;
17504 /* The normal entry point, wrapping the helper above. */
17506 static rtx
17507 ix86_delegitimize_address (rtx x)
17509 return ix86_delegitimize_address_1 (x, false);
17512 /* If X is a machine specific address (i.e. a symbol or label being
17513 referenced as a displacement from the GOT implemented using an
17514 UNSPEC), then return the base term. Otherwise return X. */
17517 ix86_find_base_term (rtx x)
17519 rtx term;
17521 if (TARGET_64BIT)
17523 if (GET_CODE (x) != CONST)
17524 return x;
17525 term = XEXP (x, 0);
17526 if (GET_CODE (term) == PLUS
17527 && CONST_INT_P (XEXP (term, 1)))
17528 term = XEXP (term, 0);
17529 if (GET_CODE (term) != UNSPEC
17530 || (XINT (term, 1) != UNSPEC_GOTPCREL
17531 && XINT (term, 1) != UNSPEC_PCREL))
17532 return x;
17534 return XVECEXP (term, 0, 0);
17537 return ix86_delegitimize_address_1 (x, true);
17540 /* Return true if X shouldn't be emitted into the debug info.
17541 Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_
17542 symbol easily into the .debug_info section, so we don't delegitimize
17543 it, but instead assemble it as @gotoff.
17544 Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17545 assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */
17547 static bool
17548 ix86_const_not_ok_for_debug_p (rtx x)
17550 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17551 return true;
17553 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17554 return true;
17556 return false;
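/* Print to FILE the one- or two-letter condition suffix ("e", "ne", "b", ...)
   for comparison CODE in flags mode MODE.  REVERSE prints the reversed
   condition; FP selects the alternate spellings needed for fcmov and
   floating-point comparisons.  */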
17559 static void
17560 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17561 bool fp, FILE *file)
17563 const char *suffix;
17565 if (mode == CCFPmode)
17567 code = ix86_fp_compare_code_to_integer (code);
17568 mode = CCmode;
17570 if (reverse)
17571 code = reverse_condition (code);
17573 switch (code)
17575 case EQ:
17576 gcc_assert (mode != CCGZmode);
17577 switch (mode)
17579 case E_CCAmode:
17580 suffix = "a";
17581 break;
17582 case E_CCCmode:
17583 suffix = "c";
17584 break;
17585 case E_CCOmode:
17586 suffix = "o";
17587 break;
17588 case E_CCPmode:
17589 suffix = "p";
17590 break;
17591 case E_CCSmode:
17592 suffix = "s";
17593 break;
17594 default:
17595 suffix = "e";
17596 break;
17598 break;
17599 case NE:
17600 gcc_assert (mode != CCGZmode);
17601 switch (mode)
17603 case E_CCAmode:
17604 suffix = "na";
17605 break;
17606 case E_CCCmode:
17607 suffix = "nc";
17608 break;
17609 case E_CCOmode:
17610 suffix = "no";
17611 break;
17612 case E_CCPmode:
17613 suffix = "np";
17614 break;
17615 case E_CCSmode:
17616 suffix = "ns";
17617 break;
17618 default:
17619 suffix = "ne";
17620 break;
17622 break;
17623 case GT:
17624 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17625 suffix = "g";
17626 break;
17627 case GTU:
17628 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17629 Those same assemblers have the same but opposite lossage on cmov. */
17630 if (mode == CCmode)
17631 suffix = fp ? "nbe" : "a";
17632 else
17633 gcc_unreachable ();
17634 break;
17635 case LT:
17636 switch (mode)
17638 case E_CCNOmode:
17639 case E_CCGOCmode:
17640 suffix = "s";
17641 break;
17643 case E_CCmode:
17644 case E_CCGCmode:
17645 case E_CCGZmode:
17646 suffix = "l";
17647 break;
17649 default:
17650 gcc_unreachable ();
17652 break;
17653 case LTU:
17654 if (mode == CCmode || mode == CCGZmode)
17655 suffix = "b";
17656 else if (mode == CCCmode)
17657 suffix = fp ? "b" : "c";
17658 else
17659 gcc_unreachable ();
17660 break;
17661 case GE:
17662 switch (mode)
17664 case E_CCNOmode:
17665 case E_CCGOCmode:
17666 suffix = "ns";
17667 break;
17669 case E_CCmode:
17670 case E_CCGCmode:
17671 case E_CCGZmode:
17672 suffix = "ge";
17673 break;
17675 default:
17676 gcc_unreachable ();
17678 break;
17679 case GEU:
17680 if (mode == CCmode || mode == CCGZmode)
17681 suffix = "nb";
17682 else if (mode == CCCmode)
17683 suffix = fp ? "nb" : "nc";
17684 else
17685 gcc_unreachable ();
17686 break;
17687 case LE:
17688 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17689 suffix = "le";
17690 break;
17691 case LEU:
17692 if (mode == CCmode)
17693 suffix = "be";
17694 else
17695 gcc_unreachable ();
17696 break;
17697 case UNORDERED:
17698 suffix = fp ? "u" : "p";
17699 break;
17700 case ORDERED:
17701 suffix = fp ? "nu" : "np";
17702 break;
17703 default:
17704 gcc_unreachable ();
17706 fputs (suffix, file);
17709 /* Print the name of register X to FILE based on its machine mode and number.
17710 If CODE is 'w', pretend the mode is HImode.
17711 If CODE is 'b', pretend the mode is QImode.
17712 If CODE is 'k', pretend the mode is SImode.
17713 If CODE is 'q', pretend the mode is DImode.
17714 If CODE is 'x', pretend the mode is V4SFmode.
17715 If CODE is 't', pretend the mode is V8SFmode.
17716 If CODE is 'g', pretend the mode is V16SFmode.
17717 If CODE is 'h', pretend the reg is the 'high' byte register.
17718 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
17719 If CODE is 'd', duplicate the operand for AVX instruction.
17720 If CODE is 'V', print naked full integer register name without %.
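   For example, with CODE 'b' the AX register is printed as "%al" in AT&T
   syntax and as "al" in Intel syntax.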
17723 void
17724 print_reg (rtx x, int code, FILE *file)
17726 const char *reg;
17727 int msize;
17728 unsigned int regno;
17729 bool duplicated;
17731 if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V')
17732 putc ('%', file);
17734 if (x == pc_rtx)
17736 gcc_assert (TARGET_64BIT);
17737 fputs ("rip", file);
17738 return;
17741 if (code == 'y' && STACK_TOP_P (x))
17743 fputs ("st(0)", file);
17744 return;
17747 if (code == 'w')
17748 msize = 2;
17749 else if (code == 'b')
17750 msize = 1;
17751 else if (code == 'k')
17752 msize = 4;
17753 else if (code == 'q')
17754 msize = 8;
17755 else if (code == 'h')
17756 msize = 0;
17757 else if (code == 'x')
17758 msize = 16;
17759 else if (code == 't')
17760 msize = 32;
17761 else if (code == 'g')
17762 msize = 64;
17763 else
17764 msize = GET_MODE_SIZE (GET_MODE (x));
17766 regno = REGNO (x);
17768 if (regno == ARG_POINTER_REGNUM
17769 || regno == FRAME_POINTER_REGNUM
17770 || regno == FPSR_REG
17771 || regno == FPCR_REG)
17773 output_operand_lossage
17774 ("invalid use of register '%s'", reg_names[regno]);
17775 return;
17777 else if (regno == FLAGS_REG)
17779 output_operand_lossage ("invalid use of asm flag output");
17780 return;
17783 if (code == 'V')
17785 if (GENERAL_REGNO_P (regno))
17786 msize = GET_MODE_SIZE (word_mode);
17787 else
17788 error ("'V' modifier on non-integer register");
17791 duplicated = code == 'd' && TARGET_AVX;
17793 switch (msize)
17795 case 16:
17796 case 12:
17797 case 8:
17798 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17799 warning (0, "unsupported size for integer register");
17800 /* FALLTHRU */
17801 case 4:
17802 if (LEGACY_INT_REGNO_P (regno))
17803 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17804 /* FALLTHRU */
17805 case 2:
17806 normal:
17807 reg = hi_reg_name[regno];
17808 break;
17809 case 1:
17810 if (regno >= ARRAY_SIZE (qi_reg_name))
17811 goto normal;
17812 if (!ANY_QI_REGNO_P (regno))
17813 error ("unsupported size for integer register");
17814 reg = qi_reg_name[regno];
17815 break;
17816 case 0:
17817 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17818 goto normal;
17819 reg = qi_high_reg_name[regno];
17820 break;
17821 case 32:
17822 case 64:
17823 if (SSE_REGNO_P (regno))
17825 gcc_assert (!duplicated);
17826 putc (msize == 32 ? 'y' : 'z', file);
17827 reg = hi_reg_name[regno] + 1;
17828 break;
17830 goto normal;
17831 default:
17832 gcc_unreachable ();
17835 fputs (reg, file);
17837 /* Irritatingly, AMD extended registers use
17838 a different naming convention: "r%d[bwd]". */
17839 if (REX_INT_REGNO_P (regno))
17841 gcc_assert (TARGET_64BIT);
17842 switch (msize)
17844 case 0:
17845 error ("extended registers have no high halves");
17846 break;
17847 case 1:
17848 putc ('b', file);
17849 break;
17850 case 2:
17851 putc ('w', file);
17852 break;
17853 case 4:
17854 putc ('d', file);
17855 break;
17856 case 8:
17857 /* no suffix */
17858 break;
17859 default:
17860 error ("unsupported operand size for extended register");
17861 break;
17863 return;
17866 if (duplicated)
17868 if (ASSEMBLER_DIALECT == ASM_ATT)
17869 fprintf (file, ", %%%s", reg);
17870 else
17871 fprintf (file, ", %s", reg);
17875 /* Meaning of CODE:
17876 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17877 C -- print opcode suffix for set/cmov insn.
17878 c -- like C, but print reversed condition
17879 F,f -- likewise, but for floating-point.
17880 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17881 otherwise nothing
17882 R -- print embedded rounding and sae.
17883 r -- print only sae.
17884 z -- print the opcode suffix for the size of the current operand.
17885 Z -- likewise, with special suffixes for x87 instructions.
17886 * -- print a star (in certain assembler syntax)
17887 A -- print an absolute memory reference.
17888 E -- print address with DImode register names if TARGET_64BIT.
17889 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17890 s -- print a shift double count, followed by the assembler's argument
17891 delimiter.
17892 b -- print the QImode name of the register for the indicated operand.
17893 %b0 would print %al if operands[0] is reg 0.
17894 w -- likewise, print the HImode name of the register.
17895 k -- likewise, print the SImode name of the register.
17896 q -- likewise, print the DImode name of the register.
17897 x -- likewise, print the V4SFmode name of the register.
17898 t -- likewise, print the V8SFmode name of the register.
17899 g -- likewise, print the V16SFmode name of the register.
17900 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17901 y -- print "st(0)" instead of "st" as a register.
17902 d -- print duplicated register operand for AVX instruction.
17903 D -- print condition for SSE cmp instruction.
17904 P -- if PIC, print an @PLT suffix.
17905 p -- print raw symbol name.
17906 X -- don't print any sort of PIC '@' suffix for a symbol.
17907 & -- print some in-use local-dynamic symbol name.
17908 H -- print a memory address offset by 8; used for sse high-parts
17909 Y -- print condition for XOP pcom* instruction.
17910 V -- print naked full integer register name without %.
17911 + -- print a branch hint as 'cs' or 'ds' prefix
17912 ; -- print a semicolon (after prefixes due to bug in older gas).
17913 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17914 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17915 ! -- print NOTRACK prefix for jxx/call/ret instructions if required.
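   For example, "%k1" prints operand 1 with its SImode register name
   (%eax rather than %rax), and "%z0" emits the size suffix (b/w/l/q)
   matching operand 0's mode.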
17918 void
17919 ix86_print_operand (FILE *file, rtx x, int code)
17921 if (code)
17923 switch (code)
17925 case 'A':
17926 switch (ASSEMBLER_DIALECT)
17928 case ASM_ATT:
17929 putc ('*', file);
17930 break;
17932 case ASM_INTEL:
17933 /* Intel syntax. For absolute addresses, registers should not
17934 be surrounded by brackets. */
17935 if (!REG_P (x))
17937 putc ('[', file);
17938 ix86_print_operand (file, x, 0);
17939 putc (']', file);
17940 return;
17942 break;
17944 default:
17945 gcc_unreachable ();
17948 ix86_print_operand (file, x, 0);
17949 return;
17951 case 'E':
17952 /* Wrap address in an UNSPEC to declare special handling. */
17953 if (TARGET_64BIT)
17954 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17956 output_address (VOIDmode, x);
17957 return;
17959 case 'L':
17960 if (ASSEMBLER_DIALECT == ASM_ATT)
17961 putc ('l', file);
17962 return;
17964 case 'W':
17965 if (ASSEMBLER_DIALECT == ASM_ATT)
17966 putc ('w', file);
17967 return;
17969 case 'B':
17970 if (ASSEMBLER_DIALECT == ASM_ATT)
17971 putc ('b', file);
17972 return;
17974 case 'Q':
17975 if (ASSEMBLER_DIALECT == ASM_ATT)
17976 putc ('l', file);
17977 return;
17979 case 'S':
17980 if (ASSEMBLER_DIALECT == ASM_ATT)
17981 putc ('s', file);
17982 return;
17984 case 'T':
17985 if (ASSEMBLER_DIALECT == ASM_ATT)
17986 putc ('t', file);
17987 return;
17989 case 'O':
17990 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17991 if (ASSEMBLER_DIALECT != ASM_ATT)
17992 return;
17994 switch (GET_MODE_SIZE (GET_MODE (x)))
17996 case 2:
17997 putc ('w', file);
17998 break;
18000 case 4:
18001 putc ('l', file);
18002 break;
18004 case 8:
18005 putc ('q', file);
18006 break;
18008 default:
18009 output_operand_lossage ("invalid operand size for operand "
18010 "code 'O'");
18011 return;
18014 putc ('.', file);
18015 #endif
18016 return;
18018 case 'z':
18019 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18021 /* Opcodes don't get size suffixes when using Intel syntax. */
18022 if (ASSEMBLER_DIALECT == ASM_INTEL)
18023 return;
18025 switch (GET_MODE_SIZE (GET_MODE (x)))
18027 case 1:
18028 putc ('b', file);
18029 return;
18031 case 2:
18032 putc ('w', file);
18033 return;
18035 case 4:
18036 putc ('l', file);
18037 return;
18039 case 8:
18040 putc ('q', file);
18041 return;
18043 default:
18044 output_operand_lossage ("invalid operand size for operand "
18045 "code 'z'");
18046 return;
18050 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18051 warning (0, "non-integer operand used with operand code 'z'");
18052 /* FALLTHRU */
18054 case 'Z':
18055 /* 387 opcodes don't get size suffixes when using Intel syntax. */
18056 if (ASSEMBLER_DIALECT == ASM_INTEL)
18057 return;
18059 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
18061 switch (GET_MODE_SIZE (GET_MODE (x)))
18063 case 2:
18064 #ifdef HAVE_AS_IX86_FILDS
18065 putc ('s', file);
18066 #endif
18067 return;
18069 case 4:
18070 putc ('l', file);
18071 return;
18073 case 8:
18074 #ifdef HAVE_AS_IX86_FILDQ
18075 putc ('q', file);
18076 #else
18077 fputs ("ll", file);
18078 #endif
18079 return;
18081 default:
18082 break;
18085 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
18087 /* 387 opcodes don't get size suffixes
18088 if the operands are registers. */
18089 if (STACK_REG_P (x))
18090 return;
18092 switch (GET_MODE_SIZE (GET_MODE (x)))
18094 case 4:
18095 putc ('s', file);
18096 return;
18098 case 8:
18099 putc ('l', file);
18100 return;
18102 case 12:
18103 case 16:
18104 putc ('t', file);
18105 return;
18107 default:
18108 break;
18111 else
18113 output_operand_lossage ("invalid operand type used with "
18114 "operand code 'Z'");
18115 return;
18118 output_operand_lossage ("invalid operand size for operand code 'Z'");
18119 return;
18121 case 'd':
18122 case 'b':
18123 case 'w':
18124 case 'k':
18125 case 'q':
18126 case 'h':
18127 case 't':
18128 case 'g':
18129 case 'y':
18130 case 'x':
18131 case 'X':
18132 case 'P':
18133 case 'p':
18134 case 'V':
18135 break;
18137 case 's':
18138 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
18140 ix86_print_operand (file, x, 0);
18141 fputs (", ", file);
18143 return;
18145 case 'Y':
18146 switch (GET_CODE (x))
18148 case NE:
18149 fputs ("neq", file);
18150 break;
18151 case EQ:
18152 fputs ("eq", file);
18153 break;
18154 case GE:
18155 case GEU:
18156 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
18157 break;
18158 case GT:
18159 case GTU:
18160 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
18161 break;
18162 case LE:
18163 case LEU:
18164 fputs ("le", file);
18165 break;
18166 case LT:
18167 case LTU:
18168 fputs ("lt", file);
18169 break;
18170 case UNORDERED:
18171 fputs ("unord", file);
18172 break;
18173 case ORDERED:
18174 fputs ("ord", file);
18175 break;
18176 case UNEQ:
18177 fputs ("ueq", file);
18178 break;
18179 case UNGE:
18180 fputs ("nlt", file);
18181 break;
18182 case UNGT:
18183 fputs ("nle", file);
18184 break;
18185 case UNLE:
18186 fputs ("ule", file);
18187 break;
18188 case UNLT:
18189 fputs ("ult", file);
18190 break;
18191 case LTGT:
18192 fputs ("une", file);
18193 break;
18194 default:
18195 output_operand_lossage ("operand is not a condition code, "
18196 "invalid operand code 'Y'");
18197 return;
18199 return;
18201 case 'D':
18202 /* Little bit of braindamage here. The SSE compare instructions
18203 use completely different names for the comparisons than the
18204 fp conditional moves do.
18205 switch (GET_CODE (x))
18207 case UNEQ:
18208 if (TARGET_AVX)
18210 fputs ("eq_us", file);
18211 break;
18213 /* FALLTHRU */
18214 case EQ:
18215 fputs ("eq", file);
18216 break;
18217 case UNLT:
18218 if (TARGET_AVX)
18220 fputs ("nge", file);
18221 break;
18223 /* FALLTHRU */
18224 case LT:
18225 fputs ("lt", file);
18226 break;
18227 case UNLE:
18228 if (TARGET_AVX)
18230 fputs ("ngt", file);
18231 break;
18233 /* FALLTHRU */
18234 case LE:
18235 fputs ("le", file);
18236 break;
18237 case UNORDERED:
18238 fputs ("unord", file);
18239 break;
18240 case LTGT:
18241 if (TARGET_AVX)
18243 fputs ("neq_oq", file);
18244 break;
18246 /* FALLTHRU */
18247 case NE:
18248 fputs ("neq", file);
18249 break;
18250 case GE:
18251 if (TARGET_AVX)
18253 fputs ("ge", file);
18254 break;
18256 /* FALLTHRU */
18257 case UNGE:
18258 fputs ("nlt", file);
18259 break;
18260 case GT:
18261 if (TARGET_AVX)
18263 fputs ("gt", file);
18264 break;
18266 /* FALLTHRU */
18267 case UNGT:
18268 fputs ("nle", file);
18269 break;
18270 case ORDERED:
18271 fputs ("ord", file);
18272 break;
18273 default:
18274 output_operand_lossage ("operand is not a condition code, "
18275 "invalid operand code 'D'");
18276 return;
18278 return;
18280 case 'F':
18281 case 'f':
18282 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
18283 if (ASSEMBLER_DIALECT == ASM_ATT)
18284 putc ('.', file);
18285 gcc_fallthrough ();
18286 #endif
18288 case 'C':
18289 case 'c':
18290 if (!COMPARISON_P (x))
18292 output_operand_lossage ("operand is not a condition code, "
18293 "invalid operand code '%c'", code);
18294 return;
18296 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
18297 code == 'c' || code == 'f',
18298 code == 'F' || code == 'f',
18299 file);
18300 return;
18302 case 'H':
18303 if (!offsettable_memref_p (x))
18305 output_operand_lossage ("operand is not an offsettable memory "
18306 "reference, invalid operand code 'H'");
18307 return;
18309 /* It doesn't actually matter what mode we use here, as we're
18310 only going to use this for printing. */
18311 x = adjust_address_nv (x, DImode, 8);
18312 /* Output 'qword ptr' for intel assembler dialect. */
18313 if (ASSEMBLER_DIALECT == ASM_INTEL)
18314 code = 'q';
18315 break;
18317 case 'K':
18318 if (!CONST_INT_P (x))
18320 output_operand_lossage ("operand is not an integer, invalid "
18321 "operand code 'K'");
18322 return;
18325 if (INTVAL (x) & IX86_HLE_ACQUIRE)
18326 #ifdef HAVE_AS_IX86_HLE
18327 fputs ("xacquire ", file);
18328 #else
18329 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
18330 #endif
18331 else if (INTVAL (x) & IX86_HLE_RELEASE)
18332 #ifdef HAVE_AS_IX86_HLE
18333 fputs ("xrelease ", file);
18334 #else
18335 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
18336 #endif
18337 /* We do not want to print the value of the operand. */
18338 return;
18340 case 'N':
18341 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
18342 fputs ("{z}", file);
18343 return;
18345 case 'r':
18346 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
18348 output_operand_lossage ("operand is not a specific integer, "
18349 "invalid operand code 'r'");
18350 return;
18353 if (ASSEMBLER_DIALECT == ASM_INTEL)
18354 fputs (", ", file);
18356 fputs ("{sae}", file);
18358 if (ASSEMBLER_DIALECT == ASM_ATT)
18359 fputs (", ", file);
18361 return;
18363 case 'R':
18364 if (!CONST_INT_P (x))
18366 output_operand_lossage ("operand is not an integer, invalid "
18367 "operand code 'R'");
18368 return;
18371 if (ASSEMBLER_DIALECT == ASM_INTEL)
18372 fputs (", ", file);
18374 switch (INTVAL (x))
18376 case ROUND_NEAREST_INT | ROUND_SAE:
18377 fputs ("{rn-sae}", file);
18378 break;
18379 case ROUND_NEG_INF | ROUND_SAE:
18380 fputs ("{rd-sae}", file);
18381 break;
18382 case ROUND_POS_INF | ROUND_SAE:
18383 fputs ("{ru-sae}", file);
18384 break;
18385 case ROUND_ZERO | ROUND_SAE:
18386 fputs ("{rz-sae}", file);
18387 break;
18388 default:
18389 output_operand_lossage ("operand is not a specific integer, "
18390 "invalid operand code 'R'");
18393 if (ASSEMBLER_DIALECT == ASM_ATT)
18394 fputs (", ", file);
18396 return;
18398 case '*':
18399 if (ASSEMBLER_DIALECT == ASM_ATT)
18400 putc ('*', file);
18401 return;
18403 case '&':
18405 const char *name = get_some_local_dynamic_name ();
18406 if (name == NULL)
18407 output_operand_lossage ("'%%&' used without any "
18408 "local dynamic TLS references");
18409 else
18410 assemble_name (file, name);
18411 return;
18414 case '+':
18416 rtx x;
18418 if (!optimize
18419 || optimize_function_for_size_p (cfun)
18420 || !TARGET_BRANCH_PREDICTION_HINTS)
18421 return;
18423 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
18424 if (x)
18426 int pred_val = profile_probability::from_reg_br_prob_note
18427 (XINT (x, 0)).to_reg_br_prob_base ();
18429 if (pred_val < REG_BR_PROB_BASE * 45 / 100
18430 || pred_val > REG_BR_PROB_BASE * 55 / 100)
18432 bool taken = pred_val > REG_BR_PROB_BASE / 2;
18433 bool cputaken
18434 = final_forward_branch_p (current_output_insn) == 0;
18436 /* Emit hints only when the default branch prediction
18437 heuristics would fail. */
18438 if (taken != cputaken)
18440 /* We use 3e (DS) prefix for taken branches and
18441 2e (CS) prefix for not taken branches. */
18442 if (taken)
18443 fputs ("ds ; ", file);
18444 else
18445 fputs ("cs ; ", file);
18449 return;
18452 case ';':
18453 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
18454 putc (';', file);
18455 #endif
18456 return;
18458 case '~':
18459 putc (TARGET_AVX2 ? 'i' : 'f', file);
18460 return;
18462 case '^':
18463 if (TARGET_64BIT && Pmode != word_mode)
18464 fputs ("addr32 ", file);
18465 return;
18467 case '!':
18468 if (ix86_notrack_prefixed_insn_p (current_output_insn))
18469 fputs ("notrack ", file);
18470 return;
18472 default:
18473 output_operand_lossage ("invalid operand code '%c'", code);
18477 if (REG_P (x))
18478 print_reg (x, code, file);
18480 else if (MEM_P (x))
18482 rtx addr = XEXP (x, 0);
18484 /* No `byte ptr' prefix for call instructions ... */
18485 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
18487 machine_mode mode = GET_MODE (x);
18488 const char *size;
18490 /* Check for explicit size override codes. */
18491 if (code == 'b')
18492 size = "BYTE";
18493 else if (code == 'w')
18494 size = "WORD";
18495 else if (code == 'k')
18496 size = "DWORD";
18497 else if (code == 'q')
18498 size = "QWORD";
18499 else if (code == 'x')
18500 size = "XMMWORD";
18501 else if (code == 't')
18502 size = "YMMWORD";
18503 else if (code == 'g')
18504 size = "ZMMWORD";
18505 else if (mode == BLKmode)
18506 /* ... or BLKmode operands, when not overridden. */
18507 size = NULL;
18508 else
18509 switch (GET_MODE_SIZE (mode))
18511 case 1: size = "BYTE"; break;
18512 case 2: size = "WORD"; break;
18513 case 4: size = "DWORD"; break;
18514 case 8: size = "QWORD"; break;
18515 case 12: size = "TBYTE"; break;
18516 case 16:
18517 if (mode == XFmode)
18518 size = "TBYTE";
18519 else
18520 size = "XMMWORD";
18521 break;
18522 case 32: size = "YMMWORD"; break;
18523 case 64: size = "ZMMWORD"; break;
18524 default:
18525 gcc_unreachable ();
18527 if (size)
18529 fputs (size, file);
18530 fputs (" PTR ", file);
18534 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18535 output_operand_lossage ("invalid constraints for operand");
18536 else
18537 ix86_print_operand_address_as
18538 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18541 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18543 long l;
18545 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18547 if (ASSEMBLER_DIALECT == ASM_ATT)
18548 putc ('$', file);
18549 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18550 if (code == 'q')
18551 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18552 (unsigned long long) (int) l);
18553 else
18554 fprintf (file, "0x%08x", (unsigned int) l);
18557 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18559 long l[2];
18561 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18563 if (ASSEMBLER_DIALECT == ASM_ATT)
18564 putc ('$', file);
18565 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18568 /* These float cases don't actually occur as immediate operands. */
18569 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18571 char dstr[30];
18573 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18574 fputs (dstr, file);
18577 else
18579 /* We have patterns that allow zero sets of memory, for instance.
18580 In 64-bit mode, we should probably support all 8-byte vectors,
18581 since we can in fact encode that into an immediate. */
18582 if (GET_CODE (x) == CONST_VECTOR)
18584 if (x != CONST0_RTX (GET_MODE (x)))
18585 output_operand_lossage ("invalid vector immediate");
18586 x = const0_rtx;
18589 if (code != 'P' && code != 'p')
18591 if (CONST_INT_P (x))
18593 if (ASSEMBLER_DIALECT == ASM_ATT)
18594 putc ('$', file);
18596 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18597 || GET_CODE (x) == LABEL_REF)
18599 if (ASSEMBLER_DIALECT == ASM_ATT)
18600 putc ('$', file);
18601 else
18602 fputs ("OFFSET FLAT:", file);
18605 if (CONST_INT_P (x))
18606 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18607 else if (flag_pic || MACHOPIC_INDIRECT)
18608 output_pic_addr_const (file, x, code);
18609 else
18610 output_addr_const (file, x);
18614 static bool
18615 ix86_print_operand_punct_valid_p (unsigned char code)
18617 return (code == '*' || code == '+' || code == '&' || code == ';'
18618 || code == '~' || code == '^' || code == '!');
18621 /* Print a memory operand whose address is ADDR. */
18623 static void
18624 ix86_print_operand_address_as (FILE *file, rtx addr,
18625 addr_space_t as, bool no_rip)
18627 struct ix86_address parts;
18628 rtx base, index, disp;
18629 int scale;
18630 int ok;
18631 bool vsib = false;
18632 int code = 0;
18634 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18636 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18637 gcc_assert (parts.index == NULL_RTX);
18638 parts.index = XVECEXP (addr, 0, 1);
18639 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18640 addr = XVECEXP (addr, 0, 0);
18641 vsib = true;
18643 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18645 gcc_assert (TARGET_64BIT);
18646 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18647 code = 'q';
18649 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18651 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18652 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18653 if (parts.base != NULL_RTX)
18655 parts.index = parts.base;
18656 parts.scale = 1;
18658 parts.base = XVECEXP (addr, 0, 0);
18659 addr = XVECEXP (addr, 0, 0);
18661 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18663 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18664 gcc_assert (parts.index == NULL_RTX);
18665 parts.index = XVECEXP (addr, 0, 1);
18666 addr = XVECEXP (addr, 0, 0);
18668 else
18669 ok = ix86_decompose_address (addr, &parts);
18671 gcc_assert (ok);
18673 base = parts.base;
18674 index = parts.index;
18675 disp = parts.disp;
18676 scale = parts.scale;
18678 if (ADDR_SPACE_GENERIC_P (as))
18679 as = parts.seg;
18680 else
18681 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18683 if (!ADDR_SPACE_GENERIC_P (as))
18685 const char *string;
18687 if (as == ADDR_SPACE_SEG_FS)
18688 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18689 else if (as == ADDR_SPACE_SEG_GS)
18690 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18691 else
18692 gcc_unreachable ();
18693 fputs (string, file);
18696 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18697 if (TARGET_64BIT && !base && !index && !no_rip)
18699 rtx symbol = disp;
18701 if (GET_CODE (disp) == CONST
18702 && GET_CODE (XEXP (disp, 0)) == PLUS
18703 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18704 symbol = XEXP (XEXP (disp, 0), 0);
18706 if (GET_CODE (symbol) == LABEL_REF
18707 || (GET_CODE (symbol) == SYMBOL_REF
18708 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18709 base = pc_rtx;
18712 if (!base && !index)
18714 /* Displacement only requires special attention. */
18715 if (CONST_INT_P (disp))
18717 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18718 fputs ("ds:", file);
18719 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18721 /* Load the external function address via the GOT slot to avoid PLT. */
18722 else if (GET_CODE (disp) == CONST
18723 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18724 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18725 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18726 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18727 output_pic_addr_const (file, disp, 0);
18728 else if (flag_pic)
18729 output_pic_addr_const (file, disp, 0);
18730 else
18731 output_addr_const (file, disp);
18733 else
18735 /* Print SImode register names to force addr32 prefix. */
18736 if (SImode_address_operand (addr, VOIDmode))
18738 if (flag_checking)
18740 gcc_assert (TARGET_64BIT);
18741 switch (GET_CODE (addr))
18743 case SUBREG:
18744 gcc_assert (GET_MODE (addr) == SImode);
18745 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18746 break;
18747 case ZERO_EXTEND:
18748 case AND:
18749 gcc_assert (GET_MODE (addr) == DImode);
18750 break;
18751 default:
18752 gcc_unreachable ();
18755 gcc_assert (!code);
18756 code = 'k';
18758 else if (code == 0
18759 && TARGET_X32
18760 && disp
18761 && CONST_INT_P (disp)
18762 && INTVAL (disp) < -16*1024*1024)
18764 /* X32 runs in 64-bit mode, where displacement, DISP, in
18765 address DISP(%r64), is encoded as 32-bit immediate sign-
18766 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18767 address is %r64 + 0xffffffffbffffd00. When %r64 <
18768 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18769 which is invalid for x32. The correct address is %r64
18770 - 0x40000300 == 0xf7ffdd64. To properly encode
18771 -0x40000300(%r64) for x32, we zero-extend negative
18772 displacement by forcing addr32 prefix which truncates
18773 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18774 zero-extend all negative displacements, including -1(%rsp).
18775 However, for small negative displacements, sign-extension
18776 won't cause overflow. We only zero-extend negative
18777 displacements if they are < -16*1024*1024, which is also used
18778 to check legitimate address displacements for PIC. */
18779 code = 'k';
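	  /* As a concrete instance of the arithmetic described above, with
	     %r64 == 0x37ffe064 and DISP == -0x40000300 the 64-bit sum is
	     0x37ffe064 + 0xffffffffbffffd00 == 0xfffffffff7ffdd64, which is
	     not a valid x32 address.  The addr32 (0x67) prefix truncates
	     the sum to 32 bits, giving 0xf7ffdd64
	     == (0x37ffe064 - 0x40000300) mod 2^32, the address we actually
	     want.  */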
18782 /* Since the upper 32 bits of RSP are always zero for x32,
18783 we can encode %esp as %rsp to avoid 0x67 prefix if
18784 there is no index register. */
18785 if (TARGET_X32 && Pmode == SImode
18786 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18787 code = 'q';
18789 if (ASSEMBLER_DIALECT == ASM_ATT)
18791 if (disp)
18793 if (flag_pic)
18794 output_pic_addr_const (file, disp, 0);
18795 else if (GET_CODE (disp) == LABEL_REF)
18796 output_asm_label (disp);
18797 else
18798 output_addr_const (file, disp);
18801 putc ('(', file);
18802 if (base)
18803 print_reg (base, code, file);
18804 if (index)
18806 putc (',', file);
18807 print_reg (index, vsib ? 0 : code, file);
18808 if (scale != 1 || vsib)
18809 fprintf (file, ",%d", scale);
18811 putc (')', file);
18813 else
18815 rtx offset = NULL_RTX;
18817 if (disp)
18819 /* Pull out the offset of a symbol; print any symbol itself. */
18820 if (GET_CODE (disp) == CONST
18821 && GET_CODE (XEXP (disp, 0)) == PLUS
18822 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18824 offset = XEXP (XEXP (disp, 0), 1);
18825 disp = gen_rtx_CONST (VOIDmode,
18826 XEXP (XEXP (disp, 0), 0));
18829 if (flag_pic)
18830 output_pic_addr_const (file, disp, 0);
18831 else if (GET_CODE (disp) == LABEL_REF)
18832 output_asm_label (disp);
18833 else if (CONST_INT_P (disp))
18834 offset = disp;
18835 else
18836 output_addr_const (file, disp);
18839 putc ('[', file);
18840 if (base)
18842 print_reg (base, code, file);
18843 if (offset)
18845 if (INTVAL (offset) >= 0)
18846 putc ('+', file);
18847 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18850 else if (offset)
18851 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18852 else
18853 putc ('0', file);
18855 if (index)
18857 putc ('+', file);
18858 print_reg (index, vsib ? 0 : code, file);
18859 if (scale != 1 || vsib)
18860 fprintf (file, "*%d", scale);
18862 putc (']', file);
18867 static void
18868 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18870 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18873 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18875 static bool
18876 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18878 rtx op;
18880 if (GET_CODE (x) != UNSPEC)
18881 return false;
18883 op = XVECEXP (x, 0, 0);
18884 switch (XINT (x, 1))
18886 case UNSPEC_GOTOFF:
18887 output_addr_const (file, op);
18888 fputs ("@gotoff", file);
18889 break;
18890 case UNSPEC_GOTTPOFF:
18891 output_addr_const (file, op);
18892 /* FIXME: This might be @TPOFF in Sun ld. */
18893 fputs ("@gottpoff", file);
18894 break;
18895 case UNSPEC_TPOFF:
18896 output_addr_const (file, op);
18897 fputs ("@tpoff", file);
18898 break;
18899 case UNSPEC_NTPOFF:
18900 output_addr_const (file, op);
18901 if (TARGET_64BIT)
18902 fputs ("@tpoff", file);
18903 else
18904 fputs ("@ntpoff", file);
18905 break;
18906 case UNSPEC_DTPOFF:
18907 output_addr_const (file, op);
18908 fputs ("@dtpoff", file);
18909 break;
18910 case UNSPEC_GOTNTPOFF:
18911 output_addr_const (file, op);
18912 if (TARGET_64BIT)
18913 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18914 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18915 else
18916 fputs ("@gotntpoff", file);
18917 break;
18918 case UNSPEC_INDNTPOFF:
18919 output_addr_const (file, op);
18920 fputs ("@indntpoff", file);
18921 break;
18922 #if TARGET_MACHO
18923 case UNSPEC_MACHOPIC_OFFSET:
18924 output_addr_const (file, op);
18925 putc ('-', file);
18926 machopic_output_function_base_name (file);
18927 break;
18928 #endif
18930 default:
18931 return false;
18934 return true;
18937 /* Split one or more double-mode RTL references into pairs of half-mode
18938 references. The RTL can be REG, offsettable MEM, integer constant, or
18939 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18940 split and "num" is its length. lo_half and hi_half are output arrays
18941 that parallel "operands". */
18943 void
18944 split_double_mode (machine_mode mode, rtx operands[],
18945 int num, rtx lo_half[], rtx hi_half[])
18947 machine_mode half_mode;
18948 unsigned int byte;
18950 switch (mode)
18952 case E_TImode:
18953 half_mode = DImode;
18954 break;
18955 case E_DImode:
18956 half_mode = SImode;
18957 break;
18958 default:
18959 gcc_unreachable ();
18962 byte = GET_MODE_SIZE (half_mode);
18964 while (num--)
18966 rtx op = operands[num];
18968 /* simplify_subreg refuses to split volatile memory addresses,
18969 but we still have to handle them. */
18970 if (MEM_P (op))
18972 lo_half[num] = adjust_address (op, half_mode, 0);
18973 hi_half[num] = adjust_address (op, half_mode, byte);
18975 else
18977 lo_half[num] = simplify_gen_subreg (half_mode, op,
18978 GET_MODE (op) == VOIDmode
18979 ? mode : GET_MODE (op), 0);
18980 hi_half[num] = simplify_gen_subreg (half_mode, op,
18981 GET_MODE (op) == VOIDmode
18982 ? mode : GET_MODE (op), byte);
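  /* For example, splitting a DImode register operand here yields lo_half
     as an SImode subreg at byte 0 and hi_half as an SImode subreg at byte
     GET_MODE_SIZE (SImode) == 4, while a (possibly volatile) DImode MEM is
     instead split with adjust_address into SImode MEMs at offsets 0
     and 4.  */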
18987 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18988 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18989 is the expression of the binary operation. The output may either be
18990 emitted here, or returned to the caller, like all output_* functions.
18992 There is no guarantee that the operands are the same mode, as they
18993 might be within FLOAT or FLOAT_EXTEND expressions. */
18995 #ifndef SYSV386_COMPAT
18996 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18997 wants to fix the assemblers because that causes incompatibility
18998 with gcc. No-one wants to fix gcc because that causes
18999 incompatibility with assemblers... You can use the option of
19000 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
19001 #define SYSV386_COMPAT 1
19002 #endif
19004 const char *
19005 output_387_binary_op (rtx_insn *insn, rtx *operands)
19007 static char buf[40];
19008 const char *p;
19009 bool is_sse
19010 = (SSE_REG_P (operands[0])
19011 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
19013 if (is_sse)
19014 p = "%v";
19015 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
19016 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
19017 p = "fi";
19018 else
19019 p = "f";
19021 strcpy (buf, p);
19023 switch (GET_CODE (operands[3]))
19025 case PLUS:
19026 p = "add"; break;
19027 case MINUS:
19028 p = "sub"; break;
19029 case MULT:
19030 p = "mul"; break;
19031 case DIV:
19032 p = "div"; break;
19033 default:
19034 gcc_unreachable ();
19037 strcat (buf, p);
19039 if (is_sse)
19041 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
19042 strcat (buf, p);
19044 if (TARGET_AVX)
19045 p = "\t{%2, %1, %0|%0, %1, %2}";
19046 else
19047 p = "\t{%2, %0|%0, %2}";
19049 strcat (buf, p);
19050 return buf;
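  /* For instance, an AVX scalar SFmode addition whose operands include SSE
     registers produces "%vaddss\t{%2, %1, %0|%0, %1, %2}" here; the %v
     operand prefix emits the AVX "v" mnemonic prefix when AVX is enabled,
     and the {att|intel} braces select the operand order for the current
     assembler dialect.  */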
19053 /* Even if we do not want to check the inputs, this documents the input
19054 constraints, which helps in understanding the following code. */
19055 if (flag_checking)
19057 if (STACK_REG_P (operands[0])
19058 && ((REG_P (operands[1])
19059 && REGNO (operands[0]) == REGNO (operands[1])
19060 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
19061 || (REG_P (operands[2])
19062 && REGNO (operands[0]) == REGNO (operands[2])
19063 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
19064 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
19065 ; /* ok */
19066 else
19067 gcc_unreachable ();
19070 switch (GET_CODE (operands[3]))
19072 case MULT:
19073 case PLUS:
19074 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
19075 std::swap (operands[1], operands[2]);
19077 /* Now we know operands[0] == operands[1]. */
19079 if (MEM_P (operands[2]))
19081 p = "%Z2\t%2";
19082 break;
19085 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19087 if (STACK_TOP_P (operands[0]))
19088 /* How is it that we are storing to a dead operand[2]?
19089 Well, presumably operands[1] is dead too. We can't
19090 store the result to st(0) as st(0) gets popped on this
19091 instruction. Instead store to operands[2] (which I
19092 think has to be st(1)). st(1) will be popped later.
19093 gcc <= 2.8.1 didn't have this check and generated
19094 assembly code that the Unixware assembler rejected. */
19095 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19096 else
19097 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19098 break;
19101 if (STACK_TOP_P (operands[0]))
19102 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19103 else
19104 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19105 break;
19107 case MINUS:
19108 case DIV:
19109 if (MEM_P (operands[1]))
19111 p = "r%Z1\t%1";
19112 break;
19115 if (MEM_P (operands[2]))
19117 p = "%Z2\t%2";
19118 break;
19121 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
19123 #if SYSV386_COMPAT
19124 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
19125 derived assemblers, confusingly reverse the direction of
19126 the operation for fsub{r} and fdiv{r} when the
19127 destination register is not st(0). The Intel assembler
19128 doesn't have this brain damage. Read !SYSV386_COMPAT to
19129 figure out what the hardware really does. */
19130 if (STACK_TOP_P (operands[0]))
19131 p = "{p\t%0, %2|rp\t%2, %0}";
19132 else
19133 p = "{rp\t%2, %0|p\t%0, %2}";
19134 #else
19135 if (STACK_TOP_P (operands[0]))
19136 /* As above for fmul/fadd, we can't store to st(0). */
19137 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
19138 else
19139 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
19140 #endif
19141 break;
19144 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
19146 #if SYSV386_COMPAT
19147 if (STACK_TOP_P (operands[0]))
19148 p = "{rp\t%0, %1|p\t%1, %0}";
19149 else
19150 p = "{p\t%1, %0|rp\t%0, %1}";
19151 #else
19152 if (STACK_TOP_P (operands[0]))
19153 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
19154 else
19155 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
19156 #endif
19157 break;
19160 if (STACK_TOP_P (operands[0]))
19162 if (STACK_TOP_P (operands[1]))
19163 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
19164 else
19165 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
19166 break;
19168 else if (STACK_TOP_P (operands[1]))
19170 #if SYSV386_COMPAT
19171 p = "{\t%1, %0|r\t%0, %1}";
19172 #else
19173 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
19174 #endif
19176 else
19178 #if SYSV386_COMPAT
19179 p = "{r\t%2, %0|\t%0, %2}";
19180 #else
19181 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
19182 #endif
19184 break;
19186 default:
19187 gcc_unreachable ();
19190 strcat (buf, p);
19191 return buf;
19194 /* Return needed mode for entity in optimize_mode_switching pass. */
19196 static int
19197 ix86_dirflag_mode_needed (rtx_insn *insn)
19199 if (CALL_P (insn))
19201 if (cfun->machine->func_type == TYPE_NORMAL)
19202 return X86_DIRFLAG_ANY;
19203 else
19204 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
19205 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
19208 if (recog_memoized (insn) < 0)
19209 return X86_DIRFLAG_ANY;
19211 if (get_attr_type (insn) == TYPE_STR)
19213 /* Emit cld instruction if stringops are used in the function. */
19214 if (cfun->machine->func_type == TYPE_NORMAL)
19215 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
19216 else
19217 return X86_DIRFLAG_RESET;
19220 return X86_DIRFLAG_ANY;
19223 /* Check if a 256bit or 512bit AVX register is referenced inside EXP. */
19225 static bool
19226 ix86_check_avx_upper_register (const_rtx exp)
19228 if (SUBREG_P (exp))
19229 exp = SUBREG_REG (exp);
19231 return (REG_P (exp)
19232 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
19233 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
19236 /* Return needed mode for entity in optimize_mode_switching pass. */
19238 static int
19239 ix86_avx_u128_mode_needed (rtx_insn *insn)
19241 if (CALL_P (insn))
19243 rtx link;
19245 /* Needed mode is set to AVX_U128_CLEAN if there are
19246 no 256bit or 512bit modes used in function arguments. */
19247 for (link = CALL_INSN_FUNCTION_USAGE (insn);
19248 link;
19249 link = XEXP (link, 1))
19251 if (GET_CODE (XEXP (link, 0)) == USE)
19253 rtx arg = XEXP (XEXP (link, 0), 0);
19255 if (ix86_check_avx_upper_register (arg))
19256 return AVX_U128_DIRTY;
19260 return AVX_U128_CLEAN;
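  /* For example, a call passing a __m256 argument carries a 256-bit USE in
     CALL_INSN_FUNCTION_USAGE and therefore needs AVX_U128_DIRTY (no
     vzeroupper may be placed before it), while a call taking only scalar
     or 128-bit arguments needs AVX_U128_CLEAN, which lets mode switching
     insert vzeroupper before the call to avoid AVX/SSE transition
     penalties.  */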
19263 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
19264 Hardware changes state only when a 256bit register is written to,
19265 but we need to prevent the compiler from moving the optimal insertion
19266 point above an eventual read from a 256bit or 512bit register. */
19267 subrtx_iterator::array_type array;
19268 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
19269 if (ix86_check_avx_upper_register (*iter))
19270 return AVX_U128_DIRTY;
19272 return AVX_U128_ANY;
19275 /* Return mode that i387 must be switched into
19276 prior to the execution of insn. */
19278 static int
19279 ix86_i387_mode_needed (int entity, rtx_insn *insn)
19281 enum attr_i387_cw mode;
19283 /* The mode UNINITIALIZED is used to store the control word after a
19284 function call or ASM pattern. The mode ANY specifies that the function
19285 has no requirements on the control word and makes no changes in the
19286 bits we are interested in. */
19288 if (CALL_P (insn)
19289 || (NONJUMP_INSN_P (insn)
19290 && (asm_noperands (PATTERN (insn)) >= 0
19291 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
19292 return I387_CW_UNINITIALIZED;
19294 if (recog_memoized (insn) < 0)
19295 return I387_CW_ANY;
19297 mode = get_attr_i387_cw (insn);
19299 switch (entity)
19301 case I387_TRUNC:
19302 if (mode == I387_CW_TRUNC)
19303 return mode;
19304 break;
19306 case I387_FLOOR:
19307 if (mode == I387_CW_FLOOR)
19308 return mode;
19309 break;
19311 case I387_CEIL:
19312 if (mode == I387_CW_CEIL)
19313 return mode;
19314 break;
19316 case I387_MASK_PM:
19317 if (mode == I387_CW_MASK_PM)
19318 return mode;
19319 break;
19321 default:
19322 gcc_unreachable ();
19325 return I387_CW_ANY;
19328 /* Return mode that entity must be switched into
19329 prior to the execution of insn. */
19331 static int
19332 ix86_mode_needed (int entity, rtx_insn *insn)
19334 switch (entity)
19336 case X86_DIRFLAG:
19337 return ix86_dirflag_mode_needed (insn);
19338 case AVX_U128:
19339 return ix86_avx_u128_mode_needed (insn);
19340 case I387_TRUNC:
19341 case I387_FLOOR:
19342 case I387_CEIL:
19343 case I387_MASK_PM:
19344 return ix86_i387_mode_needed (entity, insn);
19345 default:
19346 gcc_unreachable ();
19348 return 0;
19351 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
19353 static void
19354 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
19356 if (ix86_check_avx_upper_register (dest))
19358 bool *used = (bool *) data;
19359 *used = true;
19363 /* Calculate mode of upper 128bit AVX registers after the insn. */
19365 static int
19366 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
19368 rtx pat = PATTERN (insn);
19370 if (vzeroupper_operation (pat, VOIDmode)
19371 || vzeroall_operation (pat, VOIDmode))
19372 return AVX_U128_CLEAN;
19374 /* We know that the state is clean after a CALL insn if the function
19375 return value does not use a 256bit or 512bit register. */
19376 if (CALL_P (insn))
19378 bool avx_upper_reg_found = false;
19379 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
19381 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
19384 /* Otherwise, return current mode. Remember that if insn
19385 references AVX 256bit or 512bit registers, the mode was already
19386 changed to DIRTY from MODE_NEEDED. */
19387 return mode;
19390 /* Return the mode that an insn results in. */
19392 static int
19393 ix86_mode_after (int entity, int mode, rtx_insn *insn)
19395 switch (entity)
19397 case X86_DIRFLAG:
19398 return mode;
19399 case AVX_U128:
19400 return ix86_avx_u128_mode_after (mode, insn);
19401 case I387_TRUNC:
19402 case I387_FLOOR:
19403 case I387_CEIL:
19404 case I387_MASK_PM:
19405 return mode;
19406 default:
19407 gcc_unreachable ();
19411 static int
19412 ix86_dirflag_mode_entry (void)
19414 /* For TARGET_CLD or in the interrupt handler we can't assume
19415 direction flag state at function entry. */
19416 if (TARGET_CLD
19417 || cfun->machine->func_type != TYPE_NORMAL)
19418 return X86_DIRFLAG_ANY;
19420 return X86_DIRFLAG_RESET;
19423 static int
19424 ix86_avx_u128_mode_entry (void)
19426 tree arg;
19428 /* Entry mode is set to AVX_U128_DIRTY if there are
19429 256bit or 512bit modes used in function arguments. */
19430 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
19431 arg = TREE_CHAIN (arg))
19433 rtx incoming = DECL_INCOMING_RTL (arg);
19435 if (incoming && ix86_check_avx_upper_register (incoming))
19436 return AVX_U128_DIRTY;
19439 return AVX_U128_CLEAN;
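  /* E.g. a function taking a __m256 or __m512 parameter has
     DECL_INCOMING_RTL in a 256-bit or 512-bit mode, so the upper halves of
     the vector registers are assumed live (DIRTY) on entry; a function
     with only scalar or 128-bit parameters starts out CLEAN.  */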
19442 /* Return a mode that ENTITY is assumed to be
19443 switched to at function entry. */
19445 static int
19446 ix86_mode_entry (int entity)
19448 switch (entity)
19450 case X86_DIRFLAG:
19451 return ix86_dirflag_mode_entry ();
19452 case AVX_U128:
19453 return ix86_avx_u128_mode_entry ();
19454 case I387_TRUNC:
19455 case I387_FLOOR:
19456 case I387_CEIL:
19457 case I387_MASK_PM:
19458 return I387_CW_ANY;
19459 default:
19460 gcc_unreachable ();
19464 static int
19465 ix86_avx_u128_mode_exit (void)
19467 rtx reg = crtl->return_rtx;
19469 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
19470 or 512bit modes used in the function return register. */
19471 if (reg && ix86_check_avx_upper_register (reg))
19472 return AVX_U128_DIRTY;
19474 return AVX_U128_CLEAN;
19477 /* Return a mode that ENTITY is assumed to be
19478 switched to at function exit. */
19480 static int
19481 ix86_mode_exit (int entity)
19483 switch (entity)
19485 case X86_DIRFLAG:
19486 return X86_DIRFLAG_ANY;
19487 case AVX_U128:
19488 return ix86_avx_u128_mode_exit ();
19489 case I387_TRUNC:
19490 case I387_FLOOR:
19491 case I387_CEIL:
19492 case I387_MASK_PM:
19493 return I387_CW_ANY;
19494 default:
19495 gcc_unreachable ();
19499 static int
19500 ix86_mode_priority (int, int n)
19502 return n;
19505 /* Output code to initialize control word copies used by trunc?f?i and
19506 rounding patterns. CURRENT_MODE is set to current control word,
19507 while NEW_MODE is set to new control word. */
19509 static void
19510 emit_i387_cw_initialization (int mode)
19512 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
19513 rtx new_mode;
19515 enum ix86_stack_slot slot;
19517 rtx reg = gen_reg_rtx (HImode);
19519 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19520 emit_move_insn (reg, copy_rtx (stored_mode));
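  /* The bits adjusted below live in the x87 control word: the rounding
     control field is bits 11:10 (mask 0x0c00), with 00 = round to nearest,
     01 = round down, 10 = round up and 11 = round toward zero, and bit 5
     (0x0020) is the precision exception mask used for nearbyint.  */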
19522 switch (mode)
19524 case I387_CW_TRUNC:
19525 /* round toward zero (truncate) */
19526 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19527 slot = SLOT_CW_TRUNC;
19528 break;
19530 case I387_CW_FLOOR:
19531 /* round down toward -oo */
19532 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19533 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19534 slot = SLOT_CW_FLOOR;
19535 break;
19537 case I387_CW_CEIL:
19538 /* round up toward +oo */
19539 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19540 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19541 slot = SLOT_CW_CEIL;
19542 break;
19544 case I387_CW_MASK_PM:
19545 /* mask precision exception for nearbyint() */
19546 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19547 slot = SLOT_CW_MASK_PM;
19548 break;
19550 default:
19551 gcc_unreachable ();
19554 gcc_assert (slot < MAX_386_STACK_LOCALS);
19556 new_mode = assign_386_stack_local (HImode, slot);
19557 emit_move_insn (new_mode, reg);
19560 /* Emit vzeroupper. */
19562 void
19563 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19565 int i;
19567 /* Cancel automatic vzeroupper insertion if there are
19568 live call-saved SSE registers at the insertion point. */
19570 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19571 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19572 return;
19574 if (TARGET_64BIT)
19575 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19576 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19577 return;
19579 emit_insn (gen_avx_vzeroupper ());
19584 /* Generate one or more insns to set ENTITY to MODE. HARD_REG_LIVE
19585 is the set of hard registers live at the point where the insn(s)
19586 are to be inserted. */
19588 static void
19589 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19590 HARD_REG_SET regs_live)
19592 switch (entity)
19594 case X86_DIRFLAG:
19595 if (mode == X86_DIRFLAG_RESET)
19596 emit_insn (gen_cld ());
19597 break;
19598 case AVX_U128:
19599 if (mode == AVX_U128_CLEAN)
19600 ix86_avx_emit_vzeroupper (regs_live);
19601 break;
19602 case I387_TRUNC:
19603 case I387_FLOOR:
19604 case I387_CEIL:
19605 case I387_MASK_PM:
19606 if (mode != I387_CW_ANY
19607 && mode != I387_CW_UNINITIALIZED)
19608 emit_i387_cw_initialization (mode);
19609 break;
19610 default:
19611 gcc_unreachable ();
19615 /* Output code for INSN to convert a float to a signed int. OPERANDS
19616 are the insn operands. The output may be [HSD]Imode and the input
19617 operand may be [SDX]Fmode. */
19619 const char *
19620 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19622 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19623 bool dimode_p = GET_MODE (operands[0]) == DImode;
19624 int round_mode = get_attr_i387_cw (insn);
19626 static char buf[40];
19627 const char *p;
19629 /* Jump through a hoop or two for DImode, since the hardware has no
19630 non-popping instruction. We used to do this a different way, but
19631 that was somewhat fragile and broke with post-reload splitters. */
19632 if ((dimode_p || fisttp) && !stack_top_dies)
19633 output_asm_insn ("fld\t%y1", operands);
19635 gcc_assert (STACK_TOP_P (operands[1]));
19636 gcc_assert (MEM_P (operands[0]));
19637 gcc_assert (GET_MODE (operands[1]) != TFmode);
19639 if (fisttp)
19640 return "fisttp%Z0\t%0";
19642 strcpy (buf, "fist");
19644 if (round_mode != I387_CW_ANY)
19645 output_asm_insn ("fldcw\t%3", operands);
19647 p = "p%Z0\t%0";
19648 strcat (buf, p + !(stack_top_dies || dimode_p));
19650 output_asm_insn (buf, operands);
19652 if (round_mode != I387_CW_ANY)
19653 output_asm_insn ("fldcw\t%2", operands);
19655 return "";
19658 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19659 have the values zero or one, indicates the ffreep insn's operand
19660 from the OPERANDS array. */
19662 static const char *
19663 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19665 if (TARGET_USE_FFREEP)
19666 #ifdef HAVE_AS_IX86_FFREEP
19667 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19668 #else
19670 static char retval[32];
19671 int regno = REGNO (operands[opno]);
19673 gcc_assert (STACK_REGNO_P (regno));
19675 regno -= FIRST_STACK_REG;
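      /* Emit the two-byte FFREEP encoding directly for assemblers that do
	 not know the mnemonic: ffreep %st(N) is 0xdf 0xc0+N, which stored
	 as a little-endian 16-bit word is 0xc<N>df, e.g. ASM_SHORT 0xc0df
	 for %st(0).  */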
19677 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19678 return retval;
19680 #endif
19682 return opno ? "fstp\t%y1" : "fstp\t%y0";
19686 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19687 should be used. UNORDERED_P is true when fucom should be used. */
19689 const char *
19690 output_fp_compare (rtx_insn *insn, rtx *operands,
19691 bool eflags_p, bool unordered_p)
19693 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19694 bool stack_top_dies;
19696 static char buf[40];
19697 const char *p;
19699 gcc_assert (STACK_TOP_P (xops[0]));
19701 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19703 if (eflags_p)
19705 p = unordered_p ? "fucomi" : "fcomi";
19706 strcpy (buf, p);
19708 p = "p\t{%y1, %0|%0, %y1}";
19709 strcat (buf, p + !stack_top_dies);
19711 return buf;
19714 if (STACK_REG_P (xops[1])
19715 && stack_top_dies
19716 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19718 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19720 /* If the top of the 387 stack dies, and the other operand
19721 is also a stack register that dies, then this must be a
19722 `fcompp' float compare. */
19723 p = unordered_p ? "fucompp" : "fcompp";
19724 strcpy (buf, p);
19726 else if (const0_operand (xops[1], VOIDmode))
19728 gcc_assert (!unordered_p);
19729 strcpy (buf, "ftst");
19731 else
19733 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19735 gcc_assert (!unordered_p);
19736 p = "ficom";
19738 else
19739 p = unordered_p ? "fucom" : "fcom";
19741 strcpy (buf, p);
19743 p = "p%Z2\t%y2";
19744 strcat (buf, p + !stack_top_dies);
19747 output_asm_insn (buf, operands);
19748 return "fnstsw\t%0";
19751 void
19752 ix86_output_addr_vec_elt (FILE *file, int value)
19754 const char *directive = ASM_LONG;
19756 #ifdef ASM_QUAD
19757 if (TARGET_LP64)
19758 directive = ASM_QUAD;
19759 #else
19760 gcc_assert (!TARGET_64BIT);
19761 #endif
19763 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19766 void
19767 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19769 const char *directive = ASM_LONG;
19771 #ifdef ASM_QUAD
19772 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19773 directive = ASM_QUAD;
19774 #else
19775 gcc_assert (!TARGET_64BIT);
19776 #endif
19777 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19778 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19779 fprintf (file, "%s%s%d-%s%d\n",
19780 directive, LPREFIX, value, LPREFIX, rel);
19781 else if (HAVE_AS_GOTOFF_IN_DATA)
19782 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19783 #if TARGET_MACHO
19784 else if (TARGET_MACHO)
19786 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19787 machopic_output_function_base_name (file);
19788 putc ('\n', file);
19790 #endif
19791 else
19792 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19793 GOT_SYMBOL_NAME, LPREFIX, value);
19796 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19797 for the target. */
19799 void
19800 ix86_expand_clear (rtx dest)
19802 rtx tmp;
19804 /* We play register width games, which are only valid after reload. */
19805 gcc_assert (reload_completed);
19807 /* Avoid HImode and its attendant prefix byte. */
19808 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19809 dest = gen_rtx_REG (SImode, REGNO (dest));
19810 tmp = gen_rtx_SET (dest, const0_rtx);
19812 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19814 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19815 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19818 emit_insn (tmp);
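  /* For example, clearing %eax with "xorl %eax, %eax" is a 2-byte encoding
     (0x31 0xc0) versus 5 bytes (0xb8 imm32) for "movl $0, %eax", but the
     xor form clobbers the flags, hence the CCmode clobber added above.  */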
19821 void
19822 ix86_expand_move (machine_mode mode, rtx operands[])
19824 rtx op0, op1;
19825 rtx tmp, addend = NULL_RTX;
19826 enum tls_model model;
19828 op0 = operands[0];
19829 op1 = operands[1];
19831 switch (GET_CODE (op1))
19833 case CONST:
19834 tmp = XEXP (op1, 0);
19836 if (GET_CODE (tmp) != PLUS
19837 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19838 break;
19840 op1 = XEXP (tmp, 0);
19841 addend = XEXP (tmp, 1);
19842 /* FALLTHRU */
19844 case SYMBOL_REF:
19845 model = SYMBOL_REF_TLS_MODEL (op1);
19847 if (model)
19848 op1 = legitimize_tls_address (op1, model, true);
19849 else if (ix86_force_load_from_GOT_p (op1))
19851 /* Load the external function address via GOT slot to avoid PLT. */
19852 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19853 (TARGET_64BIT
19854 ? UNSPEC_GOTPCREL
19855 : UNSPEC_GOT));
19856 op1 = gen_rtx_CONST (Pmode, op1);
19857 op1 = gen_const_mem (Pmode, op1);
19858 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19860 else
19862 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19863 if (tmp)
19865 op1 = tmp;
19866 if (!addend)
19867 break;
19869 else
19871 op1 = operands[1];
19872 break;
19876 if (addend)
19878 op1 = force_operand (op1, NULL_RTX);
19879 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19880 op0, 1, OPTAB_DIRECT);
19882 else
19883 op1 = force_operand (op1, op0);
19885 if (op1 == op0)
19886 return;
19888 op1 = convert_to_mode (mode, op1, 1);
19890 default:
19891 break;
19894 if ((flag_pic || MACHOPIC_INDIRECT)
19895 && symbolic_operand (op1, mode))
19897 if (TARGET_MACHO && !TARGET_64BIT)
19899 #if TARGET_MACHO
19900 /* dynamic-no-pic */
19901 if (MACHOPIC_INDIRECT)
19903 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19904 ? op0 : gen_reg_rtx (Pmode);
19905 op1 = machopic_indirect_data_reference (op1, temp);
19906 if (MACHOPIC_PURE)
19907 op1 = machopic_legitimize_pic_address (op1, mode,
19908 temp == op1 ? 0 : temp);
19910 if (op0 != op1 && GET_CODE (op0) != MEM)
19912 rtx insn = gen_rtx_SET (op0, op1);
19913 emit_insn (insn);
19914 return;
19916 if (GET_CODE (op0) == MEM)
19917 op1 = force_reg (Pmode, op1);
19918 else
19920 rtx temp = op0;
19921 if (GET_CODE (temp) != REG)
19922 temp = gen_reg_rtx (Pmode);
19923 temp = legitimize_pic_address (op1, temp);
19924 if (temp == op0)
19925 return;
19926 op1 = temp;
19928 /* dynamic-no-pic */
19929 #endif
19931 else
19933 if (MEM_P (op0))
19934 op1 = force_reg (mode, op1);
19935 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19937 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19938 op1 = legitimize_pic_address (op1, reg);
19939 if (op0 == op1)
19940 return;
19941 op1 = convert_to_mode (mode, op1, 1);
19945 else
19947 if (MEM_P (op0)
19948 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19949 || !push_operand (op0, mode))
19950 && MEM_P (op1))
19951 op1 = force_reg (mode, op1);
19953 if (push_operand (op0, mode)
19954 && ! general_no_elim_operand (op1, mode))
19955 op1 = copy_to_mode_reg (mode, op1);
19957 /* Force large constants in 64bit compilation into a register
19958 to get them CSEed. */
19959 if (can_create_pseudo_p ()
19960 && (mode == DImode) && TARGET_64BIT
19961 && immediate_operand (op1, mode)
19962 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19963 && !register_operand (op0, mode)
19964 && optimize)
19965 op1 = copy_to_mode_reg (mode, op1);
19967 if (can_create_pseudo_p ()
19968 && CONST_DOUBLE_P (op1))
19970 /* If we are loading a floating point constant to a register,
19971 force the value to memory now, since we'll get better code
19972 out of the back end. */
19974 op1 = validize_mem (force_const_mem (mode, op1));
19975 if (!register_operand (op0, mode))
19977 rtx temp = gen_reg_rtx (mode);
19978 emit_insn (gen_rtx_SET (temp, op1));
19979 emit_move_insn (op0, temp);
19980 return;
19985 emit_insn (gen_rtx_SET (op0, op1));
19988 void
19989 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19991 rtx op0 = operands[0], op1 = operands[1];
19992 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
19993 psABI, since the biggest alignment is 4 bytes for the IA MCU psABI. */
19994 unsigned int align = (TARGET_IAMCU
19995 ? GET_MODE_BITSIZE (mode)
19996 : GET_MODE_ALIGNMENT (mode));
19998 if (push_operand (op0, VOIDmode))
19999 op0 = emit_move_resolve_push (mode, op0);
20001 /* Force constants other than zero into memory. We do not know how
20002 the instructions used to build constants modify the upper 64 bits
20003 of the register; once we have that information we may be able
20004 to handle some of them more efficiently. */
20005 if (can_create_pseudo_p ()
20006 && (CONSTANT_P (op1)
20007 || (SUBREG_P (op1)
20008 && CONSTANT_P (SUBREG_REG (op1))))
20009 && ((register_operand (op0, mode)
20010 && !standard_sse_constant_p (op1, mode))
20011 /* ix86_expand_vector_move_misalign() does not like constants. */
20012 || (SSE_REG_MODE_P (mode)
20013 && MEM_P (op0)
20014 && MEM_ALIGN (op0) < align)))
20016 if (SUBREG_P (op1))
20018 machine_mode imode = GET_MODE (SUBREG_REG (op1));
20019 rtx r = force_const_mem (imode, SUBREG_REG (op1));
20020 if (r)
20021 r = validize_mem (r);
20022 else
20023 r = force_reg (imode, SUBREG_REG (op1));
20024 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
20026 else
20027 op1 = validize_mem (force_const_mem (mode, op1));
20030 /* We need to check memory alignment for SSE mode since attributes
20031 can make operands unaligned. */
20032 if (can_create_pseudo_p ()
20033 && SSE_REG_MODE_P (mode)
20034 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
20035 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
20037 rtx tmp[2];
20039 /* ix86_expand_vector_move_misalign() does not like both
20040 arguments in memory. */
20041 if (!register_operand (op0, mode)
20042 && !register_operand (op1, mode))
20043 op1 = force_reg (mode, op1);
20045 tmp[0] = op0; tmp[1] = op1;
20046 ix86_expand_vector_move_misalign (mode, tmp);
20047 return;
20050 /* Make operand1 a register if it isn't already. */
20051 if (can_create_pseudo_p ()
20052 && !register_operand (op0, mode)
20053 && !register_operand (op1, mode))
20055 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
20056 return;
20059 emit_insn (gen_rtx_SET (op0, op1));
20062 /* Split 32-byte AVX unaligned load and store if needed. */
20064 static void
20065 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
20067 rtx m;
20068 rtx (*extract) (rtx, rtx, rtx);
20069 machine_mode mode;
20071 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
20072 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
20074 emit_insn (gen_rtx_SET (op0, op1));
20075 return;
20078 rtx orig_op0 = NULL_RTX;
20079 mode = GET_MODE (op0);
20080 switch (GET_MODE_CLASS (mode))
20082 case MODE_VECTOR_INT:
20083 case MODE_INT:
20084 if (mode != V32QImode)
20086 if (!MEM_P (op0))
20088 orig_op0 = op0;
20089 op0 = gen_reg_rtx (V32QImode);
20091 else
20092 op0 = gen_lowpart (V32QImode, op0);
20093 op1 = gen_lowpart (V32QImode, op1);
20094 mode = V32QImode;
20096 break;
20097 case MODE_VECTOR_FLOAT:
20098 break;
20099 default:
20100 gcc_unreachable ();
20103 switch (mode)
20105 default:
20106 gcc_unreachable ();
20107 case E_V32QImode:
20108 extract = gen_avx_vextractf128v32qi;
20109 mode = V16QImode;
20110 break;
20111 case E_V8SFmode:
20112 extract = gen_avx_vextractf128v8sf;
20113 mode = V4SFmode;
20114 break;
20115 case E_V4DFmode:
20116 extract = gen_avx_vextractf128v4df;
20117 mode = V2DFmode;
20118 break;
20121 if (MEM_P (op1))
20123 rtx r = gen_reg_rtx (mode);
20124 m = adjust_address (op1, mode, 0);
20125 emit_move_insn (r, m);
20126 m = adjust_address (op1, mode, 16);
20127 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
20128 emit_move_insn (op0, r);
20130 else if (MEM_P (op0))
20132 m = adjust_address (op0, mode, 0);
20133 emit_insn (extract (m, op1, const0_rtx));
20134 m = adjust_address (op0, mode, 16);
20135 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
20137 else
20138 gcc_unreachable ();
20140 if (orig_op0)
20141 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
20144 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
20145 straight to ix86_expand_vector_move. */
20146 /* Code generation for scalar reg-reg moves of single and double precision data:
20147 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
20148 movaps reg, reg
20149 else
20150 movss reg, reg
20151 if (x86_sse_partial_reg_dependency == true)
20152 movapd reg, reg
20153 else
20154 movsd reg, reg
20156 Code generation for scalar loads of double precision data:
20157 if (x86_sse_split_regs == true)
20158 movlpd mem, reg (gas syntax)
20159 else
20160 movsd mem, reg
20162 Code generation for unaligned packed loads of single precision data
20163 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
20164 if (x86_sse_unaligned_move_optimal)
20165 movups mem, reg
20167 if (x86_sse_partial_reg_dependency == true)
20169 xorps reg, reg
20170 movlps mem, reg
20171 movhps mem+8, reg
20173 else
20175 movlps mem, reg
20176 movhps mem+8, reg
20179 Code generation for unaligned packed loads of double precision data
20180 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
20181 if (x86_sse_unaligned_move_optimal)
20182 movupd mem, reg
20184 if (x86_sse_split_regs == true)
20186 movlpd mem, reg
20187 movhpd mem+8, reg
20189 else
20191 movsd mem, reg
20192 movhpd mem+8, reg
20196 void
20197 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
20199 rtx op0, op1, m;
20201 op0 = operands[0];
20202 op1 = operands[1];
20204 /* Use unaligned load/store for AVX512 or when optimizing for size. */
20205 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
20207 emit_insn (gen_rtx_SET (op0, op1));
20208 return;
20211 if (TARGET_AVX)
20213 if (GET_MODE_SIZE (mode) == 32)
20214 ix86_avx256_split_vector_move_misalign (op0, op1);
20215 else
20216 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
20217 emit_insn (gen_rtx_SET (op0, op1));
20218 return;
20221 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
20222 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
20224 emit_insn (gen_rtx_SET (op0, op1));
20225 return;
20228 /* ??? If we have typed data, then it would appear that using
20229 movdqu is the only way to get unaligned data loaded with
20230 integer type. */
20231 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
20233 emit_insn (gen_rtx_SET (op0, op1));
20234 return;
20237 if (MEM_P (op1))
20239 if (TARGET_SSE2 && mode == V2DFmode)
20241 rtx zero;
20243 /* When SSE registers are split into halves, we can avoid
20244 writing to the top half twice. */
20245 if (TARGET_SSE_SPLIT_REGS)
20247 emit_clobber (op0);
20248 zero = op0;
20250 else
20252 /* ??? Not sure about the best option for the Intel chips.
20253 The following would seem to satisfy; the register is
20254 entirely cleared, breaking the dependency chain. We
20255 then store to the upper half, with a dependency depth
20256 of one. A rumor has it that Intel recommends two movsd
20257 followed by an unpacklpd, but this is unconfirmed. And
20258 given that the dependency depth of the unpacklpd would
20259 still be one, I'm not sure why this would be better. */
20260 zero = CONST0_RTX (V2DFmode);
20263 m = adjust_address (op1, DFmode, 0);
20264 emit_insn (gen_sse2_loadlpd (op0, zero, m));
20265 m = adjust_address (op1, DFmode, 8);
20266 emit_insn (gen_sse2_loadhpd (op0, op0, m));
20268 else
20270 rtx t;
20272 if (mode != V4SFmode)
20273 t = gen_reg_rtx (V4SFmode);
20274 else
20275 t = op0;
20277 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
20278 emit_move_insn (t, CONST0_RTX (V4SFmode));
20279 else
20280 emit_clobber (t);
20282 m = adjust_address (op1, V2SFmode, 0);
20283 emit_insn (gen_sse_loadlps (t, t, m));
20284 m = adjust_address (op1, V2SFmode, 8);
20285 emit_insn (gen_sse_loadhps (t, t, m));
20286 if (mode != V4SFmode)
20287 emit_move_insn (op0, gen_lowpart (mode, t));
20290 else if (MEM_P (op0))
20292 if (TARGET_SSE2 && mode == V2DFmode)
20294 m = adjust_address (op0, DFmode, 0);
20295 emit_insn (gen_sse2_storelpd (m, op1));
20296 m = adjust_address (op0, DFmode, 8);
20297 emit_insn (gen_sse2_storehpd (m, op1));
20299 else
20301 if (mode != V4SFmode)
20302 op1 = gen_lowpart (V4SFmode, op1);
20304 m = adjust_address (op0, V2SFmode, 0);
20305 emit_insn (gen_sse_storelps (m, op1));
20306 m = adjust_address (op0, V2SFmode, 8);
20307 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
20310 else
20311 gcc_unreachable ();
20314 /* Helper function of ix86_fixup_binary_operands to canonicalize
20315 operand order. Returns true if the operands should be swapped. */
20317 static bool
20318 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
20319 rtx operands[])
20321 rtx dst = operands[0];
20322 rtx src1 = operands[1];
20323 rtx src2 = operands[2];
20325 /* If the operation is not commutative, we can't do anything. */
20326 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
20327 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
20328 return false;
20330 /* Highest priority is that src1 should match dst. */
20331 if (rtx_equal_p (dst, src1))
20332 return false;
20333 if (rtx_equal_p (dst, src2))
20334 return true;
20336 /* Next highest priority is that immediate constants come second. */
20337 if (immediate_operand (src2, mode))
20338 return false;
20339 if (immediate_operand (src1, mode))
20340 return true;
20342 /* Lowest priority is that memory references should come second. */
20343 if (MEM_P (src2))
20344 return false;
20345 if (MEM_P (src1))
20346 return true;
20348 return false;
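  /* For example, if the destination happens to equal src2, the operands are
     swapped so that src1 matches the destination and a two-address insn can
     be used; for an expression like const_int + mem, the swap keeps the
     immediate as the second operand, matching the "reg/mem, imm" forms of
     the binary patterns.  */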
20352 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
20353 destination to use for the operation. If different from the true
20354 destination in operands[0], a copy operation will be required. */
20357 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
20358 rtx operands[])
20360 rtx dst = operands[0];
20361 rtx src1 = operands[1];
20362 rtx src2 = operands[2];
20364 /* Canonicalize operand order. */
20365 if (ix86_swap_binary_operands_p (code, mode, operands))
20367 /* It is invalid to swap operands of different modes. */
20368 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
20370 std::swap (src1, src2);
20373 /* Both source operands cannot be in memory. */
20374 if (MEM_P (src1) && MEM_P (src2))
20376 /* Optimization: Only read from memory once. */
20377 if (rtx_equal_p (src1, src2))
20379 src2 = force_reg (mode, src2);
20380 src1 = src2;
20382 else if (rtx_equal_p (dst, src1))
20383 src2 = force_reg (mode, src2);
20384 else
20385 src1 = force_reg (mode, src1);
20388 /* If the destination is memory, and we do not have matching source
20389 operands, do things in registers. */
20390 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20391 dst = gen_reg_rtx (mode);
20393 /* Source 1 cannot be a constant. */
20394 if (CONSTANT_P (src1))
20395 src1 = force_reg (mode, src1);
20397 /* Source 1 cannot be a non-matching memory. */
20398 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20399 src1 = force_reg (mode, src1);
20401 /* Improve address combine. */
20402 if (code == PLUS
20403 && GET_MODE_CLASS (mode) == MODE_INT
20404 && MEM_P (src2))
20405 src2 = force_reg (mode, src2);
20407 operands[1] = src1;
20408 operands[2] = src2;
20409 return dst;
20412 /* Similarly, but assume that the destination has already been
20413 set up properly. */
20415 void
20416 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
20417 machine_mode mode, rtx operands[])
20419 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
20420 gcc_assert (dst == operands[0]);
20423 /* Attempt to expand a binary operator. Make the expansion closer to the
20424 actual machine, than just general_operand, which will allow 3 separate
20425 memory references (one output, two input) in a single insn. */
20427 void
20428 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
20429 rtx operands[])
20431 rtx src1, src2, dst, op, clob;
20433 dst = ix86_fixup_binary_operands (code, mode, operands);
20434 src1 = operands[1];
20435 src2 = operands[2];
20437 /* Emit the instruction. */
20439 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
20441 if (reload_completed
20442 && code == PLUS
20443 && !rtx_equal_p (dst, src1))
20445 /* This is going to be an LEA; avoid splitting it later. */
20446 emit_insn (op);
20448 else
20450 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20451 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20454 /* Fix up the destination if needed. */
20455 if (dst != operands[0])
20456 emit_move_insn (operands[0], dst);
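  /* For instance, after reload a PLUS whose destination does not match
     src1 is emitted as a bare SET so it can match the lea pattern (lea does
     not clobber the flags); every other case gets the flags-register
     clobber that the two-address add/sub/logic patterns require.  */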
20459 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
20460 the given OPERANDS. */
20462 void
20463 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
20464 rtx operands[])
20466 rtx op1 = NULL_RTX, op2 = NULL_RTX;
20467 if (SUBREG_P (operands[1]))
20469 op1 = operands[1];
20470 op2 = operands[2];
20472 else if (SUBREG_P (operands[2]))
20474 op1 = operands[2];
20475 op2 = operands[1];
20477 /* Optimize (__m128i) d | (__m128i) e and similar code
20478 when d and e are float vectors into float vector logical
20479 insn. In C/C++ without using intrinsics there is no other way
20480 to express vector logical operation on float vectors than
20481 to cast them temporarily to integer vectors. */
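  /* E.g. for two V4SF values d and e, (__m128i) d | (__m128i) e can then be
     emitted as a single orps/vorps on the float vectors, avoiding the
     integer-domain por and a possible domain-crossing penalty.  */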
20482 if (op1
20483 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20484 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20485 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20486 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20487 && SUBREG_BYTE (op1) == 0
20488 && (GET_CODE (op2) == CONST_VECTOR
20489 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20490 && SUBREG_BYTE (op2) == 0))
20491 && can_create_pseudo_p ())
20493 rtx dst;
20494 switch (GET_MODE (SUBREG_REG (op1)))
20496 case E_V4SFmode:
20497 case E_V8SFmode:
20498 case E_V16SFmode:
20499 case E_V2DFmode:
20500 case E_V4DFmode:
20501 case E_V8DFmode:
20502 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20503 if (GET_CODE (op2) == CONST_VECTOR)
20505 op2 = gen_lowpart (GET_MODE (dst), op2);
20506 op2 = force_reg (GET_MODE (dst), op2);
20508 else
20510 op1 = operands[1];
20511 op2 = SUBREG_REG (operands[2]);
20512 if (!vector_operand (op2, GET_MODE (dst)))
20513 op2 = force_reg (GET_MODE (dst), op2);
20515 op1 = SUBREG_REG (op1);
20516 if (!vector_operand (op1, GET_MODE (dst)))
20517 op1 = force_reg (GET_MODE (dst), op1);
20518 emit_insn (gen_rtx_SET (dst,
20519 gen_rtx_fmt_ee (code, GET_MODE (dst),
20520 op1, op2)));
20521 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20522 return;
20523 default:
20524 break;
20527 if (!vector_operand (operands[1], mode))
20528 operands[1] = force_reg (mode, operands[1]);
20529 if (!vector_operand (operands[2], mode))
20530 operands[2] = force_reg (mode, operands[2]);
20531 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20532 emit_insn (gen_rtx_SET (operands[0],
20533 gen_rtx_fmt_ee (code, mode, operands[1],
20534 operands[2])));
20537 /* Return TRUE or FALSE depending on whether the binary operator meets the
20538 appropriate constraints. */
20540 bool
20541 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20542 rtx operands[3])
20544 rtx dst = operands[0];
20545 rtx src1 = operands[1];
20546 rtx src2 = operands[2];
20548 /* Both source operands cannot be in memory. */
20549 if (MEM_P (src1) && MEM_P (src2))
20550 return false;
20552 /* Canonicalize operand order for commutative operators. */
20553 if (ix86_swap_binary_operands_p (code, mode, operands))
20554 std::swap (src1, src2);
20556 /* If the destination is memory, we must have a matching source operand. */
20557 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20558 return false;
20560 /* Source 1 cannot be a constant. */
20561 if (CONSTANT_P (src1))
20562 return false;
20564 /* Source 1 cannot be a non-matching memory. */
20565 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20566 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20567 return (code == AND
20568 && (mode == HImode
20569 || mode == SImode
20570 || (TARGET_64BIT && mode == DImode))
20571 && satisfies_constraint_L (src2));
20573 return true;
20576 /* Attempt to expand a unary operator. Make the expansion closer to the
20577 actual machine, than just general_operand, which will allow 2 separate
20578 memory references (one output, one input) in a single insn. */
20580 void
20581 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20582 rtx operands[])
20584 bool matching_memory = false;
20585 rtx src, dst, op, clob;
20587 dst = operands[0];
20588 src = operands[1];
20590 /* If the destination is memory, and we do not have matching source
20591 operands, do things in registers. */
20592 if (MEM_P (dst))
20594 if (rtx_equal_p (dst, src))
20595 matching_memory = true;
20596 else
20597 dst = gen_reg_rtx (mode);
20600 /* When source operand is memory, destination must match. */
20601 if (MEM_P (src) && !matching_memory)
20602 src = force_reg (mode, src);
20604 /* Emit the instruction. */
20606 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20608 if (code == NOT)
20609 emit_insn (op);
20610 else
20612 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20613 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20616 /* Fix up the destination if needed. */
20617 if (dst != operands[0])
20618 emit_move_insn (operands[0], dst);
20621 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20622 divisor are within the range [0-255]. */
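  /* The emitted sequence is roughly: OR dividend and divisor into a scratch
     register and TEST it against -0x100; if no bit above bit 7 is set in
     either value, branch to a block performing a single 8-bit DIV (quotient
     zero-extended from AL, remainder extracted from AH), otherwise fall
     through to the full-width signed/unsigned divide, which is typically
     much slower.  */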
20624 void
20625 ix86_split_idivmod (machine_mode mode, rtx operands[],
20626 bool signed_p)
20628 rtx_code_label *end_label, *qimode_label;
20629 rtx div, mod;
20630 rtx_insn *insn;
20631 rtx scratch, tmp0, tmp1, tmp2;
20632 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20633 rtx (*gen_zero_extend) (rtx, rtx);
20634 rtx (*gen_test_ccno_1) (rtx, rtx);
20636 switch (mode)
20638 case E_SImode:
20639 if (GET_MODE (operands[0]) == SImode)
20641 if (GET_MODE (operands[1]) == SImode)
20642 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20643 else
20644 gen_divmod4_1
20645 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20646 gen_zero_extend = gen_zero_extendqisi2;
20648 else
20650 gen_divmod4_1
20651 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20652 gen_zero_extend = gen_zero_extendqidi2;
20654 gen_test_ccno_1 = gen_testsi_ccno_1;
20655 break;
20656 case E_DImode:
20657 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20658 gen_test_ccno_1 = gen_testdi_ccno_1;
20659 gen_zero_extend = gen_zero_extendqidi2;
20660 break;
20661 default:
20662 gcc_unreachable ();
20665 end_label = gen_label_rtx ();
20666 qimode_label = gen_label_rtx ();
20668 scratch = gen_reg_rtx (mode);
20670 /* Use 8bit unsigned divmod if dividend and divisor are within
20671 the range [0-255]. */
20672 emit_move_insn (scratch, operands[2]);
20673 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20674 scratch, 1, OPTAB_DIRECT);
20675 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20676 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20677 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20678 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20679 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20680 pc_rtx);
20681 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20682 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20683 JUMP_LABEL (insn) = qimode_label;
20685 /* Generate original signed/unsigned divmod. */
20686 div = gen_divmod4_1 (operands[0], operands[1],
20687 operands[2], operands[3]);
20688 emit_insn (div);
20690 /* Branch to the end. */
20691 emit_jump_insn (gen_jump (end_label));
20692 emit_barrier ();
20694 /* Generate 8bit unsigned divide. */
20695 emit_label (qimode_label);
20696 /* Don't use operands[0] for result of 8bit divide since not all
20697 registers support QImode ZERO_EXTRACT. */
20698 tmp0 = lowpart_subreg (HImode, scratch, mode);
20699 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20700 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20701 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20703 if (signed_p)
20705 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20706 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20708 else
20710 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20711 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20713 if (mode == SImode)
20715 if (GET_MODE (operands[0]) != SImode)
20716 div = gen_rtx_ZERO_EXTEND (DImode, div);
20717 if (GET_MODE (operands[1]) != SImode)
20718 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20721 /* Extract remainder from AH. */
20722 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20723 tmp0, GEN_INT (8), GEN_INT (8));
20724 if (REG_P (operands[1]))
20725 insn = emit_move_insn (operands[1], tmp1);
20726 else
20728 /* Need a new scratch register since the old one has result
20729 of 8bit divide. */
20730 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20731 emit_move_insn (scratch, tmp1);
20732 insn = emit_move_insn (operands[1], scratch);
20734 set_unique_reg_note (insn, REG_EQUAL, mod);
20736 /* Zero extend quotient from AL. */
20737 tmp1 = gen_lowpart (QImode, tmp0);
20738 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20739 set_unique_reg_note (insn, REG_EQUAL, div);
20741 emit_label (end_label);
20744 #define LEA_MAX_STALL (3)
20745 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20747 /* Increase given DISTANCE in half-cycles according to
20748 dependencies between PREV and NEXT instructions.
20749 Add 1 half-cycle if there is no dependency and
20750 go to the next cycle if there is some dependency. */
20752 static unsigned int
20753 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20755 df_ref def, use;
20757 if (!prev || !next)
20758 return distance + (distance & 1) + 2;
20760 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20761 return distance + 1;
20763 FOR_EACH_INSN_USE (use, next)
20764 FOR_EACH_INSN_DEF (def, prev)
20765 if (!DF_REF_IS_ARTIFICIAL (def)
20766 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20767 return distance + (distance & 1) + 2;
20769 return distance + 1;
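  /* For example, two adjacent insns with no register dependency are counted
     as one half-cycle apart, while a true dependency (a register defined by
     PREV and used by NEXT) rounds the distance up to the start of the next
     full cycle: distance + (distance & 1) + 2.  */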
20772 /* Function checks if instruction INSN defines register number
20773 REGNO1 or REGNO2. */
20775 static bool
20776 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20777 rtx_insn *insn)
20779 df_ref def;
20781 FOR_EACH_INSN_DEF (def, insn)
20782 if (DF_REF_REG_DEF_P (def)
20783 && !DF_REF_IS_ARTIFICIAL (def)
20784 && (regno1 == DF_REF_REGNO (def)
20785 || regno2 == DF_REF_REGNO (def)))
20786 return true;
20788 return false;
20791 /* Function checks if instruction INSN uses register number
20792 REGNO as a part of address expression. */
20794 static bool
20795 insn_uses_reg_mem (unsigned int regno, rtx insn)
20797 df_ref use;
20799 FOR_EACH_INSN_USE (use, insn)
20800 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20801 return true;
20803 return false;
20806 /* Search backward for non-agu definition of register number REGNO1
20807 or register number REGNO2 in basic block starting from instruction
20808 START up to head of basic block or instruction INSN.
20810 Function puts true value into *FOUND var if definition was found
20811 and false otherwise.
20813 Distance in half-cycles between START and found instruction or head
20814 of BB is added to DISTANCE and returned. */
20816 static int
20817 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20818 rtx_insn *insn, int distance,
20819 rtx_insn *start, bool *found)
20821 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20822 rtx_insn *prev = start;
20823 rtx_insn *next = NULL;
20825 *found = false;
20827 while (prev
20828 && prev != insn
20829 && distance < LEA_SEARCH_THRESHOLD)
20831 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20833 distance = increase_distance (prev, next, distance);
20834 if (insn_defines_reg (regno1, regno2, prev))
20836 if (recog_memoized (prev) < 0
20837 || get_attr_type (prev) != TYPE_LEA)
20839 *found = true;
20840 return distance;
20844 next = prev;
20846 if (prev == BB_HEAD (bb))
20847 break;
20849 prev = PREV_INSN (prev);
20852 return distance;
20855 /* Search backward for non-agu definition of register number REGNO1
20856 or register number REGNO2 in INSN's basic block until
20857 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20858 2. Reach neighbor BBs boundary, or
20859 3. Reach agu definition.
20860 Returns the distance between the non-agu definition point and INSN.
20861 If no definition point, returns -1. */
20863 static int
20864 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20865 rtx_insn *insn)
20867 basic_block bb = BLOCK_FOR_INSN (insn);
20868 int distance = 0;
20869 bool found = false;
20871 if (insn != BB_HEAD (bb))
20872 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20873 distance, PREV_INSN (insn),
20874 &found);
20876 if (!found && distance < LEA_SEARCH_THRESHOLD)
20878 edge e;
20879 edge_iterator ei;
20880 bool simple_loop = false;
20882 FOR_EACH_EDGE (e, ei, bb->preds)
20883 if (e->src == bb)
20885 simple_loop = true;
20886 break;
20889 if (simple_loop)
20890 distance = distance_non_agu_define_in_bb (regno1, regno2,
20891 insn, distance,
20892 BB_END (bb), &found);
20893 else
20895 int shortest_dist = -1;
20896 bool found_in_bb = false;
20898 FOR_EACH_EDGE (e, ei, bb->preds)
20900 int bb_dist
20901 = distance_non_agu_define_in_bb (regno1, regno2,
20902 insn, distance,
20903 BB_END (e->src),
20904 &found_in_bb);
20905 if (found_in_bb)
20907 if (shortest_dist < 0)
20908 shortest_dist = bb_dist;
20909 else if (bb_dist > 0)
20910 shortest_dist = MIN (bb_dist, shortest_dist);
20912 found = true;
20916 distance = shortest_dist;
20920 /* get_attr_type may modify recog data. We want to make sure
20921 that recog data is valid for instruction INSN, on which
20922 distance_non_agu_define is called. INSN is unchanged here. */
20923 extract_insn_cached (insn);
20925 if (!found)
20926 return -1;
20928 return distance >> 1;
20931 /* Return the distance in half-cycles between INSN and the next
20932 insn that uses register number REGNO in a memory address, added
20933 to DISTANCE. Return -1 if REGNO is set.
20935 Put true into *FOUND if a register use was found and
20936 false otherwise.
20937 Put true into *REDEFINED if a register redefinition was
20938 found and false otherwise. */
20940 static int
20941 distance_agu_use_in_bb (unsigned int regno,
20942 rtx_insn *insn, int distance, rtx_insn *start,
20943 bool *found, bool *redefined)
20945 basic_block bb = NULL;
20946 rtx_insn *next = start;
20947 rtx_insn *prev = NULL;
20949 *found = false;
20950 *redefined = false;
20952 if (start != NULL_RTX)
20954 bb = BLOCK_FOR_INSN (start);
20955 if (start != BB_HEAD (bb))
20956 /* If insn and start belong to the same bb, set prev to insn,
20957 so the call to increase_distance will increase the distance
20958 between insns by 1. */
20959 prev = insn;
20962 while (next
20963 && next != insn
20964 && distance < LEA_SEARCH_THRESHOLD)
20966 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20968 distance = increase_distance (prev, next, distance);
20969 if (insn_uses_reg_mem (regno, next))
20971 /* Return DISTANCE if OP0 is used in memory
20972 address in NEXT. */
20973 *found = true;
20974 return distance;
20977 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20979 /* Return -1 if OP0 is set in NEXT. */
20980 *redefined = true;
20981 return -1;
20984 prev = next;
20987 if (next == BB_END (bb))
20988 break;
20990 next = NEXT_INSN (next);
20993 return distance;
20996 /* Return the distance between INSN and the next insn that uses
20997 register number REGNO0 in a memory address. Return -1 if no such
20998 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
21000 static int
21001 distance_agu_use (unsigned int regno0, rtx_insn *insn)
21003 basic_block bb = BLOCK_FOR_INSN (insn);
21004 int distance = 0;
21005 bool found = false;
21006 bool redefined = false;
21008 if (insn != BB_END (bb))
21009 distance = distance_agu_use_in_bb (regno0, insn, distance,
21010 NEXT_INSN (insn),
21011 &found, &redefined);
21013 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
21015 edge e;
21016 edge_iterator ei;
21017 bool simple_loop = false;
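/* Treat BB as a simple loop if it is its own successor; in that case
   the forward search continues from the head of BB itself. */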
21019 FOR_EACH_EDGE (e, ei, bb->succs)
21020 if (e->dest == bb)
21022 simple_loop = true;
21023 break;
21026 if (simple_loop)
21027 distance = distance_agu_use_in_bb (regno0, insn,
21028 distance, BB_HEAD (bb),
21029 &found, &redefined);
21030 else
21032 int shortest_dist = -1;
21033 bool found_in_bb = false;
21034 bool redefined_in_bb = false;
21036 FOR_EACH_EDGE (e, ei, bb->succs)
21038 int bb_dist
21039 = distance_agu_use_in_bb (regno0, insn,
21040 distance, BB_HEAD (e->dest),
21041 &found_in_bb, &redefined_in_bb);
21042 if (found_in_bb)
21044 if (shortest_dist < 0)
21045 shortest_dist = bb_dist;
21046 else if (bb_dist > 0)
21047 shortest_dist = MIN (bb_dist, shortest_dist);
21049 found = true;
21053 distance = shortest_dist;
21057 if (!found || redefined)
21058 return -1;
21060 return distance >> 1;
21063 /* Define this macro to tune LEA priority vs ADD; it takes effect when
21064 there is a dilemma of choosing LEA or ADD.
21065 Negative value: ADD is preferred over LEA
21066 Zero: Neutral
21067 Positive value: LEA is preferred over ADD. */
21068 #define IX86_LEA_PRIORITY 0
21070 /* Return true if using LEA for INSN has a performance advantage
21071 over a sequence of instructions. The instruction sequence has
21072 SPLIT_COST cycles higher latency than the LEA latency. */
21074 static bool
21075 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
21076 unsigned int regno2, int split_cost, bool has_scale)
21078 int dist_define, dist_use;
21080 /* For Silvermont, if a 2-source or 3-source LEA is used for a
21081 non-destructive destination, or for the ability to use
21082 SCALE, the use of LEA is justified. */
21083 if (TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS
21084 || TARGET_TREMONT || TARGET_INTEL)
21086 if (has_scale)
21087 return true;
21088 if (split_cost < 1)
21089 return false;
21090 if (regno0 == regno1 || regno0 == regno2)
21091 return false;
21092 return true;
21095 dist_define = distance_non_agu_define (regno1, regno2, insn);
21096 dist_use = distance_agu_use (regno0, insn);
21098 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
21100 /* If there is no non-AGU operand definition, no AGU
21101 operand usage and the split cost is 0, then both the lea
21102 and non-lea variants have the same priority. Currently
21103 we prefer lea for 64-bit code and non-lea for 32-bit
21104 code. */
21105 if (dist_use < 0 && split_cost == 0)
21106 return TARGET_64BIT || IX86_LEA_PRIORITY;
21107 else
21108 return true;
21111 /* The longer the definition distance, the more preferable lea is.
21112 Here we change it to take into account the splitting cost and
21113 lea priority. */
21114 dist_define += split_cost + IX86_LEA_PRIORITY;
21116 /* If there is no use in a memory address then we just check
21117 that the split cost exceeds the AGU stall. */
21118 if (dist_use < 0)
21119 return dist_define > LEA_MAX_STALL;
21121 /* If this insn has both a backward non-AGU dependence and a forward
21122 AGU dependence, the one with the shorter distance takes effect. */
21123 return dist_define >= dist_use;
21126 /* Return true if it is legal to clobber flags by INSN and
21127 false otherwise. */
21129 static bool
21130 ix86_ok_to_clobber_flags (rtx_insn *insn)
21132 basic_block bb = BLOCK_FOR_INSN (insn);
21133 df_ref use;
21134 bitmap live;
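/* Scan forward from INSN to the end of its basic block: a use of the
   flags register before any redefinition means we must not clobber it,
   while a redefinition first means we may.  If neither is seen, check
   whether the flags register is live on exit from the block. */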
21136 while (insn)
21138 if (NONDEBUG_INSN_P (insn))
21140 FOR_EACH_INSN_USE (use, insn)
21141 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
21142 return false;
21144 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
21145 return true;
21148 if (insn == BB_END (bb))
21149 break;
21151 insn = NEXT_INSN (insn);
21154 live = df_get_live_out (bb);
21155 return !REGNO_REG_SET_P (live, FLAGS_REG);
21158 /* Return true if we need to split op0 = op1 + op2 into a sequence of
21159 move and add to avoid AGU stalls. */
21161 bool
21162 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
21164 unsigned int regno0, regno1, regno2;
21166 /* Check if we need to optimize. */
21167 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21168 return false;
21170 /* Check it is correct to split here. */
21171 if (!ix86_ok_to_clobber_flags (insn))
21172 return false;
21174 regno0 = true_regnum (operands[0]);
21175 regno1 = true_regnum (operands[1]);
21176 regno2 = true_regnum (operands[2]);
21178 /* We only need to split adds with a non-destructive
21179 destination operand. */
21180 if (regno0 == regno1 || regno0 == regno2)
21181 return false;
21182 else
21183 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
21186 /* Return true if we should emit lea instruction instead of mov
21187 instruction. */
21189 bool
21190 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
21192 unsigned int regno0, regno1;
21194 /* Check if we need to optimize. */
21195 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21196 return false;
21198 /* Use lea for reg to reg moves only. */
21199 if (!REG_P (operands[0]) || !REG_P (operands[1]))
21200 return false;
21202 regno0 = true_regnum (operands[0]);
21203 regno1 = true_regnum (operands[1]);
21205 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
21208 /* Return true if we need to split lea into a sequence of
21209 instructions to avoid AGU stalls. */
21211 bool
21212 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
21214 unsigned int regno0, regno1, regno2;
21215 int split_cost;
21216 struct ix86_address parts;
21217 int ok;
21219 /* Check we need to optimize. */
21220 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
21221 return false;
21223 /* The "at least two components" test below might not catch simple
21224 move or zero extension insns if parts.base is non-NULL and parts.disp
21225 is const0_rtx as the only components in the address, e.g. if the
21226 register is %rbp or %r13. As this test is much cheaper and moves or
21227 zero extensions are the common case, do this check first. */
21228 if (REG_P (operands[1])
21229 || (SImode_address_operand (operands[1], VOIDmode)
21230 && REG_P (XEXP (operands[1], 0))))
21231 return false;
21233 /* Check if it is OK to split here. */
21234 if (!ix86_ok_to_clobber_flags (insn))
21235 return false;
21237 ok = ix86_decompose_address (operands[1], &parts);
21238 gcc_assert (ok);
21240 /* There should be at least two components in the address. */
21241 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
21242 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
21243 return false;
21245 /* We should not split into add if a non-legitimate pic
21246 operand is used as the displacement. */
21247 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
21248 return false;
21250 regno0 = true_regnum (operands[0]);
21251 regno1 = INVALID_REGNUM;
21252 regno2 = INVALID_REGNUM;
21254 if (parts.base)
21255 regno1 = true_regnum (parts.base);
21256 if (parts.index)
21257 regno2 = true_regnum (parts.index);
21259 split_cost = 0;
21261 /* Compute how many cycles we will add to execution time
21262 if we split the lea into a sequence of instructions. */
21263 if (parts.base || parts.index)
21265 /* Have to use a mov instruction if the non-destructive
21266 destination form is used. */
21267 if (regno1 != regno0 && regno2 != regno0)
21268 split_cost += 1;
21270 /* Have to add index to base if both exist. */
21271 if (parts.base && parts.index)
21272 split_cost += 1;
21274 /* Have to use shift and adds if scale is 2 or greater. */
21275 if (parts.scale > 1)
21277 if (regno0 != regno1)
21278 split_cost += 1;
21279 else if (regno2 == regno0)
21280 split_cost += 4;
21281 else
21282 split_cost += parts.scale;
21285 /* Have to use an add instruction with an immediate if
21286 disp is nonzero. */
21287 if (parts.disp && parts.disp != const0_rtx)
21288 split_cost += 1;
21290 /* Subtract the price of lea. */
21291 split_cost -= 1;
21294 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
21295 parts.scale > 1);
21298 /* Emit the x86 binary operation CODE in mode MODE, where the first operand
21299 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
21301 static void
21302 ix86_emit_binop (enum rtx_code code, machine_mode mode,
21303 rtx dst, rtx src)
21305 rtx op, clob;
21307 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
21308 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21310 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
21313 /* Return true if the definition of REGNO1 is nearer to INSN than that of REGNO2. */
21315 static bool
21316 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
21318 rtx_insn *prev = insn;
21319 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
21321 if (insn == start)
21322 return false;
21323 while (prev && prev != start)
21325 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
21327 prev = PREV_INSN (prev);
21328 continue;
21330 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
21331 return true;
21332 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
21333 return false;
21334 prev = PREV_INSN (prev);
21337 /* None of the regs is defined in the bb. */
21338 return false;
21341 /* Split an lea instruction into a sequence of instructions
21342 which are executed on the ALU to avoid AGU stalls.
21343 It is assumed that it is allowed to clobber the flags register
21344 at the lea position. */
21346 void
21347 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
21349 unsigned int regno0, regno1, regno2;
21350 struct ix86_address parts;
21351 rtx target, tmp;
21352 int ok, adds;
21354 ok = ix86_decompose_address (operands[1], &parts);
21355 gcc_assert (ok);
21357 target = gen_lowpart (mode, operands[0]);
21359 regno0 = true_regnum (target);
21360 regno1 = INVALID_REGNUM;
21361 regno2 = INVALID_REGNUM;
21363 if (parts.base)
21365 parts.base = gen_lowpart (mode, parts.base);
21366 regno1 = true_regnum (parts.base);
21369 if (parts.index)
21371 parts.index = gen_lowpart (mode, parts.index);
21372 regno2 = true_regnum (parts.index);
21375 if (parts.disp)
21376 parts.disp = gen_lowpart (mode, parts.disp);
21378 if (parts.scale > 1)
21380 /* Case r1 = r1 + ... */
21381 if (regno1 == regno0)
21383 /* If we have the case r1 = r1 + C * r2 then we
21384 would have to use multiplication, which is very
21385 expensive. Assume the cost model is wrong if we
21386 have such a case here. */
21387 gcc_assert (regno2 != regno0);
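/* Emit PARTS.SCALE successive additions of the index register
   instead of a multiply. */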
21389 for (adds = parts.scale; adds > 0; adds--)
21390 ix86_emit_binop (PLUS, mode, target, parts.index);
21392 else
21394 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
21395 if (regno0 != regno2)
21396 emit_insn (gen_rtx_SET (target, parts.index));
21398 /* Use shift for scaling. */
21399 ix86_emit_binop (ASHIFT, mode, target,
21400 GEN_INT (exact_log2 (parts.scale)));
21402 if (parts.base)
21403 ix86_emit_binop (PLUS, mode, target, parts.base);
21405 if (parts.disp && parts.disp != const0_rtx)
21406 ix86_emit_binop (PLUS, mode, target, parts.disp);
21409 else if (!parts.base && !parts.index)
21411 gcc_assert (parts.disp);
21412 emit_insn (gen_rtx_SET (target, parts.disp));
21414 else
21416 if (!parts.base)
21418 if (regno0 != regno2)
21419 emit_insn (gen_rtx_SET (target, parts.index));
21421 else if (!parts.index)
21423 if (regno0 != regno1)
21424 emit_insn (gen_rtx_SET (target, parts.base));
21426 else
21428 if (regno0 == regno1)
21429 tmp = parts.index;
21430 else if (regno0 == regno2)
21431 tmp = parts.base;
21432 else
21434 rtx tmp1;
21436 /* Find the better operand for the SET instruction, depending
21437 on which definition is farther from the insn. */
21438 if (find_nearest_reg_def (insn, regno1, regno2))
21439 tmp = parts.index, tmp1 = parts.base;
21440 else
21441 tmp = parts.base, tmp1 = parts.index;
21443 emit_insn (gen_rtx_SET (target, tmp));
21445 if (parts.disp && parts.disp != const0_rtx)
21446 ix86_emit_binop (PLUS, mode, target, parts.disp);
21448 ix86_emit_binop (PLUS, mode, target, tmp1);
21449 return;
21452 ix86_emit_binop (PLUS, mode, target, tmp);
21455 if (parts.disp && parts.disp != const0_rtx)
21456 ix86_emit_binop (PLUS, mode, target, parts.disp);
21460 /* Return true if it is ok to optimize an ADD operation to an LEA
21461 operation to avoid flag register consumption. For most processors,
21462 ADD is faster than LEA. For processors like BONNELL, if the
21463 destination register of the LEA holds an actual address which will be
21464 used soon, LEA is better, and otherwise ADD is better. */
21466 bool
21467 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
21469 unsigned int regno0 = true_regnum (operands[0]);
21470 unsigned int regno1 = true_regnum (operands[1]);
21471 unsigned int regno2 = true_regnum (operands[2]);
21473 /* If a = b + c with a != b and a != c, we must use the lea form. */
21474 if (regno0 != regno1 && regno0 != regno2)
21475 return true;
21477 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
21478 return false;
21480 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
21483 /* Return true if destination reg of SET_BODY is shift count of
21484 USE_BODY. */
21486 static bool
21487 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21489 rtx set_dest;
21490 rtx shift_rtx;
21491 int i;
21493 /* Retrieve destination of SET_BODY. */
21494 switch (GET_CODE (set_body))
21496 case SET:
21497 set_dest = SET_DEST (set_body);
21498 if (!set_dest || !REG_P (set_dest))
21499 return false;
21500 break;
21501 case PARALLEL:
21502 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21503 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21504 use_body))
21505 return true;
21506 /* FALLTHROUGH */
21507 default:
21508 return false;
21511 /* Retrieve shift count of USE_BODY. */
21512 switch (GET_CODE (use_body))
21514 case SET:
21515 shift_rtx = XEXP (use_body, 1);
21516 break;
21517 case PARALLEL:
21518 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21519 if (ix86_dep_by_shift_count_body (set_body,
21520 XVECEXP (use_body, 0, i)))
21521 return true;
21522 /* FALLTHROUGH */
21523 default:
21524 return false;
21527 if (shift_rtx
21528 && (GET_CODE (shift_rtx) == ASHIFT
21529 || GET_CODE (shift_rtx) == LSHIFTRT
21530 || GET_CODE (shift_rtx) == ASHIFTRT
21531 || GET_CODE (shift_rtx) == ROTATE
21532 || GET_CODE (shift_rtx) == ROTATERT))
21534 rtx shift_count = XEXP (shift_rtx, 1);
21536 /* Return true if shift count is dest of SET_BODY. */
21537 if (REG_P (shift_count))
21539 /* Add this check since it can be invoked before register
21540 allocation in the pre-reload scheduler. */
21541 if (reload_completed
21542 && true_regnum (set_dest) == true_regnum (shift_count))
21543 return true;
21544 else if (REGNO (set_dest) == REGNO (shift_count))
21545 return true;
21549 return false;
21552 /* Return true if destination reg of SET_INSN is shift count of
21553 USE_INSN. */
21555 bool
21556 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21558 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21559 PATTERN (use_insn));
21562 /* Return TRUE or FALSE depending on whether the unary operator meets the
21563 appropriate constraints. */
21565 bool
21566 ix86_unary_operator_ok (enum rtx_code,
21567 machine_mode,
21568 rtx operands[2])
21570 /* If one of operands is memory, source and destination must match. */
21571 if ((MEM_P (operands[0])
21572 || MEM_P (operands[1]))
21573 && ! rtx_equal_p (operands[0], operands[1]))
21574 return false;
21575 return true;
21578 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21579 are ok, keeping in mind the possible movddup alternative. */
21581 bool
21582 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21584 if (MEM_P (operands[0]))
21585 return rtx_equal_p (operands[0], operands[1 + high]);
21586 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21587 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21588 return true;
21591 /* Post-reload splitter for converting an SF or DFmode value in an
21592 SSE register into an unsigned SImode. */
21594 void
21595 ix86_split_convert_uns_si_sse (rtx operands[])
21597 machine_mode vecmode;
21598 rtx value, large, zero_or_two31, input, two31, x;
21600 large = operands[1];
21601 zero_or_two31 = operands[2];
21602 input = operands[3];
21603 two31 = operands[4];
21604 vecmode = GET_MODE (large);
21605 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21607 /* Load up the value into the low element. We must ensure that the other
21608 elements are valid floats -- zero is the easiest such value. */
21609 if (MEM_P (input))
21611 if (vecmode == V4SFmode)
21612 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21613 else
21614 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21616 else
21618 input = gen_rtx_REG (vecmode, REGNO (input));
21619 emit_move_insn (value, CONST0_RTX (vecmode));
21620 if (vecmode == V4SFmode)
21621 emit_insn (gen_sse_movss (value, value, input));
21622 else
21623 emit_insn (gen_sse2_movsd (value, value, input));
21626 emit_move_insn (large, two31);
21627 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
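/* LARGE becomes an all-ones mask for elements where 2^31 <= VALUE;
   subtract 2^31 from exactly those elements before the signed
   conversion, then use LARGE shifted left by 31 to set the sign bit
   of the integer result back. */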
21629 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21630 emit_insn (gen_rtx_SET (large, x));
21632 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21633 emit_insn (gen_rtx_SET (zero_or_two31, x));
21635 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21636 emit_insn (gen_rtx_SET (value, x));
21638 large = gen_rtx_REG (V4SImode, REGNO (large));
21639 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21641 x = gen_rtx_REG (V4SImode, REGNO (value));
21642 if (vecmode == V4SFmode)
21643 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21644 else
21645 emit_insn (gen_sse2_cvttpd2dq (x, value));
21646 value = x;
21648 emit_insn (gen_xorv4si3 (value, value, large));
21651 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21652 Expects the 64-bit DImode to be supplied in a pair of integral
21653 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21654 -mfpmath=sse, !optimize_size only. */
21656 void
21657 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21659 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21660 rtx int_xmm, fp_xmm;
21661 rtx biases, exponents;
21662 rtx x;
21664 int_xmm = gen_reg_rtx (V4SImode);
21665 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21666 emit_insn (gen_movdi_to_sse (int_xmm, input));
21667 else if (TARGET_SSE_SPLIT_REGS)
21669 emit_clobber (int_xmm);
21670 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21672 else
21674 x = gen_reg_rtx (V2DImode);
21675 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21676 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21679 x = gen_rtx_CONST_VECTOR (V4SImode,
21680 gen_rtvec (4, GEN_INT (0x43300000UL),
21681 GEN_INT (0x45300000UL),
21682 const0_rtx, const0_rtx));
21683 exponents = validize_mem (force_const_mem (V4SImode, x));
21685 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21686 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21688 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21689 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21690 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21691 (0x1.0p84 + double(fp_value_hi_xmm)).
21692 Note these exponents differ by 32. */
21694 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21696 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21697 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21698 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21699 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21700 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21701 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21702 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21703 biases = validize_mem (force_const_mem (V2DFmode, biases));
21704 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21706 /* Add the upper and lower DFmode values together. */
21707 if (TARGET_SSE3)
21708 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21709 else
21711 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21712 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21713 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21716 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21719 /* Not used, but eases macroization of patterns. */
21720 void
21721 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21723 gcc_unreachable ();
21726 /* Convert an unsigned SImode value into a DFmode. Only currently used
21727 for SSE, but applicable anywhere. */
21729 void
21730 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21732 REAL_VALUE_TYPE TWO31r;
21733 rtx x, fp;
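/* Bias INPUT by -2^31 so it fits in the signed range, convert it with
   the signed SImode->DFmode instruction, then add 2^31 back to the
   floating-point result. */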
21735 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21736 NULL, 1, OPTAB_DIRECT);
21738 fp = gen_reg_rtx (DFmode);
21739 emit_insn (gen_floatsidf2 (fp, x));
21741 real_ldexp (&TWO31r, &dconst1, 31);
21742 x = const_double_from_real_value (TWO31r, DFmode);
21744 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21745 if (x != target)
21746 emit_move_insn (target, x);
21749 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21750 32-bit mode; otherwise we have a direct convert instruction. */
21752 void
21753 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21755 REAL_VALUE_TYPE TWO32r;
21756 rtx fp_lo, fp_hi, x;
21758 fp_lo = gen_reg_rtx (DFmode);
21759 fp_hi = gen_reg_rtx (DFmode);
21761 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21763 real_ldexp (&TWO32r, &dconst1, 32);
21764 x = const_double_from_real_value (TWO32r, DFmode);
21765 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21767 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21769 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21770 0, OPTAB_DIRECT);
21771 if (x != target)
21772 emit_move_insn (target, x);
21775 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21776 For x86_32, -mfpmath=sse, !optimize_size only. */
21777 void
21778 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21780 REAL_VALUE_TYPE ONE16r;
21781 rtx fp_hi, fp_lo, int_hi, int_lo, x;
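/* Split INPUT into its low and high 16-bit halves, convert each half
   to SFmode, and recombine the result as fp_hi * 2^16 + fp_lo. */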
21783 real_ldexp (&ONE16r, &dconst1, 16);
21784 x = const_double_from_real_value (ONE16r, SFmode);
21785 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
21786 NULL, 0, OPTAB_DIRECT);
21787 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
21788 NULL, 0, OPTAB_DIRECT);
21789 fp_hi = gen_reg_rtx (SFmode);
21790 fp_lo = gen_reg_rtx (SFmode);
21791 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21792 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21793 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21794 0, OPTAB_DIRECT);
21795 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21796 0, OPTAB_DIRECT);
21797 if (!rtx_equal_p (target, fp_hi))
21798 emit_move_insn (target, fp_hi);
21801 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21802 a vector of unsigned ints VAL to vector of floats TARGET. */
21804 void
21805 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21807 rtx tmp[8];
21808 REAL_VALUE_TYPE TWO16r;
21809 machine_mode intmode = GET_MODE (val);
21810 machine_mode fltmode = GET_MODE (target);
21811 rtx (*cvt) (rtx, rtx);
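/* As in the scalar case, split VAL into its low and high 16-bit halves,
   convert each half to float, and recombine as high * 2^16 + low. */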
21813 if (intmode == V4SImode)
21814 cvt = gen_floatv4siv4sf2;
21815 else
21816 cvt = gen_floatv8siv8sf2;
21817 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21818 tmp[0] = force_reg (intmode, tmp[0]);
21819 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21820 OPTAB_DIRECT);
21821 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21822 NULL_RTX, 1, OPTAB_DIRECT);
21823 tmp[3] = gen_reg_rtx (fltmode);
21824 emit_insn (cvt (tmp[3], tmp[1]));
21825 tmp[4] = gen_reg_rtx (fltmode);
21826 emit_insn (cvt (tmp[4], tmp[2]));
21827 real_ldexp (&TWO16r, &dconst1, 16);
21828 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21829 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21830 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21831 OPTAB_DIRECT);
21832 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21833 OPTAB_DIRECT);
21834 if (tmp[7] != target)
21835 emit_move_insn (target, tmp[7]);
21838 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21839 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21840 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21841 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21843 rtx
21844 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21846 REAL_VALUE_TYPE TWO31r;
21847 rtx two31r, tmp[4];
21848 machine_mode mode = GET_MODE (val);
21849 machine_mode scalarmode = GET_MODE_INNER (mode);
21850 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21851 rtx (*cmp) (rtx, rtx, rtx, rtx);
21852 int i;
21854 for (i = 0; i < 3; i++)
21855 tmp[i] = gen_reg_rtx (mode);
21856 real_ldexp (&TWO31r, &dconst1, 31);
21857 two31r = const_double_from_real_value (TWO31r, scalarmode);
21858 two31r = ix86_build_const_vector (mode, 1, two31r);
21859 two31r = force_reg (mode, two31r);
21860 switch (mode)
21862 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21863 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21864 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21865 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21866 default: gcc_unreachable ();
21868 tmp[3] = gen_rtx_LE (mode, two31r, val);
21869 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21870 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21871 0, OPTAB_DIRECT);
21872 if (intmode == V4SImode || TARGET_AVX2)
21873 *xorp = expand_simple_binop (intmode, ASHIFT,
21874 gen_lowpart (intmode, tmp[0]),
21875 GEN_INT (31), NULL_RTX, 0,
21876 OPTAB_DIRECT);
21877 else
21879 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21880 two31 = ix86_build_const_vector (intmode, 1, two31);
21881 *xorp = expand_simple_binop (intmode, AND,
21882 gen_lowpart (intmode, tmp[0]),
21883 two31, NULL_RTX, 0,
21884 OPTAB_DIRECT);
21886 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21887 0, OPTAB_DIRECT);
21890 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21891 then replicate the value for all elements of the vector
21892 register. */
21894 rtx
21895 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21897 int i, n_elt;
21898 rtvec v;
21899 machine_mode scalar_mode;
21901 switch (mode)
21903 case E_V64QImode:
21904 case E_V32QImode:
21905 case E_V16QImode:
21906 case E_V32HImode:
21907 case E_V16HImode:
21908 case E_V8HImode:
21909 case E_V16SImode:
21910 case E_V8SImode:
21911 case E_V4SImode:
21912 case E_V8DImode:
21913 case E_V4DImode:
21914 case E_V2DImode:
21915 gcc_assert (vect);
21916 /* FALLTHRU */
21917 case E_V16SFmode:
21918 case E_V8SFmode:
21919 case E_V4SFmode:
21920 case E_V8DFmode:
21921 case E_V4DFmode:
21922 case E_V2DFmode:
21923 n_elt = GET_MODE_NUNITS (mode);
21924 v = rtvec_alloc (n_elt);
21925 scalar_mode = GET_MODE_INNER (mode);
21927 RTVEC_ELT (v, 0) = value;
21929 for (i = 1; i < n_elt; ++i)
21930 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21932 return gen_rtx_CONST_VECTOR (mode, v);
21934 default:
21935 gcc_unreachable ();
21939 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21940 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21941 for an SSE register. If VECT is true, then replicate the mask for
21942 all elements of the vector register. If INVERT is true, then create
21943 a mask excluding the sign bit. */
21945 rtx
21946 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21948 machine_mode vec_mode, imode;
21949 wide_int w;
21950 rtx mask, v;
21952 switch (mode)
21954 case E_V16SImode:
21955 case E_V16SFmode:
21956 case E_V8SImode:
21957 case E_V4SImode:
21958 case E_V8SFmode:
21959 case E_V4SFmode:
21960 vec_mode = mode;
21961 imode = SImode;
21962 break;
21964 case E_V8DImode:
21965 case E_V4DImode:
21966 case E_V2DImode:
21967 case E_V8DFmode:
21968 case E_V4DFmode:
21969 case E_V2DFmode:
21970 vec_mode = mode;
21971 imode = DImode;
21972 break;
21974 case E_TImode:
21975 case E_TFmode:
21976 vec_mode = VOIDmode;
21977 imode = TImode;
21978 break;
21980 default:
21981 gcc_unreachable ();
21984 machine_mode inner_mode = GET_MODE_INNER (mode);
21985 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21986 GET_MODE_BITSIZE (inner_mode));
21987 if (invert)
21988 w = wi::bit_not (w);
21990 /* Force this value into the low part of a fp vector constant. */
21991 mask = immed_wide_int_const (w, imode);
21992 mask = gen_lowpart (inner_mode, mask);
21994 if (vec_mode == VOIDmode)
21995 return force_reg (inner_mode, mask);
21997 v = ix86_build_const_vector (vec_mode, vect, mask);
21998 return force_reg (vec_mode, v);
22001 /* Generate code for floating point ABS or NEG. */
22003 void
22004 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
22005 rtx operands[])
22007 rtx mask, set, dst, src;
22008 bool use_sse = false;
22009 bool vector_mode = VECTOR_MODE_P (mode);
22010 machine_mode vmode = mode;
22012 if (vector_mode)
22013 use_sse = true;
22014 else if (mode == TFmode)
22015 use_sse = true;
22016 else if (TARGET_SSE_MATH)
22018 use_sse = SSE_FLOAT_MODE_P (mode);
22019 if (mode == SFmode)
22020 vmode = V4SFmode;
22021 else if (mode == DFmode)
22022 vmode = V2DFmode;
22025 /* NEG and ABS performed with SSE use bitwise mask operations.
22026 Create the appropriate mask now. */
22027 if (use_sse)
22028 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
22029 else
22030 mask = NULL_RTX;
22032 dst = operands[0];
22033 src = operands[1];
22035 set = gen_rtx_fmt_e (code, mode, src);
22036 set = gen_rtx_SET (dst, set);
22038 if (mask)
22040 rtx use, clob;
22041 rtvec par;
22043 use = gen_rtx_USE (VOIDmode, mask);
22044 if (vector_mode)
22045 par = gen_rtvec (2, set, use);
22046 else
22048 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
22049 par = gen_rtvec (3, set, use, clob);
22051 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
22053 else
22054 emit_insn (set);
22057 /* Expand a copysign operation. Special case operand 0 being a constant. */
22059 void
22060 ix86_expand_copysign (rtx operands[])
22062 machine_mode mode, vmode;
22063 rtx dest, op0, op1, mask, nmask;
22065 dest = operands[0];
22066 op0 = operands[1];
22067 op1 = operands[2];
22069 mode = GET_MODE (dest);
22071 if (mode == SFmode)
22072 vmode = V4SFmode;
22073 else if (mode == DFmode)
22074 vmode = V2DFmode;
22075 else
22076 vmode = mode;
22078 if (CONST_DOUBLE_P (op0))
22080 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
22082 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
22083 op0 = simplify_unary_operation (ABS, mode, op0, mode);
22085 if (mode == SFmode || mode == DFmode)
22087 if (op0 == CONST0_RTX (mode))
22088 op0 = CONST0_RTX (vmode);
22089 else
22091 rtx v = ix86_build_const_vector (vmode, false, op0);
22093 op0 = force_reg (vmode, v);
22096 else if (op0 != CONST0_RTX (mode))
22097 op0 = force_reg (mode, op0);
22099 mask = ix86_build_signbit_mask (vmode, 0, 0);
22101 if (mode == SFmode)
22102 copysign_insn = gen_copysignsf3_const;
22103 else if (mode == DFmode)
22104 copysign_insn = gen_copysigndf3_const;
22105 else
22106 copysign_insn = gen_copysigntf3_const;
22108 emit_insn (copysign_insn (dest, op0, op1, mask));
22110 else
22112 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
22114 nmask = ix86_build_signbit_mask (vmode, 0, 1);
22115 mask = ix86_build_signbit_mask (vmode, 0, 0);
22117 if (mode == SFmode)
22118 copysign_insn = gen_copysignsf3_var;
22119 else if (mode == DFmode)
22120 copysign_insn = gen_copysigndf3_var;
22121 else
22122 copysign_insn = gen_copysigntf3_var;
22124 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
22128 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
22129 be a constant, and so has already been expanded into a vector constant. */
22131 void
22132 ix86_split_copysign_const (rtx operands[])
22134 machine_mode mode, vmode;
22135 rtx dest, op0, mask, x;
22137 dest = operands[0];
22138 op0 = operands[1];
22139 mask = operands[3];
22141 mode = GET_MODE (dest);
22142 vmode = GET_MODE (mask);
22144 dest = lowpart_subreg (vmode, dest, mode);
22145 x = gen_rtx_AND (vmode, dest, mask);
22146 emit_insn (gen_rtx_SET (dest, x));
22148 if (op0 != CONST0_RTX (vmode))
22150 x = gen_rtx_IOR (vmode, dest, op0);
22151 emit_insn (gen_rtx_SET (dest, x));
22155 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
22156 so we have to do two masks. */
22158 void
22159 ix86_split_copysign_var (rtx operands[])
22161 machine_mode mode, vmode;
22162 rtx dest, scratch, op0, op1, mask, nmask, x;
22164 dest = operands[0];
22165 scratch = operands[1];
22166 op0 = operands[2];
22167 op1 = operands[3];
22168 nmask = operands[4];
22169 mask = operands[5];
22171 mode = GET_MODE (dest);
22172 vmode = GET_MODE (mask);
22174 if (rtx_equal_p (op0, op1))
22176 /* Shouldn't happen often (it's useless, obviously), but when it does
22177 we'd generate incorrect code if we continue below. */
22178 emit_move_insn (dest, op0);
22179 return;
22182 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
22184 gcc_assert (REGNO (op1) == REGNO (scratch));
22186 x = gen_rtx_AND (vmode, scratch, mask);
22187 emit_insn (gen_rtx_SET (scratch, x));
22189 dest = mask;
22190 op0 = lowpart_subreg (vmode, op0, mode);
22191 x = gen_rtx_NOT (vmode, dest);
22192 x = gen_rtx_AND (vmode, x, op0);
22193 emit_insn (gen_rtx_SET (dest, x));
22195 else
22197 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
22199 x = gen_rtx_AND (vmode, scratch, mask);
22201 else /* alternative 2,4 */
22203 gcc_assert (REGNO (mask) == REGNO (scratch));
22204 op1 = lowpart_subreg (vmode, op1, mode);
22205 x = gen_rtx_AND (vmode, scratch, op1);
22207 emit_insn (gen_rtx_SET (scratch, x));
22209 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
22211 dest = lowpart_subreg (vmode, op0, mode);
22212 x = gen_rtx_AND (vmode, dest, nmask);
22214 else /* alternative 3,4 */
22216 gcc_assert (REGNO (nmask) == REGNO (dest));
22217 dest = nmask;
22218 op0 = lowpart_subreg (vmode, op0, mode);
22219 x = gen_rtx_AND (vmode, dest, op0);
22221 emit_insn (gen_rtx_SET (dest, x));
22224 x = gen_rtx_IOR (vmode, dest, scratch);
22225 emit_insn (gen_rtx_SET (dest, x));
22228 /* Return TRUE or FALSE depending on whether the first SET in INSN
22229 has source and destination with matching CC modes and whether the
22230 CC mode is at least as constrained as REQ_MODE. */
22232 bool
22233 ix86_match_ccmode (rtx insn, machine_mode req_mode)
22235 rtx set;
22236 machine_mode set_mode;
22238 set = PATTERN (insn);
22239 if (GET_CODE (set) == PARALLEL)
22240 set = XVECEXP (set, 0, 0);
22241 gcc_assert (GET_CODE (set) == SET);
22242 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
22244 set_mode = GET_MODE (SET_DEST (set));
22245 switch (set_mode)
22247 case E_CCNOmode:
22248 if (req_mode != CCNOmode
22249 && (req_mode != CCmode
22250 || XEXP (SET_SRC (set), 1) != const0_rtx))
22251 return false;
22252 break;
22253 case E_CCmode:
22254 if (req_mode == CCGCmode)
22255 return false;
22256 /* FALLTHRU */
22257 case E_CCGCmode:
22258 if (req_mode == CCGOCmode || req_mode == CCNOmode)
22259 return false;
22260 /* FALLTHRU */
22261 case E_CCGOCmode:
22262 if (req_mode == CCZmode)
22263 return false;
22264 /* FALLTHRU */
22265 case E_CCZmode:
22266 break;
22268 case E_CCGZmode:
22270 case E_CCAmode:
22271 case E_CCCmode:
22272 case E_CCOmode:
22273 case E_CCPmode:
22274 case E_CCSmode:
22275 if (set_mode != req_mode)
22276 return false;
22277 break;
22279 default:
22280 gcc_unreachable ();
22283 return GET_MODE (SET_SRC (set)) == set_mode;
22286 /* Generate insn patterns to do an integer compare of OPERANDS. */
22288 static rtx
22289 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
22291 machine_mode cmpmode;
22292 rtx tmp, flags;
22294 cmpmode = SELECT_CC_MODE (code, op0, op1);
22295 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
22297 /* This is very simple, but making the interface the same as in the
22298 FP case makes the rest of the code easier. */
22299 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
22300 emit_insn (gen_rtx_SET (flags, tmp));
22302 /* Return the test that should be put into the flags user, i.e.
22303 the bcc, scc, or cmov instruction. */
22304 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
22307 /* Figure out whether to use unordered fp comparisons. */
22309 static bool
22310 ix86_unordered_fp_compare (enum rtx_code code)
22312 if (!TARGET_IEEE_FP)
22313 return false;
22315 switch (code)
22317 case GT:
22318 case GE:
22319 case LT:
22320 case LE:
22321 return false;
22323 case EQ:
22324 case NE:
22326 case LTGT:
22327 case UNORDERED:
22328 case ORDERED:
22329 case UNLT:
22330 case UNLE:
22331 case UNGT:
22332 case UNGE:
22333 case UNEQ:
22334 return true;
22336 default:
22337 gcc_unreachable ();
22341 machine_mode
22342 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
22344 machine_mode mode = GET_MODE (op0);
22346 if (SCALAR_FLOAT_MODE_P (mode))
22348 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22349 return CCFPmode;
22352 switch (code)
22354 /* Only zero flag is needed. */
22355 case EQ: /* ZF=0 */
22356 case NE: /* ZF!=0 */
22357 return CCZmode;
22358 /* Codes needing carry flag. */
22359 case GEU: /* CF=0 */
22360 case LTU: /* CF=1 */
22361 /* Detect overflow checks. They need just the carry flag. */
22362 if (GET_CODE (op0) == PLUS
22363 && (rtx_equal_p (op1, XEXP (op0, 0))
22364 || rtx_equal_p (op1, XEXP (op0, 1))))
22365 return CCCmode;
22366 else
22367 return CCmode;
22368 case GTU: /* CF=0 & ZF=0 */
22369 case LEU: /* CF=1 | ZF=1 */
22370 return CCmode;
22371 /* Codes possibly doable only with sign flag when
22372 comparing against zero. */
22373 case GE: /* SF=OF or SF=0 */
22374 case LT: /* SF<>OF or SF=1 */
22375 if (op1 == const0_rtx)
22376 return CCGOCmode;
22377 else
22378 /* For other cases Carry flag is not required. */
22379 return CCGCmode;
22380 /* Codes doable only with the sign flag when comparing
22381 against zero, but there is no jump instruction for them,
22382 so we need to use a relational test against overflow,
22383 which thus needs to be zero. */
22384 case GT: /* ZF=0 & SF=OF */
22385 case LE: /* ZF=1 | SF<>OF */
22386 if (op1 == const0_rtx)
22387 return CCNOmode;
22388 else
22389 return CCGCmode;
22390 /* The strcmp pattern does (use flags), and combine may ask us for the
22391 proper mode. */
22392 case USE:
22393 return CCmode;
22394 default:
22395 gcc_unreachable ();
22399 /* Return the fixed registers used for condition codes. */
22401 static bool
22402 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
22404 *p1 = FLAGS_REG;
22405 *p2 = FPSR_REG;
22406 return true;
22409 /* If two condition code modes are compatible, return a condition code
22410 mode which is compatible with both. Otherwise, return
22411 VOIDmode. */
22413 static machine_mode
22414 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
22416 if (m1 == m2)
22417 return m1;
22419 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
22420 return VOIDmode;
22422 if ((m1 == CCGCmode && m2 == CCGOCmode)
22423 || (m1 == CCGOCmode && m2 == CCGCmode))
22424 return CCGCmode;
22426 if ((m1 == CCNOmode && m2 == CCGOCmode)
22427 || (m1 == CCGOCmode && m2 == CCNOmode))
22428 return CCNOmode;
22430 if (m1 == CCZmode
22431 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
22432 return m2;
22433 else if (m2 == CCZmode
22434 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
22435 return m1;
22437 switch (m1)
22439 default:
22440 gcc_unreachable ();
22442 case E_CCmode:
22443 case E_CCGCmode:
22444 case E_CCGOCmode:
22445 case E_CCNOmode:
22446 case E_CCAmode:
22447 case E_CCCmode:
22448 case E_CCOmode:
22449 case E_CCPmode:
22450 case E_CCSmode:
22451 case E_CCZmode:
22452 switch (m2)
22454 default:
22455 return VOIDmode;
22457 case E_CCmode:
22458 case E_CCGCmode:
22459 case E_CCGOCmode:
22460 case E_CCNOmode:
22461 case E_CCAmode:
22462 case E_CCCmode:
22463 case E_CCOmode:
22464 case E_CCPmode:
22465 case E_CCSmode:
22466 case E_CCZmode:
22467 return CCmode;
22470 case E_CCFPmode:
22471 /* These are only compatible with themselves, which we already
22472 checked above. */
22473 return VOIDmode;
22478 /* Return a comparison we can do that is equivalent to
22479 swap_condition (code), apart possibly from orderedness.
22480 But never change orderedness if TARGET_IEEE_FP, returning
22481 UNKNOWN in that case if necessary. */
22483 static enum rtx_code
22484 ix86_fp_swap_condition (enum rtx_code code)
22486 switch (code)
22488 case GT: /* GTU - CF=0 & ZF=0 */
22489 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22490 case GE: /* GEU - CF=0 */
22491 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22492 case UNLT: /* LTU - CF=1 */
22493 return TARGET_IEEE_FP ? UNKNOWN : GT;
22494 case UNLE: /* LEU - CF=1 | ZF=1 */
22495 return TARGET_IEEE_FP ? UNKNOWN : GE;
22496 default:
22497 return swap_condition (code);
22501 /* Return the cost of comparison CODE using the best strategy for performance.
22502 All following functions use the number of instructions as a cost metric.
22503 In the future this should be tweaked to compute bytes for optimize_size and
22504 take into account the performance of various instructions on various CPUs. */
22506 static int
22507 ix86_fp_comparison_cost (enum rtx_code code)
22509 int arith_cost;
22511 /* The cost of code using bit-twiddling on %ah. */
22512 switch (code)
22514 case UNLE:
22515 case UNLT:
22516 case LTGT:
22517 case GT:
22518 case GE:
22519 case UNORDERED:
22520 case ORDERED:
22521 case UNEQ:
22522 arith_cost = 4;
22523 break;
22524 case LT:
22525 case NE:
22526 case EQ:
22527 case UNGE:
22528 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22529 break;
22530 case LE:
22531 case UNGT:
22532 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22533 break;
22534 default:
22535 gcc_unreachable ();
22538 switch (ix86_fp_comparison_strategy (code))
22540 case IX86_FPCMP_COMI:
22541 return arith_cost > 4 ? 3 : 2;
22542 case IX86_FPCMP_SAHF:
22543 return arith_cost > 4 ? 4 : 3;
22544 default:
22545 return arith_cost;
22549 /* Return the strategy to use for a floating-point comparison. We assume
22550 that fcomi is always preferable where available, since that is also true when
22551 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22553 enum ix86_fpcmp_strategy
22554 ix86_fp_comparison_strategy (enum rtx_code)
22556 /* Do fcomi/sahf based test when profitable. */
22558 if (TARGET_CMOVE)
22559 return IX86_FPCMP_COMI;
22561 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22562 return IX86_FPCMP_SAHF;
22564 return IX86_FPCMP_ARITH;
22567 /* Swap, force into registers, or otherwise massage the two operands
22568 to a fp comparison. The operands are updated in place; the new
22569 comparison code is returned. */
22571 static enum rtx_code
22572 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22574 bool unordered_compare = ix86_unordered_fp_compare (code);
22575 rtx op0 = *pop0, op1 = *pop1;
22576 machine_mode op_mode = GET_MODE (op0);
22577 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22579 /* All of the unordered compare instructions only work on registers.
22580 The same is true of the fcomi compare instructions. The XFmode
22581 compare instructions require registers except when comparing
22582 against zero or when converting operand 1 from fixed point to
22583 floating point. */
22585 if (!is_sse
22586 && (unordered_compare
22587 || (op_mode == XFmode
22588 && ! (standard_80387_constant_p (op0) == 1
22589 || standard_80387_constant_p (op1) == 1)
22590 && GET_CODE (op1) != FLOAT)
22591 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22593 op0 = force_reg (op_mode, op0);
22594 op1 = force_reg (op_mode, op1);
22596 else
22598 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22599 things around if they appear profitable, otherwise force op0
22600 into a register. */
22602 if (standard_80387_constant_p (op0) == 0
22603 || (MEM_P (op0)
22604 && ! (standard_80387_constant_p (op1) == 0
22605 || MEM_P (op1))))
22607 enum rtx_code new_code = ix86_fp_swap_condition (code);
22608 if (new_code != UNKNOWN)
22610 std::swap (op0, op1);
22611 code = new_code;
22615 if (!REG_P (op0))
22616 op0 = force_reg (op_mode, op0);
22618 if (CONSTANT_P (op1))
22620 int tmp = standard_80387_constant_p (op1);
22621 if (tmp == 0)
22622 op1 = validize_mem (force_const_mem (op_mode, op1));
22623 else if (tmp == 1)
22625 if (TARGET_CMOVE)
22626 op1 = force_reg (op_mode, op1);
22628 else
22629 op1 = force_reg (op_mode, op1);
22633 /* Try to rearrange the comparison to make it cheaper. */
22634 if (ix86_fp_comparison_cost (code)
22635 > ix86_fp_comparison_cost (swap_condition (code))
22636 && (REG_P (op1) || can_create_pseudo_p ()))
22638 std::swap (op0, op1);
22639 code = swap_condition (code);
22640 if (!REG_P (op0))
22641 op0 = force_reg (op_mode, op0);
22644 *pop0 = op0;
22645 *pop1 = op1;
22646 return code;
22649 /* Convert the comparison codes we use to represent an FP comparison to the
22650 integer code that will result in a proper branch. Return UNKNOWN if no
22651 such code is available. */
22653 enum rtx_code
22654 ix86_fp_compare_code_to_integer (enum rtx_code code)
22656 switch (code)
22658 case GT:
22659 return GTU;
22660 case GE:
22661 return GEU;
22662 case ORDERED:
22663 case UNORDERED:
22664 return code;
22665 case UNEQ:
22666 return EQ;
22667 case UNLT:
22668 return LTU;
22669 case UNLE:
22670 return LEU;
22671 case LTGT:
22672 return NE;
22673 default:
22674 return UNKNOWN;
22678 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22680 static rtx
22681 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22683 bool unordered_compare = ix86_unordered_fp_compare (code);
22684 machine_mode intcmp_mode;
22685 rtx tmp, tmp2;
22687 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22689 /* Do fcomi/sahf based test when profitable. */
22690 switch (ix86_fp_comparison_strategy (code))
22692 case IX86_FPCMP_COMI:
22693 intcmp_mode = CCFPmode;
22694 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22695 if (unordered_compare)
22696 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22697 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22698 break;
22700 case IX86_FPCMP_SAHF:
22701 intcmp_mode = CCFPmode;
22702 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22703 if (unordered_compare)
22704 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22705 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22706 if (!scratch)
22707 scratch = gen_reg_rtx (HImode);
22708 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22709 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22710 break;
22712 case IX86_FPCMP_ARITH:
22713 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22714 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22715 if (unordered_compare)
22716 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22717 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22718 if (!scratch)
22719 scratch = gen_reg_rtx (HImode);
22720 emit_insn (gen_rtx_SET (scratch, tmp));
22722 /* In the unordered case, we have to check C2 for NaN's, which
22723 doesn't happen to work out to anything nice combination-wise.
22724 So do some bit twiddling on the value we've got in AH to come
22725 up with an appropriate set of condition codes. */
22727 intcmp_mode = CCNOmode;
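/* After fnstsw, AH holds the x87 condition bits: C0 is 0x01, C2 is
   0x04 and C3 is 0x40, so e.g. 0x45 tests C0|C2|C3. */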
22728 switch (code)
22730 case GT:
22731 case UNGT:
22732 if (code == GT || !TARGET_IEEE_FP)
22734 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22735 code = EQ;
22737 else
22739 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22740 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22741 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22742 intcmp_mode = CCmode;
22743 code = GEU;
22745 break;
22746 case LT:
22747 case UNLT:
22748 if (code == LT && TARGET_IEEE_FP)
22750 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22751 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22752 intcmp_mode = CCmode;
22753 code = EQ;
22755 else
22757 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22758 code = NE;
22760 break;
22761 case GE:
22762 case UNGE:
22763 if (code == GE || !TARGET_IEEE_FP)
22765 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22766 code = EQ;
22768 else
22770 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22771 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22772 code = NE;
22774 break;
22775 case LE:
22776 case UNLE:
22777 if (code == LE && TARGET_IEEE_FP)
22779 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22780 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22781 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22782 intcmp_mode = CCmode;
22783 code = LTU;
22785 else
22787 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22788 code = NE;
22790 break;
22791 case EQ:
22792 case UNEQ:
22793 if (code == EQ && TARGET_IEEE_FP)
22795 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22796 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22797 intcmp_mode = CCmode;
22798 code = EQ;
22800 else
22802 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22803 code = NE;
22805 break;
22806 case NE:
22807 case LTGT:
22808 if (code == NE && TARGET_IEEE_FP)
22810 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22811 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22812 GEN_INT (0x40)));
22813 code = NE;
22815 else
22817 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22818 code = EQ;
22820 break;
22822 case UNORDERED:
22823 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22824 code = NE;
22825 break;
22826 case ORDERED:
22827 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22828 code = EQ;
22829 break;
22831 default:
22832 gcc_unreachable ();
22834 break;
22836 default:
22837 gcc_unreachable ();
22840 /* Return the test that should be put into the flags user, i.e.
22841 the bcc, scc, or cmov instruction. */
22842 return gen_rtx_fmt_ee (code, VOIDmode,
22843 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22844 const0_rtx);
22847 static rtx
22848 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22850 rtx ret;
22852 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22853 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22855 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22857 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22858 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22860 else
22861 ret = ix86_expand_int_compare (code, op0, op1);
22863 return ret;
22866 void
22867 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22869 machine_mode mode = GET_MODE (op0);
22870 rtx tmp;
22872 /* Handle the special case of a vector comparison with a boolean result;
22873 transform it using the ptest instruction. */
22874 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22876 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22877 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22879 gcc_assert (code == EQ || code == NE);
22880 /* Generate XOR since we can't check that one operand is zero vector. */
22881 tmp = gen_reg_rtx (mode);
22882 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22883 tmp = gen_lowpart (p_mode, tmp);
22884 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22885 gen_rtx_UNSPEC (CCmode,
22886 gen_rtvec (2, tmp, tmp),
22887 UNSPEC_PTEST)));
22888 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22889 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22890 gen_rtx_LABEL_REF (VOIDmode, label),
22891 pc_rtx);
22892 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22893 return;
22896 switch (mode)
22898 case E_SFmode:
22899 case E_DFmode:
22900 case E_XFmode:
22901 case E_QImode:
22902 case E_HImode:
22903 case E_SImode:
22904 simple:
22905 tmp = ix86_expand_compare (code, op0, op1);
22906 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22907 gen_rtx_LABEL_REF (VOIDmode, label),
22908 pc_rtx);
22909 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22910 return;
22912 case E_DImode:
22913 if (TARGET_64BIT)
22914 goto simple;
22915 /* For a 32-bit target, a DI comparison may be performed in
22916 SSE registers. To allow this we should avoid splitting
22917 to SI mode, which is achieved by doing the xor in DI mode
22918 and then comparing with zero (which is recognized by the
22919 STV pass). We don't compare using xor when optimizing
22920 for size. */
22921 if (!optimize_insn_for_size_p ()
22922 && TARGET_STV
22923 && (code == EQ || code == NE))
22925 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22926 op1 = const0_rtx;
22928 /* FALLTHRU */
22929 case E_TImode:
22930 /* Expand DImode branch into multiple compare+branch. */
22932 rtx lo[2], hi[2];
22933 rtx_code_label *label2;
22934 enum rtx_code code1, code2, code3;
22935 machine_mode submode;
22937 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22939 std::swap (op0, op1);
22940 code = swap_condition (code);
22943 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22944 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22946 submode = mode == DImode ? SImode : DImode;
22948 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22949 avoid two branches. This costs one extra insn, so disable when
22950 optimizing for size. */
22952 if ((code == EQ || code == NE)
22953 && (!optimize_insn_for_size_p ()
22954 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22956 rtx xor0, xor1;
22958 xor1 = hi[0];
22959 if (hi[1] != const0_rtx)
22960 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22961 NULL_RTX, 0, OPTAB_WIDEN);
22963 xor0 = lo[0];
22964 if (lo[1] != const0_rtx)
22965 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22966 NULL_RTX, 0, OPTAB_WIDEN);
22968 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22969 NULL_RTX, 0, OPTAB_WIDEN);
22971 ix86_expand_branch (code, tmp, const0_rtx, label);
22972 return;
22975 /* Otherwise, if we are doing a less-than or greater-or-equal
22976 comparison, op1 is a constant and the low word is zero, then we can
22977 just examine the high word. Similarly for a low word of -1 and
22978 less-or-equal or greater-than. */
22980 if (CONST_INT_P (hi[1]))
22981 switch (code)
22983 case LT: case LTU: case GE: case GEU:
22984 if (lo[1] == const0_rtx)
22986 ix86_expand_branch (code, hi[0], hi[1], label);
22987 return;
22989 break;
22990 case LE: case LEU: case GT: case GTU:
22991 if (lo[1] == constm1_rtx)
22993 ix86_expand_branch (code, hi[0], hi[1], label);
22994 return;
22996 break;
22997 default:
22998 break;
23001 /* Emulate comparisons that do not depend on Zero flag with
23002 double-word subtraction. Note that only Overflow, Sign
23003 and Carry flags are valid, so swap arguments and condition
23004 of comparisons that would otherwise test Zero flag. */
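/* Added sketch (illustrative, Intel operand order): for a signed DImode
   "a < b" on a 32-bit target the expansion below is roughly
     cmp  lo(a), lo(b)        ; low-word subtract, kept only for its CF
     sbb  hi(a), hi(b)        ; hi(a) - hi(b) - CF, result discarded
     jl   label               ; branch on the resulting SF/OF
   so the borrow of the low words feeds the high-word subtract-with-borrow.  */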
23006 switch (code)
23008 case LE: case LEU: case GT: case GTU:
23009 std::swap (lo[0], lo[1]);
23010 std::swap (hi[0], hi[1]);
23011 code = swap_condition (code);
23012 /* FALLTHRU */
23014 case LT: case LTU: case GE: case GEU:
23016 rtx (*cmp_insn) (rtx, rtx);
23017 rtx (*sbb_insn) (rtx, rtx, rtx);
23018 bool uns = (code == LTU || code == GEU);
23020 if (TARGET_64BIT)
23022 cmp_insn = gen_cmpdi_1;
23023 sbb_insn
23024 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
23026 else
23028 cmp_insn = gen_cmpsi_1;
23029 sbb_insn
23030 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
23033 if (!nonimmediate_operand (lo[0], submode))
23034 lo[0] = force_reg (submode, lo[0]);
23035 if (!x86_64_general_operand (lo[1], submode))
23036 lo[1] = force_reg (submode, lo[1]);
23038 if (!register_operand (hi[0], submode))
23039 hi[0] = force_reg (submode, hi[0]);
23040 if ((uns && !nonimmediate_operand (hi[1], submode))
23041 || (!uns && !x86_64_general_operand (hi[1], submode)))
23042 hi[1] = force_reg (submode, hi[1]);
23044 emit_insn (cmp_insn (lo[0], lo[1]));
23045 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
23047 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
23049 ix86_expand_branch (code, tmp, const0_rtx, label);
23050 return;
23053 default:
23054 break;
23057 /* Otherwise, we need two or three jumps. */
23059 label2 = gen_label_rtx ();
23061 code1 = code;
23062 code2 = swap_condition (code);
23063 code3 = unsigned_condition (code);
23065 switch (code)
23067 case LT: case GT: case LTU: case GTU:
23068 break;
23070 case LE: code1 = LT; code2 = GT; break;
23071 case GE: code1 = GT; code2 = LT; break;
23072 case LEU: code1 = LTU; code2 = GTU; break;
23073 case GEU: code1 = GTU; code2 = LTU; break;
23075 case EQ: code1 = UNKNOWN; code2 = NE; break;
23076 case NE: code2 = UNKNOWN; break;
23078 default:
23079 gcc_unreachable ();
23083 * a < b =>
23084 * if (hi(a) < hi(b)) goto true;
23085 * if (hi(a) > hi(b)) goto false;
23086 * if (lo(a) < lo(b)) goto true;
23087 * false:
23090 if (code1 != UNKNOWN)
23091 ix86_expand_branch (code1, hi[0], hi[1], label);
23092 if (code2 != UNKNOWN)
23093 ix86_expand_branch (code2, hi[0], hi[1], label2);
23095 ix86_expand_branch (code3, lo[0], lo[1], label);
23097 if (code2 != UNKNOWN)
23098 emit_label (label2);
23099 return;
23102 default:
23103 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
23104 goto simple;
23108 void
23109 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
23111 rtx ret;
23113 gcc_assert (GET_MODE (dest) == QImode);
23115 ret = ix86_expand_compare (code, op0, op1);
23116 PUT_MODE (ret, QImode);
23117 emit_insn (gen_rtx_SET (dest, ret));
23120 /* Expand a comparison setting or clearing the carry flag. Return true
23121 when successful and set *POP to the comparison operation. */
23122 static bool
23123 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
23125 machine_mode mode =
23126 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
23128 /* Do not handle double-mode compares that go through the special path. */
23129 if (mode == (TARGET_64BIT ? TImode : DImode))
23130 return false;
23132 if (SCALAR_FLOAT_MODE_P (mode))
23134 rtx compare_op;
23135 rtx_insn *compare_seq;
23137 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
23139 /* Shortcut: the following common codes never translate
23140 into carry-flag compares. */
23141 if (code == EQ || code == NE || code == UNEQ || code == LTGT
23142 || code == ORDERED || code == UNORDERED)
23143 return false;
23145 /* These comparisons require zero flag; swap operands so they won't. */
23146 if ((code == GT || code == UNLE || code == LE || code == UNGT)
23147 && !TARGET_IEEE_FP)
23149 std::swap (op0, op1);
23150 code = swap_condition (code);
23153 /* Try to expand the comparison and verify that we end up with
23154 a carry-flag-based comparison. This fails only when we decide
23155 to expand the comparison using arithmetic, which is not a
23156 common scenario. */
23157 start_sequence ();
23158 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
23159 compare_seq = get_insns ();
23160 end_sequence ();
23162 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
23163 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
23164 else
23165 code = GET_CODE (compare_op);
23167 if (code != LTU && code != GEU)
23168 return false;
23170 emit_insn (compare_seq);
23171 *pop = compare_op;
23172 return true;
23175 if (!INTEGRAL_MODE_P (mode))
23176 return false;
23178 switch (code)
23180 case LTU:
23181 case GEU:
23182 break;
23184 /* Convert a==0 into (unsigned)a<1. */
23185 case EQ:
23186 case NE:
23187 if (op1 != const0_rtx)
23188 return false;
23189 op1 = const1_rtx;
23190 code = (code == EQ ? LTU : GEU);
23191 break;
23193 /* Convert a>b into b<a or a>=b-1. */
23194 case GTU:
23195 case LEU:
23196 if (CONST_INT_P (op1))
23198 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
23199 /* Bail out on overflow. We could still swap the operands, but that
23200 would force loading the constant into a register. */
23201 if (op1 == const0_rtx
23202 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
23203 return false;
23204 code = (code == GTU ? GEU : LTU);
23206 else
23208 std::swap (op0, op1);
23209 code = (code == GTU ? LTU : GEU);
23211 break;
23213 /* Convert a>=0 into (unsigned)a<0x80000000. */
23214 case LT:
23215 case GE:
23216 if (mode == DImode || op1 != const0_rtx)
23217 return false;
23218 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23219 code = (code == LT ? GEU : LTU);
23220 break;
23221 case LE:
23222 case GT:
23223 if (mode == DImode || op1 != constm1_rtx)
23224 return false;
23225 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
23226 code = (code == LE ? GEU : LTU);
23227 break;
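/* Added summary of the rewrites above: each one turns the comparison into
   a form whose truth is carried solely by the carry flag, e.g.
     a == 0  ->  (unsigned) a < 1
     a >  b  ->  (unsigned) b < a    (or a >= b + 1 for a constant b)
     a >= 0  ->  (unsigned) a < 0x80000000
   so that a following sbb/adc can consume CF directly.  */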
23229 default:
23230 return false;
23232 /* Swapping operands may cause a constant to appear as the first operand. */
23233 if (!nonimmediate_operand (op0, VOIDmode))
23235 if (!can_create_pseudo_p ())
23236 return false;
23237 op0 = force_reg (mode, op0);
23239 *pop = ix86_expand_compare (code, op0, op1);
23240 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
23241 return true;
23244 bool
23245 ix86_expand_int_movcc (rtx operands[])
23247 enum rtx_code code = GET_CODE (operands[1]), compare_code;
23248 rtx_insn *compare_seq;
23249 rtx compare_op;
23250 machine_mode mode = GET_MODE (operands[0]);
23251 bool sign_bit_compare_p = false;
23252 rtx op0 = XEXP (operands[1], 0);
23253 rtx op1 = XEXP (operands[1], 1);
23255 if (GET_MODE (op0) == TImode
23256 || (GET_MODE (op0) == DImode
23257 && !TARGET_64BIT))
23258 return false;
23260 start_sequence ();
23261 compare_op = ix86_expand_compare (code, op0, op1);
23262 compare_seq = get_insns ();
23263 end_sequence ();
23265 compare_code = GET_CODE (compare_op);
23267 if ((op1 == const0_rtx && (code == GE || code == LT))
23268 || (op1 == constm1_rtx && (code == GT || code == LE)))
23269 sign_bit_compare_p = true;
23271 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
23272 HImode insns, we'd be swallowed in word prefix ops. */
23274 if ((mode != HImode || TARGET_FAST_PREFIX)
23275 && (mode != (TARGET_64BIT ? TImode : DImode))
23276 && CONST_INT_P (operands[2])
23277 && CONST_INT_P (operands[3]))
23279 rtx out = operands[0];
23280 HOST_WIDE_INT ct = INTVAL (operands[2]);
23281 HOST_WIDE_INT cf = INTVAL (operands[3]);
23282 HOST_WIDE_INT diff;
23284 diff = ct - cf;
23285 /* Sign-bit compares are better done using shifts than by using
23286 sbb. */
23287 if (sign_bit_compare_p
23288 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
23290 /* Detect overlap between destination and compare sources. */
23291 rtx tmp = out;
23293 if (!sign_bit_compare_p)
23295 rtx flags;
23296 bool fpcmp = false;
23298 compare_code = GET_CODE (compare_op);
23300 flags = XEXP (compare_op, 0);
23302 if (GET_MODE (flags) == CCFPmode)
23304 fpcmp = true;
23305 compare_code
23306 = ix86_fp_compare_code_to_integer (compare_code);
23309 /* To simplify the rest of the code, restrict to the GEU case. */
23310 if (compare_code == LTU)
23312 std::swap (ct, cf);
23313 compare_code = reverse_condition (compare_code);
23314 code = reverse_condition (code);
23316 else
23318 if (fpcmp)
23319 PUT_CODE (compare_op,
23320 reverse_condition_maybe_unordered
23321 (GET_CODE (compare_op)));
23322 else
23323 PUT_CODE (compare_op,
23324 reverse_condition (GET_CODE (compare_op)));
23326 diff = ct - cf;
23328 if (reg_overlap_mentioned_p (out, op0)
23329 || reg_overlap_mentioned_p (out, op1))
23330 tmp = gen_reg_rtx (mode);
23332 if (mode == DImode)
23333 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
23334 else
23335 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
23336 flags, compare_op));
23338 else
23340 if (code == GT || code == GE)
23341 code = reverse_condition (code);
23342 else
23344 std::swap (ct, cf);
23345 diff = ct - cf;
23347 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
23350 if (diff == 1)
23353 * cmpl op0,op1
23354 * sbbl dest,dest
23355 * [addl dest, ct]
23357 * Size 5 - 8.
23359 if (ct)
23360 tmp = expand_simple_binop (mode, PLUS,
23361 tmp, GEN_INT (ct),
23362 copy_rtx (tmp), 1, OPTAB_DIRECT);
23364 else if (cf == -1)
23367 * cmpl op0,op1
23368 * sbbl dest,dest
23369 * orl $ct, dest
23371 * Size 8.
23373 tmp = expand_simple_binop (mode, IOR,
23374 tmp, GEN_INT (ct),
23375 copy_rtx (tmp), 1, OPTAB_DIRECT);
23377 else if (diff == -1 && ct)
23380 * cmpl op0,op1
23381 * sbbl dest,dest
23382 * notl dest
23383 * [addl dest, cf]
23385 * Size 8 - 11.
23387 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23388 if (cf)
23389 tmp = expand_simple_binop (mode, PLUS,
23390 copy_rtx (tmp), GEN_INT (cf),
23391 copy_rtx (tmp), 1, OPTAB_DIRECT);
23393 else
23396 * cmpl op0,op1
23397 * sbbl dest,dest
23398 * [notl dest]
23399 * andl cf - ct, dest
23400 * [addl dest, ct]
23402 * Size 8 - 11.
23405 if (cf == 0)
23407 cf = ct;
23408 ct = 0;
23409 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
23412 tmp = expand_simple_binop (mode, AND,
23413 copy_rtx (tmp),
23414 gen_int_mode (cf - ct, mode),
23415 copy_rtx (tmp), 1, OPTAB_DIRECT);
23416 if (ct)
23417 tmp = expand_simple_binop (mode, PLUS,
23418 copy_rtx (tmp), GEN_INT (ct),
23419 copy_rtx (tmp), 1, OPTAB_DIRECT);
23422 if (!rtx_equal_p (tmp, out))
23423 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
23425 return true;
23428 if (diff < 0)
23430 machine_mode cmp_mode = GET_MODE (op0);
23431 enum rtx_code new_code;
23433 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23435 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23437 /* We may be reversing an unordered compare to a normal compare, which
23438 is not valid in general (we may convert a non-trapping condition
23439 to a trapping one); however, on i386 we currently emit all
23440 comparisons unordered. */
23441 new_code = reverse_condition_maybe_unordered (code);
23443 else
23444 new_code = ix86_reverse_condition (code, cmp_mode);
23445 if (new_code != UNKNOWN)
23447 std::swap (ct, cf);
23448 diff = -diff;
23449 code = new_code;
23453 compare_code = UNKNOWN;
23454 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
23455 && CONST_INT_P (op1))
23457 if (op1 == const0_rtx
23458 && (code == LT || code == GE))
23459 compare_code = code;
23460 else if (op1 == constm1_rtx)
23462 if (code == LE)
23463 compare_code = LT;
23464 else if (code == GT)
23465 compare_code = GE;
23469 /* Optimize dest = (op0 < 0) ? -1 : cf. */
23470 if (compare_code != UNKNOWN
23471 && GET_MODE (op0) == GET_MODE (out)
23472 && (cf == -1 || ct == -1))
23474 /* If the lea code below could be used, only optimize
23475 if it results in a two-insn sequence. */
23477 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
23478 || diff == 3 || diff == 5 || diff == 9)
23479 || (compare_code == LT && ct == -1)
23480 || (compare_code == GE && cf == -1))
23483 * notl op1 (if necessary)
23484 * sarl $31, op1
23485 * orl cf, op1
23487 if (ct != -1)
23489 cf = ct;
23490 ct = -1;
23491 code = reverse_condition (code);
23494 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23496 out = expand_simple_binop (mode, IOR,
23497 out, GEN_INT (cf),
23498 out, 1, OPTAB_DIRECT);
23499 if (out != operands[0])
23500 emit_move_insn (operands[0], out);
23502 return true;
23507 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23508 || diff == 3 || diff == 5 || diff == 9)
23509 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23510 && (mode != DImode
23511 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23514 * xorl dest,dest
23515 * cmpl op1,op2
23516 * setcc dest
23517 * lea cf(dest*(ct-cf)),dest
23519 * Size 14.
23521 * This also catches the degenerate setcc-only case.
23524 rtx tmp;
23525 int nops;
23527 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
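/* Added worked example: for (a < b ? 5 : 2) we have cf == 2 and diff == 3,
   so the 0/1 setcc result d is expanded below into d*2 + d + 2, i.e.
   roughly a single "lea 2(d,d,2), dest".  */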
23529 nops = 0;
23530 /* On x86_64 the lea instruction operates on Pmode, so we need
23531 to get the arithmetic done in the proper mode to match. */
23532 if (diff == 1)
23533 tmp = copy_rtx (out);
23534 else
23536 rtx out1;
23537 out1 = copy_rtx (out);
23538 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23539 nops++;
23540 if (diff & 1)
23542 tmp = gen_rtx_PLUS (mode, tmp, out1);
23543 nops++;
23546 if (cf != 0)
23548 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23549 nops++;
23551 if (!rtx_equal_p (tmp, out))
23553 if (nops == 1)
23554 out = force_operand (tmp, copy_rtx (out));
23555 else
23556 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23558 if (!rtx_equal_p (out, operands[0]))
23559 emit_move_insn (operands[0], copy_rtx (out));
23561 return true;
23565 * General case: Jumpful:
23566 * xorl dest,dest cmpl op1, op2
23567 * cmpl op1, op2 movl ct, dest
23568 * setcc dest jcc 1f
23569 * decl dest movl cf, dest
23570 * andl (cf-ct),dest 1:
23571 * addl ct,dest
23573 * Size 20. Size 14.
23575 * This is reasonably steep, but branch mispredict costs are
23576 * high on modern cpus, so consider failing only if optimizing
23577 * for space.
23580 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23581 && BRANCH_COST (optimize_insn_for_speed_p (),
23582 false) >= 2)
23584 if (cf == 0)
23586 machine_mode cmp_mode = GET_MODE (op0);
23587 enum rtx_code new_code;
23589 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23591 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23593 /* We may be reversing an unordered compare to a normal compare,
23594 which is not valid in general (we may convert a non-trapping
23595 condition to a trapping one); however, on i386 we currently
23596 emit all comparisons unordered. */
23597 new_code = reverse_condition_maybe_unordered (code);
23599 else
23601 new_code = ix86_reverse_condition (code, cmp_mode);
23602 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23603 compare_code = reverse_condition (compare_code);
23606 if (new_code != UNKNOWN)
23608 cf = ct;
23609 ct = 0;
23610 code = new_code;
23614 if (compare_code != UNKNOWN)
23616 /* notl op1 (if needed)
23617 sarl $31, op1
23618 andl (cf-ct), op1
23619 addl ct, op1
23621 For x < 0 (resp. x <= -1) there will be no notl,
23622 so if possible swap the constants to get rid of the
23623 complement.
23624 True/false will be -1/0 while code below (store flag
23625 followed by decrement) is 0/-1, so the constants need
23626 to be exchanged once more. */
23628 if (compare_code == GE || !cf)
23630 code = reverse_condition (code);
23631 compare_code = LT;
23633 else
23634 std::swap (ct, cf);
23636 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23638 else
23640 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23642 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23643 constm1_rtx,
23644 copy_rtx (out), 1, OPTAB_DIRECT);
23647 out = expand_simple_binop (mode, AND, copy_rtx (out),
23648 gen_int_mode (cf - ct, mode),
23649 copy_rtx (out), 1, OPTAB_DIRECT);
23650 if (ct)
23651 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23652 copy_rtx (out), 1, OPTAB_DIRECT);
23653 if (!rtx_equal_p (out, operands[0]))
23654 emit_move_insn (operands[0], copy_rtx (out));
23656 return true;
23660 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23662 /* Try a few things more with specific constants and a variable. */
23664 optab op;
23665 rtx var, orig_out, out, tmp;
23667 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23668 return false;
23670 /* If one of the two operands is an interesting constant, load a
23671 constant with the above and mask it in with a logical operation. */
23673 if (CONST_INT_P (operands[2]))
23675 var = operands[3];
23676 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23677 operands[3] = constm1_rtx, op = and_optab;
23678 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23679 operands[3] = const0_rtx, op = ior_optab;
23680 else
23681 return false;
23683 else if (CONST_INT_P (operands[3]))
23685 var = operands[2];
23686 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23687 operands[2] = constm1_rtx, op = and_optab;
23688 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
23689 operands[2] = const0_rtx, op = ior_optab;
23690 else
23691 return false;
23693 else
23694 return false;
23696 orig_out = operands[0];
23697 tmp = gen_reg_rtx (mode);
23698 operands[0] = tmp;
23700 /* Recurse to get the constant loaded. */
23701 if (!ix86_expand_int_movcc (operands))
23702 return false;
23704 /* Mask in the interesting variable. */
23705 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23706 OPTAB_WIDEN);
23707 if (!rtx_equal_p (out, orig_out))
23708 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23710 return true;
23714 * For comparison with above,
23716 * movl cf,dest
23717 * movl ct,tmp
23718 * cmpl op1,op2
23719 * cmovcc tmp,dest
23721 * Size 15.
23724 if (! nonimmediate_operand (operands[2], mode))
23725 operands[2] = force_reg (mode, operands[2]);
23726 if (! nonimmediate_operand (operands[3], mode))
23727 operands[3] = force_reg (mode, operands[3]);
23729 if (! register_operand (operands[2], VOIDmode)
23730 && (mode == QImode
23731 || ! register_operand (operands[3], VOIDmode)))
23732 operands[2] = force_reg (mode, operands[2]);
23734 if (mode == QImode
23735 && ! register_operand (operands[3], VOIDmode))
23736 operands[3] = force_reg (mode, operands[3]);
23738 emit_insn (compare_seq);
23739 emit_insn (gen_rtx_SET (operands[0],
23740 gen_rtx_IF_THEN_ELSE (mode,
23741 compare_op, operands[2],
23742 operands[3])));
23743 return true;
23746 /* Swap, force into registers, or otherwise massage the two operands
23747 to an sse comparison with a mask result. Thus we differ a bit from
23748 ix86_prepare_fp_compare_args which expects to produce a flags result.
23750 The DEST operand exists to help determine whether to commute commutative
23751 operators. The POP0/POP1 operands are updated in place. The new
23752 comparison code is returned, or UNKNOWN if not implementable. */
23754 static enum rtx_code
23755 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23756 rtx *pop0, rtx *pop1)
23758 switch (code)
23760 case LTGT:
23761 case UNEQ:
23762 /* AVX supports all the needed comparisons. */
23763 if (TARGET_AVX)
23764 break;
23765 /* We have no LTGT as an operator. We could implement it with
23766 NE & ORDERED, but this requires an extra temporary. It's
23767 not clear that it's worth it. */
23768 return UNKNOWN;
23770 case LT:
23771 case LE:
23772 case UNGT:
23773 case UNGE:
23774 /* These are supported directly. */
23775 break;
23777 case EQ:
23778 case NE:
23779 case UNORDERED:
23780 case ORDERED:
23781 /* AVX has 3 operand comparisons, no need to swap anything. */
23782 if (TARGET_AVX)
23783 break;
23784 /* For commutative operators, try to canonicalize the destination
23785 operand to be first in the comparison - this helps reload to
23786 avoid extra moves. */
23787 if (!dest || !rtx_equal_p (dest, *pop1))
23788 break;
23789 /* FALLTHRU */
23791 case GE:
23792 case GT:
23793 case UNLE:
23794 case UNLT:
23795 /* These are not supported directly before AVX, and furthermore
23796 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23797 comparison operands to transform into something that is
23798 supported. */
23799 std::swap (*pop0, *pop1);
23800 code = swap_condition (code);
23801 break;
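/* Added note: e.g. a GT comparison becomes LT with the operands swapped,
   matching the cmplt/cmple compare forms that pre-AVX SSE actually
   provides.  */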
23803 default:
23804 gcc_unreachable ();
23807 return code;
23810 /* Detect conditional moves that exactly match min/max operational
23811 semantics. Note that this is IEEE safe, as long as we don't
23812 interchange the operands.
23814 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23815 and TRUE if the operation is successful and instructions are emitted. */
23817 static bool
23818 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23819 rtx cmp_op1, rtx if_true, rtx if_false)
23821 machine_mode mode;
23822 bool is_min;
23823 rtx tmp;
23825 if (code == LT)
23827 else if (code == UNGE)
23828 std::swap (if_true, if_false);
23829 else
23830 return false;
23832 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23833 is_min = true;
23834 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23835 is_min = false;
23836 else
23837 return false;
23839 mode = GET_MODE (dest);
23841 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23842 but MODE may be a vector mode and thus not appropriate. */
23843 if (!flag_finite_math_only || flag_signed_zeros)
23845 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23846 rtvec v;
23848 if_true = force_reg (mode, if_true);
23849 v = gen_rtvec (2, if_true, if_false);
23850 tmp = gen_rtx_UNSPEC (mode, v, u);
23852 else
23854 code = is_min ? SMIN : SMAX;
23855 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23858 emit_insn (gen_rtx_SET (dest, tmp));
23859 return true;
23862 /* Expand an sse vector comparison. Return the register with the result. */
23864 static rtx
23865 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23866 rtx op_true, rtx op_false)
23868 machine_mode mode = GET_MODE (dest);
23869 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23871 /* In the general case the result of the comparison can differ in type from the operands. */
23872 machine_mode cmp_mode;
23874 /* In AVX512F the result of comparison is an integer mask. */
23875 bool maskcmp = false;
23876 rtx x;
23878 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23880 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23881 cmp_mode = int_mode_for_size (nbits, 0).require ();
23882 maskcmp = true;
23884 else
23885 cmp_mode = cmp_ops_mode;
23888 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23889 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23890 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23892 if (optimize
23893 || (maskcmp && cmp_mode != mode)
23894 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23895 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23896 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23898 /* Compare patterns for int modes are unspec in AVX512F only. */
23899 if (maskcmp && (code == GT || code == EQ))
23901 rtx (*gen)(rtx, rtx, rtx);
23903 switch (cmp_ops_mode)
23905 case E_V64QImode:
23906 gcc_assert (TARGET_AVX512BW);
23907 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23908 break;
23909 case E_V32HImode:
23910 gcc_assert (TARGET_AVX512BW);
23911 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23912 break;
23913 case E_V16SImode:
23914 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23915 break;
23916 case E_V8DImode:
23917 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23918 break;
23919 default:
23920 gen = NULL;
23923 if (gen)
23925 emit_insn (gen (dest, cmp_op0, cmp_op1));
23926 return dest;
23929 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23931 if (cmp_mode != mode && !maskcmp)
23933 x = force_reg (cmp_ops_mode, x);
23934 convert_move (dest, x, false);
23936 else
23937 emit_insn (gen_rtx_SET (dest, x));
23939 return dest;
23942 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23943 operations. This is used for both scalar and vector conditional moves. */
23945 void
23946 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23948 machine_mode mode = GET_MODE (dest);
23949 machine_mode cmpmode = GET_MODE (cmp);
23951 /* In AVX512F the result of comparison is an integer mask. */
23952 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23954 rtx t2, t3, x;
23956 /* If we have an integer mask and an FP value then we need
23957 to cast the mask to the FP mode. */
23958 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23960 cmp = force_reg (cmpmode, cmp);
23961 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23964 if (vector_all_ones_operand (op_true, mode)
23965 && rtx_equal_p (op_false, CONST0_RTX (mode))
23966 && !maskcmp)
23968 emit_insn (gen_rtx_SET (dest, cmp));
23970 else if (op_false == CONST0_RTX (mode)
23971 && !maskcmp)
23973 op_true = force_reg (mode, op_true);
23974 x = gen_rtx_AND (mode, cmp, op_true);
23975 emit_insn (gen_rtx_SET (dest, x));
23977 else if (op_true == CONST0_RTX (mode)
23978 && !maskcmp)
23980 op_false = force_reg (mode, op_false);
23981 x = gen_rtx_NOT (mode, cmp);
23982 x = gen_rtx_AND (mode, x, op_false);
23983 emit_insn (gen_rtx_SET (dest, x));
23985 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23986 && !maskcmp)
23988 op_false = force_reg (mode, op_false);
23989 x = gen_rtx_IOR (mode, cmp, op_false);
23990 emit_insn (gen_rtx_SET (dest, x));
23992 else if (TARGET_XOP
23993 && !maskcmp)
23995 op_true = force_reg (mode, op_true);
23997 if (!nonimmediate_operand (op_false, mode))
23998 op_false = force_reg (mode, op_false);
24000 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
24001 op_true,
24002 op_false)));
24004 else
24006 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24007 rtx d = dest;
24009 if (!nonimmediate_operand (op_true, mode))
24010 op_true = force_reg (mode, op_true);
24012 op_false = force_reg (mode, op_false);
24014 switch (mode)
24016 case E_V4SFmode:
24017 if (TARGET_SSE4_1)
24018 gen = gen_sse4_1_blendvps;
24019 break;
24020 case E_V2DFmode:
24021 if (TARGET_SSE4_1)
24022 gen = gen_sse4_1_blendvpd;
24023 break;
24024 case E_V16QImode:
24025 case E_V8HImode:
24026 case E_V4SImode:
24027 case E_V2DImode:
24028 if (TARGET_SSE4_1)
24030 gen = gen_sse4_1_pblendvb;
24031 if (mode != V16QImode)
24032 d = gen_reg_rtx (V16QImode);
24033 op_false = gen_lowpart (V16QImode, op_false);
24034 op_true = gen_lowpart (V16QImode, op_true);
24035 cmp = gen_lowpart (V16QImode, cmp);
24037 break;
24038 case E_V8SFmode:
24039 if (TARGET_AVX)
24040 gen = gen_avx_blendvps256;
24041 break;
24042 case E_V4DFmode:
24043 if (TARGET_AVX)
24044 gen = gen_avx_blendvpd256;
24045 break;
24046 case E_V32QImode:
24047 case E_V16HImode:
24048 case E_V8SImode:
24049 case E_V4DImode:
24050 if (TARGET_AVX2)
24052 gen = gen_avx2_pblendvb;
24053 if (mode != V32QImode)
24054 d = gen_reg_rtx (V32QImode);
24055 op_false = gen_lowpart (V32QImode, op_false);
24056 op_true = gen_lowpart (V32QImode, op_true);
24057 cmp = gen_lowpart (V32QImode, cmp);
24059 break;
24061 case E_V64QImode:
24062 gen = gen_avx512bw_blendmv64qi;
24063 break;
24064 case E_V32HImode:
24065 gen = gen_avx512bw_blendmv32hi;
24066 break;
24067 case E_V16SImode:
24068 gen = gen_avx512f_blendmv16si;
24069 break;
24070 case E_V8DImode:
24071 gen = gen_avx512f_blendmv8di;
24072 break;
24073 case E_V8DFmode:
24074 gen = gen_avx512f_blendmv8df;
24075 break;
24076 case E_V16SFmode:
24077 gen = gen_avx512f_blendmv16sf;
24078 break;
24080 default:
24081 break;
24084 if (gen != NULL)
24086 emit_insn (gen (d, op_false, op_true, cmp));
24087 if (d != dest)
24088 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
24090 else
24092 op_true = force_reg (mode, op_true);
24094 t2 = gen_reg_rtx (mode);
24095 if (optimize)
24096 t3 = gen_reg_rtx (mode);
24097 else
24098 t3 = dest;
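/* Added explanatory comment: with no blend instruction available the
   select is open-coded as
     t2   = op_true  &  cmp
     t3   = ~cmp     &  op_false
     dest = t3 | t2
   which is correct because CMP is a per-element all-ones/all-zeros mask.  */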
24100 x = gen_rtx_AND (mode, op_true, cmp);
24101 emit_insn (gen_rtx_SET (t2, x));
24103 x = gen_rtx_NOT (mode, cmp);
24104 x = gen_rtx_AND (mode, x, op_false);
24105 emit_insn (gen_rtx_SET (t3, x));
24107 x = gen_rtx_IOR (mode, t3, t2);
24108 emit_insn (gen_rtx_SET (dest, x));
24113 /* Expand a floating-point conditional move. Return true if successful. */
24115 bool
24116 ix86_expand_fp_movcc (rtx operands[])
24118 machine_mode mode = GET_MODE (operands[0]);
24119 enum rtx_code code = GET_CODE (operands[1]);
24120 rtx tmp, compare_op;
24121 rtx op0 = XEXP (operands[1], 0);
24122 rtx op1 = XEXP (operands[1], 1);
24124 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
24126 machine_mode cmode;
24128 /* Since we've no cmove for sse registers, don't force bad register
24129 allocation just to gain access to it. Deny movcc when the
24130 comparison mode doesn't match the move mode. */
24131 cmode = GET_MODE (op0);
24132 if (cmode == VOIDmode)
24133 cmode = GET_MODE (op1);
24134 if (cmode != mode)
24135 return false;
24137 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
24138 if (code == UNKNOWN)
24139 return false;
24141 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
24142 operands[2], operands[3]))
24143 return true;
24145 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
24146 operands[2], operands[3]);
24147 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
24148 return true;
24151 if (GET_MODE (op0) == TImode
24152 || (GET_MODE (op0) == DImode
24153 && !TARGET_64BIT))
24154 return false;
24156 /* The floating point conditional move instructions don't directly
24157 support conditions resulting from a signed integer comparison. */
24159 compare_op = ix86_expand_compare (code, op0, op1);
24160 if (!fcmov_comparison_operator (compare_op, VOIDmode))
24162 tmp = gen_reg_rtx (QImode);
24163 ix86_expand_setcc (tmp, code, op0, op1);
24165 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
24168 emit_insn (gen_rtx_SET (operands[0],
24169 gen_rtx_IF_THEN_ELSE (mode, compare_op,
24170 operands[2], operands[3])));
24172 return true;
24175 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
24177 static int
24178 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
24180 switch (code)
24182 case EQ:
24183 return 0;
24184 case LT:
24185 case LTU:
24186 return 1;
24187 case LE:
24188 case LEU:
24189 return 2;
24190 case NE:
24191 return 4;
24192 case GE:
24193 case GEU:
24194 return 5;
24195 case GT:
24196 case GTU:
24197 return 6;
24198 default:
24199 gcc_unreachable ();
24203 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
24205 static int
24206 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
24208 switch (code)
24210 case EQ:
24211 return 0x00;
24212 case NE:
24213 return 0x04;
24214 case GT:
24215 return 0x0e;
24216 case LE:
24217 return 0x02;
24218 case GE:
24219 return 0x0d;
24220 case LT:
24221 return 0x01;
24222 case UNLE:
24223 return 0x0a;
24224 case UNLT:
24225 return 0x09;
24226 case UNGE:
24227 return 0x05;
24228 case UNGT:
24229 return 0x06;
24230 case UNEQ:
24231 return 0x18;
24232 case LTGT:
24233 return 0x0c;
24234 case ORDERED:
24235 return 0x07;
24236 case UNORDERED:
24237 return 0x03;
24238 default:
24239 gcc_unreachable ();
24243 /* Return immediate value to be used in UNSPEC_PCMP
24244 for comparison CODE in MODE. */
24246 static int
24247 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
24249 if (FLOAT_MODE_P (mode))
24250 return ix86_fp_cmp_code_to_pcmp_immediate (code);
24251 return ix86_int_cmp_code_to_pcmp_immediate (code);
24254 /* Expand AVX-512 vector comparison. */
24256 bool
24257 ix86_expand_mask_vec_cmp (rtx operands[])
24259 machine_mode mask_mode = GET_MODE (operands[0]);
24260 machine_mode cmp_mode = GET_MODE (operands[2]);
24261 enum rtx_code code = GET_CODE (operands[1]);
24262 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
24263 int unspec_code;
24264 rtx unspec;
24266 switch (code)
24268 case LEU:
24269 case GTU:
24270 case GEU:
24271 case LTU:
24272 unspec_code = UNSPEC_UNSIGNED_PCMP;
24273 break;
24275 default:
24276 unspec_code = UNSPEC_PCMP;
24279 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
24280 operands[3], imm),
24281 unspec_code);
24282 emit_insn (gen_rtx_SET (operands[0], unspec));
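/* Added note: the UNSPEC emitted above corresponds to the AVX-512
   compare-into-mask instructions (vpcmp/vpcmpu for integer elements,
   vcmpps/vcmppd for float), with the particular comparison encoded in the
   immediate computed by ix86_cmp_code_to_pcmp_immediate.  */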
24284 return true;
24287 /* Expand fp vector comparison. */
24289 bool
24290 ix86_expand_fp_vec_cmp (rtx operands[])
24292 enum rtx_code code = GET_CODE (operands[1]);
24293 rtx cmp;
24295 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24296 &operands[2], &operands[3]);
24297 if (code == UNKNOWN)
24299 rtx temp;
24300 switch (GET_CODE (operands[1]))
24302 case LTGT:
24303 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
24304 operands[3], NULL, NULL);
24305 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
24306 operands[3], NULL, NULL);
24307 code = AND;
24308 break;
24309 case UNEQ:
24310 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
24311 operands[3], NULL, NULL);
24312 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
24313 operands[3], NULL, NULL);
24314 code = IOR;
24315 break;
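/* Added note: LTGT is synthesized as (ORDERED & NE) and UNEQ as
   (UNORDERED | EQ), since SSE provides no single compare implementing
   either predicate directly.  */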
24316 default:
24317 gcc_unreachable ();
24319 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24320 OPTAB_DIRECT);
24322 else
24323 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
24324 operands[1], operands[2]);
24326 if (operands[0] != cmp)
24327 emit_move_insn (operands[0], cmp);
24329 return true;
24332 static rtx
24333 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
24334 rtx op_true, rtx op_false, bool *negate)
24336 machine_mode data_mode = GET_MODE (dest);
24337 machine_mode mode = GET_MODE (cop0);
24338 rtx x;
24340 *negate = false;
24342 /* XOP supports all of the comparisons on all 128-bit vector int types. */
24343 if (TARGET_XOP
24344 && (mode == V16QImode || mode == V8HImode
24345 || mode == V4SImode || mode == V2DImode))
24347 else
24349 /* Canonicalize the comparison to EQ, GT, GTU. */
24350 switch (code)
24352 case EQ:
24353 case GT:
24354 case GTU:
24355 break;
24357 case NE:
24358 case LE:
24359 case LEU:
24360 code = reverse_condition (code);
24361 *negate = true;
24362 break;
24364 case GE:
24365 case GEU:
24366 code = reverse_condition (code);
24367 *negate = true;
24368 /* FALLTHRU */
24370 case LT:
24371 case LTU:
24372 std::swap (cop0, cop1);
24373 code = swap_condition (code);
24374 break;
24376 default:
24377 gcc_unreachable ();
24380 /* Only SSE4.1/SSE4.2 supports V2DImode. */
24381 if (mode == V2DImode)
24383 switch (code)
24385 case EQ:
24386 /* SSE4.1 supports EQ. */
24387 if (!TARGET_SSE4_1)
24388 return NULL;
24389 break;
24391 case GT:
24392 case GTU:
24393 /* SSE4.2 supports GT/GTU. */
24394 if (!TARGET_SSE4_2)
24395 return NULL;
24396 break;
24398 default:
24399 gcc_unreachable ();
24403 /* Unsigned parallel compare is not supported by the hardware.
24404 Play some tricks to turn this into a signed comparison
24405 against 0. */
24406 if (code == GTU)
24408 cop0 = force_reg (mode, cop0);
24410 switch (mode)
24412 case E_V16SImode:
24413 case E_V8DImode:
24414 case E_V8SImode:
24415 case E_V4DImode:
24416 case E_V4SImode:
24417 case E_V2DImode:
24419 rtx t1, t2, mask;
24420 rtx (*gen_sub3) (rtx, rtx, rtx);
24422 switch (mode)
24424 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
24425 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
24426 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
24427 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
24428 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
24429 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
24430 default:
24431 gcc_unreachable ();
24433 /* Subtract (-(INT MAX) - 1) from both operands to make
24434 them signed. */
24435 mask = ix86_build_signbit_mask (mode, true, false);
24436 t1 = gen_reg_rtx (mode);
24437 emit_insn (gen_sub3 (t1, cop0, mask));
24439 t2 = gen_reg_rtx (mode);
24440 emit_insn (gen_sub3 (t2, cop1, mask));
24442 cop0 = t1;
24443 cop1 = t2;
24444 code = GT;
24446 break;
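/* Added illustration: e.g. for V4SImode, GTU (a, b) becomes
   GT (a - 0x80000000, b - 0x80000000); biasing both operands by the
   sign-bit constant turns the unsigned order into the signed order that
   pcmpgtd implements.  */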
24448 case E_V64QImode:
24449 case E_V32HImode:
24450 case E_V32QImode:
24451 case E_V16HImode:
24452 case E_V16QImode:
24453 case E_V8HImode:
24454 /* Perform a parallel unsigned saturating subtraction. */
24455 x = gen_reg_rtx (mode);
24456 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
24457 cop1)));
24459 cop0 = x;
24460 cop1 = CONST0_RTX (mode);
24461 code = EQ;
24462 *negate = !*negate;
24463 break;
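/* Added illustration: for byte/word elements GTU (a, b) is rewritten as
   (a -us b) != 0, i.e. an unsigned saturating subtract (psubusb/psubusw)
   followed by a compare against zero; the NE is expressed as EQ plus
   toggling *NEGATE.  */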
24465 default:
24466 gcc_unreachable ();
24471 if (*negate)
24472 std::swap (op_true, op_false);
24474 /* Allow the comparison to be done in one mode, but the movcc to
24475 happen in another mode. */
24476 if (data_mode == mode)
24478 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
24479 op_true, op_false);
24481 else
24483 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24484 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24485 op_true, op_false);
24486 if (GET_MODE (x) == mode)
24487 x = gen_lowpart (data_mode, x);
24490 return x;
24493 /* Expand integer vector comparison. */
24495 bool
24496 ix86_expand_int_vec_cmp (rtx operands[])
24498 rtx_code code = GET_CODE (operands[1]);
24499 bool negate = false;
24500 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24501 operands[3], NULL, NULL, &negate);
24503 if (!cmp)
24504 return false;
24506 if (negate)
24507 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24508 CONST0_RTX (GET_MODE (cmp)),
24509 NULL, NULL, &negate);
24511 gcc_assert (!negate);
24513 if (operands[0] != cmp)
24514 emit_move_insn (operands[0], cmp);
24516 return true;
24519 /* Expand a floating-point vector conditional move; a vcond operation
24520 rather than a movcc operation. */
24522 bool
24523 ix86_expand_fp_vcond (rtx operands[])
24525 enum rtx_code code = GET_CODE (operands[3]);
24526 rtx cmp;
24528 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24529 &operands[4], &operands[5]);
24530 if (code == UNKNOWN)
24532 rtx temp;
24533 switch (GET_CODE (operands[3]))
24535 case LTGT:
24536 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24537 operands[5], operands[0], operands[0]);
24538 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24539 operands[5], operands[1], operands[2]);
24540 code = AND;
24541 break;
24542 case UNEQ:
24543 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24544 operands[5], operands[0], operands[0]);
24545 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24546 operands[5], operands[1], operands[2]);
24547 code = IOR;
24548 break;
24549 default:
24550 gcc_unreachable ();
24552 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24553 OPTAB_DIRECT);
24554 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24555 return true;
24558 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24559 operands[5], operands[1], operands[2]))
24560 return true;
24562 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24563 operands[1], operands[2]);
24564 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24565 return true;
24568 /* Expand a signed/unsigned integral vector conditional move. */
24570 bool
24571 ix86_expand_int_vcond (rtx operands[])
24573 machine_mode data_mode = GET_MODE (operands[0]);
24574 machine_mode mode = GET_MODE (operands[4]);
24575 enum rtx_code code = GET_CODE (operands[3]);
24576 bool negate = false;
24577 rtx x, cop0, cop1;
24579 cop0 = operands[4];
24580 cop1 = operands[5];
24582 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24583 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
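/* Added note: e.g. for V4SImode these become a single psrad $31 or
   psrld $31 respectively, avoiding a separate pcmpgtd mask.  */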
24584 if ((code == LT || code == GE)
24585 && data_mode == mode
24586 && cop1 == CONST0_RTX (mode)
24587 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24588 && GET_MODE_UNIT_SIZE (data_mode) > 1
24589 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24590 && (GET_MODE_SIZE (data_mode) == 16
24591 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24593 rtx negop = operands[2 - (code == LT)];
24594 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24595 if (negop == CONST1_RTX (data_mode))
24597 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24598 operands[0], 1, OPTAB_DIRECT);
24599 if (res != operands[0])
24600 emit_move_insn (operands[0], res);
24601 return true;
24603 else if (GET_MODE_INNER (data_mode) != DImode
24604 && vector_all_ones_operand (negop, data_mode))
24606 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24607 operands[0], 0, OPTAB_DIRECT);
24608 if (res != operands[0])
24609 emit_move_insn (operands[0], res);
24610 return true;
24614 if (!nonimmediate_operand (cop1, mode))
24615 cop1 = force_reg (mode, cop1);
24616 if (!general_operand (operands[1], data_mode))
24617 operands[1] = force_reg (data_mode, operands[1]);
24618 if (!general_operand (operands[2], data_mode))
24619 operands[2] = force_reg (data_mode, operands[2]);
24621 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24622 operands[1], operands[2], &negate);
24624 if (!x)
24625 return false;
24627 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24628 operands[2-negate]);
24629 return true;
24632 /* AVX512F does support 64-byte integer vector operations,
24633 thus the longest vector we are faced with is V64QImode. */
24634 #define MAX_VECT_LEN 64
24636 struct expand_vec_perm_d
24638 rtx target, op0, op1;
24639 unsigned char perm[MAX_VECT_LEN];
24640 machine_mode vmode;
24641 unsigned char nelt;
24642 bool one_operand_p;
24643 bool testing_p;
24646 static bool
24647 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24648 struct expand_vec_perm_d *d)
24650 /* ix86_expand_vec_perm_vpermt2 is called from both the const and non-const
24651 expanders, so the arguments are either in d, or in op0, op1, etc. */
24652 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24653 machine_mode maskmode = mode;
24654 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24656 switch (mode)
24658 case E_V8HImode:
24659 if (TARGET_AVX512VL && TARGET_AVX512BW)
24660 gen = gen_avx512vl_vpermt2varv8hi3;
24661 break;
24662 case E_V16HImode:
24663 if (TARGET_AVX512VL && TARGET_AVX512BW)
24664 gen = gen_avx512vl_vpermt2varv16hi3;
24665 break;
24666 case E_V64QImode:
24667 if (TARGET_AVX512VBMI)
24668 gen = gen_avx512bw_vpermt2varv64qi3;
24669 break;
24670 case E_V32HImode:
24671 if (TARGET_AVX512BW)
24672 gen = gen_avx512bw_vpermt2varv32hi3;
24673 break;
24674 case E_V4SImode:
24675 if (TARGET_AVX512VL)
24676 gen = gen_avx512vl_vpermt2varv4si3;
24677 break;
24678 case E_V8SImode:
24679 if (TARGET_AVX512VL)
24680 gen = gen_avx512vl_vpermt2varv8si3;
24681 break;
24682 case E_V16SImode:
24683 if (TARGET_AVX512F)
24684 gen = gen_avx512f_vpermt2varv16si3;
24685 break;
24686 case E_V4SFmode:
24687 if (TARGET_AVX512VL)
24689 gen = gen_avx512vl_vpermt2varv4sf3;
24690 maskmode = V4SImode;
24692 break;
24693 case E_V8SFmode:
24694 if (TARGET_AVX512VL)
24696 gen = gen_avx512vl_vpermt2varv8sf3;
24697 maskmode = V8SImode;
24699 break;
24700 case E_V16SFmode:
24701 if (TARGET_AVX512F)
24703 gen = gen_avx512f_vpermt2varv16sf3;
24704 maskmode = V16SImode;
24706 break;
24707 case E_V2DImode:
24708 if (TARGET_AVX512VL)
24709 gen = gen_avx512vl_vpermt2varv2di3;
24710 break;
24711 case E_V4DImode:
24712 if (TARGET_AVX512VL)
24713 gen = gen_avx512vl_vpermt2varv4di3;
24714 break;
24715 case E_V8DImode:
24716 if (TARGET_AVX512F)
24717 gen = gen_avx512f_vpermt2varv8di3;
24718 break;
24719 case E_V2DFmode:
24720 if (TARGET_AVX512VL)
24722 gen = gen_avx512vl_vpermt2varv2df3;
24723 maskmode = V2DImode;
24725 break;
24726 case E_V4DFmode:
24727 if (TARGET_AVX512VL)
24729 gen = gen_avx512vl_vpermt2varv4df3;
24730 maskmode = V4DImode;
24732 break;
24733 case E_V8DFmode:
24734 if (TARGET_AVX512F)
24736 gen = gen_avx512f_vpermt2varv8df3;
24737 maskmode = V8DImode;
24739 break;
24740 default:
24741 break;
24744 if (gen == NULL)
24745 return false;
24747 /* ix86_expand_vec_perm_vpermt2 is called from both the const and non-const
24748 expanders, so the arguments are either in d, or in op0, op1, etc. */
24749 if (d)
24751 rtx vec[64];
24752 target = d->target;
24753 op0 = d->op0;
24754 op1 = d->op1;
24755 for (int i = 0; i < d->nelt; ++i)
24756 vec[i] = GEN_INT (d->perm[i]);
24757 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24760 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24761 return true;
24764 /* Expand a variable vector permutation. */
24766 void
24767 ix86_expand_vec_perm (rtx operands[])
24769 rtx target = operands[0];
24770 rtx op0 = operands[1];
24771 rtx op1 = operands[2];
24772 rtx mask = operands[3];
24773 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24774 machine_mode mode = GET_MODE (op0);
24775 machine_mode maskmode = GET_MODE (mask);
24776 int w, e, i;
24777 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24779 /* Number of elements in the vector. */
24780 w = GET_MODE_NUNITS (mode);
24781 e = GET_MODE_UNIT_SIZE (mode);
24782 gcc_assert (w <= 64);
24784 if (TARGET_AVX512F && one_operand_shuffle)
24786 rtx (*gen) (rtx, rtx, rtx) = NULL;
24787 switch (mode)
24789 case E_V16SImode:
24790 gen = gen_avx512f_permvarv16si;
24791 break;
24792 case E_V16SFmode:
24793 gen = gen_avx512f_permvarv16sf;
24794 break;
24795 case E_V8DImode:
24796 gen = gen_avx512f_permvarv8di;
24797 break;
24798 case E_V8DFmode:
24799 gen = gen_avx512f_permvarv8df;
24800 break;
24801 default:
24802 break;
24804 if (gen != NULL)
24806 emit_insn (gen (target, op0, mask));
24807 return;
24811 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24812 return;
24814 if (TARGET_AVX2)
24816 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24818 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24819 a constant shuffle operand. With a tiny bit of effort we can
24820 use VPERMD instead. A re-interpretation stall for V4DFmode is
24821 unfortunate but there's no avoiding it.
24822 Similarly, for V16HImode we don't have instructions for variable
24823 shuffling, while for V32QImode we can, after preparing suitable
24824 masks, use vpshufb; vpshufb; vpermq; vpor. */
24826 if (mode == V16HImode)
24828 maskmode = mode = V32QImode;
24829 w = 32;
24830 e = 1;
24832 else
24834 maskmode = mode = V8SImode;
24835 w = 8;
24836 e = 4;
24838 t1 = gen_reg_rtx (maskmode);
24840 /* Replicate the low bits of the V4DImode mask into V8SImode:
24841 mask = { A B C D }
24842 t1 = { A A B B C C D D }. */
24843 for (i = 0; i < w / 2; ++i)
24844 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24845 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24846 vt = force_reg (maskmode, vt);
24847 mask = gen_lowpart (maskmode, mask);
24848 if (maskmode == V8SImode)
24849 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24850 else
24851 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24853 /* Multiply the shuffle indices by two. */
24854 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24855 OPTAB_DIRECT);
24857 /* Add one to the odd shuffle indices:
24858 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24859 for (i = 0; i < w / 2; ++i)
24861 vec[i * 2] = const0_rtx;
24862 vec[i * 2 + 1] = const1_rtx;
24864 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24865 vt = validize_mem (force_const_mem (maskmode, vt));
24866 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24867 OPTAB_DIRECT);
24869 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24870 operands[3] = mask = t1;
24871 target = gen_reg_rtx (mode);
24872 op0 = gen_lowpart (mode, op0);
24873 op1 = gen_lowpart (mode, op1);
24876 switch (mode)
24878 case E_V8SImode:
24879 /* The VPERMD and VPERMPS instructions already properly ignore
24880 the high bits of the shuffle elements. No need for us to
24881 perform an AND ourselves. */
24882 if (one_operand_shuffle)
24884 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24885 if (target != operands[0])
24886 emit_move_insn (operands[0],
24887 gen_lowpart (GET_MODE (operands[0]), target));
24889 else
24891 t1 = gen_reg_rtx (V8SImode);
24892 t2 = gen_reg_rtx (V8SImode);
24893 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24894 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24895 goto merge_two;
24897 return;
24899 case E_V8SFmode:
24900 mask = gen_lowpart (V8SImode, mask);
24901 if (one_operand_shuffle)
24902 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24903 else
24905 t1 = gen_reg_rtx (V8SFmode);
24906 t2 = gen_reg_rtx (V8SFmode);
24907 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24908 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24909 goto merge_two;
24911 return;
24913 case E_V4SImode:
24914 /* By combining the two 128-bit input vectors into one 256-bit
24915 input vector, we can use VPERMD and VPERMPS for the full
24916 two-operand shuffle. */
24917 t1 = gen_reg_rtx (V8SImode);
24918 t2 = gen_reg_rtx (V8SImode);
24919 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24920 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24921 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24922 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24923 return;
24925 case E_V4SFmode:
24926 t1 = gen_reg_rtx (V8SFmode);
24927 t2 = gen_reg_rtx (V8SImode);
24928 mask = gen_lowpart (V4SImode, mask);
24929 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24930 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24931 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24932 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24933 return;
24935 case E_V32QImode:
24936 t1 = gen_reg_rtx (V32QImode);
24937 t2 = gen_reg_rtx (V32QImode);
24938 t3 = gen_reg_rtx (V32QImode);
24939 vt2 = GEN_INT (-128);
24940 vt = gen_const_vec_duplicate (V32QImode, vt2);
24941 vt = force_reg (V32QImode, vt);
24942 for (i = 0; i < 32; i++)
24943 vec[i] = i < 16 ? vt2 : const0_rtx;
24944 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24945 vt2 = force_reg (V32QImode, vt2);
24946 /* From mask create two adjusted masks, which contain the same
24947 bits as mask in the low 7 bits of each vector element.
24948 The first mask will have the most significant bit clear
24949 if it requests element from the same 128-bit lane
24950 and MSB set if it requests element from the other 128-bit lane.
24951 The second mask will have the opposite values of the MSB,
24952 and additionally will have its 128-bit lanes swapped.
24953 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24954 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24955 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24956 stands for other 12 bytes. */
24957 /* The bit that says whether an element is from the same lane or the
24958 other lane is bit 4, so shift it up by 3 to the MSB position. */
24959 t5 = gen_reg_rtx (V4DImode);
24960 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24961 GEN_INT (3)));
24962 /* Clear MSB bits from the mask just in case it had them set. */
24963 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24964 /* After this t1 will have the MSB set for elements from the other lane. */
24965 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24966 /* Clear bits other than MSB. */
24967 emit_insn (gen_andv32qi3 (t1, t1, vt));
24968 /* Or in the lower bits from mask into t3. */
24969 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24970 /* And invert MSB bits in t1, so MSB is set for elements from the same
24971 lane. */
24972 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24973 /* Swap 128-bit lanes in t3. */
24974 t6 = gen_reg_rtx (V4DImode);
24975 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24976 const2_rtx, GEN_INT (3),
24977 const0_rtx, const1_rtx));
24978 /* And or in the lower bits from mask into t1. */
24979 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24980 if (one_operand_shuffle)
24982 /* Each of these shuffles will put 0s in places where an
24983 element from the other 128-bit lane is needed; otherwise
24984 it will shuffle in the requested value. */
24985 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24986 gen_lowpart (V32QImode, t6)));
24987 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24988 /* For t3 the 128-bit lanes are swapped again. */
24989 t7 = gen_reg_rtx (V4DImode);
24990 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24991 const2_rtx, GEN_INT (3),
24992 const0_rtx, const1_rtx));
24993 /* And oring both together leads to the result. */
24994 emit_insn (gen_iorv32qi3 (target, t1,
24995 gen_lowpart (V32QImode, t7)));
24996 if (target != operands[0])
24997 emit_move_insn (operands[0],
24998 gen_lowpart (GET_MODE (operands[0]), target));
24999 return;
25002 t4 = gen_reg_rtx (V32QImode);
25003 /* Similar to the one_operand_shuffle code above,
25004 just repeated twice, once for each operand. The merge_two:
25005 code will merge the two results together. */
25006 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
25007 gen_lowpart (V32QImode, t6)));
25008 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
25009 gen_lowpart (V32QImode, t6)));
25010 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
25011 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
25012 t7 = gen_reg_rtx (V4DImode);
25013 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
25014 const2_rtx, GEN_INT (3),
25015 const0_rtx, const1_rtx));
25016 t8 = gen_reg_rtx (V4DImode);
25017 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
25018 const2_rtx, GEN_INT (3),
25019 const0_rtx, const1_rtx));
25020 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
25021 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
25022 t1 = t4;
25023 t2 = t3;
25024 goto merge_two;
25026 default:
25027 gcc_assert (GET_MODE_SIZE (mode) <= 16);
25028 break;
25032 if (TARGET_XOP)
25034 /* The XOP VPPERM insn supports three inputs. By ignoring the
25035 one_operand_shuffle special case, we avoid creating another
25036 set of constant vectors in memory. */
25037 one_operand_shuffle = false;
25039 /* mask = mask & {2*w-1, ...} */
25040 vt = GEN_INT (2*w - 1);
25042 else
25044 /* mask = mask & {w-1, ...} */
25045 vt = GEN_INT (w - 1);
25048 vt = gen_const_vec_duplicate (maskmode, vt);
25049 mask = expand_simple_binop (maskmode, AND, mask, vt,
25050 NULL_RTX, 0, OPTAB_DIRECT);
25052 /* For non-QImode operations, convert the word permutation control
25053 into a byte permutation control. */
25054 if (mode != V16QImode)
25056 mask = expand_simple_binop (maskmode, ASHIFT, mask,
25057 GEN_INT (exact_log2 (e)),
25058 NULL_RTX, 0, OPTAB_DIRECT);
25060 /* Convert mask to vector of chars. */
25061 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
25063 /* Replicate each of the input bytes into byte positions:
25064 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
25065 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
25066 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
25067 for (i = 0; i < 16; ++i)
25068 vec[i] = GEN_INT (i/e * e);
25069 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25070 vt = validize_mem (force_const_mem (V16QImode, vt));
25071 if (TARGET_XOP)
25072 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
25073 else
25074 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
25076 /* Convert it into the byte positions by doing
25077 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
25078 for (i = 0; i < 16; ++i)
25079 vec[i] = GEN_INT (i % e);
25080 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
25081 vt = validize_mem (force_const_mem (V16QImode, vt));
25082 emit_insn (gen_addv16qi3 (mask, mask, vt));
25085 /* The actual shuffle operations all operate on V16QImode. */
25086 op0 = gen_lowpart (V16QImode, op0);
25087 op1 = gen_lowpart (V16QImode, op1);
25089 if (TARGET_XOP)
25091 if (GET_MODE (target) != V16QImode)
25092 target = gen_reg_rtx (V16QImode);
25093 emit_insn (gen_xop_pperm (target, op0, op1, mask));
25094 if (target != operands[0])
25095 emit_move_insn (operands[0],
25096 gen_lowpart (GET_MODE (operands[0]), target));
25098 else if (one_operand_shuffle)
25100 if (GET_MODE (target) != V16QImode)
25101 target = gen_reg_rtx (V16QImode);
25102 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
25103 if (target != operands[0])
25104 emit_move_insn (operands[0],
25105 gen_lowpart (GET_MODE (operands[0]), target));
25107 else
25109 rtx xops[6];
25110 bool ok;
25112 /* Shuffle the two input vectors independently. */
25113 t1 = gen_reg_rtx (V16QImode);
25114 t2 = gen_reg_rtx (V16QImode);
25115 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
25116 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
25118 merge_two:
25119 /* Then merge them together. The key is whether any given control
25120 element contained a bit set that indicates the second word. */
25121 mask = operands[3];
25122 vt = GEN_INT (w);
25123 if (maskmode == V2DImode && !TARGET_SSE4_1)
25125 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
25126 more shuffle to convert the V2DI input mask into a V4SI
25127 input mask. At that point the masking done by expand_int_vcond
25128 will work as desired. */
25129 rtx t3 = gen_reg_rtx (V4SImode);
25130 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
25131 const0_rtx, const0_rtx,
25132 const2_rtx, const2_rtx));
25133 mask = t3;
25134 maskmode = V4SImode;
25135 e = w = 4;
25138 vt = gen_const_vec_duplicate (maskmode, vt);
25139 vt = force_reg (maskmode, vt);
25140 mask = expand_simple_binop (maskmode, AND, mask, vt,
25141 NULL_RTX, 0, OPTAB_DIRECT);
25143 if (GET_MODE (target) != mode)
25144 target = gen_reg_rtx (mode);
25145 xops[0] = target;
25146 xops[1] = gen_lowpart (mode, t2);
25147 xops[2] = gen_lowpart (mode, t1);
25148 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
25149 xops[4] = mask;
25150 xops[5] = vt;
25151 ok = ix86_expand_int_vcond (xops);
25152 gcc_assert (ok);
25153 if (target != operands[0])
25154 emit_move_insn (operands[0],
25155 gen_lowpart (GET_MODE (operands[0]), target));
25159 /* Unpack SRC into DEST, the next wider integer vector type. UNSIGNED_P is
25160 true if we should do zero extension, else sign extension. HIGH_P is
25161 true if we want the N/2 high elements, else the low elements. */
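/* For instance (an illustrative sketch, not an exhaustive list of cases):
   with SRC in V8HImode, UNSIGNED_P true and HIGH_P false, DEST receives the
   low four elements zero-extended to V4SImode; on SSE4.1 this maps to a
   single pmovzxwd, while the pre-SSE4.1 path below interleaves with zero.  */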
25163 void
25164 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
25166 machine_mode imode = GET_MODE (src);
25167 rtx tmp;
25169 if (TARGET_SSE4_1)
25171 rtx (*unpack)(rtx, rtx);
25172 rtx (*extract)(rtx, rtx) = NULL;
25173 machine_mode halfmode = BLKmode;
25175 switch (imode)
25177 case E_V64QImode:
25178 if (unsigned_p)
25179 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
25180 else
25181 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
25182 halfmode = V32QImode;
25183 extract
25184 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
25185 break;
25186 case E_V32QImode:
25187 if (unsigned_p)
25188 unpack = gen_avx2_zero_extendv16qiv16hi2;
25189 else
25190 unpack = gen_avx2_sign_extendv16qiv16hi2;
25191 halfmode = V16QImode;
25192 extract
25193 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
25194 break;
25195 case E_V32HImode:
25196 if (unsigned_p)
25197 unpack = gen_avx512f_zero_extendv16hiv16si2;
25198 else
25199 unpack = gen_avx512f_sign_extendv16hiv16si2;
25200 halfmode = V16HImode;
25201 extract
25202 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
25203 break;
25204 case E_V16HImode:
25205 if (unsigned_p)
25206 unpack = gen_avx2_zero_extendv8hiv8si2;
25207 else
25208 unpack = gen_avx2_sign_extendv8hiv8si2;
25209 halfmode = V8HImode;
25210 extract
25211 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
25212 break;
25213 case E_V16SImode:
25214 if (unsigned_p)
25215 unpack = gen_avx512f_zero_extendv8siv8di2;
25216 else
25217 unpack = gen_avx512f_sign_extendv8siv8di2;
25218 halfmode = V8SImode;
25219 extract
25220 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
25221 break;
25222 case E_V8SImode:
25223 if (unsigned_p)
25224 unpack = gen_avx2_zero_extendv4siv4di2;
25225 else
25226 unpack = gen_avx2_sign_extendv4siv4di2;
25227 halfmode = V4SImode;
25228 extract
25229 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
25230 break;
25231 case E_V16QImode:
25232 if (unsigned_p)
25233 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
25234 else
25235 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
25236 break;
25237 case E_V8HImode:
25238 if (unsigned_p)
25239 unpack = gen_sse4_1_zero_extendv4hiv4si2;
25240 else
25241 unpack = gen_sse4_1_sign_extendv4hiv4si2;
25242 break;
25243 case E_V4SImode:
25244 if (unsigned_p)
25245 unpack = gen_sse4_1_zero_extendv2siv2di2;
25246 else
25247 unpack = gen_sse4_1_sign_extendv2siv2di2;
25248 break;
25249 default:
25250 gcc_unreachable ();
25253 if (GET_MODE_SIZE (imode) >= 32)
25255 tmp = gen_reg_rtx (halfmode);
25256 emit_insn (extract (tmp, src));
25258 else if (high_p)
25260 /* Shift higher 8 bytes to lower 8 bytes. */
25261 tmp = gen_reg_rtx (V1TImode);
25262 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
25263 GEN_INT (64)));
25264 tmp = gen_lowpart (imode, tmp);
25266 else
25267 tmp = src;
25269 emit_insn (unpack (dest, tmp));
25271 else
25273 rtx (*unpack)(rtx, rtx, rtx);
25275 switch (imode)
25277 case E_V16QImode:
25278 if (high_p)
25279 unpack = gen_vec_interleave_highv16qi;
25280 else
25281 unpack = gen_vec_interleave_lowv16qi;
25282 break;
25283 case E_V8HImode:
25284 if (high_p)
25285 unpack = gen_vec_interleave_highv8hi;
25286 else
25287 unpack = gen_vec_interleave_lowv8hi;
25288 break;
25289 case E_V4SImode:
25290 if (high_p)
25291 unpack = gen_vec_interleave_highv4si;
25292 else
25293 unpack = gen_vec_interleave_lowv4si;
25294 break;
25295 default:
25296 gcc_unreachable ();
25299 if (unsigned_p)
25300 tmp = force_reg (imode, CONST0_RTX (imode));
25301 else
25302 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
25303 src, pc_rtx, pc_rtx);
25305 rtx tmp2 = gen_reg_rtx (imode);
25306 emit_insn (unpack (tmp2, src, tmp));
25307 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
25311 /* Expand conditional increment or decrement using adc/sbb instructions.
25312 The default case using setcc followed by the conditional move can be
25313 done by generic code. */
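/* Illustrative example (assumed source, not taken from a testcase): for
   something like "x += (a < b)" the comparison is arranged so that the carry
   flag encodes the condition, and the increment becomes a single adc/sbb of a
   0 or -1 immediate, avoiding the setcc/cmov sequence.  */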
25314 bool
25315 ix86_expand_int_addcc (rtx operands[])
25317 enum rtx_code code = GET_CODE (operands[1]);
25318 rtx flags;
25319 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
25320 rtx compare_op;
25321 rtx val = const0_rtx;
25322 bool fpcmp = false;
25323 machine_mode mode;
25324 rtx op0 = XEXP (operands[1], 0);
25325 rtx op1 = XEXP (operands[1], 1);
25327 if (operands[3] != const1_rtx
25328 && operands[3] != constm1_rtx)
25329 return false;
25330 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
25331 return false;
25332 code = GET_CODE (compare_op);
25334 flags = XEXP (compare_op, 0);
25336 if (GET_MODE (flags) == CCFPmode)
25338 fpcmp = true;
25339 code = ix86_fp_compare_code_to_integer (code);
25342 if (code != LTU)
25344 val = constm1_rtx;
25345 if (fpcmp)
25346 PUT_CODE (compare_op,
25347 reverse_condition_maybe_unordered
25348 (GET_CODE (compare_op)));
25349 else
25350 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
25353 mode = GET_MODE (operands[0]);
25355 /* Construct either adc or sbb insn. */
25356 if ((code == LTU) == (operands[3] == constm1_rtx))
25358 switch (mode)
25360 case E_QImode:
25361 insn = gen_subqi3_carry;
25362 break;
25363 case E_HImode:
25364 insn = gen_subhi3_carry;
25365 break;
25366 case E_SImode:
25367 insn = gen_subsi3_carry;
25368 break;
25369 case E_DImode:
25370 insn = gen_subdi3_carry;
25371 break;
25372 default:
25373 gcc_unreachable ();
25376 else
25378 switch (mode)
25380 case E_QImode:
25381 insn = gen_addqi3_carry;
25382 break;
25383 case E_HImode:
25384 insn = gen_addhi3_carry;
25385 break;
25386 case E_SImode:
25387 insn = gen_addsi3_carry;
25388 break;
25389 case E_DImode:
25390 insn = gen_adddi3_carry;
25391 break;
25392 default:
25393 gcc_unreachable ();
25396 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
25398 return true;
25402 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
25403 but works for floating point parameters and non-offsettable memories.
25404 For pushes, it returns just stack offsets; the values will be saved
25405 in the right order. At most four parts are generated. */
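/* As an illustrative example: on a 32-bit target a DFmode operand is split
   into two SImode parts, while XFmode yields three and TFmode four; on a
   64-bit target XFmode and TFmode both split into two parts, with an SImode
   or DImode upper half respectively.  */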
25407 static int
25408 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
25410 int size;
25412 if (!TARGET_64BIT)
25413 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
25414 else
25415 size = (GET_MODE_SIZE (mode) + 4) / 8;
25417 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
25418 gcc_assert (size >= 2 && size <= 4);
25420 /* Optimize constant pool reference to immediates. This is used by fp
25421 moves, that force all constants to memory to allow combining. */
25422 if (MEM_P (operand) && MEM_READONLY_P (operand))
25423 operand = avoid_constant_pool_reference (operand);
25425 if (MEM_P (operand) && !offsettable_memref_p (operand))
25427 /* The only non-offsettable memories we handle are pushes. */
25428 int ok = push_operand (operand, VOIDmode);
25430 gcc_assert (ok);
25432 operand = copy_rtx (operand);
25433 PUT_MODE (operand, word_mode);
25434 parts[0] = parts[1] = parts[2] = parts[3] = operand;
25435 return size;
25438 if (GET_CODE (operand) == CONST_VECTOR)
25440 scalar_int_mode imode = int_mode_for_mode (mode).require ();
25441 /* Caution: if we looked through a constant pool memory above,
25442 the operand may actually have a different mode now. That's
25443 ok, since we want to pun this all the way back to an integer. */
25444 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
25445 gcc_assert (operand != NULL);
25446 mode = imode;
25449 if (!TARGET_64BIT)
25451 if (mode == DImode)
25452 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25453 else
25455 int i;
25457 if (REG_P (operand))
25459 gcc_assert (reload_completed);
25460 for (i = 0; i < size; i++)
25461 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
25463 else if (offsettable_memref_p (operand))
25465 operand = adjust_address (operand, SImode, 0);
25466 parts[0] = operand;
25467 for (i = 1; i < size; i++)
25468 parts[i] = adjust_address (operand, SImode, 4 * i);
25470 else if (CONST_DOUBLE_P (operand))
25472 const REAL_VALUE_TYPE *r;
25473 long l[4];
25475 r = CONST_DOUBLE_REAL_VALUE (operand);
25476 switch (mode)
25478 case E_TFmode:
25479 real_to_target (l, r, mode);
25480 parts[3] = gen_int_mode (l[3], SImode);
25481 parts[2] = gen_int_mode (l[2], SImode);
25482 break;
25483 case E_XFmode:
25484 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25485 long double may not be 80-bit. */
25486 real_to_target (l, r, mode);
25487 parts[2] = gen_int_mode (l[2], SImode);
25488 break;
25489 case E_DFmode:
25490 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25491 break;
25492 default:
25493 gcc_unreachable ();
25495 parts[1] = gen_int_mode (l[1], SImode);
25496 parts[0] = gen_int_mode (l[0], SImode);
25498 else
25499 gcc_unreachable ();
25502 else
25504 if (mode == TImode)
25505 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25506 if (mode == XFmode || mode == TFmode)
25508 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25509 if (REG_P (operand))
25511 gcc_assert (reload_completed);
25512 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25513 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25515 else if (offsettable_memref_p (operand))
25517 operand = adjust_address (operand, DImode, 0);
25518 parts[0] = operand;
25519 parts[1] = adjust_address (operand, upper_mode, 8);
25521 else if (CONST_DOUBLE_P (operand))
25523 long l[4];
25525 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25527 /* real_to_target puts 32-bit pieces in each long. */
25528 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25529 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25530 << 32), DImode);
25532 if (upper_mode == SImode)
25533 parts[1] = gen_int_mode (l[2], SImode);
25534 else
25535 parts[1]
25536 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25537 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25538 << 32), DImode);
25540 else
25541 gcc_unreachable ();
25545 return size;
25548 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25549 All required insns are emitted here. Operands 2 and up receive the
25550 destination parts and operands 6 and up the corresponding source parts,
25551 in the correct order. */
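/* A hedged sketch of the effect: a DImode move on a 32-bit target becomes two
   SImode moves, XFmode three and TFmode four; the parts are reordered (or the
   source address is loaded with lea) when a destination register overlaps the
   source address, so nothing is clobbered before it is read.  */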
25553 void
25554 ix86_split_long_move (rtx operands[])
25556 rtx part[2][4];
25557 int nparts, i, j;
25558 int push = 0;
25559 int collisions = 0;
25560 machine_mode mode = GET_MODE (operands[0]);
25561 bool collisionparts[4];
25563 /* The DFmode expanders may ask us to move a double.
25564 For a 64-bit target this is a single move. By hiding that fact
25565 here we simplify the i386.md splitters. */
25566 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25568 /* Optimize constant pool reference to immediates. This is used by
25569 fp moves, that force all constants to memory to allow combining. */
25571 if (MEM_P (operands[1])
25572 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25573 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25574 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25575 if (push_operand (operands[0], VOIDmode))
25577 operands[0] = copy_rtx (operands[0]);
25578 PUT_MODE (operands[0], word_mode);
25580 else
25581 operands[0] = gen_lowpart (DImode, operands[0]);
25582 operands[1] = gen_lowpart (DImode, operands[1]);
25583 emit_move_insn (operands[0], operands[1]);
25584 return;
25587 /* The only non-offsettable memory we handle is push. */
25588 if (push_operand (operands[0], VOIDmode))
25589 push = 1;
25590 else
25591 gcc_assert (!MEM_P (operands[0])
25592 || offsettable_memref_p (operands[0]));
25594 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25595 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25597 /* When emitting push, take care for source operands on the stack. */
25598 if (push && MEM_P (operands[1])
25599 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25601 rtx src_base = XEXP (part[1][nparts - 1], 0);
25603 /* Compensate for the stack decrement by 4. */
25604 if (!TARGET_64BIT && nparts == 3
25605 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25606 src_base = plus_constant (Pmode, src_base, 4);
25608 /* src_base refers to the stack pointer and is
25609 automatically decreased by emitted push. */
25610 for (i = 0; i < nparts; i++)
25611 part[1][i] = change_address (part[1][i],
25612 GET_MODE (part[1][i]), src_base);
25615 /* We need to do the copy in the right order in case an address register
25616 of the source overlaps the destination. */
25617 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25619 rtx tmp;
25621 for (i = 0; i < nparts; i++)
25623 collisionparts[i]
25624 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25625 if (collisionparts[i])
25626 collisions++;
25629 /* Collision in the middle part can be handled by reordering. */
25630 if (collisions == 1 && nparts == 3 && collisionparts [1])
25632 std::swap (part[0][1], part[0][2]);
25633 std::swap (part[1][1], part[1][2]);
25635 else if (collisions == 1
25636 && nparts == 4
25637 && (collisionparts [1] || collisionparts [2]))
25639 if (collisionparts [1])
25641 std::swap (part[0][1], part[0][2]);
25642 std::swap (part[1][1], part[1][2]);
25644 else
25646 std::swap (part[0][2], part[0][3]);
25647 std::swap (part[1][2], part[1][3]);
25651 /* If there are more collisions, we can't handle it by reordering.
25652 Do an lea to the last part and use only one colliding move. */
25653 else if (collisions > 1)
25655 rtx base, addr;
25657 collisions = 1;
25659 base = part[0][nparts - 1];
25661 /* Handle the case when the last part isn't valid for lea.
25662 Happens in 64-bit mode storing the 12-byte XFmode. */
25663 if (GET_MODE (base) != Pmode)
25664 base = gen_rtx_REG (Pmode, REGNO (base));
25666 addr = XEXP (part[1][0], 0);
25667 if (TARGET_TLS_DIRECT_SEG_REFS)
25669 struct ix86_address parts;
25670 int ok = ix86_decompose_address (addr, &parts);
25671 gcc_assert (ok);
25672 /* It is not valid to use %gs: or %fs: in lea. */
25673 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25675 emit_insn (gen_rtx_SET (base, addr));
25676 part[1][0] = replace_equiv_address (part[1][0], base);
25677 for (i = 1; i < nparts; i++)
25679 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25680 part[1][i] = replace_equiv_address (part[1][i], tmp);
25685 if (push)
25687 if (!TARGET_64BIT)
25689 if (nparts == 3)
25691 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25692 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25693 stack_pointer_rtx, GEN_INT (-4)));
25694 emit_move_insn (part[0][2], part[1][2]);
25696 else if (nparts == 4)
25698 emit_move_insn (part[0][3], part[1][3]);
25699 emit_move_insn (part[0][2], part[1][2]);
25702 else
25704 /* In 64-bit mode we don't have a 32-bit push available. If this is a
25705 register, that is OK - we will just use the larger counterpart. We also
25706 retype memory - this comes from an attempt to avoid a REX prefix when
25707 moving the second half of a TFmode value. */
25708 if (GET_MODE (part[1][1]) == SImode)
25710 switch (GET_CODE (part[1][1]))
25712 case MEM:
25713 part[1][1] = adjust_address (part[1][1], DImode, 0);
25714 break;
25716 case REG:
25717 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25718 break;
25720 default:
25721 gcc_unreachable ();
25724 if (GET_MODE (part[1][0]) == SImode)
25725 part[1][0] = part[1][1];
25728 emit_move_insn (part[0][1], part[1][1]);
25729 emit_move_insn (part[0][0], part[1][0]);
25730 return;
25733 /* Choose correct order to not overwrite the source before it is copied. */
25734 if ((REG_P (part[0][0])
25735 && REG_P (part[1][1])
25736 && (REGNO (part[0][0]) == REGNO (part[1][1])
25737 || (nparts == 3
25738 && REGNO (part[0][0]) == REGNO (part[1][2]))
25739 || (nparts == 4
25740 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25741 || (collisions > 0
25742 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25744 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25746 operands[2 + i] = part[0][j];
25747 operands[6 + i] = part[1][j];
25750 else
25752 for (i = 0; i < nparts; i++)
25754 operands[2 + i] = part[0][i];
25755 operands[6 + i] = part[1][i];
25759 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25760 if (optimize_insn_for_size_p ())
25762 for (j = 0; j < nparts - 1; j++)
25763 if (CONST_INT_P (operands[6 + j])
25764 && operands[6 + j] != const0_rtx
25765 && REG_P (operands[2 + j]))
25766 for (i = j; i < nparts - 1; i++)
25767 if (CONST_INT_P (operands[7 + i])
25768 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25769 operands[7 + i] = operands[2 + j];
25772 for (i = 0; i < nparts; i++)
25773 emit_move_insn (operands[2 + i], operands[6 + i]);
25775 return;
25778 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25779 left shift by a constant, either using a single shift or
25780 a sequence of add instructions. */
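/* For example (illustrative, subject to the cost checks below): a shift left
   by 1 is emitted as a single add of the operand to itself, and on targets
   where adds are cheap enough a shift by 2 may become two such adds;
   otherwise a single shift with an immediate count is used.  */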
25782 static void
25783 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25785 rtx (*insn)(rtx, rtx, rtx);
25787 if (count == 1
25788 || (count * ix86_cost->add <= ix86_cost->shift_const
25789 && !optimize_insn_for_size_p ()))
25791 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25792 while (count-- > 0)
25793 emit_insn (insn (operand, operand, operand));
25795 else
25797 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25798 emit_insn (insn (operand, operand, GEN_INT (count)));
25802 void
25803 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25805 rtx (*gen_ashl3)(rtx, rtx, rtx);
25806 rtx (*gen_shld)(rtx, rtx, rtx);
25807 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25809 rtx low[2], high[2];
25810 int count;
25812 if (CONST_INT_P (operands[2]))
25814 split_double_mode (mode, operands, 2, low, high);
25815 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25817 if (count >= half_width)
25819 emit_move_insn (high[0], low[1]);
25820 emit_move_insn (low[0], const0_rtx);
25822 if (count > half_width)
25823 ix86_expand_ashl_const (high[0], count - half_width, mode);
25825 else
25827 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25829 if (!rtx_equal_p (operands[0], operands[1]))
25830 emit_move_insn (operands[0], operands[1]);
25832 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25833 ix86_expand_ashl_const (low[0], count, mode);
25835 return;
25838 split_double_mode (mode, operands, 1, low, high);
25840 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25842 if (operands[1] == const1_rtx)
25844 /* Assuming we've chosen QImode-capable registers, 1 << N
25845 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25846 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25848 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25850 ix86_expand_clear (low[0]);
25851 ix86_expand_clear (high[0]);
25852 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25854 d = gen_lowpart (QImode, low[0]);
25855 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25856 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25857 emit_insn (gen_rtx_SET (d, s));
25859 d = gen_lowpart (QImode, high[0]);
25860 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25861 s = gen_rtx_NE (QImode, flags, const0_rtx);
25862 emit_insn (gen_rtx_SET (d, s));
25865 /* Otherwise, we can get the same results by manually performing
25866 a bit extract operation on bit 5/6, and then performing the two
25867 shifts. The two methods of getting 0/1 into low/high are exactly
25868 the same size. Avoiding the shift in the bit extract case helps
25869 pentium4 a bit; no one else seems to care much either way. */
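/* A small worked example of the idea (assuming DImode, i.e. a 64-bit value
   built from two 32-bit halves): for 1 << n the code computes
   high = (n >> 5) & 1 and low = high ^ 1, then shifts both halves left by n;
   since the hardware shift count is taken modulo 32, exactly one half ends
   up holding the single set bit.  */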
25870 else
25872 machine_mode half_mode;
25873 rtx (*gen_lshr3)(rtx, rtx, rtx);
25874 rtx (*gen_and3)(rtx, rtx, rtx);
25875 rtx (*gen_xor3)(rtx, rtx, rtx);
25876 HOST_WIDE_INT bits;
25877 rtx x;
25879 if (mode == DImode)
25881 half_mode = SImode;
25882 gen_lshr3 = gen_lshrsi3;
25883 gen_and3 = gen_andsi3;
25884 gen_xor3 = gen_xorsi3;
25885 bits = 5;
25887 else
25889 half_mode = DImode;
25890 gen_lshr3 = gen_lshrdi3;
25891 gen_and3 = gen_anddi3;
25892 gen_xor3 = gen_xordi3;
25893 bits = 6;
25896 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25897 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25898 else
25899 x = gen_lowpart (half_mode, operands[2]);
25900 emit_insn (gen_rtx_SET (high[0], x));
25902 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25903 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25904 emit_move_insn (low[0], high[0]);
25905 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25908 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25909 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25910 return;
25913 if (operands[1] == constm1_rtx)
25915 /* For -1 << N, we can avoid the shld instruction, because we
25916 know that we're shifting 0...31/63 ones into a -1. */
25917 emit_move_insn (low[0], constm1_rtx);
25918 if (optimize_insn_for_size_p ())
25919 emit_move_insn (high[0], low[0]);
25920 else
25921 emit_move_insn (high[0], constm1_rtx);
25923 else
25925 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25927 if (!rtx_equal_p (operands[0], operands[1]))
25928 emit_move_insn (operands[0], operands[1]);
25930 split_double_mode (mode, operands, 1, low, high);
25931 emit_insn (gen_shld (high[0], low[0], operands[2]));
25934 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25936 if (TARGET_CMOVE && scratch)
25938 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25939 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25941 ix86_expand_clear (scratch);
25942 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25944 else
25946 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25947 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25949 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25953 void
25954 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25956 rtx (*gen_ashr3)(rtx, rtx, rtx)
25957 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25958 rtx (*gen_shrd)(rtx, rtx, rtx);
25959 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25961 rtx low[2], high[2];
25962 int count;
25964 if (CONST_INT_P (operands[2]))
25966 split_double_mode (mode, operands, 2, low, high);
25967 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25969 if (count == GET_MODE_BITSIZE (mode) - 1)
25971 emit_move_insn (high[0], high[1]);
25972 emit_insn (gen_ashr3 (high[0], high[0],
25973 GEN_INT (half_width - 1)));
25974 emit_move_insn (low[0], high[0]);
25977 else if (count >= half_width)
25979 emit_move_insn (low[0], high[1]);
25980 emit_move_insn (high[0], low[0]);
25981 emit_insn (gen_ashr3 (high[0], high[0],
25982 GEN_INT (half_width - 1)));
25984 if (count > half_width)
25985 emit_insn (gen_ashr3 (low[0], low[0],
25986 GEN_INT (count - half_width)));
25988 else
25990 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25992 if (!rtx_equal_p (operands[0], operands[1]))
25993 emit_move_insn (operands[0], operands[1]);
25995 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25996 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25999 else
26001 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26003 if (!rtx_equal_p (operands[0], operands[1]))
26004 emit_move_insn (operands[0], operands[1]);
26006 split_double_mode (mode, operands, 1, low, high);
26008 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26009 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
26011 if (TARGET_CMOVE && scratch)
26013 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26014 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26016 emit_move_insn (scratch, high[0]);
26017 emit_insn (gen_ashr3 (scratch, scratch,
26018 GEN_INT (half_width - 1)));
26019 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26020 scratch));
26022 else
26024 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
26025 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
26027 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
26032 void
26033 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
26035 rtx (*gen_lshr3)(rtx, rtx, rtx)
26036 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
26037 rtx (*gen_shrd)(rtx, rtx, rtx);
26038 int half_width = GET_MODE_BITSIZE (mode) >> 1;
26040 rtx low[2], high[2];
26041 int count;
26043 if (CONST_INT_P (operands[2]))
26045 split_double_mode (mode, operands, 2, low, high);
26046 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
26048 if (count >= half_width)
26050 emit_move_insn (low[0], high[1]);
26051 ix86_expand_clear (high[0]);
26053 if (count > half_width)
26054 emit_insn (gen_lshr3 (low[0], low[0],
26055 GEN_INT (count - half_width)));
26057 else
26059 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26061 if (!rtx_equal_p (operands[0], operands[1]))
26062 emit_move_insn (operands[0], operands[1]);
26064 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
26065 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
26068 else
26070 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
26072 if (!rtx_equal_p (operands[0], operands[1]))
26073 emit_move_insn (operands[0], operands[1]);
26075 split_double_mode (mode, operands, 1, low, high);
26077 emit_insn (gen_shrd (low[0], high[0], operands[2]));
26078 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
26080 if (TARGET_CMOVE && scratch)
26082 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
26083 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
26085 ix86_expand_clear (scratch);
26086 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
26087 scratch));
26089 else
26091 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
26092 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
26094 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
26099 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
26100 static void
26101 predict_jump (int prob)
26103 rtx_insn *insn = get_last_insn ();
26104 gcc_assert (JUMP_P (insn));
26105 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
26108 /* Helper function for the string operations below. Test VARIABLE whether
26109 it is aligned to VALUE bytes. If true, jump to the label. */
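/* Concretely (an illustrative reading of the code below): a call such as
   ix86_expand_aligntest (count, 4, true) emits a test of COUNT & 4 and a
   branch to the returned label taken when that bit is clear, so the 4-byte
   fixup the caller emits before the label is skipped when it is not needed.  */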
26110 static rtx_code_label *
26111 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
26113 rtx_code_label *label = gen_label_rtx ();
26114 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
26115 if (GET_MODE (variable) == DImode)
26116 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
26117 else
26118 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
26119 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
26120 1, label);
26121 if (epilogue)
26122 predict_jump (REG_BR_PROB_BASE * 50 / 100);
26123 else
26124 predict_jump (REG_BR_PROB_BASE * 90 / 100);
26125 return label;
26128 /* Adjust COUNTER by the VALUE. */
26129 static void
26130 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
26132 rtx (*gen_add)(rtx, rtx, rtx)
26133 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
26135 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
26138 /* Zero extend possibly SImode EXP to Pmode register. */
26140 ix86_zero_extend_to_Pmode (rtx exp)
26142 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
26145 /* Divide COUNTREG by SCALE. */
26146 static rtx
26147 scale_counter (rtx countreg, int scale)
26149 rtx sc;
26151 if (scale == 1)
26152 return countreg;
26153 if (CONST_INT_P (countreg))
26154 return GEN_INT (INTVAL (countreg) / scale);
26155 gcc_assert (REG_P (countreg));
26157 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
26158 GEN_INT (exact_log2 (scale)),
26159 NULL, 1, OPTAB_DIRECT);
26160 return sc;
26163 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
26164 DImode for constant loop counts. */
26166 static machine_mode
26167 counter_mode (rtx count_exp)
26169 if (GET_MODE (count_exp) != VOIDmode)
26170 return GET_MODE (count_exp);
26171 if (!CONST_INT_P (count_exp))
26172 return Pmode;
26173 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
26174 return DImode;
26175 return SImode;
26178 /* Copy the address to a Pmode register. This is used for x32 to
26179 truncate DImode TLS address to a SImode register. */
26181 static rtx
26182 ix86_copy_addr_to_reg (rtx addr)
26184 rtx reg;
26185 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
26187 reg = copy_addr_to_reg (addr);
26188 REG_POINTER (reg) = 1;
26189 return reg;
26191 else
26193 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
26194 reg = copy_to_mode_reg (DImode, addr);
26195 REG_POINTER (reg) = 1;
26196 return gen_rtx_SUBREG (SImode, reg, 0);
26200 /* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
26201 SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall size
26202 is COUNT, specified in bytes. When ISSETMEM is TRUE, output the equivalent
26203 loop to set memory to VALUE (supposed to be in MODE).
26205 The size is rounded down to a whole number of the chunk size moved at once.
26206 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
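/* A rough sketch of the emitted control flow (illustrative pseudocode only):

     size = count & ~(chunk - 1);  iter = 0;
   top:
     copy or set one chunk at destptr + iter (reading srcptr + iter for memcpy);
     iter += chunk;
     if (iter < size) goto top;
     destptr += iter;  srcptr += iter;

   where chunk = GET_MODE_SIZE (MODE) * UNROLL; a size == 0 early exit is only
   emitted for byte-sized chunks.  */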
26209 static void
26210 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
26211 rtx destptr, rtx srcptr, rtx value,
26212 rtx count, machine_mode mode, int unroll,
26213 int expected_size, bool issetmem)
26215 rtx_code_label *out_label, *top_label;
26216 rtx iter, tmp;
26217 machine_mode iter_mode = counter_mode (count);
26218 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
26219 rtx piece_size = GEN_INT (piece_size_n);
26220 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
26221 rtx size;
26222 int i;
26224 top_label = gen_label_rtx ();
26225 out_label = gen_label_rtx ();
26226 iter = gen_reg_rtx (iter_mode);
26228 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
26229 NULL, 1, OPTAB_DIRECT);
26230 /* Those two should combine. */
26231 if (piece_size == const1_rtx)
26233 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
26234 true, out_label);
26235 predict_jump (REG_BR_PROB_BASE * 10 / 100);
26237 emit_move_insn (iter, const0_rtx);
26239 emit_label (top_label);
26241 tmp = convert_modes (Pmode, iter_mode, iter, true);
26243 /* This assert could be relaxed - in this case we'll need to compute
26244 the smallest power of two contained in PIECE_SIZE_N and pass it to
26245 offset_address. */
26246 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
26247 destmem = offset_address (destmem, tmp, piece_size_n);
26248 destmem = adjust_address (destmem, mode, 0);
26250 if (!issetmem)
26252 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
26253 srcmem = adjust_address (srcmem, mode, 0);
26255 /* When unrolling for chips that reorder memory reads and writes,
26256 we can save registers by using a single temporary.
26257 Also, using 4 temporaries is overkill in 32-bit mode. */
26258 if (!TARGET_64BIT && 0)
26260 for (i = 0; i < unroll; i++)
26262 if (i)
26264 destmem =
26265 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26266 srcmem =
26267 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26269 emit_move_insn (destmem, srcmem);
26272 else
26274 rtx tmpreg[4];
26275 gcc_assert (unroll <= 4);
26276 for (i = 0; i < unroll; i++)
26278 tmpreg[i] = gen_reg_rtx (mode);
26279 if (i)
26281 srcmem =
26282 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
26284 emit_move_insn (tmpreg[i], srcmem);
26286 for (i = 0; i < unroll; i++)
26288 if (i)
26290 destmem =
26291 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26293 emit_move_insn (destmem, tmpreg[i]);
26297 else
26298 for (i = 0; i < unroll; i++)
26300 if (i)
26301 destmem =
26302 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
26303 emit_move_insn (destmem, value);
26306 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
26307 true, OPTAB_LIB_WIDEN);
26308 if (tmp != iter)
26309 emit_move_insn (iter, tmp);
26311 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
26312 true, top_label);
26313 if (expected_size != -1)
26315 expected_size /= GET_MODE_SIZE (mode) * unroll;
26316 if (expected_size == 0)
26317 predict_jump (0);
26318 else if (expected_size > REG_BR_PROB_BASE)
26319 predict_jump (REG_BR_PROB_BASE - 1);
26320 else
26321 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
26323 else
26324 predict_jump (REG_BR_PROB_BASE * 80 / 100);
26325 iter = ix86_zero_extend_to_Pmode (iter);
26326 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
26327 true, OPTAB_LIB_WIDEN);
26328 if (tmp != destptr)
26329 emit_move_insn (destptr, tmp);
26330 if (!issetmem)
26332 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
26333 true, OPTAB_LIB_WIDEN);
26334 if (tmp != srcptr)
26335 emit_move_insn (srcptr, tmp);
26337 emit_label (out_label);
26340 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM argument.
26341 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
26342 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
26343 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
26344 ORIG_VALUE is the original value passed to memset to fill the memory with.
26345 Other arguments have the same meaning as for the previous function. */
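/* For instance (a hedged illustration): a memset of a multiple of 4 bytes with
   a zero value is narrowed from QImode to SImode here, so a "rep stos" of
   32-bit words with COUNT/4 iterations is emitted instead of a byte-wise one.  */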
26347 static void
26348 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
26349 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
26350 rtx count,
26351 machine_mode mode, bool issetmem)
26353 rtx destexp;
26354 rtx srcexp;
26355 rtx countreg;
26356 HOST_WIDE_INT rounded_count;
26358 /* If possible, it is shorter to use rep movs.
26359 TODO: Maybe it is better to move this logic to decide_alg. */
26360 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
26361 && (!issetmem || orig_value == const0_rtx))
26362 mode = SImode;
26364 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
26365 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
26367 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
26368 GET_MODE_SIZE (mode)));
26369 if (mode != QImode)
26371 destexp = gen_rtx_ASHIFT (Pmode, countreg,
26372 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26373 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
26375 else
26376 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
26377 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
26379 rounded_count
26380 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26381 destmem = shallow_copy_rtx (destmem);
26382 set_mem_size (destmem, rounded_count);
26384 else if (MEM_SIZE_KNOWN_P (destmem))
26385 clear_mem_size (destmem);
26387 if (issetmem)
26389 value = force_reg (mode, gen_lowpart (mode, value));
26390 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
26392 else
26394 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
26395 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
26396 if (mode != QImode)
26398 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
26399 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
26400 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
26402 else
26403 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
26404 if (CONST_INT_P (count))
26406 rounded_count
26407 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
26408 srcmem = shallow_copy_rtx (srcmem);
26409 set_mem_size (srcmem, rounded_count);
26411 else
26413 if (MEM_SIZE_KNOWN_P (srcmem))
26414 clear_mem_size (srcmem);
26416 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
26417 destexp, srcexp));
26421 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
26422 DESTMEM.
26423 SRCMEM is passed by pointer to be updated on return.
26424 The return value is the updated DESTMEM. */
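/* Illustrative example (a sketch; the exact mode depends on which mov patterns
   the target provides): with SIZE_TO_MOVE == 16 on a 64-bit SSE target the
   widest supported piece is 16 bytes, so one V2DImode load into a temporary
   and one store are emitted, and DESTPTR/SRCPTR are advanced by 16.  */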
26425 static rtx
26426 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
26427 HOST_WIDE_INT size_to_move)
26429 rtx dst = destmem, src = *srcmem, adjust, tempreg;
26430 enum insn_code code;
26431 machine_mode move_mode;
26432 int piece_size, i;
26434 /* Find the widest mode in which we could perform moves.
26435 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26436 it until move of such size is supported. */
26437 piece_size = 1 << floor_log2 (size_to_move);
26438 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
26439 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26441 gcc_assert (piece_size > 1);
26442 piece_size >>= 1;
26445 /* Find the corresponding vector mode with the same size as MOVE_MODE.
26446 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
26447 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
26449 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
26450 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
26451 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
26453 move_mode = word_mode;
26454 piece_size = GET_MODE_SIZE (move_mode);
26455 code = optab_handler (mov_optab, move_mode);
26458 gcc_assert (code != CODE_FOR_nothing);
26460 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26461 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
26463 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26464 gcc_assert (size_to_move % piece_size == 0);
26465 adjust = GEN_INT (piece_size);
26466 for (i = 0; i < size_to_move; i += piece_size)
26468 /* We move from memory to memory, so we'll need to do it via
26469 a temporary register. */
26470 tempreg = gen_reg_rtx (move_mode);
26471 emit_insn (GEN_FCN (code) (tempreg, src));
26472 emit_insn (GEN_FCN (code) (dst, tempreg));
26474 emit_move_insn (destptr,
26475 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26476 emit_move_insn (srcptr,
26477 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
26479 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26480 piece_size);
26481 src = adjust_automodify_address_nv (src, move_mode, srcptr,
26482 piece_size);
26485 /* Update DST and SRC rtx. */
26486 *srcmem = src;
26487 return dst;
26490 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26491 static void
26492 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26493 rtx destptr, rtx srcptr, rtx count, int max_size)
26495 rtx src, dest;
26496 if (CONST_INT_P (count))
26498 HOST_WIDE_INT countval = INTVAL (count);
26499 HOST_WIDE_INT epilogue_size = countval % max_size;
26500 int i;
26502 /* For now MAX_SIZE should be a power of 2. This assert could be
26503 relaxed, but it'll require a bit more complicated epilogue
26504 expanding. */
26505 gcc_assert ((max_size & (max_size - 1)) == 0);
26506 for (i = max_size; i >= 1; i >>= 1)
26508 if (epilogue_size & i)
26509 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26511 return;
26513 if (max_size > 8)
26515 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26516 count, 1, OPTAB_DIRECT);
26517 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26518 count, QImode, 1, 4, false);
26519 return;
26522 /* When there are stringops, we can cheaply increase dest and src pointers.
26523 Otherwise we save code size by maintaining offset (zero is readily
26524 available from preceding rep operation) and using x86 addressing modes.
26526 if (TARGET_SINGLE_STRINGOP)
26528 if (max_size > 4)
26530 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26531 src = change_address (srcmem, SImode, srcptr);
26532 dest = change_address (destmem, SImode, destptr);
26533 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26534 emit_label (label);
26535 LABEL_NUSES (label) = 1;
26537 if (max_size > 2)
26539 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26540 src = change_address (srcmem, HImode, srcptr);
26541 dest = change_address (destmem, HImode, destptr);
26542 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26543 emit_label (label);
26544 LABEL_NUSES (label) = 1;
26546 if (max_size > 1)
26548 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26549 src = change_address (srcmem, QImode, srcptr);
26550 dest = change_address (destmem, QImode, destptr);
26551 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26552 emit_label (label);
26553 LABEL_NUSES (label) = 1;
26556 else
26558 rtx offset = force_reg (Pmode, const0_rtx);
26559 rtx tmp;
26561 if (max_size > 4)
26563 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26564 src = change_address (srcmem, SImode, srcptr);
26565 dest = change_address (destmem, SImode, destptr);
26566 emit_move_insn (dest, src);
26567 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26568 true, OPTAB_LIB_WIDEN);
26569 if (tmp != offset)
26570 emit_move_insn (offset, tmp);
26571 emit_label (label);
26572 LABEL_NUSES (label) = 1;
26574 if (max_size > 2)
26576 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26577 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26578 src = change_address (srcmem, HImode, tmp);
26579 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26580 dest = change_address (destmem, HImode, tmp);
26581 emit_move_insn (dest, src);
26582 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26583 true, OPTAB_LIB_WIDEN);
26584 if (tmp != offset)
26585 emit_move_insn (offset, tmp);
26586 emit_label (label);
26587 LABEL_NUSES (label) = 1;
26589 if (max_size > 1)
26591 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26592 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26593 src = change_address (srcmem, QImode, tmp);
26594 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26595 dest = change_address (destmem, QImode, tmp);
26596 emit_move_insn (dest, src);
26597 emit_label (label);
26598 LABEL_NUSES (label) = 1;
26603 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26604 with value PROMOTED_VAL.
26605 The emitted stores advance the destination pointer DESTPTR as a side effect.
26606 The return value is the updated DESTMEM. */
26607 static rtx
26608 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26609 HOST_WIDE_INT size_to_move)
26611 rtx dst = destmem, adjust;
26612 enum insn_code code;
26613 machine_mode move_mode;
26614 int piece_size, i;
26616 /* Find the widest mode in which we could perform moves.
26617 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26618 it until move of such size is supported. */
26619 move_mode = GET_MODE (promoted_val);
26620 if (move_mode == VOIDmode)
26621 move_mode = QImode;
26622 if (size_to_move < GET_MODE_SIZE (move_mode))
26624 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26625 move_mode = int_mode_for_size (move_bits, 0).require ();
26626 promoted_val = gen_lowpart (move_mode, promoted_val);
26628 piece_size = GET_MODE_SIZE (move_mode);
26629 code = optab_handler (mov_optab, move_mode);
26630 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26632 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26634 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
26635 gcc_assert (size_to_move % piece_size == 0);
26636 adjust = GEN_INT (piece_size);
26637 for (i = 0; i < size_to_move; i += piece_size)
26639 if (piece_size <= GET_MODE_SIZE (word_mode))
26641 emit_insn (gen_strset (destptr, dst, promoted_val));
26642 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26643 piece_size);
26644 continue;
26647 emit_insn (GEN_FCN (code) (dst, promoted_val));
26649 emit_move_insn (destptr,
26650 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26652 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26653 piece_size);
26656 /* Update DST rtx. */
26657 return dst;
26659 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26660 static void
26661 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26662 rtx count, int max_size)
26664 count =
26665 expand_simple_binop (counter_mode (count), AND, count,
26666 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26667 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26668 gen_lowpart (QImode, value), count, QImode,
26669 1, max_size / 2, true);
26672 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26673 static void
26674 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26675 rtx count, int max_size)
26677 rtx dest;
26679 if (CONST_INT_P (count))
26681 HOST_WIDE_INT countval = INTVAL (count);
26682 HOST_WIDE_INT epilogue_size = countval % max_size;
26683 int i;
26685 /* For now MAX_SIZE should be a power of 2. This assert could be
26686 relaxed, but it'll require a bit more complicated epilogue
26687 expanding. */
26688 gcc_assert ((max_size & (max_size - 1)) == 0);
26689 for (i = max_size; i >= 1; i >>= 1)
26691 if (epilogue_size & i)
26693 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26694 destmem = emit_memset (destmem, destptr, vec_value, i);
26695 else
26696 destmem = emit_memset (destmem, destptr, value, i);
26699 return;
26701 if (max_size > 32)
26703 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26704 return;
26706 if (max_size > 16)
26708 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26709 if (TARGET_64BIT)
26711 dest = change_address (destmem, DImode, destptr);
26712 emit_insn (gen_strset (destptr, dest, value));
26713 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26714 emit_insn (gen_strset (destptr, dest, value));
26716 else
26718 dest = change_address (destmem, SImode, destptr);
26719 emit_insn (gen_strset (destptr, dest, value));
26720 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26721 emit_insn (gen_strset (destptr, dest, value));
26722 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26723 emit_insn (gen_strset (destptr, dest, value));
26724 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26725 emit_insn (gen_strset (destptr, dest, value));
26727 emit_label (label);
26728 LABEL_NUSES (label) = 1;
26730 if (max_size > 8)
26732 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26733 if (TARGET_64BIT)
26735 dest = change_address (destmem, DImode, destptr);
26736 emit_insn (gen_strset (destptr, dest, value));
26738 else
26740 dest = change_address (destmem, SImode, destptr);
26741 emit_insn (gen_strset (destptr, dest, value));
26742 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26743 emit_insn (gen_strset (destptr, dest, value));
26745 emit_label (label);
26746 LABEL_NUSES (label) = 1;
26748 if (max_size > 4)
26750 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26751 dest = change_address (destmem, SImode, destptr);
26752 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26753 emit_label (label);
26754 LABEL_NUSES (label) = 1;
26756 if (max_size > 2)
26758 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26759 dest = change_address (destmem, HImode, destptr);
26760 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26761 emit_label (label);
26762 LABEL_NUSES (label) = 1;
26764 if (max_size > 1)
26766 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26767 dest = change_address (destmem, QImode, destptr);
26768 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26769 emit_label (label);
26770 LABEL_NUSES (label) = 1;
26774 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough of
26775 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26776 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26777 ignored.
26778 The return value is the updated DESTMEM. */
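/* For example (illustrative): with ALIGN == 1 and DESIRED_ALIGNMENT == 16 this
   emits four conditional fixups - a 1-, 2-, 4- and 8-byte copy or store, each
   guarded by an alignment test on DESTPTR - after which the destination is
   16-byte aligned and COUNT has been decreased by the bytes already handled.  */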
26779 static rtx
26780 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26781 rtx destptr, rtx srcptr, rtx value,
26782 rtx vec_value, rtx count, int align,
26783 int desired_alignment, bool issetmem)
26785 int i;
26786 for (i = 1; i < desired_alignment; i <<= 1)
26788 if (align <= i)
26790 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26791 if (issetmem)
26793 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26794 destmem = emit_memset (destmem, destptr, vec_value, i);
26795 else
26796 destmem = emit_memset (destmem, destptr, value, i);
26798 else
26799 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26800 ix86_adjust_counter (count, i);
26801 emit_label (label);
26802 LABEL_NUSES (label) = 1;
26803 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26806 return destmem;
26809 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
26810 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26811 and jump to DONE_LABEL. */
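/* As a hedged illustration: with SIZE == 8 for a memcpy this emits, under a
   "count & 8" test, one 8-byte move from the start of the block and one 8-byte
   move ending at SRCPTR/DESTPTR + COUNT, which together cover any length in
   the 8..15 range, and then jumps to DONE_LABEL.  */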
26812 static void
26813 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26814 rtx destptr, rtx srcptr,
26815 rtx value, rtx vec_value,
26816 rtx count, int size,
26817 rtx done_label, bool issetmem)
26819 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26820 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26821 rtx modesize;
26822 int n;
26824 /* If we do not have vector value to copy, we must reduce size. */
26825 if (issetmem)
26827 if (!vec_value)
26829 if (GET_MODE (value) == VOIDmode && size > 8)
26830 mode = Pmode;
26831 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26832 mode = GET_MODE (value);
26834 else
26835 mode = GET_MODE (vec_value), value = vec_value;
26837 else
26839 /* Choose appropriate vector mode. */
26840 if (size >= 32)
26841 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26842 else if (size >= 16)
26843 mode = TARGET_SSE ? V16QImode : DImode;
26844 srcmem = change_address (srcmem, mode, srcptr);
26846 destmem = change_address (destmem, mode, destptr);
26847 modesize = GEN_INT (GET_MODE_SIZE (mode));
26848 gcc_assert (GET_MODE_SIZE (mode) <= size);
26849 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26851 if (issetmem)
26852 emit_move_insn (destmem, gen_lowpart (mode, value));
26853 else
26855 emit_move_insn (destmem, srcmem);
26856 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26858 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26861 destmem = offset_address (destmem, count, 1);
26862 destmem = offset_address (destmem, GEN_INT (-2 * size),
26863 GET_MODE_SIZE (mode));
26864 if (!issetmem)
26866 srcmem = offset_address (srcmem, count, 1);
26867 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26868 GET_MODE_SIZE (mode));
26870 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26872 if (issetmem)
26873 emit_move_insn (destmem, gen_lowpart (mode, value));
26874 else
26876 emit_move_insn (destmem, srcmem);
26877 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26879 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26881 emit_jump_insn (gen_jump (done_label));
26882 emit_barrier ();
26884 emit_label (label);
26885 LABEL_NUSES (label) = 1;
26888 /* Handle a small memcpy (up to SIZE, which is supposed to be a small power of 2)
26889 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26890 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
26891 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26892 DONE_LABEL is a label after the whole copying sequence. The label is created
26893 on demand if *DONE_LABEL is NULL.
26894 MIN_SIZE is the minimal size of the block copied. This value gets adjusted for new
26895 bounds after the initial copies.
26897 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26898 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26899 we will dispatch to a library call for large blocks.
26901 In pseudocode we do:
26903 if (COUNT < SIZE)
26905 Assume that SIZE is 4. Bigger sizes are handled analogously
26906 if (COUNT & 4)
26908 copy 4 bytes from SRCPTR to DESTPTR
26909 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26910 goto done_label
26912 if (!COUNT)
26913 goto done_label;
26914 copy 1 byte from SRCPTR to DESTPTR
26915 if (COUNT & 2)
26917 copy 2 bytes from SRCPTR to DESTPTR
26918 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26921 else
26923 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26924 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26926 OLD_DESPTR = DESTPTR;
26927 Align DESTPTR up to DESIRED_ALIGN
26928 SRCPTR += DESTPTR - OLD_DESTPTR
26929 COUNT -= DEST_PTR - OLD_DESTPTR
26930 if (DYNAMIC_CHECK)
26931 Round COUNT down to multiple of SIZE
26932 << optional caller supplied zero size guard is here >>
26933 << optional caller supplied dynamic check is here >>
26934 << caller supplied main copy loop is here >>
26936 done_label:
26938 static void
26939 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26940 rtx *destptr, rtx *srcptr,
26941 machine_mode mode,
26942 rtx value, rtx vec_value,
26943 rtx *count,
26944 rtx_code_label **done_label,
26945 int size,
26946 int desired_align,
26947 int align,
26948 unsigned HOST_WIDE_INT *min_size,
26949 bool dynamic_check,
26950 bool issetmem)
26952 rtx_code_label *loop_label = NULL, *label;
26953 int n;
26954 rtx modesize;
26955 int prolog_size = 0;
26956 rtx mode_value;
26958 /* Choose the proper value to copy. */
26959 if (issetmem && VECTOR_MODE_P (mode))
26960 mode_value = vec_value;
26961 else
26962 mode_value = value;
26963 gcc_assert (GET_MODE_SIZE (mode) <= size);
26965 /* See if block is big or small, handle small blocks. */
26966 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26968 int size2 = size;
26969 loop_label = gen_label_rtx ();
26971 if (!*done_label)
26972 *done_label = gen_label_rtx ();
26974 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26975 1, loop_label);
26976 size2 >>= 1;
26978 /* Handle sizes > 3. */
26979 for (;size2 > 2; size2 >>= 1)
26980 expand_small_movmem_or_setmem (destmem, srcmem,
26981 *destptr, *srcptr,
26982 value, vec_value,
26983 *count,
26984 size2, *done_label, issetmem);
26985 /* Nothing to copy? Jump to DONE_LABEL if so */
26986 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26987 1, *done_label);
26989 /* Do a byte copy. */
26990 destmem = change_address (destmem, QImode, *destptr);
26991 if (issetmem)
26992 emit_move_insn (destmem, gen_lowpart (QImode, value));
26993 else
26995 srcmem = change_address (srcmem, QImode, *srcptr);
26996 emit_move_insn (destmem, srcmem);
26999 /* Handle sizes 2 and 3. */
27000 label = ix86_expand_aligntest (*count, 2, false);
27001 destmem = change_address (destmem, HImode, *destptr);
27002 destmem = offset_address (destmem, *count, 1);
27003 destmem = offset_address (destmem, GEN_INT (-2), 2);
27004 if (issetmem)
27005 emit_move_insn (destmem, gen_lowpart (HImode, value));
27006 else
27008 srcmem = change_address (srcmem, HImode, *srcptr);
27009 srcmem = offset_address (srcmem, *count, 1);
27010 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
27011 emit_move_insn (destmem, srcmem);
27014 emit_label (label);
27015 LABEL_NUSES (label) = 1;
27016 emit_jump_insn (gen_jump (*done_label));
27017 emit_barrier ();
27019 else
27020 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
27021 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
27023 /* Start memcpy for COUNT >= SIZE. */
27024 if (loop_label)
27026 emit_label (loop_label);
27027 LABEL_NUSES (loop_label) = 1;
27030 /* Copy first desired_align bytes. */
27031 if (!issetmem)
27032 srcmem = change_address (srcmem, mode, *srcptr);
27033 destmem = change_address (destmem, mode, *destptr);
27034 modesize = GEN_INT (GET_MODE_SIZE (mode));
27035 for (n = 0; prolog_size < desired_align - align; n++)
27037 if (issetmem)
27038 emit_move_insn (destmem, mode_value);
27039 else
27041 emit_move_insn (destmem, srcmem);
27042 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
27044 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
27045 prolog_size += GET_MODE_SIZE (mode);
27049 /* Copy last SIZE bytes. */
27050 destmem = offset_address (destmem, *count, 1);
27051 destmem = offset_address (destmem,
27052 GEN_INT (-size - prolog_size),
27054 if (issetmem)
27055 emit_move_insn (destmem, mode_value);
27056 else
27058 srcmem = offset_address (srcmem, *count, 1);
27059 srcmem = offset_address (srcmem,
27060 GEN_INT (-size - prolog_size),
27062 emit_move_insn (destmem, srcmem);
27064 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
27066 destmem = offset_address (destmem, modesize, 1);
27067 if (issetmem)
27068 emit_move_insn (destmem, mode_value);
27069 else
27071 srcmem = offset_address (srcmem, modesize, 1);
27072 emit_move_insn (destmem, srcmem);
27076 /* Align destination. */
27077 if (desired_align > 1 && desired_align > align)
27079 rtx saveddest = *destptr;
27081 gcc_assert (desired_align <= size);
27082 /* Align destptr up, place it to new register. */
27083 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
27084 GEN_INT (prolog_size),
27085 NULL_RTX, 1, OPTAB_DIRECT);
27086 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
27087 REG_POINTER (*destptr) = 1;
27088 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
27089 GEN_INT (-desired_align),
27090 *destptr, 1, OPTAB_DIRECT);
27091 /* See how many bytes we skipped. */
27092 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
27093 *destptr,
27094 saveddest, 1, OPTAB_DIRECT);
27095 /* Adjust srcptr and count. */
27096 if (!issetmem)
27097 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
27098 saveddest, *srcptr, 1, OPTAB_DIRECT);
27099 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27100 saveddest, *count, 1, OPTAB_DIRECT);
27101 /* We copied at most size + prolog_size. */
27102 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
27103 *min_size
27104 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
27105 else
27106 *min_size = 0;
27108 /* Our loops always round down the block size, but for the dispatch to a
27109 library call we need the precise value. */
27110 if (dynamic_check)
27111 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
27112 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
27114 else
27116 gcc_assert (prolog_size == 0);
27117 /* Decrease count, so we won't end up copying last word twice. */
27118 if (!CONST_INT_P (*count))
27119 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
27120 constm1_rtx, *count, 1, OPTAB_DIRECT);
27121 else
27122 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
27123 (unsigned HOST_WIDE_INT)size));
27124 if (*min_size)
27125 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
27130 /* This function is like the previous one, except here we know how many bytes
27131 need to be copied. That allows us to update alignment not only of DST, which
27132 is returned, but also of SRC, which is passed as a pointer for that
27133 reason. */
27134 static rtx
27135 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
27136 rtx srcreg, rtx value, rtx vec_value,
27137 int desired_align, int align_bytes,
27138 bool issetmem)
27140 rtx src = NULL;
27141 rtx orig_dst = dst;
27142 rtx orig_src = NULL;
27143 int piece_size = 1;
27144 int copied_bytes = 0;
27146 if (!issetmem)
27148 gcc_assert (srcp != NULL);
27149 src = *srcp;
27150 orig_src = src;
27153 for (piece_size = 1;
27154 piece_size <= desired_align && copied_bytes < align_bytes;
27155 piece_size <<= 1)
27157 if (align_bytes & piece_size)
27159 if (issetmem)
27161 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
27162 dst = emit_memset (dst, destreg, vec_value, piece_size);
27163 else
27164 dst = emit_memset (dst, destreg, value, piece_size);
27166 else
27167 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
27168 copied_bytes += piece_size;
27171 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
27172 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27173 if (MEM_SIZE_KNOWN_P (orig_dst))
27174 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
27176 if (!issetmem)
27178 int src_align_bytes = get_mem_align_offset (src, desired_align
27179 * BITS_PER_UNIT);
27180 if (src_align_bytes >= 0)
27181 src_align_bytes = desired_align - src_align_bytes;
27182 if (src_align_bytes >= 0)
27184 unsigned int src_align;
27185 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
27187 if ((src_align_bytes & (src_align - 1))
27188 == (align_bytes & (src_align - 1)))
27189 break;
27191 if (src_align > (unsigned int) desired_align)
27192 src_align = desired_align;
27193 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
27194 set_mem_align (src, src_align * BITS_PER_UNIT);
27196 if (MEM_SIZE_KNOWN_P (orig_src))
27197 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
27198 *srcp = src;
27201 return dst;
27204 /* Return true if ALG can be used in the current context.
27205 Assume we expand memset if MEMSET is true. */
27206 static bool
27207 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
27209 if (alg == no_stringop)
27210 return false;
27211 if (alg == vector_loop)
27212 return TARGET_SSE || TARGET_AVX;
27213 /* Algorithms using the rep prefix want at least edi and ecx;
27214 additionally, memset wants eax and memcpy wants esi. Don't
27215 consider such algorithms if the user has appropriated those
27216 registers for their own purposes, or if we have a non-default
27217 address space, since some string insns cannot override the segment. */
27218 if (alg == rep_prefix_1_byte
27219 || alg == rep_prefix_4_byte
27220 || alg == rep_prefix_8_byte)
27222 if (have_as)
27223 return false;
27224 if (fixed_regs[CX_REG]
27225 || fixed_regs[DI_REG]
27226 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
27227 return false;
27229 return true;
27232 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
27233 static enum stringop_alg
27234 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
27235 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
27236 bool memset, bool zero_memset, bool have_as,
27237 int *dynamic_check, bool *noalign, bool recur)
27239 const struct stringop_algs *algs;
27240 bool optimize_for_speed;
27241 int max = 0;
27242 const struct processor_costs *cost;
27243 int i;
27244 bool any_alg_usable_p = false;
27246 *noalign = false;
27247 *dynamic_check = -1;
27249 /* Even if the string operation call is cold, we still might spend a lot
27250 of time processing large blocks. */
27251 if (optimize_function_for_size_p (cfun)
27252 || (optimize_insn_for_size_p ()
27253 && (max_size < 256
27254 || (expected_size != -1 && expected_size < 256))))
27255 optimize_for_speed = false;
27256 else
27257 optimize_for_speed = true;
27259 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
27260 if (memset)
27261 algs = &cost->memset[TARGET_64BIT != 0];
27262 else
27263 algs = &cost->memcpy[TARGET_64BIT != 0];
27265 /* See maximal size for user defined algorithm. */
27266 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27268 enum stringop_alg candidate = algs->size[i].alg;
27269 bool usable = alg_usable_p (candidate, memset, have_as);
27270 any_alg_usable_p |= usable;
27272 if (candidate != libcall && candidate && usable)
27273 max = algs->size[i].max;
27276 /* If the expected size is not known but the max size is small enough
27277 that the inline version is a win, set the expected size into
27278 the range. */
27279 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
27280 && expected_size == -1)
27281 expected_size = min_size / 2 + max_size / 2;
27283 /* If user specified the algorithm, honor it if possible. */
27284 if (ix86_stringop_alg != no_stringop
27285 && alg_usable_p (ix86_stringop_alg, memset, have_as))
27286 return ix86_stringop_alg;
27287 /* rep; movq or rep; movl is the smallest variant. */
27288 else if (!optimize_for_speed)
27290 *noalign = true;
27291 if (!count || (count & 3) || (memset && !zero_memset))
27292 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
27293 ? rep_prefix_1_byte : loop_1_byte;
27294 else
27295 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
27296 ? rep_prefix_4_byte : loop;
27298 /* Very tiny blocks are best handled via the loop; REP is expensive to
27299 set up. */
27300 else if (expected_size != -1 && expected_size < 4)
27301 return loop_1_byte;
27302 else if (expected_size != -1)
27304 enum stringop_alg alg = libcall;
27305 bool alg_noalign = false;
27306 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
27308 /* We get here if the algorithms that were not libcall-based
27309 were rep-prefix based and we are unable to use rep prefixes
27310 based on global register usage. Break out of the loop and
27311 use the heuristic below. */
27312 if (algs->size[i].max == 0)
27313 break;
27314 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
27316 enum stringop_alg candidate = algs->size[i].alg;
27318 if (candidate != libcall
27319 && alg_usable_p (candidate, memset, have_as))
27321 alg = candidate;
27322 alg_noalign = algs->size[i].noalign;
27324 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
27325 last non-libcall inline algorithm. */
27326 if (TARGET_INLINE_ALL_STRINGOPS)
27328 /* When the current size is best copied by a libcall,
27329 but we are still forced to inline, run the heuristic below,
27330 which picks code for medium-sized blocks. */
27331 if (alg != libcall)
27333 *noalign = alg_noalign;
27334 return alg;
27336 else if (!any_alg_usable_p)
27337 break;
27339 else if (alg_usable_p (candidate, memset, have_as))
27341 *noalign = algs->size[i].noalign;
27342 return candidate;
27347 /* When asked to inline the call anyway, try to pick a meaningful choice.
27348 We look for the maximal size of a block that is faster to copy by hand and
27349 take blocks of at most that size, guessing that the average size will
27350 be roughly half of the maximum.
27352 If this turns out to be bad, we might simply specify the preferred
27353 choice in ix86_costs. */
27354 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27355 && (algs->unknown_size == libcall
27356 || !alg_usable_p (algs->unknown_size, memset, have_as)))
27358 enum stringop_alg alg;
27359 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
27361 /* If there aren't any usable algorithms or if we are already recursing,
27362 then recursing on smaller or equal sizes isn't going to
27363 find anything. Just return the simple byte-at-a-time copy loop. */
27364 if (!any_alg_usable_p || recur)
27366 /* Pick something reasonable. */
27367 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
27368 *dynamic_check = 128;
27369 return loop_1_byte;
27371 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
27372 zero_memset, have_as, dynamic_check, noalign, true);
27373 gcc_assert (*dynamic_check == -1);
27374 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
27375 *dynamic_check = max;
27376 else
27377 gcc_assert (alg != libcall);
27378 return alg;
27380 return (alg_usable_p (algs->unknown_size, memset, have_as)
27381 ? algs->unknown_size : libcall);
27384 /* Decide on alignment. We know that the operand is already aligned to ALIGN
27385 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
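/* For instance, a vector_loop moving TImode chunks asks for 16-byte
   destination alignment; rep prefixed byte/dword copies are bumped to 8 on
   PentiumPro; and the request is dropped back to ALIGN when optimizing for
   size or when the expected block is smaller than 4 bytes.  */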
27386 static int
27387 decide_alignment (int align,
27388 enum stringop_alg alg,
27389 int expected_size,
27390 machine_mode move_mode)
27392 int desired_align = 0;
27394 gcc_assert (alg != no_stringop);
27396 if (alg == libcall)
27397 return 0;
27398 if (move_mode == VOIDmode)
27399 return 0;
27401 desired_align = GET_MODE_SIZE (move_mode);
27402 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
27403 copying the whole cache line at once. */
27404 if (TARGET_PENTIUMPRO
27405 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
27406 desired_align = 8;
27408 if (optimize_size)
27409 desired_align = 1;
27410 if (desired_align < align)
27411 desired_align = align;
27412 if (expected_size != -1 && expected_size < 4)
27413 desired_align = align;
27415 return desired_align;
27419 /* Helper function for memset. For QImode value 0xXY produce
27420 0xXYXYXYXY of the width specified by MODE. This is essentially
27421 a * 0x01010101, but we can do slightly better than
27422 synth_mult by unwinding the sequence by hand on CPUs with
27423 slow multiply. */
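/* For example, broadcasting 0xAB to SImode yields 0xABABABAB: the
   constant path below computes it directly (v |= v << 8; v |= v << 16,
   plus a further 32-bit shift for DImode), while the non-constant path
   either multiplies by the promoted 0x01010101 constant or builds the
   value with the insv/shift-or sequence, whichever the cost tables prefer.  */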
27424 static rtx
27425 promote_duplicated_reg (machine_mode mode, rtx val)
27427 machine_mode valmode = GET_MODE (val);
27428 rtx tmp;
27429 int nops = mode == DImode ? 3 : 2;
27431 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
27432 if (val == const0_rtx)
27433 return copy_to_mode_reg (mode, CONST0_RTX (mode));
27434 if (CONST_INT_P (val))
27436 HOST_WIDE_INT v = INTVAL (val) & 255;
27438 v |= v << 8;
27439 v |= v << 16;
27440 if (mode == DImode)
27441 v |= (v << 16) << 16;
27442 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
27445 if (valmode == VOIDmode)
27446 valmode = QImode;
27447 if (valmode != QImode)
27448 val = gen_lowpart (QImode, val);
27449 if (mode == QImode)
27450 return val;
27451 if (!TARGET_PARTIAL_REG_STALL)
27452 nops--;
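/* Use the multiply by 0x01010101 (or its DImode counterpart) only when
   the cost tables say it is no more expensive than the shift/or (or insv)
   sequence emitted in the else branch below.  */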
27453 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
27454 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
27455 <= (ix86_cost->shift_const + ix86_cost->add) * nops
27456 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
27458 rtx reg = convert_modes (mode, QImode, val, true);
27459 tmp = promote_duplicated_reg (mode, const1_rtx);
27460 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
27461 OPTAB_DIRECT);
27463 else
27465 rtx reg = convert_modes (mode, QImode, val, true);
27467 if (!TARGET_PARTIAL_REG_STALL)
27468 if (mode == SImode)
27469 emit_insn (gen_insvsi_1 (reg, reg));
27470 else
27471 emit_insn (gen_insvdi_1 (reg, reg));
27472 else
27474 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
27475 NULL, 1, OPTAB_DIRECT);
27476 reg =
27477 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27479 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
27480 NULL, 1, OPTAB_DIRECT);
27481 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27482 if (mode == SImode)
27483 return reg;
27484 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27485 NULL, 1, OPTAB_DIRECT);
27486 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27487 return reg;
27491 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that will
27492 be needed by the main loop copying SIZE_NEEDED chunks and by the prologue raising
27493 alignment from ALIGN to DESIRED_ALIGN. */
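/* E.g. a 64-bit memset storing 8-byte chunks (SIZE_NEEDED == 8) gets VAL
   broadcast to DImode, while a 2-byte epilogue only needs an HImode
   broadcast; see the size/alignment tests below.  */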
27494 static rtx
27495 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27496 int align)
27498 rtx promoted_val;
27500 if (TARGET_64BIT
27501 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27502 promoted_val = promote_duplicated_reg (DImode, val);
27503 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27504 promoted_val = promote_duplicated_reg (SImode, val);
27505 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27506 promoted_val = promote_duplicated_reg (HImode, val);
27507 else
27508 promoted_val = val;
27510 return promoted_val;
27513 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27514 operations when profitable. The code depends upon architecture, block size
27515 and alignment, but always has one of the following overall structures:
27517 Aligned move sequence:
27519 1) Prologue guard: Conditional that jumps up to epilogues for small
27520 blocks that can be handled by epilogue alone. This is faster
27521 but also needed for correctness, since the prologue assumes the block
27522 is larger than the desired alignment.
27524 Optional dynamic check for size and libcall for large
27525 blocks is emitted here too, with -minline-stringops-dynamically.
27527 2) Prologue: copy first few bytes in order to get destination
27528 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27529 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
27530 copied. We emit either a jump tree on power of two sized
27531 blocks, or a byte loop.
27533 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27534 with specified algorithm.
27536 4) Epilogue: code copying tail of the block that is too small to be
27537 handled by main body (or up to size guarded by prologue guard).
27539 Misaligned move sequence
27541 1) misaligned move prologue/epilogue containing:
27542 a) Prologue handling small memory blocks and jumping to done_label
27543 (skipped if blocks are known to be large enough)
27544 b) Single, possibly misaligned, move copying the first DESIRED_ALIGN-ALIGN
27545 bytes, if alignment is needed
27546 (skipped if alignment is not needed)
27547 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
27549 2) Zero size guard dispatching to done_label, if needed
27551 3) dispatch to library call, if needed,
27553 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27554 with specified algorithm. */
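/* Which of the two sequences is used is decided below via
   misaligned_prologue_used: the misaligned variant is considered only when
   TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES is set and both the desired
   alignment and the epilogue chunk size are at most 32 bytes.  */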
27555 bool
27556 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27557 rtx align_exp, rtx expected_align_exp,
27558 rtx expected_size_exp, rtx min_size_exp,
27559 rtx max_size_exp, rtx probable_max_size_exp,
27560 bool issetmem)
27562 rtx destreg;
27563 rtx srcreg = NULL;
27564 rtx_code_label *label = NULL;
27565 rtx tmp;
27566 rtx_code_label *jump_around_label = NULL;
27567 HOST_WIDE_INT align = 1;
27568 unsigned HOST_WIDE_INT count = 0;
27569 HOST_WIDE_INT expected_size = -1;
27570 int size_needed = 0, epilogue_size_needed;
27571 int desired_align = 0, align_bytes = 0;
27572 enum stringop_alg alg;
27573 rtx promoted_val = NULL;
27574 rtx vec_promoted_val = NULL;
27575 bool force_loopy_epilogue = false;
27576 int dynamic_check;
27577 bool need_zero_guard = false;
27578 bool noalign;
27579 machine_mode move_mode = VOIDmode;
27580 machine_mode wider_mode;
27581 int unroll_factor = 1;
27582 /* TODO: Once value ranges are available, fill in proper data. */
27583 unsigned HOST_WIDE_INT min_size = 0;
27584 unsigned HOST_WIDE_INT max_size = -1;
27585 unsigned HOST_WIDE_INT probable_max_size = -1;
27586 bool misaligned_prologue_used = false;
27587 bool have_as;
27589 if (CONST_INT_P (align_exp))
27590 align = INTVAL (align_exp);
27591 /* i386 can do misaligned access at a reasonably increased cost. */
27592 if (CONST_INT_P (expected_align_exp)
27593 && INTVAL (expected_align_exp) > align)
27594 align = INTVAL (expected_align_exp);
27595 /* ALIGN is the minimum of destination and source alignment, but we care here
27596 just about destination alignment. */
27597 else if (!issetmem
27598 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27599 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27601 if (CONST_INT_P (count_exp))
27603 min_size = max_size = probable_max_size = count = expected_size
27604 = INTVAL (count_exp);
27605 /* When COUNT is 0, there is nothing to do. */
27606 if (!count)
27607 return true;
27609 else
27611 if (min_size_exp)
27612 min_size = INTVAL (min_size_exp);
27613 if (max_size_exp)
27614 max_size = INTVAL (max_size_exp);
27615 if (probable_max_size_exp)
27616 probable_max_size = INTVAL (probable_max_size_exp);
27617 if (CONST_INT_P (expected_size_exp))
27618 expected_size = INTVAL (expected_size_exp);
27621 /* Make sure we don't need to care about overflow later on. */
27622 if (count > (HOST_WIDE_INT_1U << 30))
27623 return false;
27625 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27626 if (!issetmem)
27627 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27629 /* Step 0: Decide on preferred algorithm, desired alignment and
27630 size of chunks to be copied by main loop. */
27631 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27632 issetmem,
27633 issetmem && val_exp == const0_rtx, have_as,
27634 &dynamic_check, &noalign, false);
27635 if (alg == libcall)
27636 return false;
27637 gcc_assert (alg != no_stringop);
27639 /* For now the vector version of memset is generated only for memory zeroing, as
27640 creating the promoted vector value is very cheap in this case. */
27641 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27642 alg = unrolled_loop;
27644 if (!count)
27645 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27646 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27647 if (!issetmem)
27648 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27650 unroll_factor = 1;
27651 move_mode = word_mode;
27652 switch (alg)
27654 case libcall:
27655 case no_stringop:
27656 case last_alg:
27657 gcc_unreachable ();
27658 case loop_1_byte:
27659 need_zero_guard = true;
27660 move_mode = QImode;
27661 break;
27662 case loop:
27663 need_zero_guard = true;
27664 break;
27665 case unrolled_loop:
27666 need_zero_guard = true;
27667 unroll_factor = (TARGET_64BIT ? 4 : 2);
27668 break;
27669 case vector_loop:
27670 need_zero_guard = true;
27671 unroll_factor = 4;
27672 /* Find the widest supported mode. */
27673 move_mode = word_mode;
27674 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27675 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27676 move_mode = wider_mode;
27678 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27679 move_mode = TImode;
27681 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27682 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27683 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27685 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27686 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27687 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27688 move_mode = word_mode;
27690 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27691 break;
27692 case rep_prefix_8_byte:
27693 move_mode = DImode;
27694 break;
27695 case rep_prefix_4_byte:
27696 move_mode = SImode;
27697 break;
27698 case rep_prefix_1_byte:
27699 move_mode = QImode;
27700 break;
27702 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27703 epilogue_size_needed = size_needed;
27705 /* If we are going to emit any library calls conditionally, make sure any
27706 pending stack adjustment happens before the first conditional branch,
27707 otherwise it will be emitted before the library call only and won't
27708 happen from the other branches. */
27709 if (dynamic_check != -1)
27710 do_pending_stack_adjust ();
27712 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27713 if (!TARGET_ALIGN_STRINGOPS || noalign)
27714 align = desired_align;
27716 /* Step 1: Prologue guard. */
27718 /* Alignment code needs count to be in register. */
27719 if (CONST_INT_P (count_exp) && desired_align > align)
27721 if (INTVAL (count_exp) > desired_align
27722 && INTVAL (count_exp) > size_needed)
27724 align_bytes
27725 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27726 if (align_bytes <= 0)
27727 align_bytes = 0;
27728 else
27729 align_bytes = desired_align - align_bytes;
27731 if (align_bytes == 0)
27732 count_exp = force_reg (counter_mode (count_exp), count_exp);
27734 gcc_assert (desired_align >= 1 && align >= 1);
27736 /* Misaligned move sequences handle both prologue and epilogue at once.
27737 Default code generation results in smaller code for large alignments
27738 and also avoids redundant work when sizes are known precisely. */
27739 misaligned_prologue_used
27740 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27741 && MAX (desired_align, epilogue_size_needed) <= 32
27742 && desired_align <= epilogue_size_needed
27743 && ((desired_align > align && !align_bytes)
27744 || (!count && epilogue_size_needed > 1)));
27746 /* Do the cheap promotion to allow better CSE across the
27747 main loop and epilogue (i.e. one load of the big constant in
27748 front of all the code).
27749 For now the misaligned move sequences do not have a fast path
27750 without broadcasting. */
27751 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27753 if (alg == vector_loop)
27755 gcc_assert (val_exp == const0_rtx);
27756 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27757 promoted_val = promote_duplicated_reg_to_size (val_exp,
27758 GET_MODE_SIZE (word_mode),
27759 desired_align, align);
27761 else
27763 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27764 desired_align, align);
27767 /* Misaligned move sequences handle both prologues and epilogues at once.
27768 Default code generation results in smaller code for large alignments and
27769 also avoids redundant work when sizes are known precisely. */
27770 if (misaligned_prologue_used)
27772 /* Misaligned move prologue handled small blocks by itself. */
27773 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27774 (dst, src, &destreg, &srcreg,
27775 move_mode, promoted_val, vec_promoted_val,
27776 &count_exp,
27777 &jump_around_label,
27778 desired_align < align
27779 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27780 desired_align, align, &min_size, dynamic_check, issetmem);
27781 if (!issetmem)
27782 src = change_address (src, BLKmode, srcreg);
27783 dst = change_address (dst, BLKmode, destreg);
27784 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27785 epilogue_size_needed = 0;
27786 if (need_zero_guard
27787 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27789 /* It is possible that we copied enough so the main loop will not
27790 execute. */
27791 gcc_assert (size_needed > 1);
27792 if (jump_around_label == NULL_RTX)
27793 jump_around_label = gen_label_rtx ();
27794 emit_cmp_and_jump_insns (count_exp,
27795 GEN_INT (size_needed),
27796 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27797 if (expected_size == -1
27798 || expected_size < (desired_align - align) / 2 + size_needed)
27799 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27800 else
27801 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27804 /* Ensure that alignment prologue won't copy past end of block. */
27805 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27807 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27808 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
27809 Make sure it is a power of 2. */
27810 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27812 /* To improve performance of small blocks, we jump around the code
27813 promoting VAL. This means that if the promoted VAL is not constant,
27814 we might not use it in the epilogue and have to use the byte
27815 loop variant. */
27816 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27817 force_loopy_epilogue = true;
27818 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27819 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27821 /* If main algorithm works on QImode, no epilogue is needed.
27822 For small sizes just don't align anything. */
27823 if (size_needed == 1)
27824 desired_align = align;
27825 else
27826 goto epilogue;
27828 else if (!count
27829 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27831 label = gen_label_rtx ();
27832 emit_cmp_and_jump_insns (count_exp,
27833 GEN_INT (epilogue_size_needed),
27834 LTU, 0, counter_mode (count_exp), 1, label);
27835 if (expected_size == -1 || expected_size < epilogue_size_needed)
27836 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27837 else
27838 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27842 /* Emit code to decide at runtime whether a library call or inline code should be
27843 used. */
27844 if (dynamic_check != -1)
27846 if (!issetmem && CONST_INT_P (count_exp))
27848 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27850 emit_block_copy_via_libcall (dst, src, count_exp);
27851 count_exp = const0_rtx;
27852 goto epilogue;
27855 else
27857 rtx_code_label *hot_label = gen_label_rtx ();
27858 if (jump_around_label == NULL_RTX)
27859 jump_around_label = gen_label_rtx ();
27860 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27861 LEU, 0, counter_mode (count_exp),
27862 1, hot_label);
27863 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27864 if (issetmem)
27865 set_storage_via_libcall (dst, count_exp, val_exp);
27866 else
27867 emit_block_copy_via_libcall (dst, src, count_exp);
27868 emit_jump (jump_around_label);
27869 emit_label (hot_label);
27873 /* Step 2: Alignment prologue. */
27874 /* Do the expensive promotion once we branched off the small blocks. */
27875 if (issetmem && !promoted_val)
27876 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27877 desired_align, align);
27879 if (desired_align > align && !misaligned_prologue_used)
27881 if (align_bytes == 0)
27883 /* Except for the first move in the prologue, we no longer know
27884 the constant offset in aliasing info. It doesn't seem worth
27885 the pain to maintain it for the first move, so throw away
27886 the info early. */
27887 dst = change_address (dst, BLKmode, destreg);
27888 if (!issetmem)
27889 src = change_address (src, BLKmode, srcreg);
27890 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27891 promoted_val, vec_promoted_val,
27892 count_exp, align, desired_align,
27893 issetmem);
27894 /* At most desired_align - align bytes are copied. */
27895 if (min_size < (unsigned)(desired_align - align))
27896 min_size = 0;
27897 else
27898 min_size -= desired_align - align;
27900 else
27902 /* If we know how many bytes need to be stored before dst is
27903 sufficiently aligned, maintain aliasing info accurately. */
27904 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27905 srcreg,
27906 promoted_val,
27907 vec_promoted_val,
27908 desired_align,
27909 align_bytes,
27910 issetmem);
27912 count_exp = plus_constant (counter_mode (count_exp),
27913 count_exp, -align_bytes);
27914 count -= align_bytes;
27915 min_size -= align_bytes;
27916 max_size -= align_bytes;
27918 if (need_zero_guard
27919 && min_size < (unsigned HOST_WIDE_INT) size_needed
27920 && (count < (unsigned HOST_WIDE_INT) size_needed
27921 || (align_bytes == 0
27922 && count < ((unsigned HOST_WIDE_INT) size_needed
27923 + desired_align - align))))
27925 /* It is possible that we copied enough so the main loop will not
27926 execute. */
27927 gcc_assert (size_needed > 1);
27928 if (label == NULL_RTX)
27929 label = gen_label_rtx ();
27930 emit_cmp_and_jump_insns (count_exp,
27931 GEN_INT (size_needed),
27932 LTU, 0, counter_mode (count_exp), 1, label);
27933 if (expected_size == -1
27934 || expected_size < (desired_align - align) / 2 + size_needed)
27935 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27936 else
27937 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27940 if (label && size_needed == 1)
27942 emit_label (label);
27943 LABEL_NUSES (label) = 1;
27944 label = NULL;
27945 epilogue_size_needed = 1;
27946 if (issetmem)
27947 promoted_val = val_exp;
27949 else if (label == NULL_RTX && !misaligned_prologue_used)
27950 epilogue_size_needed = size_needed;
27952 /* Step 3: Main loop. */
27954 switch (alg)
27956 case libcall:
27957 case no_stringop:
27958 case last_alg:
27959 gcc_unreachable ();
27960 case loop_1_byte:
27961 case loop:
27962 case unrolled_loop:
27963 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27964 count_exp, move_mode, unroll_factor,
27965 expected_size, issetmem);
27966 break;
27967 case vector_loop:
27968 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27969 vec_promoted_val, count_exp, move_mode,
27970 unroll_factor, expected_size, issetmem);
27971 break;
27972 case rep_prefix_8_byte:
27973 case rep_prefix_4_byte:
27974 case rep_prefix_1_byte:
27975 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27976 val_exp, count_exp, move_mode, issetmem);
27977 break;
27979 /* Adjust properly the offset of src and dest memory for aliasing. */
27980 if (CONST_INT_P (count_exp))
27982 if (!issetmem)
27983 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27984 (count / size_needed) * size_needed);
27985 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27986 (count / size_needed) * size_needed);
27988 else
27990 if (!issetmem)
27991 src = change_address (src, BLKmode, srcreg);
27992 dst = change_address (dst, BLKmode, destreg);
27995 /* Step 4: Epilogue to copy the remaining bytes. */
27996 epilogue:
27997 if (label)
27999 /* When the main loop is done, COUNT_EXP might hold the original count,
28000 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
28001 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
28002 bytes. Compensate if needed. */
28004 if (size_needed < epilogue_size_needed)
28006 tmp =
28007 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
28008 GEN_INT (size_needed - 1), count_exp, 1,
28009 OPTAB_DIRECT);
28010 if (tmp != count_exp)
28011 emit_move_insn (count_exp, tmp);
28013 emit_label (label);
28014 LABEL_NUSES (label) = 1;
28017 if (count_exp != const0_rtx && epilogue_size_needed > 1)
28019 if (force_loopy_epilogue)
28020 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
28021 epilogue_size_needed);
28022 else
28024 if (issetmem)
28025 expand_setmem_epilogue (dst, destreg, promoted_val,
28026 vec_promoted_val, count_exp,
28027 epilogue_size_needed);
28028 else
28029 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
28030 epilogue_size_needed);
28033 if (jump_around_label)
28034 emit_label (jump_around_label);
28035 return true;
28039 /* Expand the appropriate insns for doing strlen if not just doing
28040 repnz; scasb
28042 out = result, initialized with the start address
28043 align_rtx = alignment of the address.
28044 scratch = scratch register, initialized with the start address when
28045 not aligned, otherwise undefined
28047 This is just the body. It needs the initializations mentioned above and
28048 some address computing at the end. These things are done in i386.md. */
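/* The body below first advances OUT byte by byte until the address is
   4-byte aligned, checking each byte for the terminating zero, then scans
   the string a word at a time using the zero-byte test documented before
   the main loop, and finally backs OUT up to the exact zero byte.  */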
28050 static void
28051 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
28053 int align;
28054 rtx tmp;
28055 rtx_code_label *align_2_label = NULL;
28056 rtx_code_label *align_3_label = NULL;
28057 rtx_code_label *align_4_label = gen_label_rtx ();
28058 rtx_code_label *end_0_label = gen_label_rtx ();
28059 rtx mem;
28060 rtx tmpreg = gen_reg_rtx (SImode);
28061 rtx scratch = gen_reg_rtx (SImode);
28062 rtx cmp;
28064 align = 0;
28065 if (CONST_INT_P (align_rtx))
28066 align = INTVAL (align_rtx);
28068 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
28070 /* Is there a known alignment and is it less than 4? */
28071 if (align < 4)
28073 rtx scratch1 = gen_reg_rtx (Pmode);
28074 emit_move_insn (scratch1, out);
28075 /* Is there a known alignment and is it not 2? */
28076 if (align != 2)
28078 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
28079 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
28081 /* Leave just the 3 lower bits. */
28082 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
28083 NULL_RTX, 0, OPTAB_WIDEN);
28085 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28086 Pmode, 1, align_4_label);
28087 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
28088 Pmode, 1, align_2_label);
28089 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
28090 Pmode, 1, align_3_label);
28092 else
28094 /* Since the alignment is 2, we have to check 2 or 0 bytes;
28095 check if it is aligned to 4 bytes. */
28097 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
28098 NULL_RTX, 0, OPTAB_WIDEN);
28100 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
28101 Pmode, 1, align_4_label);
28104 mem = change_address (src, QImode, out);
28106 /* Now compare the bytes. */
28108 /* Compare the first n unaligned bytes on a byte-per-byte basis. */
28109 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
28110 QImode, 1, end_0_label);
28112 /* Increment the address. */
28113 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28115 /* Not needed with an alignment of 2 */
28116 if (align != 2)
28118 emit_label (align_2_label);
28120 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28121 end_0_label);
28123 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28125 emit_label (align_3_label);
28128 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
28129 end_0_label);
28131 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
28134 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
28135 align this loop: it only makes programs bigger and does not help to
28136 speed them up. */
28137 emit_label (align_4_label);
28139 mem = change_address (src, SImode, out);
28140 emit_move_insn (scratch, mem);
28141 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
28143 /* This formula yields a nonzero result iff one of the bytes is zero.
28144 This saves three branches inside the loop and many cycles. */
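/* Spelled out, the test computed below is (x - 0x01010101) & ~x & 0x80808080,
   which is nonzero exactly when some byte of x is zero. For example, with
   x = 0x41420043: x - 0x01010101 = 0x4040ff42, ~x = 0xbebdffbc, and ANDing
   them with 0x80808080 leaves 0x00008000, flagging the zero in byte 1.  */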
28146 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
28147 emit_insn (gen_one_cmplsi2 (scratch, scratch));
28148 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
28149 emit_insn (gen_andsi3 (tmpreg, tmpreg,
28150 gen_int_mode (0x80808080, SImode)));
28151 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
28152 align_4_label);
28154 if (TARGET_CMOVE)
28156 rtx reg = gen_reg_rtx (SImode);
28157 rtx reg2 = gen_reg_rtx (Pmode);
28158 emit_move_insn (reg, tmpreg);
28159 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
28161 /* If zero is not in the first two bytes, move two bytes forward. */
28162 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28163 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28164 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28165 emit_insn (gen_rtx_SET (tmpreg,
28166 gen_rtx_IF_THEN_ELSE (SImode, tmp,
28167 reg,
28168 tmpreg)));
28169 /* Emit lea manually to avoid clobbering of flags. */
28170 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
28172 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28173 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
28174 emit_insn (gen_rtx_SET (out,
28175 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
28176 reg2,
28177 out)));
28179 else
28181 rtx_code_label *end_2_label = gen_label_rtx ();
28182 /* Is zero in the first two bytes? */
28184 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
28185 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
28186 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
28187 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
28188 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
28189 pc_rtx);
28190 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
28191 JUMP_LABEL (tmp) = end_2_label;
28193 /* Not in the first two. Move two bytes forward. */
28194 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
28195 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
28197 emit_label (end_2_label);
28201 /* Avoid branch in fixing the byte. */
28202 tmpreg = gen_lowpart (QImode, tmpreg);
28203 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
28204 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
28205 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
28206 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
28208 emit_label (end_0_label);
28211 /* Expand strlen. */
28213 bool
28214 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
28216 rtx addr, scratch1, scratch2, scratch3, scratch4;
28218 /* The generic case of the strlen expander is long. Avoid expanding
28219 it unless TARGET_INLINE_ALL_STRINGOPS. */
28221 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28222 && !TARGET_INLINE_ALL_STRINGOPS
28223 && !optimize_insn_for_size_p ()
28224 && (!CONST_INT_P (align) || INTVAL (align) < 4))
28225 return false;
28227 addr = force_reg (Pmode, XEXP (src, 0));
28228 scratch1 = gen_reg_rtx (Pmode);
28230 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
28231 && !optimize_insn_for_size_p ())
28233 /* Well it seems that some optimizer does not combine a call like
28234 foo(strlen(bar), strlen(bar));
28235 when the move and the subtraction are done here. It does calculate
28236 the length just once when these instructions are done inside of
28237 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
28238 often used and I use one fewer register for the lifetime of
28239 output_strlen_unroll() this is better. */
28241 emit_move_insn (out, addr);
28243 ix86_expand_strlensi_unroll_1 (out, src, align);
28245 /* strlensi_unroll_1 returns the address of the zero at the end of
28246 the string, like memchr(), so compute the length by subtracting
28247 the start address. */
28248 emit_insn (ix86_gen_sub3 (out, out, addr));
28250 else
28252 rtx unspec;
28254 /* Can't use this if the user has appropriated eax, ecx, or edi. */
28255 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
28256 return false;
28257 /* Can't use this for non-default address spaces. */
28258 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
28259 return false;
28261 scratch2 = gen_reg_rtx (Pmode);
28262 scratch3 = gen_reg_rtx (Pmode);
28263 scratch4 = force_reg (Pmode, constm1_rtx);
28265 emit_move_insn (scratch3, addr);
28266 eoschar = force_reg (QImode, eoschar);
28268 src = replace_equiv_address_nv (src, scratch3);
28270 /* If .md starts supporting :P, this can be done in .md. */
28271 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
28272 scratch4), UNSPEC_SCAS);
28273 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
28274 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
28275 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
28277 return true;
28280 /* For a given symbol (function) construct code to compute the address of its PLT
28281 entry in the large x86-64 PIC model. */
28282 static rtx
28283 construct_plt_address (rtx symbol)
28285 rtx tmp, unspec;
28287 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
28288 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
28289 gcc_assert (Pmode == DImode);
28291 tmp = gen_reg_rtx (Pmode);
28292 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
28294 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
28295 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
28296 return tmp;
28299 rtx_insn *
28300 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
28301 rtx callarg2,
28302 rtx pop, bool sibcall)
28304 rtx vec[3];
28305 rtx use = NULL, call;
28306 unsigned int vec_len = 0;
28307 tree fndecl;
28309 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28311 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
28312 if (fndecl
28313 && (lookup_attribute ("interrupt",
28314 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
28315 error ("interrupt service routine can't be called directly");
28317 else
28318 fndecl = NULL_TREE;
28320 if (pop == const0_rtx)
28321 pop = NULL;
28322 gcc_assert (!TARGET_64BIT || !pop);
28324 if (TARGET_MACHO && !TARGET_64BIT)
28326 #if TARGET_MACHO
28327 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
28328 fnaddr = machopic_indirect_call_target (fnaddr);
28329 #endif
28331 else
28333 /* Static functions and indirect calls don't need the pic register. Also,
28334 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
28335 it an indirect call. */
28336 rtx addr = XEXP (fnaddr, 0);
28337 if (flag_pic
28338 && GET_CODE (addr) == SYMBOL_REF
28339 && !SYMBOL_REF_LOCAL_P (addr))
28341 if (flag_plt
28342 && (SYMBOL_REF_DECL (addr) == NULL_TREE
28343 || !lookup_attribute ("noplt",
28344 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
28346 if (!TARGET_64BIT
28347 || (ix86_cmodel == CM_LARGE_PIC
28348 && DEFAULT_ABI != MS_ABI))
28350 use_reg (&use, gen_rtx_REG (Pmode,
28351 REAL_PIC_OFFSET_TABLE_REGNUM));
28352 if (ix86_use_pseudo_pic_reg ())
28353 emit_move_insn (gen_rtx_REG (Pmode,
28354 REAL_PIC_OFFSET_TABLE_REGNUM),
28355 pic_offset_table_rtx);
28358 else if (!TARGET_PECOFF && !TARGET_MACHO)
28360 if (TARGET_64BIT)
28362 fnaddr = gen_rtx_UNSPEC (Pmode,
28363 gen_rtvec (1, addr),
28364 UNSPEC_GOTPCREL);
28365 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28367 else
28369 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
28370 UNSPEC_GOT);
28371 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
28372 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
28373 fnaddr);
28375 fnaddr = gen_const_mem (Pmode, fnaddr);
28376 /* Pmode may not be the same as word_mode for x32, which
28377 doesn't support indirect branch via 32-bit memory slot.
28378 Since x32 GOT slot is 64 bit with zero upper 32 bits,
28379 indirect branch via x32 GOT slot is OK. */
28380 if (GET_MODE (fnaddr) != word_mode)
28381 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
28382 fnaddr = gen_rtx_MEM (QImode, fnaddr);
28387 /* Skip setting up RAX register for -mskip-rax-setup when there are no
28388 parameters passed in vector registers. */
28389 if (TARGET_64BIT
28390 && (INTVAL (callarg2) > 0
28391 || (INTVAL (callarg2) == 0
28392 && (TARGET_SSE || !flag_skip_rax_setup))))
28394 rtx al = gen_rtx_REG (QImode, AX_REG);
28395 emit_move_insn (al, callarg2);
28396 use_reg (&use, al);
28399 if (ix86_cmodel == CM_LARGE_PIC
28400 && !TARGET_PECOFF
28401 && MEM_P (fnaddr)
28402 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
28403 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
28404 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
28405 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
28406 branch via x32 GOT slot is OK. */
28407 else if (!(TARGET_X32
28408 && MEM_P (fnaddr)
28409 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
28410 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
28411 && (sibcall
28412 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
28413 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
28415 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
28416 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
28419 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
28421 if (retval)
28422 call = gen_rtx_SET (retval, call);
28423 vec[vec_len++] = call;
28425 if (pop)
28427 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
28428 pop = gen_rtx_SET (stack_pointer_rtx, pop);
28429 vec[vec_len++] = pop;
28432 if (cfun->machine->no_caller_saved_registers
28433 && (!fndecl
28434 || (!TREE_THIS_VOLATILE (fndecl)
28435 && !lookup_attribute ("no_caller_saved_registers",
28436 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
28438 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
28439 bool is_64bit_ms_abi = (TARGET_64BIT
28440 && ix86_function_abi (fndecl) == MS_ABI);
28441 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
28443 /* If there are no caller-saved registers, add all registers
28444 that are clobbered by the call which returns. */
28445 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
28446 if (!fixed_regs[i]
28447 && (ix86_call_used_regs[i] == 1
28448 || (ix86_call_used_regs[i] & c_mask))
28449 && !STACK_REGNO_P (i)
28450 && !MMX_REGNO_P (i))
28451 clobber_reg (&use,
28452 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
28454 else if (TARGET_64BIT_MS_ABI
28455 && (!callarg2 || INTVAL (callarg2) != -2))
28457 unsigned i;
28459 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28461 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28462 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28464 clobber_reg (&use, gen_rtx_REG (mode, regno));
28467 /* Set here, but it may get cleared later. */
28468 if (TARGET_CALL_MS2SYSV_XLOGUES)
28470 if (!TARGET_SSE)
28473 /* Don't break hot-patched functions. */
28474 else if (ix86_function_ms_hook_prologue (current_function_decl))
28477 /* TODO: Cases not yet examined. */
28478 else if (flag_split_stack)
28479 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28481 else
28483 gcc_assert (!reload_completed);
28484 cfun->machine->call_ms2sysv = true;
28489 if (vec_len > 1)
28490 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28491 rtx_insn *call_insn = emit_call_insn (call);
28492 if (use)
28493 CALL_INSN_FUNCTION_USAGE (call_insn) = use;
28495 return call_insn;
28498 /* Return true if the function being called was marked with attribute
28499 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28500 to handle the non-PIC case in the backend because there is no easy
28501 interface for the front-end to force non-PLT calls to use the GOT.
28502 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28503 to call the function marked "noplt" indirectly. */
28505 static bool
28506 ix86_nopic_noplt_attribute_p (rtx call_op)
28508 if (flag_pic || ix86_cmodel == CM_LARGE
28509 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28510 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28511 || SYMBOL_REF_LOCAL_P (call_op))
28512 return false;
28514 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28516 if (!flag_plt
28517 || (symbol_decl != NULL_TREE
28518 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28519 return true;
28521 return false;
28524 /* Output indirect branch via a call and return thunk. CALL_OP is a
28525 register which contains the branch target. Branch is a tail call
28526 if SIBCALL_P is true.
28527 A normal call is converted to:
28529 call __x86_indirect_thunk_reg
28531 and a tail call is converted to:
28533 jmp __x86_indirect_thunk_reg
28534 */
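/* When the thunk body is to be emitted inline (indirect_branch_thunk_inline),
   THUNK_NAME stays NULL below and output_indirect_thunk expands the thunk in
   place instead of referencing __x86_indirect_thunk_reg; the non-sibcall case
   still wraps the branch in the local call/jump label pair emitted below.  */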
28536 static void
28537 ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p)
28539 char thunk_name_buf[32];
28540 char *thunk_name;
28541 enum indirect_thunk_prefix need_prefix
28542 = indirect_thunk_need_prefix (current_output_insn);
28543 int regno = REGNO (call_op);
28545 if (cfun->machine->indirect_branch_type
28546 != indirect_branch_thunk_inline)
28548 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28550 int i = regno;
28551 if (i >= FIRST_REX_INT_REG)
28552 i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1);
28553 if (need_prefix == indirect_thunk_prefix_bnd)
28554 indirect_thunks_bnd_used |= 1 << i;
28555 else
28556 indirect_thunks_used |= 1 << i;
28558 indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28559 thunk_name = thunk_name_buf;
28561 else
28562 thunk_name = NULL;
28564 if (sibcall_p)
28566 if (thunk_name != NULL)
28568 if (need_prefix == indirect_thunk_prefix_bnd)
28569 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28570 else
28571 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28573 else
28574 output_indirect_thunk (need_prefix, regno);
28576 else
28578 if (thunk_name != NULL)
28580 if (need_prefix == indirect_thunk_prefix_bnd)
28581 fprintf (asm_out_file, "\tbnd call\t%s\n", thunk_name);
28582 else
28583 fprintf (asm_out_file, "\tcall\t%s\n", thunk_name);
28584 return;
28587 char indirectlabel1[32];
28588 char indirectlabel2[32];
28590 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28591 INDIRECT_LABEL,
28592 indirectlabelno++);
28593 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28594 INDIRECT_LABEL,
28595 indirectlabelno++);
28597 /* Jump. */
28598 if (need_prefix == indirect_thunk_prefix_bnd)
28599 fputs ("\tbnd jmp\t", asm_out_file);
28600 else
28601 fputs ("\tjmp\t", asm_out_file);
28602 assemble_name_raw (asm_out_file, indirectlabel2);
28603 fputc ('\n', asm_out_file);
28605 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28607 if (thunk_name != NULL)
28609 if (need_prefix == indirect_thunk_prefix_bnd)
28610 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28611 else
28612 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28614 else
28615 output_indirect_thunk (need_prefix, regno);
28617 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28619 /* Call. */
28620 if (need_prefix == indirect_thunk_prefix_bnd)
28621 fputs ("\tbnd call\t", asm_out_file);
28622 else
28623 fputs ("\tcall\t", asm_out_file);
28624 assemble_name_raw (asm_out_file, indirectlabel1);
28625 fputc ('\n', asm_out_file);
28629 /* Output indirect branch via a call and return thunk. CALL_OP is
28630 the branch target. XASM is the assembly template for CALL_OP.
28631 Branch is a tail call if SIBCALL_P is true. A normal call is
28632 converted to:
28634 jmp L2
28635 L1:
28636 push CALL_OP
28637 jmp __x86_indirect_thunk
28638 L2:
28639 call L1
28641 and a tail call is converted to:
28643 push CALL_OP
28644 jmp __x86_indirect_thunk
28645 */
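/* CALL_OP may also be a memory operand. If it is addressed relative to the
   stack pointer, the "call" in the non-sibcall sequence above displaces the
   stack by UNITS_PER_WORD before the push executes, so the displacement is
   adjusted accordingly below.  */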
28647 static void
28648 ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm,
28649 bool sibcall_p)
28651 char thunk_name_buf[32];
28652 char *thunk_name;
28653 char push_buf[64];
28654 enum indirect_thunk_prefix need_prefix
28655 = indirect_thunk_need_prefix (current_output_insn);
28656 int regno = -1;
28658 if (cfun->machine->indirect_branch_type
28659 != indirect_branch_thunk_inline)
28661 if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
28663 if (need_prefix == indirect_thunk_prefix_bnd)
28664 indirect_thunk_bnd_needed = true;
28665 else
28666 indirect_thunk_needed = true;
28668 indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
28669 thunk_name = thunk_name_buf;
28671 else
28672 thunk_name = NULL;
28674 snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s",
28675 TARGET_64BIT ? 'q' : 'l', xasm);
28677 if (sibcall_p)
28679 output_asm_insn (push_buf, &call_op);
28680 if (thunk_name != NULL)
28682 if (need_prefix == indirect_thunk_prefix_bnd)
28683 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28684 else
28685 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28687 else
28688 output_indirect_thunk (need_prefix, regno);
28690 else
28692 char indirectlabel1[32];
28693 char indirectlabel2[32];
28695 ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
28696 INDIRECT_LABEL,
28697 indirectlabelno++);
28698 ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
28699 INDIRECT_LABEL,
28700 indirectlabelno++);
28702 /* Jump. */
28703 if (need_prefix == indirect_thunk_prefix_bnd)
28704 fputs ("\tbnd jmp\t", asm_out_file);
28705 else
28706 fputs ("\tjmp\t", asm_out_file);
28707 assemble_name_raw (asm_out_file, indirectlabel2);
28708 fputc ('\n', asm_out_file);
28710 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
28712 /* An external function may be called via GOT, instead of PLT. */
28713 if (MEM_P (call_op))
28715 struct ix86_address parts;
28716 rtx addr = XEXP (call_op, 0);
28717 if (ix86_decompose_address (addr, &parts)
28718 && parts.base == stack_pointer_rtx)
28720 /* Since call will adjust stack by -UNITS_PER_WORD,
28721 we must convert "disp(stack, index, scale)" to
28722 "disp+UNITS_PER_WORD(stack, index, scale)". */
28723 if (parts.index)
28725 addr = gen_rtx_MULT (Pmode, parts.index,
28726 GEN_INT (parts.scale));
28727 addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
28728 addr);
28730 else
28731 addr = stack_pointer_rtx;
28733 rtx disp;
28734 if (parts.disp != NULL_RTX)
28735 disp = plus_constant (Pmode, parts.disp,
28736 UNITS_PER_WORD);
28737 else
28738 disp = GEN_INT (UNITS_PER_WORD);
28740 addr = gen_rtx_PLUS (Pmode, addr, disp);
28741 call_op = gen_rtx_MEM (GET_MODE (call_op), addr);
28745 output_asm_insn (push_buf, &call_op);
28747 if (thunk_name != NULL)
28749 if (need_prefix == indirect_thunk_prefix_bnd)
28750 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28751 else
28752 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28754 else
28755 output_indirect_thunk (need_prefix, regno);
28757 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
28759 /* Call. */
28760 if (need_prefix == indirect_thunk_prefix_bnd)
28761 fputs ("\tbnd call\t", asm_out_file);
28762 else
28763 fputs ("\tcall\t", asm_out_file);
28764 assemble_name_raw (asm_out_file, indirectlabel1);
28765 fputc ('\n', asm_out_file);
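/* Illustrative sketch, added for exposition and not part of the original
   source.  For a 32-bit "call *4(%esp)" compiled with
   -mindirect-branch=thunk, the code above emits roughly:

	jmp	.LIND2
   .LIND1:
	push	8(%esp)		# 4(%esp) adjusted by UNITS_PER_WORD, because
				# the "call .LIND1" below pushed a return address
	jmp	__x86_indirect_thunk
   .LIND2:
	call	.LIND1

   The .LIND labels are placeholders; the real ones come from
   ASM_GENERATE_INTERNAL_LABEL with INDIRECT_LABEL.  */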
28769 /* Output indirect branch via a call and return thunk. CALL_OP is
28770 the branch target. XASM is the assembly template for CALL_OP.
28771 Branch is a tail call if SIBCALL_P is true. */
28773 static void
28774 ix86_output_indirect_branch (rtx call_op, const char *xasm,
28775 bool sibcall_p)
28777 if (REG_P (call_op))
28778 ix86_output_indirect_branch_via_reg (call_op, sibcall_p);
28779 else
28780 ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p);
28783 /* Output indirect jump. CALL_OP is the jump target. */
28785 const char *
28786 ix86_output_indirect_jmp (rtx call_op)
28788 if (cfun->machine->indirect_branch_type != indirect_branch_keep)
28790 /* We can't have a red zone, since the "call" in the indirect thunk
28791 pushes the return address onto the stack, destroying the red zone. */
28792 if (ix86_red_zone_size != 0)
28793 gcc_unreachable ();
28795 ix86_output_indirect_branch (call_op, "%0", true);
28796 return "";
28798 else
28799 return "%!jmp\t%A0";
28802 /* Output a function return. Add a REP prefix to the RET if LONG_P is
28803 true and the function return is kept (not converted to a thunk). */
28805 const char *
28806 ix86_output_function_return (bool long_p)
28808 if (cfun->machine->function_return_type != indirect_branch_keep)
28810 char thunk_name[32];
28811 enum indirect_thunk_prefix need_prefix
28812 = indirect_thunk_need_prefix (current_output_insn);
28814 if (cfun->machine->function_return_type
28815 != indirect_branch_thunk_inline)
28817 bool need_thunk = (cfun->machine->function_return_type
28818 == indirect_branch_thunk);
28819 indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix,
28820 true);
28821 if (need_prefix == indirect_thunk_prefix_bnd)
28823 indirect_return_bnd_needed |= need_thunk;
28824 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28826 else
28828 indirect_return_needed |= need_thunk;
28829 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28832 else
28833 output_indirect_thunk (need_prefix, INVALID_REGNUM);
28835 return "";
28838 if (!long_p)
28839 return "%!ret";
28841 return "rep%; ret";
28844 /* Output indirect function return. RET_OP is the function return
28845 target. */
28847 const char *
28848 ix86_output_indirect_function_return (rtx ret_op)
28850 if (cfun->machine->function_return_type != indirect_branch_keep)
28852 char thunk_name[32];
28853 enum indirect_thunk_prefix need_prefix
28854 = indirect_thunk_need_prefix (current_output_insn);
28855 unsigned int regno = REGNO (ret_op);
28856 gcc_assert (regno == CX_REG);
28858 if (cfun->machine->function_return_type
28859 != indirect_branch_thunk_inline)
28861 bool need_thunk = (cfun->machine->function_return_type
28862 == indirect_branch_thunk);
28863 indirect_thunk_name (thunk_name, regno, need_prefix, true);
28864 if (need_prefix == indirect_thunk_prefix_bnd)
28866 if (need_thunk)
28868 indirect_return_via_cx_bnd = true;
28869 indirect_thunks_bnd_used |= 1 << CX_REG;
28871 fprintf (asm_out_file, "\tbnd jmp\t%s\n", thunk_name);
28873 else
28875 if (need_thunk)
28877 indirect_return_via_cx = true;
28878 indirect_thunks_used |= 1 << CX_REG;
28880 fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
28883 else
28884 output_indirect_thunk (need_prefix, regno);
28886 return "";
28888 else
28889 return "%!jmp\t%A0";
28892 /* Split a simple return that pops POPC bytes from the stack into an
28893 indirect branch with a stack adjustment. */
28895 void
28896 ix86_split_simple_return_pop_internal (rtx popc)
28898 struct machine_function *m = cfun->machine;
28899 rtx ecx = gen_rtx_REG (SImode, CX_REG);
28900 rtx_insn *insn;
28902 /* There is no "pascal" calling convention in any 64bit ABI. */
28903 gcc_assert (!TARGET_64BIT);
28905 insn = emit_insn (gen_pop (ecx));
28906 m->fs.cfa_offset -= UNITS_PER_WORD;
28907 m->fs.sp_offset -= UNITS_PER_WORD;
28909 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
28910 x = gen_rtx_SET (stack_pointer_rtx, x);
28911 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
28912 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
28913 RTX_FRAME_RELATED_P (insn) = 1;
28915 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
28916 x = gen_rtx_SET (stack_pointer_rtx, x);
28917 insn = emit_insn (x);
28918 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
28919 RTX_FRAME_RELATED_P (insn) = 1;
28921 /* Now return address is in ECX. */
28922 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
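/* Illustrative sketch, added for exposition and not part of the original
   source.  For a 32-bit function that would otherwise end in "ret $N",
   the split above emits roughly:

	pop	%ecx		# return address -> %ecx
	add	$N, %esp	# drop the N bytes of stack arguments
	jmp	*%ecx		# indirect return, so it can go through a thunk

   which is what allows the final branch to be converted by
   ix86_output_indirect_function_return.  */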
28925 /* Output the assembly for a call instruction. */
28927 const char *
28928 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28930 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28931 bool output_indirect_p
28932 = (!TARGET_SEH
28933 && cfun->machine->indirect_branch_type != indirect_branch_keep);
28934 bool seh_nop_p = false;
28935 const char *xasm;
28937 if (SIBLING_CALL_P (insn))
28939 if (direct_p)
28941 if (ix86_nopic_noplt_attribute_p (call_op))
28943 direct_p = false;
28944 if (TARGET_64BIT)
28946 if (output_indirect_p)
28947 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28948 else
28949 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28951 else
28953 if (output_indirect_p)
28954 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
28955 else
28956 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28959 else
28960 xasm = "%!jmp\t%P0";
28962 /* SEH epilogue detection requires the indirect branch case
28963 to include REX.W. */
28964 else if (TARGET_SEH)
28965 xasm = "%!rex.W jmp\t%A0";
28966 else
28968 if (output_indirect_p)
28969 xasm = "%0";
28970 else
28971 xasm = "%!jmp\t%A0";
28974 if (output_indirect_p && !direct_p)
28975 ix86_output_indirect_branch (call_op, xasm, true);
28976 else
28977 output_asm_insn (xasm, &call_op);
28978 return "";
28981 /* SEH unwinding can require an extra nop to be emitted in several
28982 circumstances. Determine if we have one of those. */
28983 if (TARGET_SEH)
28985 rtx_insn *i;
28987 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28989 /* Prevent a catch region from being adjacent to a jump that would
28990 be interpreted as an epilogue sequence by the unwinder. */
28991 if (JUMP_P(i) && CROSSING_JUMP_P (i))
28993 seh_nop_p = true;
28994 break;
28997 /* If we get to another real insn, we don't need the nop. */
28998 if (INSN_P (i))
28999 break;
29001 /* If we get to the epilogue note, prevent a catch region from
29002 being adjacent to the standard epilogue sequence. If non-
29003 call-exceptions, we'll have done this during epilogue emission. */
29004 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
29005 && !flag_non_call_exceptions
29006 && !can_throw_internal (insn))
29008 seh_nop_p = true;
29009 break;
29013 /* If we didn't find a real insn following the call, prevent the
29014 unwinder from looking into the next function. */
29015 if (i == NULL)
29016 seh_nop_p = true;
29019 if (direct_p)
29021 if (ix86_nopic_noplt_attribute_p (call_op))
29023 direct_p = false;
29024 if (TARGET_64BIT)
29026 if (output_indirect_p)
29027 xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29028 else
29029 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
29031 else
29033 if (output_indirect_p)
29034 xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
29035 else
29036 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
29039 else
29040 xasm = "%!call\t%P0";
29042 else
29044 if (output_indirect_p)
29045 xasm = "%0";
29046 else
29047 xasm = "%!call\t%A0";
29050 if (output_indirect_p && !direct_p)
29051 ix86_output_indirect_branch (call_op, xasm, false);
29052 else
29053 output_asm_insn (xasm, &call_op);
29055 if (seh_nop_p)
29056 return "nop";
29058 return "";
29061 /* Clear stack slot assignments remembered from previous functions.
29062 This is called from INIT_EXPANDERS once before RTL is emitted for each
29063 function. */
29065 static struct machine_function *
29066 ix86_init_machine_status (void)
29068 struct machine_function *f;
29070 f = ggc_cleared_alloc<machine_function> ();
29071 f->call_abi = ix86_abi;
29073 return f;
29076 /* Return a MEM corresponding to a stack slot with mode MODE.
29077 Allocate a new slot if necessary.
29079 The RTL for a function can have several slots available: N is
29080 which slot to use. */
29083 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
29085 struct stack_local_entry *s;
29087 gcc_assert (n < MAX_386_STACK_LOCALS);
29089 for (s = ix86_stack_locals; s; s = s->next)
29090 if (s->mode == mode && s->n == n)
29091 return validize_mem (copy_rtx (s->rtl));
29093 s = ggc_alloc<stack_local_entry> ();
29094 s->n = n;
29095 s->mode = mode;
29096 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
29098 s->next = ix86_stack_locals;
29099 ix86_stack_locals = s;
29100 return validize_mem (copy_rtx (s->rtl));
29103 static void
29104 ix86_instantiate_decls (void)
29106 struct stack_local_entry *s;
29108 for (s = ix86_stack_locals; s; s = s->next)
29109 if (s->rtl != NULL_RTX)
29110 instantiate_decl_rtl (s->rtl);
29113 /* Return the number used for encoding REG, in the range 0..7. */
29115 static int
29116 reg_encoded_number (rtx reg)
29118 unsigned regno = REGNO (reg);
29119 switch (regno)
29121 case AX_REG:
29122 return 0;
29123 case CX_REG:
29124 return 1;
29125 case DX_REG:
29126 return 2;
29127 case BX_REG:
29128 return 3;
29129 case SP_REG:
29130 return 4;
29131 case BP_REG:
29132 return 5;
29133 case SI_REG:
29134 return 6;
29135 case DI_REG:
29136 return 7;
29137 default:
29138 break;
29140 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
29141 return regno - FIRST_STACK_REG;
29142 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
29143 return regno - FIRST_SSE_REG;
29144 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
29145 return regno - FIRST_MMX_REG;
29146 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
29147 return regno - FIRST_REX_SSE_REG;
29148 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
29149 return regno - FIRST_REX_INT_REG;
29150 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
29151 return regno - FIRST_MASK_REG;
29152 return -1;
29155 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
29156 in its encoding if it could be relevant for ROP mitigation, otherwise
29157 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
29158 used for calculating it into them. */
29160 static int
29161 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
29162 int *popno0 = 0, int *popno1 = 0)
29164 if (asm_noperands (PATTERN (insn)) >= 0)
29165 return -1;
29166 int has_modrm = get_attr_modrm (insn);
29167 if (!has_modrm)
29168 return -1;
29169 enum attr_modrm_class cls = get_attr_modrm_class (insn);
29170 rtx op0, op1;
29171 switch (cls)
29173 case MODRM_CLASS_OP02:
29174 gcc_assert (noperands >= 3);
29175 if (popno0)
29177 *popno0 = 0;
29178 *popno1 = 2;
29180 op0 = operands[0];
29181 op1 = operands[2];
29182 break;
29183 case MODRM_CLASS_OP01:
29184 gcc_assert (noperands >= 2);
29185 if (popno0)
29187 *popno0 = 0;
29188 *popno1 = 1;
29190 op0 = operands[0];
29191 op1 = operands[1];
29192 break;
29193 default:
29194 return -1;
29196 if (REG_P (op0) && REG_P (op1))
29198 int enc0 = reg_encoded_number (op0);
29199 int enc1 = reg_encoded_number (op1);
29200 return 0xc0 + (enc1 << 3) + enc0;
29202 return -1;
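/* Illustrative sketch, added for exposition and not part of the original
   source.  The register-register modr/m byte computed above packs mod = 11b
   into the two top bits, the second operand's encoding into the reg field
   and the first operand's encoding into the r/m field.  The hypothetical
   helper below only restates that layout.  */
#if 0
static int
example_reg_reg_modrm (int enc0, int enc1)
{
  /* mod = 11 (register direct), reg = enc1, r/m = enc0.  */
  return 0xc0 | (enc1 << 3) | enc0;
}
#endif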
29205 /* Check whether x86 address PARTS is a pc-relative address. */
29207 bool
29208 ix86_rip_relative_addr_p (struct ix86_address *parts)
29210 rtx base, index, disp;
29212 base = parts->base;
29213 index = parts->index;
29214 disp = parts->disp;
29216 if (disp && !base && !index)
29218 if (TARGET_64BIT)
29220 rtx symbol = disp;
29222 if (GET_CODE (disp) == CONST)
29223 symbol = XEXP (disp, 0);
29224 if (GET_CODE (symbol) == PLUS
29225 && CONST_INT_P (XEXP (symbol, 1)))
29226 symbol = XEXP (symbol, 0);
29228 if (GET_CODE (symbol) == LABEL_REF
29229 || (GET_CODE (symbol) == SYMBOL_REF
29230 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
29231 || (GET_CODE (symbol) == UNSPEC
29232 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
29233 || XINT (symbol, 1) == UNSPEC_PCREL
29234 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
29235 return true;
29238 return false;
29241 /* Calculate the length of the memory address in the instruction encoding.
29242 Includes addr32 prefix, does not include the one-byte modrm, opcode,
29243 or other prefixes. We never generate addr32 prefix for LEA insn. */
29246 memory_address_length (rtx addr, bool lea)
29248 struct ix86_address parts;
29249 rtx base, index, disp;
29250 int len;
29251 int ok;
29253 if (GET_CODE (addr) == PRE_DEC
29254 || GET_CODE (addr) == POST_INC
29255 || GET_CODE (addr) == PRE_MODIFY
29256 || GET_CODE (addr) == POST_MODIFY)
29257 return 0;
29259 ok = ix86_decompose_address (addr, &parts);
29260 gcc_assert (ok);
29262 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
29264 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
29265 if (TARGET_64BIT && !lea
29266 && (SImode_address_operand (addr, VOIDmode)
29267 || (parts.base && GET_MODE (parts.base) == SImode)
29268 || (parts.index && GET_MODE (parts.index) == SImode)))
29269 len++;
29271 base = parts.base;
29272 index = parts.index;
29273 disp = parts.disp;
29275 if (base && SUBREG_P (base))
29276 base = SUBREG_REG (base);
29277 if (index && SUBREG_P (index))
29278 index = SUBREG_REG (index);
29280 gcc_assert (base == NULL_RTX || REG_P (base));
29281 gcc_assert (index == NULL_RTX || REG_P (index));
29283 /* Rule of thumb:
29284 - esp as the base always wants an index,
29285 - ebp as the base always wants a displacement,
29286 - r12 as the base always wants an index,
29287 - r13 as the base always wants a displacement. */
29289 /* Register Indirect. */
29290 if (base && !index && !disp)
29292 /* esp (for its index) and ebp (for its displacement) need
29293 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
29294 code. */
29295 if (base == arg_pointer_rtx
29296 || base == frame_pointer_rtx
29297 || REGNO (base) == SP_REG
29298 || REGNO (base) == BP_REG
29299 || REGNO (base) == R12_REG
29300 || REGNO (base) == R13_REG)
29301 len++;
29304 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
29305 is not disp32, but disp32(%rip), so for disp32
29306 SIB byte is needed, unless print_operand_address
29307 optimizes it into disp32(%rip) or (%rip) is implied
29308 by UNSPEC. */
29309 else if (disp && !base && !index)
29311 len += 4;
29312 if (!ix86_rip_relative_addr_p (&parts))
29313 len++;
29315 else
29317 /* Find the length of the displacement constant. */
29318 if (disp)
29320 if (base && satisfies_constraint_K (disp))
29321 len += 1;
29322 else
29323 len += 4;
29325 /* ebp always wants a displacement. Similarly r13. */
29326 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
29327 len++;
29329 /* An index requires the two-byte modrm form.... */
29330 if (index
29331 /* ...like esp (or r12), which always wants an index. */
29332 || base == arg_pointer_rtx
29333 || base == frame_pointer_rtx
29334 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
29335 len++;
29338 return len;
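/* Illustrative examples, added for exposition and not part of the original
   source.  The lengths returned above count only the bytes beyond the
   opcode and the one-byte modrm:

	(%eax)		-> 0	(plain register indirect)
	(%esp)		-> 1	(SIB byte required)
	(%ebp)		-> 1	(disp8 of zero required)
	8(%eax)		-> 1	(disp8)
	sym(,%eax,4)	-> 5	(disp32 + SIB)
	sym(%rip)	-> 4	(disp32, RIP-relative, no SIB)

   An addr32 prefix or a non-default segment adds one more byte.  */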
29341 /* Compute default value for "length_immediate" attribute. When SHORTFORM
29342 is set, expect that the insn has an 8-bit immediate alternative. */
29344 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
29346 int len = 0;
29347 int i;
29348 extract_insn_cached (insn);
29349 for (i = recog_data.n_operands - 1; i >= 0; --i)
29350 if (CONSTANT_P (recog_data.operand[i]))
29352 enum attr_mode mode = get_attr_mode (insn);
29354 gcc_assert (!len);
29355 if (shortform && CONST_INT_P (recog_data.operand[i]))
29357 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
29358 switch (mode)
29360 case MODE_QI:
29361 len = 1;
29362 continue;
29363 case MODE_HI:
29364 ival = trunc_int_for_mode (ival, HImode);
29365 break;
29366 case MODE_SI:
29367 ival = trunc_int_for_mode (ival, SImode);
29368 break;
29369 default:
29370 break;
29372 if (IN_RANGE (ival, -128, 127))
29374 len = 1;
29375 continue;
29378 switch (mode)
29380 case MODE_QI:
29381 len = 1;
29382 break;
29383 case MODE_HI:
29384 len = 2;
29385 break;
29386 case MODE_SI:
29387 len = 4;
29388 break;
29389 /* Immediates for DImode instructions are encoded
29390 as 32bit sign extended values. */
29391 case MODE_DI:
29392 len = 4;
29393 break;
29394 default:
29395 fatal_insn ("unknown insn mode", insn);
29398 return len;
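/* Illustrative sketch, added for exposition and not part of the original
   source.  With SHORTFORM set, "add $100, %eax" fits the sign-extended
   8-bit immediate form and gets length 1, while "add $300, %eax" needs a
   full 32-bit immediate and gets length 4; DImode immediates are likewise
   encoded as 32-bit sign-extended values.  The hypothetical helper below
   restates just the short-form test, ignoring the QImode/HImode cases.  */
#if 0
static int
example_imm_length (HOST_WIDE_INT ival, bool shortform)
{
  if (shortform && IN_RANGE (ival, -128, 127))
    return 1;	/* sign-extended 8-bit immediate */
  return 4;	/* full 32-bit immediate, also used for DImode */
}
#endif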
29401 /* Compute default value for "length_address" attribute. */
29403 ix86_attr_length_address_default (rtx_insn *insn)
29405 int i;
29407 if (get_attr_type (insn) == TYPE_LEA)
29409 rtx set = PATTERN (insn), addr;
29411 if (GET_CODE (set) == PARALLEL)
29412 set = XVECEXP (set, 0, 0);
29414 gcc_assert (GET_CODE (set) == SET);
29416 addr = SET_SRC (set);
29418 return memory_address_length (addr, true);
29421 extract_insn_cached (insn);
29422 for (i = recog_data.n_operands - 1; i >= 0; --i)
29424 rtx op = recog_data.operand[i];
29425 if (MEM_P (op))
29427 constrain_operands_cached (insn, reload_completed);
29428 if (which_alternative != -1)
29430 const char *constraints = recog_data.constraints[i];
29431 int alt = which_alternative;
29433 while (*constraints == '=' || *constraints == '+')
29434 constraints++;
29435 while (alt-- > 0)
29436 while (*constraints++ != ',')
29438 /* Skip ignored operands. */
29439 if (*constraints == 'X')
29440 continue;
29443 int len = memory_address_length (XEXP (op, 0), false);
29445 /* Account for segment prefix for non-default addr spaces. */
29446 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
29447 len++;
29449 return len;
29452 return 0;
29455 /* Compute default value for "length_vex" attribute. It includes
29456 2 or 3 byte VEX prefix and 1 opcode byte. */
29459 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
29460 bool has_vex_w)
29462 int i;
29464 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX.W bit
29465 requires the 3-byte VEX prefix. */
29466 if (!has_0f_opcode || has_vex_w)
29467 return 3 + 1;
29469 /* We can always use 2 byte VEX prefix in 32bit. */
29470 if (!TARGET_64BIT)
29471 return 2 + 1;
29473 extract_insn_cached (insn);
29475 for (i = recog_data.n_operands - 1; i >= 0; --i)
29476 if (REG_P (recog_data.operand[i]))
29478 /* REX.W bit uses 3 byte VEX prefix. */
29479 if (GET_MODE (recog_data.operand[i]) == DImode
29480 && GENERAL_REG_P (recog_data.operand[i]))
29481 return 3 + 1;
29483 else
29485 /* REX.X or REX.B bits use 3 byte VEX prefix. */
29486 if (MEM_P (recog_data.operand[i])
29487 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
29488 return 3 + 1;
29491 return 2 + 1;
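/* Illustrative sketch, added for exposition and not part of the original
   source.  In 64-bit code "vaddps %xmm1, %xmm2, %xmm3" fits the 2-byte VEX
   prefix, giving 2 + 1 = 3 with the opcode byte, whereas an insn needing
   REX.W, REX.X or REX.B -- a DImode general register operand, or an
   extended register used in a memory address -- must use the 3-byte prefix,
   giving 3 + 1 = 4.  */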
29495 static bool
29496 ix86_class_likely_spilled_p (reg_class_t);
29498 /* Return true if the LHS of INSN is a HW function argument register, and set
29499 *is_spilled to true if it is a likely-spilled HW register. */
29500 static bool
29501 insn_is_function_arg (rtx insn, bool* is_spilled)
29503 rtx dst;
29505 if (!NONDEBUG_INSN_P (insn))
29506 return false;
29507 /* Call instructions are not movable; ignore them. */
29508 if (CALL_P (insn))
29509 return false;
29510 insn = PATTERN (insn);
29511 if (GET_CODE (insn) == PARALLEL)
29512 insn = XVECEXP (insn, 0, 0);
29513 if (GET_CODE (insn) != SET)
29514 return false;
29515 dst = SET_DEST (insn);
29516 if (REG_P (dst) && HARD_REGISTER_P (dst)
29517 && ix86_function_arg_regno_p (REGNO (dst)))
29519 /* Is it likely spilled HW register? */
29520 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
29521 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
29522 *is_spilled = true;
29523 return true;
29525 return false;
29528 /* Add output dependencies for a chain of adjacent function arguments, but only
29529 if there is a move to a likely-spilled HW register. Return the first argument
29530 if at least one dependence was added, or NULL otherwise. */
29531 static rtx_insn *
29532 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
29534 rtx_insn *insn;
29535 rtx_insn *last = call;
29536 rtx_insn *first_arg = NULL;
29537 bool is_spilled = false;
29539 head = PREV_INSN (head);
29541 /* Find the argument-passing instruction nearest to the call. */
29542 while (true)
29544 last = PREV_INSN (last);
29545 if (last == head)
29546 return NULL;
29547 if (!NONDEBUG_INSN_P (last))
29548 continue;
29549 if (insn_is_function_arg (last, &is_spilled))
29550 break;
29551 return NULL;
29554 first_arg = last;
29555 while (true)
29557 insn = PREV_INSN (last);
29558 if (!INSN_P (insn))
29559 break;
29560 if (insn == head)
29561 break;
29562 if (!NONDEBUG_INSN_P (insn))
29564 last = insn;
29565 continue;
29567 if (insn_is_function_arg (insn, &is_spilled))
29569 /* Add an output dependence between two function arguments if the chain
29570 of output arguments contains likely-spilled HW registers. */
29571 if (is_spilled)
29572 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29573 first_arg = last = insn;
29575 else
29576 break;
29578 if (!is_spilled)
29579 return NULL;
29580 return first_arg;
29583 /* Add output or anti dependency from insn to first_arg to restrict its code
29584 motion. */
29585 static void
29586 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
29588 rtx set;
29589 rtx tmp;
29591 /* Add anti dependencies for bounds stores. */
29592 if (INSN_P (insn)
29593 && GET_CODE (PATTERN (insn)) == PARALLEL
29594 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
29595 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
29597 add_dependence (first_arg, insn, REG_DEP_ANTI);
29598 return;
29601 set = single_set (insn);
29602 if (!set)
29603 return;
29604 tmp = SET_DEST (set);
29605 if (REG_P (tmp))
29607 /* Add output dependency to the first function argument. */
29608 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
29609 return;
29611 /* Add anti dependency. */
29612 add_dependence (first_arg, insn, REG_DEP_ANTI);
29615 /* Avoid cross-block motion of a function argument by adding a dependency
29616 from the first non-jump instruction in bb. */
29617 static void
29618 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
29620 rtx_insn *insn = BB_END (bb);
29622 while (insn)
29624 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
29626 rtx set = single_set (insn);
29627 if (set)
29629 avoid_func_arg_motion (arg, insn);
29630 return;
29633 if (insn == BB_HEAD (bb))
29634 return;
29635 insn = PREV_INSN (insn);
29639 /* Hook for pre-reload schedule - avoid motion of function arguments
29640 passed in likely spilled HW registers. */
29641 static void
29642 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
29644 rtx_insn *insn;
29645 rtx_insn *first_arg = NULL;
29646 if (reload_completed)
29647 return;
29648 while (head != tail && DEBUG_INSN_P (head))
29649 head = NEXT_INSN (head);
29650 for (insn = tail; insn != head; insn = PREV_INSN (insn))
29651 if (INSN_P (insn) && CALL_P (insn))
29653 first_arg = add_parameter_dependencies (insn, head);
29654 if (first_arg)
29656 /* Add dependee for first argument to predecessors, but only if the
29657 region contains more than one block. */
29658 basic_block bb = BLOCK_FOR_INSN (insn);
29659 int rgn = CONTAINING_RGN (bb->index);
29660 int nr_blks = RGN_NR_BLOCKS (rgn);
29661 /* Skip trivial regions and region head blocks that can have
29662 predecessors outside of region. */
29663 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
29665 edge e;
29666 edge_iterator ei;
29668 /* Regions are SCCs with the exception of selective
29669 scheduling with pipelining of outer blocks enabled.
29670 So also check that immediate predecessors of a non-head
29671 block are in the same region. */
29672 FOR_EACH_EDGE (e, ei, bb->preds)
29674 /* Avoid creating loop-carried dependencies by
29675 using topological ordering in the region. */
29676 if (rgn == CONTAINING_RGN (e->src->index)
29677 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
29678 add_dependee_for_func_arg (first_arg, e->src);
29681 insn = first_arg;
29682 if (insn == head)
29683 break;
29686 else if (first_arg)
29687 avoid_func_arg_motion (first_arg, insn);
29690 /* Hook for pre-reload schedule - set priority of moves from likely spilled
29691 HW registers to maximum, to schedule them as soon as possible. These are
29692 moves from function argument registers at the top of the function entry
29693 and moves from function return value registers after call. */
29694 static int
29695 ix86_adjust_priority (rtx_insn *insn, int priority)
29697 rtx set;
29699 if (reload_completed)
29700 return priority;
29702 if (!NONDEBUG_INSN_P (insn))
29703 return priority;
29705 set = single_set (insn);
29706 if (set)
29708 rtx tmp = SET_SRC (set);
29709 if (REG_P (tmp)
29710 && HARD_REGISTER_P (tmp)
29711 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
29712 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
29713 return current_sched_info->sched_max_insns_priority;
29716 return priority;
29719 /* Prepare for scheduling pass. */
29720 static void
29721 ix86_sched_init_global (FILE *, int, int)
29723 /* Install scheduling hooks for current CPU. Some of these hooks are used
29724 in time-critical parts of the scheduler, so we only set them up when
29725 they are actually used. */
29726 switch (ix86_tune)
29728 case PROCESSOR_CORE2:
29729 case PROCESSOR_NEHALEM:
29730 case PROCESSOR_SANDYBRIDGE:
29731 case PROCESSOR_HASWELL:
29732 case PROCESSOR_GENERIC:
29733 /* Do not perform multipass scheduling for pre-reload schedule
29734 to save compile time. */
29735 if (reload_completed)
29737 ix86_core2i7_init_hooks ();
29738 break;
29740 /* Fall through. */
29741 default:
29742 targetm.sched.dfa_post_advance_cycle = NULL;
29743 targetm.sched.first_cycle_multipass_init = NULL;
29744 targetm.sched.first_cycle_multipass_begin = NULL;
29745 targetm.sched.first_cycle_multipass_issue = NULL;
29746 targetm.sched.first_cycle_multipass_backtrack = NULL;
29747 targetm.sched.first_cycle_multipass_end = NULL;
29748 targetm.sched.first_cycle_multipass_fini = NULL;
29749 break;
29754 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
29756 static HOST_WIDE_INT
29757 ix86_static_rtx_alignment (machine_mode mode)
29759 if (mode == DFmode)
29760 return 64;
29761 if (ALIGN_MODE_128 (mode))
29762 return MAX (128, GET_MODE_ALIGNMENT (mode));
29763 return GET_MODE_ALIGNMENT (mode);
29766 /* Implement TARGET_CONSTANT_ALIGNMENT. */
29768 static HOST_WIDE_INT
29769 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
29771 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
29772 || TREE_CODE (exp) == INTEGER_CST)
29774 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
29775 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
29776 return MAX (mode_align, align);
29778 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
29779 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
29780 return BITS_PER_WORD;
29782 return align;
29785 /* Implement TARGET_EMPTY_RECORD_P. */
29787 static bool
29788 ix86_is_empty_record (const_tree type)
29790 if (!TARGET_64BIT)
29791 return false;
29792 return default_is_empty_record (type);
29795 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
29797 static void
29798 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
29800 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
29802 if (!cum->warn_empty)
29803 return;
29805 if (!TYPE_EMPTY_P (type))
29806 return;
29808 const_tree ctx = get_ultimate_context (cum->decl);
29809 if (ctx != NULL_TREE
29810 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
29811 return;
29813 /* If the actual size of the type is zero, then there is no change
29814 in how objects of this size are passed. */
29815 if (int_size_in_bytes (type) == 0)
29816 return;
29818 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
29819 "changes in -fabi-version=12 (GCC 8)", type);
29821 /* Only warn once. */
29822 cum->warn_empty = false;
29825 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
29826 the data type, and ALIGN is the alignment that the object would
29827 ordinarily have. */
29829 static int
29830 iamcu_alignment (tree type, int align)
29832 machine_mode mode;
29834 if (align < 32 || TYPE_USER_ALIGN (type))
29835 return align;
29837 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
29838 bytes. */
29839 mode = TYPE_MODE (strip_array_types (type));
29840 switch (GET_MODE_CLASS (mode))
29842 case MODE_INT:
29843 case MODE_COMPLEX_INT:
29844 case MODE_COMPLEX_FLOAT:
29845 case MODE_FLOAT:
29846 case MODE_DECIMAL_FLOAT:
29847 return 32;
29848 default:
29849 return align;
29853 /* Compute the alignment for a static variable.
29854 TYPE is the data type, and ALIGN is the alignment that
29855 the object would ordinarily have. The value of this function is used
29856 instead of that alignment to align the object. */
29859 ix86_data_alignment (tree type, int align, bool opt)
29861 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
29862 for symbols from other compilation units or symbols that don't need
29863 to bind locally. In order to preserve some ABI compatibility with
29864 those compilers, ensure we don't decrease alignment from what we
29865 used to assume. */
29867 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
29869 /* A data structure equal to or greater than the size of a cache line
29870 (64 bytes in the Pentium 4 and other recent Intel processors, including
29871 processors based on Intel Core microarchitecture) should be aligned
29872 so that its base address is a multiple of the cache line size. */
29874 int max_align
29875 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
29877 if (max_align < BITS_PER_WORD)
29878 max_align = BITS_PER_WORD;
29880 switch (ix86_align_data_type)
29882 case ix86_align_data_type_abi: opt = false; break;
29883 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
29884 case ix86_align_data_type_cacheline: break;
29887 if (TARGET_IAMCU)
29888 align = iamcu_alignment (type, align);
29890 if (opt
29891 && AGGREGATE_TYPE_P (type)
29892 && TYPE_SIZE (type)
29893 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
29895 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
29896 && align < max_align_compat)
29897 align = max_align_compat;
29898 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
29899 && align < max_align)
29900 align = max_align;
29903 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
29904 to a 16-byte boundary. */
29905 if (TARGET_64BIT)
29907 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
29908 && TYPE_SIZE (type)
29909 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29910 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29911 && align < 128)
29912 return 128;
29915 if (!opt)
29916 return align;
29918 if (TREE_CODE (type) == ARRAY_TYPE)
29920 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29921 return 64;
29922 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29923 return 128;
29925 else if (TREE_CODE (type) == COMPLEX_TYPE)
29928 if (TYPE_MODE (type) == DCmode && align < 64)
29929 return 64;
29930 if ((TYPE_MODE (type) == XCmode
29931 || TYPE_MODE (type) == TCmode) && align < 128)
29932 return 128;
29934 else if ((TREE_CODE (type) == RECORD_TYPE
29935 || TREE_CODE (type) == UNION_TYPE
29936 || TREE_CODE (type) == QUAL_UNION_TYPE)
29937 && TYPE_FIELDS (type))
29939 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29940 return 64;
29941 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29942 return 128;
29944 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29945 || TREE_CODE (type) == INTEGER_TYPE)
29947 if (TYPE_MODE (type) == DFmode && align < 64)
29948 return 64;
29949 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29950 return 128;
29953 return align;
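/* Illustrative examples, added for exposition and not part of the original
   source, assuming a 64-byte prefetch block.  A 100-byte static char array
   is first raised to 256-bit alignment for compatibility with older GCCs
   and then to 512 bits (one cache line), while on x86-64 a 24-byte array
   whose natural alignment is below 128 bits is raised to the ABI-mandated
   128 bits.  */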
29956 /* Compute the alignment for a local variable or a stack slot. EXP is
29957 the data type or decl itself, MODE is the widest mode available and
29958 ALIGN is the alignment that the object would ordinarily have. The
29959 value of this macro is used instead of that alignment to align the
29960 object. */
29962 unsigned int
29963 ix86_local_alignment (tree exp, machine_mode mode,
29964 unsigned int align)
29966 tree type, decl;
29968 if (exp && DECL_P (exp))
29970 type = TREE_TYPE (exp);
29971 decl = exp;
29973 else
29975 type = exp;
29976 decl = NULL;
29979 /* Don't do dynamic stack realignment for long long objects with
29980 -mpreferred-stack-boundary=2. */
29981 if (!TARGET_64BIT
29982 && align == 64
29983 && ix86_preferred_stack_boundary < 64
29984 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
29985 && (!type || !TYPE_USER_ALIGN (type))
29986 && (!decl || !DECL_USER_ALIGN (decl)))
29987 align = 32;
29989 /* If TYPE is NULL, we are allocating a stack slot for caller-save
29990 register in MODE. We will return the largest alignment of XF
29991 and DF. */
29992 if (!type)
29994 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
29995 align = GET_MODE_ALIGNMENT (DFmode);
29996 return align;
29999 /* Don't increase alignment for Intel MCU psABI. */
30000 if (TARGET_IAMCU)
30001 return align;
30003 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
30004 to a 16-byte boundary. The exact wording is:
30006 An array uses the same alignment as its elements, except that a local or
30007 global array variable of length at least 16 bytes or
30008 a C99 variable-length array variable always has alignment of at least 16 bytes.
30010 This was added to allow use of aligned SSE instructions on arrays. This
30011 rule is meant for static storage (where the compiler cannot do the analysis
30012 by itself). We follow it for automatic variables only when convenient.
30013 We fully control everything in the function being compiled, and functions from
30014 other units cannot rely on the alignment.
30016 Exclude the va_list type. It is the common case of a local array where
30017 we cannot benefit from the alignment.
30019 TODO: Probably one should optimize for size only when the variable does not escape. */
30020 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
30021 && TARGET_SSE)
30023 if (AGGREGATE_TYPE_P (type)
30024 && (va_list_type_node == NULL_TREE
30025 || (TYPE_MAIN_VARIANT (type)
30026 != TYPE_MAIN_VARIANT (va_list_type_node)))
30027 && TYPE_SIZE (type)
30028 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
30029 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
30030 && align < 128)
30031 return 128;
30033 if (TREE_CODE (type) == ARRAY_TYPE)
30035 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
30036 return 64;
30037 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
30038 return 128;
30040 else if (TREE_CODE (type) == COMPLEX_TYPE)
30042 if (TYPE_MODE (type) == DCmode && align < 64)
30043 return 64;
30044 if ((TYPE_MODE (type) == XCmode
30045 || TYPE_MODE (type) == TCmode) && align < 128)
30046 return 128;
30048 else if ((TREE_CODE (type) == RECORD_TYPE
30049 || TREE_CODE (type) == UNION_TYPE
30050 || TREE_CODE (type) == QUAL_UNION_TYPE)
30051 && TYPE_FIELDS (type))
30053 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
30054 return 64;
30055 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
30056 return 128;
30058 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
30059 || TREE_CODE (type) == INTEGER_TYPE)
30062 if (TYPE_MODE (type) == DFmode && align < 64)
30063 return 64;
30064 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
30065 return 128;
30067 return align;
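/* Illustrative example, added for exposition and not part of the original
   source.  On x86-64 with SSE enabled and when optimizing for speed, a
   local "double buf[4]" (32 bytes) is given 128-bit stack alignment by the
   aggregate rule above, while a local va_list is deliberately excluded
   from that bump.  */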
30070 /* Compute the minimum required alignment for dynamic stack realignment
30071 purposes for a local variable, parameter or a stack slot. EXP is
30072 the data type or decl itself, MODE is its mode and ALIGN is the
30073 alignment that the object would ordinarily have. */
30075 unsigned int
30076 ix86_minimum_alignment (tree exp, machine_mode mode,
30077 unsigned int align)
30079 tree type, decl;
30081 if (exp && DECL_P (exp))
30083 type = TREE_TYPE (exp);
30084 decl = exp;
30086 else
30088 type = exp;
30089 decl = NULL;
30092 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
30093 return align;
30095 /* Don't do dynamic stack realignment for long long objects with
30096 -mpreferred-stack-boundary=2. */
30097 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
30098 && (!type || !TYPE_USER_ALIGN (type))
30099 && (!decl || !DECL_USER_ALIGN (decl)))
30101 gcc_checking_assert (!TARGET_STV);
30102 return 32;
30105 return align;
30108 /* Find a location for the static chain incoming to a nested function.
30109 This is a register, unless all free registers are used by arguments. */
30111 static rtx
30112 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
30114 unsigned regno;
30116 if (TARGET_64BIT)
30118 /* We always use R10 in 64-bit mode. */
30119 regno = R10_REG;
30121 else
30123 const_tree fntype, fndecl;
30124 unsigned int ccvt;
30126 /* By default in 32-bit mode we use ECX to pass the static chain. */
30127 regno = CX_REG;
30129 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
30131 fntype = TREE_TYPE (fndecl_or_type);
30132 fndecl = fndecl_or_type;
30134 else
30136 fntype = fndecl_or_type;
30137 fndecl = NULL;
30140 ccvt = ix86_get_callcvt (fntype);
30141 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
30143 /* Fastcall functions use ecx/edx for arguments, which leaves
30144 us with EAX for the static chain.
30145 Thiscall functions use ecx for arguments, which also
30146 leaves us with EAX for the static chain. */
30147 regno = AX_REG;
30149 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
30151 /* Thiscall functions use ecx for arguments, which leaves
30152 us with EAX and EDX for the static chain.
30153 For ABI compatibility we use EAX. */
30154 regno = AX_REG;
30156 else if (ix86_function_regparm (fntype, fndecl) == 3)
30158 /* For regparm 3, we have no free call-clobbered registers in
30159 which to store the static chain. In order to implement this,
30160 we have the trampoline push the static chain to the stack.
30161 However, we can't push a value below the return address when
30162 we call the nested function directly, so we have to use an
30163 alternate entry point. For this we use ESI, and have the
30164 alternate entry point push ESI, so that things appear the
30165 same once we're executing the nested function. */
30166 if (incoming_p)
30168 if (fndecl == current_function_decl
30169 && !ix86_static_chain_on_stack)
30171 gcc_assert (!reload_completed);
30172 ix86_static_chain_on_stack = true;
30174 return gen_frame_mem (SImode,
30175 plus_constant (Pmode,
30176 arg_pointer_rtx, -8));
30178 regno = SI_REG;
30182 return gen_rtx_REG (Pmode, regno);
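/* Illustrative summary, added for exposition and not part of the original
   source, of the register choices made above: 64-bit code always uses
   %r10; 32-bit cdecl code uses %ecx; fastcall and thiscall functions,
   whose arguments occupy %ecx (and %edx), fall back to %eax; and for
   regparm(3) functions the incoming static chain lives on the stack, with
   %esi used by callers and pushed at the alternate entry point.  */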
30185 /* Emit RTL insns to initialize the variable parts of a trampoline.
30186 FNDECL is the decl of the target address; M_TRAMP is a MEM for
30187 the trampoline, and CHAIN_VALUE is an RTX for the static chain
30188 to be passed to the target function. */
30190 static void
30191 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
30193 rtx mem, fnaddr;
30194 int opcode;
30195 int offset = 0;
30196 bool need_endbr = (flag_cf_protection & CF_BRANCH);
30198 fnaddr = XEXP (DECL_RTL (fndecl), 0);
30200 if (TARGET_64BIT)
30202 int size;
30204 if (need_endbr)
30206 /* Insert ENDBR64. */
30207 mem = adjust_address (m_tramp, SImode, offset);
30208 emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode));
30209 offset += 4;
30212 /* Load the function address to r11. Try to load address using
30213 the shorter movl instead of movabs. We may want to support
30214 movq for kernel mode, but kernel does not use trampolines at
30215 the moment. FNADDR is a 32bit address and may not be in
30216 DImode when ptr_mode == SImode. Always use movl in this
30217 case. */
30218 if (ptr_mode == SImode
30219 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
30221 fnaddr = copy_addr_to_reg (fnaddr);
30223 mem = adjust_address (m_tramp, HImode, offset);
30224 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
30226 mem = adjust_address (m_tramp, SImode, offset + 2);
30227 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
30228 offset += 6;
30230 else
30232 mem = adjust_address (m_tramp, HImode, offset);
30233 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
30235 mem = adjust_address (m_tramp, DImode, offset + 2);
30236 emit_move_insn (mem, fnaddr);
30237 offset += 10;
30240 /* Load static chain using movabs to r10. Use the shorter movl
30241 instead of movabs when ptr_mode == SImode. */
30242 if (ptr_mode == SImode)
30244 opcode = 0xba41;
30245 size = 6;
30247 else
30249 opcode = 0xba49;
30250 size = 10;
30253 mem = adjust_address (m_tramp, HImode, offset);
30254 emit_move_insn (mem, gen_int_mode (opcode, HImode));
30256 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
30257 emit_move_insn (mem, chain_value);
30258 offset += size;
30260 /* Jump to r11; the last (unused) byte is a nop, only there to
30261 pad the write out to a single 32-bit store. */
30262 mem = adjust_address (m_tramp, SImode, offset);
30263 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
30264 offset += 4;
30266 else
30268 rtx disp, chain;
30270 /* Depending on the static chain location, either load a register
30271 with a constant, or push the constant to the stack. All of the
30272 instructions are the same size. */
30273 chain = ix86_static_chain (fndecl, true);
30274 if (REG_P (chain))
30276 switch (REGNO (chain))
30278 case AX_REG:
30279 opcode = 0xb8; break;
30280 case CX_REG:
30281 opcode = 0xb9; break;
30282 default:
30283 gcc_unreachable ();
30286 else
30287 opcode = 0x68;
30289 if (need_endbr)
30291 /* Insert ENDBR32. */
30292 mem = adjust_address (m_tramp, SImode, offset);
30293 emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode));
30294 offset += 4;
30297 mem = adjust_address (m_tramp, QImode, offset);
30298 emit_move_insn (mem, gen_int_mode (opcode, QImode));
30300 mem = adjust_address (m_tramp, SImode, offset + 1);
30301 emit_move_insn (mem, chain_value);
30302 offset += 5;
30304 mem = adjust_address (m_tramp, QImode, offset);
30305 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
30307 mem = adjust_address (m_tramp, SImode, offset + 1);
30309 /* Compute offset from the end of the jmp to the target function.
30310 In the case in which the trampoline stores the static chain on
30311 the stack, we need to skip the first insn which pushes the
30312 (call-saved) register static chain; this push is 1 byte. */
30313 offset += 5;
30314 disp = expand_binop (SImode, sub_optab, fnaddr,
30315 plus_constant (Pmode, XEXP (m_tramp, 0),
30316 offset - (MEM_P (chain) ? 1 : 0)),
30317 NULL_RTX, 1, OPTAB_DIRECT);
30318 emit_move_insn (mem, disp);
30321 gcc_assert (offset <= TRAMPOLINE_SIZE);
30323 #ifdef HAVE_ENABLE_EXECUTE_STACK
30324 #ifdef CHECK_EXECUTE_STACK_ENABLED
30325 if (CHECK_EXECUTE_STACK_ENABLED)
30326 #endif
30327 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
30328 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
30329 #endif
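/* Illustrative sketch, added for exposition and not part of the original
   source.  On x86-64 the trampoline written above executes roughly:

	[endbr64]			# only with -fcf-protection=branch
	movabs	$<fnaddr>, %r11		# 49 bb imm64, or movl (41 bb imm32)
	movabs	$<chain>, %r10		# 49 ba imm64, or movl (41 ba imm32)
	jmp	*%r11			# 49 ff e3, padded with a 90 nop

   The 32-bit variant instead loads or pushes the static chain (b8/b9 mov,
   or 68 push for the regparm(3) case) and ends with an e9 rel32 jump to
   the nested function.  */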
30332 static bool
30333 ix86_allocate_stack_slots_for_args (void)
30335 /* Naked functions should not allocate stack slots for arguments. */
30336 return !ix86_function_naked (current_function_decl);
30339 static bool
30340 ix86_warn_func_return (tree decl)
30342 /* Naked functions are implemented entirely in assembly, including the
30343 return sequence, so suppress warnings about this. */
30344 return !ix86_function_naked (decl);
30347 /* The following file contains several enumerations and data structures
30348 built from the definitions in i386-builtin-types.def. */
30350 #include "i386-builtin-types.inc"
30352 /* Table for the ix86 builtin non-function types. */
30353 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
30355 /* Retrieve an element from the above table, building some of
30356 the types lazily. */
30358 static tree
30359 ix86_get_builtin_type (enum ix86_builtin_type tcode)
30361 unsigned int index;
30362 tree type, itype;
30364 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
30366 type = ix86_builtin_type_tab[(int) tcode];
30367 if (type != NULL)
30368 return type;
30370 gcc_assert (tcode > IX86_BT_LAST_PRIM);
30371 if (tcode <= IX86_BT_LAST_VECT)
30373 machine_mode mode;
30375 index = tcode - IX86_BT_LAST_PRIM - 1;
30376 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
30377 mode = ix86_builtin_type_vect_mode[index];
30379 type = build_vector_type_for_mode (itype, mode);
30381 else
30383 int quals;
30385 index = tcode - IX86_BT_LAST_VECT - 1;
30386 if (tcode <= IX86_BT_LAST_PTR)
30387 quals = TYPE_UNQUALIFIED;
30388 else
30389 quals = TYPE_QUAL_CONST;
30391 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
30392 if (quals != TYPE_UNQUALIFIED)
30393 itype = build_qualified_type (itype, quals);
30395 type = build_pointer_type (itype);
30398 ix86_builtin_type_tab[(int) tcode] = type;
30399 return type;
30402 /* Table for the ix86 builtin function types. */
30403 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
30405 /* Retrieve an element from the above table, building some of
30406 the types lazily. */
30408 static tree
30409 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
30411 tree type;
30413 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
30415 type = ix86_builtin_func_type_tab[(int) tcode];
30416 if (type != NULL)
30417 return type;
30419 if (tcode <= IX86_BT_LAST_FUNC)
30421 unsigned start = ix86_builtin_func_start[(int) tcode];
30422 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
30423 tree rtype, atype, args = void_list_node;
30424 unsigned i;
30426 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
30427 for (i = after - 1; i > start; --i)
30429 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
30430 args = tree_cons (NULL, atype, args);
30433 type = build_function_type (rtype, args);
30435 else
30437 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
30438 enum ix86_builtin_func_type icode;
30440 icode = ix86_builtin_func_alias_base[index];
30441 type = ix86_get_builtin_func_type (icode);
30444 ix86_builtin_func_type_tab[(int) tcode] = type;
30445 return type;
30449 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
30450 bdesc_* arrays below should come first, then builtins for each bdesc_*
30451 array in ascending order, so that we can use direct array accesses. */
30452 enum ix86_builtins
30454 IX86_BUILTIN_MASKMOVQ,
30455 IX86_BUILTIN_LDMXCSR,
30456 IX86_BUILTIN_STMXCSR,
30457 IX86_BUILTIN_MASKMOVDQU,
30458 IX86_BUILTIN_PSLLDQ128,
30459 IX86_BUILTIN_CLFLUSH,
30460 IX86_BUILTIN_MONITOR,
30461 IX86_BUILTIN_MWAIT,
30462 IX86_BUILTIN_UMONITOR,
30463 IX86_BUILTIN_UMWAIT,
30464 IX86_BUILTIN_TPAUSE,
30465 IX86_BUILTIN_CLZERO,
30466 IX86_BUILTIN_CLDEMOTE,
30467 IX86_BUILTIN_VEC_INIT_V2SI,
30468 IX86_BUILTIN_VEC_INIT_V4HI,
30469 IX86_BUILTIN_VEC_INIT_V8QI,
30470 IX86_BUILTIN_VEC_EXT_V2DF,
30471 IX86_BUILTIN_VEC_EXT_V2DI,
30472 IX86_BUILTIN_VEC_EXT_V4SF,
30473 IX86_BUILTIN_VEC_EXT_V4SI,
30474 IX86_BUILTIN_VEC_EXT_V8HI,
30475 IX86_BUILTIN_VEC_EXT_V2SI,
30476 IX86_BUILTIN_VEC_EXT_V4HI,
30477 IX86_BUILTIN_VEC_EXT_V16QI,
30478 IX86_BUILTIN_VEC_SET_V2DI,
30479 IX86_BUILTIN_VEC_SET_V4SF,
30480 IX86_BUILTIN_VEC_SET_V4SI,
30481 IX86_BUILTIN_VEC_SET_V8HI,
30482 IX86_BUILTIN_VEC_SET_V4HI,
30483 IX86_BUILTIN_VEC_SET_V16QI,
30484 IX86_BUILTIN_GATHERSIV2DF,
30485 IX86_BUILTIN_GATHERSIV4DF,
30486 IX86_BUILTIN_GATHERDIV2DF,
30487 IX86_BUILTIN_GATHERDIV4DF,
30488 IX86_BUILTIN_GATHERSIV4SF,
30489 IX86_BUILTIN_GATHERSIV8SF,
30490 IX86_BUILTIN_GATHERDIV4SF,
30491 IX86_BUILTIN_GATHERDIV8SF,
30492 IX86_BUILTIN_GATHERSIV2DI,
30493 IX86_BUILTIN_GATHERSIV4DI,
30494 IX86_BUILTIN_GATHERDIV2DI,
30495 IX86_BUILTIN_GATHERDIV4DI,
30496 IX86_BUILTIN_GATHERSIV4SI,
30497 IX86_BUILTIN_GATHERSIV8SI,
30498 IX86_BUILTIN_GATHERDIV4SI,
30499 IX86_BUILTIN_GATHERDIV8SI,
30500 IX86_BUILTIN_VFMSUBSD3_MASK3,
30501 IX86_BUILTIN_VFMSUBSS3_MASK3,
30502 IX86_BUILTIN_GATHER3SIV8SF,
30503 IX86_BUILTIN_GATHER3SIV4SF,
30504 IX86_BUILTIN_GATHER3SIV4DF,
30505 IX86_BUILTIN_GATHER3SIV2DF,
30506 IX86_BUILTIN_GATHER3DIV8SF,
30507 IX86_BUILTIN_GATHER3DIV4SF,
30508 IX86_BUILTIN_GATHER3DIV4DF,
30509 IX86_BUILTIN_GATHER3DIV2DF,
30510 IX86_BUILTIN_GATHER3SIV8SI,
30511 IX86_BUILTIN_GATHER3SIV4SI,
30512 IX86_BUILTIN_GATHER3SIV4DI,
30513 IX86_BUILTIN_GATHER3SIV2DI,
30514 IX86_BUILTIN_GATHER3DIV8SI,
30515 IX86_BUILTIN_GATHER3DIV4SI,
30516 IX86_BUILTIN_GATHER3DIV4DI,
30517 IX86_BUILTIN_GATHER3DIV2DI,
30518 IX86_BUILTIN_SCATTERSIV8SF,
30519 IX86_BUILTIN_SCATTERSIV4SF,
30520 IX86_BUILTIN_SCATTERSIV4DF,
30521 IX86_BUILTIN_SCATTERSIV2DF,
30522 IX86_BUILTIN_SCATTERDIV8SF,
30523 IX86_BUILTIN_SCATTERDIV4SF,
30524 IX86_BUILTIN_SCATTERDIV4DF,
30525 IX86_BUILTIN_SCATTERDIV2DF,
30526 IX86_BUILTIN_SCATTERSIV8SI,
30527 IX86_BUILTIN_SCATTERSIV4SI,
30528 IX86_BUILTIN_SCATTERSIV4DI,
30529 IX86_BUILTIN_SCATTERSIV2DI,
30530 IX86_BUILTIN_SCATTERDIV8SI,
30531 IX86_BUILTIN_SCATTERDIV4SI,
30532 IX86_BUILTIN_SCATTERDIV4DI,
30533 IX86_BUILTIN_SCATTERDIV2DI,
30534 /* Alternate 4 and 8 element gather/scatter for the vectorizer
30535 where all operands are 32-byte or 64-byte wide respectively. */
30536 IX86_BUILTIN_GATHERALTSIV4DF,
30537 IX86_BUILTIN_GATHERALTDIV8SF,
30538 IX86_BUILTIN_GATHERALTSIV4DI,
30539 IX86_BUILTIN_GATHERALTDIV8SI,
30540 IX86_BUILTIN_GATHER3ALTDIV16SF,
30541 IX86_BUILTIN_GATHER3ALTDIV16SI,
30542 IX86_BUILTIN_GATHER3ALTSIV4DF,
30543 IX86_BUILTIN_GATHER3ALTDIV8SF,
30544 IX86_BUILTIN_GATHER3ALTSIV4DI,
30545 IX86_BUILTIN_GATHER3ALTDIV8SI,
30546 IX86_BUILTIN_GATHER3ALTSIV8DF,
30547 IX86_BUILTIN_GATHER3ALTSIV8DI,
30548 IX86_BUILTIN_GATHER3DIV16SF,
30549 IX86_BUILTIN_GATHER3DIV16SI,
30550 IX86_BUILTIN_GATHER3DIV8DF,
30551 IX86_BUILTIN_GATHER3DIV8DI,
30552 IX86_BUILTIN_GATHER3SIV16SF,
30553 IX86_BUILTIN_GATHER3SIV16SI,
30554 IX86_BUILTIN_GATHER3SIV8DF,
30555 IX86_BUILTIN_GATHER3SIV8DI,
30556 IX86_BUILTIN_SCATTERALTSIV8DF,
30557 IX86_BUILTIN_SCATTERALTDIV16SF,
30558 IX86_BUILTIN_SCATTERALTSIV8DI,
30559 IX86_BUILTIN_SCATTERALTDIV16SI,
30560 IX86_BUILTIN_SCATTERDIV16SF,
30561 IX86_BUILTIN_SCATTERDIV16SI,
30562 IX86_BUILTIN_SCATTERDIV8DF,
30563 IX86_BUILTIN_SCATTERDIV8DI,
30564 IX86_BUILTIN_SCATTERSIV16SF,
30565 IX86_BUILTIN_SCATTERSIV16SI,
30566 IX86_BUILTIN_SCATTERSIV8DF,
30567 IX86_BUILTIN_SCATTERSIV8DI,
30568 IX86_BUILTIN_GATHERPFQPD,
30569 IX86_BUILTIN_GATHERPFDPS,
30570 IX86_BUILTIN_GATHERPFDPD,
30571 IX86_BUILTIN_GATHERPFQPS,
30572 IX86_BUILTIN_SCATTERPFDPD,
30573 IX86_BUILTIN_SCATTERPFDPS,
30574 IX86_BUILTIN_SCATTERPFQPD,
30575 IX86_BUILTIN_SCATTERPFQPS,
30576 IX86_BUILTIN_CLWB,
30577 IX86_BUILTIN_CLFLUSHOPT,
30578 IX86_BUILTIN_INFQ,
30579 IX86_BUILTIN_HUGE_VALQ,
30580 IX86_BUILTIN_NANQ,
30581 IX86_BUILTIN_NANSQ,
30582 IX86_BUILTIN_XABORT,
30583 IX86_BUILTIN_ADDCARRYX32,
30584 IX86_BUILTIN_ADDCARRYX64,
30585 IX86_BUILTIN_SBB32,
30586 IX86_BUILTIN_SBB64,
30587 IX86_BUILTIN_RDRAND16_STEP,
30588 IX86_BUILTIN_RDRAND32_STEP,
30589 IX86_BUILTIN_RDRAND64_STEP,
30590 IX86_BUILTIN_RDSEED16_STEP,
30591 IX86_BUILTIN_RDSEED32_STEP,
30592 IX86_BUILTIN_RDSEED64_STEP,
30593 IX86_BUILTIN_MONITORX,
30594 IX86_BUILTIN_MWAITX,
30595 IX86_BUILTIN_CFSTRING,
30596 IX86_BUILTIN_CPU_INIT,
30597 IX86_BUILTIN_CPU_IS,
30598 IX86_BUILTIN_CPU_SUPPORTS,
30599 IX86_BUILTIN_READ_FLAGS,
30600 IX86_BUILTIN_WRITE_FLAGS,
30602 /* All the remaining builtins are tracked in bdesc_* arrays in
30603 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
30604 this point. */
30605 #define BDESC(mask, icode, name, code, comparison, flag) \
30606 code,
30607 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30608 code, \
30609 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
30610 #define BDESC_END(kind, next_kind)
30612 #include "i386-builtin.def"
30614 #undef BDESC
30615 #undef BDESC_FIRST
30616 #undef BDESC_END
30618 IX86_BUILTIN_MAX,
30620 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
30622 /* Now just the aliases for bdesc_* start/end. */
30623 #define BDESC(mask, icode, name, code, comparison, flag)
30624 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
30625 #define BDESC_END(kind, next_kind) \
30626 IX86_BUILTIN__BDESC_##kind##_LAST \
30627 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
30629 #include "i386-builtin.def"
30631 #undef BDESC
30632 #undef BDESC_FIRST
30633 #undef BDESC_END
30635 /* Just to make sure there is no comma after the last enumerator. */
30636 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
30639 /* Table for the ix86 builtin decls. */
30640 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
30642 /* Table of all of the builtin functions that are possible with different ISAs
30643 but are waiting to be built until a function is declared to use that
30644 ISA. */
30645 struct builtin_isa {
30646 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
30647 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
30648 const char *name; /* function name */
30649 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
30650 unsigned char const_p:1; /* true if the declaration is constant */
30651 unsigned char pure_p:1; /* true if the declaration has pure attribute */
30652 bool leaf_p; /* true if the declaration has leaf attribute */
30653 bool nothrow_p; /* true if the declaration has nothrow attribute */
30654 bool set_and_not_built_p;
30657 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
30659 /* Bits that can still enable any inclusion of a builtin. */
30660 static HOST_WIDE_INT deferred_isa_values = 0;
30661 static HOST_WIDE_INT deferred_isa_values2 = 0;
30663 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
30664 of which isa_flags to use in the ix86_builtins_isa array. Stores the
30665 function decl in the ix86_builtins array. Returns the function decl or
30666 NULL_TREE, if the builtin was not added.
30668 If the front end has a special hook for builtin functions, delay adding
30669 builtin functions that aren't in the current ISA until the ISA is changed
30670 with function-specific optimization. Doing so can save about 300K for the
30671 default compiler. When the builtin is expanded, check at that time whether
30672 it is valid.
30674 If the front end doesn't have a special hook, record all builtins, even
30675 those not in the current ISA, in case the user uses
30676 function-specific options for a different ISA, so that we don't get scope
30677 errors if a builtin is added in the middle of a function scope. */
30679 static inline tree
30680 def_builtin (HOST_WIDE_INT mask, const char *name,
30681 enum ix86_builtin_func_type tcode,
30682 enum ix86_builtins code)
30684 tree decl = NULL_TREE;
30686 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
30688 ix86_builtins_isa[(int) code].isa = mask;
30690 mask &= ~OPTION_MASK_ISA_64BIT;
30692 /* Filter out the masks most often ored together with others. */
30693 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
30694 && mask != OPTION_MASK_ISA_AVX512VL)
30695 mask &= ~OPTION_MASK_ISA_AVX512VL;
30696 if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
30697 && mask != OPTION_MASK_ISA_AVX512BW)
30698 mask &= ~OPTION_MASK_ISA_AVX512BW;
30700 if (mask == 0
30701 || (mask & ix86_isa_flags) != 0
30702 || (lang_hooks.builtin_function
30703 == lang_hooks.builtin_function_ext_scope))
30705 tree type = ix86_get_builtin_func_type (tcode);
30706 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30707 NULL, NULL_TREE);
30708 ix86_builtins[(int) code] = decl;
30709 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30711 else
30713 /* Just a MASK where set_and_not_built_p == true can potentially
30714 include a builtin. */
30715 deferred_isa_values |= mask;
30716 ix86_builtins[(int) code] = NULL_TREE;
30717 ix86_builtins_isa[(int) code].tcode = tcode;
30718 ix86_builtins_isa[(int) code].name = name;
30719 ix86_builtins_isa[(int) code].leaf_p = false;
30720 ix86_builtins_isa[(int) code].nothrow_p = false;
30721 ix86_builtins_isa[(int) code].const_p = false;
30722 ix86_builtins_isa[(int) code].pure_p = false;
30723 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30727 return decl;
30730 /* Like def_builtin, but also marks the function decl "const". */
30732 static inline tree
30733 def_builtin_const (HOST_WIDE_INT mask, const char *name,
30734 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30736 tree decl = def_builtin (mask, name, tcode, code);
30737 if (decl)
30738 TREE_READONLY (decl) = 1;
30739 else
30740 ix86_builtins_isa[(int) code].const_p = true;
30742 return decl;
30745 /* Like def_builtin, but also marks the function decl "pure". */
30747 static inline tree
30748 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
30749 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30751 tree decl = def_builtin (mask, name, tcode, code);
30752 if (decl)
30753 DECL_PURE_P (decl) = 1;
30754 else
30755 ix86_builtins_isa[(int) code].pure_p = true;
30757 return decl;
30760 /* Like def_builtin, but for additional isa2 flags. */
30762 static inline tree
30763 def_builtin2 (HOST_WIDE_INT mask, const char *name,
30764 enum ix86_builtin_func_type tcode,
30765 enum ix86_builtins code)
30767 tree decl = NULL_TREE;
30769 ix86_builtins_isa[(int) code].isa2 = mask;
30771 if (mask == 0
30772 || (mask & ix86_isa_flags2) != 0
30773 || (lang_hooks.builtin_function
30774 == lang_hooks.builtin_function_ext_scope))
30777 tree type = ix86_get_builtin_func_type (tcode);
30778 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30779 NULL, NULL_TREE);
30780 ix86_builtins[(int) code] = decl;
30781 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
30783 else
30785 /* Only a MASK whose builtin has set_and_not_built_p == true
30786 can still cause that builtin to be included later. */
30787 deferred_isa_values2 |= mask;
30788 ix86_builtins[(int) code] = NULL_TREE;
30789 ix86_builtins_isa[(int) code].tcode = tcode;
30790 ix86_builtins_isa[(int) code].name = name;
30791 ix86_builtins_isa[(int) code].leaf_p = false;
30792 ix86_builtins_isa[(int) code].nothrow_p = false;
30793 ix86_builtins_isa[(int) code].const_p = false;
30794 ix86_builtins_isa[(int) code].pure_p = false;
30795 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
30798 return decl;
30801 /* Like def_builtin2, but also marks the function decl "const". */
30803 static inline tree
30804 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
30805 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30807 tree decl = def_builtin2 (mask, name, tcode, code);
30808 if (decl)
30809 TREE_READONLY (decl) = 1;
30810 else
30811 ix86_builtins_isa[(int) code].const_p = true;
30813 return decl;
30816 /* Like def_builtin2, but also marks the function decl "pure". */
30818 static inline tree
30819 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
30820 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
30822 tree decl = def_builtin2 (mask, name, tcode, code);
30823 if (decl)
30824 DECL_PURE_P (decl) = 1;
30825 else
30826 ix86_builtins_isa[(int) code].pure_p = true;
30828 return decl;
30831 /* Add any new builtin functions for a given ISA that may not have been
30832 declared. This saves a bit of space compared to adding all of the
30833 declarations to the tree up front, even those that are never used. */
30835 static void
30836 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
30838 isa &= ~OPTION_MASK_ISA_64BIT;
30840 if ((isa & deferred_isa_values) == 0
30841 && (isa2 & deferred_isa_values2) == 0)
30842 return;
30844 /* Bits in ISA value can be removed from potential isa values. */
30845 deferred_isa_values &= ~isa;
30846 deferred_isa_values2 &= ~isa2;
30848 int i;
30849 tree saved_current_target_pragma = current_target_pragma;
30850 current_target_pragma = NULL_TREE;
30852 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
30854 if (((ix86_builtins_isa[i].isa & isa) != 0
30855 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
30856 && ix86_builtins_isa[i].set_and_not_built_p)
30858 tree decl, type;
30860 /* Don't define the builtin again. */
30861 ix86_builtins_isa[i].set_and_not_built_p = false;
30863 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
30864 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
30865 type, i, BUILT_IN_MD, NULL,
30866 NULL_TREE);
30868 ix86_builtins[i] = decl;
30869 if (ix86_builtins_isa[i].const_p)
30870 TREE_READONLY (decl) = 1;
30871 if (ix86_builtins_isa[i].pure_p)
30872 DECL_PURE_P (decl) = 1;
30873 if (ix86_builtins_isa[i].leaf_p)
30874 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30875 NULL_TREE);
30876 if (ix86_builtins_isa[i].nothrow_p)
30877 TREE_NOTHROW (decl) = 1;
30881 current_target_pragma = saved_current_target_pragma;
30884 /* Bits for builtin_description.flag. */
30886 /* Set when we don't support the comparison natively, and should
30887 swap_comparison in order to support it. */
30888 #define BUILTIN_DESC_SWAP_OPERANDS 1
30890 struct builtin_description
30892 const HOST_WIDE_INT mask;
30893 const enum insn_code icode;
30894 const char *const name;
30895 const enum ix86_builtins code;
30896 const enum rtx_code comparison;
30897 const int flag;
30900 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
30901 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
30902 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
30903 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
30904 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
30905 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
30906 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
30907 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
30908 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
30909 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
30910 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
30911 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
30912 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
30913 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
30914 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
30915 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
30916 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
30917 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
30918 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
30919 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
30920 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
30921 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
30922 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
30923 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
30924 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
30925 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30926 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30927 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30928 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30929 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30930 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30931 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30932 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30933 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30934 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30935 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30936 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
30937 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
30938 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
30939 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
30940 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
30941 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
30942 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
30943 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
30944 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
30945 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
30946 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
30947 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
30948 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
30949 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
30950 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
30951 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
30953 #define BDESC(mask, icode, name, code, comparison, flag) \
30954 { mask, icode, name, code, comparison, flag },
30955 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30956 static const struct builtin_description bdesc_##kind[] = \
30958 BDESC (mask, icode, name, code, comparison, flag)
30959 #define BDESC_END(kind, next_kind) \
30962 #include "i386-builtin.def"
30964 #undef BDESC
30965 #undef BDESC_FIRST
30966 #undef BDESC_END
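/* The #include above is an X-macro expansion: each
   BDESC (mask, icode, name, code, comparison, flag) line in
   i386-builtin.def becomes one initializer of a bdesc_<kind> table,
   opened by the matching BDESC_FIRST and closed by BDESC_END, so every
   table follows the layout of enum ix86_builtins verified below.  */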
30968 /* TM vector builtins. */
30970 /* Reuse the existing x86-specific `struct builtin_description' because
30971 we're lazy. Add casts to make them fit. */
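/* Concretely, the `code' field of each entry carries a built_in_function
   value and the `flag' field carries an ix86_builtin_func_type;
   ix86_init_tm_builtins casts both back when registering the builtins.  */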
30972 static const struct builtin_description bdesc_tm[] =
30974 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30975 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30976 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30977 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30978 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30979 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30980 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30982 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30983 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30984 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30985 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30986 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30987 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30988 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30990 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30991 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30992 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30993 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30994 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30995 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30996 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30998 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
30999 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
31000 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
31003 /* Initialize the transactional memory vector load/store builtins. */
31005 static void
31006 ix86_init_tm_builtins (void)
31008 enum ix86_builtin_func_type ftype;
31009 const struct builtin_description *d;
31010 size_t i;
31011 tree decl;
31012 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
31013 tree attrs_log, attrs_type_log;
31015 if (!flag_tm)
31016 return;
31018 /* If there are no builtins defined, we must be compiling in a
31019 language without trans-mem support. */
31020 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
31021 return;
31023 /* Use whatever attributes a normal TM load has. */
31024 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
31025 attrs_load = DECL_ATTRIBUTES (decl);
31026 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31027 /* Use whatever attributes a normal TM store has. */
31028 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
31029 attrs_store = DECL_ATTRIBUTES (decl);
31030 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31031 /* Use whatever attributes a normal TM log has. */
31032 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
31033 attrs_log = DECL_ATTRIBUTES (decl);
31034 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
31036 for (i = 0, d = bdesc_tm;
31037 i < ARRAY_SIZE (bdesc_tm);
31038 i++, d++)
31040 if ((d->mask & ix86_isa_flags) != 0
31041 || (lang_hooks.builtin_function
31042 == lang_hooks.builtin_function_ext_scope))
31044 tree type, attrs, attrs_type;
31045 enum built_in_function code = (enum built_in_function) d->code;
31047 ftype = (enum ix86_builtin_func_type) d->flag;
31048 type = ix86_get_builtin_func_type (ftype);
31050 if (BUILTIN_TM_LOAD_P (code))
31052 attrs = attrs_load;
31053 attrs_type = attrs_type_load;
31055 else if (BUILTIN_TM_STORE_P (code))
31057 attrs = attrs_store;
31058 attrs_type = attrs_type_store;
31060 else
31062 attrs = attrs_log;
31063 attrs_type = attrs_type_log;
31065 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
31066 /* The builtin name without the "__builtin_"
31067 prefix, for calling it directly. */
31068 d->name + strlen ("__builtin_"),
31069 attrs);
31070 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
31071 set the TYPE_ATTRIBUTES. */
31072 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
31074 set_builtin_decl (code, decl, false);
31079 /* Macros for verification of enum ix86_builtins order. */
31080 #define BDESC_VERIFY(x, y, z) \
31081 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
31082 #define BDESC_VERIFYS(x, y, z) \
31083 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
31085 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31086 IX86_BUILTIN__BDESC_COMI_LAST, 1);
31087 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31088 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
31089 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31090 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
31091 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
31092 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
31093 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31094 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
31095 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
31096 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
31097 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31098 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
31099 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31100 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST, 1);
31101 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
31102 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
31103 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31104 IX86_BUILTIN__BDESC_CET_LAST, 1);
31105 BDESC_VERIFYS (IX86_BUILTIN_MAX,
31106 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
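/* These assertions check that each group's _FIRST enumerator immediately
   follows the previous group's _LAST enumerator, and the loops in
   ix86_init_mmx_sse_builtins additionally verify (via BDESC_VERIFY) that
   the i-th entry of each table has enum value <group>_FIRST + i.  */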
31108 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
31109 not in the current target ISA, to allow the user to compile particular
31110 modules with target specific options that differ from the command line
31111 options. */
31112 static void
31113 ix86_init_mmx_sse_builtins (void)
31115 const struct builtin_description * d;
31116 enum ix86_builtin_func_type ftype;
31117 size_t i;
31119 /* Add all special builtins with variable number of operands. */
31120 for (i = 0, d = bdesc_special_args;
31121 i < ARRAY_SIZE (bdesc_special_args);
31122 i++, d++)
31124 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
31125 if (d->name == 0)
31126 continue;
31128 ftype = (enum ix86_builtin_func_type) d->flag;
31129 def_builtin (d->mask, d->name, ftype, d->code);
31131 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
31132 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
31133 ARRAY_SIZE (bdesc_special_args) - 1);
31135 /* Add the second set of special builtins with variable number of operands. */
31136 for (i = 0, d = bdesc_special_args2;
31137 i < ARRAY_SIZE (bdesc_special_args2);
31138 i++, d++)
31140 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST, i);
31141 if (d->name == 0)
31142 continue;
31144 ftype = (enum ix86_builtin_func_type) d->flag;
31145 def_builtin2 (d->mask, d->name, ftype, d->code);
31147 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST,
31148 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
31149 ARRAY_SIZE (bdesc_special_args2) - 1);
31151 /* Add all builtins with variable number of operands. */
31152 for (i = 0, d = bdesc_args;
31153 i < ARRAY_SIZE (bdesc_args);
31154 i++, d++)
31156 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
31157 if (d->name == 0)
31158 continue;
31160 ftype = (enum ix86_builtin_func_type) d->flag;
31161 def_builtin_const (d->mask, d->name, ftype, d->code);
31163 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
31164 IX86_BUILTIN__BDESC_ARGS_FIRST,
31165 ARRAY_SIZE (bdesc_args) - 1);
31167 /* Add the second set of builtins with variable number of operands. */
31168 for (i = 0, d = bdesc_args2;
31169 i < ARRAY_SIZE (bdesc_args2);
31170 i++, d++)
31172 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
31173 if (d->name == 0)
31174 continue;
31176 ftype = (enum ix86_builtin_func_type) d->flag;
31177 def_builtin_const2 (d->mask, d->name, ftype, d->code);
31179 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
31180 IX86_BUILTIN__BDESC_ARGS2_FIRST,
31181 ARRAY_SIZE (bdesc_args2) - 1);
31183 /* Add all builtins with rounding. */
31184 for (i = 0, d = bdesc_round_args;
31185 i < ARRAY_SIZE (bdesc_round_args);
31186 i++, d++)
31188 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
31189 if (d->name == 0)
31190 continue;
31192 ftype = (enum ix86_builtin_func_type) d->flag;
31193 def_builtin_const (d->mask, d->name, ftype, d->code);
31195 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
31196 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
31197 ARRAY_SIZE (bdesc_round_args) - 1);
31199 /* pcmpestr[im] insns. */
31200 for (i = 0, d = bdesc_pcmpestr;
31201 i < ARRAY_SIZE (bdesc_pcmpestr);
31202 i++, d++)
31204 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
31205 if (d->code == IX86_BUILTIN_PCMPESTRM128)
31206 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
31207 else
31208 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
31209 def_builtin_const (d->mask, d->name, ftype, d->code);
31211 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
31212 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
31213 ARRAY_SIZE (bdesc_pcmpestr) - 1);
31215 /* pcmpistr[im] insns. */
31216 for (i = 0, d = bdesc_pcmpistr;
31217 i < ARRAY_SIZE (bdesc_pcmpistr);
31218 i++, d++)
31220 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
31221 if (d->code == IX86_BUILTIN_PCMPISTRM128)
31222 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
31223 else
31224 ftype = INT_FTYPE_V16QI_V16QI_INT;
31225 def_builtin_const (d->mask, d->name, ftype, d->code);
31227 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
31228 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
31229 ARRAY_SIZE (bdesc_pcmpistr) - 1);
31231 /* comi/ucomi insns. */
31232 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
31234 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
31235 if (d->mask == OPTION_MASK_ISA_SSE2)
31236 ftype = INT_FTYPE_V2DF_V2DF;
31237 else
31238 ftype = INT_FTYPE_V4SF_V4SF;
31239 def_builtin_const (d->mask, d->name, ftype, d->code);
31241 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
31242 IX86_BUILTIN__BDESC_COMI_FIRST,
31243 ARRAY_SIZE (bdesc_comi) - 1);
31245 /* SSE */
31246 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
31247 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
31248 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
31249 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
31251 /* SSE or 3DNow!A */
31252 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31253 /* As it uses V4HImode, we have to require -mmmx too. */
31254 | OPTION_MASK_ISA_MMX,
31255 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
31256 IX86_BUILTIN_MASKMOVQ);
31258 /* SSE2 */
31259 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
31260 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
31262 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
31263 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
31264 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
31265 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
31267 /* SSE3. */
31268 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
31269 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
31270 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
31271 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
31273 /* AES */
31274 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31275 "__builtin_ia32_aesenc128",
31276 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
31277 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31278 "__builtin_ia32_aesenclast128",
31279 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
31280 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31281 "__builtin_ia32_aesdec128",
31282 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
31283 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31284 "__builtin_ia32_aesdeclast128",
31285 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
31286 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31287 "__builtin_ia32_aesimc128",
31288 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
31289 def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2,
31290 "__builtin_ia32_aeskeygenassist128",
31291 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
31293 /* PCLMUL */
31294 def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2,
31295 "__builtin_ia32_pclmulqdq128",
31296 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
31298 /* RDRND */
31299 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
31300 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
31301 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
31302 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
31303 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
31304 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
31305 IX86_BUILTIN_RDRAND64_STEP);
31307 /* AVX2 */
31308 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
31309 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
31310 IX86_BUILTIN_GATHERSIV2DF);
31312 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
31313 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
31314 IX86_BUILTIN_GATHERSIV4DF);
31316 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
31317 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
31318 IX86_BUILTIN_GATHERDIV2DF);
31320 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
31321 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
31322 IX86_BUILTIN_GATHERDIV4DF);
31324 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
31325 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
31326 IX86_BUILTIN_GATHERSIV4SF);
31328 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
31329 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
31330 IX86_BUILTIN_GATHERSIV8SF);
31332 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
31333 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
31334 IX86_BUILTIN_GATHERDIV4SF);
31336 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
31337 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
31338 IX86_BUILTIN_GATHERDIV8SF);
31340 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
31341 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
31342 IX86_BUILTIN_GATHERSIV2DI);
31344 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
31345 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
31346 IX86_BUILTIN_GATHERSIV4DI);
31348 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
31349 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
31350 IX86_BUILTIN_GATHERDIV2DI);
31352 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
31353 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
31354 IX86_BUILTIN_GATHERDIV4DI);
31356 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
31357 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
31358 IX86_BUILTIN_GATHERSIV4SI);
31360 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
31361 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
31362 IX86_BUILTIN_GATHERSIV8SI);
31364 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
31365 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
31366 IX86_BUILTIN_GATHERDIV4SI);
31368 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
31369 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
31370 IX86_BUILTIN_GATHERDIV8SI);
31372 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
31373 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
31374 IX86_BUILTIN_GATHERALTSIV4DF);
31376 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
31377 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
31378 IX86_BUILTIN_GATHERALTDIV8SF);
31380 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
31381 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
31382 IX86_BUILTIN_GATHERALTSIV4DI);
31384 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
31385 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
31386 IX86_BUILTIN_GATHERALTDIV8SI);
31388 /* AVX512F */
31389 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
31390 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
31391 IX86_BUILTIN_GATHER3SIV16SF);
31393 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
31394 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
31395 IX86_BUILTIN_GATHER3SIV8DF);
31397 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
31398 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
31399 IX86_BUILTIN_GATHER3DIV16SF);
31401 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
31402 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
31403 IX86_BUILTIN_GATHER3DIV8DF);
31405 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
31406 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
31407 IX86_BUILTIN_GATHER3SIV16SI);
31409 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
31410 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
31411 IX86_BUILTIN_GATHER3SIV8DI);
31413 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
31414 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
31415 IX86_BUILTIN_GATHER3DIV16SI);
31417 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
31418 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
31419 IX86_BUILTIN_GATHER3DIV8DI);
31421 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
31422 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
31423 IX86_BUILTIN_GATHER3ALTSIV8DF);
31425 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
31426 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
31427 IX86_BUILTIN_GATHER3ALTDIV16SF);
31429 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
31430 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
31431 IX86_BUILTIN_GATHER3ALTSIV8DI);
31433 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
31434 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
31435 IX86_BUILTIN_GATHER3ALTDIV16SI);
31437 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
31438 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
31439 IX86_BUILTIN_SCATTERSIV16SF);
31441 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
31442 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
31443 IX86_BUILTIN_SCATTERSIV8DF);
31445 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
31446 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
31447 IX86_BUILTIN_SCATTERDIV16SF);
31449 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
31450 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
31451 IX86_BUILTIN_SCATTERDIV8DF);
31453 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
31454 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
31455 IX86_BUILTIN_SCATTERSIV16SI);
31457 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
31458 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
31459 IX86_BUILTIN_SCATTERSIV8DI);
31461 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
31462 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
31463 IX86_BUILTIN_SCATTERDIV16SI);
31465 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
31466 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
31467 IX86_BUILTIN_SCATTERDIV8DI);
31469 /* AVX512VL */
31470 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
31471 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
31472 IX86_BUILTIN_GATHER3SIV2DF);
31474 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
31475 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
31476 IX86_BUILTIN_GATHER3SIV4DF);
31478 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
31479 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
31480 IX86_BUILTIN_GATHER3DIV2DF);
31482 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
31483 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
31484 IX86_BUILTIN_GATHER3DIV4DF);
31486 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
31487 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
31488 IX86_BUILTIN_GATHER3SIV4SF);
31490 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
31491 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
31492 IX86_BUILTIN_GATHER3SIV8SF);
31494 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
31495 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
31496 IX86_BUILTIN_GATHER3DIV4SF);
31498 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
31499 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
31500 IX86_BUILTIN_GATHER3DIV8SF);
31502 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
31503 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
31504 IX86_BUILTIN_GATHER3SIV2DI);
31506 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
31507 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
31508 IX86_BUILTIN_GATHER3SIV4DI);
31510 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
31511 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
31512 IX86_BUILTIN_GATHER3DIV2DI);
31514 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
31515 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
31516 IX86_BUILTIN_GATHER3DIV4DI);
31518 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
31519 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
31520 IX86_BUILTIN_GATHER3SIV4SI);
31522 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
31523 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
31524 IX86_BUILTIN_GATHER3SIV8SI);
31526 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
31527 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
31528 IX86_BUILTIN_GATHER3DIV4SI);
31530 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
31531 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
31532 IX86_BUILTIN_GATHER3DIV8SI);
31534 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
31535 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
31536 IX86_BUILTIN_GATHER3ALTSIV4DF);
31538 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
31539 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
31540 IX86_BUILTIN_GATHER3ALTDIV8SF);
31542 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
31543 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
31544 IX86_BUILTIN_GATHER3ALTSIV4DI);
31546 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
31547 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
31548 IX86_BUILTIN_GATHER3ALTDIV8SI);
31550 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
31551 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
31552 IX86_BUILTIN_SCATTERSIV8SF);
31554 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
31555 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
31556 IX86_BUILTIN_SCATTERSIV4SF);
31558 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
31559 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
31560 IX86_BUILTIN_SCATTERSIV4DF);
31562 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
31563 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
31564 IX86_BUILTIN_SCATTERSIV2DF);
31566 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
31567 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
31568 IX86_BUILTIN_SCATTERDIV8SF);
31570 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
31571 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
31572 IX86_BUILTIN_SCATTERDIV4SF);
31574 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
31575 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
31576 IX86_BUILTIN_SCATTERDIV4DF);
31578 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
31579 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
31580 IX86_BUILTIN_SCATTERDIV2DF);
31582 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
31583 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
31584 IX86_BUILTIN_SCATTERSIV8SI);
31586 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
31587 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
31588 IX86_BUILTIN_SCATTERSIV4SI);
31590 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
31591 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
31592 IX86_BUILTIN_SCATTERSIV4DI);
31594 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
31595 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
31596 IX86_BUILTIN_SCATTERSIV2DI);
31598 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
31599 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
31600 IX86_BUILTIN_SCATTERDIV8SI);
31602 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
31603 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
31604 IX86_BUILTIN_SCATTERDIV4SI);
31606 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
31607 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
31608 IX86_BUILTIN_SCATTERDIV4DI);
31610 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
31611 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
31612 IX86_BUILTIN_SCATTERDIV2DI);
31613 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
31614 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
31615 IX86_BUILTIN_SCATTERALTSIV8DF);
31617 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
31618 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
31619 IX86_BUILTIN_SCATTERALTDIV16SF);
31621 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
31622 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
31623 IX86_BUILTIN_SCATTERALTSIV8DI);
31625 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
31626 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
31627 IX86_BUILTIN_SCATTERALTDIV16SI);
31629 /* AVX512PF */
31630 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
31631 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31632 IX86_BUILTIN_GATHERPFDPD);
31633 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
31634 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31635 IX86_BUILTIN_GATHERPFDPS);
31636 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
31637 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31638 IX86_BUILTIN_GATHERPFQPD);
31639 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
31640 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31641 IX86_BUILTIN_GATHERPFQPS);
31642 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
31643 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
31644 IX86_BUILTIN_SCATTERPFDPD);
31645 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
31646 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
31647 IX86_BUILTIN_SCATTERPFDPS);
31648 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
31649 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31650 IX86_BUILTIN_SCATTERPFQPD);
31651 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
31652 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
31653 IX86_BUILTIN_SCATTERPFQPS);
31655 /* SHA */
31656 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
31657 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
31658 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
31659 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
31660 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
31661 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
31662 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
31663 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
31664 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
31665 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
31666 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
31667 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
31668 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
31669 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
31671 /* RTM. */
31672 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
31673 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
31675 /* MMX access to the vec_init patterns. */
31676 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
31677 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
31679 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
31680 V4HI_FTYPE_HI_HI_HI_HI,
31681 IX86_BUILTIN_VEC_INIT_V4HI);
31683 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
31684 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
31685 IX86_BUILTIN_VEC_INIT_V8QI);
31687 /* Access to the vec_extract patterns. */
31688 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
31689 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
31690 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
31691 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
31692 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
31693 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
31694 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
31695 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
31696 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
31697 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
31699 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31700 /* As it uses V4HImode, we have to require -mmmx too. */
31701 | OPTION_MASK_ISA_MMX,
31702 "__builtin_ia32_vec_ext_v4hi",
31703 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
31705 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
31706 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
31708 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
31709 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
31711 /* Access to the vec_set patterns. */
31712 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
31713 "__builtin_ia32_vec_set_v2di",
31714 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
31716 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
31717 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
31719 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
31720 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
31722 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
31723 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
31725 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
31726 /* As it uses V4HImode, we have to require -mmmx too. */
31727 | OPTION_MASK_ISA_MMX,
31728 "__builtin_ia32_vec_set_v4hi",
31729 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
31731 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
31732 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
31734 /* RDSEED */
31735 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
31736 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
31737 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
31738 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
31739 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
31740 "__builtin_ia32_rdseed_di_step",
31741 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
31743 /* ADCX */
31744 def_builtin (0, "__builtin_ia32_addcarryx_u32",
31745 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
31746 def_builtin (OPTION_MASK_ISA_64BIT,
31747 "__builtin_ia32_addcarryx_u64",
31748 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31749 IX86_BUILTIN_ADDCARRYX64);
31751 /* SBB */
31752 def_builtin (0, "__builtin_ia32_sbb_u32",
31753 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
31754 def_builtin (OPTION_MASK_ISA_64BIT,
31755 "__builtin_ia32_sbb_u64",
31756 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
31757 IX86_BUILTIN_SBB64);
31759 /* Read/write FLAGS. */
31760 if (TARGET_64BIT)
31762 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
31763 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31764 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
31765 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
31767 else
31769 def_builtin (0, "__builtin_ia32_readeflags_u32",
31770 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
31771 def_builtin (0, "__builtin_ia32_writeeflags_u32",
31772 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
31775 /* CLFLUSHOPT. */
31776 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
31777 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
31779 /* CLWB. */
31780 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
31781 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
31783 /* MONITORX and MWAITX. */
31784 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
31785 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
31786 def_builtin2 (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
31787 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
31789 /* CLZERO. */
31790 def_builtin2 (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
31791 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
31793 /* WAITPKG. */
31794 def_builtin2 (OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umonitor",
31795 VOID_FTYPE_PVOID, IX86_BUILTIN_UMONITOR);
31796 def_builtin2 (OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umwait",
31797 UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_UMWAIT);
31798 def_builtin2 (OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_tpause",
31799 UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE);
31801 /* CLDEMOTE. */
31802 def_builtin2 (OPTION_MASK_ISA_CLDEMOTE, "__builtin_ia32_cldemote",
31803 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE);
31805 /* Add FMA4 multi-arg instructions. */
31806 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
31808 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
31809 if (d->name == 0)
31810 continue;
31812 ftype = (enum ix86_builtin_func_type) d->flag;
31813 def_builtin_const (d->mask, d->name, ftype, d->code);
31815 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
31816 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
31817 ARRAY_SIZE (bdesc_multi_arg) - 1);
31819 /* Add CET intrinsics. */
31820 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
31822 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
31823 if (d->name == 0)
31824 continue;
31826 ftype = (enum ix86_builtin_func_type) d->flag;
31827 def_builtin (d->mask, d->name, ftype, d->code);
31829 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
31830 IX86_BUILTIN__BDESC_CET_FIRST,
31831 ARRAY_SIZE (bdesc_cet) - 1);
31833 for (i = 0, d = bdesc_cet_rdssp;
31834 i < ARRAY_SIZE (bdesc_cet_rdssp);
31835 i++, d++)
31837 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
31838 if (d->name == 0)
31839 continue;
31841 ftype = (enum ix86_builtin_func_type) d->flag;
31842 def_builtin (d->mask, d->name, ftype, d->code);
31844 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
31845 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
31846 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
31849 #undef BDESC_VERIFY
31850 #undef BDESC_VERIFYS
31852 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
31853 to return a pointer to VERSION_DECL if the outcome of the expression
31854 formed by PREDICATE_CHAIN is true. This function will be called during
31855 version dispatch to decide which function version to execute. It returns
31856 the basic block at the end, to which more conditions can be added. */
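/* Roughly, for a PREDICATE_CHAIN of checks P1 (A1) ... Pn (An) the block
   is filled with the equivalent of

     c1 = P1 (A1);
     ...
     cn = Pn (An);
     and_var = MIN (c1, ..., cn);
     if (and_var > 0)
       return (void *) &VERSION_DECL;

   MIN is non-zero only if every check returned non-zero; the basic block
   reached on the false edge is what gets returned to the caller.  */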
31858 static basic_block
31859 add_condition_to_bb (tree function_decl, tree version_decl,
31860 tree predicate_chain, basic_block new_bb)
31862 gimple *return_stmt;
31863 tree convert_expr, result_var;
31864 gimple *convert_stmt;
31865 gimple *call_cond_stmt;
31866 gimple *if_else_stmt;
31868 basic_block bb1, bb2, bb3;
31869 edge e12, e23;
31871 tree cond_var, and_expr_var = NULL_TREE;
31872 gimple_seq gseq;
31874 tree predicate_decl, predicate_arg;
31876 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
31878 gcc_assert (new_bb != NULL);
31879 gseq = bb_seq (new_bb);
31882 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31883 build_fold_addr_expr (version_decl));
31884 result_var = create_tmp_var (ptr_type_node);
31885 convert_stmt = gimple_build_assign (result_var, convert_expr);
31886 return_stmt = gimple_build_return (result_var);
31888 if (predicate_chain == NULL_TREE)
31890 gimple_seq_add_stmt (&gseq, convert_stmt);
31891 gimple_seq_add_stmt (&gseq, return_stmt);
31892 set_bb_seq (new_bb, gseq);
31893 gimple_set_bb (convert_stmt, new_bb);
31894 gimple_set_bb (return_stmt, new_bb);
31895 pop_cfun ();
31896 return new_bb;
31899 while (predicate_chain != NULL)
31901 cond_var = create_tmp_var (integer_type_node);
31902 predicate_decl = TREE_PURPOSE (predicate_chain);
31903 predicate_arg = TREE_VALUE (predicate_chain);
31904 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31905 gimple_call_set_lhs (call_cond_stmt, cond_var);
31907 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31908 gimple_set_bb (call_cond_stmt, new_bb);
31909 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31911 predicate_chain = TREE_CHAIN (predicate_chain);
31913 if (and_expr_var == NULL)
31914 and_expr_var = cond_var;
31915 else
31917 gimple *assign_stmt;
31918 /* Use MIN_EXPR to check if any integer is zero:
31919 and_expr_var = min_expr <cond_var, and_expr_var> */
31920 assign_stmt = gimple_build_assign (and_expr_var,
31921 build2 (MIN_EXPR, integer_type_node,
31922 cond_var, and_expr_var));
31924 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31925 gimple_set_bb (assign_stmt, new_bb);
31926 gimple_seq_add_stmt (&gseq, assign_stmt);
31930 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31931 integer_zero_node,
31932 NULL_TREE, NULL_TREE);
31933 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31934 gimple_set_bb (if_else_stmt, new_bb);
31935 gimple_seq_add_stmt (&gseq, if_else_stmt);
31937 gimple_seq_add_stmt (&gseq, convert_stmt);
31938 gimple_seq_add_stmt (&gseq, return_stmt);
31939 set_bb_seq (new_bb, gseq);
31941 bb1 = new_bb;
31942 e12 = split_block (bb1, if_else_stmt);
31943 bb2 = e12->dest;
31944 e12->flags &= ~EDGE_FALLTHRU;
31945 e12->flags |= EDGE_TRUE_VALUE;
31947 e23 = split_block (bb2, return_stmt);
31949 gimple_set_bb (convert_stmt, bb2);
31950 gimple_set_bb (return_stmt, bb2);
31952 bb3 = e23->dest;
31953 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31955 remove_edge (e23);
31956 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
31958 pop_cfun ();
31960 return bb3;
31963 /* This parses the arguments to the target attribute in DECL and determines
31964 the right builtin to use to match the platform specification.
31965 It returns the priority value for this version decl. If PREDICATE_LIST
31966 is not NULL, it stores the list of cpu features that need to be checked
31967 before dispatching this function. */
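/* For example, a version declared with
   __attribute__ ((target ("arch=haswell,avx2"))) gets priority P_PROC_AVX2,
   and when PREDICATE_LIST is non-NULL the returned chain requests a
   __builtin_cpu_is ("haswell") check followed by a
   __builtin_cpu_supports ("avx2") check in the dispatcher.  */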
31969 static unsigned int
31970 get_builtin_code_for_version (tree decl, tree *predicate_list)
31972 tree attrs;
31973 struct cl_target_option cur_target;
31974 tree target_node;
31975 struct cl_target_option *new_target;
31976 const char *arg_str = NULL;
31977 const char *attrs_str = NULL;
31978 char *tok_str = NULL;
31979 char *token;
31981 /* Priority of i386 features; a greater value means a higher priority. This is
31982 used to decide the order in which function dispatch must happen. For
31983 instance, a version specialized for SSE4.2 should be checked for dispatch
31984 before a version for SSE3, as SSE4.2 implies SSE3. */
31985 enum feature_priority
31987 P_ZERO = 0,
31988 P_MMX,
31989 P_SSE,
31990 P_SSE2,
31991 P_SSE3,
31992 P_SSSE3,
31993 P_PROC_SSSE3,
31994 P_SSE4_A,
31995 P_PROC_SSE4_A,
31996 P_SSE4_1,
31997 P_SSE4_2,
31998 P_PROC_SSE4_2,
31999 P_POPCNT,
32000 P_AES,
32001 P_PCLMUL,
32002 P_AVX,
32003 P_PROC_AVX,
32004 P_BMI,
32005 P_PROC_BMI,
32006 P_FMA4,
32007 P_XOP,
32008 P_PROC_XOP,
32009 P_FMA,
32010 P_PROC_FMA,
32011 P_BMI2,
32012 P_AVX2,
32013 P_PROC_AVX2,
32014 P_AVX512F,
32015 P_PROC_AVX512F
32018 enum feature_priority priority = P_ZERO;
32020 /* These are the target attribute strings for which a dispatcher is
32021 available, from fold_builtin_cpu. */
32023 static struct _feature_list
32025 const char *const name;
32026 const enum feature_priority priority;
32028 const feature_list[] =
32030 {"mmx", P_MMX},
32031 {"sse", P_SSE},
32032 {"sse2", P_SSE2},
32033 {"sse3", P_SSE3},
32034 {"sse4a", P_SSE4_A},
32035 {"ssse3", P_SSSE3},
32036 {"sse4.1", P_SSE4_1},
32037 {"sse4.2", P_SSE4_2},
32038 {"popcnt", P_POPCNT},
32039 {"aes", P_AES},
32040 {"pclmul", P_PCLMUL},
32041 {"avx", P_AVX},
32042 {"bmi", P_BMI},
32043 {"fma4", P_FMA4},
32044 {"xop", P_XOP},
32045 {"fma", P_FMA},
32046 {"bmi2", P_BMI2},
32047 {"avx2", P_AVX2},
32048 {"avx512f", P_AVX512F}
32052 static unsigned int NUM_FEATURES
32053 = sizeof (feature_list) / sizeof (struct _feature_list);
32055 unsigned int i;
32057 tree predicate_chain = NULL_TREE;
32058 tree predicate_decl, predicate_arg;
32060 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32061 gcc_assert (attrs != NULL);
32063 attrs = TREE_VALUE (TREE_VALUE (attrs));
32065 gcc_assert (TREE_CODE (attrs) == STRING_CST);
32066 attrs_str = TREE_STRING_POINTER (attrs);
32068 /* Return priority zero for the default function. */
32069 if (strcmp (attrs_str, "default") == 0)
32070 return 0;
32072 /* Handle arch= if specified. For priority, set it to be 1 more than
32073 the best instruction set the processor can handle. For instance, if
32074 there is a version for atom and a version for ssse3 (the highest ISA
32075 priority for atom), the atom version must be checked for dispatch
32076 before the ssse3 version. */
32077 if (strstr (attrs_str, "arch=") != NULL)
32079 cl_target_option_save (&cur_target, &global_options);
32080 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
32081 &global_options_set);
32083 gcc_assert (target_node);
32084 if (target_node == error_mark_node)
32085 return 0;
32086 new_target = TREE_TARGET_OPTION (target_node);
32087 gcc_assert (new_target);
32089 if (new_target->arch_specified && new_target->arch > 0)
32091 switch (new_target->arch)
32093 case PROCESSOR_CORE2:
32094 arg_str = "core2";
32095 priority = P_PROC_SSSE3;
32096 break;
32097 case PROCESSOR_NEHALEM:
32098 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
32100 arg_str = "westmere";
32101 priority = P_AES;
32103 else
32105 /* We translate "arch=corei7" and "arch=nehalem" to
32106 "corei7" so that it will be mapped to M_INTEL_COREI7
32107 as cpu type to cover all M_INTEL_COREI7_XXXs. */
32108 arg_str = "corei7";
32109 priority = P_PROC_SSE4_2;
32111 break;
32112 case PROCESSOR_SANDYBRIDGE:
32113 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
32114 arg_str = "ivybridge";
32115 else
32116 arg_str = "sandybridge";
32117 priority = P_PROC_AVX;
32118 break;
32119 case PROCESSOR_HASWELL:
32120 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
32121 arg_str = "broadwell";
32122 else
32123 arg_str = "haswell";
32124 priority = P_PROC_AVX2;
32125 break;
32126 case PROCESSOR_SKYLAKE:
32127 arg_str = "skylake";
32128 priority = P_PROC_AVX2;
32129 break;
32130 case PROCESSOR_SKYLAKE_AVX512:
32131 arg_str = "skylake-avx512";
32132 priority = P_PROC_AVX512F;
32133 break;
32134 case PROCESSOR_CANNONLAKE:
32135 arg_str = "cannonlake";
32136 priority = P_PROC_AVX512F;
32137 break;
32138 case PROCESSOR_ICELAKE_CLIENT:
32139 arg_str = "icelake-client";
32140 priority = P_PROC_AVX512F;
32141 break;
32142 case PROCESSOR_ICELAKE_SERVER:
32143 arg_str = "icelake-server";
32144 priority = P_PROC_AVX512F;
32145 break;
32146 case PROCESSOR_BONNELL:
32147 arg_str = "bonnell";
32148 priority = P_PROC_SSSE3;
32149 break;
32150 case PROCESSOR_KNL:
32151 arg_str = "knl";
32152 priority = P_PROC_AVX512F;
32153 break;
32154 case PROCESSOR_KNM:
32155 arg_str = "knm";
32156 priority = P_PROC_AVX512F;
32157 break;
32158 case PROCESSOR_SILVERMONT:
32159 arg_str = "silvermont";
32160 priority = P_PROC_SSE4_2;
32161 break;
32162 case PROCESSOR_GOLDMONT:
32163 arg_str = "goldmont";
32164 priority = P_PROC_SSE4_2;
32165 break;
32166 case PROCESSOR_GOLDMONT_PLUS:
32167 arg_str = "goldmont-plus";
32168 priority = P_PROC_SSE4_2;
32169 break;
32170 case PROCESSOR_TREMONT:
32171 arg_str = "tremont";
32172 priority = P_PROC_SSE4_2;
32173 break;
32174 case PROCESSOR_AMDFAM10:
32175 arg_str = "amdfam10h";
32176 priority = P_PROC_SSE4_A;
32177 break;
32178 case PROCESSOR_BTVER1:
32179 arg_str = "btver1";
32180 priority = P_PROC_SSE4_A;
32181 break;
32182 case PROCESSOR_BTVER2:
32183 arg_str = "btver2";
32184 priority = P_PROC_BMI;
32185 break;
32186 case PROCESSOR_BDVER1:
32187 arg_str = "bdver1";
32188 priority = P_PROC_XOP;
32189 break;
32190 case PROCESSOR_BDVER2:
32191 arg_str = "bdver2";
32192 priority = P_PROC_FMA;
32193 break;
32194 case PROCESSOR_BDVER3:
32195 arg_str = "bdver3";
32196 priority = P_PROC_FMA;
32197 break;
32198 case PROCESSOR_BDVER4:
32199 arg_str = "bdver4";
32200 priority = P_PROC_AVX2;
32201 break;
32202 case PROCESSOR_ZNVER1:
32203 arg_str = "znver1";
32204 priority = P_PROC_AVX2;
32205 break;
32209 cl_target_option_restore (&global_options, &cur_target);
32211 if (predicate_list && arg_str == NULL)
32213 error_at (DECL_SOURCE_LOCATION (decl),
32214 "No dispatcher found for the versioning attributes");
32215 return 0;
32218 if (predicate_list)
32220 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
32221 /* For a C string literal the length includes the trailing NULL. */
32222 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
32223 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32224 predicate_chain);
32228 /* Process feature name. */
32229 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
32230 strcpy (tok_str, attrs_str);
32231 token = strtok (tok_str, ",");
32232 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
32234 while (token != NULL)
32236 /* Do not process "arch=". */
32237 if (strncmp (token, "arch=", 5) == 0)
32239 token = strtok (NULL, ",");
32240 continue;
32242 for (i = 0; i < NUM_FEATURES; ++i)
32244 if (strcmp (token, feature_list[i].name) == 0)
32246 if (predicate_list)
32248 predicate_arg = build_string_literal (
32249 strlen (feature_list[i].name) + 1,
32250 feature_list[i].name);
32251 predicate_chain = tree_cons (predicate_decl, predicate_arg,
32252 predicate_chain);
32254 /* Find the maximum priority feature. */
32255 if (feature_list[i].priority > priority)
32256 priority = feature_list[i].priority;
32258 break;
32261 if (predicate_list && i == NUM_FEATURES)
32263 error_at (DECL_SOURCE_LOCATION (decl),
32264 "No dispatcher found for %s", token);
32265 return 0;
32267 token = strtok (NULL, ",");
32269 free (tok_str);
32271 if (predicate_list && predicate_chain == NULL_TREE)
32273 error_at (DECL_SOURCE_LOCATION (decl),
32274 "No dispatcher found for the versioning attributes : %s",
32275 attrs_str);
32276 return 0;
32278 else if (predicate_list)
32280 predicate_chain = nreverse (predicate_chain);
32281 *predicate_list = predicate_chain;
32284 return priority;
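/* Illustrative sketch (not part of this file's code): the attribute
   strings parsed above come from function-multiversioning declarations
   such as

     __attribute__ ((target ("default"))) int foo (void);
     __attribute__ ((target ("arch=haswell"))) int foo (void);
     __attribute__ ((target ("avx512f,bmi2"))) int foo (void);

   For the "arch=" form the priority comes from the processor switch
   above; for a feature list it is the highest priority among the named
   features, and the predicate chain pairs IX86_BUILTIN_CPU_IS or
   IX86_BUILTIN_CPU_SUPPORTS with each name so the resolver can test
   them at run time.  The function name "foo" is hypothetical.  */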
32287 /* This compares the priority of target features in function DECL1
32288 and DECL2. It returns positive value if DECL1 is higher priority,
32289 negative value if DECL2 is higher priority and 0 if they are the
32290 same. */
32292 static int
32293 ix86_compare_version_priority (tree decl1, tree decl2)
32295 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
32296 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
32298 return (int)priority1 - (int)priority2;
32301 /* V1 and V2 point to function versions with different priorities
32302 based on the target ISA. This function compares their priorities. */
32304 static int
32305 feature_compare (const void *v1, const void *v2)
32307 typedef struct _function_version_info
32309 tree version_decl;
32310 tree predicate_chain;
32311 unsigned int dispatch_priority;
32312 } function_version_info;
32314 const function_version_info c1 = *(const function_version_info *)v1;
32315 const function_version_info c2 = *(const function_version_info *)v2;
32316 return (c2.dispatch_priority - c1.dispatch_priority);
32319 /* This function generates the dispatch function for
32320 multi-versioned functions. DISPATCH_DECL is the function which will
32321 contain the dispatch logic. FNDECLS are the function choices for
32322 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
32323 in DISPATCH_DECL in which the dispatch code is generated. */
32325 static int
32326 dispatch_function_versions (tree dispatch_decl,
32327 void *fndecls_p,
32328 basic_block *empty_bb)
32330 tree default_decl;
32331 gimple *ifunc_cpu_init_stmt;
32332 gimple_seq gseq;
32333 int ix;
32334 tree ele;
32335 vec<tree> *fndecls;
32336 unsigned int num_versions = 0;
32337 unsigned int actual_versions = 0;
32338 unsigned int i;
32340 struct _function_version_info
32342 tree version_decl;
32343 tree predicate_chain;
32344 unsigned int dispatch_priority;
32345 }*function_version_info;
32347 gcc_assert (dispatch_decl != NULL
32348 && fndecls_p != NULL
32349 && empty_bb != NULL);
32351 /* fndecls_p is actually a vector. */
32352 fndecls = static_cast<vec<tree> *> (fndecls_p);
32354 /* At least one more version other than the default. */
32355 num_versions = fndecls->length ();
32356 gcc_assert (num_versions >= 2);
32358 function_version_info = (struct _function_version_info *)
32359 XNEWVEC (struct _function_version_info, (num_versions - 1));
32361 /* The first version in the vector is the default decl. */
32362 default_decl = (*fndecls)[0];
32364 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
32366 gseq = bb_seq (*empty_bb);
32367 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
32368 constructors, so explicitly call __builtin_cpu_init here. */
32369 ifunc_cpu_init_stmt = gimple_build_call_vec (
32370 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
32371 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
32372 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
32373 set_bb_seq (*empty_bb, gseq);
32375 pop_cfun ();
32378 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
32380 tree version_decl = ele;
32381 tree predicate_chain = NULL_TREE;
32382 unsigned int priority;
32383 /* Get attribute string, parse it and find the right predicate decl.
32384 The predicate function could be a lengthy combination of many
32385 features, like arch-type and various isa-variants. */
32386 priority = get_builtin_code_for_version (version_decl,
32387 &predicate_chain);
32389 if (predicate_chain == NULL_TREE)
32390 continue;
32392 function_version_info [actual_versions].version_decl = version_decl;
32393 function_version_info [actual_versions].predicate_chain
32394 = predicate_chain;
32395 function_version_info [actual_versions].dispatch_priority = priority;
32396 actual_versions++;
32399 /* Sort the versions according to descending order of dispatch priority. The
32400 priority is based on the ISA. This is not a perfect solution. There
32401 could still be ambiguity. If more than one function version is suitable
32402 to execute, which one should be dispatched? In the future, allow the user
32403 to specify a dispatch priority next to the version. */
32404 qsort (function_version_info, actual_versions,
32405 sizeof (struct _function_version_info), feature_compare);
32407 for (i = 0; i < actual_versions; ++i)
32408 *empty_bb = add_condition_to_bb (dispatch_decl,
32409 function_version_info[i].version_decl,
32410 function_version_info[i].predicate_chain,
32411 *empty_bb);
32413 /* Dispatch the default version at the end. */
32414 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
32415 NULL, *empty_bb);
32417 free (function_version_info);
32418 return 0;
32421 /* This function changes the assembler name for functions that are
32422 versions. If DECL is a function version and has a "target"
32423 attribute, it appends the attribute string to its assembler name. */
32425 static tree
32426 ix86_mangle_function_version_assembler_name (tree decl, tree id)
32428 tree version_attr;
32429 const char *orig_name, *version_string;
32430 char *attr_str, *assembler_name;
32432 if (DECL_DECLARED_INLINE_P (decl)
32433 && lookup_attribute ("gnu_inline",
32434 DECL_ATTRIBUTES (decl)))
32435 error_at (DECL_SOURCE_LOCATION (decl),
32436 "Function versions cannot be marked as gnu_inline,"
32437 " bodies have to be generated");
32439 if (DECL_VIRTUAL_P (decl)
32440 || DECL_VINDEX (decl))
32441 sorry ("Virtual function multiversioning not supported");
32443 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
32445 /* target attribute string cannot be NULL. */
32446 gcc_assert (version_attr != NULL_TREE);
32448 orig_name = IDENTIFIER_POINTER (id);
32449 version_string
32450 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
32452 if (strcmp (version_string, "default") == 0)
32453 return id;
32455 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
32456 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
32458 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
32460 /* Allow assembler name to be modified if already set. */
32461 if (DECL_ASSEMBLER_NAME_SET_P (decl))
32462 SET_DECL_RTL (decl, NULL);
32464 tree ret = get_identifier (assembler_name);
32465 XDELETEVEC (attr_str);
32466 XDELETEVEC (assembler_name);
32467 return ret;
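/* A sketch of the resulting assembler names, assuming a versioned
   function whose original assembler name is "foo":

     target ("default")       -> foo              (name left unchanged)
     target ("avx2")          -> foo.avx2
     target ("arch=haswell")  -> foo.arch_haswell

   The suffix is the sorted attribute string, appended after a '.'.  */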
32471 static tree
32472 ix86_mangle_decl_assembler_name (tree decl, tree id)
32474 /* For function version, add the target suffix to the assembler name. */
32475 if (TREE_CODE (decl) == FUNCTION_DECL
32476 && DECL_FUNCTION_VERSIONED (decl))
32477 id = ix86_mangle_function_version_assembler_name (decl, id);
32478 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
32479 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
32480 #endif
32482 return id;
32485 /* Make a dispatcher declaration for the multi-versioned function DECL.
32486 Calls to DECL function will be replaced with calls to the dispatcher
32487 by the front-end. Returns the decl of the dispatcher function. */
32489 static tree
32490 ix86_get_function_versions_dispatcher (void *decl)
32492 tree fn = (tree) decl;
32493 struct cgraph_node *node = NULL;
32494 struct cgraph_node *default_node = NULL;
32495 struct cgraph_function_version_info *node_v = NULL;
32496 struct cgraph_function_version_info *first_v = NULL;
32498 tree dispatch_decl = NULL;
32500 struct cgraph_function_version_info *default_version_info = NULL;
32502 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
32504 node = cgraph_node::get (fn);
32505 gcc_assert (node != NULL);
32507 node_v = node->function_version ();
32508 gcc_assert (node_v != NULL);
32510 if (node_v->dispatcher_resolver != NULL)
32511 return node_v->dispatcher_resolver;
32513 /* Find the default version and make it the first node. */
32514 first_v = node_v;
32515 /* Go to the beginning of the chain. */
32516 while (first_v->prev != NULL)
32517 first_v = first_v->prev;
32518 default_version_info = first_v;
32519 while (default_version_info != NULL)
32521 if (is_function_default_version
32522 (default_version_info->this_node->decl))
32523 break;
32524 default_version_info = default_version_info->next;
32527 /* If there is no default node, just return NULL. */
32528 if (default_version_info == NULL)
32529 return NULL;
32531 /* Make default info the first node. */
32532 if (first_v != default_version_info)
32534 default_version_info->prev->next = default_version_info->next;
32535 if (default_version_info->next)
32536 default_version_info->next->prev = default_version_info->prev;
32537 first_v->prev = default_version_info;
32538 default_version_info->next = first_v;
32539 default_version_info->prev = NULL;
32542 default_node = default_version_info->this_node;
32544 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
32545 if (targetm.has_ifunc_p ())
32547 struct cgraph_function_version_info *it_v = NULL;
32548 struct cgraph_node *dispatcher_node = NULL;
32549 struct cgraph_function_version_info *dispatcher_version_info = NULL;
32551 /* Right now, the dispatching is done via ifunc. */
32552 dispatch_decl = make_dispatcher_decl (default_node->decl);
32554 dispatcher_node = cgraph_node::get_create (dispatch_decl);
32555 gcc_assert (dispatcher_node != NULL);
32556 dispatcher_node->dispatcher_function = 1;
32557 dispatcher_version_info
32558 = dispatcher_node->insert_new_function_version ();
32559 dispatcher_version_info->next = default_version_info;
32560 dispatcher_node->definition = 1;
32562 /* Set the dispatcher for all the versions. */
32563 it_v = default_version_info;
32564 while (it_v != NULL)
32566 it_v->dispatcher_resolver = dispatch_decl;
32567 it_v = it_v->next;
32570 else
32571 #endif
32573 error_at (DECL_SOURCE_LOCATION (default_node->decl),
32574 "multiversioning needs ifunc which is not supported "
32575 "on this target");
32578 return dispatch_decl;
32581 /* Make the resolver function decl to dispatch the versions of
32582 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
32583 ifunc alias that will point to the created resolver. Create an
32584 empty basic block in the resolver and store the pointer in
32585 EMPTY_BB. Return the decl of the resolver function. */
32587 static tree
32588 make_resolver_func (const tree default_decl,
32589 const tree ifunc_alias_decl,
32590 basic_block *empty_bb)
32592 char *resolver_name;
32593 tree decl, type, decl_name, t;
32595 /* IFUNCs have to be globally visible. So, if the default_decl is
32596 not, then the name of the IFUNC should be made unique. */
32597 if (TREE_PUBLIC (default_decl) == 0)
32599 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
32600 symtab->change_decl_assembler_name (ifunc_alias_decl,
32601 get_identifier (ifunc_name));
32602 XDELETEVEC (ifunc_name);
32605 resolver_name = make_unique_name (default_decl, "resolver", false);
32607 /* The resolver function should return a (void *). */
32608 type = build_function_type_list (ptr_type_node, NULL_TREE);
32610 decl = build_fn_decl (resolver_name, type);
32611 decl_name = get_identifier (resolver_name);
32612 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
32614 DECL_NAME (decl) = decl_name;
32615 TREE_USED (decl) = 1;
32616 DECL_ARTIFICIAL (decl) = 1;
32617 DECL_IGNORED_P (decl) = 1;
32618 TREE_PUBLIC (decl) = 0;
32619 DECL_UNINLINABLE (decl) = 1;
32621 /* Resolver is not external, body is generated. */
32622 DECL_EXTERNAL (decl) = 0;
32623 DECL_EXTERNAL (ifunc_alias_decl) = 0;
32625 DECL_CONTEXT (decl) = NULL_TREE;
32626 DECL_INITIAL (decl) = make_node (BLOCK);
32627 DECL_STATIC_CONSTRUCTOR (decl) = 0;
32629 if (DECL_COMDAT_GROUP (default_decl)
32630 || TREE_PUBLIC (default_decl))
32632 /* In this case, each translation unit with a call to this
32633 versioned function will put out a resolver. Ensure it
32634 is comdat to keep just one copy. */
32635 DECL_COMDAT (decl) = 1;
32636 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
32638 /* Build result decl and add to function_decl. */
32639 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
32640 DECL_ARTIFICIAL (t) = 1;
32641 DECL_IGNORED_P (t) = 1;
32642 DECL_RESULT (decl) = t;
32644 gimplify_function_tree (decl);
32645 push_cfun (DECL_STRUCT_FUNCTION (decl));
32646 *empty_bb = init_lowered_empty_function (decl, false,
32647 profile_count::uninitialized ());
32649 cgraph_node::add_new_function (decl, true);
32650 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
32652 pop_cfun ();
32654 gcc_assert (ifunc_alias_decl != NULL);
32655 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
32656 DECL_ATTRIBUTES (ifunc_alias_decl)
32657 = make_attribute ("ifunc", resolver_name,
32658 DECL_ATTRIBUTES (ifunc_alias_decl));
32660 /* Create the alias for dispatch to resolver here. */
32661 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
32662 XDELETEVEC (resolver_name);
32663 return decl;
32666 /* Generate the dispatching code body to dispatch multi-versioned function
32667 DECL. The target hook is called to process the "target" attributes and
32668 provide the code to dispatch the right function at run-time. NODE points
32669 to the dispatcher decl whose body will be created. */
32671 static tree
32672 ix86_generate_version_dispatcher_body (void *node_p)
32674 tree resolver_decl;
32675 basic_block empty_bb;
32676 tree default_ver_decl;
32677 struct cgraph_node *versn;
32678 struct cgraph_node *node;
32680 struct cgraph_function_version_info *node_version_info = NULL;
32681 struct cgraph_function_version_info *versn_info = NULL;
32683 node = (cgraph_node *)node_p;
32685 node_version_info = node->function_version ();
32686 gcc_assert (node->dispatcher_function
32687 && node_version_info != NULL);
32689 if (node_version_info->dispatcher_resolver)
32690 return node_version_info->dispatcher_resolver;
32692 /* The first version in the chain corresponds to the default version. */
32693 default_ver_decl = node_version_info->next->this_node->decl;
32695 /* node is going to be an alias, so remove the finalized bit. */
32696 node->definition = false;
32698 resolver_decl = make_resolver_func (default_ver_decl,
32699 node->decl, &empty_bb);
32701 node_version_info->dispatcher_resolver = resolver_decl;
32703 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
32705 auto_vec<tree, 2> fn_ver_vec;
32707 for (versn_info = node_version_info->next; versn_info;
32708 versn_info = versn_info->next)
32710 versn = versn_info->this_node;
32711 /* Check for virtual functions here again, as by this time it should
32712 have been determined if this function needs a vtable index or
32713 not. This happens for methods in derived classes that override
32714 virtual methods in base classes but are not explicitly marked as
32715 virtual. */
32716 if (DECL_VINDEX (versn->decl))
32717 sorry ("Virtual function multiversioning not supported");
32719 fn_ver_vec.safe_push (versn->decl);
32722 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
32723 cgraph_edge::rebuild_edges ();
32724 pop_cfun ();
32725 return resolver_decl;
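/* The resolver produced above behaves roughly like the following C
   sketch (names are illustrative, not the exact generated symbols):

     void *
     foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (predicate for the highest-priority version holds)
	 return highest_priority_version;
       ...
       return default_version;
     }

   The ifunc alias created for the dispatcher then points at this
   resolver.  */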
32727 /* This builds the processor_model struct type defined in
32728 libgcc/config/i386/cpuinfo.c */
32730 static tree
32731 build_processor_model_struct (void)
32733 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
32734 "__cpu_features"};
32735 tree field = NULL_TREE, field_chain = NULL_TREE;
32736 int i;
32737 tree type = make_node (RECORD_TYPE);
32739 /* The first 3 fields are unsigned int. */
32740 for (i = 0; i < 3; ++i)
32742 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32743 get_identifier (field_name[i]), unsigned_type_node);
32744 if (field_chain != NULL_TREE)
32745 DECL_CHAIN (field) = field_chain;
32746 field_chain = field;
32749 /* The last field is an array of unsigned integers of size one. */
32750 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
32751 get_identifier (field_name[3]),
32752 build_array_type (unsigned_type_node,
32753 build_index_type (size_one_node)));
32754 if (field_chain != NULL_TREE)
32755 DECL_CHAIN (field) = field_chain;
32756 field_chain = field;
32758 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
32759 return type;
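/* The constructed record mirrors the libgcc definition, roughly:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   This is a sketch; the authoritative layout is in
   libgcc/config/i386/cpuinfo.c.  */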
32762 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
32764 static tree
32765 make_var_decl (tree type, const char *name)
32767 tree new_decl;
32769 new_decl = build_decl (UNKNOWN_LOCATION,
32770 VAR_DECL,
32771 get_identifier(name),
32772 type);
32774 DECL_EXTERNAL (new_decl) = 1;
32775 TREE_STATIC (new_decl) = 1;
32776 TREE_PUBLIC (new_decl) = 1;
32777 DECL_INITIAL (new_decl) = 0;
32778 DECL_ARTIFICIAL (new_decl) = 0;
32779 DECL_PRESERVE_P (new_decl) = 1;
32781 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
32782 assemble_variable (new_decl, 0, 0, 0);
32784 return new_decl;
32787 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
32788 into an integer defined in libgcc/config/i386/cpuinfo.c */
32790 static tree
32791 fold_builtin_cpu (tree fndecl, tree *args)
32793 unsigned int i;
32794 enum ix86_builtins fn_code = (enum ix86_builtins)
32795 DECL_FUNCTION_CODE (fndecl);
32796 tree param_string_cst = NULL;
32798 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
32799 enum processor_features
32801 F_CMOV = 0,
32802 F_MMX,
32803 F_POPCNT,
32804 F_SSE,
32805 F_SSE2,
32806 F_SSE3,
32807 F_SSSE3,
32808 F_SSE4_1,
32809 F_SSE4_2,
32810 F_AVX,
32811 F_AVX2,
32812 F_SSE4_A,
32813 F_FMA4,
32814 F_XOP,
32815 F_FMA,
32816 F_AVX512F,
32817 F_BMI,
32818 F_BMI2,
32819 F_AES,
32820 F_PCLMUL,
32821 F_AVX512VL,
32822 F_AVX512BW,
32823 F_AVX512DQ,
32824 F_AVX512CD,
32825 F_AVX512ER,
32826 F_AVX512PF,
32827 F_AVX512VBMI,
32828 F_AVX512IFMA,
32829 F_AVX5124VNNIW,
32830 F_AVX5124FMAPS,
32831 F_AVX512VPOPCNTDQ,
32832 F_AVX512VBMI2,
32833 F_GFNI,
32834 F_VPCLMULQDQ,
32835 F_AVX512VNNI,
32836 F_AVX512BITALG,
32837 F_MAX
32840 /* These are the values for vendor types and cpu types and subtypes
32841 in cpuinfo.c. Cpu types and subtypes should have the corresponding
32842 start value subtracted from them. */
32843 enum processor_model
32845 M_INTEL = 1,
32846 M_AMD,
32847 M_CPU_TYPE_START,
32848 M_INTEL_BONNELL,
32849 M_INTEL_CORE2,
32850 M_INTEL_COREI7,
32851 M_AMDFAM10H,
32852 M_AMDFAM15H,
32853 M_INTEL_SILVERMONT,
32854 M_INTEL_KNL,
32855 M_AMD_BTVER1,
32856 M_AMD_BTVER2,
32857 M_AMDFAM17H,
32858 M_INTEL_KNM,
32859 M_INTEL_GOLDMONT,
32860 M_INTEL_GOLDMONT_PLUS,
32861 M_INTEL_TREMONT,
32862 M_CPU_SUBTYPE_START,
32863 M_INTEL_COREI7_NEHALEM,
32864 M_INTEL_COREI7_WESTMERE,
32865 M_INTEL_COREI7_SANDYBRIDGE,
32866 M_AMDFAM10H_BARCELONA,
32867 M_AMDFAM10H_SHANGHAI,
32868 M_AMDFAM10H_ISTANBUL,
32869 M_AMDFAM15H_BDVER1,
32870 M_AMDFAM15H_BDVER2,
32871 M_AMDFAM15H_BDVER3,
32872 M_AMDFAM15H_BDVER4,
32873 M_AMDFAM17H_ZNVER1,
32874 M_INTEL_COREI7_IVYBRIDGE,
32875 M_INTEL_COREI7_HASWELL,
32876 M_INTEL_COREI7_BROADWELL,
32877 M_INTEL_COREI7_SKYLAKE,
32878 M_INTEL_COREI7_SKYLAKE_AVX512,
32879 M_INTEL_COREI7_CANNONLAKE,
32880 M_INTEL_COREI7_ICELAKE_CLIENT,
32881 M_INTEL_COREI7_ICELAKE_SERVER
32884 static struct _arch_names_table
32886 const char *const name;
32887 const enum processor_model model;
32889 const arch_names_table[] =
32891 {"amd", M_AMD},
32892 {"intel", M_INTEL},
32893 {"atom", M_INTEL_BONNELL},
32894 {"slm", M_INTEL_SILVERMONT},
32895 {"core2", M_INTEL_CORE2},
32896 {"corei7", M_INTEL_COREI7},
32897 {"nehalem", M_INTEL_COREI7_NEHALEM},
32898 {"westmere", M_INTEL_COREI7_WESTMERE},
32899 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
32900 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
32901 {"haswell", M_INTEL_COREI7_HASWELL},
32902 {"broadwell", M_INTEL_COREI7_BROADWELL},
32903 {"skylake", M_INTEL_COREI7_SKYLAKE},
32904 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
32905 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
32906 {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT},
32907 {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER},
32908 {"bonnell", M_INTEL_BONNELL},
32909 {"silvermont", M_INTEL_SILVERMONT},
32910 {"goldmont", M_INTEL_GOLDMONT},
32911 {"goldmont-plus", M_INTEL_GOLDMONT_PLUS},
32912 {"tremont", M_INTEL_TREMONT},
32913 {"knl", M_INTEL_KNL},
32914 {"knm", M_INTEL_KNM},
32915 {"amdfam10h", M_AMDFAM10H},
32916 {"barcelona", M_AMDFAM10H_BARCELONA},
32917 {"shanghai", M_AMDFAM10H_SHANGHAI},
32918 {"istanbul", M_AMDFAM10H_ISTANBUL},
32919 {"btver1", M_AMD_BTVER1},
32920 {"amdfam15h", M_AMDFAM15H},
32921 {"bdver1", M_AMDFAM15H_BDVER1},
32922 {"bdver2", M_AMDFAM15H_BDVER2},
32923 {"bdver3", M_AMDFAM15H_BDVER3},
32924 {"bdver4", M_AMDFAM15H_BDVER4},
32925 {"btver2", M_AMD_BTVER2},
32926 {"amdfam17h", M_AMDFAM17H},
32927 {"znver1", M_AMDFAM17H_ZNVER1},
32930 static struct _isa_names_table
32932 const char *const name;
32933 const enum processor_features feature;
32935 const isa_names_table[] =
32937 {"cmov", F_CMOV},
32938 {"mmx", F_MMX},
32939 {"popcnt", F_POPCNT},
32940 {"sse", F_SSE},
32941 {"sse2", F_SSE2},
32942 {"sse3", F_SSE3},
32943 {"ssse3", F_SSSE3},
32944 {"sse4a", F_SSE4_A},
32945 {"sse4.1", F_SSE4_1},
32946 {"sse4.2", F_SSE4_2},
32947 {"avx", F_AVX},
32948 {"fma4", F_FMA4},
32949 {"xop", F_XOP},
32950 {"fma", F_FMA},
32951 {"avx2", F_AVX2},
32952 {"avx512f", F_AVX512F},
32953 {"bmi", F_BMI},
32954 {"bmi2", F_BMI2},
32955 {"aes", F_AES},
32956 {"pclmul", F_PCLMUL},
32957 {"avx512vl",F_AVX512VL},
32958 {"avx512bw",F_AVX512BW},
32959 {"avx512dq",F_AVX512DQ},
32960 {"avx512cd",F_AVX512CD},
32961 {"avx512er",F_AVX512ER},
32962 {"avx512pf",F_AVX512PF},
32963 {"avx512vbmi",F_AVX512VBMI},
32964 {"avx512ifma",F_AVX512IFMA},
32965 {"avx5124vnniw",F_AVX5124VNNIW},
32966 {"avx5124fmaps",F_AVX5124FMAPS},
32967 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ},
32968 {"avx512vbmi2", F_AVX512VBMI2},
32969 {"gfni", F_GFNI},
32970 {"vpclmulqdq", F_VPCLMULQDQ},
32971 {"avx512vnni", F_AVX512VNNI},
32972 {"avx512bitalg", F_AVX512BITALG}
32975 tree __processor_model_type = build_processor_model_struct ();
32976 tree __cpu_model_var = make_var_decl (__processor_model_type,
32977 "__cpu_model");
32980 varpool_node::add (__cpu_model_var);
32982 gcc_assert ((args != NULL) && (*args != NULL));
32984 param_string_cst = *args;
32985 while (param_string_cst
32986 && TREE_CODE (param_string_cst) != STRING_CST)
32988 /* *args must be an expr that can contain other EXPRS leading to a
32989 STRING_CST. */
32990 if (!EXPR_P (param_string_cst))
32992 error ("Parameter to builtin must be a string constant or literal");
32993 return integer_zero_node;
32995 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
32998 gcc_assert (param_string_cst);
33000 if (fn_code == IX86_BUILTIN_CPU_IS)
33002 tree ref;
33003 tree field;
33004 tree final;
33006 unsigned int field_val = 0;
33007 unsigned int NUM_ARCH_NAMES
33008 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
33010 for (i = 0; i < NUM_ARCH_NAMES; i++)
33011 if (strcmp (arch_names_table[i].name,
33012 TREE_STRING_POINTER (param_string_cst)) == 0)
33013 break;
33015 if (i == NUM_ARCH_NAMES)
33017 error ("Parameter to builtin not valid: %s",
33018 TREE_STRING_POINTER (param_string_cst));
33019 return integer_zero_node;
33022 field = TYPE_FIELDS (__processor_model_type);
33023 field_val = arch_names_table[i].model;
33025 /* CPU types are stored in the next field. */
33026 if (field_val > M_CPU_TYPE_START
33027 && field_val < M_CPU_SUBTYPE_START)
33029 field = DECL_CHAIN (field);
33030 field_val -= M_CPU_TYPE_START;
33033 /* CPU subtypes are stored in the next field. */
33034 if (field_val > M_CPU_SUBTYPE_START)
33036 field = DECL_CHAIN ( DECL_CHAIN (field));
33037 field_val -= M_CPU_SUBTYPE_START;
33040 /* Get the appropriate field in __cpu_model. */
33041 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33042 field, NULL_TREE);
33044 /* Check the value. */
33045 final = build2 (EQ_EXPR, unsigned_type_node, ref,
33046 build_int_cstu (unsigned_type_node, field_val));
33047 return build1 (CONVERT_EXPR, integer_type_node, final);
33049 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
33051 tree ref;
33052 tree array_elt;
33053 tree field;
33054 tree final;
33056 unsigned int field_val = 0;
33057 unsigned int NUM_ISA_NAMES
33058 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
33060 for (i = 0; i < NUM_ISA_NAMES; i++)
33061 if (strcmp (isa_names_table[i].name,
33062 TREE_STRING_POINTER (param_string_cst)) == 0)
33063 break;
33065 if (i == NUM_ISA_NAMES)
33067 error ("Parameter to builtin not valid: %s",
33068 TREE_STRING_POINTER (param_string_cst));
33069 return integer_zero_node;
33072 if (isa_names_table[i].feature >= 32)
33074 tree __cpu_features2_var = make_var_decl (unsigned_type_node,
33075 "__cpu_features2");
33077 varpool_node::add (__cpu_features2_var);
33078 field_val = (1U << (isa_names_table[i].feature - 32));
33079 /* Return __cpu_features2 & field_val */
33080 final = build2 (BIT_AND_EXPR, unsigned_type_node,
33081 __cpu_features2_var,
33082 build_int_cstu (unsigned_type_node, field_val));
33083 return build1 (CONVERT_EXPR, integer_type_node, final);
33086 field = TYPE_FIELDS (__processor_model_type);
33087 /* Get the last field, which is __cpu_features. */
33088 while (DECL_CHAIN (field))
33089 field = DECL_CHAIN (field);
33091 /* Get the appropriate field: __cpu_model.__cpu_features */
33092 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
33093 field, NULL_TREE);
33095 /* Access the 0th element of __cpu_features array. */
33096 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
33097 integer_zero_node, NULL_TREE, NULL_TREE);
33099 field_val = (1U << isa_names_table[i].feature);
33100 /* Return __cpu_model.__cpu_features[0] & field_val */
33101 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
33102 build_int_cstu (unsigned_type_node, field_val));
33103 return build1 (CONVERT_EXPR, integer_type_node, final);
33105 gcc_unreachable ();
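/* A sketch of the folding performed above:

     __builtin_cpu_is ("amdfam10h")
       => (int) (__cpu_model.__cpu_type
		 == M_AMDFAM10H - M_CPU_TYPE_START)

     __builtin_cpu_supports ("avx2")
       => (int) (__cpu_model.__cpu_features[0] & (1U << F_AVX2))

   Features whose index is 32 or more are tested against the separate
   __cpu_features2 word instead.  */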
33108 /* Return the shift count of a vector by scalar shift builtin second argument
33109 ARG1. */
33110 static tree
33111 ix86_vector_shift_count (tree arg1)
33113 if (tree_fits_uhwi_p (arg1))
33114 return arg1;
33115 else if (TREE_CODE (arg1) == VECTOR_CST && CHAR_BIT == 8)
33117 /* The count argument is weird, passed in as various 128-bit
33118 (or 64-bit) vectors; the low 64 bits of it are the count. */
33119 unsigned char buf[16];
33120 int len = native_encode_expr (arg1, buf, 16);
33121 if (len == 0)
33122 return NULL_TREE;
33123 tree t = native_interpret_expr (uint64_type_node, buf, len);
33124 if (t && tree_fits_uhwi_p (t))
33125 return t;
33127 return NULL_TREE;
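/* For example, at the intrinsic level this handles both forms of the
   count (a sketch):

     _mm_slli_epi32 (v, 3)                      count is the constant 3
     _mm_sll_epi32 (v, _mm_set_epi64x (0, 3))   count is a vector whose
						low 64 bits are 3

   For the vector form the constant is re-encoded and its low 64 bits
   are re-interpreted as the shift count.  */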
33130 static tree
33131 ix86_fold_builtin (tree fndecl, int n_args,
33132 tree *args, bool ignore ATTRIBUTE_UNUSED)
33134 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33136 enum ix86_builtins fn_code = (enum ix86_builtins)
33137 DECL_FUNCTION_CODE (fndecl);
33138 enum rtx_code rcode;
33139 bool is_vshift;
33140 unsigned HOST_WIDE_INT mask;
33142 switch (fn_code)
33144 case IX86_BUILTIN_CPU_IS:
33145 case IX86_BUILTIN_CPU_SUPPORTS:
33146 gcc_assert (n_args == 1);
33147 return fold_builtin_cpu (fndecl, args);
33149 case IX86_BUILTIN_NANQ:
33150 case IX86_BUILTIN_NANSQ:
33152 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33153 const char *str = c_getstr (*args);
33154 int quiet = fn_code == IX86_BUILTIN_NANQ;
33155 REAL_VALUE_TYPE real;
33157 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
33158 return build_real (type, real);
33159 return NULL_TREE;
33162 case IX86_BUILTIN_INFQ:
33163 case IX86_BUILTIN_HUGE_VALQ:
33165 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33166 REAL_VALUE_TYPE inf;
33167 real_inf (&inf);
33168 return build_real (type, inf);
33171 case IX86_BUILTIN_TZCNT16:
33172 case IX86_BUILTIN_CTZS:
33173 case IX86_BUILTIN_TZCNT32:
33174 case IX86_BUILTIN_TZCNT64:
33175 gcc_assert (n_args == 1);
33176 if (TREE_CODE (args[0]) == INTEGER_CST)
33178 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33179 tree arg = args[0];
33180 if (fn_code == IX86_BUILTIN_TZCNT16
33181 || fn_code == IX86_BUILTIN_CTZS)
33182 arg = fold_convert (short_unsigned_type_node, arg);
33183 if (integer_zerop (arg))
33184 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33185 else
33186 return fold_const_call (CFN_CTZ, type, arg);
33188 break;
33190 case IX86_BUILTIN_LZCNT16:
33191 case IX86_BUILTIN_CLZS:
33192 case IX86_BUILTIN_LZCNT32:
33193 case IX86_BUILTIN_LZCNT64:
33194 gcc_assert (n_args == 1);
33195 if (TREE_CODE (args[0]) == INTEGER_CST)
33197 tree type = TREE_TYPE (TREE_TYPE (fndecl));
33198 tree arg = args[0];
33199 if (fn_code == IX86_BUILTIN_LZCNT16
33200 || fn_code == IX86_BUILTIN_CLZS)
33201 arg = fold_convert (short_unsigned_type_node, arg);
33202 if (integer_zerop (arg))
33203 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
33204 else
33205 return fold_const_call (CFN_CLZ, type, arg);
33207 break;
33209 case IX86_BUILTIN_BEXTR32:
33210 case IX86_BUILTIN_BEXTR64:
33211 case IX86_BUILTIN_BEXTRI32:
33212 case IX86_BUILTIN_BEXTRI64:
33213 gcc_assert (n_args == 2);
33214 if (tree_fits_uhwi_p (args[1]))
33216 unsigned HOST_WIDE_INT res = 0;
33217 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
33218 unsigned int start = tree_to_uhwi (args[1]);
33219 unsigned int len = (start & 0xff00) >> 8;
33220 start &= 0xff;
33221 if (start >= prec || len == 0)
33222 res = 0;
33223 else if (!tree_fits_uhwi_p (args[0]))
33224 break;
33225 else
33226 res = tree_to_uhwi (args[0]) >> start;
33227 if (len > prec)
33228 len = prec;
33229 if (len < HOST_BITS_PER_WIDE_INT)
33230 res &= (HOST_WIDE_INT_1U << len) - 1;
33231 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33233 break;
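/* Worked example of the BEXTR folding above (a sketch): the second
   argument packs the start bit in bits 0..7 and the length in bits
   8..15, so

     __builtin_ia32_bextr_u32 (0xdeadbeef, 4 | (8 << 8)) == 0xee

   i.e. extract 8 bits starting at bit 4: (0xdeadbeef >> 4) & 0xff.  */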
33235 case IX86_BUILTIN_BZHI32:
33236 case IX86_BUILTIN_BZHI64:
33237 gcc_assert (n_args == 2);
33238 if (tree_fits_uhwi_p (args[1]))
33240 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
33241 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
33242 return args[0];
33243 if (!tree_fits_uhwi_p (args[0]))
33244 break;
33245 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
33246 res &= ~(HOST_WIDE_INT_M1U << idx);
33247 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33249 break;
33251 case IX86_BUILTIN_PDEP32:
33252 case IX86_BUILTIN_PDEP64:
33253 gcc_assert (n_args == 2);
33254 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33256 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33257 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33258 unsigned HOST_WIDE_INT res = 0;
33259 unsigned HOST_WIDE_INT m, k = 1;
33260 for (m = 1; m; m <<= 1)
33261 if ((mask & m) != 0)
33263 if ((src & k) != 0)
33264 res |= m;
33265 k <<= 1;
33267 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33269 break;
33271 case IX86_BUILTIN_PEXT32:
33272 case IX86_BUILTIN_PEXT64:
33273 gcc_assert (n_args == 2);
33274 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
33276 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
33277 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
33278 unsigned HOST_WIDE_INT res = 0;
33279 unsigned HOST_WIDE_INT m, k = 1;
33280 for (m = 1; m; m <<= 1)
33281 if ((mask & m) != 0)
33283 if ((src & m) != 0)
33284 res |= k;
33285 k <<= 1;
33287 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
33289 break;
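/* Worked example of the PDEP/PEXT constant folding above (a sketch):

     _pdep_u32 (0x0000000b, 0x000000f0) == 0x000000b0
       the low source bits 1,1,0,1 are deposited into mask bits 4..7

     _pext_u32 (0x000000b0, 0x000000f0) == 0x0000000b
       mask bits 4..7 of the source are extracted into the low bits  */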
33291 case IX86_BUILTIN_MOVMSKPS:
33292 case IX86_BUILTIN_PMOVMSKB:
33293 case IX86_BUILTIN_MOVMSKPD:
33294 case IX86_BUILTIN_PMOVMSKB128:
33295 case IX86_BUILTIN_MOVMSKPD256:
33296 case IX86_BUILTIN_MOVMSKPS256:
33297 case IX86_BUILTIN_PMOVMSKB256:
33298 gcc_assert (n_args == 1);
33299 if (TREE_CODE (args[0]) == VECTOR_CST)
33301 HOST_WIDE_INT res = 0;
33302 for (unsigned i = 0; i < VECTOR_CST_NELTS (args[0]); ++i)
33304 tree e = VECTOR_CST_ELT (args[0], i);
33305 if (TREE_CODE (e) == INTEGER_CST && !TREE_OVERFLOW (e))
33307 if (wi::neg_p (wi::to_wide (e)))
33308 res |= HOST_WIDE_INT_1 << i;
33310 else if (TREE_CODE (e) == REAL_CST && !TREE_OVERFLOW (e))
33312 if (TREE_REAL_CST (e).sign)
33313 res |= HOST_WIDE_INT_1 << i;
33315 else
33316 return NULL_TREE;
33318 return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), res);
33320 break;
33322 case IX86_BUILTIN_PSLLD:
33323 case IX86_BUILTIN_PSLLD128:
33324 case IX86_BUILTIN_PSLLD128_MASK:
33325 case IX86_BUILTIN_PSLLD256:
33326 case IX86_BUILTIN_PSLLD256_MASK:
33327 case IX86_BUILTIN_PSLLD512:
33328 case IX86_BUILTIN_PSLLDI:
33329 case IX86_BUILTIN_PSLLDI128:
33330 case IX86_BUILTIN_PSLLDI128_MASK:
33331 case IX86_BUILTIN_PSLLDI256:
33332 case IX86_BUILTIN_PSLLDI256_MASK:
33333 case IX86_BUILTIN_PSLLDI512:
33334 case IX86_BUILTIN_PSLLQ:
33335 case IX86_BUILTIN_PSLLQ128:
33336 case IX86_BUILTIN_PSLLQ128_MASK:
33337 case IX86_BUILTIN_PSLLQ256:
33338 case IX86_BUILTIN_PSLLQ256_MASK:
33339 case IX86_BUILTIN_PSLLQ512:
33340 case IX86_BUILTIN_PSLLQI:
33341 case IX86_BUILTIN_PSLLQI128:
33342 case IX86_BUILTIN_PSLLQI128_MASK:
33343 case IX86_BUILTIN_PSLLQI256:
33344 case IX86_BUILTIN_PSLLQI256_MASK:
33345 case IX86_BUILTIN_PSLLQI512:
33346 case IX86_BUILTIN_PSLLW:
33347 case IX86_BUILTIN_PSLLW128:
33348 case IX86_BUILTIN_PSLLW128_MASK:
33349 case IX86_BUILTIN_PSLLW256:
33350 case IX86_BUILTIN_PSLLW256_MASK:
33351 case IX86_BUILTIN_PSLLW512_MASK:
33352 case IX86_BUILTIN_PSLLWI:
33353 case IX86_BUILTIN_PSLLWI128:
33354 case IX86_BUILTIN_PSLLWI128_MASK:
33355 case IX86_BUILTIN_PSLLWI256:
33356 case IX86_BUILTIN_PSLLWI256_MASK:
33357 case IX86_BUILTIN_PSLLWI512_MASK:
33358 rcode = ASHIFT;
33359 is_vshift = false;
33360 goto do_shift;
33361 case IX86_BUILTIN_PSRAD:
33362 case IX86_BUILTIN_PSRAD128:
33363 case IX86_BUILTIN_PSRAD128_MASK:
33364 case IX86_BUILTIN_PSRAD256:
33365 case IX86_BUILTIN_PSRAD256_MASK:
33366 case IX86_BUILTIN_PSRAD512:
33367 case IX86_BUILTIN_PSRADI:
33368 case IX86_BUILTIN_PSRADI128:
33369 case IX86_BUILTIN_PSRADI128_MASK:
33370 case IX86_BUILTIN_PSRADI256:
33371 case IX86_BUILTIN_PSRADI256_MASK:
33372 case IX86_BUILTIN_PSRADI512:
33373 case IX86_BUILTIN_PSRAQ128_MASK:
33374 case IX86_BUILTIN_PSRAQ256_MASK:
33375 case IX86_BUILTIN_PSRAQ512:
33376 case IX86_BUILTIN_PSRAQI128_MASK:
33377 case IX86_BUILTIN_PSRAQI256_MASK:
33378 case IX86_BUILTIN_PSRAQI512:
33379 case IX86_BUILTIN_PSRAW:
33380 case IX86_BUILTIN_PSRAW128:
33381 case IX86_BUILTIN_PSRAW128_MASK:
33382 case IX86_BUILTIN_PSRAW256:
33383 case IX86_BUILTIN_PSRAW256_MASK:
33384 case IX86_BUILTIN_PSRAW512:
33385 case IX86_BUILTIN_PSRAWI:
33386 case IX86_BUILTIN_PSRAWI128:
33387 case IX86_BUILTIN_PSRAWI128_MASK:
33388 case IX86_BUILTIN_PSRAWI256:
33389 case IX86_BUILTIN_PSRAWI256_MASK:
33390 case IX86_BUILTIN_PSRAWI512:
33391 rcode = ASHIFTRT;
33392 is_vshift = false;
33393 goto do_shift;
33394 case IX86_BUILTIN_PSRLD:
33395 case IX86_BUILTIN_PSRLD128:
33396 case IX86_BUILTIN_PSRLD128_MASK:
33397 case IX86_BUILTIN_PSRLD256:
33398 case IX86_BUILTIN_PSRLD256_MASK:
33399 case IX86_BUILTIN_PSRLD512:
33400 case IX86_BUILTIN_PSRLDI:
33401 case IX86_BUILTIN_PSRLDI128:
33402 case IX86_BUILTIN_PSRLDI128_MASK:
33403 case IX86_BUILTIN_PSRLDI256:
33404 case IX86_BUILTIN_PSRLDI256_MASK:
33405 case IX86_BUILTIN_PSRLDI512:
33406 case IX86_BUILTIN_PSRLQ:
33407 case IX86_BUILTIN_PSRLQ128:
33408 case IX86_BUILTIN_PSRLQ128_MASK:
33409 case IX86_BUILTIN_PSRLQ256:
33410 case IX86_BUILTIN_PSRLQ256_MASK:
33411 case IX86_BUILTIN_PSRLQ512:
33412 case IX86_BUILTIN_PSRLQI:
33413 case IX86_BUILTIN_PSRLQI128:
33414 case IX86_BUILTIN_PSRLQI128_MASK:
33415 case IX86_BUILTIN_PSRLQI256:
33416 case IX86_BUILTIN_PSRLQI256_MASK:
33417 case IX86_BUILTIN_PSRLQI512:
33418 case IX86_BUILTIN_PSRLW:
33419 case IX86_BUILTIN_PSRLW128:
33420 case IX86_BUILTIN_PSRLW128_MASK:
33421 case IX86_BUILTIN_PSRLW256:
33422 case IX86_BUILTIN_PSRLW256_MASK:
33423 case IX86_BUILTIN_PSRLW512:
33424 case IX86_BUILTIN_PSRLWI:
33425 case IX86_BUILTIN_PSRLWI128:
33426 case IX86_BUILTIN_PSRLWI128_MASK:
33427 case IX86_BUILTIN_PSRLWI256:
33428 case IX86_BUILTIN_PSRLWI256_MASK:
33429 case IX86_BUILTIN_PSRLWI512:
33430 rcode = LSHIFTRT;
33431 is_vshift = false;
33432 goto do_shift;
33433 case IX86_BUILTIN_PSLLVV16HI:
33434 case IX86_BUILTIN_PSLLVV16SI:
33435 case IX86_BUILTIN_PSLLVV2DI:
33436 case IX86_BUILTIN_PSLLVV2DI_MASK:
33437 case IX86_BUILTIN_PSLLVV32HI:
33438 case IX86_BUILTIN_PSLLVV4DI:
33439 case IX86_BUILTIN_PSLLVV4DI_MASK:
33440 case IX86_BUILTIN_PSLLVV4SI:
33441 case IX86_BUILTIN_PSLLVV4SI_MASK:
33442 case IX86_BUILTIN_PSLLVV8DI:
33443 case IX86_BUILTIN_PSLLVV8HI:
33444 case IX86_BUILTIN_PSLLVV8SI:
33445 case IX86_BUILTIN_PSLLVV8SI_MASK:
33446 rcode = ASHIFT;
33447 is_vshift = true;
33448 goto do_shift;
33449 case IX86_BUILTIN_PSRAVQ128:
33450 case IX86_BUILTIN_PSRAVQ256:
33451 case IX86_BUILTIN_PSRAVV16HI:
33452 case IX86_BUILTIN_PSRAVV16SI:
33453 case IX86_BUILTIN_PSRAVV32HI:
33454 case IX86_BUILTIN_PSRAVV4SI:
33455 case IX86_BUILTIN_PSRAVV4SI_MASK:
33456 case IX86_BUILTIN_PSRAVV8DI:
33457 case IX86_BUILTIN_PSRAVV8HI:
33458 case IX86_BUILTIN_PSRAVV8SI:
33459 case IX86_BUILTIN_PSRAVV8SI_MASK:
33460 rcode = ASHIFTRT;
33461 is_vshift = true;
33462 goto do_shift;
33463 case IX86_BUILTIN_PSRLVV16HI:
33464 case IX86_BUILTIN_PSRLVV16SI:
33465 case IX86_BUILTIN_PSRLVV2DI:
33466 case IX86_BUILTIN_PSRLVV2DI_MASK:
33467 case IX86_BUILTIN_PSRLVV32HI:
33468 case IX86_BUILTIN_PSRLVV4DI:
33469 case IX86_BUILTIN_PSRLVV4DI_MASK:
33470 case IX86_BUILTIN_PSRLVV4SI:
33471 case IX86_BUILTIN_PSRLVV4SI_MASK:
33472 case IX86_BUILTIN_PSRLVV8DI:
33473 case IX86_BUILTIN_PSRLVV8HI:
33474 case IX86_BUILTIN_PSRLVV8SI:
33475 case IX86_BUILTIN_PSRLVV8SI_MASK:
33476 rcode = LSHIFTRT;
33477 is_vshift = true;
33478 goto do_shift;
33480 do_shift:
33481 gcc_assert (n_args >= 2);
33482 if (TREE_CODE (args[0]) != VECTOR_CST)
33483 break;
33484 mask = HOST_WIDE_INT_M1U;
33485 if (n_args > 2)
33487 /* This is a masked shift. */
33488 if (!tree_fits_uhwi_p (args[n_args - 1])
33489 || TREE_SIDE_EFFECTS (args[n_args - 2]))
33490 break;
33491 mask = tree_to_uhwi (args[n_args - 1]);
33492 unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
33493 mask |= HOST_WIDE_INT_M1U << elems;
33494 if (mask != HOST_WIDE_INT_M1U
33495 && TREE_CODE (args[n_args - 2]) != VECTOR_CST)
33496 break;
33497 if (mask == (HOST_WIDE_INT_M1U << elems))
33498 return args[n_args - 2];
33500 if (is_vshift && TREE_CODE (args[1]) != VECTOR_CST)
33501 break;
33502 if (tree tem = (is_vshift ? integer_one_node
33503 : ix86_vector_shift_count (args[1])))
33505 unsigned HOST_WIDE_INT count = tree_to_uhwi (tem);
33506 unsigned HOST_WIDE_INT prec
33507 = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (args[0])));
33508 if (count == 0 && mask == HOST_WIDE_INT_M1U)
33509 return args[0];
33510 if (count >= prec)
33512 if (rcode == ASHIFTRT)
33513 count = prec - 1;
33514 else if (mask == HOST_WIDE_INT_M1U)
33515 return build_zero_cst (TREE_TYPE (args[0]));
33517 tree countt = NULL_TREE;
33518 if (!is_vshift)
33520 if (count >= prec)
33521 countt = integer_zero_node;
33522 else
33523 countt = build_int_cst (integer_type_node, count);
33525 tree_vector_builder builder;
33526 builder.new_unary_operation (TREE_TYPE (args[0]), args[0],
33527 false);
33528 unsigned int cnt = builder.encoded_nelts ();
33529 for (unsigned int i = 0; i < cnt; ++i)
33531 tree elt = VECTOR_CST_ELT (args[0], i);
33532 if (TREE_CODE (elt) != INTEGER_CST || TREE_OVERFLOW (elt))
33533 return NULL_TREE;
33534 tree type = TREE_TYPE (elt);
33535 if (rcode == LSHIFTRT)
33536 elt = fold_convert (unsigned_type_for (type), elt);
33537 if (is_vshift)
33539 countt = VECTOR_CST_ELT (args[1], i);
33540 if (TREE_CODE (countt) != INTEGER_CST
33541 || TREE_OVERFLOW (countt))
33542 return NULL_TREE;
33543 if (wi::neg_p (wi::to_wide (countt))
33544 || wi::to_widest (countt) >= prec)
33546 if (rcode == ASHIFTRT)
33547 countt = build_int_cst (TREE_TYPE (countt),
33548 prec - 1);
33549 else
33551 elt = build_zero_cst (TREE_TYPE (elt));
33552 countt = build_zero_cst (TREE_TYPE (countt));
33556 else if (count >= prec)
33557 elt = build_zero_cst (TREE_TYPE (elt));
33558 elt = const_binop (rcode == ASHIFT
33559 ? LSHIFT_EXPR : RSHIFT_EXPR,
33560 TREE_TYPE (elt), elt, countt);
33561 if (!elt || TREE_CODE (elt) != INTEGER_CST)
33562 return NULL_TREE;
33563 if (rcode == LSHIFTRT)
33564 elt = fold_convert (type, elt);
33565 if ((mask & (HOST_WIDE_INT_1U << i)) == 0)
33567 elt = VECTOR_CST_ELT (args[n_args - 2], i);
33568 if (TREE_CODE (elt) != INTEGER_CST
33569 || TREE_OVERFLOW (elt))
33570 return NULL_TREE;
33572 builder.quick_push (elt);
33574 return builder.build ();
33576 break;
33578 default:
33579 break;
33583 #ifdef SUBTARGET_FOLD_BUILTIN
33584 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
33585 #endif
33587 return NULL_TREE;
33590 /* Fold a MD builtin (use ix86_fold_builtin for folding into
33591 constant) in GIMPLE. */
33593 bool
33594 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
33596 gimple *stmt = gsi_stmt (*gsi);
33597 tree fndecl = gimple_call_fndecl (stmt);
33598 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
33599 int n_args = gimple_call_num_args (stmt);
33600 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
33601 tree decl = NULL_TREE;
33602 tree arg0, arg1;
33603 enum rtx_code rcode;
33604 unsigned HOST_WIDE_INT count;
33605 bool is_vshift;
33607 switch (fn_code)
33609 case IX86_BUILTIN_TZCNT32:
33610 decl = builtin_decl_implicit (BUILT_IN_CTZ);
33611 goto fold_tzcnt_lzcnt;
33613 case IX86_BUILTIN_TZCNT64:
33614 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
33615 goto fold_tzcnt_lzcnt;
33617 case IX86_BUILTIN_LZCNT32:
33618 decl = builtin_decl_implicit (BUILT_IN_CLZ);
33619 goto fold_tzcnt_lzcnt;
33621 case IX86_BUILTIN_LZCNT64:
33622 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
33623 goto fold_tzcnt_lzcnt;
33625 fold_tzcnt_lzcnt:
33626 gcc_assert (n_args == 1);
33627 arg0 = gimple_call_arg (stmt, 0);
33628 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
33630 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
33631 /* If arg0 is provably non-zero, optimize into the generic
33632 __builtin_c[tl]z{,ll} functions, which the middle-end handles
33633 better. */
33634 if (!expr_not_equal_to (arg0, wi::zero (prec)))
33635 return false;
33637 location_t loc = gimple_location (stmt);
33638 gimple *g = gimple_build_call (decl, 1, arg0);
33639 gimple_set_location (g, loc);
33640 tree lhs = make_ssa_name (integer_type_node);
33641 gimple_call_set_lhs (g, lhs);
33642 gsi_insert_before (gsi, g, GSI_SAME_STMT);
33643 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
33644 gimple_set_location (g, loc);
33645 gsi_replace (gsi, g, false);
33646 return true;
33648 break;
33650 case IX86_BUILTIN_BZHI32:
33651 case IX86_BUILTIN_BZHI64:
33652 gcc_assert (n_args == 2);
33653 arg1 = gimple_call_arg (stmt, 1);
33654 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
33656 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
33657 arg0 = gimple_call_arg (stmt, 0);
33658 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
33659 break;
33660 location_t loc = gimple_location (stmt);
33661 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33662 gimple_set_location (g, loc);
33663 gsi_replace (gsi, g, false);
33664 return true;
33666 break;
33668 case IX86_BUILTIN_PDEP32:
33669 case IX86_BUILTIN_PDEP64:
33670 case IX86_BUILTIN_PEXT32:
33671 case IX86_BUILTIN_PEXT64:
33672 gcc_assert (n_args == 2);
33673 arg1 = gimple_call_arg (stmt, 1);
33674 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
33676 location_t loc = gimple_location (stmt);
33677 arg0 = gimple_call_arg (stmt, 0);
33678 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33679 gimple_set_location (g, loc);
33680 gsi_replace (gsi, g, false);
33681 return true;
33683 break;
33685 case IX86_BUILTIN_PSLLD:
33686 case IX86_BUILTIN_PSLLD128:
33687 case IX86_BUILTIN_PSLLD128_MASK:
33688 case IX86_BUILTIN_PSLLD256:
33689 case IX86_BUILTIN_PSLLD256_MASK:
33690 case IX86_BUILTIN_PSLLD512:
33691 case IX86_BUILTIN_PSLLDI:
33692 case IX86_BUILTIN_PSLLDI128:
33693 case IX86_BUILTIN_PSLLDI128_MASK:
33694 case IX86_BUILTIN_PSLLDI256:
33695 case IX86_BUILTIN_PSLLDI256_MASK:
33696 case IX86_BUILTIN_PSLLDI512:
33697 case IX86_BUILTIN_PSLLQ:
33698 case IX86_BUILTIN_PSLLQ128:
33699 case IX86_BUILTIN_PSLLQ128_MASK:
33700 case IX86_BUILTIN_PSLLQ256:
33701 case IX86_BUILTIN_PSLLQ256_MASK:
33702 case IX86_BUILTIN_PSLLQ512:
33703 case IX86_BUILTIN_PSLLQI:
33704 case IX86_BUILTIN_PSLLQI128:
33705 case IX86_BUILTIN_PSLLQI128_MASK:
33706 case IX86_BUILTIN_PSLLQI256:
33707 case IX86_BUILTIN_PSLLQI256_MASK:
33708 case IX86_BUILTIN_PSLLQI512:
33709 case IX86_BUILTIN_PSLLW:
33710 case IX86_BUILTIN_PSLLW128:
33711 case IX86_BUILTIN_PSLLW128_MASK:
33712 case IX86_BUILTIN_PSLLW256:
33713 case IX86_BUILTIN_PSLLW256_MASK:
33714 case IX86_BUILTIN_PSLLW512_MASK:
33715 case IX86_BUILTIN_PSLLWI:
33716 case IX86_BUILTIN_PSLLWI128:
33717 case IX86_BUILTIN_PSLLWI128_MASK:
33718 case IX86_BUILTIN_PSLLWI256:
33719 case IX86_BUILTIN_PSLLWI256_MASK:
33720 case IX86_BUILTIN_PSLLWI512_MASK:
33721 rcode = ASHIFT;
33722 is_vshift = false;
33723 goto do_shift;
33724 case IX86_BUILTIN_PSRAD:
33725 case IX86_BUILTIN_PSRAD128:
33726 case IX86_BUILTIN_PSRAD128_MASK:
33727 case IX86_BUILTIN_PSRAD256:
33728 case IX86_BUILTIN_PSRAD256_MASK:
33729 case IX86_BUILTIN_PSRAD512:
33730 case IX86_BUILTIN_PSRADI:
33731 case IX86_BUILTIN_PSRADI128:
33732 case IX86_BUILTIN_PSRADI128_MASK:
33733 case IX86_BUILTIN_PSRADI256:
33734 case IX86_BUILTIN_PSRADI256_MASK:
33735 case IX86_BUILTIN_PSRADI512:
33736 case IX86_BUILTIN_PSRAQ128_MASK:
33737 case IX86_BUILTIN_PSRAQ256_MASK:
33738 case IX86_BUILTIN_PSRAQ512:
33739 case IX86_BUILTIN_PSRAQI128_MASK:
33740 case IX86_BUILTIN_PSRAQI256_MASK:
33741 case IX86_BUILTIN_PSRAQI512:
33742 case IX86_BUILTIN_PSRAW:
33743 case IX86_BUILTIN_PSRAW128:
33744 case IX86_BUILTIN_PSRAW128_MASK:
33745 case IX86_BUILTIN_PSRAW256:
33746 case IX86_BUILTIN_PSRAW256_MASK:
33747 case IX86_BUILTIN_PSRAW512:
33748 case IX86_BUILTIN_PSRAWI:
33749 case IX86_BUILTIN_PSRAWI128:
33750 case IX86_BUILTIN_PSRAWI128_MASK:
33751 case IX86_BUILTIN_PSRAWI256:
33752 case IX86_BUILTIN_PSRAWI256_MASK:
33753 case IX86_BUILTIN_PSRAWI512:
33754 rcode = ASHIFTRT;
33755 is_vshift = false;
33756 goto do_shift;
33757 case IX86_BUILTIN_PSRLD:
33758 case IX86_BUILTIN_PSRLD128:
33759 case IX86_BUILTIN_PSRLD128_MASK:
33760 case IX86_BUILTIN_PSRLD256:
33761 case IX86_BUILTIN_PSRLD256_MASK:
33762 case IX86_BUILTIN_PSRLD512:
33763 case IX86_BUILTIN_PSRLDI:
33764 case IX86_BUILTIN_PSRLDI128:
33765 case IX86_BUILTIN_PSRLDI128_MASK:
33766 case IX86_BUILTIN_PSRLDI256:
33767 case IX86_BUILTIN_PSRLDI256_MASK:
33768 case IX86_BUILTIN_PSRLDI512:
33769 case IX86_BUILTIN_PSRLQ:
33770 case IX86_BUILTIN_PSRLQ128:
33771 case IX86_BUILTIN_PSRLQ128_MASK:
33772 case IX86_BUILTIN_PSRLQ256:
33773 case IX86_BUILTIN_PSRLQ256_MASK:
33774 case IX86_BUILTIN_PSRLQ512:
33775 case IX86_BUILTIN_PSRLQI:
33776 case IX86_BUILTIN_PSRLQI128:
33777 case IX86_BUILTIN_PSRLQI128_MASK:
33778 case IX86_BUILTIN_PSRLQI256:
33779 case IX86_BUILTIN_PSRLQI256_MASK:
33780 case IX86_BUILTIN_PSRLQI512:
33781 case IX86_BUILTIN_PSRLW:
33782 case IX86_BUILTIN_PSRLW128:
33783 case IX86_BUILTIN_PSRLW128_MASK:
33784 case IX86_BUILTIN_PSRLW256:
33785 case IX86_BUILTIN_PSRLW256_MASK:
33786 case IX86_BUILTIN_PSRLW512:
33787 case IX86_BUILTIN_PSRLWI:
33788 case IX86_BUILTIN_PSRLWI128:
33789 case IX86_BUILTIN_PSRLWI128_MASK:
33790 case IX86_BUILTIN_PSRLWI256:
33791 case IX86_BUILTIN_PSRLWI256_MASK:
33792 case IX86_BUILTIN_PSRLWI512:
33793 rcode = LSHIFTRT;
33794 is_vshift = false;
33795 goto do_shift;
33796 case IX86_BUILTIN_PSLLVV16HI:
33797 case IX86_BUILTIN_PSLLVV16SI:
33798 case IX86_BUILTIN_PSLLVV2DI:
33799 case IX86_BUILTIN_PSLLVV2DI_MASK:
33800 case IX86_BUILTIN_PSLLVV32HI:
33801 case IX86_BUILTIN_PSLLVV4DI:
33802 case IX86_BUILTIN_PSLLVV4DI_MASK:
33803 case IX86_BUILTIN_PSLLVV4SI:
33804 case IX86_BUILTIN_PSLLVV4SI_MASK:
33805 case IX86_BUILTIN_PSLLVV8DI:
33806 case IX86_BUILTIN_PSLLVV8HI:
33807 case IX86_BUILTIN_PSLLVV8SI:
33808 case IX86_BUILTIN_PSLLVV8SI_MASK:
33809 rcode = ASHIFT;
33810 is_vshift = true;
33811 goto do_shift;
33812 case IX86_BUILTIN_PSRAVQ128:
33813 case IX86_BUILTIN_PSRAVQ256:
33814 case IX86_BUILTIN_PSRAVV16HI:
33815 case IX86_BUILTIN_PSRAVV16SI:
33816 case IX86_BUILTIN_PSRAVV32HI:
33817 case IX86_BUILTIN_PSRAVV4SI:
33818 case IX86_BUILTIN_PSRAVV4SI_MASK:
33819 case IX86_BUILTIN_PSRAVV8DI:
33820 case IX86_BUILTIN_PSRAVV8HI:
33821 case IX86_BUILTIN_PSRAVV8SI:
33822 case IX86_BUILTIN_PSRAVV8SI_MASK:
33823 rcode = ASHIFTRT;
33824 is_vshift = true;
33825 goto do_shift;
33826 case IX86_BUILTIN_PSRLVV16HI:
33827 case IX86_BUILTIN_PSRLVV16SI:
33828 case IX86_BUILTIN_PSRLVV2DI:
33829 case IX86_BUILTIN_PSRLVV2DI_MASK:
33830 case IX86_BUILTIN_PSRLVV32HI:
33831 case IX86_BUILTIN_PSRLVV4DI:
33832 case IX86_BUILTIN_PSRLVV4DI_MASK:
33833 case IX86_BUILTIN_PSRLVV4SI:
33834 case IX86_BUILTIN_PSRLVV4SI_MASK:
33835 case IX86_BUILTIN_PSRLVV8DI:
33836 case IX86_BUILTIN_PSRLVV8HI:
33837 case IX86_BUILTIN_PSRLVV8SI:
33838 case IX86_BUILTIN_PSRLVV8SI_MASK:
33839 rcode = LSHIFTRT;
33840 is_vshift = true;
33841 goto do_shift;
33843 do_shift:
33844 gcc_assert (n_args >= 2);
33845 arg0 = gimple_call_arg (stmt, 0);
33846 arg1 = gimple_call_arg (stmt, 1);
33847 if (n_args > 2)
33849 /* This is a masked shift. Only optimize if the mask is all ones. */
33850 tree argl = gimple_call_arg (stmt, n_args - 1);
33851 if (!tree_fits_uhwi_p (argl))
33852 break;
33853 unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
33854 unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
33855 if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
33856 break;
33858 if (is_vshift)
33860 if (TREE_CODE (arg1) != VECTOR_CST)
33861 break;
33862 count = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0)));
33863 if (integer_zerop (arg1))
33864 count = 0;
33865 else if (rcode == ASHIFTRT)
33866 break;
33867 else
33868 for (unsigned int i = 0; i < VECTOR_CST_NELTS (arg1); ++i)
33870 tree elt = VECTOR_CST_ELT (arg1, i);
33871 if (!wi::neg_p (wi::to_wide (elt))
33872 && wi::to_widest (elt) < count)
33873 return false;
33876 else
33878 arg1 = ix86_vector_shift_count (arg1);
33879 if (!arg1)
33880 break;
33881 count = tree_to_uhwi (arg1);
33883 if (count == 0)
33885 /* Just return the first argument for shift by 0. */
33886 location_t loc = gimple_location (stmt);
33887 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
33888 gimple_set_location (g, loc);
33889 gsi_replace (gsi, g, false);
33890 return true;
33892 if (rcode != ASHIFTRT
33893 && count >= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0))))
33895 /* For shift counts equal to or greater than the precision, the
33896 result is zero, except for arithmetic right shifts. */
33897 location_t loc = gimple_location (stmt);
33898 gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
33899 build_zero_cst (TREE_TYPE (arg0)));
33900 gimple_set_location (g, loc);
33901 gsi_replace (gsi, g, false);
33902 return true;
33904 break;
33906 default:
33907 break;
33910 return false;
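/* Example of the tzcnt/lzcnt folding above (a sketch): given

     unsigned x = ...;
     if (x != 0)
       n = __builtin_ia32_tzcnt_u32 (x);

   the range check proves x is non-zero, so the call is replaced in
   GIMPLE by the generic __builtin_ctz (x) plus a conversion to the
   original result type, which the middle-end optimizes better.  */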
33913 /* Make builtins to detect cpu type and features supported. NAME is
33914 the builtin name, CODE is the builtin code, and FTYPE is the function
33915 type of the builtin. */
33917 static void
33918 make_cpu_type_builtin (const char* name, int code,
33919 enum ix86_builtin_func_type ftype, bool is_const)
33921 tree decl;
33922 tree type;
33924 type = ix86_get_builtin_func_type (ftype);
33925 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
33926 NULL, NULL_TREE);
33927 gcc_assert (decl != NULL_TREE);
33928 ix86_builtins[(int) code] = decl;
33929 TREE_READONLY (decl) = is_const;
33932 /* Make builtins to get CPU type and features supported. The created
33933 builtins are:
33935 __builtin_cpu_init (), to detect cpu type and features,
33936 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
33937 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
33940 static void
33941 ix86_init_platform_type_builtins (void)
33943 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
33944 INT_FTYPE_VOID, false);
33945 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
33946 INT_FTYPE_PCCHAR, true);
33947 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
33948 INT_FTYPE_PCCHAR, true);
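/* Source-level usage of the builtins created above (illustrative):

     int
     pick_impl (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("haswell"))
	 return 1;
       if (__builtin_cpu_supports ("avx2"))
	 return 2;
       return 0;
     }

   __builtin_cpu_is and __builtin_cpu_supports are registered as const,
   so repeated tests can be CSEd; __builtin_cpu_init is not.  */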
33951 /* Internal method for ix86_init_builtins. */
33953 static void
33954 ix86_init_builtins_va_builtins_abi (void)
33956 tree ms_va_ref, sysv_va_ref;
33957 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
33958 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
33959 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
33960 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
33962 if (!TARGET_64BIT)
33963 return;
33964 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
33965 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
33966 ms_va_ref = build_reference_type (ms_va_list_type_node);
33967 sysv_va_ref =
33968 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
33970 fnvoid_va_end_ms =
33971 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33972 fnvoid_va_start_ms =
33973 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
33974 fnvoid_va_end_sysv =
33975 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
33976 fnvoid_va_start_sysv =
33977 build_varargs_function_type_list (void_type_node, sysv_va_ref,
33978 NULL_TREE);
33979 fnvoid_va_copy_ms =
33980 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
33981 NULL_TREE);
33982 fnvoid_va_copy_sysv =
33983 build_function_type_list (void_type_node, sysv_va_ref,
33984 sysv_va_ref, NULL_TREE);
33986 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
33987 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
33988 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
33989 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
33990 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
33991 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
33992 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
33993 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33994 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
33995 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
33996 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
33997 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
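/* A minimal sketch of using the ms_abi variants created above inside
   an ms_abi function (the function name is hypothetical):

     void __attribute__ ((ms_abi))
     ms_vararg (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       ...
       __builtin_ms_va_end (ap);
     }  */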
34000 static void
34001 ix86_init_builtin_types (void)
34003 tree float80_type_node, const_string_type_node;
34005 /* The __float80 type. */
34006 float80_type_node = long_double_type_node;
34007 if (TYPE_MODE (float80_type_node) != XFmode)
34009 if (float64x_type_node != NULL_TREE
34010 && TYPE_MODE (float64x_type_node) == XFmode)
34011 float80_type_node = float64x_type_node;
34012 else
34014 /* The __float80 type. */
34015 float80_type_node = make_node (REAL_TYPE);
34017 TYPE_PRECISION (float80_type_node) = 80;
34018 layout_type (float80_type_node);
34021 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
34023 /* The __float128 type. The node has already been created as
34024 _Float128, so we only need to register the __float128 name for
34025 it. */
34026 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
34028 const_string_type_node
34029 = build_pointer_type (build_qualified_type
34030 (char_type_node, TYPE_QUAL_CONST));
34032 /* This macro is built by i386-builtin-types.awk. */
34033 DEFINE_BUILTIN_PRIMITIVE_TYPES;
34036 static void
34037 ix86_init_builtins (void)
34039 tree ftype, decl;
34041 ix86_init_builtin_types ();
34043 /* Builtins to get CPU type and features. */
34044 ix86_init_platform_type_builtins ();
34046 /* TFmode support builtins. */
34047 def_builtin_const (0, "__builtin_infq",
34048 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
34049 def_builtin_const (0, "__builtin_huge_valq",
34050 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
34052 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
34053 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
34054 BUILT_IN_MD, "nanq", NULL_TREE);
34055 TREE_READONLY (decl) = 1;
34056 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
34058 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
34059 BUILT_IN_MD, "nansq", NULL_TREE);
34060 TREE_READONLY (decl) = 1;
34061 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
34063 /* We will expand them to normal call if SSE isn't available since
34064 they are used by libgcc. */
34065 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
34066 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
34067 BUILT_IN_MD, "__fabstf2", NULL_TREE);
34068 TREE_READONLY (decl) = 1;
34069 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
34071 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
34072 decl = add_builtin_function ("__builtin_copysignq", ftype,
34073 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
34074 "__copysigntf3", NULL_TREE);
34075 TREE_READONLY (decl) = 1;
34076 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
34078 ix86_init_tm_builtins ();
34079 ix86_init_mmx_sse_builtins ();
34081 if (TARGET_LP64)
34082 ix86_init_builtins_va_builtins_abi ();
34084 #ifdef SUBTARGET_INIT_BUILTINS
34085 SUBTARGET_INIT_BUILTINS;
34086 #endif
34089 /* Return the ix86 builtin for CODE. */
34091 static tree
34092 ix86_builtin_decl (unsigned code, bool)
34094 if (code >= IX86_BUILTIN_MAX)
34095 return error_mark_node;
34097 return ix86_builtins[code];
34100 /* Errors in the source file can cause expand_expr to return const0_rtx
34101 where we expect a vector. To avoid crashing, use one of the vector
34102 clear instructions. */
34103 static rtx
34104 safe_vector_operand (rtx x, machine_mode mode)
34106 if (x == const0_rtx)
34107 x = CONST0_RTX (mode);
34108 return x;
34111 /* Fix up modeless constants to fit the required mode. */
34112 static rtx
34113 fixup_modeless_constant (rtx x, machine_mode mode)
34115 if (GET_MODE (x) == VOIDmode)
34116 x = convert_to_mode (mode, x, 1);
34117 return x;
34120 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
34122 static rtx
34123 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
34125 rtx pat;
34126 tree arg0 = CALL_EXPR_ARG (exp, 0);
34127 tree arg1 = CALL_EXPR_ARG (exp, 1);
34128 rtx op0 = expand_normal (arg0);
34129 rtx op1 = expand_normal (arg1);
34130 machine_mode tmode = insn_data[icode].operand[0].mode;
34131 machine_mode mode0 = insn_data[icode].operand[1].mode;
34132 machine_mode mode1 = insn_data[icode].operand[2].mode;
34134 if (VECTOR_MODE_P (mode0))
34135 op0 = safe_vector_operand (op0, mode0);
34136 if (VECTOR_MODE_P (mode1))
34137 op1 = safe_vector_operand (op1, mode1);
34139 if (optimize || !target
34140 || GET_MODE (target) != tmode
34141 || !insn_data[icode].operand[0].predicate (target, tmode))
34142 target = gen_reg_rtx (tmode);
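  /* If the operand is SImode but the insn wants TImode, load it into a
     V4SImode register and use its TImode lowpart.  */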
34144 if (GET_MODE (op1) == SImode && mode1 == TImode)
34146 rtx x = gen_reg_rtx (V4SImode);
34147 emit_insn (gen_sse2_loadd (x, op1));
34148 op1 = gen_lowpart (TImode, x);
34151 if (!insn_data[icode].operand[1].predicate (op0, mode0))
34152 op0 = copy_to_mode_reg (mode0, op0);
34153 if (!insn_data[icode].operand[2].predicate (op1, mode1))
34154 op1 = copy_to_mode_reg (mode1, op1);
34156 pat = GEN_FCN (icode) (target, op0, op1);
34157 if (! pat)
34158 return 0;
34160 emit_insn (pat);
34162 return target;
34165 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
34167 static rtx
34168 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
34169 enum ix86_builtin_func_type m_type,
34170 enum rtx_code sub_code)
34172 rtx pat;
34173 int i;
34174 int nargs;
34175 bool comparison_p = false;
34176 bool tf_p = false;
34177 bool last_arg_constant = false;
34178 int num_memory = 0;
34179 struct {
34180 rtx op;
34181 machine_mode mode;
34182 } args[4];
34184 machine_mode tmode = insn_data[icode].operand[0].mode;
34186 switch (m_type)
34188 case MULTI_ARG_4_DF2_DI_I:
34189 case MULTI_ARG_4_DF2_DI_I1:
34190 case MULTI_ARG_4_SF2_SI_I:
34191 case MULTI_ARG_4_SF2_SI_I1:
34192 nargs = 4;
34193 last_arg_constant = true;
34194 break;
34196 case MULTI_ARG_3_SF:
34197 case MULTI_ARG_3_DF:
34198 case MULTI_ARG_3_SF2:
34199 case MULTI_ARG_3_DF2:
34200 case MULTI_ARG_3_DI:
34201 case MULTI_ARG_3_SI:
34202 case MULTI_ARG_3_SI_DI:
34203 case MULTI_ARG_3_HI:
34204 case MULTI_ARG_3_HI_SI:
34205 case MULTI_ARG_3_QI:
34206 case MULTI_ARG_3_DI2:
34207 case MULTI_ARG_3_SI2:
34208 case MULTI_ARG_3_HI2:
34209 case MULTI_ARG_3_QI2:
34210 nargs = 3;
34211 break;
34213 case MULTI_ARG_2_SF:
34214 case MULTI_ARG_2_DF:
34215 case MULTI_ARG_2_DI:
34216 case MULTI_ARG_2_SI:
34217 case MULTI_ARG_2_HI:
34218 case MULTI_ARG_2_QI:
34219 nargs = 2;
34220 break;
34222 case MULTI_ARG_2_DI_IMM:
34223 case MULTI_ARG_2_SI_IMM:
34224 case MULTI_ARG_2_HI_IMM:
34225 case MULTI_ARG_2_QI_IMM:
34226 nargs = 2;
34227 last_arg_constant = true;
34228 break;
34230 case MULTI_ARG_1_SF:
34231 case MULTI_ARG_1_DF:
34232 case MULTI_ARG_1_SF2:
34233 case MULTI_ARG_1_DF2:
34234 case MULTI_ARG_1_DI:
34235 case MULTI_ARG_1_SI:
34236 case MULTI_ARG_1_HI:
34237 case MULTI_ARG_1_QI:
34238 case MULTI_ARG_1_SI_DI:
34239 case MULTI_ARG_1_HI_DI:
34240 case MULTI_ARG_1_HI_SI:
34241 case MULTI_ARG_1_QI_DI:
34242 case MULTI_ARG_1_QI_SI:
34243 case MULTI_ARG_1_QI_HI:
34244 nargs = 1;
34245 break;
34247 case MULTI_ARG_2_DI_CMP:
34248 case MULTI_ARG_2_SI_CMP:
34249 case MULTI_ARG_2_HI_CMP:
34250 case MULTI_ARG_2_QI_CMP:
34251 nargs = 2;
34252 comparison_p = true;
34253 break;
34255 case MULTI_ARG_2_SF_TF:
34256 case MULTI_ARG_2_DF_TF:
34257 case MULTI_ARG_2_DI_TF:
34258 case MULTI_ARG_2_SI_TF:
34259 case MULTI_ARG_2_HI_TF:
34260 case MULTI_ARG_2_QI_TF:
34261 nargs = 2;
34262 tf_p = true;
34263 break;
34265 default:
34266 gcc_unreachable ();
34269 if (optimize || !target
34270 || GET_MODE (target) != tmode
34271 || !insn_data[icode].operand[0].predicate (target, tmode))
34272 target = gen_reg_rtx (tmode);
34273 else if (memory_operand (target, tmode))
34274 num_memory++;
34276 gcc_assert (nargs <= 4);
34278 for (i = 0; i < nargs; i++)
34280 tree arg = CALL_EXPR_ARG (exp, i);
34281 rtx op = expand_normal (arg);
34282 int adjust = (comparison_p) ? 1 : 0;
34283 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
34285 if (last_arg_constant && i == nargs - 1)
34287 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
34289 enum insn_code new_icode = icode;
34290 switch (icode)
34292 case CODE_FOR_xop_vpermil2v2df3:
34293 case CODE_FOR_xop_vpermil2v4sf3:
34294 case CODE_FOR_xop_vpermil2v4df3:
34295 case CODE_FOR_xop_vpermil2v8sf3:
34296 error ("the last argument must be a 2-bit immediate");
34297 return gen_reg_rtx (tmode);
34298 case CODE_FOR_xop_rotlv2di3:
34299 new_icode = CODE_FOR_rotlv2di3;
34300 goto xop_rotl;
34301 case CODE_FOR_xop_rotlv4si3:
34302 new_icode = CODE_FOR_rotlv4si3;
34303 goto xop_rotl;
34304 case CODE_FOR_xop_rotlv8hi3:
34305 new_icode = CODE_FOR_rotlv8hi3;
34306 goto xop_rotl;
34307 case CODE_FOR_xop_rotlv16qi3:
34308 new_icode = CODE_FOR_rotlv16qi3;
34309 xop_rotl:
34310 if (CONST_INT_P (op))
34312 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
34313 op = GEN_INT (INTVAL (op) & mask);
34314 gcc_checking_assert
34315 (insn_data[icode].operand[i + 1].predicate (op, mode));
34317 else
34319 gcc_checking_assert
34320 (nargs == 2
34321 && insn_data[new_icode].operand[0].mode == tmode
34322 && insn_data[new_icode].operand[1].mode == tmode
34323 && insn_data[new_icode].operand[2].mode == mode
34324 && insn_data[new_icode].operand[0].predicate
34325 == insn_data[icode].operand[0].predicate
34326 && insn_data[new_icode].operand[1].predicate
34327 == insn_data[icode].operand[1].predicate);
34328 icode = new_icode;
34329 goto non_constant;
34331 break;
34332 default:
34333 gcc_unreachable ();
34337 else
34339 non_constant:
34340 if (VECTOR_MODE_P (mode))
34341 op = safe_vector_operand (op, mode);
34343 /* If we aren't optimizing, only allow one memory operand to be
34344 generated. */
34345 if (memory_operand (op, mode))
34346 num_memory++;
34348 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
34350 if (optimize
34351 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
34352 || num_memory > 1)
34353 op = force_reg (mode, op);
34356 args[i].op = op;
34357 args[i].mode = mode;
34360 switch (nargs)
34362 case 1:
34363 pat = GEN_FCN (icode) (target, args[0].op);
34364 break;
34366 case 2:
34367 if (tf_p)
34368 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34369 GEN_INT ((int)sub_code));
34370 else if (! comparison_p)
34371 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34372 else
34374 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
34375 args[0].op,
34376 args[1].op);
34378 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
34380 break;
34382 case 3:
34383 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
34384 break;
34386 case 4:
34387 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
34388 break;
34390 default:
34391 gcc_unreachable ();
34394 if (! pat)
34395 return 0;
34397 emit_insn (pat);
34398 return target;
34401 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
34402 insns with vec_merge. */
34404 static rtx
34405 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
34406 rtx target)
34408 rtx pat;
34409 tree arg0 = CALL_EXPR_ARG (exp, 0);
34410 rtx op1, op0 = expand_normal (arg0);
34411 machine_mode tmode = insn_data[icode].operand[0].mode;
34412 machine_mode mode0 = insn_data[icode].operand[1].mode;
34414 if (optimize || !target
34415 || GET_MODE (target) != tmode
34416 || !insn_data[icode].operand[0].predicate (target, tmode))
34417 target = gen_reg_rtx (tmode);
34419 if (VECTOR_MODE_P (mode0))
34420 op0 = safe_vector_operand (op0, mode0);
34422 if ((optimize && !register_operand (op0, mode0))
34423 || !insn_data[icode].operand[1].predicate (op0, mode0))
34424 op0 = copy_to_mode_reg (mode0, op0);
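  /* The vec_merge pattern uses the same value both as the operand of the
     scalar operation and as the vector supplying the untouched upper
     elements.  */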
34426 op1 = op0;
34427 if (!insn_data[icode].operand[2].predicate (op1, mode0))
34428 op1 = copy_to_mode_reg (mode0, op1);
34430 pat = GEN_FCN (icode) (target, op0, op1);
34431 if (! pat)
34432 return 0;
34433 emit_insn (pat);
34434 return target;
34437 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
34439 static rtx
34440 ix86_expand_sse_compare (const struct builtin_description *d,
34441 tree exp, rtx target, bool swap)
34443 rtx pat;
34444 tree arg0 = CALL_EXPR_ARG (exp, 0);
34445 tree arg1 = CALL_EXPR_ARG (exp, 1);
34446 rtx op0 = expand_normal (arg0);
34447 rtx op1 = expand_normal (arg1);
34448 rtx op2;
34449 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34450 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34451 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34452 enum rtx_code comparison = d->comparison;
34454 if (VECTOR_MODE_P (mode0))
34455 op0 = safe_vector_operand (op0, mode0);
34456 if (VECTOR_MODE_P (mode1))
34457 op1 = safe_vector_operand (op1, mode1);
34459 /* Swap operands if we have a comparison that isn't available in
34460 hardware. */
34461 if (swap)
34462 std::swap (op0, op1);
34464 if (optimize || !target
34465 || GET_MODE (target) != tmode
34466 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34467 target = gen_reg_rtx (tmode);
34469 if ((optimize && !register_operand (op0, mode0))
34470 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
34471 op0 = copy_to_mode_reg (mode0, op0);
34472 if ((optimize && !register_operand (op1, mode1))
34473 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
34474 op1 = copy_to_mode_reg (mode1, op1);
34476 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
34477 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34478 if (! pat)
34479 return 0;
34480 emit_insn (pat);
34481 return target;
34484 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
34486 static rtx
34487 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
34488 rtx target)
34490 rtx pat;
34491 tree arg0 = CALL_EXPR_ARG (exp, 0);
34492 tree arg1 = CALL_EXPR_ARG (exp, 1);
34493 rtx op0 = expand_normal (arg0);
34494 rtx op1 = expand_normal (arg1);
34495 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34496 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34497 enum rtx_code comparison = d->comparison;
34499 if (VECTOR_MODE_P (mode0))
34500 op0 = safe_vector_operand (op0, mode0);
34501 if (VECTOR_MODE_P (mode1))
34502 op1 = safe_vector_operand (op1, mode1);
34504 /* Swap operands if we have a comparison that isn't available in
34505 hardware. */
34506 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
34507 std::swap (op0, op1);
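  /* Materialize the comparison result as a 0/1 value: zero an SImode
     pseudo, set only its low QImode part from the flags comparison, and
     return the containing SImode register.  */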
34509 target = gen_reg_rtx (SImode);
34510 emit_move_insn (target, const0_rtx);
34511 target = gen_rtx_SUBREG (QImode, target, 0);
34513 if ((optimize && !register_operand (op0, mode0))
34514 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34515 op0 = copy_to_mode_reg (mode0, op0);
34516 if ((optimize && !register_operand (op1, mode1))
34517 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34518 op1 = copy_to_mode_reg (mode1, op1);
34520 pat = GEN_FCN (d->icode) (op0, op1);
34521 if (! pat)
34522 return 0;
34523 emit_insn (pat);
34524 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34525 gen_rtx_fmt_ee (comparison, QImode,
34526 SET_DEST (pat),
34527 const0_rtx)));
34529 return SUBREG_REG (target);
34532 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
34534 static rtx
34535 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
34536 rtx target)
34538 rtx pat;
34539 tree arg0 = CALL_EXPR_ARG (exp, 0);
34540 rtx op1, op0 = expand_normal (arg0);
34541 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34542 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34544 if (optimize || target == 0
34545 || GET_MODE (target) != tmode
34546 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34547 target = gen_reg_rtx (tmode);
34549 if (VECTOR_MODE_P (mode0))
34550 op0 = safe_vector_operand (op0, mode0);
34552 if ((optimize && !register_operand (op0, mode0))
34553 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34554 op0 = copy_to_mode_reg (mode0, op0);
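  /* For the round builtins, the rounding immediate is carried in the
     comparison field of the builtin description.  */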
34556 op1 = GEN_INT (d->comparison);
34558 pat = GEN_FCN (d->icode) (target, op0, op1);
34559 if (! pat)
34560 return 0;
34561 emit_insn (pat);
34562 return target;
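/* Like ix86_expand_sse_round, but for the vec_pack_sfix variants that
   combine two floating-point vector operands into one integer vector
   result.  */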
34565 static rtx
34566 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
34567 tree exp, rtx target)
34569 rtx pat;
34570 tree arg0 = CALL_EXPR_ARG (exp, 0);
34571 tree arg1 = CALL_EXPR_ARG (exp, 1);
34572 rtx op0 = expand_normal (arg0);
34573 rtx op1 = expand_normal (arg1);
34574 rtx op2;
34575 machine_mode tmode = insn_data[d->icode].operand[0].mode;
34576 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
34577 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
34579 if (optimize || target == 0
34580 || GET_MODE (target) != tmode
34581 || !insn_data[d->icode].operand[0].predicate (target, tmode))
34582 target = gen_reg_rtx (tmode);
34584 op0 = safe_vector_operand (op0, mode0);
34585 op1 = safe_vector_operand (op1, mode1);
34587 if ((optimize && !register_operand (op0, mode0))
34588 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34589 op0 = copy_to_mode_reg (mode0, op0);
34590 if ((optimize && !register_operand (op1, mode1))
34591 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34592 op1 = copy_to_mode_reg (mode1, op1);
34594 op2 = GEN_INT (d->comparison);
34596 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
34597 if (! pat)
34598 return 0;
34599 emit_insn (pat);
34600 return target;
34603 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
34605 static rtx
34606 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
34607 rtx target)
34609 rtx pat;
34610 tree arg0 = CALL_EXPR_ARG (exp, 0);
34611 tree arg1 = CALL_EXPR_ARG (exp, 1);
34612 rtx op0 = expand_normal (arg0);
34613 rtx op1 = expand_normal (arg1);
34614 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
34615 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
34616 enum rtx_code comparison = d->comparison;
34618 if (VECTOR_MODE_P (mode0))
34619 op0 = safe_vector_operand (op0, mode0);
34620 if (VECTOR_MODE_P (mode1))
34621 op1 = safe_vector_operand (op1, mode1);
34623 target = gen_reg_rtx (SImode);
34624 emit_move_insn (target, const0_rtx);
34625 target = gen_rtx_SUBREG (QImode, target, 0);
34627 if ((optimize && !register_operand (op0, mode0))
34628 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
34629 op0 = copy_to_mode_reg (mode0, op0);
34630 if ((optimize && !register_operand (op1, mode1))
34631 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
34632 op1 = copy_to_mode_reg (mode1, op1);
34634 pat = GEN_FCN (d->icode) (op0, op1);
34635 if (! pat)
34636 return 0;
34637 emit_insn (pat);
34638 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34639 gen_rtx_fmt_ee (comparison, QImode,
34640 SET_DEST (pat),
34641 const0_rtx)));
34643 return SUBREG_REG (target);
34646 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
34648 static rtx
34649 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
34650 tree exp, rtx target)
34652 rtx pat;
34653 tree arg0 = CALL_EXPR_ARG (exp, 0);
34654 tree arg1 = CALL_EXPR_ARG (exp, 1);
34655 tree arg2 = CALL_EXPR_ARG (exp, 2);
34656 tree arg3 = CALL_EXPR_ARG (exp, 3);
34657 tree arg4 = CALL_EXPR_ARG (exp, 4);
34658 rtx scratch0, scratch1;
34659 rtx op0 = expand_normal (arg0);
34660 rtx op1 = expand_normal (arg1);
34661 rtx op2 = expand_normal (arg2);
34662 rtx op3 = expand_normal (arg3);
34663 rtx op4 = expand_normal (arg4);
34664 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
34666 tmode0 = insn_data[d->icode].operand[0].mode;
34667 tmode1 = insn_data[d->icode].operand[1].mode;
34668 modev2 = insn_data[d->icode].operand[2].mode;
34669 modei3 = insn_data[d->icode].operand[3].mode;
34670 modev4 = insn_data[d->icode].operand[4].mode;
34671 modei5 = insn_data[d->icode].operand[5].mode;
34672 modeimm = insn_data[d->icode].operand[6].mode;
34674 if (VECTOR_MODE_P (modev2))
34675 op0 = safe_vector_operand (op0, modev2);
34676 if (VECTOR_MODE_P (modev4))
34677 op2 = safe_vector_operand (op2, modev4);
34679 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34680 op0 = copy_to_mode_reg (modev2, op0);
34681 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
34682 op1 = copy_to_mode_reg (modei3, op1);
34683 if ((optimize && !register_operand (op2, modev4))
34684 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
34685 op2 = copy_to_mode_reg (modev4, op2);
34686 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
34687 op3 = copy_to_mode_reg (modei5, op3);
34689 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
34691 error ("the fifth argument must be an 8-bit immediate");
34692 return const0_rtx;
34695 if (d->code == IX86_BUILTIN_PCMPESTRI128)
34697 if (optimize || !target
34698 || GET_MODE (target) != tmode0
34699 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34700 target = gen_reg_rtx (tmode0);
34702 scratch1 = gen_reg_rtx (tmode1);
34704 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
34706 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
34708 if (optimize || !target
34709 || GET_MODE (target) != tmode1
34710 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34711 target = gen_reg_rtx (tmode1);
34713 scratch0 = gen_reg_rtx (tmode0);
34715 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
34717 else
34719 gcc_assert (d->flag);
34721 scratch0 = gen_reg_rtx (tmode0);
34722 scratch1 = gen_reg_rtx (tmode1);
34724 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
34727 if (! pat)
34728 return 0;
34730 emit_insn (pat);
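  /* For the flag-returning pcmpestr variants the interesting result is a
     single condition-code bit; d->flag holds the CC mode to test, and the
     bit is turned into a 0/1 value below.  */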
34732 if (d->flag)
34734 target = gen_reg_rtx (SImode);
34735 emit_move_insn (target, const0_rtx);
34736 target = gen_rtx_SUBREG (QImode, target, 0);
34738 emit_insn
34739 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34740 gen_rtx_fmt_ee (EQ, QImode,
34741 gen_rtx_REG ((machine_mode) d->flag,
34742 FLAGS_REG),
34743 const0_rtx)));
34744 return SUBREG_REG (target);
34746 else
34747 return target;
34751 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
34753 static rtx
34754 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
34755 tree exp, rtx target)
34757 rtx pat;
34758 tree arg0 = CALL_EXPR_ARG (exp, 0);
34759 tree arg1 = CALL_EXPR_ARG (exp, 1);
34760 tree arg2 = CALL_EXPR_ARG (exp, 2);
34761 rtx scratch0, scratch1;
34762 rtx op0 = expand_normal (arg0);
34763 rtx op1 = expand_normal (arg1);
34764 rtx op2 = expand_normal (arg2);
34765 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
34767 tmode0 = insn_data[d->icode].operand[0].mode;
34768 tmode1 = insn_data[d->icode].operand[1].mode;
34769 modev2 = insn_data[d->icode].operand[2].mode;
34770 modev3 = insn_data[d->icode].operand[3].mode;
34771 modeimm = insn_data[d->icode].operand[4].mode;
34773 if (VECTOR_MODE_P (modev2))
34774 op0 = safe_vector_operand (op0, modev2);
34775 if (VECTOR_MODE_P (modev3))
34776 op1 = safe_vector_operand (op1, modev3);
34778 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
34779 op0 = copy_to_mode_reg (modev2, op0);
34780 if ((optimize && !register_operand (op1, modev3))
34781 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
34782 op1 = copy_to_mode_reg (modev3, op1);
34784 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
34786 error ("the third argument must be an 8-bit immediate");
34787 return const0_rtx;
34790 if (d->code == IX86_BUILTIN_PCMPISTRI128)
34792 if (optimize || !target
34793 || GET_MODE (target) != tmode0
34794 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
34795 target = gen_reg_rtx (tmode0);
34797 scratch1 = gen_reg_rtx (tmode1);
34799 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
34801 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
34803 if (optimize || !target
34804 || GET_MODE (target) != tmode1
34805 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
34806 target = gen_reg_rtx (tmode1);
34808 scratch0 = gen_reg_rtx (tmode0);
34810 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
34812 else
34814 gcc_assert (d->flag);
34816 scratch0 = gen_reg_rtx (tmode0);
34817 scratch1 = gen_reg_rtx (tmode1);
34819 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
34822 if (! pat)
34823 return 0;
34825 emit_insn (pat);
34827 if (d->flag)
34829 target = gen_reg_rtx (SImode);
34830 emit_move_insn (target, const0_rtx);
34831 target = gen_rtx_SUBREG (QImode, target, 0);
34833 emit_insn
34834 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34835 gen_rtx_fmt_ee (EQ, QImode,
34836 gen_rtx_REG ((machine_mode) d->flag,
34837 FLAGS_REG),
34838 const0_rtx)));
34839 return SUBREG_REG (target);
34841 else
34842 return target;
34845 /* Subroutine of ix86_expand_builtin to take care of insns with
34846 variable number of operands. */
34848 static rtx
34849 ix86_expand_args_builtin (const struct builtin_description *d,
34850 tree exp, rtx target)
34852 rtx pat, real_target;
34853 unsigned int i, nargs;
34854 unsigned int nargs_constant = 0;
34855 unsigned int mask_pos = 0;
34856 int num_memory = 0;
34857 struct
34859 rtx op;
34860 machine_mode mode;
34861 } args[6];
34862 bool second_arg_count = false;
34863 enum insn_code icode = d->icode;
34864 const struct insn_data_d *insn_p = &insn_data[icode];
34865 machine_mode tmode = insn_p->operand[0].mode;
34866 machine_mode rmode = VOIDmode;
34867 bool swap = false;
34868 enum rtx_code comparison = d->comparison;
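  /* Classify the builtin signature: the number of arguments, how many
     trailing arguments must be immediates (nargs_constant), and how many
     merge-source/mask arguments follow those immediates (mask_pos).  */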
34870 switch ((enum ix86_builtin_func_type) d->flag)
34872 case V2DF_FTYPE_V2DF_ROUND:
34873 case V4DF_FTYPE_V4DF_ROUND:
34874 case V8DF_FTYPE_V8DF_ROUND:
34875 case V4SF_FTYPE_V4SF_ROUND:
34876 case V8SF_FTYPE_V8SF_ROUND:
34877 case V16SF_FTYPE_V16SF_ROUND:
34878 case V4SI_FTYPE_V4SF_ROUND:
34879 case V8SI_FTYPE_V8SF_ROUND:
34880 case V16SI_FTYPE_V16SF_ROUND:
34881 return ix86_expand_sse_round (d, exp, target);
34882 case V4SI_FTYPE_V2DF_V2DF_ROUND:
34883 case V8SI_FTYPE_V4DF_V4DF_ROUND:
34884 case V16SI_FTYPE_V8DF_V8DF_ROUND:
34885 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
34886 case INT_FTYPE_V8SF_V8SF_PTEST:
34887 case INT_FTYPE_V4DI_V4DI_PTEST:
34888 case INT_FTYPE_V4DF_V4DF_PTEST:
34889 case INT_FTYPE_V4SF_V4SF_PTEST:
34890 case INT_FTYPE_V2DI_V2DI_PTEST:
34891 case INT_FTYPE_V2DF_V2DF_PTEST:
34892 return ix86_expand_sse_ptest (d, exp, target);
34893 case FLOAT128_FTYPE_FLOAT128:
34894 case FLOAT_FTYPE_FLOAT:
34895 case INT_FTYPE_INT:
34896 case UINT_FTYPE_UINT:
34897 case UINT16_FTYPE_UINT16:
34898 case UINT64_FTYPE_INT:
34899 case UINT64_FTYPE_UINT64:
34900 case INT64_FTYPE_INT64:
34901 case INT64_FTYPE_V4SF:
34902 case INT64_FTYPE_V2DF:
34903 case INT_FTYPE_V16QI:
34904 case INT_FTYPE_V8QI:
34905 case INT_FTYPE_V8SF:
34906 case INT_FTYPE_V4DF:
34907 case INT_FTYPE_V4SF:
34908 case INT_FTYPE_V2DF:
34909 case INT_FTYPE_V32QI:
34910 case V16QI_FTYPE_V16QI:
34911 case V8SI_FTYPE_V8SF:
34912 case V8SI_FTYPE_V4SI:
34913 case V8HI_FTYPE_V8HI:
34914 case V8HI_FTYPE_V16QI:
34915 case V8QI_FTYPE_V8QI:
34916 case V8SF_FTYPE_V8SF:
34917 case V8SF_FTYPE_V8SI:
34918 case V8SF_FTYPE_V4SF:
34919 case V8SF_FTYPE_V8HI:
34920 case V4SI_FTYPE_V4SI:
34921 case V4SI_FTYPE_V16QI:
34922 case V4SI_FTYPE_V4SF:
34923 case V4SI_FTYPE_V8SI:
34924 case V4SI_FTYPE_V8HI:
34925 case V4SI_FTYPE_V4DF:
34926 case V4SI_FTYPE_V2DF:
34927 case V4HI_FTYPE_V4HI:
34928 case V4DF_FTYPE_V4DF:
34929 case V4DF_FTYPE_V4SI:
34930 case V4DF_FTYPE_V4SF:
34931 case V4DF_FTYPE_V2DF:
34932 case V4SF_FTYPE_V4SF:
34933 case V4SF_FTYPE_V4SI:
34934 case V4SF_FTYPE_V8SF:
34935 case V4SF_FTYPE_V4DF:
34936 case V4SF_FTYPE_V8HI:
34937 case V4SF_FTYPE_V2DF:
34938 case V2DI_FTYPE_V2DI:
34939 case V2DI_FTYPE_V16QI:
34940 case V2DI_FTYPE_V8HI:
34941 case V2DI_FTYPE_V4SI:
34942 case V2DF_FTYPE_V2DF:
34943 case V2DF_FTYPE_V4SI:
34944 case V2DF_FTYPE_V4DF:
34945 case V2DF_FTYPE_V4SF:
34946 case V2DF_FTYPE_V2SI:
34947 case V2SI_FTYPE_V2SI:
34948 case V2SI_FTYPE_V4SF:
34949 case V2SI_FTYPE_V2SF:
34950 case V2SI_FTYPE_V2DF:
34951 case V2SF_FTYPE_V2SF:
34952 case V2SF_FTYPE_V2SI:
34953 case V32QI_FTYPE_V32QI:
34954 case V32QI_FTYPE_V16QI:
34955 case V16HI_FTYPE_V16HI:
34956 case V16HI_FTYPE_V8HI:
34957 case V8SI_FTYPE_V8SI:
34958 case V16HI_FTYPE_V16QI:
34959 case V8SI_FTYPE_V16QI:
34960 case V4DI_FTYPE_V16QI:
34961 case V8SI_FTYPE_V8HI:
34962 case V4DI_FTYPE_V8HI:
34963 case V4DI_FTYPE_V4SI:
34964 case V4DI_FTYPE_V2DI:
34965 case UQI_FTYPE_UQI:
34966 case UHI_FTYPE_UHI:
34967 case USI_FTYPE_USI:
34968 case USI_FTYPE_UQI:
34969 case USI_FTYPE_UHI:
34970 case UDI_FTYPE_UDI:
34971 case UHI_FTYPE_V16QI:
34972 case USI_FTYPE_V32QI:
34973 case UDI_FTYPE_V64QI:
34974 case V16QI_FTYPE_UHI:
34975 case V32QI_FTYPE_USI:
34976 case V64QI_FTYPE_UDI:
34977 case V8HI_FTYPE_UQI:
34978 case V16HI_FTYPE_UHI:
34979 case V32HI_FTYPE_USI:
34980 case V4SI_FTYPE_UQI:
34981 case V8SI_FTYPE_UQI:
34982 case V4SI_FTYPE_UHI:
34983 case V8SI_FTYPE_UHI:
34984 case UQI_FTYPE_V8HI:
34985 case UHI_FTYPE_V16HI:
34986 case USI_FTYPE_V32HI:
34987 case UQI_FTYPE_V4SI:
34988 case UQI_FTYPE_V8SI:
34989 case UHI_FTYPE_V16SI:
34990 case UQI_FTYPE_V2DI:
34991 case UQI_FTYPE_V4DI:
34992 case UQI_FTYPE_V8DI:
34993 case V16SI_FTYPE_UHI:
34994 case V2DI_FTYPE_UQI:
34995 case V4DI_FTYPE_UQI:
34996 case V16SI_FTYPE_INT:
34997 case V16SF_FTYPE_V8SF:
34998 case V16SI_FTYPE_V8SI:
34999 case V16SF_FTYPE_V4SF:
35000 case V16SI_FTYPE_V4SI:
35001 case V16SI_FTYPE_V16SF:
35002 case V16SI_FTYPE_V16SI:
35003 case V64QI_FTYPE_V64QI:
35004 case V32HI_FTYPE_V32HI:
35005 case V16SF_FTYPE_V16SF:
35006 case V8DI_FTYPE_UQI:
35007 case V8DI_FTYPE_V8DI:
35008 case V8DF_FTYPE_V4DF:
35009 case V8DF_FTYPE_V2DF:
35010 case V8DF_FTYPE_V8DF:
35011 case V4DI_FTYPE_V4DI:
35012 nargs = 1;
35013 break;
35014 case V4SF_FTYPE_V4SF_VEC_MERGE:
35015 case V2DF_FTYPE_V2DF_VEC_MERGE:
35016 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
35017 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
35018 case V16QI_FTYPE_V16QI_V16QI:
35019 case V16QI_FTYPE_V8HI_V8HI:
35020 case V16SF_FTYPE_V16SF_V16SF:
35021 case V8QI_FTYPE_V8QI_V8QI:
35022 case V8QI_FTYPE_V4HI_V4HI:
35023 case V8HI_FTYPE_V8HI_V8HI:
35024 case V8HI_FTYPE_V16QI_V16QI:
35025 case V8HI_FTYPE_V4SI_V4SI:
35026 case V8SF_FTYPE_V8SF_V8SF:
35027 case V8SF_FTYPE_V8SF_V8SI:
35028 case V8DF_FTYPE_V8DF_V8DF:
35029 case V4SI_FTYPE_V4SI_V4SI:
35030 case V4SI_FTYPE_V8HI_V8HI:
35031 case V4SI_FTYPE_V2DF_V2DF:
35032 case V4HI_FTYPE_V4HI_V4HI:
35033 case V4HI_FTYPE_V8QI_V8QI:
35034 case V4HI_FTYPE_V2SI_V2SI:
35035 case V4DF_FTYPE_V4DF_V4DF:
35036 case V4DF_FTYPE_V4DF_V4DI:
35037 case V4SF_FTYPE_V4SF_V4SF:
35038 case V4SF_FTYPE_V4SF_V4SI:
35039 case V4SF_FTYPE_V4SF_V2SI:
35040 case V4SF_FTYPE_V4SF_V2DF:
35041 case V4SF_FTYPE_V4SF_UINT:
35042 case V4SF_FTYPE_V4SF_DI:
35043 case V4SF_FTYPE_V4SF_SI:
35044 case V2DI_FTYPE_V2DI_V2DI:
35045 case V2DI_FTYPE_V16QI_V16QI:
35046 case V2DI_FTYPE_V4SI_V4SI:
35047 case V2DI_FTYPE_V2DI_V16QI:
35048 case V2SI_FTYPE_V2SI_V2SI:
35049 case V2SI_FTYPE_V4HI_V4HI:
35050 case V2SI_FTYPE_V2SF_V2SF:
35051 case V2DF_FTYPE_V2DF_V2DF:
35052 case V2DF_FTYPE_V2DF_V4SF:
35053 case V2DF_FTYPE_V2DF_V2DI:
35054 case V2DF_FTYPE_V2DF_DI:
35055 case V2DF_FTYPE_V2DF_SI:
35056 case V2DF_FTYPE_V2DF_UINT:
35057 case V2SF_FTYPE_V2SF_V2SF:
35058 case V1DI_FTYPE_V1DI_V1DI:
35059 case V1DI_FTYPE_V8QI_V8QI:
35060 case V1DI_FTYPE_V2SI_V2SI:
35061 case V32QI_FTYPE_V16HI_V16HI:
35062 case V16HI_FTYPE_V8SI_V8SI:
35063 case V64QI_FTYPE_V64QI_V64QI:
35064 case V32QI_FTYPE_V32QI_V32QI:
35065 case V16HI_FTYPE_V32QI_V32QI:
35066 case V16HI_FTYPE_V16HI_V16HI:
35067 case V8SI_FTYPE_V4DF_V4DF:
35068 case V8SI_FTYPE_V8SI_V8SI:
35069 case V8SI_FTYPE_V16HI_V16HI:
35070 case V4DI_FTYPE_V4DI_V4DI:
35071 case V4DI_FTYPE_V8SI_V8SI:
35072 case V8DI_FTYPE_V64QI_V64QI:
35073 if (comparison == UNKNOWN)
35074 return ix86_expand_binop_builtin (icode, exp, target);
35075 nargs = 2;
35076 break;
35077 case V4SF_FTYPE_V4SF_V4SF_SWAP:
35078 case V2DF_FTYPE_V2DF_V2DF_SWAP:
35079 gcc_assert (comparison != UNKNOWN);
35080 nargs = 2;
35081 swap = true;
35082 break;
35083 case V16HI_FTYPE_V16HI_V8HI_COUNT:
35084 case V16HI_FTYPE_V16HI_SI_COUNT:
35085 case V8SI_FTYPE_V8SI_V4SI_COUNT:
35086 case V8SI_FTYPE_V8SI_SI_COUNT:
35087 case V4DI_FTYPE_V4DI_V2DI_COUNT:
35088 case V4DI_FTYPE_V4DI_INT_COUNT:
35089 case V8HI_FTYPE_V8HI_V8HI_COUNT:
35090 case V8HI_FTYPE_V8HI_SI_COUNT:
35091 case V4SI_FTYPE_V4SI_V4SI_COUNT:
35092 case V4SI_FTYPE_V4SI_SI_COUNT:
35093 case V4HI_FTYPE_V4HI_V4HI_COUNT:
35094 case V4HI_FTYPE_V4HI_SI_COUNT:
35095 case V2DI_FTYPE_V2DI_V2DI_COUNT:
35096 case V2DI_FTYPE_V2DI_SI_COUNT:
35097 case V2SI_FTYPE_V2SI_V2SI_COUNT:
35098 case V2SI_FTYPE_V2SI_SI_COUNT:
35099 case V1DI_FTYPE_V1DI_V1DI_COUNT:
35100 case V1DI_FTYPE_V1DI_SI_COUNT:
35101 nargs = 2;
35102 second_arg_count = true;
35103 break;
35104 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
35105 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
35106 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
35107 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
35108 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
35109 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
35110 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
35111 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
35112 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
35113 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
35114 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
35115 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
35116 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
35117 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
35118 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
35119 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
35120 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
35121 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
35122 nargs = 4;
35123 second_arg_count = true;
35124 break;
35125 case UINT64_FTYPE_UINT64_UINT64:
35126 case UINT_FTYPE_UINT_UINT:
35127 case UINT_FTYPE_UINT_USHORT:
35128 case UINT_FTYPE_UINT_UCHAR:
35129 case UINT16_FTYPE_UINT16_INT:
35130 case UINT8_FTYPE_UINT8_INT:
35131 case UQI_FTYPE_UQI_UQI:
35132 case UHI_FTYPE_UHI_UHI:
35133 case USI_FTYPE_USI_USI:
35134 case UDI_FTYPE_UDI_UDI:
35135 case V16SI_FTYPE_V8DF_V8DF:
35136 nargs = 2;
35137 break;
35138 case V2DI_FTYPE_V2DI_INT_CONVERT:
35139 nargs = 2;
35140 rmode = V1TImode;
35141 nargs_constant = 1;
35142 break;
35143 case V4DI_FTYPE_V4DI_INT_CONVERT:
35144 nargs = 2;
35145 rmode = V2TImode;
35146 nargs_constant = 1;
35147 break;
35148 case V8DI_FTYPE_V8DI_INT_CONVERT:
35149 nargs = 2;
35150 rmode = V4TImode;
35151 nargs_constant = 1;
35152 break;
35153 case V8HI_FTYPE_V8HI_INT:
35154 case V8HI_FTYPE_V8SF_INT:
35155 case V16HI_FTYPE_V16SF_INT:
35156 case V8HI_FTYPE_V4SF_INT:
35157 case V8SF_FTYPE_V8SF_INT:
35158 case V4SF_FTYPE_V16SF_INT:
35159 case V16SF_FTYPE_V16SF_INT:
35160 case V4SI_FTYPE_V4SI_INT:
35161 case V4SI_FTYPE_V8SI_INT:
35162 case V4HI_FTYPE_V4HI_INT:
35163 case V4DF_FTYPE_V4DF_INT:
35164 case V4DF_FTYPE_V8DF_INT:
35165 case V4SF_FTYPE_V4SF_INT:
35166 case V4SF_FTYPE_V8SF_INT:
35167 case V2DI_FTYPE_V2DI_INT:
35168 case V2DF_FTYPE_V2DF_INT:
35169 case V2DF_FTYPE_V4DF_INT:
35170 case V16HI_FTYPE_V16HI_INT:
35171 case V8SI_FTYPE_V8SI_INT:
35172 case V16SI_FTYPE_V16SI_INT:
35173 case V4SI_FTYPE_V16SI_INT:
35174 case V4DI_FTYPE_V4DI_INT:
35175 case V2DI_FTYPE_V4DI_INT:
35176 case V4DI_FTYPE_V8DI_INT:
35177 case QI_FTYPE_V4SF_INT:
35178 case QI_FTYPE_V2DF_INT:
35179 case UQI_FTYPE_UQI_UQI_CONST:
35180 case UHI_FTYPE_UHI_UQI:
35181 case USI_FTYPE_USI_UQI:
35182 case UDI_FTYPE_UDI_UQI:
35183 nargs = 2;
35184 nargs_constant = 1;
35185 break;
35186 case V16QI_FTYPE_V16QI_V16QI_V16QI:
35187 case V8SF_FTYPE_V8SF_V8SF_V8SF:
35188 case V4DF_FTYPE_V4DF_V4DF_V4DF:
35189 case V4SF_FTYPE_V4SF_V4SF_V4SF:
35190 case V2DF_FTYPE_V2DF_V2DF_V2DF:
35191 case V32QI_FTYPE_V32QI_V32QI_V32QI:
35192 case UHI_FTYPE_V16SI_V16SI_UHI:
35193 case UQI_FTYPE_V8DI_V8DI_UQI:
35194 case V16HI_FTYPE_V16SI_V16HI_UHI:
35195 case V16QI_FTYPE_V16SI_V16QI_UHI:
35196 case V16QI_FTYPE_V8DI_V16QI_UQI:
35197 case V16SF_FTYPE_V16SF_V16SF_UHI:
35198 case V16SF_FTYPE_V4SF_V16SF_UHI:
35199 case V16SI_FTYPE_SI_V16SI_UHI:
35200 case V16SI_FTYPE_V16HI_V16SI_UHI:
35201 case V16SI_FTYPE_V16QI_V16SI_UHI:
35202 case V8SF_FTYPE_V4SF_V8SF_UQI:
35203 case V4DF_FTYPE_V2DF_V4DF_UQI:
35204 case V8SI_FTYPE_V4SI_V8SI_UQI:
35205 case V8SI_FTYPE_SI_V8SI_UQI:
35206 case V4SI_FTYPE_V4SI_V4SI_UQI:
35207 case V4SI_FTYPE_SI_V4SI_UQI:
35208 case V4DI_FTYPE_V2DI_V4DI_UQI:
35209 case V4DI_FTYPE_DI_V4DI_UQI:
35210 case V2DI_FTYPE_V2DI_V2DI_UQI:
35211 case V2DI_FTYPE_DI_V2DI_UQI:
35212 case V64QI_FTYPE_V64QI_V64QI_UDI:
35213 case V64QI_FTYPE_V16QI_V64QI_UDI:
35214 case V64QI_FTYPE_QI_V64QI_UDI:
35215 case V32QI_FTYPE_V32QI_V32QI_USI:
35216 case V32QI_FTYPE_V16QI_V32QI_USI:
35217 case V32QI_FTYPE_QI_V32QI_USI:
35218 case V16QI_FTYPE_V16QI_V16QI_UHI:
35219 case V16QI_FTYPE_QI_V16QI_UHI:
35220 case V32HI_FTYPE_V8HI_V32HI_USI:
35221 case V32HI_FTYPE_HI_V32HI_USI:
35222 case V16HI_FTYPE_V8HI_V16HI_UHI:
35223 case V16HI_FTYPE_HI_V16HI_UHI:
35224 case V8HI_FTYPE_V8HI_V8HI_UQI:
35225 case V8HI_FTYPE_HI_V8HI_UQI:
35226 case V8SF_FTYPE_V8HI_V8SF_UQI:
35227 case V4SF_FTYPE_V8HI_V4SF_UQI:
35228 case V8SI_FTYPE_V8SF_V8SI_UQI:
35229 case V4SI_FTYPE_V4SF_V4SI_UQI:
35230 case V4DI_FTYPE_V4SF_V4DI_UQI:
35231 case V2DI_FTYPE_V4SF_V2DI_UQI:
35232 case V4SF_FTYPE_V4DI_V4SF_UQI:
35233 case V4SF_FTYPE_V2DI_V4SF_UQI:
35234 case V4DF_FTYPE_V4DI_V4DF_UQI:
35235 case V2DF_FTYPE_V2DI_V2DF_UQI:
35236 case V16QI_FTYPE_V8HI_V16QI_UQI:
35237 case V16QI_FTYPE_V16HI_V16QI_UHI:
35238 case V16QI_FTYPE_V4SI_V16QI_UQI:
35239 case V16QI_FTYPE_V8SI_V16QI_UQI:
35240 case V8HI_FTYPE_V4SI_V8HI_UQI:
35241 case V8HI_FTYPE_V8SI_V8HI_UQI:
35242 case V16QI_FTYPE_V2DI_V16QI_UQI:
35243 case V16QI_FTYPE_V4DI_V16QI_UQI:
35244 case V8HI_FTYPE_V2DI_V8HI_UQI:
35245 case V8HI_FTYPE_V4DI_V8HI_UQI:
35246 case V4SI_FTYPE_V2DI_V4SI_UQI:
35247 case V4SI_FTYPE_V4DI_V4SI_UQI:
35248 case V32QI_FTYPE_V32HI_V32QI_USI:
35249 case UHI_FTYPE_V16QI_V16QI_UHI:
35250 case USI_FTYPE_V32QI_V32QI_USI:
35251 case UDI_FTYPE_V64QI_V64QI_UDI:
35252 case UQI_FTYPE_V8HI_V8HI_UQI:
35253 case UHI_FTYPE_V16HI_V16HI_UHI:
35254 case USI_FTYPE_V32HI_V32HI_USI:
35255 case UQI_FTYPE_V4SI_V4SI_UQI:
35256 case UQI_FTYPE_V8SI_V8SI_UQI:
35257 case UQI_FTYPE_V2DI_V2DI_UQI:
35258 case UQI_FTYPE_V4DI_V4DI_UQI:
35259 case V4SF_FTYPE_V2DF_V4SF_UQI:
35260 case V4SF_FTYPE_V4DF_V4SF_UQI:
35261 case V16SI_FTYPE_V16SI_V16SI_UHI:
35262 case V16SI_FTYPE_V4SI_V16SI_UHI:
35263 case V2DI_FTYPE_V4SI_V2DI_UQI:
35264 case V2DI_FTYPE_V8HI_V2DI_UQI:
35265 case V2DI_FTYPE_V16QI_V2DI_UQI:
35266 case V4DI_FTYPE_V4DI_V4DI_UQI:
35267 case V4DI_FTYPE_V4SI_V4DI_UQI:
35268 case V4DI_FTYPE_V8HI_V4DI_UQI:
35269 case V4DI_FTYPE_V16QI_V4DI_UQI:
35270 case V4DI_FTYPE_V4DF_V4DI_UQI:
35271 case V2DI_FTYPE_V2DF_V2DI_UQI:
35272 case V4SI_FTYPE_V4DF_V4SI_UQI:
35273 case V4SI_FTYPE_V2DF_V4SI_UQI:
35274 case V4SI_FTYPE_V8HI_V4SI_UQI:
35275 case V4SI_FTYPE_V16QI_V4SI_UQI:
35276 case V4DI_FTYPE_V4DI_V4DI_V4DI:
35277 case V8DF_FTYPE_V2DF_V8DF_UQI:
35278 case V8DF_FTYPE_V4DF_V8DF_UQI:
35279 case V8DF_FTYPE_V8DF_V8DF_UQI:
35280 case V8SF_FTYPE_V8SF_V8SF_UQI:
35281 case V8SF_FTYPE_V8SI_V8SF_UQI:
35282 case V4DF_FTYPE_V4DF_V4DF_UQI:
35283 case V4SF_FTYPE_V4SF_V4SF_UQI:
35284 case V2DF_FTYPE_V2DF_V2DF_UQI:
35285 case V2DF_FTYPE_V4SF_V2DF_UQI:
35286 case V2DF_FTYPE_V4SI_V2DF_UQI:
35287 case V4SF_FTYPE_V4SI_V4SF_UQI:
35288 case V4DF_FTYPE_V4SF_V4DF_UQI:
35289 case V4DF_FTYPE_V4SI_V4DF_UQI:
35290 case V8SI_FTYPE_V8SI_V8SI_UQI:
35291 case V8SI_FTYPE_V8HI_V8SI_UQI:
35292 case V8SI_FTYPE_V16QI_V8SI_UQI:
35293 case V8DF_FTYPE_V8SI_V8DF_UQI:
35294 case V8DI_FTYPE_DI_V8DI_UQI:
35295 case V16SF_FTYPE_V8SF_V16SF_UHI:
35296 case V16SI_FTYPE_V8SI_V16SI_UHI:
35297 case V16HI_FTYPE_V16HI_V16HI_UHI:
35298 case V8HI_FTYPE_V16QI_V8HI_UQI:
35299 case V16HI_FTYPE_V16QI_V16HI_UHI:
35300 case V32HI_FTYPE_V32HI_V32HI_USI:
35301 case V32HI_FTYPE_V32QI_V32HI_USI:
35302 case V8DI_FTYPE_V16QI_V8DI_UQI:
35303 case V8DI_FTYPE_V2DI_V8DI_UQI:
35304 case V8DI_FTYPE_V4DI_V8DI_UQI:
35305 case V8DI_FTYPE_V8DI_V8DI_UQI:
35306 case V8DI_FTYPE_V8HI_V8DI_UQI:
35307 case V8DI_FTYPE_V8SI_V8DI_UQI:
35308 case V8HI_FTYPE_V8DI_V8HI_UQI:
35309 case V8SI_FTYPE_V8DI_V8SI_UQI:
35310 case V4SI_FTYPE_V4SI_V4SI_V4SI:
35311 case V16SI_FTYPE_V16SI_V16SI_V16SI:
35312 case V8DI_FTYPE_V8DI_V8DI_V8DI:
35313 case V32HI_FTYPE_V32HI_V32HI_V32HI:
35314 case V2DI_FTYPE_V2DI_V2DI_V2DI:
35315 case V16HI_FTYPE_V16HI_V16HI_V16HI:
35316 case V8SI_FTYPE_V8SI_V8SI_V8SI:
35317 case V8HI_FTYPE_V8HI_V8HI_V8HI:
35318 nargs = 3;
35319 break;
35320 case V32QI_FTYPE_V32QI_V32QI_INT:
35321 case V16HI_FTYPE_V16HI_V16HI_INT:
35322 case V16QI_FTYPE_V16QI_V16QI_INT:
35323 case V4DI_FTYPE_V4DI_V4DI_INT:
35324 case V8HI_FTYPE_V8HI_V8HI_INT:
35325 case V8SI_FTYPE_V8SI_V8SI_INT:
35326 case V8SI_FTYPE_V8SI_V4SI_INT:
35327 case V8SF_FTYPE_V8SF_V8SF_INT:
35328 case V8SF_FTYPE_V8SF_V4SF_INT:
35329 case V4SI_FTYPE_V4SI_V4SI_INT:
35330 case V4DF_FTYPE_V4DF_V4DF_INT:
35331 case V16SF_FTYPE_V16SF_V16SF_INT:
35332 case V16SF_FTYPE_V16SF_V4SF_INT:
35333 case V16SI_FTYPE_V16SI_V4SI_INT:
35334 case V4DF_FTYPE_V4DF_V2DF_INT:
35335 case V4SF_FTYPE_V4SF_V4SF_INT:
35336 case V2DI_FTYPE_V2DI_V2DI_INT:
35337 case V4DI_FTYPE_V4DI_V2DI_INT:
35338 case V2DF_FTYPE_V2DF_V2DF_INT:
35339 case UQI_FTYPE_V8DI_V8UDI_INT:
35340 case UQI_FTYPE_V8DF_V8DF_INT:
35341 case UQI_FTYPE_V2DF_V2DF_INT:
35342 case UQI_FTYPE_V4SF_V4SF_INT:
35343 case UHI_FTYPE_V16SI_V16SI_INT:
35344 case UHI_FTYPE_V16SF_V16SF_INT:
35345 case V64QI_FTYPE_V64QI_V64QI_INT:
35346 case V32HI_FTYPE_V32HI_V32HI_INT:
35347 case V16SI_FTYPE_V16SI_V16SI_INT:
35348 case V8DI_FTYPE_V8DI_V8DI_INT:
35349 nargs = 3;
35350 nargs_constant = 1;
35351 break;
35352 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
35353 nargs = 3;
35354 rmode = V4DImode;
35355 nargs_constant = 1;
35356 break;
35357 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
35358 nargs = 3;
35359 rmode = V2DImode;
35360 nargs_constant = 1;
35361 break;
35362 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
35363 nargs = 3;
35364 rmode = DImode;
35365 nargs_constant = 1;
35366 break;
35367 case V2DI_FTYPE_V2DI_UINT_UINT:
35368 nargs = 3;
35369 nargs_constant = 2;
35370 break;
35371 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
35372 nargs = 3;
35373 rmode = V8DImode;
35374 nargs_constant = 1;
35375 break;
35376 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
35377 nargs = 5;
35378 rmode = V8DImode;
35379 mask_pos = 2;
35380 nargs_constant = 1;
35381 break;
35382 case QI_FTYPE_V8DF_INT_UQI:
35383 case QI_FTYPE_V4DF_INT_UQI:
35384 case QI_FTYPE_V2DF_INT_UQI:
35385 case HI_FTYPE_V16SF_INT_UHI:
35386 case QI_FTYPE_V8SF_INT_UQI:
35387 case QI_FTYPE_V4SF_INT_UQI:
35388 case V4SI_FTYPE_V4SI_V4SI_UHI:
35389 case V8SI_FTYPE_V8SI_V8SI_UHI:
35390 nargs = 3;
35391 mask_pos = 1;
35392 nargs_constant = 1;
35393 break;
35394 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
35395 nargs = 5;
35396 rmode = V4DImode;
35397 mask_pos = 2;
35398 nargs_constant = 1;
35399 break;
35400 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
35401 nargs = 5;
35402 rmode = V2DImode;
35403 mask_pos = 2;
35404 nargs_constant = 1;
35405 break;
35406 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
35407 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
35408 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
35409 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
35410 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
35411 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
35412 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
35413 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
35414 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
35415 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
35416 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
35417 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
35418 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
35419 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
35420 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
35421 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
35422 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
35423 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
35424 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
35425 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
35426 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
35427 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
35428 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
35429 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
35430 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
35431 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
35432 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
35433 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
35434 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
35435 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
35436 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
35437 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
35438 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
35439 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
35440 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
35441 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
35442 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
35443 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
35444 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
35445 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
35446 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
35447 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
35448 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
35449 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
35450 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
35451 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
35452 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
35453 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
35454 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
35455 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
35456 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
35457 nargs = 4;
35458 break;
35459 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
35460 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
35461 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
35462 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
35463 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
35464 nargs = 4;
35465 nargs_constant = 1;
35466 break;
35467 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
35468 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
35469 case QI_FTYPE_V4DF_V4DF_INT_UQI:
35470 case QI_FTYPE_V8SF_V8SF_INT_UQI:
35471 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
35472 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
35473 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
35474 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
35475 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
35476 case USI_FTYPE_V32QI_V32QI_INT_USI:
35477 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
35478 case USI_FTYPE_V32HI_V32HI_INT_USI:
35479 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
35480 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
35481 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
35482 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
35483 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
35484 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
35485 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
35486 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
35487 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
35488 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
35489 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
35490 nargs = 4;
35491 mask_pos = 1;
35492 nargs_constant = 1;
35493 break;
35494 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
35495 nargs = 4;
35496 nargs_constant = 2;
35497 break;
35498 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
35499 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
35500 nargs = 4;
35501 break;
35502 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
35503 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
35504 mask_pos = 1;
35505 nargs = 4;
35506 nargs_constant = 1;
35507 break;
35508 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
35509 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
35510 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
35511 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
35512 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
35513 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
35514 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
35515 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
35516 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
35517 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
35518 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
35519 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
35520 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
35521 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
35522 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
35523 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
35524 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
35525 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
35526 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
35527 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
35528 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
35529 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
35530 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
35531 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
35532 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
35533 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
35534 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
35535 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
35536 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
35537 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
35538 nargs = 4;
35539 mask_pos = 2;
35540 nargs_constant = 1;
35541 break;
35542 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
35543 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
35544 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
35545 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
35546 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
35547 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
35548 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
35549 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
35550 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
35551 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
35552 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
35553 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
35554 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
35555 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
35556 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
35557 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
35558 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
35559 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
35560 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
35561 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
35562 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
35563 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
35564 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
35565 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
35566 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
35567 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
35568 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
35569 nargs = 5;
35570 mask_pos = 2;
35571 nargs_constant = 1;
35572 break;
35573 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
35574 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
35575 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
35576 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
35577 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
35578 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
35579 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
35580 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
35581 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
35582 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
35583 nargs = 5;
35584 mask_pos = 1;
35585 nargs_constant = 1;
35586 break;
35587 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
35588 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
35589 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
35590 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
35591 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
35592 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
35593 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
35594 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
35595 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
35596 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
35597 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
35598 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
35599 nargs = 5;
35600 mask_pos = 1;
35601 nargs_constant = 2;
35602 break;
35604 default:
35605 gcc_unreachable ();
35608 gcc_assert (nargs <= ARRAY_SIZE (args));
35610 if (comparison != UNKNOWN)
35612 gcc_assert (nargs == 2);
35613 return ix86_expand_sse_compare (d, exp, target, swap);
35616 if (rmode == VOIDmode || rmode == tmode)
35618 if (optimize
35619 || target == 0
35620 || GET_MODE (target) != tmode
35621 || !insn_p->operand[0].predicate (target, tmode))
35622 target = gen_reg_rtx (tmode);
35623 else if (memory_operand (target, tmode))
35624 num_memory++;
35625 real_target = target;
35627 else
35629 real_target = gen_reg_rtx (tmode);
35630 target = lowpart_subreg (rmode, real_target, tmode);
35633 for (i = 0; i < nargs; i++)
35635 tree arg = CALL_EXPR_ARG (exp, i);
35636 rtx op = expand_normal (arg);
35637 machine_mode mode = insn_p->operand[i + 1].mode;
35638 bool match = insn_p->operand[i + 1].predicate (op, mode);
35640 if (second_arg_count && i == 1)
35642 /* SIMD shift insns take either an 8-bit immediate or a
35643 register as the count, but the builtin functions take an
35644 int as the count. If the count operand doesn't satisfy the
35645 predicate, put it in a register. The instructions use a
35646 64-bit count; if op is only 32-bit, zero-extend it, since
35647 negative shift counts are undefined behavior and
35648 zero-extension is more efficient. */
35649 if (!match)
35651 if (SCALAR_INT_MODE_P (GET_MODE (op)))
35652 op = convert_modes (mode, GET_MODE (op), op, 1);
35653 else
35654 op = lowpart_subreg (mode, op, GET_MODE (op));
35655 if (!insn_p->operand[i + 1].predicate (op, mode))
35656 op = copy_to_reg (op);
35659 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35660 (!mask_pos && (nargs - i) <= nargs_constant))
35662 if (!match)
35663 switch (icode)
35665 case CODE_FOR_avx_vinsertf128v4di:
35666 case CODE_FOR_avx_vextractf128v4di:
35667 error ("the last argument must be an 1-bit immediate");
35668 return const0_rtx;
35670 case CODE_FOR_avx512f_cmpv8di3_mask:
35671 case CODE_FOR_avx512f_cmpv16si3_mask:
35672 case CODE_FOR_avx512f_ucmpv8di3_mask:
35673 case CODE_FOR_avx512f_ucmpv16si3_mask:
35674 case CODE_FOR_avx512vl_cmpv4di3_mask:
35675 case CODE_FOR_avx512vl_cmpv8si3_mask:
35676 case CODE_FOR_avx512vl_ucmpv4di3_mask:
35677 case CODE_FOR_avx512vl_ucmpv8si3_mask:
35678 case CODE_FOR_avx512vl_cmpv2di3_mask:
35679 case CODE_FOR_avx512vl_cmpv4si3_mask:
35680 case CODE_FOR_avx512vl_ucmpv2di3_mask:
35681 case CODE_FOR_avx512vl_ucmpv4si3_mask:
35682 error ("the last argument must be a 3-bit immediate");
35683 return const0_rtx;
35685 case CODE_FOR_sse4_1_roundsd:
35686 case CODE_FOR_sse4_1_roundss:
35688 case CODE_FOR_sse4_1_roundpd:
35689 case CODE_FOR_sse4_1_roundps:
35690 case CODE_FOR_avx_roundpd256:
35691 case CODE_FOR_avx_roundps256:
35693 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
35694 case CODE_FOR_sse4_1_roundps_sfix:
35695 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
35696 case CODE_FOR_avx_roundps_sfix256:
35698 case CODE_FOR_sse4_1_blendps:
35699 case CODE_FOR_avx_blendpd256:
35700 case CODE_FOR_avx_vpermilv4df:
35701 case CODE_FOR_avx_vpermilv4df_mask:
35702 case CODE_FOR_avx512f_getmantv8df_mask:
35703 case CODE_FOR_avx512f_getmantv16sf_mask:
35704 case CODE_FOR_avx512vl_getmantv8sf_mask:
35705 case CODE_FOR_avx512vl_getmantv4df_mask:
35706 case CODE_FOR_avx512vl_getmantv4sf_mask:
35707 case CODE_FOR_avx512vl_getmantv2df_mask:
35708 case CODE_FOR_avx512dq_rangepv8df_mask_round:
35709 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
35710 case CODE_FOR_avx512dq_rangepv4df_mask:
35711 case CODE_FOR_avx512dq_rangepv8sf_mask:
35712 case CODE_FOR_avx512dq_rangepv2df_mask:
35713 case CODE_FOR_avx512dq_rangepv4sf_mask:
35714 case CODE_FOR_avx_shufpd256_mask:
35715 error ("the last argument must be a 4-bit immediate");
35716 return const0_rtx;
35718 case CODE_FOR_sha1rnds4:
35719 case CODE_FOR_sse4_1_blendpd:
35720 case CODE_FOR_avx_vpermilv2df:
35721 case CODE_FOR_avx_vpermilv2df_mask:
35722 case CODE_FOR_xop_vpermil2v2df3:
35723 case CODE_FOR_xop_vpermil2v4sf3:
35724 case CODE_FOR_xop_vpermil2v4df3:
35725 case CODE_FOR_xop_vpermil2v8sf3:
35726 case CODE_FOR_avx512f_vinsertf32x4_mask:
35727 case CODE_FOR_avx512f_vinserti32x4_mask:
35728 case CODE_FOR_avx512f_vextractf32x4_mask:
35729 case CODE_FOR_avx512f_vextracti32x4_mask:
35730 case CODE_FOR_sse2_shufpd:
35731 case CODE_FOR_sse2_shufpd_mask:
35732 case CODE_FOR_avx512dq_shuf_f64x2_mask:
35733 case CODE_FOR_avx512dq_shuf_i64x2_mask:
35734 case CODE_FOR_avx512vl_shuf_i32x4_mask:
35735 case CODE_FOR_avx512vl_shuf_f32x4_mask:
35736 error ("the last argument must be a 2-bit immediate");
35737 return const0_rtx;
35739 case CODE_FOR_avx_vextractf128v4df:
35740 case CODE_FOR_avx_vextractf128v8sf:
35741 case CODE_FOR_avx_vextractf128v8si:
35742 case CODE_FOR_avx_vinsertf128v4df:
35743 case CODE_FOR_avx_vinsertf128v8sf:
35744 case CODE_FOR_avx_vinsertf128v8si:
35745 case CODE_FOR_avx512f_vinsertf64x4_mask:
35746 case CODE_FOR_avx512f_vinserti64x4_mask:
35747 case CODE_FOR_avx512f_vextractf64x4_mask:
35748 case CODE_FOR_avx512f_vextracti64x4_mask:
35749 case CODE_FOR_avx512dq_vinsertf32x8_mask:
35750 case CODE_FOR_avx512dq_vinserti32x8_mask:
35751 case CODE_FOR_avx512vl_vinsertv4df:
35752 case CODE_FOR_avx512vl_vinsertv4di:
35753 case CODE_FOR_avx512vl_vinsertv8sf:
35754 case CODE_FOR_avx512vl_vinsertv8si:
35755 error ("the last argument must be a 1-bit immediate");
35756 return const0_rtx;
35758 case CODE_FOR_avx_vmcmpv2df3:
35759 case CODE_FOR_avx_vmcmpv4sf3:
35760 case CODE_FOR_avx_cmpv2df3:
35761 case CODE_FOR_avx_cmpv4sf3:
35762 case CODE_FOR_avx_cmpv4df3:
35763 case CODE_FOR_avx_cmpv8sf3:
35764 case CODE_FOR_avx512f_cmpv8df3_mask:
35765 case CODE_FOR_avx512f_cmpv16sf3_mask:
35766 case CODE_FOR_avx512f_vmcmpv2df3_mask:
35767 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
35768 error ("the last argument must be a 5-bit immediate");
35769 return const0_rtx;
35771 default:
35772 switch (nargs_constant)
35774 case 2:
35775 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
35776 (!mask_pos && (nargs - i) == nargs_constant))
35778 error ("the next to last argument must be an 8-bit immediate");
35779 break;
35781 /* FALLTHRU */
35782 case 1:
35783 error ("the last argument must be an 8-bit immediate");
35784 break;
35785 default:
35786 gcc_unreachable ();
35788 return const0_rtx;
35791 else
35793 if (VECTOR_MODE_P (mode))
35794 op = safe_vector_operand (op, mode);
35796 /* If we aren't optimizing, only allow one memory operand to
35797 be generated. */
35798 if (memory_operand (op, mode))
35799 num_memory++;
35801 op = fixup_modeless_constant (op, mode);
35803 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35805 if (optimize || !match || num_memory > 1)
35806 op = copy_to_mode_reg (mode, op);
35808 else
35810 op = copy_to_reg (op);
35811 op = lowpart_subreg (mode, op, GET_MODE (op));
35815 args[i].op = op;
35816 args[i].mode = mode;
35819 switch (nargs)
35821 case 1:
35822 pat = GEN_FCN (icode) (real_target, args[0].op);
35823 break;
35824 case 2:
35825 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
35826 break;
35827 case 3:
35828 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35829 args[2].op);
35830 break;
35831 case 4:
35832 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35833 args[2].op, args[3].op);
35834 break;
35835 case 5:
35836 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35837 args[2].op, args[3].op, args[4].op);
35838 break;
35839 case 6:
35840 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
35841 args[2].op, args[3].op, args[4].op,
35842 args[5].op);
35843 break;
35844 default:
35845 gcc_unreachable ();
35848 if (! pat)
35849 return 0;
35851 emit_insn (pat);
35852 return target;
35855 /* Transform a pattern of the following layout:
35856 (set A
35857 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
35859 into:
35860 (set A B) */
35862 static rtx
35863 ix86_erase_embedded_rounding (rtx pat)
35865 if (GET_CODE (pat) == INSN)
35866 pat = PATTERN (pat);
35868 gcc_assert (GET_CODE (pat) == SET);
35869 rtx src = SET_SRC (pat);
35870 gcc_assert (XVECLEN (src, 0) == 2);
35871 rtx p0 = XVECEXP (src, 0, 0);
35872 gcc_assert (GET_CODE (src) == UNSPEC
35873 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
35874 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
35875 return res;
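/* Illustrative example (not part of the original sources): for a packed
   add with embedded rounding the expanders first produce something like

     (set (reg:V8DF 100)
          (unspec:V8DF [(plus:V8DF (reg:V8DF 101) (reg:V8DF 102))
                        (const_int 8)]
                       UNSPEC_EMBEDDED_ROUNDING))

   where the const_int is the rounding operand; erasing the rounding
   rewrites this to the plain

     (set (reg:V8DF 100) (plus:V8DF (reg:V8DF 101) (reg:V8DF 102)))

   so that the normal, non-rounding pattern applies.  */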
35878 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
35879 with rounding. */
35880 static rtx
35881 ix86_expand_sse_comi_round (const struct builtin_description *d,
35882 tree exp, rtx target)
35884 rtx pat, set_dst;
35885 tree arg0 = CALL_EXPR_ARG (exp, 0);
35886 tree arg1 = CALL_EXPR_ARG (exp, 1);
35887 tree arg2 = CALL_EXPR_ARG (exp, 2);
35888 tree arg3 = CALL_EXPR_ARG (exp, 3);
35889 rtx op0 = expand_normal (arg0);
35890 rtx op1 = expand_normal (arg1);
35891 rtx op2 = expand_normal (arg2);
35892 rtx op3 = expand_normal (arg3);
35893 enum insn_code icode = d->icode;
35894 const struct insn_data_d *insn_p = &insn_data[icode];
35895 machine_mode mode0 = insn_p->operand[0].mode;
35896 machine_mode mode1 = insn_p->operand[1].mode;
35897 enum rtx_code comparison = UNEQ;
35898 bool need_ucomi = false;
35900 /* See avxintrin.h for values. */
35901 enum rtx_code comi_comparisons[32] =
35903 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
35904 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
35905 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
35907 bool need_ucomi_values[32] =
35909 true, false, false, true, true, false, false, true,
35910 true, false, false, true, true, false, false, true,
35911 false, true, true, false, false, true, true, false,
35912 false, true, true, false, false, true, true, false
35915 if (!CONST_INT_P (op2))
35917 error ("the third argument must be comparison constant");
35918 return const0_rtx;
35920 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
35922 error ("incorrect comparison mode");
35923 return const0_rtx;
35926 if (!insn_p->operand[2].predicate (op3, SImode))
35928 error ("incorrect rounding operand");
35929 return const0_rtx;
35932 comparison = comi_comparisons[INTVAL (op2)];
35933 need_ucomi = need_ucomi_values[INTVAL (op2)];
35935 if (VECTOR_MODE_P (mode0))
35936 op0 = safe_vector_operand (op0, mode0);
35937 if (VECTOR_MODE_P (mode1))
35938 op1 = safe_vector_operand (op1, mode1);
35940 target = gen_reg_rtx (SImode);
35941 emit_move_insn (target, const0_rtx);
35942 target = gen_rtx_SUBREG (QImode, target, 0);
35944 if ((optimize && !register_operand (op0, mode0))
35945 || !insn_p->operand[0].predicate (op0, mode0))
35946 op0 = copy_to_mode_reg (mode0, op0);
35947 if ((optimize && !register_operand (op1, mode1))
35948 || !insn_p->operand[1].predicate (op1, mode1))
35949 op1 = copy_to_mode_reg (mode1, op1);
35951 if (need_ucomi)
35952 icode = icode == CODE_FOR_sse_comi_round
35953 ? CODE_FOR_sse_ucomi_round
35954 : CODE_FOR_sse2_ucomi_round;
35956 pat = GEN_FCN (icode) (op0, op1, op3);
35957 if (! pat)
35958 return 0;
35960 /* The rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
35961 if (INTVAL (op3) == NO_ROUND)
35963 pat = ix86_erase_embedded_rounding (pat);
35964 if (! pat)
35965 return 0;
35967 set_dst = SET_DEST (pat);
35969 else
35971 gcc_assert (GET_CODE (pat) == SET);
35972 set_dst = SET_DEST (pat);
35975 emit_insn (pat);
35976 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
35977 gen_rtx_fmt_ee (comparison, QImode,
35978 set_dst,
35979 const0_rtx)));
35981 return SUBREG_REG (target);
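/* A usage sketch (illustrative, not from the original file): this path is
   reached for the comi/ucomi round intrinsics of avx512fintrin.h, e.g.
   when compiling with -mavx512f:

     #include <immintrin.h>

     int
     ge_no_exceptions (__m128 a, __m128 b)
     {
       return _mm_comi_round_ss (a, b, _CMP_GE_OQ, _MM_FROUND_NO_EXC);
     }

   The third argument (_CMP_GE_OQ == 29) indexes comi_comparisons[] and
   need_ucomi_values[] above, selecting GE and the quiet (ucomi) form;
   the fourth argument is the rounding/SAE operand checked against
   NO_ROUND.  */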
35984 static rtx
35985 ix86_expand_round_builtin (const struct builtin_description *d,
35986 tree exp, rtx target)
35988 rtx pat;
35989 unsigned int i, nargs;
35990 struct
35992 rtx op;
35993 machine_mode mode;
35994 } args[6];
35995 enum insn_code icode = d->icode;
35996 const struct insn_data_d *insn_p = &insn_data[icode];
35997 machine_mode tmode = insn_p->operand[0].mode;
35998 unsigned int nargs_constant = 0;
35999 unsigned int redundant_embed_rnd = 0;
36001 switch ((enum ix86_builtin_func_type) d->flag)
36003 case UINT64_FTYPE_V2DF_INT:
36004 case UINT64_FTYPE_V4SF_INT:
36005 case UINT_FTYPE_V2DF_INT:
36006 case UINT_FTYPE_V4SF_INT:
36007 case INT64_FTYPE_V2DF_INT:
36008 case INT64_FTYPE_V4SF_INT:
36009 case INT_FTYPE_V2DF_INT:
36010 case INT_FTYPE_V4SF_INT:
36011 nargs = 2;
36012 break;
36013 case V4SF_FTYPE_V4SF_UINT_INT:
36014 case V4SF_FTYPE_V4SF_UINT64_INT:
36015 case V2DF_FTYPE_V2DF_UINT64_INT:
36016 case V4SF_FTYPE_V4SF_INT_INT:
36017 case V4SF_FTYPE_V4SF_INT64_INT:
36018 case V2DF_FTYPE_V2DF_INT64_INT:
36019 case V4SF_FTYPE_V4SF_V4SF_INT:
36020 case V2DF_FTYPE_V2DF_V2DF_INT:
36021 case V4SF_FTYPE_V4SF_V2DF_INT:
36022 case V2DF_FTYPE_V2DF_V4SF_INT:
36023 nargs = 3;
36024 break;
36025 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
36026 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
36027 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
36028 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
36029 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
36030 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
36031 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
36032 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
36033 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
36034 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
36035 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
36036 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
36037 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
36038 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
36039 nargs = 4;
36040 break;
36041 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
36042 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
36043 nargs_constant = 2;
36044 nargs = 4;
36045 break;
36046 case INT_FTYPE_V4SF_V4SF_INT_INT:
36047 case INT_FTYPE_V2DF_V2DF_INT_INT:
36048 return ix86_expand_sse_comi_round (d, exp, target);
36049 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
36050 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
36051 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
36052 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
36053 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
36054 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
36055 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
36056 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
36057 nargs = 5;
36058 break;
36059 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
36060 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
36061 nargs_constant = 4;
36062 nargs = 5;
36063 break;
36064 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
36065 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
36066 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
36067 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
36068 nargs_constant = 3;
36069 nargs = 5;
36070 break;
36071 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
36072 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
36073 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
36074 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
36075 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
36076 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
36077 nargs = 6;
36078 nargs_constant = 4;
36079 break;
36080 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
36081 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
36082 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
36083 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
36084 nargs = 6;
36085 nargs_constant = 3;
36086 break;
36087 default:
36088 gcc_unreachable ();
36090 gcc_assert (nargs <= ARRAY_SIZE (args));
36092 if (optimize
36093 || target == 0
36094 || GET_MODE (target) != tmode
36095 || !insn_p->operand[0].predicate (target, tmode))
36096 target = gen_reg_rtx (tmode);
36098 for (i = 0; i < nargs; i++)
36100 tree arg = CALL_EXPR_ARG (exp, i);
36101 rtx op = expand_normal (arg);
36102 machine_mode mode = insn_p->operand[i + 1].mode;
36103 bool match = insn_p->operand[i + 1].predicate (op, mode);
36105 if (i == nargs - nargs_constant)
36107 if (!match)
36109 switch (icode)
36111 case CODE_FOR_avx512f_getmantv8df_mask_round:
36112 case CODE_FOR_avx512f_getmantv16sf_mask_round:
36113 case CODE_FOR_avx512f_vgetmantv2df_round:
36114 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
36115 case CODE_FOR_avx512f_vgetmantv4sf_round:
36116 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
36117 error ("the immediate argument must be a 4-bit immediate");
36118 return const0_rtx;
36119 case CODE_FOR_avx512f_cmpv8df3_mask_round:
36120 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
36121 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
36122 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
36123 error ("the immediate argument must be a 5-bit immediate");
36124 return const0_rtx;
36125 default:
36126 error ("the immediate argument must be an 8-bit immediate");
36127 return const0_rtx;
36131 else if (i == nargs-1)
36133 if (!insn_p->operand[nargs].predicate (op, SImode))
36135 error ("incorrect rounding operand");
36136 return const0_rtx;
36139 /* If there is no rounding, use the normal version of the pattern. */
36140 if (INTVAL (op) == NO_ROUND)
36141 redundant_embed_rnd = 1;
36143 else
36145 if (VECTOR_MODE_P (mode))
36146 op = safe_vector_operand (op, mode);
36148 op = fixup_modeless_constant (op, mode);
36150 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36152 if (optimize || !match)
36153 op = copy_to_mode_reg (mode, op);
36155 else
36157 op = copy_to_reg (op);
36158 op = lowpart_subreg (mode, op, GET_MODE (op));
36162 args[i].op = op;
36163 args[i].mode = mode;
36166 switch (nargs)
36168 case 1:
36169 pat = GEN_FCN (icode) (target, args[0].op);
36170 break;
36171 case 2:
36172 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36173 break;
36174 case 3:
36175 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36176 args[2].op);
36177 break;
36178 case 4:
36179 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36180 args[2].op, args[3].op);
36181 break;
36182 case 5:
36183 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36184 args[2].op, args[3].op, args[4].op);
36185 break;
36186 case 6:
36187 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
36188 args[2].op, args[3].op, args[4].op,
36189 args[5].op);
36190 break;
36191 default:
36192 gcc_unreachable ();
36195 if (!pat)
36196 return 0;
36198 if (redundant_embed_rnd)
36199 pat = ix86_erase_embedded_rounding (pat);
36201 emit_insn (pat);
36202 return target;
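/* A usage sketch (illustrative only): the builtins expanded here carry the
   rounding mode as their last argument, e.g. with -mavx512f:

     #include <immintrin.h>

     __m512d
     add_round_to_nearest (__m512d a, __m512d b)
     {
       return _mm512_add_round_pd (a, b,
                                   _MM_FROUND_TO_NEAREST_INT
                                   | _MM_FROUND_NO_EXC);
     }

   Passing _MM_FROUND_CUR_DIRECTION instead makes INTVAL (op) == NO_ROUND
   above, so redundant_embed_rnd is set and the embedded-rounding unspec
   is erased again.  */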
36205 /* Subroutine of ix86_expand_builtin to take care of special insns
36206 with a variable number of operands. */
36208 static rtx
36209 ix86_expand_special_args_builtin (const struct builtin_description *d,
36210 tree exp, rtx target)
36212 tree arg;
36213 rtx pat, op;
36214 unsigned int i, nargs, arg_adjust, memory;
36215 bool aligned_mem = false;
36216 struct
36218 rtx op;
36219 machine_mode mode;
36220 } args[3];
36221 enum insn_code icode = d->icode;
36222 bool last_arg_constant = false;
36223 const struct insn_data_d *insn_p = &insn_data[icode];
36224 machine_mode tmode = insn_p->operand[0].mode;
36225 enum { load, store } klass;
36227 switch ((enum ix86_builtin_func_type) d->flag)
36229 case VOID_FTYPE_VOID:
36230 emit_insn (GEN_FCN (icode) (target));
36231 return 0;
36232 case VOID_FTYPE_UINT64:
36233 case VOID_FTYPE_UNSIGNED:
36234 nargs = 0;
36235 klass = store;
36236 memory = 0;
36237 break;
36239 case INT_FTYPE_VOID:
36240 case USHORT_FTYPE_VOID:
36241 case UINT64_FTYPE_VOID:
36242 case UINT_FTYPE_VOID:
36243 case UNSIGNED_FTYPE_VOID:
36244 nargs = 0;
36245 klass = load;
36246 memory = 0;
36247 break;
36248 case UINT64_FTYPE_PUNSIGNED:
36249 case V2DI_FTYPE_PV2DI:
36250 case V4DI_FTYPE_PV4DI:
36251 case V32QI_FTYPE_PCCHAR:
36252 case V16QI_FTYPE_PCCHAR:
36253 case V8SF_FTYPE_PCV4SF:
36254 case V8SF_FTYPE_PCFLOAT:
36255 case V4SF_FTYPE_PCFLOAT:
36256 case V4DF_FTYPE_PCV2DF:
36257 case V4DF_FTYPE_PCDOUBLE:
36258 case V2DF_FTYPE_PCDOUBLE:
36259 case VOID_FTYPE_PVOID:
36260 case V8DI_FTYPE_PV8DI:
36261 nargs = 1;
36262 klass = load;
36263 memory = 0;
36264 switch (icode)
36266 case CODE_FOR_sse4_1_movntdqa:
36267 case CODE_FOR_avx2_movntdqa:
36268 case CODE_FOR_avx512f_movntdqa:
36269 aligned_mem = true;
36270 break;
36271 default:
36272 break;
36274 break;
36275 case VOID_FTYPE_PV2SF_V4SF:
36276 case VOID_FTYPE_PV8DI_V8DI:
36277 case VOID_FTYPE_PV4DI_V4DI:
36278 case VOID_FTYPE_PV2DI_V2DI:
36279 case VOID_FTYPE_PCHAR_V32QI:
36280 case VOID_FTYPE_PCHAR_V16QI:
36281 case VOID_FTYPE_PFLOAT_V16SF:
36282 case VOID_FTYPE_PFLOAT_V8SF:
36283 case VOID_FTYPE_PFLOAT_V4SF:
36284 case VOID_FTYPE_PDOUBLE_V8DF:
36285 case VOID_FTYPE_PDOUBLE_V4DF:
36286 case VOID_FTYPE_PDOUBLE_V2DF:
36287 case VOID_FTYPE_PLONGLONG_LONGLONG:
36288 case VOID_FTYPE_PULONGLONG_ULONGLONG:
36289 case VOID_FTYPE_PUNSIGNED_UNSIGNED:
36290 case VOID_FTYPE_PINT_INT:
36291 nargs = 1;
36292 klass = store;
36293 /* Reserve memory operand for target. */
36294 memory = ARRAY_SIZE (args);
36295 switch (icode)
36297 /* These builtins and instructions require the memory
36298 to be properly aligned. */
36299 case CODE_FOR_avx_movntv4di:
36300 case CODE_FOR_sse2_movntv2di:
36301 case CODE_FOR_avx_movntv8sf:
36302 case CODE_FOR_sse_movntv4sf:
36303 case CODE_FOR_sse4a_vmmovntv4sf:
36304 case CODE_FOR_avx_movntv4df:
36305 case CODE_FOR_sse2_movntv2df:
36306 case CODE_FOR_sse4a_vmmovntv2df:
36307 case CODE_FOR_sse2_movntidi:
36308 case CODE_FOR_sse_movntq:
36309 case CODE_FOR_sse2_movntisi:
36310 case CODE_FOR_avx512f_movntv16sf:
36311 case CODE_FOR_avx512f_movntv8df:
36312 case CODE_FOR_avx512f_movntv8di:
36313 aligned_mem = true;
36314 break;
36315 default:
36316 break;
36318 break;
36319 case VOID_FTYPE_PVOID_PCVOID:
36320 nargs = 1;
36321 klass = store;
36322 memory = 0;
36324 break;
36325 case V4SF_FTYPE_V4SF_PCV2SF:
36326 case V2DF_FTYPE_V2DF_PCDOUBLE:
36327 nargs = 2;
36328 klass = load;
36329 memory = 1;
36330 break;
36331 case V8SF_FTYPE_PCV8SF_V8SI:
36332 case V4DF_FTYPE_PCV4DF_V4DI:
36333 case V4SF_FTYPE_PCV4SF_V4SI:
36334 case V2DF_FTYPE_PCV2DF_V2DI:
36335 case V8SI_FTYPE_PCV8SI_V8SI:
36336 case V4DI_FTYPE_PCV4DI_V4DI:
36337 case V4SI_FTYPE_PCV4SI_V4SI:
36338 case V2DI_FTYPE_PCV2DI_V2DI:
36339 case VOID_FTYPE_INT_INT64:
36340 nargs = 2;
36341 klass = load;
36342 memory = 0;
36343 break;
36344 case VOID_FTYPE_PV8DF_V8DF_UQI:
36345 case VOID_FTYPE_PV4DF_V4DF_UQI:
36346 case VOID_FTYPE_PV2DF_V2DF_UQI:
36347 case VOID_FTYPE_PV16SF_V16SF_UHI:
36348 case VOID_FTYPE_PV8SF_V8SF_UQI:
36349 case VOID_FTYPE_PV4SF_V4SF_UQI:
36350 case VOID_FTYPE_PV8DI_V8DI_UQI:
36351 case VOID_FTYPE_PV4DI_V4DI_UQI:
36352 case VOID_FTYPE_PV2DI_V2DI_UQI:
36353 case VOID_FTYPE_PV16SI_V16SI_UHI:
36354 case VOID_FTYPE_PV8SI_V8SI_UQI:
36355 case VOID_FTYPE_PV4SI_V4SI_UQI:
36356 case VOID_FTYPE_PV64QI_V64QI_UDI:
36357 case VOID_FTYPE_PV32HI_V32HI_USI:
36358 case VOID_FTYPE_PV32QI_V32QI_USI:
36359 case VOID_FTYPE_PV16QI_V16QI_UHI:
36360 case VOID_FTYPE_PV16HI_V16HI_UHI:
36361 case VOID_FTYPE_PV8HI_V8HI_UQI:
36362 switch (icode)
36364 /* These builtins and instructions require the memory
36365 to be properly aligned. */
36366 case CODE_FOR_avx512f_storev16sf_mask:
36367 case CODE_FOR_avx512f_storev16si_mask:
36368 case CODE_FOR_avx512f_storev8df_mask:
36369 case CODE_FOR_avx512f_storev8di_mask:
36370 case CODE_FOR_avx512vl_storev8sf_mask:
36371 case CODE_FOR_avx512vl_storev8si_mask:
36372 case CODE_FOR_avx512vl_storev4df_mask:
36373 case CODE_FOR_avx512vl_storev4di_mask:
36374 case CODE_FOR_avx512vl_storev4sf_mask:
36375 case CODE_FOR_avx512vl_storev4si_mask:
36376 case CODE_FOR_avx512vl_storev2df_mask:
36377 case CODE_FOR_avx512vl_storev2di_mask:
36378 aligned_mem = true;
36379 break;
36380 default:
36381 break;
36383 /* FALLTHRU */
36384 case VOID_FTYPE_PV8SF_V8SI_V8SF:
36385 case VOID_FTYPE_PV4DF_V4DI_V4DF:
36386 case VOID_FTYPE_PV4SF_V4SI_V4SF:
36387 case VOID_FTYPE_PV2DF_V2DI_V2DF:
36388 case VOID_FTYPE_PV8SI_V8SI_V8SI:
36389 case VOID_FTYPE_PV4DI_V4DI_V4DI:
36390 case VOID_FTYPE_PV4SI_V4SI_V4SI:
36391 case VOID_FTYPE_PV2DI_V2DI_V2DI:
36392 case VOID_FTYPE_PV8SI_V8DI_UQI:
36393 case VOID_FTYPE_PV8HI_V8DI_UQI:
36394 case VOID_FTYPE_PV16HI_V16SI_UHI:
36395 case VOID_FTYPE_PV16QI_V8DI_UQI:
36396 case VOID_FTYPE_PV16QI_V16SI_UHI:
36397 case VOID_FTYPE_PV4SI_V4DI_UQI:
36398 case VOID_FTYPE_PV4SI_V2DI_UQI:
36399 case VOID_FTYPE_PV8HI_V4DI_UQI:
36400 case VOID_FTYPE_PV8HI_V2DI_UQI:
36401 case VOID_FTYPE_PV8HI_V8SI_UQI:
36402 case VOID_FTYPE_PV8HI_V4SI_UQI:
36403 case VOID_FTYPE_PV16QI_V4DI_UQI:
36404 case VOID_FTYPE_PV16QI_V2DI_UQI:
36405 case VOID_FTYPE_PV16QI_V8SI_UQI:
36406 case VOID_FTYPE_PV16QI_V4SI_UQI:
36407 case VOID_FTYPE_PCHAR_V64QI_UDI:
36408 case VOID_FTYPE_PCHAR_V32QI_USI:
36409 case VOID_FTYPE_PCHAR_V16QI_UHI:
36410 case VOID_FTYPE_PSHORT_V32HI_USI:
36411 case VOID_FTYPE_PSHORT_V16HI_UHI:
36412 case VOID_FTYPE_PSHORT_V8HI_UQI:
36413 case VOID_FTYPE_PINT_V16SI_UHI:
36414 case VOID_FTYPE_PINT_V8SI_UQI:
36415 case VOID_FTYPE_PINT_V4SI_UQI:
36416 case VOID_FTYPE_PINT64_V8DI_UQI:
36417 case VOID_FTYPE_PINT64_V4DI_UQI:
36418 case VOID_FTYPE_PINT64_V2DI_UQI:
36419 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
36420 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
36421 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
36422 case VOID_FTYPE_PFLOAT_V16SF_UHI:
36423 case VOID_FTYPE_PFLOAT_V8SF_UQI:
36424 case VOID_FTYPE_PFLOAT_V4SF_UQI:
36425 case VOID_FTYPE_PV32QI_V32HI_USI:
36426 case VOID_FTYPE_PV16QI_V16HI_UHI:
36427 case VOID_FTYPE_PV8QI_V8HI_UQI:
36428 nargs = 2;
36429 klass = store;
36430 /* Reserve memory operand for target. */
36431 memory = ARRAY_SIZE (args);
36432 break;
36433 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
36434 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
36435 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
36436 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
36437 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
36438 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
36439 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
36440 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
36441 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
36442 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
36443 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
36444 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
36445 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
36446 case V32HI_FTYPE_PCV32HI_V32HI_USI:
36447 case V32QI_FTYPE_PCV32QI_V32QI_USI:
36448 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
36449 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
36450 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
36451 switch (icode)
36453 /* These builtins and instructions require the memory
36454 to be properly aligned. */
36455 case CODE_FOR_avx512f_loadv16sf_mask:
36456 case CODE_FOR_avx512f_loadv16si_mask:
36457 case CODE_FOR_avx512f_loadv8df_mask:
36458 case CODE_FOR_avx512f_loadv8di_mask:
36459 case CODE_FOR_avx512vl_loadv8sf_mask:
36460 case CODE_FOR_avx512vl_loadv8si_mask:
36461 case CODE_FOR_avx512vl_loadv4df_mask:
36462 case CODE_FOR_avx512vl_loadv4di_mask:
36463 case CODE_FOR_avx512vl_loadv4sf_mask:
36464 case CODE_FOR_avx512vl_loadv4si_mask:
36465 case CODE_FOR_avx512vl_loadv2df_mask:
36466 case CODE_FOR_avx512vl_loadv2di_mask:
36467 case CODE_FOR_avx512bw_loadv64qi_mask:
36468 case CODE_FOR_avx512vl_loadv32qi_mask:
36469 case CODE_FOR_avx512vl_loadv16qi_mask:
36470 case CODE_FOR_avx512bw_loadv32hi_mask:
36471 case CODE_FOR_avx512vl_loadv16hi_mask:
36472 case CODE_FOR_avx512vl_loadv8hi_mask:
36473 aligned_mem = true;
36474 break;
36475 default:
36476 break;
36478 /* FALLTHRU */
36479 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
36480 case V32QI_FTYPE_PCCHAR_V32QI_USI:
36481 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
36482 case V32HI_FTYPE_PCSHORT_V32HI_USI:
36483 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
36484 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
36485 case V16SI_FTYPE_PCINT_V16SI_UHI:
36486 case V8SI_FTYPE_PCINT_V8SI_UQI:
36487 case V4SI_FTYPE_PCINT_V4SI_UQI:
36488 case V8DI_FTYPE_PCINT64_V8DI_UQI:
36489 case V4DI_FTYPE_PCINT64_V4DI_UQI:
36490 case V2DI_FTYPE_PCINT64_V2DI_UQI:
36491 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
36492 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
36493 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
36494 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
36495 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
36496 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
36497 nargs = 3;
36498 klass = load;
36499 memory = 0;
36500 break;
36501 case VOID_FTYPE_UINT_UINT_UINT:
36502 case VOID_FTYPE_UINT64_UINT_UINT:
36503 case UCHAR_FTYPE_UINT_UINT_UINT:
36504 case UCHAR_FTYPE_UINT64_UINT_UINT:
36505 nargs = 3;
36506 klass = load;
36507 memory = ARRAY_SIZE (args);
36508 last_arg_constant = true;
36509 break;
36510 default:
36511 gcc_unreachable ();
36514 gcc_assert (nargs <= ARRAY_SIZE (args));
36516 if (klass == store)
36518 arg = CALL_EXPR_ARG (exp, 0);
36519 op = expand_normal (arg);
36520 gcc_assert (target == 0);
36521 if (memory)
36523 op = ix86_zero_extend_to_Pmode (op);
36524 target = gen_rtx_MEM (tmode, op);
36525 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
36526 on it. Try to improve it using get_pointer_alignment,
36527 and if the special builtin is one that requires strict
36528 mode alignment, also from its GET_MODE_ALIGNMENT.
36529 Failure to do so could lead to ix86_legitimate_combined_insn
36530 rejecting all changes to such insns. */
36531 unsigned int align = get_pointer_alignment (arg);
36532 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
36533 align = GET_MODE_ALIGNMENT (tmode);
36534 if (MEM_ALIGN (target) < align)
36535 set_mem_align (target, align);
36537 else
36538 target = force_reg (tmode, op);
36539 arg_adjust = 1;
36541 else
36543 arg_adjust = 0;
36544 if (optimize
36545 || target == 0
36546 || !register_operand (target, tmode)
36547 || GET_MODE (target) != tmode)
36548 target = gen_reg_rtx (tmode);
36551 for (i = 0; i < nargs; i++)
36553 machine_mode mode = insn_p->operand[i + 1].mode;
36554 bool match;
36556 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
36557 op = expand_normal (arg);
36558 match = insn_p->operand[i + 1].predicate (op, mode);
36560 if (last_arg_constant && (i + 1) == nargs)
36562 if (!match)
36564 if (icode == CODE_FOR_lwp_lwpvalsi3
36565 || icode == CODE_FOR_lwp_lwpinssi3
36566 || icode == CODE_FOR_lwp_lwpvaldi3
36567 || icode == CODE_FOR_lwp_lwpinsdi3)
36568 error ("the last argument must be a 32-bit immediate");
36569 else
36570 error ("the last argument must be an 8-bit immediate");
36571 return const0_rtx;
36574 else
36576 if (i == memory)
36578 /* This must be the memory operand. */
36579 op = ix86_zero_extend_to_Pmode (op);
36580 op = gen_rtx_MEM (mode, op);
36581 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
36582 on it. Try to improve it using get_pointer_alignment,
36583 and if the special builtin is one that requires strict
36584 mode alignment, also from its GET_MODE_ALIGNMENT.
36585 Failure to do so could lead to ix86_legitimate_combined_insn
36586 rejecting all changes to such insns. */
36587 unsigned int align = get_pointer_alignment (arg);
36588 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
36589 align = GET_MODE_ALIGNMENT (mode);
36590 if (MEM_ALIGN (op) < align)
36591 set_mem_align (op, align);
36593 else
36595 /* This must be a register. */
36596 if (VECTOR_MODE_P (mode))
36597 op = safe_vector_operand (op, mode);
36599 op = fixup_modeless_constant (op, mode);
36601 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
36602 op = copy_to_mode_reg (mode, op);
36603 else
36605 op = copy_to_reg (op);
36606 op = lowpart_subreg (mode, op, GET_MODE (op));
36611 args[i].op = op;
36612 args[i].mode = mode;
36615 switch (nargs)
36617 case 0:
36618 pat = GEN_FCN (icode) (target);
36619 break;
36620 case 1:
36621 pat = GEN_FCN (icode) (target, args[0].op);
36622 break;
36623 case 2:
36624 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
36625 break;
36626 case 3:
36627 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
36628 break;
36629 default:
36630 gcc_unreachable ();
36633 if (! pat)
36634 return 0;
36635 emit_insn (pat);
36636 return klass == store ? 0 : target;
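/* Illustrative note (not in the original sources): the aligned_mem
   handling above matters for the non-temporal store and masked
   load/store builtins, e.g. with -mavx:

     #include <immintrin.h>

     void
     stream_store (float *p, __m256 v)
     {
       _mm256_stream_ps (p, v);
     }

   This reaches CODE_FOR_avx_movntv8sf, which requires P to be 32-byte
   aligned, so MEM_ALIGN on the generated MEM is raised to
   GET_MODE_ALIGNMENT (V8SFmode).  */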
36639 /* Return the integer constant in ARG. Constrain it to be in the range
36640 of the subparts of VEC_TYPE; issue an error if not. */
36642 static int
36643 get_element_number (tree vec_type, tree arg)
36645 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
36647 if (!tree_fits_uhwi_p (arg)
36648 || (elt = tree_to_uhwi (arg), elt > max))
36650 error ("selector must be an integer constant in the range 0..%wi", max);
36651 return 0;
36654 return elt;
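/* Example (illustrative): for a vector type with mode V4SFmode,
   TYPE_VECTOR_SUBPARTS is 4, so MAX is 3.  A call such as
   __builtin_ia32_vec_ext_v4sf (x, 4) is rejected with the error above,
   while selectors 0..3 are returned unchanged.  */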
36657 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36658 ix86_expand_vector_init. We DO have language-level syntax for this, in
36659 the form of (type){ init-list }. Except that since we can't place emms
36660 instructions from inside the compiler, we can't allow the use of MMX
36661 registers unless the user explicitly asks for it. So we do *not* define
36662 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
36663 we have builtins invoked by mmintrin.h that give us license to emit
36664 these sorts of instructions. */
36666 static rtx
36667 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
36669 machine_mode tmode = TYPE_MODE (type);
36670 machine_mode inner_mode = GET_MODE_INNER (tmode);
36671 int i, n_elt = GET_MODE_NUNITS (tmode);
36672 rtvec v = rtvec_alloc (n_elt);
36674 gcc_assert (VECTOR_MODE_P (tmode));
36675 gcc_assert (call_expr_nargs (exp) == n_elt);
36677 for (i = 0; i < n_elt; ++i)
36679 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
36680 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
36683 if (!target || !register_operand (target, tmode))
36684 target = gen_reg_rtx (tmode);
36686 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
36687 return target;
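/* A usage sketch (illustrative, assuming the usual mmintrin.h wrapper),
   compiled with -mmmx:

     #include <mmintrin.h>

     __m64
     make_pair (int hi, int lo)
     {
       return _mm_set_pi32 (hi, lo);
     }

   _mm_set_pi32 is built on __builtin_ia32_vec_init_v2si, which reaches
   this function through IX86_BUILTIN_VEC_INIT_V2SI below.  */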
36690 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36691 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
36692 had a language-level syntax for referencing vector elements. */
36694 static rtx
36695 ix86_expand_vec_ext_builtin (tree exp, rtx target)
36697 machine_mode tmode, mode0;
36698 tree arg0, arg1;
36699 int elt;
36700 rtx op0;
36702 arg0 = CALL_EXPR_ARG (exp, 0);
36703 arg1 = CALL_EXPR_ARG (exp, 1);
36705 op0 = expand_normal (arg0);
36706 elt = get_element_number (TREE_TYPE (arg0), arg1);
36708 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36709 mode0 = TYPE_MODE (TREE_TYPE (arg0));
36710 gcc_assert (VECTOR_MODE_P (mode0));
36712 op0 = force_reg (mode0, op0);
36714 if (optimize || !target || !register_operand (target, tmode))
36715 target = gen_reg_rtx (tmode);
36717 ix86_expand_vector_extract (true, target, op0, elt);
36719 return target;
36722 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
36723 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
36724 a language-level syntax for referencing vector elements. */
36726 static rtx
36727 ix86_expand_vec_set_builtin (tree exp)
36729 machine_mode tmode, mode1;
36730 tree arg0, arg1, arg2;
36731 int elt;
36732 rtx op0, op1, target;
36734 arg0 = CALL_EXPR_ARG (exp, 0);
36735 arg1 = CALL_EXPR_ARG (exp, 1);
36736 arg2 = CALL_EXPR_ARG (exp, 2);
36738 tmode = TYPE_MODE (TREE_TYPE (arg0));
36739 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
36740 gcc_assert (VECTOR_MODE_P (tmode));
36742 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
36743 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
36744 elt = get_element_number (TREE_TYPE (arg0), arg2);
36746 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
36747 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
36749 op0 = force_reg (tmode, op0);
36750 op1 = force_reg (mode1, op1);
36752 /* OP0 is the source of these builtin functions and shouldn't be
36753 modified. Create a copy, use it, and return it as the target. */
36754 target = gen_reg_rtx (tmode);
36755 emit_move_insn (target, op0);
36756 ix86_expand_vector_set (true, target, op1, elt);
36758 return target;
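/* A usage sketch (illustrative, assuming the usual xmmintrin.h wrapper),
   compiled with -msse:

     #include <xmmintrin.h>

     __m64
     set_lane2 (__m64 v, int x)
     {
       return _mm_insert_pi16 (v, x, 2);
     }

   _mm_insert_pi16 is built on __builtin_ia32_vec_set_v4hi, handled via
   IX86_BUILTIN_VEC_SET_V4HI below; note the copy of OP0 above, so the
   source operand itself is never modified.  */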
36761 /* Expand an expression EXP that calls a built-in function,
36762 with result going to TARGET if that's convenient
36763 (and in mode MODE if that's convenient).
36764 SUBTARGET may be used as the target for computing one of EXP's operands.
36765 IGNORE is nonzero if the value is to be ignored. */
36767 static rtx
36768 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
36769 machine_mode mode, int ignore)
36771 size_t i;
36772 enum insn_code icode, icode2;
36773 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
36774 tree arg0, arg1, arg2, arg3, arg4;
36775 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
36776 machine_mode mode0, mode1, mode2, mode3, mode4;
36777 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
36779 /* For CPU builtins that can be folded, fold first and expand the fold. */
36780 switch (fcode)
36782 case IX86_BUILTIN_CPU_INIT:
36784 /* Make it call __cpu_indicator_init in libgcc. */
36785 tree call_expr, fndecl, type;
36786 type = build_function_type_list (integer_type_node, NULL_TREE);
36787 fndecl = build_fn_decl ("__cpu_indicator_init", type);
36788 call_expr = build_call_expr (fndecl, 0);
36789 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
36791 case IX86_BUILTIN_CPU_IS:
36792 case IX86_BUILTIN_CPU_SUPPORTS:
36794 tree arg0 = CALL_EXPR_ARG (exp, 0);
36795 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
36796 gcc_assert (fold_expr != NULL_TREE);
36797 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
36801 HOST_WIDE_INT isa = ix86_isa_flags;
36802 HOST_WIDE_INT isa2 = ix86_isa_flags2;
36803 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
36804 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
36805 /* The general case is we require all the ISAs specified in bisa{,2}
36806 to be enabled.
36807 The exceptions are:
36808 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
36809 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
36810 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
36811 where for each such pair it is sufficient if either of the ISAs is
36812 enabled, plus, if it is ored with other options, those others as well. */
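/* Concrete illustration (not part of the original comment): a builtin
   whose isa mask is OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 is
   accepted when the function is compiled with either -mfma or -mfma4;
   only when neither is enabled does the "needs isa option" error below
   trigger.  */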
36813 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36814 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
36815 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
36816 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
36817 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36818 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
36819 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
36820 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
36821 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36822 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
36823 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
36824 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
36825 if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
36827 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
36828 (enum fpmath_unit) 0, false);
36829 if (!opts)
36830 error ("%qE needs unknown isa option", fndecl);
36831 else
36833 gcc_assert (opts != NULL);
36834 error ("%qE needs isa option %s", fndecl, opts);
36835 free (opts);
36837 return expand_call (exp, target, ignore);
36840 switch (fcode)
36842 case IX86_BUILTIN_MASKMOVQ:
36843 case IX86_BUILTIN_MASKMOVDQU:
36844 icode = (fcode == IX86_BUILTIN_MASKMOVQ
36845 ? CODE_FOR_mmx_maskmovq
36846 : CODE_FOR_sse2_maskmovdqu);
36847 /* Note the arg order is different from the operand order. */
36848 arg1 = CALL_EXPR_ARG (exp, 0);
36849 arg2 = CALL_EXPR_ARG (exp, 1);
36850 arg0 = CALL_EXPR_ARG (exp, 2);
36851 op0 = expand_normal (arg0);
36852 op1 = expand_normal (arg1);
36853 op2 = expand_normal (arg2);
36854 mode0 = insn_data[icode].operand[0].mode;
36855 mode1 = insn_data[icode].operand[1].mode;
36856 mode2 = insn_data[icode].operand[2].mode;
36858 op0 = ix86_zero_extend_to_Pmode (op0);
36859 op0 = gen_rtx_MEM (mode1, op0);
36861 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36862 op0 = copy_to_mode_reg (mode0, op0);
36863 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36864 op1 = copy_to_mode_reg (mode1, op1);
36865 if (!insn_data[icode].operand[2].predicate (op2, mode2))
36866 op2 = copy_to_mode_reg (mode2, op2);
36867 pat = GEN_FCN (icode) (op0, op1, op2);
36868 if (! pat)
36869 return 0;
36870 emit_insn (pat);
36871 return 0;
36873 case IX86_BUILTIN_LDMXCSR:
36874 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
36875 target = assign_386_stack_local (SImode, SLOT_TEMP);
36876 emit_move_insn (target, op0);
36877 emit_insn (gen_sse_ldmxcsr (target));
36878 return 0;
36880 case IX86_BUILTIN_STMXCSR:
36881 target = assign_386_stack_local (SImode, SLOT_TEMP);
36882 emit_insn (gen_sse_stmxcsr (target));
36883 return copy_to_mode_reg (SImode, target);
36885 case IX86_BUILTIN_CLFLUSH:
36886 arg0 = CALL_EXPR_ARG (exp, 0);
36887 op0 = expand_normal (arg0);
36888 icode = CODE_FOR_sse2_clflush;
36889 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36890 op0 = ix86_zero_extend_to_Pmode (op0);
36892 emit_insn (gen_sse2_clflush (op0));
36893 return 0;
36895 case IX86_BUILTIN_CLWB:
36896 arg0 = CALL_EXPR_ARG (exp, 0);
36897 op0 = expand_normal (arg0);
36898 icode = CODE_FOR_clwb;
36899 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36900 op0 = ix86_zero_extend_to_Pmode (op0);
36902 emit_insn (gen_clwb (op0));
36903 return 0;
36905 case IX86_BUILTIN_CLFLUSHOPT:
36906 arg0 = CALL_EXPR_ARG (exp, 0);
36907 op0 = expand_normal (arg0);
36908 icode = CODE_FOR_clflushopt;
36909 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36910 op0 = ix86_zero_extend_to_Pmode (op0);
36912 emit_insn (gen_clflushopt (op0));
36913 return 0;
36915 case IX86_BUILTIN_MONITOR:
36916 case IX86_BUILTIN_MONITORX:
36917 arg0 = CALL_EXPR_ARG (exp, 0);
36918 arg1 = CALL_EXPR_ARG (exp, 1);
36919 arg2 = CALL_EXPR_ARG (exp, 2);
36920 op0 = expand_normal (arg0);
36921 op1 = expand_normal (arg1);
36922 op2 = expand_normal (arg2);
36923 if (!REG_P (op0))
36924 op0 = ix86_zero_extend_to_Pmode (op0);
36925 if (!REG_P (op1))
36926 op1 = copy_to_mode_reg (SImode, op1);
36927 if (!REG_P (op2))
36928 op2 = copy_to_mode_reg (SImode, op2);
36930 emit_insn (fcode == IX86_BUILTIN_MONITOR
36931 ? ix86_gen_monitor (op0, op1, op2)
36932 : ix86_gen_monitorx (op0, op1, op2));
36933 return 0;
36935 case IX86_BUILTIN_MWAIT:
36936 arg0 = CALL_EXPR_ARG (exp, 0);
36937 arg1 = CALL_EXPR_ARG (exp, 1);
36938 op0 = expand_normal (arg0);
36939 op1 = expand_normal (arg1);
36940 if (!REG_P (op0))
36941 op0 = copy_to_mode_reg (SImode, op0);
36942 if (!REG_P (op1))
36943 op1 = copy_to_mode_reg (SImode, op1);
36944 emit_insn (gen_sse3_mwait (op0, op1));
36945 return 0;
36947 case IX86_BUILTIN_MWAITX:
36948 arg0 = CALL_EXPR_ARG (exp, 0);
36949 arg1 = CALL_EXPR_ARG (exp, 1);
36950 arg2 = CALL_EXPR_ARG (exp, 2);
36951 op0 = expand_normal (arg0);
36952 op1 = expand_normal (arg1);
36953 op2 = expand_normal (arg2);
36954 if (!REG_P (op0))
36955 op0 = copy_to_mode_reg (SImode, op0);
36956 if (!REG_P (op1))
36957 op1 = copy_to_mode_reg (SImode, op1);
36958 if (!REG_P (op2))
36959 op2 = copy_to_mode_reg (SImode, op2);
36960 emit_insn (gen_mwaitx (op0, op1, op2));
36961 return 0;
36963 case IX86_BUILTIN_UMONITOR:
36964 arg0 = CALL_EXPR_ARG (exp, 0);
36965 op0 = expand_normal (arg0);
36967 op0 = ix86_zero_extend_to_Pmode (op0);
36969 insn = (TARGET_64BIT
36970 ? gen_umonitor_di (op0)
36971 : gen_umonitor_si (op0));
36973 emit_insn (insn);
36974 return 0;
36976 case IX86_BUILTIN_UMWAIT:
36977 case IX86_BUILTIN_TPAUSE:
36978 arg0 = CALL_EXPR_ARG (exp, 0);
36979 arg1 = CALL_EXPR_ARG (exp, 1);
36980 op0 = expand_normal (arg0);
36981 op1 = expand_normal (arg1);
36983 if (!REG_P (op0))
36984 op0 = copy_to_mode_reg (SImode, op0);
36986 op1 = force_reg (DImode, op1);
36988 if (TARGET_64BIT)
36990 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36991 NULL, 1, OPTAB_DIRECT);
36992 switch (fcode)
36994 case IX86_BUILTIN_UMWAIT:
36995 icode = CODE_FOR_umwait_rex64;
36996 break;
36997 case IX86_BUILTIN_TPAUSE:
36998 icode = CODE_FOR_tpause_rex64;
36999 break;
37000 default:
37001 gcc_unreachable ();
37004 op2 = gen_lowpart (SImode, op2);
37005 op1 = gen_lowpart (SImode, op1);
37006 pat = GEN_FCN (icode) (op0, op1, op2);
37008 else
37010 switch (fcode)
37012 case IX86_BUILTIN_UMWAIT:
37013 icode = CODE_FOR_umwait;
37014 break;
37015 case IX86_BUILTIN_TPAUSE:
37016 icode = CODE_FOR_tpause;
37017 break;
37018 default:
37019 gcc_unreachable ();
37021 pat = GEN_FCN (icode) (op0, op1);
37024 if (!pat)
37025 return 0;
37027 emit_insn (pat);
37029 if (target == 0
37030 || !register_operand (target, QImode))
37031 target = gen_reg_rtx (QImode);
37033 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37034 const0_rtx);
37035 emit_insn (gen_rtx_SET (target, pat));
37037 return target;
37039 case IX86_BUILTIN_CLZERO:
37040 arg0 = CALL_EXPR_ARG (exp, 0);
37041 op0 = expand_normal (arg0);
37042 if (!REG_P (op0))
37043 op0 = ix86_zero_extend_to_Pmode (op0);
37044 emit_insn (ix86_gen_clzero (op0));
37045 return 0;
37047 case IX86_BUILTIN_CLDEMOTE:
37048 arg0 = CALL_EXPR_ARG (exp, 0);
37049 op0 = expand_normal (arg0);
37050 icode = CODE_FOR_cldemote;
37051 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37052 op0 = ix86_zero_extend_to_Pmode (op0);
37054 emit_insn (gen_cldemote (op0));
37055 return 0;
37057 case IX86_BUILTIN_VEC_INIT_V2SI:
37058 case IX86_BUILTIN_VEC_INIT_V4HI:
37059 case IX86_BUILTIN_VEC_INIT_V8QI:
37060 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
37062 case IX86_BUILTIN_VEC_EXT_V2DF:
37063 case IX86_BUILTIN_VEC_EXT_V2DI:
37064 case IX86_BUILTIN_VEC_EXT_V4SF:
37065 case IX86_BUILTIN_VEC_EXT_V4SI:
37066 case IX86_BUILTIN_VEC_EXT_V8HI:
37067 case IX86_BUILTIN_VEC_EXT_V2SI:
37068 case IX86_BUILTIN_VEC_EXT_V4HI:
37069 case IX86_BUILTIN_VEC_EXT_V16QI:
37070 return ix86_expand_vec_ext_builtin (exp, target);
37072 case IX86_BUILTIN_VEC_SET_V2DI:
37073 case IX86_BUILTIN_VEC_SET_V4SF:
37074 case IX86_BUILTIN_VEC_SET_V4SI:
37075 case IX86_BUILTIN_VEC_SET_V8HI:
37076 case IX86_BUILTIN_VEC_SET_V4HI:
37077 case IX86_BUILTIN_VEC_SET_V16QI:
37078 return ix86_expand_vec_set_builtin (exp);
37080 case IX86_BUILTIN_NANQ:
37081 case IX86_BUILTIN_NANSQ:
37082 return expand_call (exp, target, ignore);
37084 case IX86_BUILTIN_RDPID:
37086 op0 = gen_reg_rtx (word_mode);
37088 if (TARGET_64BIT)
37090 insn = gen_rdpid_rex64 (op0);
37091 op0 = convert_to_mode (SImode, op0, 1);
37093 else
37094 insn = gen_rdpid (op0);
37096 emit_insn (insn);
37098 if (target == 0
37099 || !register_operand (target, SImode))
37100 target = gen_reg_rtx (SImode);
37102 emit_move_insn (target, op0);
37103 return target;
37105 case IX86_BUILTIN_RDPMC:
37106 case IX86_BUILTIN_RDTSC:
37107 case IX86_BUILTIN_RDTSCP:
37108 case IX86_BUILTIN_XGETBV:
37110 op0 = gen_reg_rtx (DImode);
37111 op1 = gen_reg_rtx (DImode);
37113 if (fcode == IX86_BUILTIN_RDPMC)
37115 arg0 = CALL_EXPR_ARG (exp, 0);
37116 op2 = expand_normal (arg0);
37117 if (!register_operand (op2, SImode))
37118 op2 = copy_to_mode_reg (SImode, op2);
37120 insn = (TARGET_64BIT
37121 ? gen_rdpmc_rex64 (op0, op1, op2)
37122 : gen_rdpmc (op0, op2));
37123 emit_insn (insn);
37125 else if (fcode == IX86_BUILTIN_XGETBV)
37127 arg0 = CALL_EXPR_ARG (exp, 0);
37128 op2 = expand_normal (arg0);
37129 if (!register_operand (op2, SImode))
37130 op2 = copy_to_mode_reg (SImode, op2);
37132 insn = (TARGET_64BIT
37133 ? gen_xgetbv_rex64 (op0, op1, op2)
37134 : gen_xgetbv (op0, op2));
37135 emit_insn (insn);
37137 else if (fcode == IX86_BUILTIN_RDTSC)
37139 insn = (TARGET_64BIT
37140 ? gen_rdtsc_rex64 (op0, op1)
37141 : gen_rdtsc (op0));
37142 emit_insn (insn);
37144 else
37146 op2 = gen_reg_rtx (SImode);
37148 insn = (TARGET_64BIT
37149 ? gen_rdtscp_rex64 (op0, op1, op2)
37150 : gen_rdtscp (op0, op2));
37151 emit_insn (insn);
37153 arg0 = CALL_EXPR_ARG (exp, 0);
37154 op4 = expand_normal (arg0);
37155 if (!address_operand (op4, VOIDmode))
37157 op4 = convert_memory_address (Pmode, op4);
37158 op4 = copy_addr_to_reg (op4);
37160 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
37163 if (target == 0
37164 || !register_operand (target, DImode))
37165 target = gen_reg_rtx (DImode);
37167 if (TARGET_64BIT)
37169 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
37170 op1, 1, OPTAB_DIRECT);
37171 op0 = expand_simple_binop (DImode, IOR, op0, op1,
37172 op0, 1, OPTAB_DIRECT);
37175 emit_move_insn (target, op0);
37176 return target;
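/* A usage sketch (illustrative, assuming the x86intrin.h wrapper):

     #include <x86intrin.h>

     unsigned long long
     timestamp (void)
     {
       return __rdtsc ();
     }

   On 64-bit targets the low and high halves produced by the insn are
   recombined with the shift/ior sequence above; on 32-bit targets the
   DImode result is written directly.  */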
37178 case IX86_BUILTIN_MOVDIR64B:
37180 arg0 = CALL_EXPR_ARG (exp, 0);
37181 arg1 = CALL_EXPR_ARG (exp, 1);
37182 op0 = expand_normal (arg0);
37183 op1 = expand_normal (arg1);
37185 op0 = ix86_zero_extend_to_Pmode (op0);
37186 if (!address_operand (op1, VOIDmode))
37188 op1 = convert_memory_address (Pmode, op1);
37189 op1 = copy_addr_to_reg (op1);
37191 op1 = gen_rtx_MEM (XImode, op1);
37193 insn = (TARGET_64BIT
37194 ? gen_movdir64b_di (op0, op1)
37195 : gen_movdir64b_si (op0, op1));
37196 emit_insn (insn);
37197 return 0;
37199 case IX86_BUILTIN_FXSAVE:
37200 case IX86_BUILTIN_FXRSTOR:
37201 case IX86_BUILTIN_FXSAVE64:
37202 case IX86_BUILTIN_FXRSTOR64:
37203 case IX86_BUILTIN_FNSTENV:
37204 case IX86_BUILTIN_FLDENV:
37205 mode0 = BLKmode;
37206 switch (fcode)
37208 case IX86_BUILTIN_FXSAVE:
37209 icode = CODE_FOR_fxsave;
37210 break;
37211 case IX86_BUILTIN_FXRSTOR:
37212 icode = CODE_FOR_fxrstor;
37213 break;
37214 case IX86_BUILTIN_FXSAVE64:
37215 icode = CODE_FOR_fxsave64;
37216 break;
37217 case IX86_BUILTIN_FXRSTOR64:
37218 icode = CODE_FOR_fxrstor64;
37219 break;
37220 case IX86_BUILTIN_FNSTENV:
37221 icode = CODE_FOR_fnstenv;
37222 break;
37223 case IX86_BUILTIN_FLDENV:
37224 icode = CODE_FOR_fldenv;
37225 break;
37226 default:
37227 gcc_unreachable ();
37230 arg0 = CALL_EXPR_ARG (exp, 0);
37231 op0 = expand_normal (arg0);
37233 if (!address_operand (op0, VOIDmode))
37235 op0 = convert_memory_address (Pmode, op0);
37236 op0 = copy_addr_to_reg (op0);
37238 op0 = gen_rtx_MEM (mode0, op0);
37240 pat = GEN_FCN (icode) (op0);
37241 if (pat)
37242 emit_insn (pat);
37243 return 0;
37245 case IX86_BUILTIN_XSETBV:
37246 arg0 = CALL_EXPR_ARG (exp, 0);
37247 arg1 = CALL_EXPR_ARG (exp, 1);
37248 op0 = expand_normal (arg0);
37249 op1 = expand_normal (arg1);
37251 if (!REG_P (op0))
37252 op0 = copy_to_mode_reg (SImode, op0);
37254 op1 = force_reg (DImode, op1);
37256 if (TARGET_64BIT)
37258 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37259 NULL, 1, OPTAB_DIRECT);
37261 icode = CODE_FOR_xsetbv_rex64;
37263 op2 = gen_lowpart (SImode, op2);
37264 op1 = gen_lowpart (SImode, op1);
37265 pat = GEN_FCN (icode) (op0, op1, op2);
37267 else
37269 icode = CODE_FOR_xsetbv;
37271 pat = GEN_FCN (icode) (op0, op1);
37273 if (pat)
37274 emit_insn (pat);
37275 return 0;
37277 case IX86_BUILTIN_XSAVE:
37278 case IX86_BUILTIN_XRSTOR:
37279 case IX86_BUILTIN_XSAVE64:
37280 case IX86_BUILTIN_XRSTOR64:
37281 case IX86_BUILTIN_XSAVEOPT:
37282 case IX86_BUILTIN_XSAVEOPT64:
37283 case IX86_BUILTIN_XSAVES:
37284 case IX86_BUILTIN_XRSTORS:
37285 case IX86_BUILTIN_XSAVES64:
37286 case IX86_BUILTIN_XRSTORS64:
37287 case IX86_BUILTIN_XSAVEC:
37288 case IX86_BUILTIN_XSAVEC64:
37289 arg0 = CALL_EXPR_ARG (exp, 0);
37290 arg1 = CALL_EXPR_ARG (exp, 1);
37291 op0 = expand_normal (arg0);
37292 op1 = expand_normal (arg1);
37294 if (!address_operand (op0, VOIDmode))
37296 op0 = convert_memory_address (Pmode, op0);
37297 op0 = copy_addr_to_reg (op0);
37299 op0 = gen_rtx_MEM (BLKmode, op0);
37301 op1 = force_reg (DImode, op1);
37303 if (TARGET_64BIT)
37305 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
37306 NULL, 1, OPTAB_DIRECT);
37307 switch (fcode)
37309 case IX86_BUILTIN_XSAVE:
37310 icode = CODE_FOR_xsave_rex64;
37311 break;
37312 case IX86_BUILTIN_XRSTOR:
37313 icode = CODE_FOR_xrstor_rex64;
37314 break;
37315 case IX86_BUILTIN_XSAVE64:
37316 icode = CODE_FOR_xsave64;
37317 break;
37318 case IX86_BUILTIN_XRSTOR64:
37319 icode = CODE_FOR_xrstor64;
37320 break;
37321 case IX86_BUILTIN_XSAVEOPT:
37322 icode = CODE_FOR_xsaveopt_rex64;
37323 break;
37324 case IX86_BUILTIN_XSAVEOPT64:
37325 icode = CODE_FOR_xsaveopt64;
37326 break;
37327 case IX86_BUILTIN_XSAVES:
37328 icode = CODE_FOR_xsaves_rex64;
37329 break;
37330 case IX86_BUILTIN_XRSTORS:
37331 icode = CODE_FOR_xrstors_rex64;
37332 break;
37333 case IX86_BUILTIN_XSAVES64:
37334 icode = CODE_FOR_xsaves64;
37335 break;
37336 case IX86_BUILTIN_XRSTORS64:
37337 icode = CODE_FOR_xrstors64;
37338 break;
37339 case IX86_BUILTIN_XSAVEC:
37340 icode = CODE_FOR_xsavec_rex64;
37341 break;
37342 case IX86_BUILTIN_XSAVEC64:
37343 icode = CODE_FOR_xsavec64;
37344 break;
37345 default:
37346 gcc_unreachable ();
37349 op2 = gen_lowpart (SImode, op2);
37350 op1 = gen_lowpart (SImode, op1);
37351 pat = GEN_FCN (icode) (op0, op1, op2);
37353 else
37355 switch (fcode)
37357 case IX86_BUILTIN_XSAVE:
37358 icode = CODE_FOR_xsave;
37359 break;
37360 case IX86_BUILTIN_XRSTOR:
37361 icode = CODE_FOR_xrstor;
37362 break;
37363 case IX86_BUILTIN_XSAVEOPT:
37364 icode = CODE_FOR_xsaveopt;
37365 break;
37366 case IX86_BUILTIN_XSAVES:
37367 icode = CODE_FOR_xsaves;
37368 break;
37369 case IX86_BUILTIN_XRSTORS:
37370 icode = CODE_FOR_xrstors;
37371 break;
37372 case IX86_BUILTIN_XSAVEC:
37373 icode = CODE_FOR_xsavec;
37374 break;
37375 default:
37376 gcc_unreachable ();
37378 pat = GEN_FCN (icode) (op0, op1);
37381 if (pat)
37382 emit_insn (pat);
37383 return 0;
37385 case IX86_BUILTIN_LLWPCB:
37386 arg0 = CALL_EXPR_ARG (exp, 0);
37387 op0 = expand_normal (arg0);
37388 icode = CODE_FOR_lwp_llwpcb;
37389 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37390 op0 = ix86_zero_extend_to_Pmode (op0);
37391 emit_insn (gen_lwp_llwpcb (op0));
37392 return 0;
37394 case IX86_BUILTIN_SLWPCB:
37395 icode = CODE_FOR_lwp_slwpcb;
37396 if (!target
37397 || !insn_data[icode].operand[0].predicate (target, Pmode))
37398 target = gen_reg_rtx (Pmode);
37399 emit_insn (gen_lwp_slwpcb (target));
37400 return target;
37402 case IX86_BUILTIN_BEXTRI32:
37403 case IX86_BUILTIN_BEXTRI64:
37404 arg0 = CALL_EXPR_ARG (exp, 0);
37405 arg1 = CALL_EXPR_ARG (exp, 1);
37406 op0 = expand_normal (arg0);
37407 op1 = expand_normal (arg1);
37408 icode = (fcode == IX86_BUILTIN_BEXTRI32
37409 ? CODE_FOR_tbm_bextri_si
37410 : CODE_FOR_tbm_bextri_di);
37411 if (!CONST_INT_P (op1))
37413 error ("last argument must be an immediate");
37414 return const0_rtx;
37416 else
37418 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
37419 unsigned char lsb_index = INTVAL (op1) & 0xFF;
37420 op1 = GEN_INT (length);
37421 op2 = GEN_INT (lsb_index);
37422 pat = GEN_FCN (icode) (target, op0, op1, op2);
37423 if (pat)
37424 emit_insn (pat);
37425 return target;
37428 case IX86_BUILTIN_RDRAND16_STEP:
37429 icode = CODE_FOR_rdrandhi_1;
37430 mode0 = HImode;
37431 goto rdrand_step;
37433 case IX86_BUILTIN_RDRAND32_STEP:
37434 icode = CODE_FOR_rdrandsi_1;
37435 mode0 = SImode;
37436 goto rdrand_step;
37438 case IX86_BUILTIN_RDRAND64_STEP:
37439 icode = CODE_FOR_rdranddi_1;
37440 mode0 = DImode;
37442 rdrand_step:
37443 arg0 = CALL_EXPR_ARG (exp, 0);
37444 op1 = expand_normal (arg0);
37445 if (!address_operand (op1, VOIDmode))
37447 op1 = convert_memory_address (Pmode, op1);
37448 op1 = copy_addr_to_reg (op1);
37451 op0 = gen_reg_rtx (mode0);
37452 emit_insn (GEN_FCN (icode) (op0));
37454 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37456 op1 = gen_reg_rtx (SImode);
37457 emit_move_insn (op1, CONST1_RTX (SImode));
37459 /* Emit SImode conditional move. */
37460 if (mode0 == HImode)
37462 if (TARGET_ZERO_EXTEND_WITH_AND
37463 && optimize_function_for_speed_p (cfun))
37465 op2 = force_reg (SImode, const0_rtx);
37467 emit_insn (gen_movstricthi
37468 (gen_lowpart (HImode, op2), op0));
37470 else
37472 op2 = gen_reg_rtx (SImode);
37474 emit_insn (gen_zero_extendhisi2 (op2, op0));
37477 else if (mode0 == SImode)
37478 op2 = op0;
37479 else
37480 op2 = gen_rtx_SUBREG (SImode, op0, 0);
37482 if (target == 0
37483 || !register_operand (target, SImode))
37484 target = gen_reg_rtx (SImode);
37486 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
37487 const0_rtx);
37488 emit_insn (gen_rtx_SET (target,
37489 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
37490 return target;
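/* A usage sketch (illustrative, assuming the immintrin.h wrapper),
   compiled with -mrdrnd:

     #include <immintrin.h>

     int
     get_random (unsigned int *out)
     {
       return _rdrand32_step (out);
     }

   The generated value is stored through the pointer and the return value
   is a success flag derived from CF, as in the conditional move above.  */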
37492 case IX86_BUILTIN_RDSEED16_STEP:
37493 icode = CODE_FOR_rdseedhi_1;
37494 mode0 = HImode;
37495 goto rdseed_step;
37497 case IX86_BUILTIN_RDSEED32_STEP:
37498 icode = CODE_FOR_rdseedsi_1;
37499 mode0 = SImode;
37500 goto rdseed_step;
37502 case IX86_BUILTIN_RDSEED64_STEP:
37503 icode = CODE_FOR_rdseeddi_1;
37504 mode0 = DImode;
37506 rdseed_step:
37507 arg0 = CALL_EXPR_ARG (exp, 0);
37508 op1 = expand_normal (arg0);
37509 if (!address_operand (op1, VOIDmode))
37511 op1 = convert_memory_address (Pmode, op1);
37512 op1 = copy_addr_to_reg (op1);
37515 op0 = gen_reg_rtx (mode0);
37516 emit_insn (GEN_FCN (icode) (op0));
37518 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
37520 op2 = gen_reg_rtx (QImode);
37522 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
37523 const0_rtx);
37524 emit_insn (gen_rtx_SET (op2, pat));
37526 if (target == 0
37527 || !register_operand (target, SImode))
37528 target = gen_reg_rtx (SImode);
37530 emit_insn (gen_zero_extendqisi2 (target, op2));
37531 return target;
37533 case IX86_BUILTIN_SBB32:
37534 icode = CODE_FOR_subborrowsi;
37535 icode2 = CODE_FOR_subborrowsi_0;
37536 mode0 = SImode;
37537 mode1 = DImode;
37538 mode2 = CCmode;
37539 goto handlecarry;
37541 case IX86_BUILTIN_SBB64:
37542 icode = CODE_FOR_subborrowdi;
37543 icode2 = CODE_FOR_subborrowdi_0;
37544 mode0 = DImode;
37545 mode1 = TImode;
37546 mode2 = CCmode;
37547 goto handlecarry;
37549 case IX86_BUILTIN_ADDCARRYX32:
37550 icode = CODE_FOR_addcarrysi;
37551 icode2 = CODE_FOR_addcarrysi_0;
37552 mode0 = SImode;
37553 mode1 = DImode;
37554 mode2 = CCCmode;
37555 goto handlecarry;
37557 case IX86_BUILTIN_ADDCARRYX64:
37558 icode = CODE_FOR_addcarrydi;
37559 icode2 = CODE_FOR_addcarrydi_0;
37560 mode0 = DImode;
37561 mode1 = TImode;
37562 mode2 = CCCmode;
37564 handlecarry:
37565 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
37566 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
37567 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
37568 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
37570 op1 = expand_normal (arg0);
37571 if (!integer_zerop (arg0))
37572 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
37574 op2 = expand_normal (arg1);
37575 if (!register_operand (op2, mode0))
37576 op2 = copy_to_mode_reg (mode0, op2);
37578 op3 = expand_normal (arg2);
37579 if (!register_operand (op3, mode0))
37580 op3 = copy_to_mode_reg (mode0, op3);
37582 op4 = expand_normal (arg3);
37583 if (!address_operand (op4, VOIDmode))
37585 op4 = convert_memory_address (Pmode, op4);
37586 op4 = copy_addr_to_reg (op4);
37589 op0 = gen_reg_rtx (mode0);
37590 if (integer_zerop (arg0))
37592 /* If arg0 is 0, optimize right away into an add or sub
37593 instruction that sets the CCCmode flags. */
37594 op1 = gen_rtx_REG (mode2, FLAGS_REG);
37595 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
37597 else
37599 /* Generate CF from input operand. */
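/* Illustrative note (not in the original sources): adding -1 (0xff in
   QImode) carries out exactly when op1 is nonzero, so this loads CF with
   the incoming c_in argument.  */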
37600 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
37602 /* Generate instruction that consumes CF. */
37603 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
37604 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
37605 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
37606 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
37609 /* Return current CF value. */
37610 if (target == 0)
37611 target = gen_reg_rtx (QImode);
37613 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
37614 emit_insn (gen_rtx_SET (target, pat));
37616 /* Store the result. */
37617 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
37619 return target;
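/* A usage sketch (illustrative, assuming the adxintrin.h wrapper),
   compiled with -madx:

     #include <x86intrin.h>

     unsigned char
     add_64_as_32x2 (unsigned int a0, unsigned int a1,
                     unsigned int b0, unsigned int b1,
                     unsigned int *lo, unsigned int *hi)
     {
       unsigned char c = _addcarryx_u32 (0, a0, b0, lo);
       return _addcarryx_u32 (c, a1, b1, hi);
     }

   A literal zero carry-in can take the icode2 (plain add/sub) path above;
   otherwise CF is reconstructed from c_in first and then consumed through
   the LTU patterns.  */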
37621 case IX86_BUILTIN_READ_FLAGS:
37622 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
37624 if (optimize
37625 || target == NULL_RTX
37626 || !nonimmediate_operand (target, word_mode)
37627 || GET_MODE (target) != word_mode)
37628 target = gen_reg_rtx (word_mode);
37630 emit_insn (gen_pop (target));
37631 return target;
37633 case IX86_BUILTIN_WRITE_FLAGS:
37635 arg0 = CALL_EXPR_ARG (exp, 0);
37636 op0 = expand_normal (arg0);
37637 if (!general_no_elim_operand (op0, word_mode))
37638 op0 = copy_to_mode_reg (word_mode, op0);
37640 emit_insn (gen_push (op0));
37641 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
37642 return 0;
37644 case IX86_BUILTIN_KTESTC8:
37645 icode = CODE_FOR_ktestqi;
37646 mode3 = CCCmode;
37647 goto kortest;
37649 case IX86_BUILTIN_KTESTZ8:
37650 icode = CODE_FOR_ktestqi;
37651 mode3 = CCZmode;
37652 goto kortest;
37654 case IX86_BUILTIN_KTESTC16:
37655 icode = CODE_FOR_ktesthi;
37656 mode3 = CCCmode;
37657 goto kortest;
37659 case IX86_BUILTIN_KTESTZ16:
37660 icode = CODE_FOR_ktesthi;
37661 mode3 = CCZmode;
37662 goto kortest;
37664 case IX86_BUILTIN_KTESTC32:
37665 icode = CODE_FOR_ktestsi;
37666 mode3 = CCCmode;
37667 goto kortest;
37669 case IX86_BUILTIN_KTESTZ32:
37670 icode = CODE_FOR_ktestsi;
37671 mode3 = CCZmode;
37672 goto kortest;
37674 case IX86_BUILTIN_KTESTC64:
37675 icode = CODE_FOR_ktestdi;
37676 mode3 = CCCmode;
37677 goto kortest;
37679 case IX86_BUILTIN_KTESTZ64:
37680 icode = CODE_FOR_ktestdi;
37681 mode3 = CCZmode;
37682 goto kortest;
37684 case IX86_BUILTIN_KORTESTC8:
37685 icode = CODE_FOR_kortestqi;
37686 mode3 = CCCmode;
37687 goto kortest;
37689 case IX86_BUILTIN_KORTESTZ8:
37690 icode = CODE_FOR_kortestqi;
37691 mode3 = CCZmode;
37692 goto kortest;
37694 case IX86_BUILTIN_KORTESTC16:
37695 icode = CODE_FOR_kortesthi;
37696 mode3 = CCCmode;
37697 goto kortest;
37699 case IX86_BUILTIN_KORTESTZ16:
37700 icode = CODE_FOR_kortesthi;
37701 mode3 = CCZmode;
37702 goto kortest;
37704 case IX86_BUILTIN_KORTESTC32:
37705 icode = CODE_FOR_kortestsi;
37706 mode3 = CCCmode;
37707 goto kortest;
37709 case IX86_BUILTIN_KORTESTZ32:
37710 icode = CODE_FOR_kortestsi;
37711 mode3 = CCZmode;
37712 goto kortest;
37714 case IX86_BUILTIN_KORTESTC64:
37715 icode = CODE_FOR_kortestdi;
37716 mode3 = CCCmode;
37717 goto kortest;
37719 case IX86_BUILTIN_KORTESTZ64:
37720 icode = CODE_FOR_kortestdi;
37721 mode3 = CCZmode;
37723 kortest:
37724 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
37725 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
37726 op0 = expand_normal (arg0);
37727 op1 = expand_normal (arg1);
37729 mode0 = insn_data[icode].operand[0].mode;
37730 mode1 = insn_data[icode].operand[1].mode;
37732 if (GET_MODE (op0) != VOIDmode)
37733 op0 = force_reg (GET_MODE (op0), op0);
37735 op0 = gen_lowpart (mode0, op0);
37737 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37738 op0 = copy_to_mode_reg (mode0, op0);
37740 if (GET_MODE (op1) != VOIDmode)
37741 op1 = force_reg (GET_MODE (op1), op1);
37743 op1 = gen_lowpart (mode1, op1);
37745 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37746 op1 = copy_to_mode_reg (mode1, op1);
37748 target = gen_reg_rtx (QImode);
37750 /* Emit kortest. */
37751 emit_insn (GEN_FCN (icode) (op0, op1));
37752 /* And use setcc to return the result from the flags. */
37753 ix86_expand_setcc (target, EQ,
37754 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
37755 return target;
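/* A usage sketch (illustrative, assuming the avx512fintrin.h wrapper),
   compiled with -mavx512f:

     #include <immintrin.h>

     int
     masks_or_to_zero (__mmask16 a, __mmask16 b)
     {
       return _mm512_kortestz (a, b);
     }

   The kortest insn sets ZF when the OR of the two masks is zero, and the
   setcc above turns the selected flag (CCZmode here) into the QImode
   result.  */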
37757 case IX86_BUILTIN_GATHERSIV2DF:
37758 icode = CODE_FOR_avx2_gathersiv2df;
37759 goto gather_gen;
37760 case IX86_BUILTIN_GATHERSIV4DF:
37761 icode = CODE_FOR_avx2_gathersiv4df;
37762 goto gather_gen;
37763 case IX86_BUILTIN_GATHERDIV2DF:
37764 icode = CODE_FOR_avx2_gatherdiv2df;
37765 goto gather_gen;
37766 case IX86_BUILTIN_GATHERDIV4DF:
37767 icode = CODE_FOR_avx2_gatherdiv4df;
37768 goto gather_gen;
37769 case IX86_BUILTIN_GATHERSIV4SF:
37770 icode = CODE_FOR_avx2_gathersiv4sf;
37771 goto gather_gen;
37772 case IX86_BUILTIN_GATHERSIV8SF:
37773 icode = CODE_FOR_avx2_gathersiv8sf;
37774 goto gather_gen;
37775 case IX86_BUILTIN_GATHERDIV4SF:
37776 icode = CODE_FOR_avx2_gatherdiv4sf;
37777 goto gather_gen;
37778 case IX86_BUILTIN_GATHERDIV8SF:
37779 icode = CODE_FOR_avx2_gatherdiv8sf;
37780 goto gather_gen;
37781 case IX86_BUILTIN_GATHERSIV2DI:
37782 icode = CODE_FOR_avx2_gathersiv2di;
37783 goto gather_gen;
37784 case IX86_BUILTIN_GATHERSIV4DI:
37785 icode = CODE_FOR_avx2_gathersiv4di;
37786 goto gather_gen;
37787 case IX86_BUILTIN_GATHERDIV2DI:
37788 icode = CODE_FOR_avx2_gatherdiv2di;
37789 goto gather_gen;
37790 case IX86_BUILTIN_GATHERDIV4DI:
37791 icode = CODE_FOR_avx2_gatherdiv4di;
37792 goto gather_gen;
37793 case IX86_BUILTIN_GATHERSIV4SI:
37794 icode = CODE_FOR_avx2_gathersiv4si;
37795 goto gather_gen;
37796 case IX86_BUILTIN_GATHERSIV8SI:
37797 icode = CODE_FOR_avx2_gathersiv8si;
37798 goto gather_gen;
37799 case IX86_BUILTIN_GATHERDIV4SI:
37800 icode = CODE_FOR_avx2_gatherdiv4si;
37801 goto gather_gen;
37802 case IX86_BUILTIN_GATHERDIV8SI:
37803 icode = CODE_FOR_avx2_gatherdiv8si;
37804 goto gather_gen;
37805 case IX86_BUILTIN_GATHERALTSIV4DF:
37806 icode = CODE_FOR_avx2_gathersiv4df;
37807 goto gather_gen;
37808 case IX86_BUILTIN_GATHERALTDIV8SF:
37809 icode = CODE_FOR_avx2_gatherdiv8sf;
37810 goto gather_gen;
37811 case IX86_BUILTIN_GATHERALTSIV4DI:
37812 icode = CODE_FOR_avx2_gathersiv4di;
37813 goto gather_gen;
37814 case IX86_BUILTIN_GATHERALTDIV8SI:
37815 icode = CODE_FOR_avx2_gatherdiv8si;
37816 goto gather_gen;
37817 case IX86_BUILTIN_GATHER3SIV16SF:
37818 icode = CODE_FOR_avx512f_gathersiv16sf;
37819 goto gather_gen;
37820 case IX86_BUILTIN_GATHER3SIV8DF:
37821 icode = CODE_FOR_avx512f_gathersiv8df;
37822 goto gather_gen;
37823 case IX86_BUILTIN_GATHER3DIV16SF:
37824 icode = CODE_FOR_avx512f_gatherdiv16sf;
37825 goto gather_gen;
37826 case IX86_BUILTIN_GATHER3DIV8DF:
37827 icode = CODE_FOR_avx512f_gatherdiv8df;
37828 goto gather_gen;
37829 case IX86_BUILTIN_GATHER3SIV16SI:
37830 icode = CODE_FOR_avx512f_gathersiv16si;
37831 goto gather_gen;
37832 case IX86_BUILTIN_GATHER3SIV8DI:
37833 icode = CODE_FOR_avx512f_gathersiv8di;
37834 goto gather_gen;
37835 case IX86_BUILTIN_GATHER3DIV16SI:
37836 icode = CODE_FOR_avx512f_gatherdiv16si;
37837 goto gather_gen;
37838 case IX86_BUILTIN_GATHER3DIV8DI:
37839 icode = CODE_FOR_avx512f_gatherdiv8di;
37840 goto gather_gen;
37841 case IX86_BUILTIN_GATHER3ALTSIV8DF:
37842 icode = CODE_FOR_avx512f_gathersiv8df;
37843 goto gather_gen;
37844 case IX86_BUILTIN_GATHER3ALTDIV16SF:
37845 icode = CODE_FOR_avx512f_gatherdiv16sf;
37846 goto gather_gen;
37847 case IX86_BUILTIN_GATHER3ALTSIV8DI:
37848 icode = CODE_FOR_avx512f_gathersiv8di;
37849 goto gather_gen;
37850 case IX86_BUILTIN_GATHER3ALTDIV16SI:
37851 icode = CODE_FOR_avx512f_gatherdiv16si;
37852 goto gather_gen;
37853 case IX86_BUILTIN_GATHER3SIV2DF:
37854 icode = CODE_FOR_avx512vl_gathersiv2df;
37855 goto gather_gen;
37856 case IX86_BUILTIN_GATHER3SIV4DF:
37857 icode = CODE_FOR_avx512vl_gathersiv4df;
37858 goto gather_gen;
37859 case IX86_BUILTIN_GATHER3DIV2DF:
37860 icode = CODE_FOR_avx512vl_gatherdiv2df;
37861 goto gather_gen;
37862 case IX86_BUILTIN_GATHER3DIV4DF:
37863 icode = CODE_FOR_avx512vl_gatherdiv4df;
37864 goto gather_gen;
37865 case IX86_BUILTIN_GATHER3SIV4SF:
37866 icode = CODE_FOR_avx512vl_gathersiv4sf;
37867 goto gather_gen;
37868 case IX86_BUILTIN_GATHER3SIV8SF:
37869 icode = CODE_FOR_avx512vl_gathersiv8sf;
37870 goto gather_gen;
37871 case IX86_BUILTIN_GATHER3DIV4SF:
37872 icode = CODE_FOR_avx512vl_gatherdiv4sf;
37873 goto gather_gen;
37874 case IX86_BUILTIN_GATHER3DIV8SF:
37875 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37876 goto gather_gen;
37877 case IX86_BUILTIN_GATHER3SIV2DI:
37878 icode = CODE_FOR_avx512vl_gathersiv2di;
37879 goto gather_gen;
37880 case IX86_BUILTIN_GATHER3SIV4DI:
37881 icode = CODE_FOR_avx512vl_gathersiv4di;
37882 goto gather_gen;
37883 case IX86_BUILTIN_GATHER3DIV2DI:
37884 icode = CODE_FOR_avx512vl_gatherdiv2di;
37885 goto gather_gen;
37886 case IX86_BUILTIN_GATHER3DIV4DI:
37887 icode = CODE_FOR_avx512vl_gatherdiv4di;
37888 goto gather_gen;
37889 case IX86_BUILTIN_GATHER3SIV4SI:
37890 icode = CODE_FOR_avx512vl_gathersiv4si;
37891 goto gather_gen;
37892 case IX86_BUILTIN_GATHER3SIV8SI:
37893 icode = CODE_FOR_avx512vl_gathersiv8si;
37894 goto gather_gen;
37895 case IX86_BUILTIN_GATHER3DIV4SI:
37896 icode = CODE_FOR_avx512vl_gatherdiv4si;
37897 goto gather_gen;
37898 case IX86_BUILTIN_GATHER3DIV8SI:
37899 icode = CODE_FOR_avx512vl_gatherdiv8si;
37900 goto gather_gen;
37901 case IX86_BUILTIN_GATHER3ALTSIV4DF:
37902 icode = CODE_FOR_avx512vl_gathersiv4df;
37903 goto gather_gen;
37904 case IX86_BUILTIN_GATHER3ALTDIV8SF:
37905 icode = CODE_FOR_avx512vl_gatherdiv8sf;
37906 goto gather_gen;
37907 case IX86_BUILTIN_GATHER3ALTSIV4DI:
37908 icode = CODE_FOR_avx512vl_gathersiv4di;
37909 goto gather_gen;
37910 case IX86_BUILTIN_GATHER3ALTDIV8SI:
37911 icode = CODE_FOR_avx512vl_gatherdiv8si;
37912 goto gather_gen;
37913 case IX86_BUILTIN_SCATTERSIV16SF:
37914 icode = CODE_FOR_avx512f_scattersiv16sf;
37915 goto scatter_gen;
37916 case IX86_BUILTIN_SCATTERSIV8DF:
37917 icode = CODE_FOR_avx512f_scattersiv8df;
37918 goto scatter_gen;
37919 case IX86_BUILTIN_SCATTERDIV16SF:
37920 icode = CODE_FOR_avx512f_scatterdiv16sf;
37921 goto scatter_gen;
37922 case IX86_BUILTIN_SCATTERDIV8DF:
37923 icode = CODE_FOR_avx512f_scatterdiv8df;
37924 goto scatter_gen;
37925 case IX86_BUILTIN_SCATTERSIV16SI:
37926 icode = CODE_FOR_avx512f_scattersiv16si;
37927 goto scatter_gen;
37928 case IX86_BUILTIN_SCATTERSIV8DI:
37929 icode = CODE_FOR_avx512f_scattersiv8di;
37930 goto scatter_gen;
37931 case IX86_BUILTIN_SCATTERDIV16SI:
37932 icode = CODE_FOR_avx512f_scatterdiv16si;
37933 goto scatter_gen;
37934 case IX86_BUILTIN_SCATTERDIV8DI:
37935 icode = CODE_FOR_avx512f_scatterdiv8di;
37936 goto scatter_gen;
37937 case IX86_BUILTIN_SCATTERSIV8SF:
37938 icode = CODE_FOR_avx512vl_scattersiv8sf;
37939 goto scatter_gen;
37940 case IX86_BUILTIN_SCATTERSIV4SF:
37941 icode = CODE_FOR_avx512vl_scattersiv4sf;
37942 goto scatter_gen;
37943 case IX86_BUILTIN_SCATTERSIV4DF:
37944 icode = CODE_FOR_avx512vl_scattersiv4df;
37945 goto scatter_gen;
37946 case IX86_BUILTIN_SCATTERSIV2DF:
37947 icode = CODE_FOR_avx512vl_scattersiv2df;
37948 goto scatter_gen;
37949 case IX86_BUILTIN_SCATTERDIV8SF:
37950 icode = CODE_FOR_avx512vl_scatterdiv8sf;
37951 goto scatter_gen;
37952 case IX86_BUILTIN_SCATTERDIV4SF:
37953 icode = CODE_FOR_avx512vl_scatterdiv4sf;
37954 goto scatter_gen;
37955 case IX86_BUILTIN_SCATTERDIV4DF:
37956 icode = CODE_FOR_avx512vl_scatterdiv4df;
37957 goto scatter_gen;
37958 case IX86_BUILTIN_SCATTERDIV2DF:
37959 icode = CODE_FOR_avx512vl_scatterdiv2df;
37960 goto scatter_gen;
37961 case IX86_BUILTIN_SCATTERSIV8SI:
37962 icode = CODE_FOR_avx512vl_scattersiv8si;
37963 goto scatter_gen;
37964 case IX86_BUILTIN_SCATTERSIV4SI:
37965 icode = CODE_FOR_avx512vl_scattersiv4si;
37966 goto scatter_gen;
37967 case IX86_BUILTIN_SCATTERSIV4DI:
37968 icode = CODE_FOR_avx512vl_scattersiv4di;
37969 goto scatter_gen;
37970 case IX86_BUILTIN_SCATTERSIV2DI:
37971 icode = CODE_FOR_avx512vl_scattersiv2di;
37972 goto scatter_gen;
37973 case IX86_BUILTIN_SCATTERDIV8SI:
37974 icode = CODE_FOR_avx512vl_scatterdiv8si;
37975 goto scatter_gen;
37976 case IX86_BUILTIN_SCATTERDIV4SI:
37977 icode = CODE_FOR_avx512vl_scatterdiv4si;
37978 goto scatter_gen;
37979 case IX86_BUILTIN_SCATTERDIV4DI:
37980 icode = CODE_FOR_avx512vl_scatterdiv4di;
37981 goto scatter_gen;
37982 case IX86_BUILTIN_SCATTERDIV2DI:
37983 icode = CODE_FOR_avx512vl_scatterdiv2di;
37984 goto scatter_gen;
37985 case IX86_BUILTIN_GATHERPFDPD:
37986 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
37987 goto vec_prefetch_gen;
37988 case IX86_BUILTIN_SCATTERALTSIV8DF:
37989 icode = CODE_FOR_avx512f_scattersiv8df;
37990 goto scatter_gen;
37991 case IX86_BUILTIN_SCATTERALTDIV16SF:
37992 icode = CODE_FOR_avx512f_scatterdiv16sf;
37993 goto scatter_gen;
37994 case IX86_BUILTIN_SCATTERALTSIV8DI:
37995 icode = CODE_FOR_avx512f_scattersiv8di;
37996 goto scatter_gen;
37997 case IX86_BUILTIN_SCATTERALTDIV16SI:
37998 icode = CODE_FOR_avx512f_scatterdiv16si;
37999 goto scatter_gen;
38000 case IX86_BUILTIN_GATHERPFDPS:
38001 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
38002 goto vec_prefetch_gen;
38003 case IX86_BUILTIN_GATHERPFQPD:
38004 icode = CODE_FOR_avx512pf_gatherpfv8didf;
38005 goto vec_prefetch_gen;
38006 case IX86_BUILTIN_GATHERPFQPS:
38007 icode = CODE_FOR_avx512pf_gatherpfv8disf;
38008 goto vec_prefetch_gen;
38009 case IX86_BUILTIN_SCATTERPFDPD:
38010 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
38011 goto vec_prefetch_gen;
38012 case IX86_BUILTIN_SCATTERPFDPS:
38013 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
38014 goto vec_prefetch_gen;
38015 case IX86_BUILTIN_SCATTERPFQPD:
38016 icode = CODE_FOR_avx512pf_scatterpfv8didf;
38017 goto vec_prefetch_gen;
38018 case IX86_BUILTIN_SCATTERPFQPS:
38019 icode = CODE_FOR_avx512pf_scatterpfv8disf;
38020 goto vec_prefetch_gen;
38022 gather_gen:
38023 rtx half;
38024 rtx (*gen) (rtx, rtx);
38026 arg0 = CALL_EXPR_ARG (exp, 0);
38027 arg1 = CALL_EXPR_ARG (exp, 1);
38028 arg2 = CALL_EXPR_ARG (exp, 2);
38029 arg3 = CALL_EXPR_ARG (exp, 3);
38030 arg4 = CALL_EXPR_ARG (exp, 4);
38031 op0 = expand_normal (arg0);
38032 op1 = expand_normal (arg1);
38033 op2 = expand_normal (arg2);
38034 op3 = expand_normal (arg3);
38035 op4 = expand_normal (arg4);
38036 /* Note the arg order is different from the operand order. */
38037 mode0 = insn_data[icode].operand[1].mode;
38038 mode2 = insn_data[icode].operand[3].mode;
38039 mode3 = insn_data[icode].operand[4].mode;
38040 mode4 = insn_data[icode].operand[5].mode;
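/* Concretely, the five builtin arguments (merge/source vector, base
   pointer, index vector, mask, scale) are matched below against insn
   operands 1 through 5, while operand 0 is the gather destination.  */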
38042 if (target == NULL_RTX
38043 || GET_MODE (target) != insn_data[icode].operand[0].mode
38044 || !insn_data[icode].operand[0].predicate (target,
38045 GET_MODE (target)))
38046 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
38047 else
38048 subtarget = target;
38050 switch (fcode)
38052 case IX86_BUILTIN_GATHER3ALTSIV8DF:
38053 case IX86_BUILTIN_GATHER3ALTSIV8DI:
38054 half = gen_reg_rtx (V8SImode);
38055 if (!nonimmediate_operand (op2, V16SImode))
38056 op2 = copy_to_mode_reg (V16SImode, op2);
38057 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38058 op2 = half;
38059 break;
38060 case IX86_BUILTIN_GATHER3ALTSIV4DF:
38061 case IX86_BUILTIN_GATHER3ALTSIV4DI:
38062 case IX86_BUILTIN_GATHERALTSIV4DF:
38063 case IX86_BUILTIN_GATHERALTSIV4DI:
38064 half = gen_reg_rtx (V4SImode);
38065 if (!nonimmediate_operand (op2, V8SImode))
38066 op2 = copy_to_mode_reg (V8SImode, op2);
38067 emit_insn (gen_vec_extract_lo_v8si (half, op2));
38068 op2 = half;
38069 break;
38070 case IX86_BUILTIN_GATHER3ALTDIV16SF:
38071 case IX86_BUILTIN_GATHER3ALTDIV16SI:
38072 half = gen_reg_rtx (mode0);
38073 if (mode0 == V8SFmode)
38074 gen = gen_vec_extract_lo_v16sf;
38075 else
38076 gen = gen_vec_extract_lo_v16si;
38077 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38078 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38079 emit_insn (gen (half, op0));
38080 op0 = half;
38081 if (GET_MODE (op3) != VOIDmode)
38083 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38084 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38085 emit_insn (gen (half, op3));
38086 op3 = half;
38088 break;
38089 case IX86_BUILTIN_GATHER3ALTDIV8SF:
38090 case IX86_BUILTIN_GATHER3ALTDIV8SI:
38091 case IX86_BUILTIN_GATHERALTDIV8SF:
38092 case IX86_BUILTIN_GATHERALTDIV8SI:
38093 half = gen_reg_rtx (mode0);
38094 if (mode0 == V4SFmode)
38095 gen = gen_vec_extract_lo_v8sf;
38096 else
38097 gen = gen_vec_extract_lo_v8si;
38098 if (!nonimmediate_operand (op0, GET_MODE (op0)))
38099 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
38100 emit_insn (gen (half, op0));
38101 op0 = half;
38102 if (GET_MODE (op3) != VOIDmode)
38104 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38105 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38106 emit_insn (gen (half, op3));
38107 op3 = half;
38109 break;
38110 default:
38111 break;
38114 /* Force the memory operand to use only a base register here.  We
38115 don't want to do that for the memory operands of other builtin
38116 functions. */
38117 op1 = ix86_zero_extend_to_Pmode (op1);
38119 if (!insn_data[icode].operand[1].predicate (op0, mode0))
38120 op0 = copy_to_mode_reg (mode0, op0);
38121 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
38122 op1 = copy_to_mode_reg (Pmode, op1);
38123 if (!insn_data[icode].operand[3].predicate (op2, mode2))
38124 op2 = copy_to_mode_reg (mode2, op2);
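/* The mask argument may arrive in a mode other than the one the
   pattern expects (for instance when the builtin prototype uses a
   different vector type); fixup_modeless_constant and the lowpart
   subreg below reinterpret it in mode3 rather than converting its
   value.  */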
38126 op3 = fixup_modeless_constant (op3, mode3);
38128 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
38130 if (!insn_data[icode].operand[4].predicate (op3, mode3))
38131 op3 = copy_to_mode_reg (mode3, op3);
38133 else
38135 op3 = copy_to_reg (op3);
38136 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
38138 if (!insn_data[icode].operand[5].predicate (op4, mode4))
38140 error ("the last argument must be scale 1, 2, 4, 8");
38141 return const0_rtx;
38144 /* Optimize. If mask is known to have all high bits set,
38145 replace op0 with pc_rtx to signal that the instruction
38146 overwrites the whole destination and doesn't use its
38147 previous contents. */
38148 if (optimize)
38150 if (TREE_CODE (arg3) == INTEGER_CST)
38152 if (integer_all_onesp (arg3))
38153 op0 = pc_rtx;
38155 else if (TREE_CODE (arg3) == VECTOR_CST)
38157 unsigned int negative = 0;
38158 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
38160 tree cst = VECTOR_CST_ELT (arg3, i);
38161 if (TREE_CODE (cst) == INTEGER_CST
38162 && tree_int_cst_sign_bit (cst))
38163 negative++;
38164 else if (TREE_CODE (cst) == REAL_CST
38165 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
38166 negative++;
38168 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
38169 op0 = pc_rtx;
38171 else if (TREE_CODE (arg3) == SSA_NAME
38172 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
38174 /* Recognize also when mask is like:
38175 __v2df src = _mm_setzero_pd ();
38176 __v2df mask = _mm_cmpeq_pd (src, src);
38178 __v8sf src = _mm256_setzero_ps ();
38179 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
38180 as that is a cheaper way to load all ones into
38181 a register than having to load a constant from
38182 memory. */
38183 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
38184 if (is_gimple_call (def_stmt))
38186 tree fndecl = gimple_call_fndecl (def_stmt);
38187 if (fndecl
38188 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
38189 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
38191 case IX86_BUILTIN_CMPPD:
38192 case IX86_BUILTIN_CMPPS:
38193 case IX86_BUILTIN_CMPPD256:
38194 case IX86_BUILTIN_CMPPS256:
38195 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
38196 break;
38197 /* FALLTHRU */
38198 case IX86_BUILTIN_CMPEQPD:
38199 case IX86_BUILTIN_CMPEQPS:
38200 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
38201 && initializer_zerop (gimple_call_arg (def_stmt,
38202 1)))
38203 op0 = pc_rtx;
38204 break;
38205 default:
38206 break;
38212 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
38213 if (! pat)
38214 return const0_rtx;
38215 emit_insn (pat);
38217 switch (fcode)
38219 case IX86_BUILTIN_GATHER3DIV16SF:
38220 if (target == NULL_RTX)
38221 target = gen_reg_rtx (V8SFmode);
38222 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
38223 break;
38224 case IX86_BUILTIN_GATHER3DIV16SI:
38225 if (target == NULL_RTX)
38226 target = gen_reg_rtx (V8SImode);
38227 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
38228 break;
38229 case IX86_BUILTIN_GATHER3DIV8SF:
38230 case IX86_BUILTIN_GATHERDIV8SF:
38231 if (target == NULL_RTX)
38232 target = gen_reg_rtx (V4SFmode);
38233 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
38234 break;
38235 case IX86_BUILTIN_GATHER3DIV8SI:
38236 case IX86_BUILTIN_GATHERDIV8SI:
38237 if (target == NULL_RTX)
38238 target = gen_reg_rtx (V4SImode);
38239 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
38240 break;
38241 default:
38242 target = subtarget;
38243 break;
38245 return target;
38247 scatter_gen:
38248 arg0 = CALL_EXPR_ARG (exp, 0);
38249 arg1 = CALL_EXPR_ARG (exp, 1);
38250 arg2 = CALL_EXPR_ARG (exp, 2);
38251 arg3 = CALL_EXPR_ARG (exp, 3);
38252 arg4 = CALL_EXPR_ARG (exp, 4);
38253 op0 = expand_normal (arg0);
38254 op1 = expand_normal (arg1);
38255 op2 = expand_normal (arg2);
38256 op3 = expand_normal (arg3);
38257 op4 = expand_normal (arg4);
38258 mode1 = insn_data[icode].operand[1].mode;
38259 mode2 = insn_data[icode].operand[2].mode;
38260 mode3 = insn_data[icode].operand[3].mode;
38261 mode4 = insn_data[icode].operand[4].mode;
38263 /* Scatter instruction stores operand op3 to memory with
38264 indices from op2 and scale from op4 under writemask op1.
38265 If index operand op2 has more elements than source operand
38266 op3, one needs to use only its low half. And vice versa. */
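/* For example, for IX86_BUILTIN_SCATTERALTSIV8DF the V8DF source is
   stored through a V16SI index operand, so only the low V8SI half of
   the index is extracted and used.  */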
38267 switch (fcode)
38269 case IX86_BUILTIN_SCATTERALTSIV8DF:
38270 case IX86_BUILTIN_SCATTERALTSIV8DI:
38271 half = gen_reg_rtx (V8SImode);
38272 if (!nonimmediate_operand (op2, V16SImode))
38273 op2 = copy_to_mode_reg (V16SImode, op2);
38274 emit_insn (gen_vec_extract_lo_v16si (half, op2));
38275 op2 = half;
38276 break;
38277 case IX86_BUILTIN_SCATTERALTDIV16SF:
38278 case IX86_BUILTIN_SCATTERALTDIV16SI:
38279 half = gen_reg_rtx (mode3);
38280 if (mode3 == V8SFmode)
38281 gen = gen_vec_extract_lo_v16sf;
38282 else
38283 gen = gen_vec_extract_lo_v16si;
38284 if (!nonimmediate_operand (op3, GET_MODE (op3)))
38285 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
38286 emit_insn (gen (half, op3));
38287 op3 = half;
38288 break;
38289 default:
38290 break;
38293 /* Force the memory operand to use only a base register here.  We
38294 don't want to do that for the memory operands of other builtin
38295 functions. */
38296 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
38298 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
38299 op0 = copy_to_mode_reg (Pmode, op0);
38301 op1 = fixup_modeless_constant (op1, mode1);
38303 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
38305 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38306 op1 = copy_to_mode_reg (mode1, op1);
38308 else
38310 op1 = copy_to_reg (op1);
38311 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
38314 if (!insn_data[icode].operand[2].predicate (op2, mode2))
38315 op2 = copy_to_mode_reg (mode2, op2);
38317 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38318 op3 = copy_to_mode_reg (mode3, op3);
38320 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38322 error ("the last argument must be scale 1, 2, 4, 8");
38323 return const0_rtx;
38326 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38327 if (! pat)
38328 return const0_rtx;
38330 emit_insn (pat);
38331 return 0;
38333 vec_prefetch_gen:
38334 arg0 = CALL_EXPR_ARG (exp, 0);
38335 arg1 = CALL_EXPR_ARG (exp, 1);
38336 arg2 = CALL_EXPR_ARG (exp, 2);
38337 arg3 = CALL_EXPR_ARG (exp, 3);
38338 arg4 = CALL_EXPR_ARG (exp, 4);
38339 op0 = expand_normal (arg0);
38340 op1 = expand_normal (arg1);
38341 op2 = expand_normal (arg2);
38342 op3 = expand_normal (arg3);
38343 op4 = expand_normal (arg4);
38344 mode0 = insn_data[icode].operand[0].mode;
38345 mode1 = insn_data[icode].operand[1].mode;
38346 mode3 = insn_data[icode].operand[3].mode;
38347 mode4 = insn_data[icode].operand[4].mode;
38349 op0 = fixup_modeless_constant (op0, mode0);
38351 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
38353 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38354 op0 = copy_to_mode_reg (mode0, op0);
38356 else
38358 op0 = copy_to_reg (op0);
38359 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
38362 if (!insn_data[icode].operand[1].predicate (op1, mode1))
38363 op1 = copy_to_mode_reg (mode1, op1);
38365 /* Force the memory operand to use only a base register here.  We
38366 don't want to do that for the memory operands of other builtin
38367 functions. */
38368 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
38370 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
38371 op2 = copy_to_mode_reg (Pmode, op2);
38373 if (!insn_data[icode].operand[3].predicate (op3, mode3))
38375 error ("the fourth argument must be scale 1, 2, 4, 8");
38376 return const0_rtx;
38379 if (!insn_data[icode].operand[4].predicate (op4, mode4))
38381 error ("incorrect hint operand");
38382 return const0_rtx;
38385 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
38386 if (! pat)
38387 return const0_rtx;
38389 emit_insn (pat);
38391 return 0;
38393 case IX86_BUILTIN_XABORT:
38394 icode = CODE_FOR_xabort;
38395 arg0 = CALL_EXPR_ARG (exp, 0);
38396 op0 = expand_normal (arg0);
38397 mode0 = insn_data[icode].operand[0].mode;
38398 if (!insn_data[icode].operand[0].predicate (op0, mode0))
38400 error ("the xabort's argument must be an 8-bit immediate");
38401 return const0_rtx;
38403 emit_insn (gen_xabort (op0));
38404 return 0;
38406 case IX86_BUILTIN_RSTORSSP:
38407 case IX86_BUILTIN_CLRSSBSY:
38408 arg0 = CALL_EXPR_ARG (exp, 0);
38409 op0 = expand_normal (arg0);
38410 icode = (fcode == IX86_BUILTIN_RSTORSSP
38411 ? CODE_FOR_rstorssp
38412 : CODE_FOR_clrssbsy);
38413 if (!address_operand (op0, VOIDmode))
38415 op1 = convert_memory_address (Pmode, op0);
38416 op0 = copy_addr_to_reg (op1);
38418 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
38419 return 0;
38421 case IX86_BUILTIN_WRSSD:
38422 case IX86_BUILTIN_WRSSQ:
38423 case IX86_BUILTIN_WRUSSD:
38424 case IX86_BUILTIN_WRUSSQ:
38425 arg0 = CALL_EXPR_ARG (exp, 0);
38426 op0 = expand_normal (arg0);
38427 arg1 = CALL_EXPR_ARG (exp, 1);
38428 op1 = expand_normal (arg1);
38429 switch (fcode)
38431 case IX86_BUILTIN_WRSSD:
38432 icode = CODE_FOR_wrsssi;
38433 mode = SImode;
38434 break;
38435 case IX86_BUILTIN_WRSSQ:
38436 icode = CODE_FOR_wrssdi;
38437 mode = DImode;
38438 break;
38439 case IX86_BUILTIN_WRUSSD:
38440 icode = CODE_FOR_wrusssi;
38441 mode = SImode;
38442 break;
38443 case IX86_BUILTIN_WRUSSQ:
38444 icode = CODE_FOR_wrussdi;
38445 mode = DImode;
38446 break;
38448 op0 = force_reg (mode, op0);
38449 if (!address_operand (op1, VOIDmode))
38451 op2 = convert_memory_address (Pmode, op1);
38452 op1 = copy_addr_to_reg (op2);
38454 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
38455 return 0;
38457 default:
38458 break;
38461 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
38462 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
38464 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
38465 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
38466 target);
38469 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST
38470 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST)
38472 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST;
38473 return ix86_expand_special_args_builtin (bdesc_special_args2 + i, exp,
38474 target);
38477 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
38478 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
38480 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
38481 switch (fcode)
38483 case IX86_BUILTIN_FABSQ:
38484 case IX86_BUILTIN_COPYSIGNQ:
38485 if (!TARGET_SSE)
38486 /* Emit a normal call if SSE isn't available. */
38487 return expand_call (exp, target, ignore);
38488 /* FALLTHRU */
38489 default:
38490 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
38494 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
38495 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
38497 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
38498 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
38499 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
38500 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
38501 int masked = 1;
38502 machine_mode mode, wide_mode, nar_mode;
38504 nar_mode = V4SFmode;
38505 mode = V16SFmode;
38506 wide_mode = V64SFmode;
38507 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
38508 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
38510 switch (fcode)
38512 case IX86_BUILTIN_4FMAPS:
38513 fcn = gen_avx5124fmaddps_4fmaddps;
38514 masked = 0;
38515 goto v4fma_expand;
38517 case IX86_BUILTIN_4DPWSSD:
38518 nar_mode = V4SImode;
38519 mode = V16SImode;
38520 wide_mode = V64SImode;
38521 fcn = gen_avx5124vnniw_vp4dpwssd;
38522 masked = 0;
38523 goto v4fma_expand;
38525 case IX86_BUILTIN_4DPWSSDS:
38526 nar_mode = V4SImode;
38527 mode = V16SImode;
38528 wide_mode = V64SImode;
38529 fcn = gen_avx5124vnniw_vp4dpwssds;
38530 masked = 0;
38531 goto v4fma_expand;
38533 case IX86_BUILTIN_4FNMAPS:
38534 fcn = gen_avx5124fmaddps_4fnmaddps;
38535 masked = 0;
38536 goto v4fma_expand;
38538 case IX86_BUILTIN_4FNMAPS_MASK:
38539 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
38540 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
38541 goto v4fma_expand;
38543 case IX86_BUILTIN_4DPWSSD_MASK:
38544 nar_mode = V4SImode;
38545 mode = V16SImode;
38546 wide_mode = V64SImode;
38547 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
38548 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
38549 goto v4fma_expand;
38551 case IX86_BUILTIN_4DPWSSDS_MASK:
38552 nar_mode = V4SImode;
38553 mode = V16SImode;
38554 wide_mode = V64SImode;
38555 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
38556 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
38557 goto v4fma_expand;
38559 case IX86_BUILTIN_4FMAPS_MASK:
38561 tree args[4];
38562 rtx ops[4];
38563 rtx wide_reg;
38564 rtx accum;
38565 rtx addr;
38566 rtx mem;
38568 v4fma_expand:
38569 wide_reg = gen_reg_rtx (wide_mode);
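/* The four vector source arguments are packed back to back into a
   single wide pseudo (V64SF or V64SI here), since the 4FMADDPS /
   VP4DPWSSD family of instructions reads a block of four consecutive
   vector registers.  */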
38570 for (i = 0; i < 4; i++)
38572 args[i] = CALL_EXPR_ARG (exp, i);
38573 ops[i] = expand_normal (args[i]);
38575 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
38576 ops[i]);
38579 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38580 accum = force_reg (mode, accum);
38582 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38583 addr = force_reg (Pmode, addr);
38585 mem = gen_rtx_MEM (nar_mode, addr);
38587 target = gen_reg_rtx (mode);
38589 emit_move_insn (target, accum);
38591 if (! masked)
38592 emit_insn (fcn (target, accum, wide_reg, mem));
38593 else
38595 rtx merge, mask;
38596 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38598 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38600 if (CONST_INT_P (mask))
38601 mask = fixup_modeless_constant (mask, HImode);
38603 mask = force_reg (HImode, mask);
38605 if (GET_MODE (mask) != HImode)
38606 mask = gen_rtx_SUBREG (HImode, mask, 0);
38608 /* If merge is 0 then we're about to emit z-masked variant. */
38609 if (const0_operand (merge, mode))
38610 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38611 /* If merge is the same as accum then emit merge-masked variant. */
38612 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38614 merge = force_reg (mode, merge);
38615 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38617 /* Merging with an unknown value can happen if we z-mask with -O0. */
38618 else
38620 target = gen_reg_rtx (mode);
38621 emit_move_insn (target, merge);
38622 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38625 return target;
38628 case IX86_BUILTIN_4FNMASS:
38629 fcn = gen_avx5124fmaddps_4fnmaddss;
38630 masked = 0;
38631 goto s4fma_expand;
38633 case IX86_BUILTIN_4FMASS:
38634 fcn = gen_avx5124fmaddps_4fmaddss;
38635 masked = 0;
38636 goto s4fma_expand;
38638 case IX86_BUILTIN_4FNMASS_MASK:
38639 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
38640 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
38641 goto s4fma_expand;
38643 case IX86_BUILTIN_4FMASS_MASK:
38645 tree args[4];
38646 rtx ops[4];
38647 rtx wide_reg;
38648 rtx accum;
38649 rtx addr;
38650 rtx mem;
38652 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
38653 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
38655 s4fma_expand:
38656 mode = V4SFmode;
38657 wide_reg = gen_reg_rtx (V64SFmode);
38658 for (i = 0; i < 4; i++)
38660 rtx tmp;
38661 args[i] = CALL_EXPR_ARG (exp, i);
38662 ops[i] = expand_normal (args[i]);
38664 tmp = gen_reg_rtx (SFmode);
38665 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
38667 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
38668 gen_rtx_SUBREG (V16SFmode, tmp, 0));
38671 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
38672 accum = force_reg (V4SFmode, accum);
38674 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
38675 addr = force_reg (Pmode, addr);
38677 mem = gen_rtx_MEM (V4SFmode, addr);
38679 target = gen_reg_rtx (V4SFmode);
38681 emit_move_insn (target, accum);
38683 if (! masked)
38684 emit_insn (fcn (target, accum, wide_reg, mem));
38685 else
38687 rtx merge, mask;
38688 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
38690 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
38692 if (CONST_INT_P (mask))
38693 mask = fixup_modeless_constant (mask, QImode);
38695 mask = force_reg (QImode, mask);
38697 if (GET_MODE (mask) != QImode)
38698 mask = gen_rtx_SUBREG (QImode, mask, 0);
38700 /* If merge is 0 then we're about to emit z-masked variant. */
38701 if (const0_operand (merge, mode))
38702 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
38703 /* If merge is the same as accum then emit merge-masked
38704 variant. */
38705 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
38707 merge = force_reg (mode, merge);
38708 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
38710 /* Merging with an unknown value can happen if we z-mask
38711 with -O0. */
38712 else
38714 target = gen_reg_rtx (mode);
38715 emit_move_insn (target, merge);
38716 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
38719 return target;
38721 case IX86_BUILTIN_RDPID:
38722 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
38723 target);
38724 default:
38725 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
38729 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
38730 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
38732 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
38733 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
38736 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
38737 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
38739 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
38740 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
38743 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
38744 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
38746 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
38747 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
38750 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
38751 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
38753 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
38754 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
38757 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
38758 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
38760 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
38761 const struct builtin_description *d = bdesc_multi_arg + i;
38762 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
38763 (enum ix86_builtin_func_type)
38764 d->flag, d->comparison);
38767 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
38768 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
38770 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
38771 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
38772 target);
38775 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
38776 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
38778 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
38779 return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
38780 target);
38783 gcc_unreachable ();
38786 /* This returns the target-specific builtin with code CODE if
38787 current_function_decl has visibility on this builtin, which is checked
38788 using isa flags. Returns NULL_TREE otherwise. */
38790 static tree ix86_get_builtin (enum ix86_builtins code)
38792 struct cl_target_option *opts;
38793 tree target_tree = NULL_TREE;
38795 /* Determine the isa flags of current_function_decl. */
38797 if (current_function_decl)
38798 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
38800 if (target_tree == NULL)
38801 target_tree = target_option_default_node;
38803 opts = TREE_TARGET_OPTION (target_tree);
38805 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
38806 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
38807 return ix86_builtin_decl (code, true);
38808 else
38809 return NULL_TREE;
38812 /* Returns a function decl for a vectorized version of the combined function
38813 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
38814 if it is not available. */
38816 static tree
38817 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
38818 tree type_in)
38820 machine_mode in_mode, out_mode;
38821 int in_n, out_n;
38823 if (TREE_CODE (type_out) != VECTOR_TYPE
38824 || TREE_CODE (type_in) != VECTOR_TYPE)
38825 return NULL_TREE;
38827 out_mode = TYPE_MODE (TREE_TYPE (type_out));
38828 out_n = TYPE_VECTOR_SUBPARTS (type_out);
38829 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38830 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38832 switch (fn)
38834 CASE_CFN_EXP2:
38835 if (out_mode == SFmode && in_mode == SFmode)
38837 if (out_n == 16 && in_n == 16)
38838 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
38840 break;
38842 CASE_CFN_IFLOOR:
38843 CASE_CFN_LFLOOR:
38844 CASE_CFN_LLFLOOR:
38845 /* The round insn does not trap on denormals. */
38846 if (flag_trapping_math || !TARGET_SSE4_1)
38847 break;
38849 if (out_mode == SImode && in_mode == DFmode)
38851 if (out_n == 4 && in_n == 2)
38852 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
38853 else if (out_n == 8 && in_n == 4)
38854 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
38855 else if (out_n == 16 && in_n == 8)
38856 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
38858 if (out_mode == SImode && in_mode == SFmode)
38860 if (out_n == 4 && in_n == 4)
38861 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
38862 else if (out_n == 8 && in_n == 8)
38863 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
38864 else if (out_n == 16 && in_n == 16)
38865 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
38867 break;
38869 CASE_CFN_ICEIL:
38870 CASE_CFN_LCEIL:
38871 CASE_CFN_LLCEIL:
38872 /* The round insn does not trap on denormals. */
38873 if (flag_trapping_math || !TARGET_SSE4_1)
38874 break;
38876 if (out_mode == SImode && in_mode == DFmode)
38878 if (out_n == 4 && in_n == 2)
38879 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
38880 else if (out_n == 8 && in_n == 4)
38881 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
38882 else if (out_n == 16 && in_n == 8)
38883 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
38885 if (out_mode == SImode && in_mode == SFmode)
38887 if (out_n == 4 && in_n == 4)
38888 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
38889 else if (out_n == 8 && in_n == 8)
38890 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
38891 else if (out_n == 16 && in_n == 16)
38892 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
38894 break;
38896 CASE_CFN_IRINT:
38897 CASE_CFN_LRINT:
38898 CASE_CFN_LLRINT:
38899 if (out_mode == SImode && in_mode == DFmode)
38901 if (out_n == 4 && in_n == 2)
38902 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
38903 else if (out_n == 8 && in_n == 4)
38904 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
38905 else if (out_n == 16 && in_n == 8)
38906 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
38908 if (out_mode == SImode && in_mode == SFmode)
38910 if (out_n == 4 && in_n == 4)
38911 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
38912 else if (out_n == 8 && in_n == 8)
38913 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
38914 else if (out_n == 16 && in_n == 16)
38915 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
38917 break;
38919 CASE_CFN_IROUND:
38920 CASE_CFN_LROUND:
38921 CASE_CFN_LLROUND:
38922 /* The round insn does not trap on denormals. */
38923 if (flag_trapping_math || !TARGET_SSE4_1)
38924 break;
38926 if (out_mode == SImode && in_mode == DFmode)
38928 if (out_n == 4 && in_n == 2)
38929 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
38930 else if (out_n == 8 && in_n == 4)
38931 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
38932 else if (out_n == 16 && in_n == 8)
38933 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
38935 if (out_mode == SImode && in_mode == SFmode)
38937 if (out_n == 4 && in_n == 4)
38938 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
38939 else if (out_n == 8 && in_n == 8)
38940 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
38941 else if (out_n == 16 && in_n == 16)
38942 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
38944 break;
38946 CASE_CFN_FLOOR:
38947 /* The round insn does not trap on denormals. */
38948 if (flag_trapping_math || !TARGET_SSE4_1)
38949 break;
38951 if (out_mode == DFmode && in_mode == DFmode)
38953 if (out_n == 2 && in_n == 2)
38954 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
38955 else if (out_n == 4 && in_n == 4)
38956 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
38957 else if (out_n == 8 && in_n == 8)
38958 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
38960 if (out_mode == SFmode && in_mode == SFmode)
38962 if (out_n == 4 && in_n == 4)
38963 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
38964 else if (out_n == 8 && in_n == 8)
38965 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
38966 else if (out_n == 16 && in_n == 16)
38967 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
38969 break;
38971 CASE_CFN_CEIL:
38972 /* The round insn does not trap on denormals. */
38973 if (flag_trapping_math || !TARGET_SSE4_1)
38974 break;
38976 if (out_mode == DFmode && in_mode == DFmode)
38978 if (out_n == 2 && in_n == 2)
38979 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
38980 else if (out_n == 4 && in_n == 4)
38981 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
38982 else if (out_n == 8 && in_n == 8)
38983 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
38985 if (out_mode == SFmode && in_mode == SFmode)
38987 if (out_n == 4 && in_n == 4)
38988 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
38989 else if (out_n == 8 && in_n == 8)
38990 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
38991 else if (out_n == 16 && in_n == 16)
38992 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
38994 break;
38996 CASE_CFN_TRUNC:
38997 /* The round insn does not trap on denormals. */
38998 if (flag_trapping_math || !TARGET_SSE4_1)
38999 break;
39001 if (out_mode == DFmode && in_mode == DFmode)
39003 if (out_n == 2 && in_n == 2)
39004 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
39005 else if (out_n == 4 && in_n == 4)
39006 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
39007 else if (out_n == 8 && in_n == 8)
39008 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
39010 if (out_mode == SFmode && in_mode == SFmode)
39012 if (out_n == 4 && in_n == 4)
39013 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
39014 else if (out_n == 8 && in_n == 8)
39015 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
39016 else if (out_n == 16 && in_n == 16)
39017 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
39019 break;
39021 CASE_CFN_RINT:
39022 /* The round insn does not trap on denormals. */
39023 if (flag_trapping_math || !TARGET_SSE4_1)
39024 break;
39026 if (out_mode == DFmode && in_mode == DFmode)
39028 if (out_n == 2 && in_n == 2)
39029 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
39030 else if (out_n == 4 && in_n == 4)
39031 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
39033 if (out_mode == SFmode && in_mode == SFmode)
39035 if (out_n == 4 && in_n == 4)
39036 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
39037 else if (out_n == 8 && in_n == 8)
39038 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
39040 break;
39042 CASE_CFN_FMA:
39043 if (out_mode == DFmode && in_mode == DFmode)
39045 if (out_n == 2 && in_n == 2)
39046 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
39047 if (out_n == 4 && in_n == 4)
39048 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
39050 if (out_mode == SFmode && in_mode == SFmode)
39052 if (out_n == 4 && in_n == 4)
39053 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
39054 if (out_n == 8 && in_n == 8)
39055 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
39057 break;
39059 default:
39060 break;
39063 /* Dispatch to a handler for a vectorization library. */
39064 if (ix86_veclib_handler)
39065 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
39067 return NULL_TREE;
39070 /* Handler for an SVML-style interface to
39071 a library with vectorized intrinsics. */
39073 static tree
39074 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
39076 char name[20];
39077 tree fntype, new_fndecl, args;
39078 unsigned arity;
39079 const char *bname;
39080 machine_mode el_mode, in_mode;
39081 int n, in_n;
39083 /* The SVML is suitable for unsafe math only. */
39084 if (!flag_unsafe_math_optimizations)
39085 return NULL_TREE;
39087 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39088 n = TYPE_VECTOR_SUBPARTS (type_out);
39089 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39090 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39091 if (el_mode != in_mode
39092 || n != in_n)
39093 return NULL_TREE;
39095 switch (fn)
39097 CASE_CFN_EXP:
39098 CASE_CFN_LOG:
39099 CASE_CFN_LOG10:
39100 CASE_CFN_POW:
39101 CASE_CFN_TANH:
39102 CASE_CFN_TAN:
39103 CASE_CFN_ATAN:
39104 CASE_CFN_ATAN2:
39105 CASE_CFN_ATANH:
39106 CASE_CFN_CBRT:
39107 CASE_CFN_SINH:
39108 CASE_CFN_SIN:
39109 CASE_CFN_ASINH:
39110 CASE_CFN_ASIN:
39111 CASE_CFN_COSH:
39112 CASE_CFN_COS:
39113 CASE_CFN_ACOSH:
39114 CASE_CFN_ACOS:
39115 if ((el_mode != DFmode || n != 2)
39116 && (el_mode != SFmode || n != 4))
39117 return NULL_TREE;
39118 break;
39120 default:
39121 return NULL_TREE;
39124 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39125 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39127 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
39128 strcpy (name, "vmlsLn4");
39129 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
39130 strcpy (name, "vmldLn2");
39131 else if (n == 4)
39133 sprintf (name, "vmls%s", bname+10);
39134 name[strlen (name)-1] = '4';
39136 else
39137 sprintf (name, "vmld%s2", bname+10);
39139 /* Convert to uppercase. */
39140 name[4] &= ~0x20;
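/* For example, vectorizing sinf over 4 floats ends up with the SVML
   name "vmlsSin4": bname is "__builtin_sinf", bname+10 is "sinf",
   the trailing character becomes '4' and the first letter of the
   math function name is uppercased.  */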
39142 arity = 0;
39143 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39144 arity++;
39146 if (arity == 1)
39147 fntype = build_function_type_list (type_out, type_in, NULL);
39148 else
39149 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39151 /* Build a function declaration for the vectorized function. */
39152 new_fndecl = build_decl (BUILTINS_LOCATION,
39153 FUNCTION_DECL, get_identifier (name), fntype);
39154 TREE_PUBLIC (new_fndecl) = 1;
39155 DECL_EXTERNAL (new_fndecl) = 1;
39156 DECL_IS_NOVOPS (new_fndecl) = 1;
39157 TREE_READONLY (new_fndecl) = 1;
39159 return new_fndecl;
39162 /* Handler for an ACML-style interface to
39163 a library with vectorized intrinsics. */
39165 static tree
39166 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
39168 char name[20] = "__vr.._";
39169 tree fntype, new_fndecl, args;
39170 unsigned arity;
39171 const char *bname;
39172 machine_mode el_mode, in_mode;
39173 int n, in_n;
39175 /* The ACML is 64-bit only and suitable for unsafe math only, as
39176 it does not correctly support parts of IEEE (such as denormals)
39177 with the required precision. */
39178 if (!TARGET_64BIT
39179 || !flag_unsafe_math_optimizations)
39180 return NULL_TREE;
39182 el_mode = TYPE_MODE (TREE_TYPE (type_out));
39183 n = TYPE_VECTOR_SUBPARTS (type_out);
39184 in_mode = TYPE_MODE (TREE_TYPE (type_in));
39185 in_n = TYPE_VECTOR_SUBPARTS (type_in);
39186 if (el_mode != in_mode
39187 || n != in_n)
39188 return NULL_TREE;
39190 switch (fn)
39192 CASE_CFN_SIN:
39193 CASE_CFN_COS:
39194 CASE_CFN_EXP:
39195 CASE_CFN_LOG:
39196 CASE_CFN_LOG2:
39197 CASE_CFN_LOG10:
39198 if (el_mode == DFmode && n == 2)
39200 name[4] = 'd';
39201 name[5] = '2';
39203 else if (el_mode == SFmode && n == 4)
39205 name[4] = 's';
39206 name[5] = '4';
39208 else
39209 return NULL_TREE;
39210 break;
39212 default:
39213 return NULL_TREE;
39216 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
39217 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
39218 sprintf (name + 7, "%s", bname+10);
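/* E.g. sin over 2 doubles becomes "__vrd2_sin" and sinf over 4
   floats becomes "__vrs4_sinf", following the ACML vector routine
   naming scheme.  */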
39220 arity = 0;
39221 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
39222 arity++;
39224 if (arity == 1)
39225 fntype = build_function_type_list (type_out, type_in, NULL);
39226 else
39227 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
39229 /* Build a function declaration for the vectorized function. */
39230 new_fndecl = build_decl (BUILTINS_LOCATION,
39231 FUNCTION_DECL, get_identifier (name), fntype);
39232 TREE_PUBLIC (new_fndecl) = 1;
39233 DECL_EXTERNAL (new_fndecl) = 1;
39234 DECL_IS_NOVOPS (new_fndecl) = 1;
39235 TREE_READONLY (new_fndecl) = 1;
39237 return new_fndecl;
39240 /* Returns a decl of a function that implements gather load with
39241 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
39242 Return NULL_TREE if it is not available. */
39244 static tree
39245 ix86_vectorize_builtin_gather (const_tree mem_vectype,
39246 const_tree index_type, int scale)
39248 bool si;
39249 enum ix86_builtins code;
39251 if (! TARGET_AVX2 || !TARGET_USE_GATHER)
39252 return NULL_TREE;
39254 if ((TREE_CODE (index_type) != INTEGER_TYPE
39255 && !POINTER_TYPE_P (index_type))
39256 || (TYPE_MODE (index_type) != SImode
39257 && TYPE_MODE (index_type) != DImode))
39258 return NULL_TREE;
39260 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39261 return NULL_TREE;
39263 /* v*gather* insn sign extends index to pointer mode. */
39264 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39265 && TYPE_UNSIGNED (index_type))
39266 return NULL_TREE;
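/* Scale can be 1, 2, 4 or 8.  */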
39268 if (scale <= 0
39269 || scale > 8
39270 || (scale & (scale - 1)) != 0)
39271 return NULL_TREE;
39273 si = TYPE_MODE (index_type) == SImode;
39274 switch (TYPE_MODE (mem_vectype))
39276 case E_V2DFmode:
39277 if (TARGET_AVX512VL)
39278 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
39279 else
39280 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
39281 break;
39282 case E_V4DFmode:
39283 if (TARGET_AVX512VL)
39284 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
39285 else
39286 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
39287 break;
39288 case E_V2DImode:
39289 if (TARGET_AVX512VL)
39290 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
39291 else
39292 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
39293 break;
39294 case E_V4DImode:
39295 if (TARGET_AVX512VL)
39296 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
39297 else
39298 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
39299 break;
39300 case E_V4SFmode:
39301 if (TARGET_AVX512VL)
39302 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
39303 else
39304 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
39305 break;
39306 case E_V8SFmode:
39307 if (TARGET_AVX512VL)
39308 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
39309 else
39310 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
39311 break;
39312 case E_V4SImode:
39313 if (TARGET_AVX512VL)
39314 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
39315 else
39316 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
39317 break;
39318 case E_V8SImode:
39319 if (TARGET_AVX512VL)
39320 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
39321 else
39322 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
39323 break;
39324 case E_V8DFmode:
39325 if (TARGET_AVX512F)
39326 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
39327 else
39328 return NULL_TREE;
39329 break;
39330 case E_V8DImode:
39331 if (TARGET_AVX512F)
39332 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
39333 else
39334 return NULL_TREE;
39335 break;
39336 case E_V16SFmode:
39337 if (TARGET_AVX512F)
39338 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
39339 else
39340 return NULL_TREE;
39341 break;
39342 case E_V16SImode:
39343 if (TARGET_AVX512F)
39344 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
39345 else
39346 return NULL_TREE;
39347 break;
39348 default:
39349 return NULL_TREE;
39352 return ix86_get_builtin (code);
39355 /* Returns a decl of a function that implements scatter store with
39356 register type VECTYPE and index type INDEX_TYPE and SCALE.
39357 Return NULL_TREE if it is not available. */
39359 static tree
39360 ix86_vectorize_builtin_scatter (const_tree vectype,
39361 const_tree index_type, int scale)
39363 bool si;
39364 enum ix86_builtins code;
39366 if (!TARGET_AVX512F)
39367 return NULL_TREE;
39369 if ((TREE_CODE (index_type) != INTEGER_TYPE
39370 && !POINTER_TYPE_P (index_type))
39371 || (TYPE_MODE (index_type) != SImode
39372 && TYPE_MODE (index_type) != DImode))
39373 return NULL_TREE;
39375 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
39376 return NULL_TREE;
39378 /* v*scatter* insn sign extends index to pointer mode. */
39379 if (TYPE_PRECISION (index_type) < POINTER_SIZE
39380 && TYPE_UNSIGNED (index_type))
39381 return NULL_TREE;
39383 /* Scale can be 1, 2, 4 or 8. */
39384 if (scale <= 0
39385 || scale > 8
39386 || (scale & (scale - 1)) != 0)
39387 return NULL_TREE;
39389 si = TYPE_MODE (index_type) == SImode;
39390 switch (TYPE_MODE (vectype))
39392 case E_V8DFmode:
39393 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
39394 break;
39395 case E_V8DImode:
39396 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
39397 break;
39398 case E_V16SFmode:
39399 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
39400 break;
39401 case E_V16SImode:
39402 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
39403 break;
39404 default:
39405 return NULL_TREE;
39408 return ix86_builtins[code];
39411 /* Return true if it is safe to use the rsqrt optabs to optimize
39412 1.0/sqrt. */
39414 static bool
39415 use_rsqrt_p ()
39417 return (TARGET_SSE_MATH
39418 && flag_finite_math_only
39419 && !flag_trapping_math
39420 && flag_unsafe_math_optimizations);
39423 /* Returns a decl of a target-specific builtin that implements the
39424 reciprocal of the function, or NULL_TREE if not available. */
39426 static tree
39427 ix86_builtin_reciprocal (tree fndecl)
39429 switch (DECL_FUNCTION_CODE (fndecl))
39431 /* Vectorized version of sqrt to rsqrt conversion. */
39432 case IX86_BUILTIN_SQRTPS_NR:
39433 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
39435 case IX86_BUILTIN_SQRTPS_NR256:
39436 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
39438 default:
39439 return NULL_TREE;
39443 /* Helper for avx_vpermilps256_operand et al. This is also used by
39444 the expansion functions to turn the parallel back into a mask.
39445 The return value is 0 for no match and the imm8+1 for a match. */
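/* As a concrete example, for V4SFmode a parallel selecting elements
   (1 0 3 2) encodes each index in two bits, giving imm8 0xB1, so the
   return value would be 0xB2.  */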
39448 avx_vpermilp_parallel (rtx par, machine_mode mode)
39450 unsigned i, nelt = GET_MODE_NUNITS (mode);
39451 unsigned mask = 0;
39452 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
39454 if (XVECLEN (par, 0) != (int) nelt)
39455 return 0;
39457 /* Validate that all of the elements are constants, and not totally
39458 out of range. Copy the data into an integral array to make the
39459 subsequent checks easier. */
39460 for (i = 0; i < nelt; ++i)
39462 rtx er = XVECEXP (par, 0, i);
39463 unsigned HOST_WIDE_INT ei;
39465 if (!CONST_INT_P (er))
39466 return 0;
39467 ei = INTVAL (er);
39468 if (ei >= nelt)
39469 return 0;
39470 ipar[i] = ei;
39473 switch (mode)
39475 case E_V8DFmode:
39476 /* In the 512-bit DFmode case, we can only move elements within
39477 a 128-bit lane. First fill the second part of the mask,
39478 then fallthru. */
39479 for (i = 4; i < 6; ++i)
39481 if (ipar[i] < 4 || ipar[i] >= 6)
39482 return 0;
39483 mask |= (ipar[i] - 4) << i;
39485 for (i = 6; i < 8; ++i)
39487 if (ipar[i] < 6)
39488 return 0;
39489 mask |= (ipar[i] - 6) << i;
39491 /* FALLTHRU */
39493 case E_V4DFmode:
39494 /* In the 256-bit DFmode case, we can only move elements within
39495 a 128-bit lane. */
39496 for (i = 0; i < 2; ++i)
39498 if (ipar[i] >= 2)
39499 return 0;
39500 mask |= ipar[i] << i;
39502 for (i = 2; i < 4; ++i)
39504 if (ipar[i] < 2)
39505 return 0;
39506 mask |= (ipar[i] - 2) << i;
39508 break;
39510 case E_V16SFmode:
39511 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
39512 must mirror the permutation in the lower 256 bits. */
39513 for (i = 0; i < 8; ++i)
39514 if (ipar[i] + 8 != ipar[i + 8])
39515 return 0;
39516 /* FALLTHRU */
39518 case E_V8SFmode:
39519 /* In the 256-bit SFmode case, we have full freedom of
39520 movement within the low 128-bit lane, but the high 128-bit
39521 lane must mirror the exact same pattern. */
39522 for (i = 0; i < 4; ++i)
39523 if (ipar[i] + 4 != ipar[i + 4])
39524 return 0;
39525 nelt = 4;
39526 /* FALLTHRU */
39528 case E_V2DFmode:
39529 case E_V4SFmode:
39530 /* In the 128-bit case, we've full freedom in the placement of
39531 the elements from the source operand. */
39532 for (i = 0; i < nelt; ++i)
39533 mask |= ipar[i] << (i * (nelt / 2));
39534 break;
39536 default:
39537 gcc_unreachable ();
39540 /* Make sure success has a non-zero value by adding one. */
39541 return mask + 1;
39544 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
39545 the expansion functions to turn the parallel back into a mask.
39546 The return value is 0 for no match and the imm8+1 for a match. */
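/* For example, a V4DFmode parallel (2 3 4 5) selects the high lane
   of the first operand followed by the low lane of the second, which
   encodes as imm8 0x21, so the return value would be 0x22.  */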
39549 avx_vperm2f128_parallel (rtx par, machine_mode mode)
39551 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
39552 unsigned mask = 0;
39553 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
39555 if (XVECLEN (par, 0) != (int) nelt)
39556 return 0;
39558 /* Validate that all of the elements are constants, and not totally
39559 out of range. Copy the data into an integral array to make the
39560 subsequent checks easier. */
39561 for (i = 0; i < nelt; ++i)
39563 rtx er = XVECEXP (par, 0, i);
39564 unsigned HOST_WIDE_INT ei;
39566 if (!CONST_INT_P (er))
39567 return 0;
39568 ei = INTVAL (er);
39569 if (ei >= 2 * nelt)
39570 return 0;
39571 ipar[i] = ei;
39574 /* Validate that each half of the permute is a run of consecutive elements. */
39575 for (i = 0; i < nelt2 - 1; ++i)
39576 if (ipar[i] + 1 != ipar[i + 1])
39577 return 0;
39578 for (i = nelt2; i < nelt - 1; ++i)
39579 if (ipar[i] + 1 != ipar[i + 1])
39580 return 0;
39582 /* Reconstruct the mask. */
39583 for (i = 0; i < 2; ++i)
39585 unsigned e = ipar[i * nelt2];
39586 if (e % nelt2)
39587 return 0;
39588 e /= nelt2;
39589 mask |= e << (i * 4);
39592 /* Make sure success has a non-zero value by adding one. */
39593 return mask + 1;
39596 /* Return a register priority for hard reg REGNO. */
39597 static int
39598 ix86_register_priority (int hard_regno)
39600 /* ebp and r13 as the base always want a displacement, and r12 as the
39601 base always wants an index. So discourage their use in an
39602 address. */
39603 if (hard_regno == R12_REG || hard_regno == R13_REG)
39604 return 0;
39605 if (hard_regno == BP_REG)
39606 return 1;
39607 /* New x86-64 int registers result in bigger code size. Discourage
39608 them. */
39609 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
39610 return 2;
39611 /* New x86-64 SSE registers result in bigger code size. Discourage
39612 them. */
39613 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
39614 return 2;
39615 /* Usage of AX register results in smaller code. Prefer it. */
39616 if (hard_regno == AX_REG)
39617 return 4;
39618 return 3;
39621 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
39623 Put float CONST_DOUBLE in the constant pool instead of fp regs.
39624 QImode must go into class Q_REGS.
39625 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
39626 movdf to do mem-to-mem moves through integer regs. */
39628 static reg_class_t
39629 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
39631 machine_mode mode = GET_MODE (x);
39633 /* We're only allowed to return a subclass of CLASS. Many of the
39634 following checks fail for NO_REGS, so eliminate that early. */
39635 if (regclass == NO_REGS)
39636 return NO_REGS;
39638 /* All classes can load zeros. */
39639 if (x == CONST0_RTX (mode))
39640 return regclass;
39642 /* Force constants into memory if we are loading a (nonzero) constant into
39643 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
39644 instructions to load from a constant. */
39645 if (CONSTANT_P (x)
39646 && (MAYBE_MMX_CLASS_P (regclass)
39647 || MAYBE_SSE_CLASS_P (regclass)
39648 || MAYBE_MASK_CLASS_P (regclass)))
39649 return NO_REGS;
39651 /* Floating-point constants need more complex checks. */
39652 if (CONST_DOUBLE_P (x))
39654 /* General regs can load everything. */
39655 if (INTEGER_CLASS_P (regclass))
39656 return regclass;
39658 /* Floats can load 0 and 1 plus some others. Note that we eliminated
39659 zero above. We only want to wind up preferring 80387 registers if
39660 we plan on doing computation with them. */
39661 if (IS_STACK_MODE (mode)
39662 && standard_80387_constant_p (x) > 0)
39664 /* Limit class to FP regs. */
39665 if (FLOAT_CLASS_P (regclass))
39666 return FLOAT_REGS;
39667 else if (regclass == FP_TOP_SSE_REGS)
39668 return FP_TOP_REG;
39669 else if (regclass == FP_SECOND_SSE_REGS)
39670 return FP_SECOND_REG;
39673 return NO_REGS;
39676 /* Prefer SSE regs only, if we can use them for math. */
39677 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39678 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
39680 /* Generally when we see PLUS here, it's the function invariant
39681 (plus soft-fp const_int), which can only be computed into general
39682 regs. */
39683 if (GET_CODE (x) == PLUS)
39684 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
39686 /* QImode constants are easy to load, but non-constant QImode data
39687 must go into Q_REGS. */
39688 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
39690 if (Q_CLASS_P (regclass))
39691 return regclass;
39692 else if (reg_class_subset_p (Q_REGS, regclass))
39693 return Q_REGS;
39694 else
39695 return NO_REGS;
39698 return regclass;
39701 /* Discourage putting floating-point values in SSE registers unless
39702 SSE math is being used, and likewise for the 387 registers. */
39703 static reg_class_t
39704 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
39706 machine_mode mode = GET_MODE (x);
39708 /* Restrict the output reload class to the register bank that we are doing
39709 math on. If we would like not to return a subset of CLASS, reject this
39710 alternative: if reload cannot do this, it will still use its choice. */
39711 mode = GET_MODE (x);
39712 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39713 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
39715 if (IS_STACK_MODE (mode))
39717 if (regclass == FP_TOP_SSE_REGS)
39718 return FP_TOP_REG;
39719 else if (regclass == FP_SECOND_SSE_REGS)
39720 return FP_SECOND_REG;
39721 else
39722 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
39725 return regclass;
39728 static reg_class_t
39729 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
39730 machine_mode mode, secondary_reload_info *sri)
39732 /* Double-word spills from general registers to non-offsettable memory
39733 references (zero-extended addresses) require special handling. */
39734 if (TARGET_64BIT
39735 && MEM_P (x)
39736 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
39737 && INTEGER_CLASS_P (rclass)
39738 && !offsettable_memref_p (x))
39740 sri->icode = (in_p
39741 ? CODE_FOR_reload_noff_load
39742 : CODE_FOR_reload_noff_store);
39743 /* Add the cost of moving address to a temporary. */
39744 sri->extra_cost = 1;
39746 return NO_REGS;
39749 /* QImode spills from non-QI registers require an
39750 intermediate register on 32-bit targets. */
39751 if (mode == QImode
39752 && ((!TARGET_64BIT && !in_p
39753 && INTEGER_CLASS_P (rclass)
39754 && MAYBE_NON_Q_CLASS_P (rclass))
39755 || (!TARGET_AVX512DQ
39756 && MAYBE_MASK_CLASS_P (rclass))))
39758 int regno = true_regnum (x);
39760 /* Return Q_REGS if the operand is in memory. */
39761 if (regno == -1)
39762 return Q_REGS;
39764 return NO_REGS;
39767 /* This condition handles corner case where an expression involving
39768 pointers gets vectorized. We're trying to use the address of a
39769 stack slot as a vector initializer.
39771 (set (reg:V2DI 74 [ vect_cst_.2 ])
39772 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
39774 Eventually frame gets turned into sp+offset like this:
39776 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39777 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39778 (const_int 392 [0x188]))))
39780 That later gets turned into:
39782 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39783 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
39784 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
39786 We'll have the following reload recorded:
39788 Reload 0: reload_in (DI) =
39789 (plus:DI (reg/f:DI 7 sp)
39790 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
39791 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39792 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
39793 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
39794 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
39795 reload_reg_rtx: (reg:V2DI 22 xmm1)
39797 Which isn't going to work since SSE instructions can't handle scalar
39798 additions. Returning GENERAL_REGS forces the addition into integer
39799 register and reload can handle subsequent reloads without problems. */
39801 if (in_p && GET_CODE (x) == PLUS
39802 && SSE_CLASS_P (rclass)
39803 && SCALAR_INT_MODE_P (mode))
39804 return GENERAL_REGS;
39806 return NO_REGS;
39809 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
39811 static bool
39812 ix86_class_likely_spilled_p (reg_class_t rclass)
39814 switch (rclass)
39816 case AREG:
39817 case DREG:
39818 case CREG:
39819 case BREG:
39820 case AD_REGS:
39821 case SIREG:
39822 case DIREG:
39823 case SSE_FIRST_REG:
39824 case FP_TOP_REG:
39825 case FP_SECOND_REG:
39826 return true;
39828 default:
39829 break;
39832 return false;
39835 /* If we are copying between registers from different register sets
39836 (e.g. FP and integer), we may need a memory location.
39838 The function can't work reliably when one of the CLASSES is a class
39839 containing registers from multiple sets. We avoid this by never combining
39840 different sets in a single alternative in the machine description.
39841 Ensure that this constraint holds to avoid unexpected surprises.
39843 When STRICT is false, we are being called from REGISTER_MOVE_COST,
39844 so do not enforce these sanity checks.
39846 To optimize register_move_cost performance, define inline variant. */
39848 static inline bool
39849 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
39850 reg_class_t class2, int strict)
39852 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
39853 return false;
39855 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
39856 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
39857 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
39858 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
39859 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
39860 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
39861 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
39862 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
39864 gcc_assert (!strict || lra_in_progress);
39865 return true;
39868 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
39869 return true;
39871 /* Between mask and general, we have moves no larger than word size. */
39872 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
39873 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
39874 return true;
39876 /* ??? This is a lie. We do have moves between mmx/general, and for
39877 mmx/sse2. But by saying we need secondary memory we discourage the
39878 register allocator from using the mmx registers unless needed. */
39879 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
39880 return true;
39882 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39884 /* SSE1 doesn't have any direct moves from other classes. */
39885 if (!TARGET_SSE2)
39886 return true;
39888 /* If the target says that inter-unit moves are more expensive
39889 than moving through memory, then don't generate them. */
39890 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
39891 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
39892 return true;
39894 /* Between SSE and general, we have moves no larger than word size. */
39895 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39896 return true;
39899 return false;
39902 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
39904 static bool
39905 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
39906 reg_class_t class2)
39908 return inline_secondary_memory_needed (mode, class1, class2, true);
39911 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
39913 get_secondary_mem widens integral modes to BITS_PER_WORD.
39914 There is no need to emit full 64 bit move on 64 bit targets
39915 for integral modes that can be moved using 32 bit move. */
39917 static machine_mode
39918 ix86_secondary_memory_needed_mode (machine_mode mode)
39920 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
39921 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
39922 return mode;
39925 /* Implement the TARGET_CLASS_MAX_NREGS hook.
39927 On the 80386, this is the size of MODE in words,
39928 except in the FP regs, where a single reg is always enough. */
39930 static unsigned char
39931 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
39933 if (MAYBE_INTEGER_CLASS_P (rclass))
39935 if (mode == XFmode)
39936 return (TARGET_64BIT ? 2 : 3);
39937 else if (mode == XCmode)
39938 return (TARGET_64BIT ? 4 : 6);
39939 else
39940 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39942 else
39944 if (COMPLEX_MODE_P (mode))
39945 return 2;
39946 else
39947 return 1;
39951 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
39953 static bool
39954 ix86_can_change_mode_class (machine_mode from, machine_mode to,
39955 reg_class_t regclass)
39957 if (from == to)
39958 return true;
39960 /* x87 registers can't do subreg at all, as all values are reformatted
39961 to extended precision. */
39962 if (MAYBE_FLOAT_CLASS_P (regclass))
39963 return false;
39965 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
39967 /* Vector registers do not support QI or HImode loads. If we don't
39968 disallow a change to these modes, reload will assume it's ok to
39969 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
39970 the vec_dupv4hi pattern. */
39971 if (GET_MODE_SIZE (from) < 4)
39972 return false;
39975 return true;
39978 /* Return index of MODE in the sse load/store tables. */
39980 static inline int
39981 sse_store_index (machine_mode mode)
39983 switch (GET_MODE_SIZE (mode))
39985 case 4:
39986 return 0;
39987 case 8:
39988 return 1;
39989 case 16:
39990 return 2;
39991 case 32:
39992 return 3;
39993 case 64:
39994 return 4;
39995 default:
39996 return -1;
40000 /* Return the cost of moving data of mode M between a
40001 register and memory. A value of 2 is the default; this cost is
40002 relative to those in `REGISTER_MOVE_COST'.
40004 This function is used extensively by register_move_cost that is used to
40005 build tables at startup. Make it inline in this case.
40006 When IN is 2, return maximum of in and out move cost.
40008 If moving between registers and memory is more expensive than
40009 between two registers, you should define this macro to express the
40010 relative cost.
40012 Also model the increased cost of moving QImode registers in non
40013 Q_REGS classes.
40015 static inline int
40016 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
40017 int in)
40019 int cost;
40020 if (FLOAT_CLASS_P (regclass))
40022 int index;
40023 switch (mode)
40025 case E_SFmode:
40026 index = 0;
40027 break;
40028 case E_DFmode:
40029 index = 1;
40030 break;
40031 case E_XFmode:
40032 index = 2;
40033 break;
40034 default:
40035 return 100;
40037 if (in == 2)
40038 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
40039 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
40041 if (SSE_CLASS_P (regclass))
40043 int index = sse_store_index (mode);
40044 if (index == -1)
40045 return 100;
40046 if (in == 2)
40047 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
40048 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
40050 if (MMX_CLASS_P (regclass))
40052 int index;
40053 switch (GET_MODE_SIZE (mode))
40055 case 4:
40056 index = 0;
40057 break;
40058 case 8:
40059 index = 1;
40060 break;
40061 default:
40062 return 100;
40064 if (in == 2)
40065 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
40066 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
40068 switch (GET_MODE_SIZE (mode))
40070 case 1:
40071 if (Q_CLASS_P (regclass) || TARGET_64BIT)
40073 if (!in)
40074 return ix86_cost->int_store[0];
40075 if (TARGET_PARTIAL_REG_DEPENDENCY
40076 && optimize_function_for_speed_p (cfun))
40077 cost = ix86_cost->movzbl_load;
40078 else
40079 cost = ix86_cost->int_load[0];
40080 if (in == 2)
40081 return MAX (cost, ix86_cost->int_store[0]);
40082 return cost;
40084 else
40086 if (in == 2)
40087 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
40088 if (in)
40089 return ix86_cost->movzbl_load;
40090 else
40091 return ix86_cost->int_store[0] + 4;
40093 break;
40094 case 2:
40095 if (in == 2)
40096 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
40097 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
40098 default:
40099 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
40100 if (mode == TFmode)
40101 mode = XFmode;
40102 if (in == 2)
40103 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
40104 else if (in)
40105 cost = ix86_cost->int_load[2];
40106 else
40107 cost = ix86_cost->int_store[2];
40108 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
40112 static int
40113 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
40114 bool in)
40116 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
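/* Note that this hook only distinguishes loads (IN true) from stores,
   so the IN == 2 "maximum of load and store" case above is used only
   internally, by ix86_register_move_cost below.  */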
40120 /* Return the cost of moving data from a register in class CLASS1 to
40121 one in class CLASS2.
40123 It is not required that the cost always equal 2 when FROM is the same as TO;
40124 on some machines it is expensive to move between registers if they are not
40125 general registers. */
40127 static int
40128 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
40129 reg_class_t class2_i)
40131 enum reg_class class1 = (enum reg_class) class1_i;
40132 enum reg_class class2 = (enum reg_class) class2_i;
40134 /* In case we require secondary memory, compute cost of the store followed
40135 by load. In order to avoid bad register allocation choices, we need
40136 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
40138 if (inline_secondary_memory_needed (mode, class1, class2, false))
40140 int cost = 1;
40142 cost += inline_memory_move_cost (mode, class1, 2);
40143 cost += inline_memory_move_cost (mode, class2, 2);
40145 /* In case of copying from a general purpose register we may emit
40146 multiple stores followed by a single load, causing a memory size
40147 mismatch stall. Count this as an arbitrarily high cost of 20. */
40148 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
40149 && TARGET_MEMORY_MISMATCH_STALL
40150 && targetm.class_max_nregs (class1, mode)
40151 > targetm.class_max_nregs (class2, mode))
40152 cost += 20;
40154 /* In the case of FP/MMX moves, the registers actually overlap, and we
40155 have to switch modes in order to treat them differently. */
40156 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
40157 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
40158 cost += 20;
40160 return cost;
40163 /* Moves between SSE/MMX and integer unit are expensive. */
40164 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
40165 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
40167 /* ??? By keeping returned value relatively high, we limit the number
40168 of moves between integer and MMX/SSE registers for all targets.
40169 Additionally, high value prevents problem with x86_modes_tieable_p(),
40170 where integer modes in MMX/SSE registers are not tieable
40171 because of missing QImode and HImode moves to, from or between
40172 MMX/SSE registers. */
40173 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
40174 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
40176 if (MAYBE_FLOAT_CLASS_P (class1))
40177 return ix86_cost->fp_move;
40178 if (MAYBE_SSE_CLASS_P (class1))
40180 if (GET_MODE_BITSIZE (mode) <= 128)
40181 return ix86_cost->xmm_move;
40182 if (GET_MODE_BITSIZE (mode) <= 256)
40183 return ix86_cost->ymm_move;
40184 return ix86_cost->zmm_move;
40186 if (MAYBE_MMX_CLASS_P (class1))
40187 return ix86_cost->mmx_move;
40188 return 2;
40191 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
40192 words of a value of mode MODE but can be less for certain modes in
40193 special long registers.
40195 Actually there are no two word move instructions for consecutive
40196 registers. And only registers 0-3 may have mov byte instructions
40197 applied to them. */
40199 static unsigned int
40200 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
40202 if (GENERAL_REGNO_P (regno))
40204 if (mode == XFmode)
40205 return TARGET_64BIT ? 2 : 3;
40206 if (mode == XCmode)
40207 return TARGET_64BIT ? 4 : 6;
40208 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
40210 if (COMPLEX_MODE_P (mode))
40211 return 2;
40212 if (mode == V64SFmode || mode == V64SImode)
40213 return 4;
40214 return 1;
40217 /* Implement TARGET_HARD_REGNO_MODE_OK. */
40219 static bool
40220 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
40222 /* The flags register, and only the flags register, can hold CCmode values. */
40223 if (CC_REGNO_P (regno))
40224 return GET_MODE_CLASS (mode) == MODE_CC;
40225 if (GET_MODE_CLASS (mode) == MODE_CC
40226 || GET_MODE_CLASS (mode) == MODE_RANDOM
40227 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
40228 return false;
40229 if (STACK_REGNO_P (regno))
40230 return VALID_FP_MODE_P (mode);
40231 if (MASK_REGNO_P (regno))
40232 return (VALID_MASK_REG_MODE (mode)
40233 || (TARGET_AVX512BW
40234 && VALID_MASK_AVX512BW_MODE (mode)));
40235 if (SSE_REGNO_P (regno))
40237 /* We implement the move patterns for all vector modes into and
40238 out of SSE registers, even when no operation instructions
40239 are available. */
40241 /* For AVX-512 we allow, regardless of regno:
40242 - XI mode
40243 - any 512-bit wide vector mode
40244 - any scalar mode. */
40245 if (TARGET_AVX512F
40246 && (mode == XImode
40247 || VALID_AVX512F_REG_MODE (mode)
40248 || VALID_AVX512F_SCALAR_MODE (mode)))
40249 return true;
40251 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
40252 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40253 && MOD4_SSE_REGNO_P (regno)
40254 && mode == V64SFmode)
40255 return true;
40257 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
40258 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
40259 && MOD4_SSE_REGNO_P (regno)
40260 && mode == V64SImode)
40261 return true;
40263 /* TODO check for QI/HI scalars. */
40264 /* AVX512VL allows SSE regs 16+ for 128/256-bit modes. */
40265 if (TARGET_AVX512VL
40266 && (mode == OImode
40267 || mode == TImode
40268 || VALID_AVX256_REG_MODE (mode)
40269 || VALID_AVX512VL_128_REG_MODE (mode)))
40270 return true;
40272 /* xmm16-xmm31 are only available for AVX-512. */
40273 if (EXT_REX_SSE_REGNO_P (regno))
40274 return false;
40276 /* OImode and AVX modes are available only when AVX is enabled. */
40277 return ((TARGET_AVX
40278 && VALID_AVX256_REG_OR_OI_MODE (mode))
40279 || VALID_SSE_REG_MODE (mode)
40280 || VALID_SSE2_REG_MODE (mode)
40281 || VALID_MMX_REG_MODE (mode)
40282 || VALID_MMX_REG_MODE_3DNOW (mode));
40284 if (MMX_REGNO_P (regno))
40286 /* We implement the move patterns for 3DNOW modes even in MMX mode,
40287 so if the register is available at all, then we can move data of
40288 the given mode into or out of it. */
40289 return (VALID_MMX_REG_MODE (mode)
40290 || VALID_MMX_REG_MODE_3DNOW (mode));
40293 if (mode == QImode)
40295 /* Take care for QImode values - they can be in non-QI regs,
40296 but then they do cause partial register stalls. */
40297 if (ANY_QI_REGNO_P (regno))
40298 return true;
40299 if (!TARGET_PARTIAL_REG_STALL)
40300 return true;
40301 /* LRA checks if the hard register is OK for the given mode.
40302 QImode values can live in non-QI regs, so we allow all
40303 registers here. */
40304 if (lra_in_progress)
40305 return true;
40306 return !can_create_pseudo_p ();
40308 /* We handle both integer and floats in the general purpose registers. */
40309 else if (VALID_INT_MODE_P (mode))
40310 return true;
40311 else if (VALID_FP_MODE_P (mode))
40312 return true;
40313 else if (VALID_DFP_MODE_P (mode))
40314 return true;
40315 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
40316 on to use that value in smaller contexts, this can easily force a
40317 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
40318 supporting DImode, allow it. */
40319 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
40320 return true;
40322 return false;
40325 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
40326 saves SSE registers across calls is Win64 (thus no need to check the
40327 current ABI here), and with AVX enabled Win64 only guarantees that
40328 the low 16 bytes are saved. */
40330 static bool
40331 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
40333 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
40336 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
40337 tieable integer mode. */
40339 static bool
40340 ix86_tieable_integer_mode_p (machine_mode mode)
40342 switch (mode)
40344 case E_HImode:
40345 case E_SImode:
40346 return true;
40348 case E_QImode:
40349 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
40351 case E_DImode:
40352 return TARGET_64BIT;
40354 default:
40355 return false;
40359 /* Implement TARGET_MODES_TIEABLE_P.
40361 Return true if MODE1 is accessible in a register that can hold MODE2
40362 without copying. That is, all register classes that can hold MODE2
40363 can also hold MODE1. */
40365 static bool
40366 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
40368 if (mode1 == mode2)
40369 return true;
40371 if (ix86_tieable_integer_mode_p (mode1)
40372 && ix86_tieable_integer_mode_p (mode2))
40373 return true;
40375 /* MODE2 being XFmode implies fp stack or general regs, which means we
40376 can tie any smaller floating point modes to it. Note that we do not
40377 tie this with TFmode. */
40378 if (mode2 == XFmode)
40379 return mode1 == SFmode || mode1 == DFmode;
40381 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
40382 that we can tie it with SFmode. */
40383 if (mode2 == DFmode)
40384 return mode1 == SFmode;
40386 /* If MODE2 is only appropriate for an SSE register, then tie with
40387 any other mode acceptable to SSE registers. */
40388 if (GET_MODE_SIZE (mode2) == 32
40389 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40390 return (GET_MODE_SIZE (mode1) == 32
40391 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40392 if (GET_MODE_SIZE (mode2) == 16
40393 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
40394 return (GET_MODE_SIZE (mode1) == 16
40395 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
40397 /* If MODE2 is appropriate for an MMX register, then tie
40398 with any other mode acceptable to MMX registers. */
40399 if (GET_MODE_SIZE (mode2) == 8
40400 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
40401 return (GET_MODE_SIZE (mode1) == 8
40402 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
40404 return false;
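/* For example, under the rules above SFmode and DFmode both tie with
   XFmode, while a 16-byte vector mode such as V4SFmode ties only with
   other 16-byte modes that SSE registers accept.  */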
40407 /* Return the cost of moving between two registers of mode MODE. */
40409 static int
40410 ix86_set_reg_reg_cost (machine_mode mode)
40412 unsigned int units = UNITS_PER_WORD;
40414 switch (GET_MODE_CLASS (mode))
40416 default:
40417 break;
40419 case MODE_CC:
40420 units = GET_MODE_SIZE (CCmode);
40421 break;
40423 case MODE_FLOAT:
40424 if ((TARGET_SSE && mode == TFmode)
40425 || (TARGET_80387 && mode == XFmode)
40426 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
40427 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
40428 units = GET_MODE_SIZE (mode);
40429 break;
40431 case MODE_COMPLEX_FLOAT:
40432 if ((TARGET_SSE && mode == TCmode)
40433 || (TARGET_80387 && mode == XCmode)
40434 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
40435 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
40436 units = GET_MODE_SIZE (mode);
40437 break;
40439 case MODE_VECTOR_INT:
40440 case MODE_VECTOR_FLOAT:
40441 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
40442 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
40443 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
40444 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
40445 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
40446 units = GET_MODE_SIZE (mode);
40449 /* Return the cost of moving between two registers of mode MODE,
40450 assuming that the move will be in pieces of at most UNITS bytes. */
40451 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
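/* For example, with UNITS_PER_WORD == 4 a DImode register-to-register
   copy is costed as COSTS_N_INSNS (2) (two word-sized moves), while an
   SImode copy is COSTS_N_INSNS (1).  */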
40454 /* Return cost of vector operation in MODE given that scalar version has
40455 COST. If PARALLEL is true assume that CPU has more than one unit
40456 performing the operation. */
40458 static int
40459 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
40461 if (!VECTOR_MODE_P (mode))
40462 return cost;
40464 if (!parallel)
40465 return cost * GET_MODE_NUNITS (mode);
40466 if (GET_MODE_BITSIZE (mode) == 128
40467 && TARGET_SSE_SPLIT_REGS)
40468 return cost * 2;
40469 if (GET_MODE_BITSIZE (mode) > 128
40470 && TARGET_AVX128_OPTIMAL)
40471 return cost * GET_MODE_BITSIZE (mode) / 128;
40472 return cost;
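/* For example, a V4SFmode operation with PARALLEL false is costed as
   4 * COST (one scalar operation per element), while with PARALLEL true
   a 256-bit operation on a TARGET_AVX128_OPTIMAL CPU costs 2 * COST,
   since it is split into two 128-bit halves.  */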
40475 /* Return cost of multiplication in MODE. */
40477 static int
40478 ix86_multiplication_cost (const struct processor_costs *cost,
40479 enum machine_mode mode)
40481 machine_mode inner_mode = mode;
40482 if (VECTOR_MODE_P (mode))
40483 inner_mode = GET_MODE_INNER (mode);
40485 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40486 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
40487 else if (X87_FLOAT_MODE_P (mode))
40488 return cost->fmul;
40489 else if (FLOAT_MODE_P (mode))
40490 return ix86_vec_cost (mode,
40491 inner_mode == DFmode
40492 ? cost->mulsd : cost->mulss, true);
40493 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40495 /* vpmullq is used in this case. No emulation is needed. */
40496 if (TARGET_AVX512DQ)
40497 return ix86_vec_cost (mode, cost->mulss, true);
40499 /* V*QImode is emulated with 7-13 insns. */
40500 if (mode == V16QImode || mode == V32QImode)
40502 int extra = 11;
40503 if (TARGET_XOP && mode == V16QImode)
40504 extra = 5;
40505 else if (TARGET_SSSE3)
40506 extra = 6;
40507 return ix86_vec_cost (mode,
40508 cost->mulss * 2 + cost->sse_op * extra,
40509 true);
40511 /* V*DImode is emulated with 5-8 insns. */
40512 else if (mode == V2DImode || mode == V4DImode)
40514 if (TARGET_XOP && mode == V2DImode)
40515 return ix86_vec_cost (mode,
40516 cost->mulss * 2 + cost->sse_op * 3,
40517 true);
40518 else
40519 return ix86_vec_cost (mode,
40520 cost->mulss * 3 + cost->sse_op * 5,
40521 true);
40523 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
40524 insns, including two PMULUDQ. */
40525 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
40526 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
40527 true);
40528 else
40529 return ix86_vec_cost (mode, cost->mulss, true);
40531 else
40532 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
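/* For example, a scalar integer multiply by a non-constant operand is
   charged mult_init for the mode plus 7 * mult_bit, matching the
   nbits == 7 default used for MULT in ix86_rtx_costs below.  */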
40535 /* Return cost of division in MODE. */
40537 static int
40538 ix86_division_cost (const struct processor_costs *cost,
40539 enum machine_mode mode)
40541 machine_mode inner_mode = mode;
40542 if (VECTOR_MODE_P (mode))
40543 inner_mode = GET_MODE_INNER (mode);
40545 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40546 return inner_mode == DFmode ? cost->divsd : cost->divss;
40547 else if (X87_FLOAT_MODE_P (mode))
40548 return cost->fdiv;
40549 else if (FLOAT_MODE_P (mode))
40550 return ix86_vec_cost (mode,
40551 inner_mode == DFmode ? cost->divsd : cost->divss,
40552 true);
40553 else
40554 return cost->divide[MODE_INDEX (mode)];
40557 /* Return cost of shift in MODE.
40558 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
40559 AND_IN_OP1 specifies whether op1 is the result of an AND, and
40560 SHIFT_AND_TRUNCATE whether op1 is the result of a subreg.
40562 SKIP_OP0/1 is set to true if the cost of OP0/1 should be ignored. */
40564 static int
40565 ix86_shift_rotate_cost (const struct processor_costs *cost,
40566 enum machine_mode mode, bool constant_op1,
40567 HOST_WIDE_INT op1_val,
40568 bool speed,
40569 bool and_in_op1,
40570 bool shift_and_truncate,
40571 bool *skip_op0, bool *skip_op1)
40573 if (skip_op0)
40574 *skip_op0 = *skip_op1 = false;
40575 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
40577 /* V*QImode is emulated with 1-11 insns. */
40578 if (mode == V16QImode || mode == V32QImode)
40580 int count = 11;
40581 if (TARGET_XOP && mode == V16QImode)
40583 /* For XOP we use vpshab, which requires a broadcast of the
40584 value to the variable shift insn. For constants this
40585 means a V16Q const in mem; even when we can perform the
40586 shift with one insn, set the cost to prefer paddb. */
40587 if (constant_op1)
40589 if (skip_op1)
40590 *skip_op1 = true;
40591 return ix86_vec_cost (mode,
40592 cost->sse_op
40593 + (speed
40595 : COSTS_N_BYTES
40596 (GET_MODE_UNIT_SIZE (mode))), true);
40598 count = 3;
40600 else if (TARGET_SSSE3)
40601 count = 7;
40602 return ix86_vec_cost (mode, cost->sse_op * count, true);
40604 else
40605 return ix86_vec_cost (mode, cost->sse_op, true);
40607 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40609 if (constant_op1)
40611 if (op1_val > 32)
40612 return cost->shift_const + COSTS_N_INSNS (2);
40613 else
40614 return cost->shift_const * 2;
40616 else
40618 if (and_in_op1)
40619 return cost->shift_var * 2;
40620 else
40621 return cost->shift_var * 6 + COSTS_N_INSNS (2);
40624 else
40626 if (constant_op1)
40627 return cost->shift_const;
40628 else if (shift_and_truncate)
40630 if (skip_op0)
40631 *skip_op0 = *skip_op1 = true;
40632 /* Return the cost after shift-and truncation. */
40633 return cost->shift_var;
40635 else
40636 return cost->shift_var;
40638 return cost->shift_const;
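/* For example, on a 32-bit target a DImode shift (wider than a word) by
   a constant of at most 32 is costed as shift_const * 2, while a shift
   by a non-constant count costs shift_var * 6 + COSTS_N_INSNS (2)
   unless the count comes from an AND.  */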
40641 /* Compute a (partial) cost for rtx X. Return true if the complete
40642 cost has been computed, and false if subexpressions should be
40643 scanned. In either case, *TOTAL contains the cost result. */
40645 static bool
40646 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
40647 int *total, bool speed)
40649 rtx mask;
40650 enum rtx_code code = GET_CODE (x);
40651 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
40652 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
40653 int src_cost;
40655 switch (code)
40657 case SET:
40658 if (register_operand (SET_DEST (x), VOIDmode)
40659 && reg_or_0_operand (SET_SRC (x), VOIDmode))
40661 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
40662 return true;
40665 if (register_operand (SET_SRC (x), VOIDmode))
40666 /* Avoid potentially incorrect high cost from rtx_costs
40667 for non-tieable SUBREGs. */
40668 src_cost = 0;
40669 else
40671 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
40673 if (CONSTANT_P (SET_SRC (x)))
40674 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
40675 a small value, possibly zero for cheap constants. */
40676 src_cost += COSTS_N_INSNS (1);
40679 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
40680 return true;
40682 case CONST_INT:
40683 case CONST:
40684 case LABEL_REF:
40685 case SYMBOL_REF:
40686 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
40687 *total = 3;
40688 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
40689 *total = 2;
40690 else if (flag_pic && SYMBOLIC_CONST (x)
40691 && !(TARGET_64BIT
40692 && (GET_CODE (x) == LABEL_REF
40693 || (GET_CODE (x) == SYMBOL_REF
40694 && SYMBOL_REF_LOCAL_P (x))))
40695 /* Use 0 cost for CONST to improve its propagation. */
40696 && (TARGET_64BIT || GET_CODE (x) != CONST))
40697 *total = 1;
40698 else
40699 *total = 0;
40700 return true;
40702 case CONST_DOUBLE:
40703 if (IS_STACK_MODE (mode))
40704 switch (standard_80387_constant_p (x))
40706 case -1:
40707 case 0:
40708 break;
40709 case 1: /* 0.0 */
40710 *total = 1;
40711 return true;
40712 default: /* Other constants */
40713 *total = 2;
40714 return true;
40716 /* FALLTHRU */
40718 case CONST_VECTOR:
40719 switch (standard_sse_constant_p (x, mode))
40721 case 0:
40722 break;
40723 case 1: /* 0: xor eliminates false dependency */
40724 *total = 0;
40725 return true;
40726 default: /* -1: cmp contains false dependency */
40727 *total = 1;
40728 return true;
40730 /* FALLTHRU */
40732 case CONST_WIDE_INT:
40733 /* Fall back to (MEM (SYMBOL_REF)), since that's where
40734 it'll probably end up. Add a penalty for size. */
40735 *total = (COSTS_N_INSNS (1)
40736 + (!TARGET_64BIT && flag_pic)
40737 + (GET_MODE_SIZE (mode) <= 4
40738 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
40739 return true;
40741 case ZERO_EXTEND:
40742 /* Zero extension is often completely free on x86_64, so make
40743 it as cheap as possible. */
40744 if (TARGET_64BIT && mode == DImode
40745 && GET_MODE (XEXP (x, 0)) == SImode)
40746 *total = 1;
40747 else if (TARGET_ZERO_EXTEND_WITH_AND)
40748 *total = cost->add;
40749 else
40750 *total = cost->movzx;
40751 return false;
40753 case SIGN_EXTEND:
40754 *total = cost->movsx;
40755 return false;
40757 case ASHIFT:
40758 if (SCALAR_INT_MODE_P (mode)
40759 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
40760 && CONST_INT_P (XEXP (x, 1)))
40762 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40763 if (value == 1)
40765 *total = cost->add;
40766 return false;
40768 if ((value == 2 || value == 3)
40769 && cost->lea <= cost->shift_const)
40771 *total = cost->lea;
40772 return false;
40775 /* FALLTHRU */
40777 case ROTATE:
40778 case ASHIFTRT:
40779 case LSHIFTRT:
40780 case ROTATERT:
40781 bool skip_op0, skip_op1;
40782 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
40783 CONST_INT_P (XEXP (x, 1))
40784 ? INTVAL (XEXP (x, 1)) : -1,
40785 speed,
40786 GET_CODE (XEXP (x, 1)) == AND,
40787 SUBREG_P (XEXP (x, 1))
40788 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
40789 &skip_op0, &skip_op1);
40790 if (skip_op0 || skip_op1)
40792 if (!skip_op0)
40793 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
40794 if (!skip_op1)
40795 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
40796 return true;
40798 return false;
40800 case FMA:
40802 rtx sub;
40804 gcc_assert (FLOAT_MODE_P (mode));
40805 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
40807 *total = ix86_vec_cost (mode,
40808 mode == SFmode ? cost->fmass : cost->fmasd,
40809 true);
40810 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
40812 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
40813 sub = XEXP (x, 0);
40814 if (GET_CODE (sub) == NEG)
40815 sub = XEXP (sub, 0);
40816 *total += rtx_cost (sub, mode, FMA, 0, speed);
40818 sub = XEXP (x, 2);
40819 if (GET_CODE (sub) == NEG)
40820 sub = XEXP (sub, 0);
40821 *total += rtx_cost (sub, mode, FMA, 2, speed);
40822 return true;
40825 case MULT:
40826 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
40828 rtx op0 = XEXP (x, 0);
40829 rtx op1 = XEXP (x, 1);
40830 int nbits;
40831 if (CONST_INT_P (XEXP (x, 1)))
40833 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
40834 for (nbits = 0; value != 0; value &= value - 1)
40835 nbits++;
40837 else
40838 /* This is arbitrary. */
40839 nbits = 7;
40841 /* Compute costs correctly for widening multiplication. */
40842 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
40843 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
40844 == GET_MODE_SIZE (mode))
40846 int is_mulwiden = 0;
40847 machine_mode inner_mode = GET_MODE (op0);
40849 if (GET_CODE (op0) == GET_CODE (op1))
40850 is_mulwiden = 1, op1 = XEXP (op1, 0);
40851 else if (CONST_INT_P (op1))
40853 if (GET_CODE (op0) == SIGN_EXTEND)
40854 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
40855 == INTVAL (op1);
40856 else
40857 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
40860 if (is_mulwiden)
40861 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
40864 *total = (cost->mult_init[MODE_INDEX (mode)]
40865 + nbits * cost->mult_bit
40866 + rtx_cost (op0, mode, outer_code, opno, speed)
40867 + rtx_cost (op1, mode, outer_code, opno, speed));
40869 return true;
40871 *total = ix86_multiplication_cost (cost, mode);
40872 return false;
40874 case DIV:
40875 case UDIV:
40876 case MOD:
40877 case UMOD:
40878 *total = ix86_division_cost (cost, mode);
40879 return false;
40881 case PLUS:
40882 if (GET_MODE_CLASS (mode) == MODE_INT
40883 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
40885 if (GET_CODE (XEXP (x, 0)) == PLUS
40886 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
40887 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
40888 && CONSTANT_P (XEXP (x, 1)))
40890 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
40891 if (val == 2 || val == 4 || val == 8)
40893 *total = cost->lea;
40894 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40895 outer_code, opno, speed);
40896 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
40897 outer_code, opno, speed);
40898 *total += rtx_cost (XEXP (x, 1), mode,
40899 outer_code, opno, speed);
40900 return true;
40903 else if (GET_CODE (XEXP (x, 0)) == MULT
40904 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
40906 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
40907 if (val == 2 || val == 4 || val == 8)
40909 *total = cost->lea;
40910 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40911 outer_code, opno, speed);
40912 *total += rtx_cost (XEXP (x, 1), mode,
40913 outer_code, opno, speed);
40914 return true;
40917 else if (GET_CODE (XEXP (x, 0)) == PLUS)
40919 /* Add with carry, ignore the cost of adding a carry flag. */
40920 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
40921 *total = cost->add;
40922 else
40924 *total = cost->lea;
40925 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40926 outer_code, opno, speed);
40929 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
40930 outer_code, opno, speed);
40931 *total += rtx_cost (XEXP (x, 1), mode,
40932 outer_code, opno, speed);
40933 return true;
40936 /* FALLTHRU */
40938 case MINUS:
40939 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
40940 if (GET_MODE_CLASS (mode) == MODE_INT
40941 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
40942 && GET_CODE (XEXP (x, 0)) == MINUS
40943 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
40945 *total = cost->add;
40946 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
40947 outer_code, opno, speed);
40948 *total += rtx_cost (XEXP (x, 1), mode,
40949 outer_code, opno, speed);
40950 return true;
40953 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40955 *total = cost->addss;
40956 return false;
40958 else if (X87_FLOAT_MODE_P (mode))
40960 *total = cost->fadd;
40961 return false;
40963 else if (FLOAT_MODE_P (mode))
40965 *total = ix86_vec_cost (mode, cost->addss, true);
40966 return false;
40968 /* FALLTHRU */
40970 case AND:
40971 case IOR:
40972 case XOR:
40973 if (GET_MODE_CLASS (mode) == MODE_INT
40974 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
40976 *total = (cost->add * 2
40977 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
40978 << (GET_MODE (XEXP (x, 0)) != DImode))
40979 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
40980 << (GET_MODE (XEXP (x, 1)) != DImode)));
40981 return true;
40983 /* FALLTHRU */
40985 case NEG:
40986 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40988 *total = cost->sse_op;
40989 return false;
40991 else if (X87_FLOAT_MODE_P (mode))
40993 *total = cost->fchs;
40994 return false;
40996 else if (FLOAT_MODE_P (mode))
40998 *total = ix86_vec_cost (mode, cost->sse_op, true);
40999 return false;
41001 /* FALLTHRU */
41003 case NOT:
41004 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
41005 *total = ix86_vec_cost (mode, cost->sse_op, true);
41006 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
41007 *total = cost->add * 2;
41008 else
41009 *total = cost->add;
41010 return false;
41012 case COMPARE:
41013 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
41014 && XEXP (XEXP (x, 0), 1) == const1_rtx
41015 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
41016 && XEXP (x, 1) == const0_rtx)
41018 /* This kind of construct is implemented using test[bwl].
41019 Treat it as if we had an AND. */
41020 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
41021 *total = (cost->add
41022 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
41023 opno, speed)
41024 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
41025 return true;
41028 /* The embedded comparison operand is completely free. */
41029 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
41030 && XEXP (x, 1) == const0_rtx)
41031 *total = 0;
41033 return false;
41035 case FLOAT_EXTEND:
41036 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41037 *total = 0;
41038 else
41039 *total = ix86_vec_cost (mode, cost->addss, true);
41040 return false;
41042 case FLOAT_TRUNCATE:
41043 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
41044 *total = cost->fadd;
41045 else
41046 *total = ix86_vec_cost (mode, cost->addss, true);
41047 return false;
41049 case ABS:
41050 /* SSE requires memory load for the constant operand. It may make
41051 sense to account for this. Of course the constant operand may or
41052 may not be reused. */
41053 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41054 *total = cost->sse_op;
41055 else if (X87_FLOAT_MODE_P (mode))
41056 *total = cost->fabs;
41057 else if (FLOAT_MODE_P (mode))
41058 *total = ix86_vec_cost (mode, cost->sse_op, true);
41059 return false;
41061 case SQRT:
41062 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
41063 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
41064 else if (X87_FLOAT_MODE_P (mode))
41065 *total = cost->fsqrt;
41066 else if (FLOAT_MODE_P (mode))
41067 *total = ix86_vec_cost (mode,
41068 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
41069 true);
41070 return false;
41072 case UNSPEC:
41073 if (XINT (x, 1) == UNSPEC_TP)
41074 *total = 0;
41075 return false;
41077 case VEC_SELECT:
41078 case VEC_CONCAT:
41079 case VEC_DUPLICATE:
41080 /* ??? Assume all of these vector manipulation patterns are
41081 recognizable. In which case they all pretty much have the
41082 same cost. */
41083 *total = cost->sse_op;
41084 return true;
41085 case VEC_MERGE:
41086 mask = XEXP (x, 2);
41087 /* This is a masked instruction; assume the same cost as
41088 the non-masked variant. */
41089 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
41090 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
41091 else
41092 *total = cost->sse_op;
41093 return true;
41095 default:
41096 return false;
41100 #if TARGET_MACHO
41102 static int current_machopic_label_num;
41104 /* Given a symbol name and its associated stub, write out the
41105 definition of the stub. */
41107 void
41108 machopic_output_stub (FILE *file, const char *symb, const char *stub)
41110 unsigned int length;
41111 char *binder_name, *symbol_name, lazy_ptr_name[32];
41112 int label = ++current_machopic_label_num;
41114 /* For 64-bit we shouldn't get here. */
41115 gcc_assert (!TARGET_64BIT);
41117 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
41118 symb = targetm.strip_name_encoding (symb);
41120 length = strlen (stub);
41121 binder_name = XALLOCAVEC (char, length + 32);
41122 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
41124 length = strlen (symb);
41125 symbol_name = XALLOCAVEC (char, length + 32);
41126 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
41128 sprintf (lazy_ptr_name, "L%d$lz", label);
41130 if (MACHOPIC_ATT_STUB)
41131 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
41132 else if (MACHOPIC_PURE)
41133 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
41134 else
41135 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
41137 fprintf (file, "%s:\n", stub);
41138 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41140 if (MACHOPIC_ATT_STUB)
41142 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
41144 else if (MACHOPIC_PURE)
41146 /* PIC stub. */
41147 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41148 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
41149 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
41150 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
41151 label, lazy_ptr_name, label);
41152 fprintf (file, "\tjmp\t*%%ecx\n");
41154 else
41155 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
41157 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
41158 it needs no stub-binding-helper. */
41159 if (MACHOPIC_ATT_STUB)
41160 return;
41162 fprintf (file, "%s:\n", binder_name);
41164 if (MACHOPIC_PURE)
41166 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
41167 fprintf (file, "\tpushl\t%%ecx\n");
41169 else
41170 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
41172 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
41174 /* N.B. Keep the correspondence of these
41175 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
41176 old-pic/new-pic/non-pic stubs; altering this will break
41177 compatibility with existing dylibs. */
41178 if (MACHOPIC_PURE)
41180 /* 25-byte PIC stub using "CALL get_pc_thunk". */
41181 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
41183 else
41184 /* 16-byte -mdynamic-no-pic stub. */
41185 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
41187 fprintf (file, "%s:\n", lazy_ptr_name);
41188 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
41189 fprintf (file, ASM_LONG "%s\n", binder_name);
41191 #endif /* TARGET_MACHO */
41193 /* Order the registers for register allocator. */
41195 void
41196 x86_order_regs_for_local_alloc (void)
41198 int pos = 0;
41199 int i;
41201 /* First allocate the local general purpose registers. */
41202 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41203 if (GENERAL_REGNO_P (i) && call_used_regs[i])
41204 reg_alloc_order [pos++] = i;
41206 /* Global general purpose registers. */
41207 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41208 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
41209 reg_alloc_order [pos++] = i;
41211 /* x87 registers come first in case we are doing FP math
41212 using them. */
41213 if (!TARGET_SSE_MATH)
41214 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41215 reg_alloc_order [pos++] = i;
41217 /* SSE registers. */
41218 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
41219 reg_alloc_order [pos++] = i;
41220 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
41221 reg_alloc_order [pos++] = i;
41223 /* Extended REX SSE registers. */
41224 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
41225 reg_alloc_order [pos++] = i;
41227 /* Mask register. */
41228 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
41229 reg_alloc_order [pos++] = i;
41231 /* x87 registers. */
41232 if (TARGET_SSE_MATH)
41233 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
41234 reg_alloc_order [pos++] = i;
41236 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
41237 reg_alloc_order [pos++] = i;
41239 /* Initialize the rest of the array, as we do not allocate some
41240 registers at all. */
41241 while (pos < FIRST_PSEUDO_REGISTER)
41242 reg_alloc_order [pos++] = 0;
41245 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
41246 in struct attribute_spec handler. */
41247 static tree
41248 ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
41249 bool *no_add_attrs)
41251 if (TREE_CODE (*node) != FUNCTION_TYPE
41252 && TREE_CODE (*node) != METHOD_TYPE
41253 && TREE_CODE (*node) != FIELD_DECL
41254 && TREE_CODE (*node) != TYPE_DECL)
41256 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41257 name);
41258 *no_add_attrs = true;
41259 return NULL_TREE;
41261 if (TARGET_64BIT)
41263 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
41264 name);
41265 *no_add_attrs = true;
41266 return NULL_TREE;
41268 if (is_attribute_p ("callee_pop_aggregate_return", name))
41270 tree cst;
41272 cst = TREE_VALUE (args);
41273 if (TREE_CODE (cst) != INTEGER_CST)
41275 warning (OPT_Wattributes,
41276 "%qE attribute requires an integer constant argument",
41277 name);
41278 *no_add_attrs = true;
41280 else if (compare_tree_int (cst, 0) != 0
41281 && compare_tree_int (cst, 1) != 0)
41283 warning (OPT_Wattributes,
41284 "argument to %qE attribute is neither zero, nor one",
41285 name);
41286 *no_add_attrs = true;
41289 return NULL_TREE;
41292 return NULL_TREE;
41295 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
41296 struct attribute_spec.handler. */
41297 static tree
41298 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
41299 bool *no_add_attrs)
41301 if (TREE_CODE (*node) != FUNCTION_TYPE
41302 && TREE_CODE (*node) != METHOD_TYPE
41303 && TREE_CODE (*node) != FIELD_DECL
41304 && TREE_CODE (*node) != TYPE_DECL)
41306 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41307 name);
41308 *no_add_attrs = true;
41309 return NULL_TREE;
41312 /* Can combine regparm with all attributes but fastcall. */
41313 if (is_attribute_p ("ms_abi", name))
41315 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
41317 error ("ms_abi and sysv_abi attributes are not compatible");
41320 return NULL_TREE;
41322 else if (is_attribute_p ("sysv_abi", name))
41324 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
41326 error ("ms_abi and sysv_abi attributes are not compatible");
41329 return NULL_TREE;
41332 return NULL_TREE;
41335 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
41336 struct attribute_spec.handler. */
41337 static tree
41338 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
41339 bool *no_add_attrs)
41341 tree *type = NULL;
41342 if (DECL_P (*node))
41344 if (TREE_CODE (*node) == TYPE_DECL)
41345 type = &TREE_TYPE (*node);
41347 else
41348 type = node;
41350 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
41352 warning (OPT_Wattributes, "%qE attribute ignored",
41353 name);
41354 *no_add_attrs = true;
41357 else if ((is_attribute_p ("ms_struct", name)
41358 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
41359 || ((is_attribute_p ("gcc_struct", name)
41360 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
41362 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
41363 name);
41364 *no_add_attrs = true;
41367 return NULL_TREE;
41370 static tree
41371 ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int,
41372 bool *no_add_attrs)
41374 if (TREE_CODE (*node) != FUNCTION_DECL)
41376 warning (OPT_Wattributes, "%qE attribute only applies to functions",
41377 name);
41378 *no_add_attrs = true;
41381 if (is_attribute_p ("indirect_branch", name))
41383 tree cst = TREE_VALUE (args);
41384 if (TREE_CODE (cst) != STRING_CST)
41386 warning (OPT_Wattributes,
41387 "%qE attribute requires a string constant argument",
41388 name);
41389 *no_add_attrs = true;
41391 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41392 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41393 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41394 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41396 warning (OPT_Wattributes,
41397 "argument to %qE attribute is not "
41398 "(keep|thunk|thunk-inline|thunk-extern)", name);
41399 *no_add_attrs = true;
41403 if (is_attribute_p ("function_return", name))
41405 tree cst = TREE_VALUE (args);
41406 if (TREE_CODE (cst) != STRING_CST)
41408 warning (OPT_Wattributes,
41409 "%qE attribute requires a string constant argument",
41410 name);
41411 *no_add_attrs = true;
41413 else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
41414 && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
41415 && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
41416 && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
41418 warning (OPT_Wattributes,
41419 "argument to %qE attribute is not "
41420 "(keep|thunk|thunk-inline|thunk-extern)", name);
41421 *no_add_attrs = true;
41425 return NULL_TREE;
41428 static tree
41429 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
41430 int, bool *)
41432 return NULL_TREE;
41435 static tree
41436 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
41438 /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
41439 but the function type contains args and return type data. */
41440 tree func_type = *node;
41441 tree return_type = TREE_TYPE (func_type);
41443 int nargs = 0;
41444 tree current_arg_type = TYPE_ARG_TYPES (func_type);
41445 while (current_arg_type
41446 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
41448 if (nargs == 0)
41450 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
41451 error ("interrupt service routine should have a pointer "
41452 "as the first argument");
41454 else if (nargs == 1)
41456 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
41457 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
41458 error ("interrupt service routine should have unsigned %s"
41459 "int as the second argument",
41460 TARGET_64BIT
41461 ? (TARGET_X32 ? "long long " : "long ")
41462 : "");
41464 nargs++;
41465 current_arg_type = TREE_CHAIN (current_arg_type);
41467 if (!nargs || nargs > 2)
41468 error ("interrupt service routine can only have a pointer argument "
41469 "and an optional integer argument");
41470 if (! VOID_TYPE_P (return_type))
41471 error ("interrupt service routine can't have non-void return value");
41473 return NULL_TREE;
41476 static bool
41477 ix86_ms_bitfield_layout_p (const_tree record_type)
41479 return ((TARGET_MS_BITFIELD_LAYOUT
41480 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
41481 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
41484 /* Returns an expression indicating where the this parameter is
41485 located on entry to the FUNCTION. */
41487 static rtx
41488 x86_this_parameter (tree function)
41490 tree type = TREE_TYPE (function);
41491 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
41492 int nregs;
41494 if (TARGET_64BIT)
41496 const int *parm_regs;
41498 if (ix86_function_type_abi (type) == MS_ABI)
41499 parm_regs = x86_64_ms_abi_int_parameter_registers;
41500 else
41501 parm_regs = x86_64_int_parameter_registers;
41502 return gen_rtx_REG (Pmode, parm_regs[aggr]);
41505 nregs = ix86_function_regparm (type, function);
41507 if (nregs > 0 && !stdarg_p (type))
41509 int regno;
41510 unsigned int ccvt = ix86_get_callcvt (type);
41512 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41513 regno = aggr ? DX_REG : CX_REG;
41514 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41516 regno = CX_REG;
41517 if (aggr)
41518 return gen_rtx_MEM (SImode,
41519 plus_constant (Pmode, stack_pointer_rtx, 4));
41521 else
41523 regno = AX_REG;
41524 if (aggr)
41526 regno = DX_REG;
41527 if (nregs == 1)
41528 return gen_rtx_MEM (SImode,
41529 plus_constant (Pmode,
41530 stack_pointer_rtx, 4));
41533 return gen_rtx_REG (SImode, regno);
41536 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
41537 aggr ? 8 : 4));
41540 /* Determine whether x86_output_mi_thunk can succeed. */
41542 static bool
41543 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
41544 const_tree function)
41546 /* 64-bit can handle anything. */
41547 if (TARGET_64BIT)
41548 return true;
41550 /* For 32-bit, everything's fine if we have one free register. */
41551 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
41552 return true;
41554 /* Need a free register for vcall_offset. */
41555 if (vcall_offset)
41556 return false;
41558 /* Need a free register for GOT references. */
41559 if (flag_pic && !targetm.binds_local_p (function))
41560 return false;
41562 /* Otherwise ok. */
41563 return true;
41566 /* Output the assembler code for a thunk function. THUNK_DECL is the
41567 declaration for the thunk function itself, FUNCTION is the decl for
41568 the target function. DELTA is an immediate constant offset to be
41569 added to THIS. If VCALL_OFFSET is nonzero, the word at
41570 *(*this + vcall_offset) should be added to THIS. */
41572 static void
41573 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
41574 HOST_WIDE_INT vcall_offset, tree function)
41576 rtx this_param = x86_this_parameter (function);
41577 rtx this_reg, tmp, fnaddr;
41578 unsigned int tmp_regno;
41579 rtx_insn *insn;
41581 if (TARGET_64BIT)
41582 tmp_regno = R10_REG;
41583 else
41585 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
41586 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
41587 tmp_regno = AX_REG;
41588 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
41589 tmp_regno = DX_REG;
41590 else
41591 tmp_regno = CX_REG;
41594 emit_note (NOTE_INSN_PROLOGUE_END);
41596 /* If CET is enabled, insert an ENDBR instruction. */
41597 if ((flag_cf_protection & CF_BRANCH))
41598 emit_insn (gen_nop_endbr ());
41600 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
41601 pull it in now and let DELTA benefit. */
41602 if (REG_P (this_param))
41603 this_reg = this_param;
41604 else if (vcall_offset)
41606 /* Put the this parameter into %eax. */
41607 this_reg = gen_rtx_REG (Pmode, AX_REG);
41608 emit_move_insn (this_reg, this_param);
41610 else
41611 this_reg = NULL_RTX;
41613 /* Adjust the this parameter by a fixed constant. */
41614 if (delta)
41616 rtx delta_rtx = GEN_INT (delta);
41617 rtx delta_dst = this_reg ? this_reg : this_param;
41619 if (TARGET_64BIT)
41621 if (!x86_64_general_operand (delta_rtx, Pmode))
41623 tmp = gen_rtx_REG (Pmode, tmp_regno);
41624 emit_move_insn (tmp, delta_rtx);
41625 delta_rtx = tmp;
41629 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
41632 /* Adjust the this parameter by a value stored in the vtable. */
41633 if (vcall_offset)
41635 rtx vcall_addr, vcall_mem, this_mem;
41637 tmp = gen_rtx_REG (Pmode, tmp_regno);
41639 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
41640 if (Pmode != ptr_mode)
41641 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
41642 emit_move_insn (tmp, this_mem);
41644 /* Adjust the this parameter. */
41645 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
41646 if (TARGET_64BIT
41647 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
41649 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
41650 emit_move_insn (tmp2, GEN_INT (vcall_offset));
41651 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
41654 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
41655 if (Pmode != ptr_mode)
41656 emit_insn (gen_addsi_1_zext (this_reg,
41657 gen_rtx_REG (ptr_mode,
41658 REGNO (this_reg)),
41659 vcall_mem));
41660 else
41661 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
41664 /* If necessary, drop THIS back to its stack slot. */
41665 if (this_reg && this_reg != this_param)
41666 emit_move_insn (this_param, this_reg);
41668 fnaddr = XEXP (DECL_RTL (function), 0);
41669 if (TARGET_64BIT)
41671 if (!flag_pic || targetm.binds_local_p (function)
41672 || TARGET_PECOFF)
41674 else
41676 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
41677 tmp = gen_rtx_CONST (Pmode, tmp);
41678 fnaddr = gen_const_mem (Pmode, tmp);
41681 else
41683 if (!flag_pic || targetm.binds_local_p (function))
41685 #if TARGET_MACHO
41686 else if (TARGET_MACHO)
41688 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
41689 fnaddr = XEXP (fnaddr, 0);
41691 #endif /* TARGET_MACHO */
41692 else
41694 tmp = gen_rtx_REG (Pmode, CX_REG);
41695 output_set_got (tmp, NULL_RTX);
41697 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
41698 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
41699 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
41700 fnaddr = gen_const_mem (Pmode, fnaddr);
41704 /* Our sibling call patterns do not allow memories, because we have no
41705 predicate that can distinguish between frame and non-frame memory.
41706 For our purposes here, we can get away with (ab)using a jump pattern,
41707 because we're going to do no optimization. */
41708 if (MEM_P (fnaddr))
41710 if (sibcall_insn_operand (fnaddr, word_mode))
41712 fnaddr = XEXP (DECL_RTL (function), 0);
41713 tmp = gen_rtx_MEM (QImode, fnaddr);
41714 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41715 tmp = emit_call_insn (tmp);
41716 SIBLING_CALL_P (tmp) = 1;
41718 else
41719 emit_jump_insn (gen_indirect_jump (fnaddr));
41721 else
41723 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
41725 // CM_LARGE_PIC always uses a pseudo PIC register, which is
41726 // uninitialized.  Since FUNCTION is local and calling it
41727 // doesn't go through the PLT, we use the scratch register %r11 as
41728 // the PIC register and initialize it here.
41729 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
41730 ix86_init_large_pic_reg (tmp_regno);
41731 fnaddr = legitimize_pic_address (fnaddr,
41732 gen_rtx_REG (Pmode, tmp_regno));
41735 if (!sibcall_insn_operand (fnaddr, word_mode))
41737 tmp = gen_rtx_REG (word_mode, tmp_regno);
41738 if (GET_MODE (fnaddr) != word_mode)
41739 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
41740 emit_move_insn (tmp, fnaddr);
41741 fnaddr = tmp;
41744 tmp = gen_rtx_MEM (QImode, fnaddr);
41745 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
41746 tmp = emit_call_insn (tmp);
41747 SIBLING_CALL_P (tmp) = 1;
41749 emit_barrier ();
41751 /* Emit just enough of rest_of_compilation to get the insns emitted.
41752 Note that use_thunk calls assemble_start_function et al. */
41753 insn = get_insns ();
41754 shorten_branches (insn);
41755 final_start_function (insn, file, 1);
41756 final (insn, file, 1);
41757 final_end_function ();
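/* Emit target-specific output at the start of the assembly file (the i386
   TARGET_ASM_FILE_START hook): the .code16gcc marker for -m16, the Darwin
   file prologue, and the Intel-syntax directive for -masm=intel.  */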
41760 static void
41761 x86_file_start (void)
41763 default_file_start ();
41764 if (TARGET_16BIT)
41765 fputs ("\t.code16gcc\n", asm_out_file);
41766 #if TARGET_MACHO
41767 darwin_file_start ();
41768 #endif
41769 if (X86_FILE_START_VERSION_DIRECTIVE)
41770 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
41771 if (X86_FILE_START_FLTUSED)
41772 fputs ("\t.global\t__fltused\n", asm_out_file);
41773 if (ix86_asm_dialect == ASM_INTEL)
41774 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
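/* Compute the alignment for a structure field of TYPE whose natural
   alignment is COMPUTED bits.  On 32-bit targets without -malign-double,
   DFmode/DCmode and integer-class fields are capped at 32-bit alignment;
   IAMCU targets use iamcu_alignment instead.  */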
41778 x86_field_alignment (tree type, int computed)
41780 machine_mode mode;
41782 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
41783 return computed;
41784 if (TARGET_IAMCU)
41785 return iamcu_alignment (type, computed);
41786 mode = TYPE_MODE (strip_array_types (type));
41787 if (mode == DFmode || mode == DCmode
41788 || GET_MODE_CLASS (mode) == MODE_INT
41789 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
41790 return MIN (32, computed);
41791 return computed;
41794 /* Print call to TARGET to FILE. */
41796 static void
41797 x86_print_call_or_nop (FILE *file, const char *target)
41799 if (flag_nop_mcount)
41800 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
41801 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
41802 else
41803 fprintf (file, "1:\tcall\t%s\n", target);
41806 /* Output assembler code to FILE to increment profiler label # LABELNO
41807 for profiling a function entry. */
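/* For instance, with TARGET_64BIT and -fpic this boils down to

	1:	call	*mcount@GOTPCREL(%rip)

   (ignoring the optional profile-counter load), where "mcount" stands for
   the configured MCOUNT_NAME or, with -mfentry, MCOUNT_NAME_BEFORE_PROLOGUE,
   and -mrecord-mcount additionally records label 1 in __mcount_loc.  */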
41808 void
41809 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
41811 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
41812 : MCOUNT_NAME);
41813 if (TARGET_64BIT)
41815 #ifndef NO_PROFILE_COUNTERS
41816 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
41817 #endif
41819 if (!TARGET_PECOFF && flag_pic)
41820 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
41821 else
41822 x86_print_call_or_nop (file, mcount_name);
41824 else if (flag_pic)
41826 #ifndef NO_PROFILE_COUNTERS
41827 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
41828 LPREFIX, labelno);
41829 #endif
41830 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
41832 else
41834 #ifndef NO_PROFILE_COUNTERS
41835 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
41836 LPREFIX, labelno);
41837 #endif
41838 x86_print_call_or_nop (file, mcount_name);
41841 if (flag_record_mcount)
41843 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
41844 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
41845 fprintf (file, "\t.previous\n");
41849 /* We don't have exact information about the insn sizes, but we may assume
41850 quite safely that we are informed about all 1 byte insns and memory
41851 address sizes. This is enough to eliminate unnecessary padding in
41852 99% of cases. */
41855 ix86_min_insn_size (rtx_insn *insn)
41857 int l = 0, len;
41859 if (!INSN_P (insn) || !active_insn_p (insn))
41860 return 0;
41862 /* Discard alignments we've emitted and jump instructions. */
41863 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
41864 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
41865 return 0;
41867 /* Important case - calls are always 5 bytes.
41868 It is common to have many calls in a row. */
41869 if (CALL_P (insn)
41870 && symbolic_reference_mentioned_p (PATTERN (insn))
41871 && !SIBLING_CALL_P (insn))
41872 return 5;
41873 len = get_attr_length (insn);
41874 if (len <= 1)
41875 return 1;
41877 /* For normal instructions we rely on get_attr_length being exact,
41878 with a few exceptions. */
41879 if (!JUMP_P (insn))
41881 enum attr_type type = get_attr_type (insn);
41883 switch (type)
41885 case TYPE_MULTI:
41886 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
41887 || asm_noperands (PATTERN (insn)) >= 0)
41888 return 0;
41889 break;
41890 case TYPE_OTHER:
41891 case TYPE_FCMP:
41892 break;
41893 default:
41894 /* Otherwise trust get_attr_length. */
41895 return len;
41898 l = get_attr_length_address (insn);
41899 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
41900 l = 4;
41902 if (l)
41903 return 1+l;
41904 else
41905 return 2;
41908 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41910 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
41911 16-byte window. */
41913 static void
41914 ix86_avoid_jump_mispredicts (void)
41916 rtx_insn *insn, *start = get_insns ();
41917 int nbytes = 0, njumps = 0;
41918 bool isjump = false;
41920 /* Look for all minimal intervals of instructions containing 4 jumps.
41921 The intervals are bounded by START and INSN. NBYTES is the total
41922 size of the instructions in the interval, including INSN but not
41923 START. When NBYTES is smaller than 16, it is possible that the
41924 end of START and INSN end up in the same 16-byte page.
41926 The smallest offset in the page at which INSN can start corresponds to
41927 START ending at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
41928 We add a p2align to the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
41930 Don't consider an asm goto as a jump: while it can contain a jump, it doesn't
41931 have to, control transfer to its label(s) can be performed through other
41932 means, and we also estimate the minimum length of all asm stmts as 0. */
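/* A worked example of the formula above: if the interval holding the last
   four jumps has NBYTES == 14 and the final jump INSN is 2 bytes long, the
   loop below emits a pad of 15 - 14 + 2 = 3 bytes in front of INSN.  */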
41933 for (insn = start; insn; insn = NEXT_INSN (insn))
41935 int min_size;
41937 if (LABEL_P (insn))
41939 align_flags alignment = label_to_alignment (insn);
41940 int align = alignment.levels[0].log;
41941 int max_skip = alignment.levels[0].maxskip;
41943 if (max_skip > 15)
41944 max_skip = 15;
41945 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
41946 already in the current 16 byte page, because otherwise
41947 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
41948 bytes to reach 16 byte boundary. */
41949 if (align <= 0
41950 || (align <= 3 && max_skip != (1 << align) - 1))
41951 max_skip = 0;
41952 if (dump_file)
41953 fprintf (dump_file, "Label %i with max_skip %i\n",
41954 INSN_UID (insn), max_skip);
41955 if (max_skip)
41957 while (nbytes + max_skip >= 16)
41959 start = NEXT_INSN (start);
41960 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41961 || CALL_P (start))
41962 njumps--, isjump = true;
41963 else
41964 isjump = false;
41965 nbytes -= ix86_min_insn_size (start);
41968 continue;
41971 min_size = ix86_min_insn_size (insn);
41972 nbytes += min_size;
41973 if (dump_file)
41974 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
41975 INSN_UID (insn), min_size);
41976 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
41977 || CALL_P (insn))
41978 njumps++;
41979 else
41980 continue;
41982 while (njumps > 3)
41984 start = NEXT_INSN (start);
41985 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
41986 || CALL_P (start))
41987 njumps--, isjump = true;
41988 else
41989 isjump = false;
41990 nbytes -= ix86_min_insn_size (start);
41992 gcc_assert (njumps >= 0);
41993 if (dump_file)
41994 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
41995 INSN_UID (start), INSN_UID (insn), nbytes);
41997 if (njumps == 3 && isjump && nbytes < 16)
41999 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
42001 if (dump_file)
42002 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
42003 INSN_UID (insn), padsize);
42004 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
42008 #endif
42010 /* The AMD Athlon works faster
42011 when RET is not the destination of a conditional jump or directly preceded
42012 by another jump instruction. We avoid the penalty by emitting the return
42013 with a longer encoding in such cases. */
42014 static void
42015 ix86_pad_returns (void)
42017 edge e;
42018 edge_iterator ei;
42020 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42022 basic_block bb = e->src;
42023 rtx_insn *ret = BB_END (bb);
42024 rtx_insn *prev;
42025 bool replace = false;
42027 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
42028 || optimize_bb_for_size_p (bb))
42029 continue;
42030 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
42031 if (active_insn_p (prev) || LABEL_P (prev))
42032 break;
42033 if (prev && LABEL_P (prev))
42035 edge e;
42036 edge_iterator ei;
42038 FOR_EACH_EDGE (e, ei, bb->preds)
42039 if (EDGE_FREQUENCY (e) && e->src->index >= 0
42040 && !(e->flags & EDGE_FALLTHRU))
42042 replace = true;
42043 break;
42046 if (!replace)
42048 prev = prev_active_insn (ret);
42049 if (prev
42050 && ((JUMP_P (prev) && any_condjump_p (prev))
42051 || CALL_P (prev)))
42052 replace = true;
42053 /* Empty functions get a branch mispredict even when
42054 the jump destination is not visible to us. */
42055 if (!prev && !optimize_function_for_size_p (cfun))
42056 replace = true;
42058 if (replace)
42060 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
42061 delete_insn (ret);
42066 /* Count the minimum number of instructions in BB. Return 4 if the
42067 number of instructions >= 4. */
42069 static int
42070 ix86_count_insn_bb (basic_block bb)
42072 rtx_insn *insn;
42073 int insn_count = 0;
42075 /* Count number of instructions in this block. Return 4 if the number
42076 of instructions >= 4. */
42077 FOR_BB_INSNS (bb, insn)
42079 /* This only happens in exit blocks. */
42080 if (JUMP_P (insn)
42081 && ANY_RETURN_P (PATTERN (insn)))
42082 break;
42084 if (NONDEBUG_INSN_P (insn)
42085 && GET_CODE (PATTERN (insn)) != USE
42086 && GET_CODE (PATTERN (insn)) != CLOBBER)
42088 insn_count++;
42089 if (insn_count >= 4)
42090 return insn_count;
42094 return insn_count;
42098 /* Count the minimum number of instructions in code path in BB.
42099 Return 4 if the number of instructions >= 4. */
42101 static int
42102 ix86_count_insn (basic_block bb)
42104 edge e;
42105 edge_iterator ei;
42106 int min_prev_count;
42108 /* Only bother counting instructions along paths with no
42109 more than 2 basic blocks between entry and exit. Given
42110 that BB has an edge to exit, determine if a predecessor
42111 of BB has an edge from entry. If so, compute the number
42112 of instructions in the predecessor block. If there
42113 happen to be multiple such blocks, compute the minimum. */
42114 min_prev_count = 4;
42115 FOR_EACH_EDGE (e, ei, bb->preds)
42117 edge prev_e;
42118 edge_iterator prev_ei;
42120 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42122 min_prev_count = 0;
42123 break;
42125 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
42127 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
42129 int count = ix86_count_insn_bb (e->src);
42130 if (count < min_prev_count)
42131 min_prev_count = count;
42132 break;
42137 if (min_prev_count < 4)
42138 min_prev_count += ix86_count_insn_bb (bb);
42140 return min_prev_count;
42143 /* Pad short function to 4 instructions. */
42145 static void
42146 ix86_pad_short_function (void)
42148 edge e;
42149 edge_iterator ei;
42151 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42153 rtx_insn *ret = BB_END (e->src);
42154 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
42156 int insn_count = ix86_count_insn (e->src);
42158 /* Pad short function. */
42159 if (insn_count < 4)
42161 rtx_insn *insn = ret;
42163 /* Find epilogue. */
42164 while (insn
42165 && (!NOTE_P (insn)
42166 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
42167 insn = PREV_INSN (insn);
42169 if (!insn)
42170 insn = ret;
42172 /* Two NOPs count as one instruction. */
42173 insn_count = 2 * (4 - insn_count);
42174 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
42180 /* Fix up a Windows system unwinder issue. If an EH region falls through into
42181 the epilogue, the Windows system unwinder will apply epilogue logic and
42182 produce incorrect offsets. This can be avoided by adding a nop between
42183 the last insn that can throw and the first insn of the epilogue. */
42185 static void
42186 ix86_seh_fixup_eh_fallthru (void)
42188 edge e;
42189 edge_iterator ei;
42191 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
42193 rtx_insn *insn, *next;
42195 /* Find the beginning of the epilogue. */
42196 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
42197 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
42198 break;
42199 if (insn == NULL)
42200 continue;
42202 /* We only care about preceding insns that can throw. */
42203 insn = prev_active_insn (insn);
42204 if (insn == NULL || !can_throw_internal (insn))
42205 continue;
42207 /* Do not separate calls from their debug information. */
42208 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
42209 if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION)
42210 insn = next;
42211 else
42212 break;
42214 emit_insn_after (gen_nops (const1_rtx), insn);
42218 /* Given a register number BASE, the lowest of a group of registers, update
42219 regsets IN and OUT with the registers that should be avoided in input
42220 and output operands respectively when trying to avoid generating a modr/m
42221 byte for -mmitigate-rop. */
42223 static void
42224 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
42226 SET_HARD_REG_BIT (out, base);
42227 SET_HARD_REG_BIT (out, base + 1);
42228 SET_HARD_REG_BIT (in, base + 2);
42229 SET_HARD_REG_BIT (in, base + 3);
42232 /* Called if -mmitigate-rop is in effect. Try to rewrite instructions so
42233 that certain encodings of modr/m bytes do not occur. */
42234 static void
42235 ix86_mitigate_rop (void)
42237 HARD_REG_SET input_risky;
42238 HARD_REG_SET output_risky;
42239 HARD_REG_SET inout_risky;
42241 CLEAR_HARD_REG_SET (output_risky);
42242 CLEAR_HARD_REG_SET (input_risky);
42243 SET_HARD_REG_BIT (output_risky, AX_REG);
42244 SET_HARD_REG_BIT (output_risky, CX_REG);
42245 SET_HARD_REG_BIT (input_risky, BX_REG);
42246 SET_HARD_REG_BIT (input_risky, DX_REG);
42247 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
42248 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
42249 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
42250 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
42251 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
42252 COPY_HARD_REG_SET (inout_risky, input_risky);
42253 IOR_HARD_REG_SET (inout_risky, output_risky);
42255 df_note_add_problem ();
42256 /* Fix up what stack-regs did. */
42257 df_insn_rescan_all ();
42258 df_analyze ();
42260 regrename_init (true);
42261 regrename_analyze (NULL);
42263 auto_vec<du_head_p> cands;
42265 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
42267 if (!NONDEBUG_INSN_P (insn))
42268 continue;
42270 if (GET_CODE (PATTERN (insn)) == USE
42271 || GET_CODE (PATTERN (insn)) == CLOBBER)
42272 continue;
42274 extract_insn (insn);
42276 int opno0, opno1;
42277 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42278 recog_data.n_operands, &opno0,
42279 &opno1);
42281 if (!ix86_rop_should_change_byte_p (modrm))
42282 continue;
42284 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
42286 /* This happens when regrename has to fail a block. */
42287 if (!info->op_info)
42288 continue;
42290 if (info->op_info[opno0].n_chains != 0)
42292 gcc_assert (info->op_info[opno0].n_chains == 1);
42293 du_head_p op0c;
42294 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
42295 if (op0c->target_data_1 + op0c->target_data_2 == 0
42296 && !op0c->cannot_rename)
42297 cands.safe_push (op0c);
42299 op0c->target_data_1++;
42301 if (info->op_info[opno1].n_chains != 0)
42303 gcc_assert (info->op_info[opno1].n_chains == 1);
42304 du_head_p op1c;
42305 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
42306 if (op1c->target_data_1 + op1c->target_data_2 == 0
42307 && !op1c->cannot_rename)
42308 cands.safe_push (op1c);
42310 op1c->target_data_2++;
42314 int i;
42315 du_head_p head;
42316 FOR_EACH_VEC_ELT (cands, i, head)
42318 int old_reg, best_reg;
42319 HARD_REG_SET unavailable;
42321 CLEAR_HARD_REG_SET (unavailable);
42322 if (head->target_data_1)
42323 IOR_HARD_REG_SET (unavailable, output_risky);
42324 if (head->target_data_2)
42325 IOR_HARD_REG_SET (unavailable, input_risky);
42327 int n_uses;
42328 reg_class superclass = regrename_find_superclass (head, &n_uses,
42329 &unavailable);
42330 old_reg = head->regno;
42331 best_reg = find_rename_reg (head, superclass, &unavailable,
42332 old_reg, false);
42333 bool ok = regrename_do_replace (head, best_reg);
42334 gcc_assert (ok);
42335 if (dump_file)
42336 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
42337 reg_names[best_reg], reg_class_names[superclass]);
42341 regrename_finish ();
42343 df_analyze ();
42345 basic_block bb;
42346 regset_head live;
42348 INIT_REG_SET (&live);
42350 FOR_EACH_BB_FN (bb, cfun)
42352 rtx_insn *insn;
42354 COPY_REG_SET (&live, DF_LR_OUT (bb));
42355 df_simulate_initialize_backwards (bb, &live);
42357 FOR_BB_INSNS_REVERSE (bb, insn)
42359 if (!NONDEBUG_INSN_P (insn))
42360 continue;
42362 df_simulate_one_insn_backwards (bb, insn, &live);
42364 if (GET_CODE (PATTERN (insn)) == USE
42365 || GET_CODE (PATTERN (insn)) == CLOBBER)
42366 continue;
42368 extract_insn (insn);
42369 constrain_operands_cached (insn, reload_completed);
42370 int opno0, opno1;
42371 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
42372 recog_data.n_operands, &opno0,
42373 &opno1);
42374 if (modrm < 0
42375 || !ix86_rop_should_change_byte_p (modrm)
42376 || opno0 == opno1)
42377 continue;
42379 rtx oldreg = recog_data.operand[opno1];
42380 preprocess_constraints (insn);
42381 const operand_alternative *alt = which_op_alt ();
42383 int i;
42384 for (i = 0; i < recog_data.n_operands; i++)
42385 if (i != opno1
42386 && alt[i].earlyclobber
42387 && reg_overlap_mentioned_p (recog_data.operand[i],
42388 oldreg))
42389 break;
42391 if (i < recog_data.n_operands)
42392 continue;
42394 if (dump_file)
42395 fprintf (dump_file,
42396 "attempting to fix modrm byte in insn %d:"
42397 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
42398 reg_class_names[alt[opno1].cl]);
42400 HARD_REG_SET unavailable;
42401 REG_SET_TO_HARD_REG_SET (unavailable, &live);
42402 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
42403 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
42404 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
42405 IOR_HARD_REG_SET (unavailable, output_risky);
42406 IOR_COMPL_HARD_REG_SET (unavailable,
42407 reg_class_contents[alt[opno1].cl]);
42409 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
42410 if (!TEST_HARD_REG_BIT (unavailable, i))
42411 break;
42412 if (i == FIRST_PSEUDO_REGISTER)
42414 if (dump_file)
42415 fprintf (dump_file, ", none available\n");
42416 continue;
42418 if (dump_file)
42419 fprintf (dump_file, " -> %d\n", i);
42420 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
42421 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
42422 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
42427 /* Implement machine specific optimizations. We implement padding of returns
42428 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
42429 static void
42430 ix86_reorg (void)
42432 /* We are freeing block_for_insn in the toplev to keep compatibility
42433 with old MDEP_REORGS that are not CFG based. Recompute it now. */
42434 compute_bb_for_insn ();
42436 if (flag_mitigate_rop)
42437 ix86_mitigate_rop ();
42439 if (TARGET_SEH && current_function_has_exception_handlers ())
42440 ix86_seh_fixup_eh_fallthru ();
42442 if (optimize && optimize_function_for_speed_p (cfun))
42444 if (TARGET_PAD_SHORT_FUNCTION)
42445 ix86_pad_short_function ();
42446 else if (TARGET_PAD_RETURNS)
42447 ix86_pad_returns ();
42448 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
42449 if (TARGET_FOUR_JUMP_LIMIT)
42450 ix86_avoid_jump_mispredicts ();
42451 #endif
42455 /* Return nonzero when a QImode register that must be represented via a REX
42456 prefix is used. */
42457 bool
42458 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
42460 int i;
42461 extract_insn_cached (insn);
42462 for (i = 0; i < recog_data.n_operands; i++)
42463 if (GENERAL_REG_P (recog_data.operand[i])
42464 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
42465 return true;
42466 return false;
42469 /* Return true when INSN mentions a register that must be encoded using a
42470 REX prefix. */
42471 bool
42472 x86_extended_reg_mentioned_p (rtx insn)
42474 subrtx_iterator::array_type array;
42475 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
42477 const_rtx x = *iter;
42478 if (REG_P (x)
42479 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
42480 return true;
42482 return false;
42485 /* If profitable, negate (without causing overflow) integer constant
42486 of mode MODE at location LOC. Return true in this case. */
42487 bool
42488 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
42490 HOST_WIDE_INT val;
42492 if (!CONST_INT_P (*loc))
42493 return false;
42495 switch (mode)
42497 case E_DImode:
42498 /* DImode x86_64 constants must fit in 32 bits. */
42499 gcc_assert (x86_64_immediate_operand (*loc, mode));
42501 mode = SImode;
42502 break;
42504 case E_SImode:
42505 case E_HImode:
42506 case E_QImode:
42507 break;
42509 default:
42510 gcc_unreachable ();
42513 /* Avoid overflows. */
42514 if (mode_signbit_p (mode, *loc))
42515 return false;
42517 val = INTVAL (*loc);
42519 /* Make things pretty by emitting `subl $4,%eax' rather than `addl $-4,%eax'.
42520 Exception: -128 encodes smaller than 128, so swap sign and op. */
42521 if ((val < 0 && val != -128)
42522 || val == 128)
42524 *loc = GEN_INT (-val);
42525 return true;
42528 return false;
42531 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
42532 optabs would emit if we didn't have TFmode patterns. */
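/* This is the usual unsigned-to-float idiom: when the input is non-negative
   as a signed value it is converted directly; otherwise it is halved with
   its low bit kept sticky, i0 = (in >> 1) | (in & 1), converted as a signed
   value, and the result is doubled (f0 + f0), so the final rounding is
   still correct.  */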
42534 void
42535 x86_emit_floatuns (rtx operands[2])
42537 rtx_code_label *neglab, *donelab;
42538 rtx i0, i1, f0, in, out;
42539 machine_mode mode, inmode;
42541 inmode = GET_MODE (operands[1]);
42542 gcc_assert (inmode == SImode || inmode == DImode);
42544 out = operands[0];
42545 in = force_reg (inmode, operands[1]);
42546 mode = GET_MODE (out);
42547 neglab = gen_label_rtx ();
42548 donelab = gen_label_rtx ();
42549 f0 = gen_reg_rtx (mode);
42551 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
42553 expand_float (out, in, 0);
42555 emit_jump_insn (gen_jump (donelab));
42556 emit_barrier ();
42558 emit_label (neglab);
42560 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
42561 1, OPTAB_DIRECT);
42562 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
42563 1, OPTAB_DIRECT);
42564 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
42566 expand_float (f0, i0, 0);
42568 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
42570 emit_label (donelab);
42573 static bool canonicalize_perm (struct expand_vec_perm_d *d);
42574 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
42575 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
42576 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
42578 /* Get a vector mode of the same size as the original but with elements
42579 twice as wide. This is only guaranteed to apply to integral vectors. */
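/* E.g. V16QImode yields V8HImode: the vector keeps its byte size while the
   number of elements halves, as the asserts below verify.  */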
42581 static inline machine_mode
42582 get_mode_wider_vector (machine_mode o)
42584 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
42585 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
42586 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
42587 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
42588 return n;
42591 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
42592 fill target with val via vec_duplicate. */
42594 static bool
42595 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
42597 bool ok;
42598 rtx_insn *insn;
42599 rtx dup;
42601 /* First attempt to recognize VAL as-is. */
42602 dup = gen_vec_duplicate (mode, val);
42603 insn = emit_insn (gen_rtx_SET (target, dup));
42604 if (recog_memoized (insn) < 0)
42606 rtx_insn *seq;
42607 machine_mode innermode = GET_MODE_INNER (mode);
42608 rtx reg;
42610 /* If that fails, force VAL into a register. */
42612 start_sequence ();
42613 reg = force_reg (innermode, val);
42614 if (GET_MODE (reg) != innermode)
42615 reg = gen_lowpart (innermode, reg);
42616 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
42617 seq = get_insns ();
42618 end_sequence ();
42619 if (seq)
42620 emit_insn_before (seq, insn);
42622 ok = recog_memoized (insn) >= 0;
42623 gcc_assert (ok);
42625 return true;
42628 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42629 with all elements equal to VAR. Return true if successful. */
42631 static bool
42632 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
42633 rtx target, rtx val)
42635 bool ok;
42637 switch (mode)
42639 case E_V2SImode:
42640 case E_V2SFmode:
42641 if (!mmx_ok)
42642 return false;
42643 /* FALLTHRU */
42645 case E_V4DFmode:
42646 case E_V4DImode:
42647 case E_V8SFmode:
42648 case E_V8SImode:
42649 case E_V2DFmode:
42650 case E_V2DImode:
42651 case E_V4SFmode:
42652 case E_V4SImode:
42653 case E_V16SImode:
42654 case E_V8DImode:
42655 case E_V16SFmode:
42656 case E_V8DFmode:
42657 return ix86_vector_duplicate_value (mode, target, val);
42659 case E_V4HImode:
42660 if (!mmx_ok)
42661 return false;
42662 if (TARGET_SSE || TARGET_3DNOW_A)
42664 rtx x;
42666 val = gen_lowpart (SImode, val);
42667 x = gen_rtx_TRUNCATE (HImode, val);
42668 x = gen_rtx_VEC_DUPLICATE (mode, x);
42669 emit_insn (gen_rtx_SET (target, x));
42670 return true;
42672 goto widen;
42674 case E_V8QImode:
42675 if (!mmx_ok)
42676 return false;
42677 goto widen;
42679 case E_V8HImode:
42680 if (TARGET_AVX2)
42681 return ix86_vector_duplicate_value (mode, target, val);
42683 if (TARGET_SSE2)
42685 struct expand_vec_perm_d dperm;
42686 rtx tmp1, tmp2;
42688 permute:
42689 memset (&dperm, 0, sizeof (dperm));
42690 dperm.target = target;
42691 dperm.vmode = mode;
42692 dperm.nelt = GET_MODE_NUNITS (mode);
42693 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
42694 dperm.one_operand_p = true;
42696 /* Extend to SImode using a paradoxical SUBREG. */
42697 tmp1 = gen_reg_rtx (SImode);
42698 emit_move_insn (tmp1, gen_lowpart (SImode, val));
42700 /* Insert the SImode value as low element of a V4SImode vector. */
42701 tmp2 = gen_reg_rtx (V4SImode);
42702 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
42703 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
42705 ok = (expand_vec_perm_1 (&dperm)
42706 || expand_vec_perm_broadcast_1 (&dperm));
42707 gcc_assert (ok);
42708 return ok;
42710 goto widen;
42712 case E_V16QImode:
42713 if (TARGET_AVX2)
42714 return ix86_vector_duplicate_value (mode, target, val);
42716 if (TARGET_SSE2)
42717 goto permute;
42718 goto widen;
42720 widen:
42721 /* Replicate the value once into the next wider mode and recurse. */
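/* E.g. for V8QImode a byte value 0xab is first widened to the HImode value
   0xabab via (val << 8) | val, the duplicate is built in V4HImode, and the
   result is then reinterpreted in the original mode.  */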
42723 machine_mode smode, wsmode, wvmode;
42724 rtx x;
42726 smode = GET_MODE_INNER (mode);
42727 wvmode = get_mode_wider_vector (mode);
42728 wsmode = GET_MODE_INNER (wvmode);
42730 val = convert_modes (wsmode, smode, val, true);
42731 x = expand_simple_binop (wsmode, ASHIFT, val,
42732 GEN_INT (GET_MODE_BITSIZE (smode)),
42733 NULL_RTX, 1, OPTAB_LIB_WIDEN);
42734 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
42736 x = gen_reg_rtx (wvmode);
42737 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
42738 gcc_assert (ok);
42739 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
42740 return ok;
42743 case E_V16HImode:
42744 case E_V32QImode:
42745 if (TARGET_AVX2)
42746 return ix86_vector_duplicate_value (mode, target, val);
42747 else
42749 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
42750 rtx x = gen_reg_rtx (hvmode);
42752 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42753 gcc_assert (ok);
42755 x = gen_rtx_VEC_CONCAT (mode, x, x);
42756 emit_insn (gen_rtx_SET (target, x));
42758 return true;
42760 case E_V64QImode:
42761 case E_V32HImode:
42762 if (TARGET_AVX512BW)
42763 return ix86_vector_duplicate_value (mode, target, val);
42764 else
42766 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
42767 rtx x = gen_reg_rtx (hvmode);
42769 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
42770 gcc_assert (ok);
42772 x = gen_rtx_VEC_CONCAT (mode, x, x);
42773 emit_insn (gen_rtx_SET (target, x));
42775 return true;
42777 default:
42778 return false;
42782 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42783 whose ONE_VAR element is VAR, and other elements are zero. Return true
42784 if successful. */
42786 static bool
42787 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
42788 rtx target, rtx var, int one_var)
42790 machine_mode vsimode;
42791 rtx new_target;
42792 rtx x, tmp;
42793 bool use_vector_set = false;
42794 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
42796 switch (mode)
42798 case E_V2DImode:
42799 /* For SSE4.1, we normally use vector set. But if the second
42800 element is zero and inter-unit moves are OK, we use movq
42801 instead. */
42802 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
42803 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
42804 && one_var == 0));
42805 break;
42806 case E_V16QImode:
42807 case E_V4SImode:
42808 case E_V4SFmode:
42809 use_vector_set = TARGET_SSE4_1;
42810 break;
42811 case E_V8HImode:
42812 use_vector_set = TARGET_SSE2;
42813 break;
42814 case E_V4HImode:
42815 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
42816 break;
42817 case E_V32QImode:
42818 case E_V16HImode:
42819 use_vector_set = TARGET_AVX;
42820 break;
42821 case E_V8SImode:
42822 use_vector_set = TARGET_AVX;
42823 gen_vec_set_0 = gen_vec_setv8si_0;
42824 break;
42825 case E_V8SFmode:
42826 use_vector_set = TARGET_AVX;
42827 gen_vec_set_0 = gen_vec_setv8sf_0;
42828 break;
42829 case E_V4DFmode:
42830 use_vector_set = TARGET_AVX;
42831 gen_vec_set_0 = gen_vec_setv4df_0;
42832 break;
42833 case E_V4DImode:
42834 /* Use ix86_expand_vector_set in 64bit mode only. */
42835 use_vector_set = TARGET_AVX && TARGET_64BIT;
42836 gen_vec_set_0 = gen_vec_setv4di_0;
42837 break;
42838 case E_V16SImode:
42839 use_vector_set = TARGET_AVX512F && one_var == 0;
42840 gen_vec_set_0 = gen_vec_setv16si_0;
42841 break;
42842 case E_V16SFmode:
42843 use_vector_set = TARGET_AVX512F && one_var == 0;
42844 gen_vec_set_0 = gen_vec_setv16sf_0;
42845 break;
42846 case E_V8DFmode:
42847 use_vector_set = TARGET_AVX512F && one_var == 0;
42848 gen_vec_set_0 = gen_vec_setv8df_0;
42849 break;
42850 case E_V8DImode:
42851 /* Use ix86_expand_vector_set in 64bit mode only. */
42852 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
42853 gen_vec_set_0 = gen_vec_setv8di_0;
42854 break;
42855 default:
42856 break;
42859 if (use_vector_set)
42861 if (gen_vec_set_0 && one_var == 0)
42863 var = force_reg (GET_MODE_INNER (mode), var);
42864 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
42865 return true;
42867 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
42868 var = force_reg (GET_MODE_INNER (mode), var);
42869 ix86_expand_vector_set (mmx_ok, target, var, one_var);
42870 return true;
42873 switch (mode)
42875 case E_V2SFmode:
42876 case E_V2SImode:
42877 if (!mmx_ok)
42878 return false;
42879 /* FALLTHRU */
42881 case E_V2DFmode:
42882 case E_V2DImode:
42883 if (one_var != 0)
42884 return false;
42885 var = force_reg (GET_MODE_INNER (mode), var);
42886 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
42887 emit_insn (gen_rtx_SET (target, x));
42888 return true;
42890 case E_V4SFmode:
42891 case E_V4SImode:
42892 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
42893 new_target = gen_reg_rtx (mode);
42894 else
42895 new_target = target;
42896 var = force_reg (GET_MODE_INNER (mode), var);
42897 x = gen_rtx_VEC_DUPLICATE (mode, var);
42898 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
42899 emit_insn (gen_rtx_SET (new_target, x));
42900 if (one_var != 0)
42902 /* We need to shuffle the value to the correct position, so
42903 create a new pseudo to store the intermediate result. */
42905 /* With SSE2, we can use the integer shuffle insns. */
42906 if (mode != V4SFmode && TARGET_SSE2)
42908 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
42909 const1_rtx,
42910 GEN_INT (one_var == 1 ? 0 : 1),
42911 GEN_INT (one_var == 2 ? 0 : 1),
42912 GEN_INT (one_var == 3 ? 0 : 1)));
42913 if (target != new_target)
42914 emit_move_insn (target, new_target);
42915 return true;
42918 /* Otherwise convert the intermediate result to V4SFmode and
42919 use the SSE1 shuffle instructions. */
42920 if (mode != V4SFmode)
42922 tmp = gen_reg_rtx (V4SFmode);
42923 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
42925 else
42926 tmp = new_target;
42928 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
42929 const1_rtx,
42930 GEN_INT (one_var == 1 ? 0 : 1),
42931 GEN_INT (one_var == 2 ? 0+4 : 1+4),
42932 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
42934 if (mode != V4SFmode)
42935 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
42936 else if (tmp != target)
42937 emit_move_insn (target, tmp);
42939 else if (target != new_target)
42940 emit_move_insn (target, new_target);
42941 return true;
42943 case E_V8HImode:
42944 case E_V16QImode:
42945 vsimode = V4SImode;
42946 goto widen;
42947 case E_V4HImode:
42948 case E_V8QImode:
42949 if (!mmx_ok)
42950 return false;
42951 vsimode = V2SImode;
42952 goto widen;
42953 widen:
42954 if (one_var != 0)
42955 return false;
42957 /* Zero extend the variable element to SImode and recurse. */
42958 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
42960 x = gen_reg_rtx (vsimode);
42961 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
42962 var, one_var))
42963 gcc_unreachable ();
42965 emit_move_insn (target, gen_lowpart (mode, x));
42966 return true;
42968 default:
42969 return false;
42973 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
42974 consisting of the values in VALS. It is known that all elements
42975 except ONE_VAR are constants. Return true if successful. */
42977 static bool
42978 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
42979 rtx target, rtx vals, int one_var)
42981 rtx var = XVECEXP (vals, 0, one_var);
42982 machine_mode wmode;
42983 rtx const_vec, x;
42985 const_vec = copy_rtx (vals);
42986 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
42987 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
42989 switch (mode)
42991 case E_V2DFmode:
42992 case E_V2DImode:
42993 case E_V2SFmode:
42994 case E_V2SImode:
42995 /* For the two element vectors, it's just as easy to use
42996 the general case. */
42997 return false;
42999 case E_V4DImode:
43000 /* Use ix86_expand_vector_set in 64bit mode only. */
43001 if (!TARGET_64BIT)
43002 return false;
43003 /* FALLTHRU */
43004 case E_V4DFmode:
43005 case E_V8SFmode:
43006 case E_V8SImode:
43007 case E_V16HImode:
43008 case E_V32QImode:
43009 case E_V4SFmode:
43010 case E_V4SImode:
43011 case E_V8HImode:
43012 case E_V4HImode:
43013 break;
43015 case E_V16QImode:
43016 if (TARGET_SSE4_1)
43017 break;
43018 wmode = V8HImode;
43019 goto widen;
43020 case E_V8QImode:
43021 wmode = V4HImode;
43022 goto widen;
43023 widen:
43024 /* There's no way to set one QImode entry easily. Combine
43025 the variable value with its adjacent constant value, and
43026 promote to an HImode set. */
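/* E.g. to set byte 5 of a V16QImode vector, the variable byte is shifted
   left by 8 and OR'ed with the constant byte 4, and the combined HImode
   value is inserted at halfword index 5 >> 1 == 2 of the V8HImode view.  */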
43027 x = XVECEXP (vals, 0, one_var ^ 1);
43028 if (one_var & 1)
43030 var = convert_modes (HImode, QImode, var, true);
43031 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
43032 NULL_RTX, 1, OPTAB_LIB_WIDEN);
43033 x = GEN_INT (INTVAL (x) & 0xff);
43035 else
43037 var = convert_modes (HImode, QImode, var, true);
43038 x = gen_int_mode (UINTVAL (x) << 8, HImode);
43040 if (x != const0_rtx)
43041 var = expand_simple_binop (HImode, IOR, var, x, var,
43042 1, OPTAB_LIB_WIDEN);
43044 x = gen_reg_rtx (wmode);
43045 emit_move_insn (x, gen_lowpart (wmode, const_vec));
43046 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
43048 emit_move_insn (target, gen_lowpart (mode, x));
43049 return true;
43051 default:
43052 return false;
43055 emit_move_insn (target, const_vec);
43056 ix86_expand_vector_set (mmx_ok, target, var, one_var);
43057 return true;
43060 /* A subroutine of ix86_expand_vector_init_general. Use vector
43061 concatenate to handle the most general case: all values variable,
43062 and none identical. */
43064 static void
43065 ix86_expand_vector_init_concat (machine_mode mode,
43066 rtx target, rtx *ops, int n)
43068 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
43069 rtx first[16], second[8], third[4];
43070 rtvec v;
43071 int i, j;
43073 switch (n)
43075 case 2:
43076 switch (mode)
43078 case E_V16SImode:
43079 cmode = V8SImode;
43080 break;
43081 case E_V16SFmode:
43082 cmode = V8SFmode;
43083 break;
43084 case E_V8DImode:
43085 cmode = V4DImode;
43086 break;
43087 case E_V8DFmode:
43088 cmode = V4DFmode;
43089 break;
43090 case E_V8SImode:
43091 cmode = V4SImode;
43092 break;
43093 case E_V8SFmode:
43094 cmode = V4SFmode;
43095 break;
43096 case E_V4DImode:
43097 cmode = V2DImode;
43098 break;
43099 case E_V4DFmode:
43100 cmode = V2DFmode;
43101 break;
43102 case E_V4SImode:
43103 cmode = V2SImode;
43104 break;
43105 case E_V4SFmode:
43106 cmode = V2SFmode;
43107 break;
43108 case E_V2DImode:
43109 cmode = DImode;
43110 break;
43111 case E_V2SImode:
43112 cmode = SImode;
43113 break;
43114 case E_V2DFmode:
43115 cmode = DFmode;
43116 break;
43117 case E_V2SFmode:
43118 cmode = SFmode;
43119 break;
43120 default:
43121 gcc_unreachable ();
43124 if (!register_operand (ops[1], cmode))
43125 ops[1] = force_reg (cmode, ops[1]);
43126 if (!register_operand (ops[0], cmode))
43127 ops[0] = force_reg (cmode, ops[0]);
43128 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
43129 ops[1])));
43130 break;
43132 case 4:
43133 switch (mode)
43135 case E_V4DImode:
43136 cmode = V2DImode;
43137 break;
43138 case E_V4DFmode:
43139 cmode = V2DFmode;
43140 break;
43141 case E_V4SImode:
43142 cmode = V2SImode;
43143 break;
43144 case E_V4SFmode:
43145 cmode = V2SFmode;
43146 break;
43147 default:
43148 gcc_unreachable ();
43150 goto half;
43152 case 8:
43153 switch (mode)
43155 case E_V8DImode:
43156 cmode = V2DImode;
43157 hmode = V4DImode;
43158 break;
43159 case E_V8DFmode:
43160 cmode = V2DFmode;
43161 hmode = V4DFmode;
43162 break;
43163 case E_V8SImode:
43164 cmode = V2SImode;
43165 hmode = V4SImode;
43166 break;
43167 case E_V8SFmode:
43168 cmode = V2SFmode;
43169 hmode = V4SFmode;
43170 break;
43171 default:
43172 gcc_unreachable ();
43174 goto half;
43176 case 16:
43177 switch (mode)
43179 case E_V16SImode:
43180 cmode = V2SImode;
43181 hmode = V4SImode;
43182 gmode = V8SImode;
43183 break;
43184 case E_V16SFmode:
43185 cmode = V2SFmode;
43186 hmode = V4SFmode;
43187 gmode = V8SFmode;
43188 break;
43189 default:
43190 gcc_unreachable ();
43192 goto half;
43194 half:
43195 /* FIXME: We process inputs backward to help RA. PR 36222. */
43196 i = n - 1;
43197 j = (n >> 1) - 1;
43198 for (; i > 0; i -= 2, j--)
43200 first[j] = gen_reg_rtx (cmode);
43201 v = gen_rtvec (2, ops[i - 1], ops[i]);
43202 ix86_expand_vector_init (false, first[j],
43203 gen_rtx_PARALLEL (cmode, v));
43206 n >>= 1;
43207 if (n > 4)
43209 gcc_assert (hmode != VOIDmode);
43210 gcc_assert (gmode != VOIDmode);
43211 for (i = j = 0; i < n; i += 2, j++)
43213 second[j] = gen_reg_rtx (hmode);
43214 ix86_expand_vector_init_concat (hmode, second [j],
43215 &first [i], 2);
43217 n >>= 1;
43218 for (i = j = 0; i < n; i += 2, j++)
43220 third[j] = gen_reg_rtx (gmode);
43221 ix86_expand_vector_init_concat (gmode, third[j],
43222 &second[i], 2);
43224 n >>= 1;
43225 ix86_expand_vector_init_concat (mode, target, third, n);
43227 else if (n > 2)
43229 gcc_assert (hmode != VOIDmode);
43230 for (i = j = 0; i < n; i += 2, j++)
43232 second[j] = gen_reg_rtx (hmode);
43233 ix86_expand_vector_init_concat (hmode, second [j],
43234 &first [i], 2);
43236 n >>= 1;
43237 ix86_expand_vector_init_concat (mode, target, second, n);
43239 else
43240 ix86_expand_vector_init_concat (mode, target, first, n);
43241 break;
43243 default:
43244 gcc_unreachable ();
43248 /* A subroutine of ix86_expand_vector_init_general. Use vector
43249 interleave to handle the most general case: all values variable,
43250 and none identical. */
43252 static void
43253 ix86_expand_vector_init_interleave (machine_mode mode,
43254 rtx target, rtx *ops, int n)
43256 machine_mode first_imode, second_imode, third_imode, inner_mode;
43257 int i, j;
43258 rtx op0, op1;
43259 rtx (*gen_load_even) (rtx, rtx, rtx);
43260 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
43261 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
43263 switch (mode)
43265 case E_V8HImode:
43266 gen_load_even = gen_vec_setv8hi;
43267 gen_interleave_first_low = gen_vec_interleave_lowv4si;
43268 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43269 inner_mode = HImode;
43270 first_imode = V4SImode;
43271 second_imode = V2DImode;
43272 third_imode = VOIDmode;
43273 break;
43274 case E_V16QImode:
43275 gen_load_even = gen_vec_setv16qi;
43276 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
43277 gen_interleave_second_low = gen_vec_interleave_lowv4si;
43278 inner_mode = QImode;
43279 first_imode = V8HImode;
43280 second_imode = V4SImode;
43281 third_imode = V2DImode;
43282 break;
43283 default:
43284 gcc_unreachable ();
43287 for (i = 0; i < n; i++)
43289 /* Extend the odd element to SImode using a paradoxical SUBREG. */
43290 op0 = gen_reg_rtx (SImode);
43291 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
43293 /* Insert the SImode value as low element of V4SImode vector. */
43294 op1 = gen_reg_rtx (V4SImode);
43295 op0 = gen_rtx_VEC_MERGE (V4SImode,
43296 gen_rtx_VEC_DUPLICATE (V4SImode,
43297 op0),
43298 CONST0_RTX (V4SImode),
43299 const1_rtx);
43300 emit_insn (gen_rtx_SET (op1, op0));
43302 /* Cast the V4SImode vector back to a vector in the original mode. */
43303 op0 = gen_reg_rtx (mode);
43304 emit_move_insn (op0, gen_lowpart (mode, op1));
43306 /* Load even elements into the second position. */
43307 emit_insn (gen_load_even (op0,
43308 force_reg (inner_mode,
43309 ops [i + i + 1]),
43310 const1_rtx));
43312 /* Cast vector to FIRST_IMODE vector. */
43313 ops[i] = gen_reg_rtx (first_imode);
43314 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
43317 /* Interleave low FIRST_IMODE vectors. */
43318 for (i = j = 0; i < n; i += 2, j++)
43320 op0 = gen_reg_rtx (first_imode);
43321 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
43323 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
43324 ops[j] = gen_reg_rtx (second_imode);
43325 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
43328 /* Interleave low SECOND_IMODE vectors. */
43329 switch (second_imode)
43331 case E_V4SImode:
43332 for (i = j = 0; i < n / 2; i += 2, j++)
43334 op0 = gen_reg_rtx (second_imode);
43335 emit_insn (gen_interleave_second_low (op0, ops[i],
43336 ops[i + 1]));
43338 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
43339 vector. */
43340 ops[j] = gen_reg_rtx (third_imode);
43341 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
43343 second_imode = V2DImode;
43344 gen_interleave_second_low = gen_vec_interleave_lowv2di;
43345 /* FALLTHRU */
43347 case E_V2DImode:
43348 op0 = gen_reg_rtx (second_imode);
43349 emit_insn (gen_interleave_second_low (op0, ops[0],
43350 ops[1]));
43352 /* Cast the SECOND_IMODE vector back to a vector in the original
43353 mode. */
43354 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
43355 break;
43357 default:
43358 gcc_unreachable ();
43362 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
43363 all values variable, and none identical. */
43365 static void
43366 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
43367 rtx target, rtx vals)
43369 rtx ops[64], op0, op1, op2, op3, op4, op5;
43370 machine_mode half_mode = VOIDmode;
43371 machine_mode quarter_mode = VOIDmode;
43372 int n, i;
43374 switch (mode)
43376 case E_V2SFmode:
43377 case E_V2SImode:
43378 if (!mmx_ok && !TARGET_SSE)
43379 break;
43380 /* FALLTHRU */
43382 case E_V16SImode:
43383 case E_V16SFmode:
43384 case E_V8DFmode:
43385 case E_V8DImode:
43386 case E_V8SFmode:
43387 case E_V8SImode:
43388 case E_V4DFmode:
43389 case E_V4DImode:
43390 case E_V4SFmode:
43391 case E_V4SImode:
43392 case E_V2DFmode:
43393 case E_V2DImode:
43394 n = GET_MODE_NUNITS (mode);
43395 for (i = 0; i < n; i++)
43396 ops[i] = XVECEXP (vals, 0, i);
43397 ix86_expand_vector_init_concat (mode, target, ops, n);
43398 return;
43400 case E_V2TImode:
43401 for (i = 0; i < 2; i++)
43402 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43403 op0 = gen_reg_rtx (V4DImode);
43404 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
43405 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43406 return;
43408 case E_V4TImode:
43409 for (i = 0; i < 4; i++)
43410 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
43411 ops[4] = gen_reg_rtx (V4DImode);
43412 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
43413 ops[5] = gen_reg_rtx (V4DImode);
43414 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
43415 op0 = gen_reg_rtx (V8DImode);
43416 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
43417 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
43418 return;
43420 case E_V32QImode:
43421 half_mode = V16QImode;
43422 goto half;
43424 case E_V16HImode:
43425 half_mode = V8HImode;
43426 goto half;
43428 half:
43429 n = GET_MODE_NUNITS (mode);
43430 for (i = 0; i < n; i++)
43431 ops[i] = XVECEXP (vals, 0, i);
43432 op0 = gen_reg_rtx (half_mode);
43433 op1 = gen_reg_rtx (half_mode);
43434 ix86_expand_vector_init_interleave (half_mode, op0, ops,
43435 n >> 2);
43436 ix86_expand_vector_init_interleave (half_mode, op1,
43437 &ops [n >> 1], n >> 2);
43438 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
43439 return;
43441 case E_V64QImode:
43442 quarter_mode = V16QImode;
43443 half_mode = V32QImode;
43444 goto quarter;
43446 case E_V32HImode:
43447 quarter_mode = V8HImode;
43448 half_mode = V16HImode;
43449 goto quarter;
43451 quarter:
43452 n = GET_MODE_NUNITS (mode);
43453 for (i = 0; i < n; i++)
43454 ops[i] = XVECEXP (vals, 0, i);
43455 op0 = gen_reg_rtx (quarter_mode);
43456 op1 = gen_reg_rtx (quarter_mode);
43457 op2 = gen_reg_rtx (quarter_mode);
43458 op3 = gen_reg_rtx (quarter_mode);
43459 op4 = gen_reg_rtx (half_mode);
43460 op5 = gen_reg_rtx (half_mode);
43461 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
43462 n >> 3);
43463 ix86_expand_vector_init_interleave (quarter_mode, op1,
43464 &ops [n >> 2], n >> 3);
43465 ix86_expand_vector_init_interleave (quarter_mode, op2,
43466 &ops [n >> 1], n >> 3);
43467 ix86_expand_vector_init_interleave (quarter_mode, op3,
43468 &ops [(n >> 1) | (n >> 2)], n >> 3);
43469 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
43470 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
43471 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
43472 return;
43474 case E_V16QImode:
43475 if (!TARGET_SSE4_1)
43476 break;
43477 /* FALLTHRU */
43479 case E_V8HImode:
43480 if (!TARGET_SSE2)
43481 break;
43483 /* Don't use ix86_expand_vector_init_interleave if we can't
43484 move from GPR to SSE register directly. */
43485 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
43486 break;
43488 n = GET_MODE_NUNITS (mode);
43489 for (i = 0; i < n; i++)
43490 ops[i] = XVECEXP (vals, 0, i);
43491 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
43492 return;
43494 case E_V4HImode:
43495 case E_V8QImode:
43496 break;
43498 default:
43499 gcc_unreachable ();
43503 int i, j, n_elts, n_words, n_elt_per_word;
43504 machine_mode inner_mode;
43505 rtx words[4], shift;
43507 inner_mode = GET_MODE_INNER (mode);
43508 n_elts = GET_MODE_NUNITS (mode);
43509 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
43510 n_elt_per_word = n_elts / n_words;
43511 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
43513 for (i = 0; i < n_words; ++i)
43515 rtx word = NULL_RTX;
43517 for (j = 0; j < n_elt_per_word; ++j)
43519 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
43520 elt = convert_modes (word_mode, inner_mode, elt, true);
43522 if (j == 0)
43523 word = elt;
43524 else
43526 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
43527 word, 1, OPTAB_LIB_WIDEN);
43528 word = expand_simple_binop (word_mode, IOR, word, elt,
43529 word, 1, OPTAB_LIB_WIDEN);
43533 words[i] = word;
43536 if (n_words == 1)
43537 emit_move_insn (target, gen_lowpart (mode, words[0]));
43538 else if (n_words == 2)
43540 rtx tmp = gen_reg_rtx (mode);
43541 emit_clobber (tmp);
43542 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
43543 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
43544 emit_move_insn (target, tmp);
43546 else if (n_words == 4)
43548 rtx tmp = gen_reg_rtx (V4SImode);
43549 gcc_assert (word_mode == SImode);
43550 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
43551 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
43552 emit_move_insn (target, gen_lowpart (mode, tmp));
43554 else
43555 gcc_unreachable ();
43559 /* Initialize vector TARGET via VALS. Suppress the use of MMX
43560 instructions unless MMX_OK is true. */
43562 void
43563 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
43565 machine_mode mode = GET_MODE (target);
43566 machine_mode inner_mode = GET_MODE_INNER (mode);
43567 int n_elts = GET_MODE_NUNITS (mode);
43568 int n_var = 0, one_var = -1;
43569 bool all_same = true, all_const_zero = true;
43570 int i;
43571 rtx x;
43573 /* First handle initialization from vector elts. */
43574 if (n_elts != XVECLEN (vals, 0))
43576 rtx subtarget = target;
43577 x = XVECEXP (vals, 0, 0);
43578 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
43579 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
43581 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
43582 if (inner_mode == QImode || inner_mode == HImode)
43584 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
43585 mode = mode_for_vector (SImode, n_bits / 4).require ();
43586 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
43587 ops[0] = gen_lowpart (inner_mode, ops[0]);
43588 ops[1] = gen_lowpart (inner_mode, ops[1]);
43589 subtarget = gen_reg_rtx (mode);
43591 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
43592 if (subtarget != target)
43593 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
43594 return;
43596 gcc_unreachable ();
43599 for (i = 0; i < n_elts; ++i)
43601 x = XVECEXP (vals, 0, i);
43602 if (!(CONST_SCALAR_INT_P (x)
43603 || CONST_DOUBLE_P (x)
43604 || CONST_FIXED_P (x)))
43605 n_var++, one_var = i;
43606 else if (x != CONST0_RTX (inner_mode))
43607 all_const_zero = false;
43608 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
43609 all_same = false;
43612 /* Constants are best loaded from the constant pool. */
43613 if (n_var == 0)
43615 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
43616 return;
43619 /* If all values are identical, broadcast the value. */
43620 if (all_same
43621 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
43622 XVECEXP (vals, 0, 0)))
43623 return;
43625 /* Values where only one field is non-constant are best loaded from
43626 the pool and overwritten via move later. */
43627 if (n_var == 1)
43629 if (all_const_zero
43630 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
43631 XVECEXP (vals, 0, one_var),
43632 one_var))
43633 return;
43635 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
43636 return;
43639 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
43642 void
43643 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
43645 machine_mode mode = GET_MODE (target);
43646 machine_mode inner_mode = GET_MODE_INNER (mode);
43647 machine_mode half_mode;
43648 bool use_vec_merge = false;
43649 rtx tmp;
43650 static rtx (*gen_extract[6][2]) (rtx, rtx)
43652 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
43653 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
43654 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
43655 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
43656 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
43657 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
43659 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
43661 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
43662 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
43663 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
43664 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
43665 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
43666 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
43668 int i, j, n;
43669 machine_mode mmode = VOIDmode;
43670 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
43672 switch (mode)
43674 case E_V2SFmode:
43675 case E_V2SImode:
43676 if (mmx_ok)
43678 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43679 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
43680 if (elt == 0)
43681 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43682 else
43683 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43684 emit_insn (gen_rtx_SET (target, tmp));
43685 return;
43687 break;
43689 case E_V2DImode:
43690 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
43691 if (use_vec_merge)
43692 break;
43694 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
43695 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
43696 if (elt == 0)
43697 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
43698 else
43699 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
43700 emit_insn (gen_rtx_SET (target, tmp));
43701 return;
43703 case E_V2DFmode:
43705 rtx op0, op1;
43707 /* For the two element vectors, we implement a VEC_CONCAT with
43708 the extraction of the other element. */
43710 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
43711 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
43713 if (elt == 0)
43714 op0 = val, op1 = tmp;
43715 else
43716 op0 = tmp, op1 = val;
43718 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
43719 emit_insn (gen_rtx_SET (target, tmp));
43721 return;
43723 case E_V4SFmode:
43724 use_vec_merge = TARGET_SSE4_1;
43725 if (use_vec_merge)
43726 break;
43728 switch (elt)
43730 case 0:
43731 use_vec_merge = true;
43732 break;
43734 case 1:
43735 /* tmp = target = A B C D */
43736 tmp = copy_to_reg (target);
43737 /* target = A A B B */
43738 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
43739 /* target = X A B B */
43740 ix86_expand_vector_set (false, target, val, 0);
43741 /* target = A X C D */
43742 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43743 const1_rtx, const0_rtx,
43744 GEN_INT (2+4), GEN_INT (3+4)));
43745 return;
43747 case 2:
43748 /* tmp = target = A B C D */
43749 tmp = copy_to_reg (target);
43750 /* tmp = X B C D */
43751 ix86_expand_vector_set (false, tmp, val, 0);
43752 /* target = A B X D */
43753 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43754 const0_rtx, const1_rtx,
43755 GEN_INT (0+4), GEN_INT (3+4)));
43756 return;
43758 case 3:
43759 /* tmp = target = A B C D */
43760 tmp = copy_to_reg (target);
43761 /* tmp = X B C D */
43762 ix86_expand_vector_set (false, tmp, val, 0);
43764 /* target = A B C X */
43764 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
43765 const0_rtx, const1_rtx,
43766 GEN_INT (2+4), GEN_INT (0+4)));
43767 return;
43769 default:
43770 gcc_unreachable ();
43772 break;
43774 case E_V4SImode:
43775 use_vec_merge = TARGET_SSE4_1;
43776 if (use_vec_merge)
43777 break;
43779 /* Element 0 handled by vec_merge below. */
43780 if (elt == 0)
43782 use_vec_merge = true;
43783 break;
43786 if (TARGET_SSE2)
43788 /* With SSE2, use integer shuffles to swap element 0 and ELT,
43789 store into element 0, then shuffle them back. */
43791 rtx order[4];
43793 order[0] = GEN_INT (elt);
43794 order[1] = const1_rtx;
43795 order[2] = const2_rtx;
43796 order[3] = GEN_INT (3);
43797 order[elt] = const0_rtx;
43799 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43800 order[1], order[2], order[3]));
43802 ix86_expand_vector_set (false, target, val, 0);
43804 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
43805 order[1], order[2], order[3]));
43807 else
43809 /* For SSE1, we have to reuse the V4SF code. */
43810 rtx t = gen_reg_rtx (V4SFmode);
43811 emit_move_insn (t, gen_lowpart (V4SFmode, target));
43812 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
43813 emit_move_insn (target, gen_lowpart (mode, t));
43815 return;
43817 case E_V8HImode:
43818 use_vec_merge = TARGET_SSE2;
43819 break;
43820 case E_V4HImode:
43821 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
43822 break;
43824 case E_V16QImode:
43825 use_vec_merge = TARGET_SSE4_1;
43826 break;
43828 case E_V8QImode:
43829 break;
43831 case E_V32QImode:
43832 half_mode = V16QImode;
43833 j = 0;
43834 n = 16;
43835 goto half;
43837 case E_V16HImode:
43838 half_mode = V8HImode;
43839 j = 1;
43840 n = 8;
43841 goto half;
43843 case E_V8SImode:
43844 half_mode = V4SImode;
43845 j = 2;
43846 n = 4;
43847 goto half;
43849 case E_V4DImode:
43850 half_mode = V2DImode;
43851 j = 3;
43852 n = 2;
43853 goto half;
43855 case E_V8SFmode:
43856 half_mode = V4SFmode;
43857 j = 4;
43858 n = 4;
43859 goto half;
43861 case E_V4DFmode:
43862 half_mode = V2DFmode;
43863 j = 5;
43864 n = 2;
43865 goto half;
43867 half:
43868 /* Compute offset. */
43869 i = elt / n;
43870 elt %= n;
43872 gcc_assert (i <= 1);
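/* For example, setting element 5 of a V8SImode vector gives i == 1 and
   elt == 1: the high V4SImode half is extracted, its element 1 is set,
   and the half is inserted back into the full vector.  */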
43874 /* Extract the half. */
43875 tmp = gen_reg_rtx (half_mode);
43876 emit_insn (gen_extract[j][i] (tmp, target));
43878 /* Put val in tmp at elt. */
43879 ix86_expand_vector_set (false, tmp, val, elt);
43881 /* Put it back. */
43882 emit_insn (gen_insert[j][i] (target, target, tmp));
43883 return;
43885 case E_V8DFmode:
43886 if (TARGET_AVX512F)
43888 mmode = QImode;
43889 gen_blendm = gen_avx512f_blendmv8df;
43891 break;
43893 case E_V8DImode:
43894 if (TARGET_AVX512F)
43896 mmode = QImode;
43897 gen_blendm = gen_avx512f_blendmv8di;
43899 break;
43901 case E_V16SFmode:
43902 if (TARGET_AVX512F)
43904 mmode = HImode;
43905 gen_blendm = gen_avx512f_blendmv16sf;
43907 break;
43909 case E_V16SImode:
43910 if (TARGET_AVX512F)
43912 mmode = HImode;
43913 gen_blendm = gen_avx512f_blendmv16si;
43915 break;
43917 case E_V32HImode:
43918 if (TARGET_AVX512BW)
43920 mmode = SImode;
43921 gen_blendm = gen_avx512bw_blendmv32hi;
43923 else if (TARGET_AVX512F)
43925 half_mode = E_V8HImode;
43926 n = 8;
43927 goto quarter;
43929 break;
43931 case E_V64QImode:
43932 if (TARGET_AVX512BW)
43934 mmode = DImode;
43935 gen_blendm = gen_avx512bw_blendmv64qi;
43937 else if (TARGET_AVX512F)
43939 half_mode = E_V16QImode;
43940 n = 16;
43941 goto quarter;
43943 break;
43945 quarter:
43946 /* Compute offset. */
43947 i = elt / n;
43948 elt %= n;
43950 gcc_assert (i <= 3);
43953 /* Extract the quarter. */
43954 tmp = gen_reg_rtx (V4SImode);
43955 rtx tmp2 = gen_lowpart (V16SImode, target);
43956 rtx mask = gen_reg_rtx (QImode);
43958 emit_move_insn (mask, constm1_rtx);
43959 emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
43960 tmp, mask));
43962 tmp2 = gen_reg_rtx (half_mode);
43963 emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
43964 tmp = tmp2;
43966 /* Put val in tmp at elt. */
43967 ix86_expand_vector_set (false, tmp, val, elt);
43969 /* Put it back. */
43970 tmp2 = gen_reg_rtx (V16SImode);
43971 rtx tmp3 = gen_lowpart (V16SImode, target);
43972 mask = gen_reg_rtx (HImode);
43973 emit_move_insn (mask, constm1_rtx);
43974 tmp = gen_lowpart (V4SImode, tmp);
43975 emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
43976 tmp3, mask));
43977 emit_move_insn (target, gen_lowpart (mode, tmp2));
43979 return;
43981 default:
43982 break;
43985 if (mmode != VOIDmode)
43987 tmp = gen_reg_rtx (mode);
43988 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
43989 /* The avx512*_blendm<mode> expanders have a different operand order
43990 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
43991 elements where the mask is set and the second input operand otherwise;
43992 in {sse,avx}*_*blend* the first input operand is used for elements
43993 where the mask is clear and the second input operand otherwise. */
43994 emit_insn (gen_blendm (target, target, tmp,
43995 force_reg (mmode,
43996 gen_int_mode (HOST_WIDE_INT_1U << elt,
43997 mmode))));
43999 else if (use_vec_merge)
44001 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
44002 tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
44003 GEN_INT (HOST_WIDE_INT_1U << elt));
44004 emit_insn (gen_rtx_SET (target, tmp));
44006 else
44008 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44010 emit_move_insn (mem, target);
44012 tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
44013 emit_move_insn (tmp, val);
44015 emit_move_insn (target, mem);
44019 void
44020 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
44022 machine_mode mode = GET_MODE (vec);
44023 machine_mode inner_mode = GET_MODE_INNER (mode);
44024 bool use_vec_extr = false;
44025 rtx tmp;
44027 switch (mode)
44029 case E_V2SImode:
44030 case E_V2SFmode:
44031 if (!mmx_ok)
44032 break;
44033 /* FALLTHRU */
44035 case E_V2DFmode:
44036 case E_V2DImode:
44037 case E_V2TImode:
44038 case E_V4TImode:
44039 use_vec_extr = true;
44040 break;
44042 case E_V4SFmode:
44043 use_vec_extr = TARGET_SSE4_1;
44044 if (use_vec_extr)
44045 break;
44047 switch (elt)
44049 case 0:
44050 tmp = vec;
44051 break;
44053 case 1:
44054 case 3:
44055 tmp = gen_reg_rtx (mode);
44056 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
44057 GEN_INT (elt), GEN_INT (elt),
44058 GEN_INT (elt+4), GEN_INT (elt+4)));
44059 break;
44061 case 2:
44062 tmp = gen_reg_rtx (mode);
44063 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
44064 break;
44066 default:
44067 gcc_unreachable ();
44069 vec = tmp;
44070 use_vec_extr = true;
44071 elt = 0;
44072 break;
44074 case E_V4SImode:
44075 use_vec_extr = TARGET_SSE4_1;
44076 if (use_vec_extr)
44077 break;
44079 if (TARGET_SSE2)
44081 switch (elt)
44083 case 0:
44084 tmp = vec;
44085 break;
44087 case 1:
44088 case 3:
44089 tmp = gen_reg_rtx (mode);
44090 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
44091 GEN_INT (elt), GEN_INT (elt),
44092 GEN_INT (elt), GEN_INT (elt)));
44093 break;
44095 case 2:
44096 tmp = gen_reg_rtx (mode);
44097 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
44098 break;
44100 default:
44101 gcc_unreachable ();
44103 vec = tmp;
44104 use_vec_extr = true;
44105 elt = 0;
44107 else
44109 /* For SSE1, we have to reuse the V4SF code. */
44110 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
44111 gen_lowpart (V4SFmode, vec), elt);
44112 return;
44114 break;
44116 case E_V8HImode:
44117 use_vec_extr = TARGET_SSE2;
44118 break;
44119 case E_V4HImode:
44120 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
44121 break;
44123 case E_V16QImode:
44124 use_vec_extr = TARGET_SSE4_1;
44125 break;
44127 case E_V8SFmode:
44128 if (TARGET_AVX)
44130 tmp = gen_reg_rtx (V4SFmode);
44131 if (elt < 4)
44132 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
44133 else
44134 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
44135 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44136 return;
44138 break;
44140 case E_V4DFmode:
44141 if (TARGET_AVX)
44143 tmp = gen_reg_rtx (V2DFmode);
44144 if (elt < 2)
44145 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
44146 else
44147 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
44148 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44149 return;
44151 break;
44153 case E_V32QImode:
44154 if (TARGET_AVX)
44156 tmp = gen_reg_rtx (V16QImode);
44157 if (elt < 16)
44158 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
44159 else
44160 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
44161 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44162 return;
44164 break;
44166 case E_V16HImode:
44167 if (TARGET_AVX)
44169 tmp = gen_reg_rtx (V8HImode);
44170 if (elt < 8)
44171 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
44172 else
44173 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
44174 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44175 return;
44177 break;
44179 case E_V8SImode:
44180 if (TARGET_AVX)
44182 tmp = gen_reg_rtx (V4SImode);
44183 if (elt < 4)
44184 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
44185 else
44186 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
44187 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44188 return;
44190 break;
44192 case E_V4DImode:
44193 if (TARGET_AVX)
44195 tmp = gen_reg_rtx (V2DImode);
44196 if (elt < 2)
44197 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
44198 else
44199 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
44200 ix86_expand_vector_extract (false, target, tmp, elt & 1);
44201 return;
44203 break;
44205 case E_V32HImode:
44206 if (TARGET_AVX512BW)
44208 tmp = gen_reg_rtx (V16HImode);
44209 if (elt < 16)
44210 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
44211 else
44212 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
44213 ix86_expand_vector_extract (false, target, tmp, elt & 15);
44214 return;
44216 break;
44218 case E_V64QImode:
44219 if (TARGET_AVX512BW)
44221 tmp = gen_reg_rtx (V32QImode);
44222 if (elt < 32)
44223 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
44224 else
44225 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
44226 ix86_expand_vector_extract (false, target, tmp, elt & 31);
44227 return;
44229 break;
44231 case E_V16SFmode:
44232 tmp = gen_reg_rtx (V8SFmode);
44233 if (elt < 8)
44234 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
44235 else
44236 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
44237 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44238 return;
44240 case E_V8DFmode:
44241 tmp = gen_reg_rtx (V4DFmode);
44242 if (elt < 4)
44243 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
44244 else
44245 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
44246 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44247 return;
44249 case E_V16SImode:
44250 tmp = gen_reg_rtx (V8SImode);
44251 if (elt < 8)
44252 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
44253 else
44254 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
44255 ix86_expand_vector_extract (false, target, tmp, elt & 7);
44256 return;
44258 case E_V8DImode:
44259 tmp = gen_reg_rtx (V4DImode);
44260 if (elt < 4)
44261 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
44262 else
44263 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
44264 ix86_expand_vector_extract (false, target, tmp, elt & 3);
44265 return;
44267 case E_V8QImode:
44268 /* ??? Could extract the appropriate HImode element and shift. */
44269 default:
44270 break;
44273 if (use_vec_extr)
44275 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
44276 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
44278 /* Let the rtl optimizers know about the zero extension performed. */
44279 if (inner_mode == QImode || inner_mode == HImode)
44281 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
44282 target = gen_lowpart (SImode, target);
44285 emit_insn (gen_rtx_SET (target, tmp));
44287 else
44289 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
44291 emit_move_insn (mem, vec);
44293 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
44294 emit_move_insn (target, tmp);
44298 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
44299 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
44300 The upper bits of DEST are undefined, though they shouldn't cause
44301 exceptions (some bits from src or all zeros are ok). */
44303 static void
44304 emit_reduc_half (rtx dest, rtx src, int i)
44306 rtx tem, d = dest;
44307 switch (GET_MODE (src))
44309 case E_V4SFmode:
44310 if (i == 128)
44311 tem = gen_sse_movhlps (dest, src, src);
44312 else
44313 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
44314 GEN_INT (1 + 4), GEN_INT (1 + 4));
44315 break;
44316 case E_V2DFmode:
44317 tem = gen_vec_interleave_highv2df (dest, src, src);
44318 break;
44319 case E_V16QImode:
44320 case E_V8HImode:
44321 case E_V4SImode:
44322 case E_V2DImode:
44323 d = gen_reg_rtx (V1TImode);
44324 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
44325 GEN_INT (i / 2));
44326 break;
44327 case E_V8SFmode:
44328 if (i == 256)
44329 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
44330 else
44331 tem = gen_avx_shufps256 (dest, src, src,
44332 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
44333 break;
44334 case E_V4DFmode:
44335 if (i == 256)
44336 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
44337 else
44338 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
44339 break;
44340 case E_V32QImode:
44341 case E_V16HImode:
44342 case E_V8SImode:
44343 case E_V4DImode:
44344 if (i == 256)
44346 if (GET_MODE (dest) != V4DImode)
44347 d = gen_reg_rtx (V4DImode);
44348 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
44349 gen_lowpart (V4DImode, src),
44350 const1_rtx);
44352 else
44354 d = gen_reg_rtx (V2TImode);
44355 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
44356 GEN_INT (i / 2));
44358 break;
44359 case E_V64QImode:
44360 case E_V32HImode:
44361 case E_V16SImode:
44362 case E_V16SFmode:
44363 case E_V8DImode:
44364 case E_V8DFmode:
44365 if (i > 128)
44366 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
44367 gen_lowpart (V16SImode, src),
44368 gen_lowpart (V16SImode, src),
44369 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
44370 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
44371 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
44372 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
44373 GEN_INT (0xC), GEN_INT (0xD),
44374 GEN_INT (0xE), GEN_INT (0xF),
44375 GEN_INT (0x10), GEN_INT (0x11),
44376 GEN_INT (0x12), GEN_INT (0x13),
44377 GEN_INT (0x14), GEN_INT (0x15),
44378 GEN_INT (0x16), GEN_INT (0x17));
44379 else
44380 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
44381 gen_lowpart (V16SImode, src),
44382 GEN_INT (i == 128 ? 0x2 : 0x1),
44383 GEN_INT (0x3),
44384 GEN_INT (0x3),
44385 GEN_INT (0x3),
44386 GEN_INT (i == 128 ? 0x6 : 0x5),
44387 GEN_INT (0x7),
44388 GEN_INT (0x7),
44389 GEN_INT (0x7),
44390 GEN_INT (i == 128 ? 0xA : 0x9),
44391 GEN_INT (0xB),
44392 GEN_INT (0xB),
44393 GEN_INT (0xB),
44394 GEN_INT (i == 128 ? 0xE : 0xD),
44395 GEN_INT (0xF),
44396 GEN_INT (0xF),
44397 GEN_INT (0xF));
44398 break;
44399 default:
44400 gcc_unreachable ();
44402 emit_insn (tem);
44403 if (d != dest)
44404 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
44407 /* Expand a vector reduction. FN is the binary pattern to reduce;
44408 DEST is the destination; IN is the input vector. */
44410 void
44411 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
44413 rtx half, dst, vec = in;
44414 machine_mode mode = GET_MODE (in);
44415 int i;
44417 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
44418 if (TARGET_SSE4_1
44419 && mode == V8HImode
44420 && fn == gen_uminv8hi3)
44422 emit_insn (gen_sse4_1_phminposuw (dest, in));
44423 return;
44426 for (i = GET_MODE_BITSIZE (mode);
44427 i > GET_MODE_UNIT_BITSIZE (mode);
44428 i >>= 1)
44430 half = gen_reg_rtx (mode);
44431 emit_reduc_half (half, vec, i);
44432 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
44433 dst = dest;
44434 else
44435 dst = gen_reg_rtx (mode);
44436 emit_insn (fn (dst, half, vec));
44437 vec = dst;
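/* For example, a V8SImode reduction runs the loop for i = 256, 128 and 64:
   each emit_reduc_half step moves the remaining upper half of the data
   into the low bits and FN combines it with the previous vector, so after
   three steps element 0 of DEST holds the reduction of all eight
   elements.  */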
44441 /* Target hook for scalar_mode_supported_p. */
44442 static bool
44443 ix86_scalar_mode_supported_p (scalar_mode mode)
44445 if (DECIMAL_FLOAT_MODE_P (mode))
44446 return default_decimal_float_supported_p ();
44447 else if (mode == TFmode)
44448 return true;
44449 else
44450 return default_scalar_mode_supported_p (mode);
44453 /* Implements target hook vector_mode_supported_p. */
44454 static bool
44455 ix86_vector_mode_supported_p (machine_mode mode)
44457 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
44458 return true;
44459 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
44460 return true;
44461 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
44462 return true;
44463 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
44464 return true;
44465 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
44466 return true;
44467 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
44468 return true;
44469 return false;
44472 /* Target hook for c_mode_for_suffix. */
44473 static machine_mode
44474 ix86_c_mode_for_suffix (char suffix)
44476 if (suffix == 'q')
44477 return TFmode;
44478 if (suffix == 'w')
44479 return XFmode;
44481 return VOIDmode;
44484 /* Worker function for TARGET_MD_ASM_ADJUST.
44486 We implement asm flag outputs, and maintain source compatibility
44487 with the old cc0-based compiler. */
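/* A flag output uses an "=@cc<cond>" constraint, where <cond> is one of
   the condition-code suffixes recognized below. For example (the
   variable names are only illustrative):

     char carry;
     asm ("addl %2, %0" : "+r" (x), "=@ccc" (carry) : "r" (y));

   stores the carry flag produced by the addition directly into CARRY,
   without needing a setcc instruction inside the asm template.  */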
44489 static rtx_insn *
44490 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
44491 vec<const char *> &constraints,
44492 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
44494 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
44495 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
44497 bool saw_asm_flag = false;
44499 start_sequence ();
44500 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
44502 const char *con = constraints[i];
44503 if (strncmp (con, "=@cc", 4) != 0)
44504 continue;
44505 con += 4;
44506 if (strchr (con, ',') != NULL)
44508 error ("alternatives not allowed in asm flag output");
44509 continue;
44512 bool invert = false;
44513 if (con[0] == 'n')
44514 invert = true, con++;
44516 machine_mode mode = CCmode;
44517 rtx_code code = UNKNOWN;
44519 switch (con[0])
44521 case 'a':
44522 if (con[1] == 0)
44523 mode = CCAmode, code = EQ;
44524 else if (con[1] == 'e' && con[2] == 0)
44525 mode = CCCmode, code = NE;
44526 break;
44527 case 'b':
44528 if (con[1] == 0)
44529 mode = CCCmode, code = EQ;
44530 else if (con[1] == 'e' && con[2] == 0)
44531 mode = CCAmode, code = NE;
44532 break;
44533 case 'c':
44534 if (con[1] == 0)
44535 mode = CCCmode, code = EQ;
44536 break;
44537 case 'e':
44538 if (con[1] == 0)
44539 mode = CCZmode, code = EQ;
44540 break;
44541 case 'g':
44542 if (con[1] == 0)
44543 mode = CCGCmode, code = GT;
44544 else if (con[1] == 'e' && con[2] == 0)
44545 mode = CCGCmode, code = GE;
44546 break;
44547 case 'l':
44548 if (con[1] == 0)
44549 mode = CCGCmode, code = LT;
44550 else if (con[1] == 'e' && con[2] == 0)
44551 mode = CCGCmode, code = LE;
44552 break;
44553 case 'o':
44554 if (con[1] == 0)
44555 mode = CCOmode, code = EQ;
44556 break;
44557 case 'p':
44558 if (con[1] == 0)
44559 mode = CCPmode, code = EQ;
44560 break;
44561 case 's':
44562 if (con[1] == 0)
44563 mode = CCSmode, code = EQ;
44564 break;
44565 case 'z':
44566 if (con[1] == 0)
44567 mode = CCZmode, code = EQ;
44568 break;
44570 if (code == UNKNOWN)
44572 error ("unknown asm flag output %qs", constraints[i]);
44573 continue;
44575 if (invert)
44576 code = reverse_condition (code);
44578 rtx dest = outputs[i];
44579 if (!saw_asm_flag)
44581 /* This is the first asm flag output. Here we put the flags
44582 register in as the real output and adjust the condition to
44583 allow it. */
44584 constraints[i] = "=Bf";
44585 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
44586 saw_asm_flag = true;
44588 else
44590 /* We don't need the flags register as output twice. */
44591 constraints[i] = "=X";
44592 outputs[i] = gen_rtx_SCRATCH (SImode);
44595 rtx x = gen_rtx_REG (mode, FLAGS_REG);
44596 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
44598 machine_mode dest_mode = GET_MODE (dest);
44599 if (!SCALAR_INT_MODE_P (dest_mode))
44601 error ("invalid type for asm flag output");
44602 continue;
44605 if (dest_mode == DImode && !TARGET_64BIT)
44606 dest_mode = SImode;
44608 if (dest_mode != QImode)
44610 rtx destqi = gen_reg_rtx (QImode);
44611 emit_insn (gen_rtx_SET (destqi, x));
44613 if (TARGET_ZERO_EXTEND_WITH_AND
44614 && optimize_function_for_speed_p (cfun))
44616 x = force_reg (dest_mode, const0_rtx);
44618 emit_insn (gen_movstrictqi
44619 (gen_lowpart (QImode, x), destqi));
44621 else
44622 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
44625 if (dest_mode != GET_MODE (dest))
44627 rtx tmp = gen_reg_rtx (SImode);
44629 emit_insn (gen_rtx_SET (tmp, x));
44630 emit_insn (gen_zero_extendsidi2 (dest, tmp));
44632 else
44633 emit_insn (gen_rtx_SET (dest, x));
44635 rtx_insn *seq = get_insns ();
44636 end_sequence ();
44638 if (saw_asm_flag)
44639 return seq;
44640 else
44642 /* If we had no asm flag outputs, clobber the flags. */
44643 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
44644 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
44645 return NULL;
44649 /* Implements the target hook targetm.asm.encode_section_info. */
44651 static void ATTRIBUTE_UNUSED
44652 ix86_encode_section_info (tree decl, rtx rtl, int first)
44654 default_encode_section_info (decl, rtl, first);
44656 if (ix86_in_large_data_p (decl))
44657 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
44660 /* Worker function for REVERSE_CONDITION. */
44662 enum rtx_code
44663 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
44665 return (mode == CCFPmode
44666 ? reverse_condition_maybe_unordered (code)
44667 : reverse_condition (code));
44670 /* Output code to perform an x87 FP register move, from OPERANDS[1]
44671 to OPERANDS[0]. */
44673 const char *
44674 output_387_reg_move (rtx_insn *insn, rtx *operands)
44676 if (REG_P (operands[0]))
44678 if (REG_P (operands[1])
44679 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44681 if (REGNO (operands[0]) == FIRST_STACK_REG)
44682 return output_387_ffreep (operands, 0);
44683 return "fstp\t%y0";
44685 if (STACK_TOP_P (operands[0]))
44686 return "fld%Z1\t%y1";
44687 return "fst\t%y0";
44689 else if (MEM_P (operands[0]))
44691 gcc_assert (REG_P (operands[1]));
44692 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
44693 return "fstp%Z0\t%y0";
44694 else
44696 /* There is no non-popping store to memory for XFmode.
44697 So if we need one, follow the store with a load. */
44698 if (GET_MODE (operands[0]) == XFmode)
44699 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
44700 else
44701 return "fst%Z0\t%y0";
44704 else
44705 gcc_unreachable();
44708 /* Output code to perform a conditional jump to LABEL, if C2 flag in
44709 FP status register is set. */
44711 void
44712 ix86_emit_fp_unordered_jump (rtx label)
44714 rtx reg = gen_reg_rtx (HImode);
44715 rtx temp;
44717 emit_insn (gen_x86_fnstsw_1 (reg));
44719 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
44721 emit_insn (gen_x86_sahf_1 (reg));
44723 temp = gen_rtx_REG (CCmode, FLAGS_REG);
44724 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
44726 else
44728 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
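/* The 0x04 mask selects bit 2 of the upper FPU status-word byte stored
   by fnstsw, i.e. the C2 condition flag (bit 10 of the status word).  */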
44730 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
44731 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
44734 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
44735 gen_rtx_LABEL_REF (VOIDmode, label),
44736 pc_rtx);
44737 temp = gen_rtx_SET (pc_rtx, temp);
44739 emit_jump_insn (temp);
44740 predict_jump (REG_BR_PROB_BASE * 10 / 100);
44743 /* Output code to perform a log1p XFmode calculation. */
44745 void ix86_emit_i387_log1p (rtx op0, rtx op1)
44747 rtx_code_label *label1 = gen_label_rtx ();
44748 rtx_code_label *label2 = gen_label_rtx ();
44750 rtx tmp = gen_reg_rtx (XFmode);
44751 rtx tmp2 = gen_reg_rtx (XFmode);
44752 rtx test;
44754 emit_insn (gen_absxf2 (tmp, op1));
44755 test = gen_rtx_GE (VOIDmode, tmp,
44756 const_double_from_real_value (
44757 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
44758 XFmode));
44759 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
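/* The threshold above is 1 - sqrt(2)/2, the limit of the input range for
   which fyl2xp1 is specified; for |op1| at or above it the code jumps to
   label1 and computes ln (1 + op1) with fyl2x instead.  */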
44761 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44762 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
44763 emit_jump (label2);
44765 emit_label (label1);
44766 emit_move_insn (tmp, CONST1_RTX (XFmode));
44767 emit_insn (gen_addxf3 (tmp, op1, tmp));
44768 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
44769 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
44771 emit_label (label2);
44774 /* Emit x87 code for computing round (halfway cases away from zero) from OP1, storing the result into OP0. */
44775 void ix86_emit_i387_round (rtx op0, rtx op1)
44777 machine_mode inmode = GET_MODE (op1);
44778 machine_mode outmode = GET_MODE (op0);
44779 rtx e1, e2, res, tmp, tmp1, half;
44780 rtx scratch = gen_reg_rtx (HImode);
44781 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
44782 rtx_code_label *jump_label = gen_label_rtx ();
44783 rtx insn;
44784 rtx (*gen_abs) (rtx, rtx);
44785 rtx (*gen_neg) (rtx, rtx);
44787 switch (inmode)
44789 case E_SFmode:
44790 gen_abs = gen_abssf2;
44791 break;
44792 case E_DFmode:
44793 gen_abs = gen_absdf2;
44794 break;
44795 case E_XFmode:
44796 gen_abs = gen_absxf2;
44797 break;
44798 default:
44799 gcc_unreachable ();
44802 switch (outmode)
44804 case E_SFmode:
44805 gen_neg = gen_negsf2;
44806 break;
44807 case E_DFmode:
44808 gen_neg = gen_negdf2;
44809 break;
44810 case E_XFmode:
44811 gen_neg = gen_negxf2;
44812 break;
44813 case E_HImode:
44814 gen_neg = gen_neghi2;
44815 break;
44816 case E_SImode:
44817 gen_neg = gen_negsi2;
44818 break;
44819 case E_DImode:
44820 gen_neg = gen_negdi2;
44821 break;
44822 default:
44823 gcc_unreachable ();
44826 e1 = gen_reg_rtx (inmode);
44827 e2 = gen_reg_rtx (inmode);
44828 res = gen_reg_rtx (outmode);
44830 half = const_double_from_real_value (dconsthalf, inmode);
44832 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
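/* For example, for op1 == -2.5 this gives fabs = 2.5, 2.5 + 0.5 = 3.0,
   floor (3.0) = 3.0, and the sign test below negates the result to -3.0,
   so halfway cases round away from zero.  */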
44834 /* scratch = fxam(op1) */
44835 emit_insn (gen_rtx_SET (scratch,
44836 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
44837 UNSPEC_FXAM)));
44838 /* e1 = fabs(op1) */
44839 emit_insn (gen_abs (e1, op1));
44841 /* e2 = e1 + 0.5 */
44842 half = force_reg (inmode, half);
44843 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
44845 /* res = floor(e2) */
44846 if (inmode != XFmode)
44848 tmp1 = gen_reg_rtx (XFmode);
44850 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
44852 else
44853 tmp1 = e2;
44855 switch (outmode)
44857 case E_SFmode:
44858 case E_DFmode:
44860 rtx tmp0 = gen_reg_rtx (XFmode);
44862 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
44864 emit_insn (gen_rtx_SET (res,
44865 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
44866 UNSPEC_TRUNC_NOOP)));
44868 break;
44869 case E_XFmode:
44870 emit_insn (gen_frndintxf2_floor (res, tmp1));
44871 break;
44872 case E_HImode:
44873 emit_insn (gen_lfloorxfhi2 (res, tmp1));
44874 break;
44875 case E_SImode:
44876 emit_insn (gen_lfloorxfsi2 (res, tmp1));
44877 break;
44878 case E_DImode:
44879 emit_insn (gen_lfloorxfdi2 (res, tmp1));
44880 break;
44881 default:
44882 gcc_unreachable ();
44885 /* flags = signbit(a) */
44886 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
44888 /* if (flags) then res = -res */
44889 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
44890 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
44891 gen_rtx_LABEL_REF (VOIDmode, jump_label),
44892 pc_rtx);
44893 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44894 predict_jump (REG_BR_PROB_BASE * 50 / 100);
44895 JUMP_LABEL (insn) = jump_label;
44897 emit_insn (gen_neg (res, res));
44899 emit_label (jump_label);
44900 LABEL_NUSES (jump_label) = 1;
44902 emit_move_insn (op0, res);
44905 /* Output code to perform a Newton-Raphson approximation of a single precision
44906 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
44908 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
44910 rtx x0, x1, e0, e1;
44912 x0 = gen_reg_rtx (mode);
44913 e0 = gen_reg_rtx (mode);
44914 e1 = gen_reg_rtx (mode);
44915 x1 = gen_reg_rtx (mode);
44917 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
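/* This is one Newton-Raphson step for 1/b: with x0 = rcp(b),
   x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0),
   which roughly doubles the number of correct bits in the estimate;
   the quotient is then approximated as a * x1.  */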
44919 b = force_reg (mode, b);
44921 /* x0 = rcp(b) estimate */
44922 if (mode == V16SFmode || mode == V8DFmode)
44924 if (TARGET_AVX512ER)
44926 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44927 UNSPEC_RCP28)));
44928 /* res = a * x0 */
44929 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
44930 return;
44932 else
44933 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44934 UNSPEC_RCP14)));
44936 else
44937 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
44938 UNSPEC_RCP)));
44940 /* e0 = x0 * b */
44941 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
44943 /* e0 = x0 * e0 */
44944 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
44946 /* e1 = x0 + x0 */
44947 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
44949 /* x1 = e1 - e0 */
44950 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
44952 /* res = a * x1 */
44953 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
44956 /* Output code to perform a Newton-Raphson approximation of a
44957 single precision floating point [reciprocal] square root. */
44959 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
44961 rtx x0, e0, e1, e2, e3, mthree, mhalf;
44962 REAL_VALUE_TYPE r;
44963 int unspec;
44965 x0 = gen_reg_rtx (mode);
44966 e0 = gen_reg_rtx (mode);
44967 e1 = gen_reg_rtx (mode);
44968 e2 = gen_reg_rtx (mode);
44969 e3 = gen_reg_rtx (mode);
44971 if (TARGET_AVX512ER && mode == V16SFmode)
44973 if (recip)
44974 /* res = rsqrt28(a) estimate */
44975 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44976 UNSPEC_RSQRT28)));
44977 else
44979 /* x0 = rsqrt28(a) estimate */
44980 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
44981 UNSPEC_RSQRT28)));
44982 /* res = rcp28(x0) estimate */
44983 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
44984 UNSPEC_RCP28)));
44986 return;
44989 real_from_integer (&r, VOIDmode, -3, SIGNED);
44990 mthree = const_double_from_real_value (r, SFmode);
44992 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
44993 mhalf = const_double_from_real_value (r, SFmode);
44994 unspec = UNSPEC_RSQRT;
44996 if (VECTOR_MODE_P (mode))
44998 mthree = ix86_build_const_vector (mode, true, mthree);
44999 mhalf = ix86_build_const_vector (mode, true, mhalf);
45000 /* There is no 512-bit rsqrt. There is however rsqrt14. */
45001 if (GET_MODE_SIZE (mode) == 64)
45002 unspec = UNSPEC_RSQRT14;
45005 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
45006 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
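/* Both follow from one Newton-Raphson step for 1/sqrt(a): with
   x0 = rsqrtss(a), x1 = 0.5 * x0 * (3 - a * x0 * x0)
   = -0.5 * x0 * (a * x0 * x0 - 3), and sqrt(a) = a * rsqrt(a).  */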
45008 a = force_reg (mode, a);
45010 /* x0 = rsqrt(a) estimate */
45011 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
45012 unspec)));
45014 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent NaN for sqrt (0.0). */
45015 if (!recip)
45017 rtx zero = force_reg (mode, CONST0_RTX(mode));
45018 rtx mask;
45020 /* Handle masked compare. */
45021 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
45023 mask = gen_reg_rtx (HImode);
45024 /* Imm value 0x4 corresponds to not-equal comparison. */
45025 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
45026 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
45028 else
45030 mask = gen_reg_rtx (mode);
45031 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
45032 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
45036 /* e0 = x0 * a */
45037 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
45038 /* e1 = e0 * x0 */
45039 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
45041 /* e2 = e1 - 3. */
45042 mthree = force_reg (mode, mthree);
45043 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
45045 mhalf = force_reg (mode, mhalf);
45046 if (recip)
45047 /* e3 = -.5 * x0 */
45048 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
45049 else
45050 /* e3 = -.5 * e0 */
45051 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
45052 /* ret = e2 * e3 */
45053 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
45056 #ifdef TARGET_SOLARIS
45057 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
45059 static void
45060 i386_solaris_elf_named_section (const char *name, unsigned int flags,
45061 tree decl)
45063 /* With Binutils 2.15, the "@unwind" marker must be specified on
45064 every occurrence of the ".eh_frame" section, not just the first
45065 one. */
45066 if (TARGET_64BIT
45067 && strcmp (name, ".eh_frame") == 0)
45069 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
45070 flags & SECTION_WRITE ? "aw" : "a");
45071 return;
45074 #ifndef USE_GAS
45075 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
45077 solaris_elf_asm_comdat_section (name, flags, decl);
45078 return;
45081 /* Solaris/x86 as uses the same syntax for the SHF_EXCLUDE flags as the
45082 SPARC assembler. One cannot mix single-letter flags and #exclude, so
45083 only emit the latter here. */
45084 if (flags & SECTION_EXCLUDE)
45086 fprintf (asm_out_file, "\t.section\t%s,#exclude\n", name);
45087 return;
45089 #endif
45091 default_elf_asm_named_section (name, flags, decl);
45093 #endif /* TARGET_SOLARIS */
45095 /* Return the mangling of TYPE if it is an extended fundamental type. */
45097 static const char *
45098 ix86_mangle_type (const_tree type)
45100 type = TYPE_MAIN_VARIANT (type);
45102 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
45103 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
45104 return NULL;
45106 switch (TYPE_MODE (type))
45108 case E_TFmode:
45109 /* __float128 is "g". */
45110 return "g";
45111 case E_XFmode:
45112 /* "long double" or __float80 is "e". */
45113 return "e";
45114 default:
45115 return NULL;
45119 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
45121 static tree
45122 ix86_stack_protect_guard (void)
45124 if (TARGET_SSP_TLS_GUARD)
45126 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
45127 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
45128 tree type = build_qualified_type (type_node, qual);
45129 tree t;
45131 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
45133 t = ix86_tls_stack_chk_guard_decl;
45135 if (t == NULL)
45137 rtx x;
45139 t = build_decl
45140 (UNKNOWN_LOCATION, VAR_DECL,
45141 get_identifier (ix86_stack_protector_guard_symbol_str),
45142 type);
45143 TREE_STATIC (t) = 1;
45144 TREE_PUBLIC (t) = 1;
45145 DECL_EXTERNAL (t) = 1;
45146 TREE_USED (t) = 1;
45147 TREE_THIS_VOLATILE (t) = 1;
45148 DECL_ARTIFICIAL (t) = 1;
45149 DECL_IGNORED_P (t) = 1;
45151 /* Do not share RTL as the declaration is visible outside of
45152 the current function. */
45153 x = DECL_RTL (t);
45154 RTX_FLAG (x, used) = 1;
45156 ix86_tls_stack_chk_guard_decl = t;
45159 else
45161 tree asptrtype = build_pointer_type (type);
45163 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
45164 t = build2 (MEM_REF, asptrtype, t,
45165 build_int_cst (asptrtype, 0));
45168 return t;
45171 return default_stack_protect_guard ();
45174 /* For 32-bit code we can save PIC register setup by using the
45175 hidden __stack_chk_fail_local function instead of calling
45176 __stack_chk_fail directly. 64-bit code doesn't need to set up any
45177 PIC register, so it is better to call __stack_chk_fail directly. */
45179 static tree ATTRIBUTE_UNUSED
45180 ix86_stack_protect_fail (void)
45182 return TARGET_64BIT
45183 ? default_external_stack_protect_fail ()
45184 : default_hidden_stack_protect_fail ();
45187 /* Select a format to encode pointers in exception handling data. CODE
45188 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
45189 true if the symbol may be affected by dynamic relocations.
45191 ??? All x86 object file formats are capable of representing this.
45192 After all, the relocation needed is the same as for the call insn.
45193 Whether or not a particular assembler allows us to enter such, I
45194 guess we'll have to see. */
45195 int
45196 asm_preferred_eh_data_format (int code, int global)
45198 if (flag_pic)
45200 int type = DW_EH_PE_sdata8;
45201 if (!TARGET_64BIT
45202 || ix86_cmodel == CM_SMALL_PIC
45203 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
45204 type = DW_EH_PE_sdata4;
45205 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
45207 if (ix86_cmodel == CM_SMALL
45208 || (ix86_cmodel == CM_MEDIUM && code))
45209 return DW_EH_PE_udata4;
45210 return DW_EH_PE_absptr;
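/* For example, -fpic code on x86-64 with the small code model gets
   DW_EH_PE_pcrel | DW_EH_PE_sdata4, plus DW_EH_PE_indirect for symbols
   that dynamic relocations may affect, while non-PIC small-model code
   simply uses DW_EH_PE_udata4.  */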
45213 /* Expand copysign: apply the sign of SIGN to the positive value
45214 ABS_VALUE, storing the result in RESULT. If MASK is non-null, it is
45215 the mask used to mask out the sign bit. */
45216 static void
45217 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
45219 machine_mode mode = GET_MODE (sign);
45220 rtx sgn = gen_reg_rtx (mode);
45221 if (mask == NULL_RTX)
45223 machine_mode vmode;
45225 if (mode == SFmode)
45226 vmode = V4SFmode;
45227 else if (mode == DFmode)
45228 vmode = V2DFmode;
45229 else
45230 vmode = mode;
45232 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
45233 if (!VECTOR_MODE_P (mode))
45235 /* We need to generate a scalar mode mask in this case. */
45236 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45237 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45238 mask = gen_reg_rtx (mode);
45239 emit_insn (gen_rtx_SET (mask, tmp));
45242 else
45243 mask = gen_rtx_NOT (mode, mask);
45244 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
45245 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
45248 /* Expand fabs (OP0) and return a new rtx that holds the result. The
45249 mask for masking out the sign-bit is stored in *SMASK, if that is
45250 non-null. */
45251 static rtx
45252 ix86_expand_sse_fabs (rtx op0, rtx *smask)
45254 machine_mode vmode, mode = GET_MODE (op0);
45255 rtx xa, mask;
45257 xa = gen_reg_rtx (mode);
45258 if (mode == SFmode)
45259 vmode = V4SFmode;
45260 else if (mode == DFmode)
45261 vmode = V2DFmode;
45262 else
45263 vmode = mode;
45264 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
45265 if (!VECTOR_MODE_P (mode))
45267 /* We need to generate a scalar mode mask in this case. */
45268 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
45269 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
45270 mask = gen_reg_rtx (mode);
45271 emit_insn (gen_rtx_SET (mask, tmp));
45273 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
45275 if (smask)
45276 *smask = mask;
45278 return xa;
45281 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
45282 swapping the operands if SWAP_OPERANDS is true. The expanded
45283 code is a forward jump to a newly created label in case the
45284 comparison is true. The generated label rtx is returned. */
45285 static rtx_code_label *
45286 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
45287 bool swap_operands)
45289 bool unordered_compare = ix86_unordered_fp_compare (code);
45290 rtx_code_label *label;
45291 rtx tmp, reg;
45293 if (swap_operands)
45294 std::swap (op0, op1);
45296 label = gen_label_rtx ();
45297 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
45298 if (unordered_compare)
45299 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
45300 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
45301 emit_insn (gen_rtx_SET (reg, tmp));
45302 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
45303 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
45304 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
45305 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
45306 JUMP_LABEL (tmp) = label;
45308 return label;
45311 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
45312 using comparison code CODE. Operands are swapped for the comparison if
45313 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
45314 static rtx
45315 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
45316 bool swap_operands)
45318 rtx (*insn)(rtx, rtx, rtx, rtx);
45319 machine_mode mode = GET_MODE (op0);
45320 rtx mask = gen_reg_rtx (mode);
45322 if (swap_operands)
45323 std::swap (op0, op1);
45325 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
45327 emit_insn (insn (mask, op0, op1,
45328 gen_rtx_fmt_ee (code, mode, op0, op1)));
45329 return mask;
45332 /* Generate and return a rtx of mode MODE for 2**n where n is the number
45333 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
45334 static rtx
45335 ix86_gen_TWO52 (machine_mode mode)
45337 REAL_VALUE_TYPE TWO52r;
45338 rtx TWO52;
45340 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
45341 TWO52 = const_double_from_real_value (TWO52r, mode);
45342 TWO52 = force_reg (mode, TWO52);
45344 return TWO52;
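/* The expanders below use this constant for the "add and subtract 2**52"
   trick: once |x| + 2**52 has been computed, no fractional bits remain
   in double precision, so subtracting 2**52 again yields |x| rounded to
   an integer in the current rounding mode (2**23 plays the same role
   for SFmode).  */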
45347 /* Expand SSE sequence for computing lround from OP1 storing
45348 into OP0. */
45349 void
45350 ix86_expand_lround (rtx op0, rtx op1)
45352 /* C code for the stuff we're doing below:
45353 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
45354 return (long)tmp;
45356 machine_mode mode = GET_MODE (op1);
45357 const struct real_format *fmt;
45358 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45359 rtx adj;
45361 /* load nextafter (0.5, 0.0) */
45362 fmt = REAL_MODE_FORMAT (mode);
45363 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45364 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
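/* Using the largest representable value below 0.5 rather than 0.5 itself
   avoids a double-rounding problem: for the largest double below 0.5,
   adding exactly 0.5 would round up to 1.0 and the truncation below
   would wrongly produce 1 instead of 0.  */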
45366 /* adj = copysign (0.5, op1) */
45367 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
45368 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
45370 /* adj = op1 + adj */
45371 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
45373 /* op0 = (imode)adj */
45374 expand_fix (op0, adj, 0);
45377 /* Expand SSE2 sequence for computing lfloor or lceil (depending on
45378 DO_FLOOR) from OP1 storing into OP0. */
45379 void
45380 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
45382 /* C code for the stuff we're doing below (for do_floor):
45383 xi = (long)op1;
45384 xi -= (double)xi > op1 ? 1 : 0;
45385 return xi;
45387 machine_mode fmode = GET_MODE (op1);
45388 machine_mode imode = GET_MODE (op0);
45389 rtx ireg, freg, tmp;
45390 rtx_code_label *label;
45392 /* reg = (long)op1 */
45393 ireg = gen_reg_rtx (imode);
45394 expand_fix (ireg, op1, 0);
45396 /* freg = (double)reg */
45397 freg = gen_reg_rtx (fmode);
45398 expand_float (freg, ireg, 0);
45400 /* ireg = (freg > op1) ? ireg - 1 : ireg */
45401 label = ix86_expand_sse_compare_and_jump (UNLE,
45402 freg, op1, !do_floor);
45403 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
45404 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
45405 emit_move_insn (ireg, tmp);
45407 emit_label (label);
45408 LABEL_NUSES (label) = 1;
45410 emit_move_insn (op0, ireg);
45413 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
45414 void
45415 ix86_expand_rint (rtx operand0, rtx operand1)
45417 /* C code for the stuff we're doing below:
45418 xa = fabs (operand1);
45419 if (!isless (xa, 2**52))
45420 return operand1;
45421 two52 = 2**52;
45422 if (flag_rounding_math)
45424 two52 = copysign (two52, operand1);
45425 xa = operand1;
45427 xa = xa + two52 - two52;
45428 return copysign (xa, operand1);
45430 machine_mode mode = GET_MODE (operand0);
45431 rtx res, xa, TWO52, two52, mask;
45432 rtx_code_label *label;
45434 res = gen_reg_rtx (mode);
45435 emit_move_insn (res, operand1);
45437 /* xa = abs (operand1) */
45438 xa = ix86_expand_sse_fabs (res, &mask);
45440 /* if (!isless (xa, TWO52)) goto label; */
45441 TWO52 = ix86_gen_TWO52 (mode);
45442 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45444 two52 = TWO52;
45445 if (flag_rounding_math)
45447 two52 = gen_reg_rtx (mode);
45448 ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
45449 xa = res;
45452 xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
45453 xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
45455 ix86_sse_copysign_to_positive (res, xa, res, mask);
45457 emit_label (label);
45458 LABEL_NUSES (label) = 1;
45460 emit_move_insn (operand0, res);
45463 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45464 into OPERAND0. */
45465 void
45466 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
45468 /* C code for the stuff we expand below.
45469 double xa = fabs (x), x2;
45470 if (!isless (xa, TWO52))
45471 return x;
45472 xa = xa + TWO52 - TWO52;
45473 x2 = copysign (xa, x);
45474 Compensate. Floor:
45475 if (x2 > x)
45476 x2 -= 1;
45477 Compensate. Ceil:
45478 if (x2 < x)
45479 x2 -= -1;
45480 return x2;
45482 machine_mode mode = GET_MODE (operand0);
45483 rtx xa, TWO52, tmp, one, res, mask;
45484 rtx_code_label *label;
45486 TWO52 = ix86_gen_TWO52 (mode);
45488 /* Temporary for holding the result, initialized to the input
45489 operand to ease control flow. */
45490 res = gen_reg_rtx (mode);
45491 emit_move_insn (res, operand1);
45493 /* xa = abs (operand1) */
45494 xa = ix86_expand_sse_fabs (res, &mask);
45496 /* if (!isless (xa, TWO52)) goto label; */
45497 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45499 /* xa = xa + TWO52 - TWO52; */
45500 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45501 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
45503 /* xa = copysign (xa, operand1) */
45504 ix86_sse_copysign_to_positive (xa, xa, res, mask);
45506 /* generate 1.0 or -1.0 */
45507 one = force_reg (mode,
45508 const_double_from_real_value (do_floor
45509 ? dconst1 : dconstm1, mode));
45511 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45512 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45513 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45514 /* We always need to subtract here to preserve signed zero. */
45515 tmp = expand_simple_binop (mode, MINUS,
45516 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45517 emit_move_insn (res, tmp);
45519 emit_label (label);
45520 LABEL_NUSES (label) = 1;
45522 emit_move_insn (operand0, res);
45525 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
45526 into OPERAND0. */
45527 void
45528 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
45530 /* C code for the stuff we expand below.
45531 double xa = fabs (x), x2;
45532 if (!isless (xa, TWO52))
45533 return x;
45534 x2 = (double)(long)x;
45535 Compensate. Floor:
45536 if (x2 > x)
45537 x2 -= 1;
45538 Compensate. Ceil:
45539 if (x2 < x)
45540 x2 += 1;
45541 if (HONOR_SIGNED_ZEROS (mode))
45542 return copysign (x2, x);
45543 return x2;
45545 machine_mode mode = GET_MODE (operand0);
45546 rtx xa, xi, TWO52, tmp, one, res, mask;
45547 rtx_code_label *label;
45549 TWO52 = ix86_gen_TWO52 (mode);
45551 /* Temporary for holding the result, initialized to the input
45552 operand to ease control flow. */
45553 res = gen_reg_rtx (mode);
45554 emit_move_insn (res, operand1);
45556 /* xa = abs (operand1) */
45557 xa = ix86_expand_sse_fabs (res, &mask);
45559 /* if (!isless (xa, TWO52)) goto label; */
45560 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45562 /* xa = (double)(long)x */
45563 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45564 expand_fix (xi, res, 0);
45565 expand_float (xa, xi, 0);
45567 /* generate 1.0 */
45568 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45570 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
45571 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
45572 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45573 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
45574 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45575 emit_move_insn (res, tmp);
45577 if (HONOR_SIGNED_ZEROS (mode))
45578 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45580 emit_label (label);
45581 LABEL_NUSES (label) = 1;
45583 emit_move_insn (operand0, res);
45586 /* Expand SSE sequence for computing round from OPERAND1 storing
45587 into OPERAND0. The sequence works without relying on DImode truncation
45588 via cvttsd2siq, which is only available on 64-bit targets. */
45589 void
45590 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
45592 /* C code for the stuff we expand below.
45593 double xa = fabs (x), xa2, x2;
45594 if (!isless (xa, TWO52))
45595 return x;
45596 Using the absolute value and copying back sign makes
45597 -0.0 -> -0.0 correct.
45598 xa2 = xa + TWO52 - TWO52;
45599 Compensate.
45600 dxa = xa2 - xa;
45601 if (dxa <= -0.5)
45602 xa2 += 1;
45603 else if (dxa > 0.5)
45604 xa2 -= 1;
45605 x2 = copysign (xa2, x);
45606 return x2;
45608 machine_mode mode = GET_MODE (operand0);
45609 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
45610 rtx_code_label *label;
45612 TWO52 = ix86_gen_TWO52 (mode);
45614 /* Temporary for holding the result, initialized to the input
45615 operand to ease control flow. */
45616 res = gen_reg_rtx (mode);
45617 emit_move_insn (res, operand1);
45619 /* xa = abs (operand1) */
45620 xa = ix86_expand_sse_fabs (res, &mask);
45622 /* if (!isless (xa, TWO52)) goto label; */
45623 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45625 /* xa2 = xa + TWO52 - TWO52; */
45626 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45627 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
45629 /* dxa = xa2 - xa; */
45630 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
45632 /* generate 0.5, 1.0 and -0.5 */
45633 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
45634 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
45635 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
45636 0, OPTAB_DIRECT);
45638 /* Compensate. */
45639 tmp = gen_reg_rtx (mode);
45640 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
45641 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
45642 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45643 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45644 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
45645 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
45646 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
45647 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
45649 /* res = copysign (xa2, operand1) */
45650 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
45652 emit_label (label);
45653 LABEL_NUSES (label) = 1;
45655 emit_move_insn (operand0, res);
45658 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45659 into OPERAND0. */
45660 void
45661 ix86_expand_trunc (rtx operand0, rtx operand1)
45663 /* C code for SSE variant we expand below.
45664 double xa = fabs (x), x2;
45665 if (!isless (xa, TWO52))
45666 return x;
45667 x2 = (double)(long)x;
45668 if (HONOR_SIGNED_ZEROS (mode))
45669 return copysign (x2, x);
45670 return x2;
45672 machine_mode mode = GET_MODE (operand0);
45673 rtx xa, xi, TWO52, res, mask;
45674 rtx_code_label *label;
45676 TWO52 = ix86_gen_TWO52 (mode);
45678 /* Temporary for holding the result, initialized to the input
45679 operand to ease control flow. */
45680 res = gen_reg_rtx (mode);
45681 emit_move_insn (res, operand1);
45683 /* xa = abs (operand1) */
45684 xa = ix86_expand_sse_fabs (res, &mask);
45686 /* if (!isless (xa, TWO52)) goto label; */
45687 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45689 /* x = (double)(long)x */
45690 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45691 expand_fix (xi, res, 0);
45692 expand_float (res, xi, 0);
45694 if (HONOR_SIGNED_ZEROS (mode))
45695 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
45697 emit_label (label);
45698 LABEL_NUSES (label) = 1;
45700 emit_move_insn (operand0, res);
45703 /* Expand SSE sequence for computing trunc from OPERAND1 storing
45704 into OPERAND0. */
45705 void
45706 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
45708 machine_mode mode = GET_MODE (operand0);
45709 rtx xa, mask, TWO52, one, res, smask, tmp;
45710 rtx_code_label *label;
45712 /* C code for SSE variant we expand below.
45713 double xa = fabs (x), x2;
45714 if (!isless (xa, TWO52))
45715 return x;
45716 xa2 = xa + TWO52 - TWO52;
45717 Compensate:
45718 if (xa2 > xa)
45719 xa2 -= 1.0;
45720 x2 = copysign (xa2, x);
45721 return x2;
45724 TWO52 = ix86_gen_TWO52 (mode);
45726 /* Temporary for holding the result, initialized to the input
45727 operand to ease control flow. */
45728 res = gen_reg_rtx (mode);
45729 emit_move_insn (res, operand1);
45731 /* xa = abs (operand1) */
45732 xa = ix86_expand_sse_fabs (res, &smask);
45734 /* if (!isless (xa, TWO52)) goto label; */
45735 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45737 /* res = xa + TWO52 - TWO52; */
45738 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
45739 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
45740 emit_move_insn (res, tmp);
45742 /* generate 1.0 */
45743 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
45745 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
45746 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
45747 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
45748 tmp = expand_simple_binop (mode, MINUS,
45749 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
45750 emit_move_insn (res, tmp);
45752 /* res = copysign (res, operand1) */
45753 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
45755 emit_label (label);
45756 LABEL_NUSES (label) = 1;
45758 emit_move_insn (operand0, res);
45761 /* Expand SSE sequence for computing round from OPERAND1 storing
45762 into OPERAND0. */
45763 void
45764 ix86_expand_round (rtx operand0, rtx operand1)
45766 /* C code for the stuff we're doing below:
45767 double xa = fabs (x);
45768 if (!isless (xa, TWO52))
45769 return x;
45770 xa = (double)(long)(xa + nextafter (0.5, 0.0));
45771 return copysign (xa, x);
45773 machine_mode mode = GET_MODE (operand0);
45774 rtx res, TWO52, xa, xi, half, mask;
45775 rtx_code_label *label;
45776 const struct real_format *fmt;
45777 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45779 /* Temporary for holding the result, initialized to the input
45780 operand to ease control flow. */
45781 res = gen_reg_rtx (mode);
45782 emit_move_insn (res, operand1);
45784 TWO52 = ix86_gen_TWO52 (mode);
45785 xa = ix86_expand_sse_fabs (res, &mask);
45786 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
45788 /* load nextafter (0.5, 0.0) */
45789 fmt = REAL_MODE_FORMAT (mode);
45790 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45791 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45793 /* xa = xa + 0.5 */
45794 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
45795 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
45797 /* xa = (double)(int64_t)xa */
45798 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
45799 expand_fix (xi, xa, 0);
45800 expand_float (xa, xi, 0);
45802 /* res = copysign (xa, operand1) */
45803 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
45805 emit_label (label);
45806 LABEL_NUSES (label) = 1;
45808 emit_move_insn (operand0, res);
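/* Editorial sketch (not part of the upstream sources): the scheme
   ix86_expand_round implements above, as plain scalar C for IEEE doubles.
   pred_half is nextafter (0.5, 0.0) == 0.5 - 2^-54 (precision p == 53),
   which keeps inputs just below 0.5 from being rounded up to 1.0 by the
   bias-and-truncate step.  Round-to-nearest mode is assumed; the name is
   made up for illustration only.  */
static double
sketch_round_via_bias (double x)
{
  const double two52 = 0x1.0p52;
  const double pred_half = 0.5 - 0x1.0p-54;	/* nextafter (0.5, 0.0).  */
  double xa = __builtin_fabs (x);
  if (!(xa < two52))
    return x;					/* Already integral or NaN.  */
  xa = (double) (long long) (xa + pred_half);	/* Bias, then truncate.  */
  return __builtin_copysign (xa, x);		/* Reattach the sign.  */
}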
45811 /* Expand SSE sequence for computing round
45812 from OP1 storing into OP0 using sse4 round insn. */
45813 void
45814 ix86_expand_round_sse4 (rtx op0, rtx op1)
45816 machine_mode mode = GET_MODE (op0);
45817 rtx e1, e2, res, half;
45818 const struct real_format *fmt;
45819 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
45820 rtx (*gen_copysign) (rtx, rtx, rtx);
45821 rtx (*gen_round) (rtx, rtx, rtx);
45823 switch (mode)
45825 case E_SFmode:
45826 gen_copysign = gen_copysignsf3;
45827 gen_round = gen_sse4_1_roundsf2;
45828 break;
45829 case E_DFmode:
45830 gen_copysign = gen_copysigndf3;
45831 gen_round = gen_sse4_1_rounddf2;
45832 break;
45833 default:
45834 gcc_unreachable ();
45837 /* round (a) = trunc (a + copysign (0.5, a)) */
45839 /* load nextafter (0.5, 0.0) */
45840 fmt = REAL_MODE_FORMAT (mode);
45841 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
45842 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
45843 half = const_double_from_real_value (pred_half, mode);
45845 /* e1 = copysign (0.5, op1) */
45846 e1 = gen_reg_rtx (mode);
45847 emit_insn (gen_copysign (e1, half, op1));
45849 /* e2 = op1 + e1 */
45850 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
45852 /* res = trunc (e2) */
45853 res = gen_reg_rtx (mode);
45854 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
45856 emit_move_insn (op0, res);
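/* Editorial sketch (not part of the upstream sources): the identity the
   SSE4.1 path above relies on, round (a) = trunc (a + copysign (pred_half,
   a)) with pred_half = nextafter (0.5, 0.0), written as plain scalar C.
   Names are made up for illustration only.  */
static double
sketch_round_sse4 (double x)
{
  const double pred_half = 0.5 - 0x1.0p-54;	/* nextafter (0.5, 0.0).  */
  double e1 = __builtin_copysign (pred_half, x);	/* Signed bias.  */
  return __builtin_trunc (x + e1);	/* The roundsd/ROUND_TRUNC step.  */
}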
45860 /* Table of valid machine attributes. */
45861 static const struct attribute_spec ix86_attribute_table[] =
45863 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
45864 affects_type_identity, handler, exclude } */
45865 /* Stdcall attribute says callee is responsible for popping arguments
45866 if they are not variable. */
45867 { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45868 NULL },
45869 /* Fastcall attribute says callee is responsible for popping arguments
45870 if they are not variable. */
45871 { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45872 NULL },
45873 /* Thiscall attribute says callee is responsible for popping arguments
45874 if they are not variable. */
45875 { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45876 NULL },
45877 /* Cdecl attribute says the callee is a normal C declaration */
45878 { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45879 NULL },
45880 /* Regparm attribute specifies how many integer arguments are to be
45881 passed in registers. */
45882 { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
45883 NULL },
45884 /* Sseregparm attribute says we are using x86_64 calling conventions
45885 for FP arguments. */
45886 { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
45887 NULL },
45888 /* The transactional memory builtins are implicitly regparm or fastcall
45889 depending on the ABI. Override the generic do-nothing attribute that
45890 these builtins were declared with. */
45891 { "*tm regparm", 0, 0, false, true, true, true,
45892 ix86_handle_tm_regparm_attribute, NULL },
45893 /* force_align_arg_pointer says this function realigns the stack at entry. */
45894 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
45895 false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
45896 NULL },
45897 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
45898 { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
45899 NULL },
45900 { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
45901 NULL },
45902 { "shared", 0, 0, true, false, false, false,
45903 ix86_handle_shared_attribute, NULL },
45904 #endif
45905 { "ms_struct", 0, 0, false, false, false, false,
45906 ix86_handle_struct_attribute, NULL },
45907 { "gcc_struct", 0, 0, false, false, false, false,
45908 ix86_handle_struct_attribute, NULL },
45909 #ifdef SUBTARGET_ATTRIBUTE_TABLE
45910 SUBTARGET_ATTRIBUTE_TABLE,
45911 #endif
45912 /* ms_abi and sysv_abi calling convention function attributes. */
45913 { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
45914 { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
45915 NULL },
45916 { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
45917 { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
45918 { "ms_hook_prologue", 0, 0, true, false, false, false,
45919 ix86_handle_fndecl_attribute, NULL },
45920 { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
45921 ix86_handle_callee_pop_aggregate_return, NULL },
45922 { "interrupt", 0, 0, false, true, true, false,
45923 ix86_handle_interrupt_attribute, NULL },
45924 { "no_caller_saved_registers", 0, 0, false, true, true, false,
45925 ix86_handle_no_caller_saved_registers_attribute, NULL },
45926 { "naked", 0, 0, true, false, false, false,
45927 ix86_handle_fndecl_attribute, NULL },
45928 { "indirect_branch", 1, 1, true, false, false, false,
45929 ix86_handle_fndecl_attribute, NULL },
45930 { "function_return", 1, 1, true, false, false, false,
45931 ix86_handle_fndecl_attribute, NULL },
45932 { "indirect_return", 0, 0, false, true, true, false,
45933 NULL, NULL },
45935 /* End element. */
45936 { NULL, 0, 0, false, false, false, false, NULL, NULL }
45939 /* Implement targetm.vectorize.builtin_vectorization_cost. */
45940 static int
45941 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
45942 tree vectype, int)
45944 bool fp = false;
45945 machine_mode mode = TImode;
45946 int index;
45947 if (vectype != NULL)
45949 fp = FLOAT_TYPE_P (vectype);
45950 mode = TYPE_MODE (vectype);
45953 switch (type_of_cost)
45955 case scalar_stmt:
45956 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
45958 case scalar_load:
45959 /* Load/store costs are relative to a register move, which is 2. Rescale
45960 them to COSTS_N_INSNS so everything has the same base. */
45961 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
45962 : ix86_cost->int_load [2]) / 2;
45964 case scalar_store:
45965 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
45966 : ix86_cost->int_store [2]) / 2;
45968 case vector_stmt:
45969 return ix86_vec_cost (mode,
45970 fp ? ix86_cost->addss : ix86_cost->sse_op,
45971 true);
45973 case vector_load:
45974 index = sse_store_index (mode);
45975 /* See PR82713 - we may end up being called on non-vector type. */
45976 if (index < 0)
45977 index = 2;
45978 return ix86_vec_cost (mode,
45979 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
45980 true);
45982 case vector_store:
45983 index = sse_store_index (mode);
45984 /* See PR82713 - we may end up being called on non-vector type. */
45985 if (index < 0)
45986 index = 2;
45987 return ix86_vec_cost (mode,
45988 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
45989 true);
45991 case vec_to_scalar:
45992 case scalar_to_vec:
45993 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
45995 /* We should have separate costs for unaligned loads and gather/scatter.
45996 Do that incrementally. */
45997 case unaligned_load:
45998 index = sse_store_index (mode);
45999 /* See PR82713 - we may end up being called on non-vector type. */
46000 if (index < 0)
46001 index = 2;
46002 return ix86_vec_cost (mode,
46003 COSTS_N_INSNS
46004 (ix86_cost->sse_unaligned_load[index]) / 2,
46005 true);
46007 case unaligned_store:
46008 index = sse_store_index (mode);
46009 /* See PR82713 - we may end up being called on non-vector type. */
46010 if (index < 0)
46011 index = 2;
46012 return ix86_vec_cost (mode,
46013 COSTS_N_INSNS
46014 (ix86_cost->sse_unaligned_store[index]) / 2,
46015 true);
46017 case vector_gather_load:
46018 return ix86_vec_cost (mode,
46019 COSTS_N_INSNS
46020 (ix86_cost->gather_static
46021 + ix86_cost->gather_per_elt
46022 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46023 true);
46025 case vector_scatter_store:
46026 return ix86_vec_cost (mode,
46027 COSTS_N_INSNS
46028 (ix86_cost->scatter_static
46029 + ix86_cost->scatter_per_elt
46030 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
46031 true);
46033 case cond_branch_taken:
46034 return ix86_cost->cond_taken_branch_cost;
46036 case cond_branch_not_taken:
46037 return ix86_cost->cond_not_taken_branch_cost;
46039 case vec_perm:
46040 case vec_promote_demote:
46041 return ix86_vec_cost (mode,
46042 ix86_cost->sse_op, true);
46044 case vec_construct:
46046 /* N element inserts. */
46047 int cost = ix86_vec_cost (mode, ix86_cost->sse_op, false);
46048 /* One vinserti128 for combining two SSE vectors for AVX256. */
46049 if (GET_MODE_BITSIZE (mode) == 256)
46050 cost += ix86_vec_cost (mode, ix86_cost->addss, true);
46051 /* One vinserti64x4 and two vinserti128 for combining SSE
46052 and AVX256 vectors to AVX512. */
46053 else if (GET_MODE_BITSIZE (mode) == 512)
46054 cost += 3 * ix86_vec_cost (mode, ix86_cost->addss, true);
46055 return cost;
46058 default:
46059 gcc_unreachable ();
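/* Editorial note (not from the upstream sources): a worked example of the
   rescaling used by the scalar_load/scalar_store cases above.  The cost
   tables express load/store costs relative to a register-to-register move,
   which is taken to be 2, while the vectorizer wants COSTS_N_INSNS units
   (COSTS_N_INSNS (n) is n * 4).  A table entry of 6 therefore becomes
   COSTS_N_INSNS (6) / 2 == 24 / 2 == 12 == COSTS_N_INSNS (3), i.e. the
   equivalent of three instructions on the common scale.  */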
46063 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
46064 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
46065 insn every time. */
46067 static GTY(()) rtx_insn *vselect_insn;
46069 /* Initialize vselect_insn. */
46071 static void
46072 init_vselect_insn (void)
46074 unsigned i;
46075 rtx x;
46077 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
46078 for (i = 0; i < MAX_VECT_LEN; ++i)
46079 XVECEXP (x, 0, i) = const0_rtx;
46080 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
46081 const0_rtx), x);
46082 x = gen_rtx_SET (const0_rtx, x);
46083 start_sequence ();
46084 vselect_insn = emit_insn (x);
46085 end_sequence ();
46088 /* Construct (set target (vec_select op0 (parallel perm))) and
46089 return true if that's a valid instruction in the active ISA. */
46091 static bool
46092 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
46093 unsigned nelt, bool testing_p)
46095 unsigned int i;
46096 rtx x, save_vconcat;
46097 int icode;
46099 if (vselect_insn == NULL_RTX)
46100 init_vselect_insn ();
46102 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
46103 PUT_NUM_ELEM (XVEC (x, 0), nelt);
46104 for (i = 0; i < nelt; ++i)
46105 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
46106 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46107 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
46108 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
46109 SET_DEST (PATTERN (vselect_insn)) = target;
46110 icode = recog_memoized (vselect_insn);
46112 if (icode >= 0 && !testing_p)
46113 emit_insn (copy_rtx (PATTERN (vselect_insn)));
46115 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
46116 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
46117 INSN_CODE (vselect_insn) = -1;
46119 return icode >= 0;
46122 /* Similar, but generate a vec_concat from op0 and op1 as well. */
46124 static bool
46125 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
46126 const unsigned char *perm, unsigned nelt,
46127 bool testing_p)
46129 machine_mode v2mode;
46130 rtx x;
46131 bool ok;
46133 if (vselect_insn == NULL_RTX)
46134 init_vselect_insn ();
46136 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
46137 return false;
46138 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
46139 PUT_MODE (x, v2mode);
46140 XEXP (x, 0) = op0;
46141 XEXP (x, 1) = op1;
46142 ok = expand_vselect (target, x, perm, nelt, testing_p);
46143 XEXP (x, 0) = const0_rtx;
46144 XEXP (x, 1) = const0_rtx;
46145 return ok;
46148 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46149 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
46151 static bool
46152 expand_vec_perm_blend (struct expand_vec_perm_d *d)
46154 machine_mode mmode, vmode = d->vmode;
46155 unsigned i, mask, nelt = d->nelt;
46156 rtx target, op0, op1, maskop, x;
46157 rtx rperm[32], vperm;
46159 if (d->one_operand_p)
46160 return false;
46161 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
46162 && (TARGET_AVX512BW
46163 || GET_MODE_UNIT_SIZE (vmode) >= 4))
46165 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
46167 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
46169 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
46171 else
46172 return false;
46174 /* This is a blend, not a permute. Elements must stay in their
46175 respective lanes. */
46176 for (i = 0; i < nelt; ++i)
46178 unsigned e = d->perm[i];
46179 if (!(e == i || e == i + nelt))
46180 return false;
46183 if (d->testing_p)
46184 return true;
46186 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
46187 decision should be extracted elsewhere, so that we only try that
46188 sequence once all budget==3 options have been tried. */
46189 target = d->target;
46190 op0 = d->op0;
46191 op1 = d->op1;
46192 mask = 0;
46194 switch (vmode)
46196 case E_V8DFmode:
46197 case E_V16SFmode:
46198 case E_V4DFmode:
46199 case E_V8SFmode:
46200 case E_V2DFmode:
46201 case E_V4SFmode:
46202 case E_V8HImode:
46203 case E_V8SImode:
46204 case E_V32HImode:
46205 case E_V64QImode:
46206 case E_V16SImode:
46207 case E_V8DImode:
46208 for (i = 0; i < nelt; ++i)
46209 mask |= (d->perm[i] >= nelt) << i;
46210 break;
46212 case E_V2DImode:
46213 for (i = 0; i < 2; ++i)
46214 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
46215 vmode = V8HImode;
46216 goto do_subreg;
46218 case E_V4SImode:
46219 for (i = 0; i < 4; ++i)
46220 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46221 vmode = V8HImode;
46222 goto do_subreg;
46224 case E_V16QImode:
46225 /* See if bytes move in pairs so we can use pblendw with
46226 an immediate argument, rather than pblendvb with a vector
46227 argument. */
46228 for (i = 0; i < 16; i += 2)
46229 if (d->perm[i] + 1 != d->perm[i + 1])
46231 use_pblendvb:
46232 for (i = 0; i < nelt; ++i)
46233 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
46235 finish_pblendvb:
46236 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
46237 vperm = force_reg (vmode, vperm);
46239 if (GET_MODE_SIZE (vmode) == 16)
46240 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
46241 else
46242 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
46243 if (target != d->target)
46244 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46245 return true;
46248 for (i = 0; i < 8; ++i)
46249 mask |= (d->perm[i * 2] >= 16) << i;
46250 vmode = V8HImode;
46251 /* FALLTHRU */
46253 do_subreg:
46254 target = gen_reg_rtx (vmode);
46255 op0 = gen_lowpart (vmode, op0);
46256 op1 = gen_lowpart (vmode, op1);
46257 break;
46259 case E_V32QImode:
46260 /* See if bytes move in pairs. If not, vpblendvb must be used. */
46261 for (i = 0; i < 32; i += 2)
46262 if (d->perm[i] + 1 != d->perm[i + 1])
46263 goto use_pblendvb;
46264 /* See if bytes move in quadruplets. If yes, vpblendd
46265 with immediate can be used. */
46266 for (i = 0; i < 32; i += 4)
46267 if (d->perm[i] + 2 != d->perm[i + 2])
46268 break;
46269 if (i < 32)
46271 /* See if bytes move the same in both lanes. If yes,
46272 vpblendw with immediate can be used. */
46273 for (i = 0; i < 16; i += 2)
46274 if (d->perm[i] + 16 != d->perm[i + 16])
46275 goto use_pblendvb;
46277 /* Use vpblendw. */
46278 for (i = 0; i < 16; ++i)
46279 mask |= (d->perm[i * 2] >= 32) << i;
46280 vmode = V16HImode;
46281 goto do_subreg;
46284 /* Use vpblendd. */
46285 for (i = 0; i < 8; ++i)
46286 mask |= (d->perm[i * 4] >= 32) << i;
46287 vmode = V8SImode;
46288 goto do_subreg;
46290 case E_V16HImode:
46291 /* See if words move in pairs. If yes, vpblendd can be used. */
46292 for (i = 0; i < 16; i += 2)
46293 if (d->perm[i] + 1 != d->perm[i + 1])
46294 break;
46295 if (i < 16)
46297 /* See if words move the same in both lanes. If not,
46298 vpblendvb must be used. */
46299 for (i = 0; i < 8; i++)
46300 if (d->perm[i] + 8 != d->perm[i + 8])
46302 /* Use vpblendvb. */
46303 for (i = 0; i < 32; ++i)
46304 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
46306 vmode = V32QImode;
46307 nelt = 32;
46308 target = gen_reg_rtx (vmode);
46309 op0 = gen_lowpart (vmode, op0);
46310 op1 = gen_lowpart (vmode, op1);
46311 goto finish_pblendvb;
46314 /* Use vpblendw. */
46315 for (i = 0; i < 16; ++i)
46316 mask |= (d->perm[i] >= 16) << i;
46317 break;
46320 /* Use vpblendd. */
46321 for (i = 0; i < 8; ++i)
46322 mask |= (d->perm[i * 2] >= 16) << i;
46323 vmode = V8SImode;
46324 goto do_subreg;
46326 case E_V4DImode:
46327 /* Use vpblendd. */
46328 for (i = 0; i < 4; ++i)
46329 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
46330 vmode = V8SImode;
46331 goto do_subreg;
46333 default:
46334 gcc_unreachable ();
46337 switch (vmode)
46339 case E_V8DFmode:
46340 case E_V8DImode:
46341 mmode = QImode;
46342 break;
46343 case E_V16SFmode:
46344 case E_V16SImode:
46345 mmode = HImode;
46346 break;
46347 case E_V32HImode:
46348 mmode = SImode;
46349 break;
46350 case E_V64QImode:
46351 mmode = DImode;
46352 break;
46353 default:
46354 mmode = VOIDmode;
46357 if (mmode != VOIDmode)
46358 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
46359 else
46360 maskop = GEN_INT (mask);
46362 /* This matches five different patterns with the different modes. */
46363 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
46364 x = gen_rtx_SET (target, x);
46365 emit_insn (x);
46366 if (target != d->target)
46367 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46369 return true;
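/* Editorial sketch (not part of the upstream sources): the blend test and
   immediate-mask construction used at the top of expand_vec_perm_blend,
   over a plain index array.  A two-operand permutation is a blend iff
   element i is taken from position i of either operand; bit i of the mask
   is then set when it comes from the second operand.  Names made up.  */
static int
sketch_blend_mask (const unsigned char *perm, unsigned nelt,
		   unsigned long long *mask)
{
  unsigned long long m = 0;
  for (unsigned i = 0; i < nelt; ++i)
    {
      if (perm[i] != i && perm[i] != i + nelt)
	return 0;			/* Element changes position: not a
					   blend.  */
      m |= (unsigned long long) (perm[i] >= nelt) << i;
    }
  *mask = m;				/* nelt <= 64, so 64 bits suffice.  */
  return 1;
}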
46372 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46373 in terms of the variable form of vpermilps.
46375 Note that we will have already failed the immediate input vpermilps,
46376 which requires that the high and low part shuffle be identical; the
46377 variable form doesn't require that. */
46379 static bool
46380 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
46382 rtx rperm[8], vperm;
46383 unsigned i;
46385 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
46386 return false;
46388 /* We can only permute within the 128-bit lane. */
46389 for (i = 0; i < 8; ++i)
46391 unsigned e = d->perm[i];
46392 if (i < 4 ? e >= 4 : e < 4)
46393 return false;
46396 if (d->testing_p)
46397 return true;
46399 for (i = 0; i < 8; ++i)
46401 unsigned e = d->perm[i];
46403 /* Within each 128-bit lane, the elements of op0 are numbered
46404 from 0 and the elements of op1 are numbered from 4. */
46405 if (e >= 8 + 4)
46406 e -= 8;
46407 else if (e >= 4)
46408 e -= 4;
46410 rperm[i] = GEN_INT (e);
46413 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
46414 vperm = force_reg (V8SImode, vperm);
46415 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
46417 return true;
46420 /* Return true if permutation D can be performed as VMODE permutation
46421 instead. */
46423 static bool
46424 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
46426 unsigned int i, j, chunk;
46428 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
46429 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
46430 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
46431 return false;
46433 if (GET_MODE_NUNITS (vmode) >= d->nelt)
46434 return true;
46436 chunk = d->nelt / GET_MODE_NUNITS (vmode);
46437 for (i = 0; i < d->nelt; i += chunk)
46438 if (d->perm[i] & (chunk - 1))
46439 return false;
46440 else
46441 for (j = 1; j < chunk; ++j)
46442 if (d->perm[i] + j != d->perm[i + j])
46443 return false;
46445 return true;
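/* Editorial sketch (not part of the upstream sources): the chunk test
   performed by valid_perm_using_mode_p above, over a plain index array and
   ignoring the mode-class and size checks.  A permutation of nelt narrow
   elements can be re-expressed on wider_nelt wider elements iff every
   group of chunk = nelt / wider_nelt indices starts chunk-aligned and is
   consecutive.  For example, { 2, 3, 0, 1 } with wider_nelt == 2 passes
   and corresponds to the wider permutation { 1, 0 }.  Names made up.  */
static int
sketch_perm_fits_wider_mode (const unsigned char *perm, unsigned nelt,
			     unsigned wider_nelt)
{
  if (wider_nelt >= nelt)
    return 1;				/* Trivially representable.  */
  unsigned chunk = nelt / wider_nelt;
  for (unsigned i = 0; i < nelt; i += chunk)
    {
      if (perm[i] & (chunk - 1))
	return 0;			/* Chunk does not start aligned.  */
      for (unsigned j = 1; j < chunk; ++j)
	if (perm[i] + j != perm[i + j])
	  return 0;			/* Chunk is not consecutive.  */
    }
  return 1;
}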
46448 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46449 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
46451 static bool
46452 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
46454 unsigned i, nelt, eltsz, mask;
46455 unsigned char perm[64];
46456 machine_mode vmode = V16QImode;
46457 rtx rperm[64], vperm, target, op0, op1;
46459 nelt = d->nelt;
46461 if (!d->one_operand_p)
46463 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
46465 if (TARGET_AVX2
46466 && valid_perm_using_mode_p (V2TImode, d))
46468 if (d->testing_p)
46469 return true;
46471 /* Use vperm2i128 insn. The pattern uses
46472 V4DImode instead of V2TImode. */
46473 target = d->target;
46474 if (d->vmode != V4DImode)
46475 target = gen_reg_rtx (V4DImode);
46476 op0 = gen_lowpart (V4DImode, d->op0);
46477 op1 = gen_lowpart (V4DImode, d->op1);
46478 rperm[0]
46479 = GEN_INT ((d->perm[0] / (nelt / 2))
46480 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
46481 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
46482 if (target != d->target)
46483 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46484 return true;
46486 return false;
46489 else
46491 if (GET_MODE_SIZE (d->vmode) == 16)
46493 if (!TARGET_SSSE3)
46494 return false;
46496 else if (GET_MODE_SIZE (d->vmode) == 32)
46498 if (!TARGET_AVX2)
46499 return false;
46501 /* V4DImode should be already handled through
46502 expand_vselect by vpermq instruction. */
46503 gcc_assert (d->vmode != V4DImode);
46505 vmode = V32QImode;
46506 if (d->vmode == V8SImode
46507 || d->vmode == V16HImode
46508 || d->vmode == V32QImode)
46510 /* First see if vpermq can be used for
46511 V8SImode/V16HImode/V32QImode. */
46512 if (valid_perm_using_mode_p (V4DImode, d))
46514 for (i = 0; i < 4; i++)
46515 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
46516 if (d->testing_p)
46517 return true;
46518 target = gen_reg_rtx (V4DImode);
46519 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
46520 perm, 4, false))
46522 emit_move_insn (d->target,
46523 gen_lowpart (d->vmode, target));
46524 return true;
46526 return false;
46529 /* Next see if vpermd can be used. */
46530 if (valid_perm_using_mode_p (V8SImode, d))
46531 vmode = V8SImode;
46533 /* Or if vpermps can be used. */
46534 else if (d->vmode == V8SFmode)
46535 vmode = V8SImode;
46537 if (vmode == V32QImode)
46539 /* vpshufb only works intra-lane; it is not
46540 possible to shuffle bytes between the lanes. */
46541 for (i = 0; i < nelt; ++i)
46542 if ((d->perm[i] ^ i) & (nelt / 2))
46543 return false;
46546 else if (GET_MODE_SIZE (d->vmode) == 64)
46548 if (!TARGET_AVX512BW)
46549 return false;
46551 /* If vpermq didn't work, vpshufb won't work either. */
46552 if (d->vmode == V8DFmode || d->vmode == V8DImode)
46553 return false;
46555 vmode = V64QImode;
46556 if (d->vmode == V16SImode
46557 || d->vmode == V32HImode
46558 || d->vmode == V64QImode)
46560 /* First see if vpermq can be used for
46561 V16SImode/V32HImode/V64QImode. */
46562 if (valid_perm_using_mode_p (V8DImode, d))
46564 for (i = 0; i < 8; i++)
46565 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
46566 if (d->testing_p)
46567 return true;
46568 target = gen_reg_rtx (V8DImode);
46569 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
46570 perm, 8, false))
46572 emit_move_insn (d->target,
46573 gen_lowpart (d->vmode, target));
46574 return true;
46576 return false;
46579 /* Next see if vpermd can be used. */
46580 if (valid_perm_using_mode_p (V16SImode, d))
46581 vmode = V16SImode;
46583 /* Or if vpermps can be used. */
46584 else if (d->vmode == V16SFmode)
46585 vmode = V16SImode;
46586 if (vmode == V64QImode)
46588 /* vpshufb only works intra-lane; it is not
46589 possible to shuffle bytes between the lanes. */
46590 for (i = 0; i < nelt; ++i)
46591 if ((d->perm[i] ^ i) & (nelt / 4))
46592 return false;
46595 else
46596 return false;
46599 if (d->testing_p)
46600 return true;
46602 if (vmode == V8SImode)
46603 for (i = 0; i < 8; ++i)
46604 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
46605 else if (vmode == V16SImode)
46606 for (i = 0; i < 16; ++i)
46607 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
46608 else
46610 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46611 if (!d->one_operand_p)
46612 mask = 2 * nelt - 1;
46613 else if (vmode == V16QImode)
46614 mask = nelt - 1;
46615 else if (vmode == V64QImode)
46616 mask = nelt / 4 - 1;
46617 else
46618 mask = nelt / 2 - 1;
46620 for (i = 0; i < nelt; ++i)
46622 unsigned j, e = d->perm[i] & mask;
46623 for (j = 0; j < eltsz; ++j)
46624 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
46628 vperm = gen_rtx_CONST_VECTOR (vmode,
46629 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
46630 vperm = force_reg (vmode, vperm);
46632 target = d->target;
46633 if (d->vmode != vmode)
46634 target = gen_reg_rtx (vmode);
46635 op0 = gen_lowpart (vmode, d->op0);
46636 if (d->one_operand_p)
46638 if (vmode == V16QImode)
46639 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
46640 else if (vmode == V32QImode)
46641 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
46642 else if (vmode == V64QImode)
46643 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
46644 else if (vmode == V8SFmode)
46645 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
46646 else if (vmode == V8SImode)
46647 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
46648 else if (vmode == V16SFmode)
46649 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
46650 else if (vmode == V16SImode)
46651 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
46652 else
46653 gcc_unreachable ();
46655 else
46657 op1 = gen_lowpart (vmode, d->op1);
46658 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
46660 if (target != d->target)
46661 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
46663 return true;
46666 /* For V*[QHS]Imode permutations, check whether the same permutation
46667 can instead be performed in a 2x, 4x or 8x wider inner mode. */
46669 static bool
46670 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
46671 struct expand_vec_perm_d *nd)
46673 int i;
46674 machine_mode mode = VOIDmode;
46676 switch (d->vmode)
46678 case E_V16QImode: mode = V8HImode; break;
46679 case E_V32QImode: mode = V16HImode; break;
46680 case E_V64QImode: mode = V32HImode; break;
46681 case E_V8HImode: mode = V4SImode; break;
46682 case E_V16HImode: mode = V8SImode; break;
46683 case E_V32HImode: mode = V16SImode; break;
46684 case E_V4SImode: mode = V2DImode; break;
46685 case E_V8SImode: mode = V4DImode; break;
46686 case E_V16SImode: mode = V8DImode; break;
46687 default: return false;
46689 for (i = 0; i < d->nelt; i += 2)
46690 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
46691 return false;
46692 nd->vmode = mode;
46693 nd->nelt = d->nelt / 2;
46694 for (i = 0; i < nd->nelt; i++)
46695 nd->perm[i] = d->perm[2 * i] / 2;
46696 if (GET_MODE_INNER (mode) != DImode)
46697 canonicalize_vector_int_perm (nd, nd);
46698 if (nd != d)
46700 nd->one_operand_p = d->one_operand_p;
46701 nd->testing_p = d->testing_p;
46702 if (d->op0 == d->op1)
46703 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
46704 else
46706 nd->op0 = gen_lowpart (nd->vmode, d->op0);
46707 nd->op1 = gen_lowpart (nd->vmode, d->op1);
46709 if (d->testing_p)
46710 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
46711 else
46712 nd->target = gen_reg_rtx (nd->vmode);
46714 return true;
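/* Editorial note (not from the upstream sources): a worked example of the
   canonicalization above.  The V16QImode permutation
     { 2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13 }
   moves bytes in aligned, consecutive pairs, so it halves to the V8HImode
   permutation { 1, 0, 3, 2, 5, 4, 7, 6 }.  That one cannot be halved
   again (its first index is odd), so the recursion stops at V8HImode.  */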
46717 /* Try to expand one-operand permutation with constant mask. */
46719 static bool
46720 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
46722 machine_mode mode = GET_MODE (d->op0);
46723 machine_mode maskmode = mode;
46724 rtx (*gen) (rtx, rtx, rtx) = NULL;
46725 rtx target, op0, mask;
46726 rtx vec[64];
46728 if (!rtx_equal_p (d->op0, d->op1))
46729 return false;
46731 if (!TARGET_AVX512F)
46732 return false;
46734 switch (mode)
46736 case E_V16SImode:
46737 gen = gen_avx512f_permvarv16si;
46738 break;
46739 case E_V16SFmode:
46740 gen = gen_avx512f_permvarv16sf;
46741 maskmode = V16SImode;
46742 break;
46743 case E_V8DImode:
46744 gen = gen_avx512f_permvarv8di;
46745 break;
46746 case E_V8DFmode:
46747 gen = gen_avx512f_permvarv8df;
46748 maskmode = V8DImode;
46749 break;
46750 default:
46751 return false;
46754 target = d->target;
46755 op0 = d->op0;
46756 for (int i = 0; i < d->nelt; ++i)
46757 vec[i] = GEN_INT (d->perm[i]);
46758 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
46759 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
46760 return true;
46763 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
46764 in a single instruction. */
46766 static bool
46767 expand_vec_perm_1 (struct expand_vec_perm_d *d)
46769 unsigned i, nelt = d->nelt;
46770 struct expand_vec_perm_d nd;
46772 /* Check plain VEC_SELECT first, because AVX has instructions that could
46773 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
46774 input where SEL+CONCAT may not. */
46775 if (d->one_operand_p)
46777 int mask = nelt - 1;
46778 bool identity_perm = true;
46779 bool broadcast_perm = true;
46781 for (i = 0; i < nelt; i++)
46783 nd.perm[i] = d->perm[i] & mask;
46784 if (nd.perm[i] != i)
46785 identity_perm = false;
46786 if (nd.perm[i])
46787 broadcast_perm = false;
46790 if (identity_perm)
46792 if (!d->testing_p)
46793 emit_move_insn (d->target, d->op0);
46794 return true;
46796 else if (broadcast_perm && TARGET_AVX2)
46798 /* Use vpbroadcast{b,w,d}. */
46799 rtx (*gen) (rtx, rtx) = NULL;
46800 switch (d->vmode)
46802 case E_V64QImode:
46803 if (TARGET_AVX512BW)
46804 gen = gen_avx512bw_vec_dupv64qi_1;
46805 break;
46806 case E_V32QImode:
46807 gen = gen_avx2_pbroadcastv32qi_1;
46808 break;
46809 case E_V32HImode:
46810 if (TARGET_AVX512BW)
46811 gen = gen_avx512bw_vec_dupv32hi_1;
46812 break;
46813 case E_V16HImode:
46814 gen = gen_avx2_pbroadcastv16hi_1;
46815 break;
46816 case E_V16SImode:
46817 if (TARGET_AVX512F)
46818 gen = gen_avx512f_vec_dupv16si_1;
46819 break;
46820 case E_V8SImode:
46821 gen = gen_avx2_pbroadcastv8si_1;
46822 break;
46823 case E_V16QImode:
46824 gen = gen_avx2_pbroadcastv16qi;
46825 break;
46826 case E_V8HImode:
46827 gen = gen_avx2_pbroadcastv8hi;
46828 break;
46829 case E_V16SFmode:
46830 if (TARGET_AVX512F)
46831 gen = gen_avx512f_vec_dupv16sf_1;
46832 break;
46833 case E_V8SFmode:
46834 gen = gen_avx2_vec_dupv8sf_1;
46835 break;
46836 case E_V8DFmode:
46837 if (TARGET_AVX512F)
46838 gen = gen_avx512f_vec_dupv8df_1;
46839 break;
46840 case E_V8DImode:
46841 if (TARGET_AVX512F)
46842 gen = gen_avx512f_vec_dupv8di_1;
46843 break;
46844 /* For other modes prefer other shuffles this function creates. */
46845 default: break;
46847 if (gen != NULL)
46849 if (!d->testing_p)
46850 emit_insn (gen (d->target, d->op0));
46851 return true;
46855 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
46856 return true;
46858 /* There are plenty of patterns in sse.md that are written for
46859 SEL+CONCAT and are not replicated for a single op. Perhaps
46860 that should be changed, to avoid the nastiness here. */
46862 /* Recognize interleave style patterns, which means incrementing
46863 every other permutation operand. */
46864 for (i = 0; i < nelt; i += 2)
46866 nd.perm[i] = d->perm[i] & mask;
46867 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
46869 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46870 d->testing_p))
46871 return true;
46873 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
46874 if (nelt >= 4)
46876 for (i = 0; i < nelt; i += 4)
46878 nd.perm[i + 0] = d->perm[i + 0] & mask;
46879 nd.perm[i + 1] = d->perm[i + 1] & mask;
46880 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
46881 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
46884 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
46885 d->testing_p))
46886 return true;
46890 /* Finally, try the fully general two operand permute. */
46891 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
46892 d->testing_p))
46893 return true;
46895 /* Recognize interleave style patterns with reversed operands. */
46896 if (!d->one_operand_p)
46898 for (i = 0; i < nelt; ++i)
46900 unsigned e = d->perm[i];
46901 if (e >= nelt)
46902 e -= nelt;
46903 else
46904 e += nelt;
46905 nd.perm[i] = e;
46908 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
46909 d->testing_p))
46910 return true;
46913 /* Try the SSE4.1 blend variable merge instructions. */
46914 if (expand_vec_perm_blend (d))
46915 return true;
46917 /* Try one of the AVX vpermil variable permutations. */
46918 if (expand_vec_perm_vpermil (d))
46919 return true;
46921 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
46922 vpshufb, vpermd, vpermps or vpermq variable permutation. */
46923 if (expand_vec_perm_pshufb (d))
46924 return true;
46926 /* Try the AVX2 vpalignr instruction. */
46927 if (expand_vec_perm_palignr (d, true))
46928 return true;
46930 /* Try the AVX512F vperm{s,d} instructions. */
46931 if (ix86_expand_vec_one_operand_perm_avx512 (d))
46932 return true;
46934 /* Try the AVX512F vpermt2/vpermi2 instructions. */
46935 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
46936 return true;
46938 /* See if we can get the same permutation in different vector integer
46939 mode. */
46940 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
46942 if (!d->testing_p)
46943 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
46944 return true;
46946 return false;
46949 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
46950 in terms of a pair of pshuflw + pshufhw instructions. */
46952 static bool
46953 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
46955 unsigned char perm2[MAX_VECT_LEN];
46956 unsigned i;
46957 bool ok;
46959 if (d->vmode != V8HImode || !d->one_operand_p)
46960 return false;
46962 /* The two permutations only operate in 64-bit lanes. */
46963 for (i = 0; i < 4; ++i)
46964 if (d->perm[i] >= 4)
46965 return false;
46966 for (i = 4; i < 8; ++i)
46967 if (d->perm[i] < 4)
46968 return false;
46970 if (d->testing_p)
46971 return true;
46973 /* Emit the pshuflw. */
46974 memcpy (perm2, d->perm, 4);
46975 for (i = 4; i < 8; ++i)
46976 perm2[i] = i;
46977 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
46978 gcc_assert (ok);
46980 /* Emit the pshufhw. */
46981 memcpy (perm2 + 4, d->perm + 4, 4);
46982 for (i = 0; i < 4; ++i)
46983 perm2[i] = i;
46984 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
46985 gcc_assert (ok);
46987 return true;
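/* Editorial sketch (not part of the upstream sources): how the V8HImode
   permutation above is split into the two immediates, assuming no element
   crosses the 64-bit halves.  pshuflw permutes the low four words and
   copies the high four unchanged; pshufhw does the opposite.  Names are
   made up for illustration only.  */
static int
sketch_split_pshuflw_pshufhw (const unsigned char *perm,
			      unsigned char *lw, unsigned char *hw)
{
  for (unsigned i = 0; i < 4; ++i)
    if (perm[i] >= 4 || perm[i + 4] < 4)
      return 0;				/* Crosses a 64-bit half.  */
  for (unsigned i = 0; i < 4; ++i)
    {
      lw[i] = perm[i];			/* pshuflw selectors.  */
      lw[i + 4] = i + 4;		/* High half passes through.  */
      hw[i] = i;			/* Low half passes through.  */
      hw[i + 4] = perm[i + 4];		/* pshufhw selectors.  */
    }
  return 1;
}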
46990 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46991 the permutation using the SSSE3 palignr instruction. This succeeds
46992 when all of the elements in PERM fit within one vector and we merely
46993 need to shift them down so that a single vector permutation has a
46994 chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
46995 the vpalignr instruction itself can perform the requested permutation. */
46997 static bool
46998 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
47000 unsigned i, nelt = d->nelt;
47001 unsigned min, max, minswap, maxswap;
47002 bool in_order, ok, swap = false;
47003 rtx shift, target;
47004 struct expand_vec_perm_d dcopy;
47006 /* Even with AVX, palignr only operates on 128-bit vectors;
47007 with AVX2, palignr operates within each of the two 128-bit lanes. */
47008 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47009 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
47010 return false;
47012 min = 2 * nelt;
47013 max = 0;
47014 minswap = 2 * nelt;
47015 maxswap = 0;
47016 for (i = 0; i < nelt; ++i)
47018 unsigned e = d->perm[i];
47019 unsigned eswap = d->perm[i] ^ nelt;
47020 if (GET_MODE_SIZE (d->vmode) == 32)
47022 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
47023 eswap = e ^ (nelt / 2);
47025 if (e < min)
47026 min = e;
47027 if (e > max)
47028 max = e;
47029 if (eswap < minswap)
47030 minswap = eswap;
47031 if (eswap > maxswap)
47032 maxswap = eswap;
47034 if (min == 0
47035 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
47037 if (d->one_operand_p
47038 || minswap == 0
47039 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
47040 ? nelt / 2 : nelt))
47041 return false;
47042 swap = true;
47043 min = minswap;
47044 max = maxswap;
47047 /* Given that we have SSSE3, we know we'll be able to implement the
47048 single operand permutation after the palignr with pshufb for
47049 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
47050 first. */
47051 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
47052 return true;
47054 dcopy = *d;
47055 if (swap)
47057 dcopy.op0 = d->op1;
47058 dcopy.op1 = d->op0;
47059 for (i = 0; i < nelt; ++i)
47060 dcopy.perm[i] ^= nelt;
47063 in_order = true;
47064 for (i = 0; i < nelt; ++i)
47066 unsigned e = dcopy.perm[i];
47067 if (GET_MODE_SIZE (d->vmode) == 32
47068 && e >= nelt
47069 && (e & (nelt / 2 - 1)) < min)
47070 e = e - min - (nelt / 2);
47071 else
47072 e = e - min;
47073 if (e != i)
47074 in_order = false;
47075 dcopy.perm[i] = e;
47077 dcopy.one_operand_p = true;
47079 if (single_insn_only_p && !in_order)
47080 return false;
47082 /* For AVX2, test whether we can permute the result in one instruction. */
47083 if (d->testing_p)
47085 if (in_order)
47086 return true;
47087 dcopy.op1 = dcopy.op0;
47088 return expand_vec_perm_1 (&dcopy);
47091 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
47092 if (GET_MODE_SIZE (d->vmode) == 16)
47094 target = gen_reg_rtx (TImode);
47095 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
47096 gen_lowpart (TImode, dcopy.op0), shift));
47098 else
47100 target = gen_reg_rtx (V2TImode);
47101 emit_insn (gen_avx2_palignrv2ti (target,
47102 gen_lowpart (V2TImode, dcopy.op1),
47103 gen_lowpart (V2TImode, dcopy.op0),
47104 shift));
47107 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
47109 /* Test for the degenerate case where the alignment by itself
47110 produces the desired permutation. */
47111 if (in_order)
47113 emit_move_insn (d->target, dcopy.op0);
47114 return true;
47117 ok = expand_vec_perm_1 (&dcopy);
47118 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
47120 return ok;
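/* Editorial sketch (not part of the upstream sources): the window test
   behind expand_vec_perm_palignr above, for the 128-bit, no-swap case
   (the operand-swap fallback and 256-bit lane handling are omitted).
   palignr shifts the double-width concatenation op1:op0 right by a byte
   count; the trick applies when all selected indices fit in a window of
   nelt consecutive positions that does not start at 0.  Names made up.  */
static int
sketch_palignr_window (const unsigned char *perm, unsigned nelt,
		       unsigned *min_out)
{
  unsigned min = 2 * nelt, max = 0;
  for (unsigned i = 0; i < nelt; ++i)
    {
      if (perm[i] < min)
	min = perm[i];
      if (perm[i] > max)
	max = perm[i];
    }
  if (min == 0 || max - min >= nelt)
    return 0;				/* No usable palignr window.  */
  *min_out = min;			/* Shift right by min elements; the
					   remaining one-operand shuffle then
					   uses indices perm[i] - min.  */
  return 1;
}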
47123 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
47124 the permutation using the SSE4_1 pblendv instruction. Potentially
47125 reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
47127 static bool
47128 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
47130 unsigned i, which, nelt = d->nelt;
47131 struct expand_vec_perm_d dcopy, dcopy1;
47132 machine_mode vmode = d->vmode;
47133 bool ok;
47135 /* Use the same checks as in expand_vec_perm_blend. */
47136 if (d->one_operand_p)
47137 return false;
47138 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
47140 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
47142 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
47144 else
47145 return false;
47147 /* Figure out which permutation elements do not stay in their
47148 respective lanes. */
47149 for (i = 0, which = 0; i < nelt; ++i)
47151 unsigned e = d->perm[i];
47152 if (e != i)
47153 which |= (e < nelt ? 1 : 2);
47155 /* We can pblend the part whose elements do not stay in their
47156 respective lanes only when these elements all come from the same
47157 half of the permutation.
47158 {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective
47159 lanes, but both 8 and 9 are >= 8.
47160 {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their
47161 respective lanes, and 8 >= 8 but 2 is not. */
47162 if (which != 1 && which != 2)
47163 return false;
47164 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
47165 return true;
47167 /* First we apply a one-operand permutation to the part whose
47168 elements do not stay in their respective lanes. */
47169 dcopy = *d;
47170 if (which == 2)
47171 dcopy.op0 = dcopy.op1 = d->op1;
47172 else
47173 dcopy.op0 = dcopy.op1 = d->op0;
47174 if (!d->testing_p)
47175 dcopy.target = gen_reg_rtx (vmode);
47176 dcopy.one_operand_p = true;
47178 for (i = 0; i < nelt; ++i)
47179 dcopy.perm[i] = d->perm[i] & (nelt - 1);
47181 ok = expand_vec_perm_1 (&dcopy);
47182 if (GET_MODE_SIZE (vmode) != 16 && !ok)
47183 return false;
47184 else
47185 gcc_assert (ok);
47186 if (d->testing_p)
47187 return true;
47189 /* Next we put permuted elements into their positions. */
47190 dcopy1 = *d;
47191 if (which == 2)
47192 dcopy1.op1 = dcopy.target;
47193 else
47194 dcopy1.op0 = dcopy.target;
47196 for (i = 0; i < nelt; ++i)
47197 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
47199 ok = expand_vec_perm_blend (&dcopy1);
47200 gcc_assert (ok);
47202 return true;
47205 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
47207 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47208 a two vector permutation into a single vector permutation by using
47209 an interleave operation to merge the vectors. */
47211 static bool
47212 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
47214 struct expand_vec_perm_d dremap, dfinal;
47215 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
47216 unsigned HOST_WIDE_INT contents;
47217 unsigned char remap[2 * MAX_VECT_LEN];
47218 rtx_insn *seq;
47219 bool ok, same_halves = false;
47221 if (GET_MODE_SIZE (d->vmode) == 16)
47223 if (d->one_operand_p)
47224 return false;
47226 else if (GET_MODE_SIZE (d->vmode) == 32)
47228 if (!TARGET_AVX)
47229 return false;
47230 /* For 32-byte modes allow even d->one_operand_p.
47231 The lack of cross-lane shuffling in some instructions
47232 might prevent a single insn shuffle. */
47233 dfinal = *d;
47234 dfinal.testing_p = true;
47235 /* If expand_vec_perm_interleave3 can expand this into
47236 a 3 insn sequence, give up and let it be expanded that
47237 way instead. While that is one insn longer, it doesn't
47238 need a memory operand, and in the common case where the
47239 interleave low and interleave high permutations with the
47240 same operands are adjacent, it needs only 4 insns for
47241 both after CSE. */
47242 if (expand_vec_perm_interleave3 (&dfinal))
47243 return false;
47245 else
47246 return false;
47248 /* Examine from whence the elements come. */
47249 contents = 0;
47250 for (i = 0; i < nelt; ++i)
47251 contents |= HOST_WIDE_INT_1U << d->perm[i];
47253 memset (remap, 0xff, sizeof (remap));
47254 dremap = *d;
47256 if (GET_MODE_SIZE (d->vmode) == 16)
47258 unsigned HOST_WIDE_INT h1, h2, h3, h4;
47260 /* Split the two input vectors into 4 halves. */
47261 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
47262 h2 = h1 << nelt2;
47263 h3 = h2 << nelt2;
47264 h4 = h3 << nelt2;
47266 /* If the elements all come from the low halves, use interleave low;
47267 similarly for interleave high. If the elements come from mis-matched
47268 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
47269 if ((contents & (h1 | h3)) == contents)
47271 /* punpckl* */
47272 for (i = 0; i < nelt2; ++i)
47274 remap[i] = i * 2;
47275 remap[i + nelt] = i * 2 + 1;
47276 dremap.perm[i * 2] = i;
47277 dremap.perm[i * 2 + 1] = i + nelt;
47279 if (!TARGET_SSE2 && d->vmode == V4SImode)
47280 dremap.vmode = V4SFmode;
47282 else if ((contents & (h2 | h4)) == contents)
47284 /* punpckh* */
47285 for (i = 0; i < nelt2; ++i)
47287 remap[i + nelt2] = i * 2;
47288 remap[i + nelt + nelt2] = i * 2 + 1;
47289 dremap.perm[i * 2] = i + nelt2;
47290 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
47292 if (!TARGET_SSE2 && d->vmode == V4SImode)
47293 dremap.vmode = V4SFmode;
47295 else if ((contents & (h1 | h4)) == contents)
47297 /* shufps */
47298 for (i = 0; i < nelt2; ++i)
47300 remap[i] = i;
47301 remap[i + nelt + nelt2] = i + nelt2;
47302 dremap.perm[i] = i;
47303 dremap.perm[i + nelt2] = i + nelt + nelt2;
47305 if (nelt != 4)
47307 /* shufpd */
47308 dremap.vmode = V2DImode;
47309 dremap.nelt = 2;
47310 dremap.perm[0] = 0;
47311 dremap.perm[1] = 3;
47314 else if ((contents & (h2 | h3)) == contents)
47316 /* shufps */
47317 for (i = 0; i < nelt2; ++i)
47319 remap[i + nelt2] = i;
47320 remap[i + nelt] = i + nelt2;
47321 dremap.perm[i] = i + nelt2;
47322 dremap.perm[i + nelt2] = i + nelt;
47324 if (nelt != 4)
47326 /* shufpd */
47327 dremap.vmode = V2DImode;
47328 dremap.nelt = 2;
47329 dremap.perm[0] = 1;
47330 dremap.perm[1] = 2;
47333 else
47334 return false;
47336 else
47338 unsigned int nelt4 = nelt / 4, nzcnt = 0;
47339 unsigned HOST_WIDE_INT q[8];
47340 unsigned int nonzero_halves[4];
47342 /* Split the two input vectors into 8 quarters. */
47343 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
47344 for (i = 1; i < 8; ++i)
47345 q[i] = q[0] << (nelt4 * i);
47346 for (i = 0; i < 4; ++i)
47347 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
47349 nonzero_halves[nzcnt] = i;
47350 ++nzcnt;
47353 if (nzcnt == 1)
47355 gcc_assert (d->one_operand_p);
47356 nonzero_halves[1] = nonzero_halves[0];
47357 same_halves = true;
47359 else if (d->one_operand_p)
47361 gcc_assert (nonzero_halves[0] == 0);
47362 gcc_assert (nonzero_halves[1] == 1);
47365 if (nzcnt <= 2)
47367 if (d->perm[0] / nelt2 == nonzero_halves[1])
47369 /* Attempt to increase the likelihood that dfinal
47370 shuffle will be intra-lane. */
47371 std::swap (nonzero_halves[0], nonzero_halves[1]);
47374 /* vperm2f128 or vperm2i128. */
47375 for (i = 0; i < nelt2; ++i)
47377 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
47378 remap[i + nonzero_halves[0] * nelt2] = i;
47379 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
47380 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
47383 if (d->vmode != V8SFmode
47384 && d->vmode != V4DFmode
47385 && d->vmode != V8SImode)
47387 dremap.vmode = V8SImode;
47388 dremap.nelt = 8;
47389 for (i = 0; i < 4; ++i)
47391 dremap.perm[i] = i + nonzero_halves[0] * 4;
47392 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
47396 else if (d->one_operand_p)
47397 return false;
47398 else if (TARGET_AVX2
47399 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
47401 /* vpunpckl* */
47402 for (i = 0; i < nelt4; ++i)
47404 remap[i] = i * 2;
47405 remap[i + nelt] = i * 2 + 1;
47406 remap[i + nelt2] = i * 2 + nelt2;
47407 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
47408 dremap.perm[i * 2] = i;
47409 dremap.perm[i * 2 + 1] = i + nelt;
47410 dremap.perm[i * 2 + nelt2] = i + nelt2;
47411 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
47414 else if (TARGET_AVX2
47415 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
47417 /* vpunpckh* */
47418 for (i = 0; i < nelt4; ++i)
47420 remap[i + nelt4] = i * 2;
47421 remap[i + nelt + nelt4] = i * 2 + 1;
47422 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
47423 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
47424 dremap.perm[i * 2] = i + nelt4;
47425 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
47426 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
47427 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
47430 else
47431 return false;
47434 /* Use the remapping array set up above to move the elements from their
47435 swizzled locations into their final destinations. */
47436 dfinal = *d;
47437 for (i = 0; i < nelt; ++i)
47439 unsigned e = remap[d->perm[i]];
47440 gcc_assert (e < nelt);
47441 /* If same_halves is true, both halves of the remapped vector are the
47442 same. Avoid cross-lane accesses if possible. */
47443 if (same_halves && i >= nelt2)
47445 gcc_assert (e < nelt2);
47446 dfinal.perm[i] = e + nelt2;
47448 else
47449 dfinal.perm[i] = e;
47451 if (!d->testing_p)
47453 dremap.target = gen_reg_rtx (dremap.vmode);
47454 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47456 dfinal.op1 = dfinal.op0;
47457 dfinal.one_operand_p = true;
47459 /* Test if the final remap can be done with a single insn. For V4SFmode or
47460 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
47461 start_sequence ();
47462 ok = expand_vec_perm_1 (&dfinal);
47463 seq = get_insns ();
47464 end_sequence ();
47466 if (!ok)
47467 return false;
47469 if (d->testing_p)
47470 return true;
47472 if (dremap.vmode != dfinal.vmode)
47474 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
47475 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
47478 ok = expand_vec_perm_1 (&dremap);
47479 gcc_assert (ok);
47481 emit_insn (seq);
47482 return true;
47485 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47486 a single vector cross-lane permutation into vpermq followed
47487 by any of the single insn permutations. */
47489 static bool
47490 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
47492 struct expand_vec_perm_d dremap, dfinal;
47493 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
47494 unsigned contents[2];
47495 bool ok;
47497 if (!(TARGET_AVX2
47498 && (d->vmode == V32QImode || d->vmode == V16HImode)
47499 && d->one_operand_p))
47500 return false;
47502 contents[0] = 0;
47503 contents[1] = 0;
47504 for (i = 0; i < nelt2; ++i)
47506 contents[0] |= 1u << (d->perm[i] / nelt4);
47507 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
47510 for (i = 0; i < 2; ++i)
47512 unsigned int cnt = 0;
47513 for (j = 0; j < 4; ++j)
47514 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
47515 return false;
47518 if (d->testing_p)
47519 return true;
47521 dremap = *d;
47522 dremap.vmode = V4DImode;
47523 dremap.nelt = 4;
47524 dremap.target = gen_reg_rtx (V4DImode);
47525 dremap.op0 = gen_lowpart (V4DImode, d->op0);
47526 dremap.op1 = dremap.op0;
47527 dremap.one_operand_p = true;
47528 for (i = 0; i < 2; ++i)
47530 unsigned int cnt = 0;
47531 for (j = 0; j < 4; ++j)
47532 if ((contents[i] & (1u << j)) != 0)
47533 dremap.perm[2 * i + cnt++] = j;
47534 for (; cnt < 2; ++cnt)
47535 dremap.perm[2 * i + cnt] = 0;
47538 dfinal = *d;
47539 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
47540 dfinal.op1 = dfinal.op0;
47541 dfinal.one_operand_p = true;
47542 for (i = 0, j = 0; i < nelt; ++i)
47544 if (i == nelt2)
47545 j = 2;
47546 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
47547 if ((d->perm[i] / nelt4) == dremap.perm[j])
47549 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
47550 dfinal.perm[i] |= nelt4;
47551 else
47552 gcc_unreachable ();
47555 ok = expand_vec_perm_1 (&dremap);
47556 gcc_assert (ok);
47558 ok = expand_vec_perm_1 (&dfinal);
47559 gcc_assert (ok);
47561 return true;
47564 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
47565 a vector permutation using two instructions: vperm2f128 (or
47566 vperm2i128) followed by any single in-lane permutation. */
47568 static bool
47569 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
47571 struct expand_vec_perm_d dfirst, dsecond;
47572 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
47573 bool ok;
47575 if (!TARGET_AVX
47576 || GET_MODE_SIZE (d->vmode) != 32
47577 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
47578 return false;
47580 dsecond = *d;
47581 dsecond.one_operand_p = false;
47582 dsecond.testing_p = true;
47584 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
47585 immediate. For perm < 16 the second permutation uses
47586 d->op0 as first operand, for perm >= 16 it uses d->op1
47587 as first operand. The second operand is the result of
47588 vperm2[fi]128. */
47589 for (perm = 0; perm < 32; perm++)
47591 /* Ignore permutations which do not move anything cross-lane. */
47592 if (perm < 16)
47594 /* The second shuffle for e.g. V4DFmode has
47595 0123 and ABCD operands.
47596 Ignore AB23, as 23 is already in the second lane
47597 of the first operand. */
47598 if ((perm & 0xc) == (1 << 2)) continue;
47599 /* And 01CD, as 01 is in the first lane of the first
47600 operand. */
47601 if ((perm & 3) == 0) continue;
47602 /* And 4567, as then the vperm2[fi]128 doesn't change
47603 anything on the original 4567 second operand. */
47604 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
47606 else
47608 /* The second shuffle for e.g. V4DFmode has
47609 4567 and ABCD operands.
47610 Ignore AB67, as 67 is already in the second lane
47611 of the first operand. */
47612 if ((perm & 0xc) == (3 << 2)) continue;
47613 /* And 45CD, as 45 is in the first lane of the first
47614 operand. */
47615 if ((perm & 3) == 2) continue;
47616 /* And 0123, as then the vperm2[fi]128 doesn't change
47617 anything on the original 0123 first operand. */
47618 if ((perm & 0xf) == (1 << 2)) continue;
47621 for (i = 0; i < nelt; i++)
47623 j = d->perm[i] / nelt2;
47624 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
47625 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
47626 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
47627 dsecond.perm[i] = d->perm[i] & (nelt - 1);
47628 else
47629 break;
47632 if (i == nelt)
47634 start_sequence ();
47635 ok = expand_vec_perm_1 (&dsecond);
47636 end_sequence ();
47638 else
47639 ok = false;
47641 if (ok)
47643 if (d->testing_p)
47644 return true;
47646 /* Found a usable second shuffle. dfirst will be
47647 vperm2f128 on d->op0 and d->op1. */
47648 dsecond.testing_p = false;
47649 dfirst = *d;
47650 dfirst.target = gen_reg_rtx (d->vmode);
47651 for (i = 0; i < nelt; i++)
47652 dfirst.perm[i] = (i & (nelt2 - 1))
47653 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
47655 canonicalize_perm (&dfirst);
47656 ok = expand_vec_perm_1 (&dfirst);
47657 gcc_assert (ok);
47659 /* And dsecond is some single insn shuffle, taking
47660 d->op0 and result of vperm2f128 (if perm < 16) or
47661 d->op1 and result of vperm2f128 (otherwise). */
47662 if (perm >= 16)
47663 dsecond.op0 = dsecond.op1;
47664 dsecond.op1 = dfirst.target;
47666 ok = expand_vec_perm_1 (&dsecond);
47667 gcc_assert (ok);
47669 return true;
47672 /* For one operand, the only useful vperm2f128 permutation is 0x01
47673 aka lanes swap. */
47674 if (d->one_operand_p)
47675 return false;
47678 return false;
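/* Editorial note (not from the upstream sources): a worked example of the
   ((perm << 2) | perm) & 0x33 encoding described inside
   expand_vec_perm_vperm2f128 above.  For perm == 6 (low-lane selector 2,
   high-lane selector 1) the computation is (0x18 | 0x6) & 0x33 == 0x12,
   i.e. imm[1:0] == 2 (low 128 bits of the second source) and
   imm[5:4] == 1 (high 128 bits of the first source), matching how the
   vperm2f128/vperm2i128 immediate selects its two result lanes.  */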
47681 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
47682 a two vector permutation using 2 intra-lane interleave insns
47683 and cross-lane shuffle for 32-byte vectors. */
47685 static bool
47686 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
47688 unsigned i, nelt;
47689 rtx (*gen) (rtx, rtx, rtx);
47691 if (d->one_operand_p)
47692 return false;
47693 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
47695 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
47697 else
47698 return false;
47700 nelt = d->nelt;
47701 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
47702 return false;
47703 for (i = 0; i < nelt; i += 2)
47704 if (d->perm[i] != d->perm[0] + i / 2
47705 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
47706 return false;
47708 if (d->testing_p)
47709 return true;
47711 switch (d->vmode)
47713 case E_V32QImode:
47714 if (d->perm[0])
47715 gen = gen_vec_interleave_highv32qi;
47716 else
47717 gen = gen_vec_interleave_lowv32qi;
47718 break;
47719 case E_V16HImode:
47720 if (d->perm[0])
47721 gen = gen_vec_interleave_highv16hi;
47722 else
47723 gen = gen_vec_interleave_lowv16hi;
47724 break;
47725 case E_V8SImode:
47726 if (d->perm[0])
47727 gen = gen_vec_interleave_highv8si;
47728 else
47729 gen = gen_vec_interleave_lowv8si;
47730 break;
47731 case E_V4DImode:
47732 if (d->perm[0])
47733 gen = gen_vec_interleave_highv4di;
47734 else
47735 gen = gen_vec_interleave_lowv4di;
47736 break;
47737 case E_V8SFmode:
47738 if (d->perm[0])
47739 gen = gen_vec_interleave_highv8sf;
47740 else
47741 gen = gen_vec_interleave_lowv8sf;
47742 break;
47743 case E_V4DFmode:
47744 if (d->perm[0])
47745 gen = gen_vec_interleave_highv4df;
47746 else
47747 gen = gen_vec_interleave_lowv4df;
47748 break;
47749 default:
47750 gcc_unreachable ();
47753 emit_insn (gen (d->target, d->op0, d->op1));
47754 return true;
47757 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
47758 a single vector permutation using a single intra-lane vector
47759 permutation, vperm2f128 swapping the lanes and vblend* insn blending
47760 the non-swapped and swapped vectors together. */
47762 static bool
47763 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
47765 struct expand_vec_perm_d dfirst, dsecond;
47766 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
47767 rtx_insn *seq;
47768 bool ok;
47769 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
47771 if (!TARGET_AVX
47772 || TARGET_AVX2
47773 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
47774 || !d->one_operand_p)
47775 return false;
47777 dfirst = *d;
47778 for (i = 0; i < nelt; i++)
47779 dfirst.perm[i] = 0xff;
47780 for (i = 0, msk = 0; i < nelt; i++)
47782 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
47783 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
47784 return false;
47785 dfirst.perm[j] = d->perm[i];
47786 if (j != i)
47787 msk |= (1 << i);
47789 for (i = 0; i < nelt; i++)
47790 if (dfirst.perm[i] == 0xff)
47791 dfirst.perm[i] = i;
47793 if (!d->testing_p)
47794 dfirst.target = gen_reg_rtx (dfirst.vmode);
47796 start_sequence ();
47797 ok = expand_vec_perm_1 (&dfirst);
47798 seq = get_insns ();
47799 end_sequence ();
47801 if (!ok)
47802 return false;
47804 if (d->testing_p)
47805 return true;
47807 emit_insn (seq);
47809 dsecond = *d;
47810 dsecond.op0 = dfirst.target;
47811 dsecond.op1 = dfirst.target;
47812 dsecond.one_operand_p = true;
47813 dsecond.target = gen_reg_rtx (dsecond.vmode);
47814 for (i = 0; i < nelt; i++)
47815 dsecond.perm[i] = i ^ nelt2;
47817 ok = expand_vec_perm_1 (&dsecond);
47818 gcc_assert (ok);
47820 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
47821 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
47822 return true;
47825 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
47826 permutation using two vperm2f128, followed by a vshufpd insn blending
47827 the two vectors together. */
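/* E.g. for the two-operand V4DFmode permutation { 3, 0, 5, 6 } (operands
   concatenated as elements 0-7) the expansion uses dfirst = { 2, 3, 4, 5 }
   and dsecond = { 0, 1, 6, 7 }, each a single vperm2f128 lane selection,
   and dthird = { 1, 4, 3, 6 }, which is a vshufpd of those two results.  */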
47829 static bool
47830 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
47832 struct expand_vec_perm_d dfirst, dsecond, dthird;
47833 bool ok;
47835 if (!TARGET_AVX || (d->vmode != V4DFmode))
47836 return false;
47838 if (d->testing_p)
47839 return true;
47841 dfirst = *d;
47842 dsecond = *d;
47843 dthird = *d;
47845 dfirst.perm[0] = (d->perm[0] & ~1);
47846 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
47847 dfirst.perm[2] = (d->perm[2] & ~1);
47848 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
47849 dsecond.perm[0] = (d->perm[1] & ~1);
47850 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
47851 dsecond.perm[2] = (d->perm[3] & ~1);
47852 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
47853 dthird.perm[0] = (d->perm[0] % 2);
47854 dthird.perm[1] = (d->perm[1] % 2) + 4;
47855 dthird.perm[2] = (d->perm[2] % 2) + 2;
47856 dthird.perm[3] = (d->perm[3] % 2) + 6;
47858 dfirst.target = gen_reg_rtx (dfirst.vmode);
47859 dsecond.target = gen_reg_rtx (dsecond.vmode);
47860 dthird.op0 = dfirst.target;
47861 dthird.op1 = dsecond.target;
47862 dthird.one_operand_p = false;
47864 canonicalize_perm (&dfirst);
47865 canonicalize_perm (&dsecond);
47867 ok = expand_vec_perm_1 (&dfirst)
47868 && expand_vec_perm_1 (&dsecond)
47869 && expand_vec_perm_1 (&dthird);
47871 gcc_assert (ok);
47873 return true;
47876 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
47877 permutation with two pshufb insns and an ior. We should have already
47878 failed all two instruction sequences. */
47880 static bool
47881 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
47883 rtx rperm[2][16], vperm, l, h, op, m128;
47884 unsigned int i, nelt, eltsz;
47886 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
47887 return false;
47888 gcc_assert (!d->one_operand_p);
47890 if (d->testing_p)
47891 return true;
47893 nelt = d->nelt;
47894 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47896 /* Generate two permutation masks. If the required element is within
47897 the given vector it is shuffled into the proper lane. If the required
47898 element is in the other vector, force a zero into the lane by setting
47899 bit 7 in the permutation mask. */
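/* For instance, in V16QImode with d->perm[3] == 21 the element comes from
   op1 (21 >= nelt), so the op1 mask gets byte index 5 at position 3 while
   the op0 mask gets -128 there; the pshufb of op0 then yields zero in that
   byte and the final por keeps op1's byte 5.  */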
47900 m128 = GEN_INT (-128);
47901 for (i = 0; i < nelt; ++i)
47903 unsigned j, e = d->perm[i];
47904 unsigned which = (e >= nelt);
47905 if (e >= nelt)
47906 e -= nelt;
47908 for (j = 0; j < eltsz; ++j)
47910 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
47911 rperm[1-which][i*eltsz + j] = m128;
47915 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
47916 vperm = force_reg (V16QImode, vperm);
47918 l = gen_reg_rtx (V16QImode);
47919 op = gen_lowpart (V16QImode, d->op0);
47920 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
47922 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
47923 vperm = force_reg (V16QImode, vperm);
47925 h = gen_reg_rtx (V16QImode);
47926 op = gen_lowpart (V16QImode, d->op1);
47927 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
47929 op = d->target;
47930 if (d->vmode != V16QImode)
47931 op = gen_reg_rtx (V16QImode);
47932 emit_insn (gen_iorv16qi3 (op, l, h));
47933 if (op != d->target)
47934 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47936 return true;
47939 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
47940 with two vpshufb insns, vpermq and vpor. We should have already failed
47941 all two or three instruction sequences. */
47943 static bool
47944 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
47946 rtx rperm[2][32], vperm, l, h, hp, op, m128;
47947 unsigned int i, nelt, eltsz;
47949 if (!TARGET_AVX2
47950 || !d->one_operand_p
47951 || (d->vmode != V32QImode && d->vmode != V16HImode))
47952 return false;
47954 if (d->testing_p)
47955 return true;
47957 nelt = d->nelt;
47958 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47960 /* Generate two permutation masks. If the required element is within
47961 the same lane, it is shuffled in. If the required element is from the
47962 other lane, force a zero by setting bit 7 in the permutation mask.
47963 The other mask has a non-negative element wherever an element is
47964 requested from the other lane, but that element is also moved to the other lane,
47965 so that the result of vpshufb can have the two V2TImode halves
47966 swapped. */
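/* Worked example: in V32QImode with d->perm[0] == 20 the required byte
   lives in the high lane but must land in the low lane.  Then e == 4 and
   which == 16, so rperm[1] holds index 4 at byte 16; the vpshufb with that
   mask puts op0[20] into byte 16 of h, the vpermq lane swap moves it down
   to byte 0 of hp, and rperm[0] holds -128 at byte 0 so the final vpor
   keeps it.  */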
47967 m128 = GEN_INT (-128);
47968 for (i = 0; i < nelt; ++i)
47970 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47971 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47973 for (j = 0; j < eltsz; ++j)
47975 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
47976 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
47980 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
47981 vperm = force_reg (V32QImode, vperm);
47983 h = gen_reg_rtx (V32QImode);
47984 op = gen_lowpart (V32QImode, d->op0);
47985 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
47987 /* Swap the 128-bit lanes of h into hp. */
47988 hp = gen_reg_rtx (V4DImode);
47989 op = gen_lowpart (V4DImode, h);
47990 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
47991 const1_rtx));
47993 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
47994 vperm = force_reg (V32QImode, vperm);
47996 l = gen_reg_rtx (V32QImode);
47997 op = gen_lowpart (V32QImode, d->op0);
47998 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48000 op = d->target;
48001 if (d->vmode != V32QImode)
48002 op = gen_reg_rtx (V32QImode);
48003 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
48004 if (op != d->target)
48005 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48007 return true;
48010 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48011 and extract-odd permutations of two V32QImode or V16HImode operands
48012 with two vpshufb insns, vpor and vpermq. We should have already
48013 failed all two or three instruction sequences. */
48015 static bool
48016 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
48018 rtx rperm[2][32], vperm, l, h, ior, op, m128;
48019 unsigned int i, nelt, eltsz;
48021 if (!TARGET_AVX2
48022 || d->one_operand_p
48023 || (d->vmode != V32QImode && d->vmode != V16HImode))
48024 return false;
48026 for (i = 0; i < d->nelt; ++i)
48027 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
48028 return false;
48030 if (d->testing_p)
48031 return true;
48033 nelt = d->nelt;
48034 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48036 /* Generate two permutation masks. In the first permutation mask
48037 the first quarter will contain indexes for the first half
48038 of the op0, the second quarter will contain bit 7 set, third quarter
48039 will contain indexes for the second half of the op0 and the
48040 last quarter bit 7 set. In the second permutation mask
48041 the first quarter will contain bit 7 set, the second quarter
48042 indexes for the first half of the op1, the third quarter bit 7 set
48043 and last quarter indexes for the second half of the op1.
48044 I.e. the first mask e.g. for V32QImode extract even will be:
48045 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
48046 (all values masked with 0xf except for -128) and second mask
48047 for extract even will be
48048 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
48049 m128 = GEN_INT (-128);
48050 for (i = 0; i < nelt; ++i)
48052 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48053 unsigned which = d->perm[i] >= nelt;
48054 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
48056 for (j = 0; j < eltsz; ++j)
48058 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
48059 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
48063 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
48064 vperm = force_reg (V32QImode, vperm);
48066 l = gen_reg_rtx (V32QImode);
48067 op = gen_lowpart (V32QImode, d->op0);
48068 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
48070 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
48071 vperm = force_reg (V32QImode, vperm);
48073 h = gen_reg_rtx (V32QImode);
48074 op = gen_lowpart (V32QImode, d->op1);
48075 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
48077 ior = gen_reg_rtx (V32QImode);
48078 emit_insn (gen_iorv32qi3 (ior, l, h));
48080 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
48081 op = gen_reg_rtx (V4DImode);
48082 ior = gen_lowpart (V4DImode, ior);
48083 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
48084 const1_rtx, GEN_INT (3)));
48085 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48087 return true;
48090 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48091 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
48092 with two "and" and "pack" or two "shift" and "pack" insns. We should
48093 have already failed all two instruction sequences. */
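/* Sketch of the approach: for the even extraction each operand is viewed
   in the mode with twice as wide elements and ANDed with 0xff resp. 0xffff
   so that only the even narrow elements survive; for the odd extraction a
   logical right shift by the narrow element width is used instead.  An
   unsigned-saturating pack then narrows the two intermediates back,
   concatenating the selected halves.  For the 256-bit AVX2 modes the pack
   works per 128-bit lane, so a final vpermq with the { 0, 2, 1, 3 }
   permutation puts the lanes back in order.  */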
48095 static bool
48096 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
48098 rtx op, dop0, dop1, t;
48099 unsigned i, odd, c, s, nelt = d->nelt;
48100 bool end_perm = false;
48101 machine_mode half_mode;
48102 rtx (*gen_and) (rtx, rtx, rtx);
48103 rtx (*gen_pack) (rtx, rtx, rtx);
48104 rtx (*gen_shift) (rtx, rtx, rtx);
48106 if (d->one_operand_p)
48107 return false;
48109 switch (d->vmode)
48111 case E_V8HImode:
48112 /* Required for "pack". */
48113 if (!TARGET_SSE4_1)
48114 return false;
48115 c = 0xffff;
48116 s = 16;
48117 half_mode = V4SImode;
48118 gen_and = gen_andv4si3;
48119 gen_pack = gen_sse4_1_packusdw;
48120 gen_shift = gen_lshrv4si3;
48121 break;
48122 case E_V16QImode:
48123 /* No check as all instructions are SSE2. */
48124 c = 0xff;
48125 s = 8;
48126 half_mode = V8HImode;
48127 gen_and = gen_andv8hi3;
48128 gen_pack = gen_sse2_packuswb;
48129 gen_shift = gen_lshrv8hi3;
48130 break;
48131 case E_V16HImode:
48132 if (!TARGET_AVX2)
48133 return false;
48134 c = 0xffff;
48135 s = 16;
48136 half_mode = V8SImode;
48137 gen_and = gen_andv8si3;
48138 gen_pack = gen_avx2_packusdw;
48139 gen_shift = gen_lshrv8si3;
48140 end_perm = true;
48141 break;
48142 case E_V32QImode:
48143 if (!TARGET_AVX2)
48144 return false;
48145 c = 0xff;
48146 s = 8;
48147 half_mode = V16HImode;
48148 gen_and = gen_andv16hi3;
48149 gen_pack = gen_avx2_packuswb;
48150 gen_shift = gen_lshrv16hi3;
48151 end_perm = true;
48152 break;
48153 default:
48154 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
48155 general shuffles. */
48156 return false;
48159 /* Check that permutation is even or odd. */
48160 odd = d->perm[0];
48161 if (odd > 1)
48162 return false;
48164 for (i = 1; i < nelt; ++i)
48165 if (d->perm[i] != 2 * i + odd)
48166 return false;
48168 if (d->testing_p)
48169 return true;
48171 dop0 = gen_reg_rtx (half_mode);
48172 dop1 = gen_reg_rtx (half_mode);
48173 if (odd == 0)
48175 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
48176 t = force_reg (half_mode, t);
48177 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
48178 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
48180 else
48182 emit_insn (gen_shift (dop0,
48183 gen_lowpart (half_mode, d->op0),
48184 GEN_INT (s)));
48185 emit_insn (gen_shift (dop1,
48186 gen_lowpart (half_mode, d->op1),
48187 GEN_INT (s)));
48189 /* In AVX2 for 256 bit case we need to permute pack result. */
48190 if (TARGET_AVX2 && end_perm)
48192 op = gen_reg_rtx (d->vmode);
48193 t = gen_reg_rtx (V4DImode);
48194 emit_insn (gen_pack (op, dop0, dop1));
48195 emit_insn (gen_avx2_permv4di_1 (t,
48196 gen_lowpart (V4DImode, op),
48197 const0_rtx,
48198 const2_rtx,
48199 const1_rtx,
48200 GEN_INT (3)));
48201 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
48203 else
48204 emit_insn (gen_pack (d->target, dop0, dop1));
48206 return true;
48209 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
48210 and extract-odd permutations of two V64QI operands
48211 with two "shifts", two "truncs" and one "concat" insns for "odd"
48212 and two "truncs" and one "concat" insn for "even".
48213 Have already failed all two instruction sequences. */
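/* In other words, each V64QImode operand is viewed as V32HImode; for the
   odd extraction the words are first shifted right by 8 so the odd byte
   becomes the low byte, then vpmovwb truncates every word to that byte and
   the two 32-byte results are concatenated.  The even extraction skips the
   shifts.  */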
48215 static bool
48216 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
48218 rtx t1, t2, t3, t4;
48219 unsigned i, odd, nelt = d->nelt;
48221 if (!TARGET_AVX512BW
48222 || d->one_operand_p
48223 || d->vmode != V64QImode)
48224 return false;
48226 /* Check that permutation is even or odd. */
48227 odd = d->perm[0];
48228 if (odd > 1)
48229 return false;
48231 for (i = 1; i < nelt; ++i)
48232 if (d->perm[i] != 2 * i + odd)
48233 return false;
48235 if (d->testing_p)
48236 return true;
48239 if (odd)
48241 t1 = gen_reg_rtx (V32HImode);
48242 t2 = gen_reg_rtx (V32HImode);
48243 emit_insn (gen_lshrv32hi3 (t1,
48244 gen_lowpart (V32HImode, d->op0),
48245 GEN_INT (8)));
48246 emit_insn (gen_lshrv32hi3 (t2,
48247 gen_lowpart (V32HImode, d->op1),
48248 GEN_INT (8)));
48250 else
48252 t1 = gen_lowpart (V32HImode, d->op0);
48253 t2 = gen_lowpart (V32HImode, d->op1);
48256 t3 = gen_reg_rtx (V32QImode);
48257 t4 = gen_reg_rtx (V32QImode);
48258 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
48259 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
48260 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
48262 return true;
48265 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
48266 and extract-odd permutations. */
48268 static bool
48269 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
48271 rtx t1, t2, t3, t4, t5;
48273 switch (d->vmode)
48275 case E_V4DFmode:
48276 if (d->testing_p)
48277 break;
48278 t1 = gen_reg_rtx (V4DFmode);
48279 t2 = gen_reg_rtx (V4DFmode);
48281 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48282 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
48283 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
48285 /* Now an unpck[lh]pd will produce the result required. */
48286 if (odd)
48287 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
48288 else
48289 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
48290 emit_insn (t3);
48291 break;
48293 case E_V8SFmode:
48295 int mask = odd ? 0xdd : 0x88;
48297 if (d->testing_p)
48298 break;
48299 t1 = gen_reg_rtx (V8SFmode);
48300 t2 = gen_reg_rtx (V8SFmode);
48301 t3 = gen_reg_rtx (V8SFmode);
48303 /* Shuffle within the 128-bit lanes to produce:
48304 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
48305 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
48306 GEN_INT (mask)));
48308 /* Shuffle the lanes around to produce:
48309 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
48310 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
48311 GEN_INT (0x3)));
48313 /* Shuffle within the 128-bit lanes to produce:
48314 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
48315 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
48317 /* Shuffle within the 128-bit lanes to produce:
48318 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
48319 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
48321 /* Shuffle the lanes around to produce:
48322 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
48323 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
48324 GEN_INT (0x20)));
48326 break;
48328 case E_V2DFmode:
48329 case E_V4SFmode:
48330 case E_V2DImode:
48331 case E_V4SImode:
48332 /* These are always directly implementable by expand_vec_perm_1. */
48333 gcc_unreachable ();
48335 case E_V8HImode:
48336 if (TARGET_SSE4_1)
48337 return expand_vec_perm_even_odd_pack (d);
48338 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
48339 return expand_vec_perm_pshufb2 (d);
48340 else
48342 if (d->testing_p)
48343 break;
48344 /* We need 2*log2(N)-1 operations to achieve odd/even
48345 with interleave. */
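/* E.g. for the even extraction of { 0 1 2 3 4 5 6 7 } and
   { 8 9 10 11 12 13 14 15 }: the first pair of interleaves gives
   t1 = { 4 12 5 13 6 14 7 15 } and target = { 0 8 1 9 2 10 3 11 };
   the second pair gives t2 = { 2 6 10 14 3 7 11 15 } and
   target = { 0 4 8 12 1 5 9 13 }; the final low interleave then produces
   { 0 2 4 6 8 10 12 14 }, while the high interleave would yield the odd
   elements.  */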
48346 t1 = gen_reg_rtx (V8HImode);
48347 t2 = gen_reg_rtx (V8HImode);
48348 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
48349 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
48350 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
48351 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
48352 if (odd)
48353 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
48354 else
48355 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
48356 emit_insn (t3);
48358 break;
48360 case E_V16QImode:
48361 return expand_vec_perm_even_odd_pack (d);
48363 case E_V16HImode:
48364 case E_V32QImode:
48365 return expand_vec_perm_even_odd_pack (d);
48367 case E_V64QImode:
48368 return expand_vec_perm_even_odd_trunc (d);
48370 case E_V4DImode:
48371 if (!TARGET_AVX2)
48373 struct expand_vec_perm_d d_copy = *d;
48374 d_copy.vmode = V4DFmode;
48375 if (d->testing_p)
48376 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
48377 else
48378 d_copy.target = gen_reg_rtx (V4DFmode);
48379 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
48380 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
48381 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48383 if (!d->testing_p)
48384 emit_move_insn (d->target,
48385 gen_lowpart (V4DImode, d_copy.target));
48386 return true;
48388 return false;
48391 if (d->testing_p)
48392 break;
48394 t1 = gen_reg_rtx (V4DImode);
48395 t2 = gen_reg_rtx (V4DImode);
48397 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
48398 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
48399 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
48401 /* Now a vpunpck[lh]qdq will produce the result required. */
48402 if (odd)
48403 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
48404 else
48405 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
48406 emit_insn (t3);
48407 break;
48409 case E_V8SImode:
48410 if (!TARGET_AVX2)
48412 struct expand_vec_perm_d d_copy = *d;
48413 d_copy.vmode = V8SFmode;
48414 if (d->testing_p)
48415 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
48416 else
48417 d_copy.target = gen_reg_rtx (V8SFmode);
48418 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
48419 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
48420 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
48422 if (!d->testing_p)
48423 emit_move_insn (d->target,
48424 gen_lowpart (V8SImode, d_copy.target));
48425 return true;
48427 return false;
48430 if (d->testing_p)
48431 break;
48433 t1 = gen_reg_rtx (V8SImode);
48434 t2 = gen_reg_rtx (V8SImode);
48435 t3 = gen_reg_rtx (V4DImode);
48436 t4 = gen_reg_rtx (V4DImode);
48437 t5 = gen_reg_rtx (V4DImode);
48439 /* Shuffle the lanes around into
48440 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
48441 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
48442 gen_lowpart (V4DImode, d->op1),
48443 GEN_INT (0x20)));
48444 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
48445 gen_lowpart (V4DImode, d->op1),
48446 GEN_INT (0x31)));
48448 /* Swap the 2nd and 3rd position in each lane into
48449 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
48450 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
48451 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48452 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
48453 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
48455 /* Now a vpunpck[lh]qdq will produce
48456 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
48457 if (odd)
48458 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
48459 gen_lowpart (V4DImode, t2));
48460 else
48461 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
48462 gen_lowpart (V4DImode, t2));
48463 emit_insn (t3);
48464 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
48465 break;
48467 default:
48468 gcc_unreachable ();
48471 return true;
48474 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48475 extract-even and extract-odd permutations. */
48477 static bool
48478 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
48480 unsigned i, odd, nelt = d->nelt;
48482 odd = d->perm[0];
48483 if (odd != 0 && odd != 1)
48484 return false;
48486 for (i = 1; i < nelt; ++i)
48487 if (d->perm[i] != 2 * i + odd)
48488 return false;
48490 return expand_vec_perm_even_odd_1 (d, odd);
48493 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
48494 permutations. We assume that expand_vec_perm_1 has already failed. */
48496 static bool
48497 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
48499 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
48500 machine_mode vmode = d->vmode;
48501 unsigned char perm2[4];
48502 rtx op0 = d->op0, dest;
48503 bool ok;
48505 switch (vmode)
48507 case E_V4DFmode:
48508 case E_V8SFmode:
48509 /* These are special-cased in sse.md so that we can optionally
48510 use the vbroadcast instruction. They expand to two insns
48511 if the input happens to be in a register. */
48512 gcc_unreachable ();
48514 case E_V2DFmode:
48515 case E_V2DImode:
48516 case E_V4SFmode:
48517 case E_V4SImode:
48518 /* These are always implementable using standard shuffle patterns. */
48519 gcc_unreachable ();
48521 case E_V8HImode:
48522 case E_V16QImode:
48523 /* These can be implemented via interleave. We save one insn by
48524 stopping once we have promoted to V4SImode and then use pshufd. */
48525 if (d->testing_p)
48526 return true;
48529 rtx dest;
48530 rtx (*gen) (rtx, rtx, rtx)
48531 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
48532 : gen_vec_interleave_lowv8hi;
48534 if (elt >= nelt2)
48536 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
48537 : gen_vec_interleave_highv8hi;
48538 elt -= nelt2;
48540 nelt2 /= 2;
48542 dest = gen_reg_rtx (vmode);
48543 emit_insn (gen (dest, op0, op0));
48544 vmode = get_mode_wider_vector (vmode);
48545 op0 = gen_lowpart (vmode, dest);
48547 while (vmode != V4SImode);
48549 memset (perm2, elt, 4);
48550 dest = gen_reg_rtx (V4SImode);
48551 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
48552 gcc_assert (ok);
48553 if (!d->testing_p)
48554 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
48555 return true;
48557 case E_V64QImode:
48558 case E_V32QImode:
48559 case E_V16HImode:
48560 case E_V8SImode:
48561 case E_V4DImode:
48562 /* For AVX2 broadcasts of the first element vpbroadcast* or
48563 vpermq should be used by expand_vec_perm_1. */
48564 gcc_assert (!TARGET_AVX2 || d->perm[0]);
48565 return false;
48567 default:
48568 gcc_unreachable ();
48572 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
48573 broadcast permutations. */
48575 static bool
48576 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
48578 unsigned i, elt, nelt = d->nelt;
48580 if (!d->one_operand_p)
48581 return false;
48583 elt = d->perm[0];
48584 for (i = 1; i < nelt; ++i)
48585 if (d->perm[i] != elt)
48586 return false;
48588 return expand_vec_perm_broadcast_1 (d);
48591 /* Implement arbitrary permutations of two V64QImode operands
48592 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
48593 static bool
48594 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
48596 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
48597 return false;
48599 if (d->testing_p)
48600 return true;
48602 struct expand_vec_perm_d ds[2];
48603 rtx rperm[128], vperm, target0, target1;
48604 unsigned int i, nelt;
48605 machine_mode vmode;
48607 nelt = d->nelt;
48608 vmode = V64QImode;
48610 for (i = 0; i < 2; i++)
48612 ds[i] = *d;
48613 ds[i].vmode = V32HImode;
48614 ds[i].nelt = 32;
48615 ds[i].target = gen_reg_rtx (V32HImode);
48616 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
48617 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
48620 /* Prepare permutations such that the first one takes care of
48621 putting the even bytes into the right positions or one position
48622 higher (ds[0]) and the second one takes care of
48623 putting the odd bytes into the right positions or one position below
48624 (ds[1]). */
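/* Illustration: if d->perm[0] == 33, then ds[0].perm[0] == 16, i.e. the
   word-level permutation brings word 16 of op0 (bytes 32/33) into word 0
   of ds[0].target; rperm[0] == 1 then makes the following vpshufb select
   the high byte of that word, so byte 33 ends up in result byte 0, while
   the second byte mask holds -1 there and contributes zero to the vpor.  */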
48626 for (i = 0; i < nelt; i++)
48628 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
48629 if (i & 1)
48631 rperm[i] = constm1_rtx;
48632 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48634 else
48636 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
48637 rperm[i + 64] = constm1_rtx;
48641 bool ok = expand_vec_perm_1 (&ds[0]);
48642 gcc_assert (ok);
48643 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
48645 ok = expand_vec_perm_1 (&ds[1]);
48646 gcc_assert (ok);
48647 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
48649 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
48650 vperm = force_reg (vmode, vperm);
48651 target0 = gen_reg_rtx (V64QImode);
48652 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
48654 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
48655 vperm = force_reg (vmode, vperm);
48656 target1 = gen_reg_rtx (V64QImode);
48657 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
48659 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
48660 return true;
48663 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
48664 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
48665 all the shorter instruction sequences. */
48667 static bool
48668 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
48670 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
48671 unsigned int i, nelt, eltsz;
48672 bool used[4];
48674 if (!TARGET_AVX2
48675 || d->one_operand_p
48676 || (d->vmode != V32QImode && d->vmode != V16HImode))
48677 return false;
48679 if (d->testing_p)
48680 return true;
48682 nelt = d->nelt;
48683 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
48685 /* Generate 4 permutation masks. If the required element is within
48686 the same lane, it is shuffled in. If the required element is from the
48687 other lane, force a zero by setting bit 7 in the permutation mask.
48688 The other mask has a non-negative element wherever an element is
48689 requested from the other lane, but that element is also moved to the other lane,
48690 so that the result of vpshufb can have the two V2TImode halves
48691 swapped. */
48692 m128 = GEN_INT (-128);
48693 for (i = 0; i < 32; ++i)
48695 rperm[0][i] = m128;
48696 rperm[1][i] = m128;
48697 rperm[2][i] = m128;
48698 rperm[3][i] = m128;
48700 used[0] = false;
48701 used[1] = false;
48702 used[2] = false;
48703 used[3] = false;
48704 for (i = 0; i < nelt; ++i)
48706 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
48707 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
48708 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
48710 for (j = 0; j < eltsz; ++j)
48711 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
48712 used[which] = true;
48715 for (i = 0; i < 2; ++i)
48717 if (!used[2 * i + 1])
48719 h[i] = NULL_RTX;
48720 continue;
48722 vperm = gen_rtx_CONST_VECTOR (V32QImode,
48723 gen_rtvec_v (32, rperm[2 * i + 1]));
48724 vperm = force_reg (V32QImode, vperm);
48725 h[i] = gen_reg_rtx (V32QImode);
48726 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48727 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
48730 /* Swap the 128-bit lanes of h[X]. */
48731 for (i = 0; i < 2; ++i)
48733 if (h[i] == NULL_RTX)
48734 continue;
48735 op = gen_reg_rtx (V4DImode);
48736 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
48737 const2_rtx, GEN_INT (3), const0_rtx,
48738 const1_rtx));
48739 h[i] = gen_lowpart (V32QImode, op);
48742 for (i = 0; i < 2; ++i)
48744 if (!used[2 * i])
48746 l[i] = NULL_RTX;
48747 continue;
48749 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
48750 vperm = force_reg (V32QImode, vperm);
48751 l[i] = gen_reg_rtx (V32QImode);
48752 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
48753 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
48756 for (i = 0; i < 2; ++i)
48758 if (h[i] && l[i])
48760 op = gen_reg_rtx (V32QImode);
48761 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
48762 l[i] = op;
48764 else if (h[i])
48765 l[i] = h[i];
48768 gcc_assert (l[0] && l[1]);
48769 op = d->target;
48770 if (d->vmode != V32QImode)
48771 op = gen_reg_rtx (V32QImode);
48772 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
48773 if (op != d->target)
48774 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
48775 return true;
48778 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
48779 taken care of, perform the expansion in D and return true on success. */
48781 static bool
48782 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
48784 /* Try a single instruction expansion. */
48785 if (expand_vec_perm_1 (d))
48786 return true;
48788 /* Try sequences of two instructions. */
48790 if (expand_vec_perm_pshuflw_pshufhw (d))
48791 return true;
48793 if (expand_vec_perm_palignr (d, false))
48794 return true;
48796 if (expand_vec_perm_interleave2 (d))
48797 return true;
48799 if (expand_vec_perm_broadcast (d))
48800 return true;
48802 if (expand_vec_perm_vpermq_perm_1 (d))
48803 return true;
48805 if (expand_vec_perm_vperm2f128 (d))
48806 return true;
48808 if (expand_vec_perm_pblendv (d))
48809 return true;
48811 /* Try sequences of three instructions. */
48813 if (expand_vec_perm_even_odd_pack (d))
48814 return true;
48816 if (expand_vec_perm_2vperm2f128_vshuf (d))
48817 return true;
48819 if (expand_vec_perm_pshufb2 (d))
48820 return true;
48822 if (expand_vec_perm_interleave3 (d))
48823 return true;
48825 if (expand_vec_perm_vperm2f128_vblend (d))
48826 return true;
48828 /* Try sequences of four instructions. */
48830 if (expand_vec_perm_even_odd_trunc (d))
48831 return true;
48832 if (expand_vec_perm_vpshufb2_vpermq (d))
48833 return true;
48835 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
48836 return true;
48838 if (expand_vec_perm_vpermt2_vpshub2 (d))
48839 return true;
48841 /* ??? Look for narrow permutations whose element orderings would
48842 allow the promotion to a wider mode. */
48844 /* ??? Look for sequences of interleave or a wider permute that place
48845 the data into the correct lanes for a half-vector shuffle like
48846 pshuf[lh]w or vpermilps. */
48848 /* ??? Look for sequences of interleave that produce the desired results.
48849 The combinatorics of punpck[lh] get pretty ugly... */
48851 if (expand_vec_perm_even_odd (d))
48852 return true;
48854 /* Even longer sequences. */
48855 if (expand_vec_perm_vpshufb4_vpermq2 (d))
48856 return true;
48858 /* See if we can get the same permutation in different vector integer
48859 mode. */
48860 struct expand_vec_perm_d nd;
48861 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
48863 if (!d->testing_p)
48864 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
48865 return true;
48868 return false;
48871 /* If a permutation only uses one operand, make it clear. Returns true
48872 if the permutation references both operands. */
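/* E.g. with nelt == 4 and perm { 5, 6, 7, 4 } only the second operand is
   referenced, so the indices are reduced to { 1, 2, 3, 0 }, op0 is replaced
   by op1, one_operand_p stays set and false is returned.  */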
48874 static bool
48875 canonicalize_perm (struct expand_vec_perm_d *d)
48877 int i, which, nelt = d->nelt;
48879 for (i = which = 0; i < nelt; ++i)
48880 which |= (d->perm[i] < nelt ? 1 : 2);
48882 d->one_operand_p = true;
48883 switch (which)
48885 default:
48886 gcc_unreachable();
48888 case 3:
48889 if (!rtx_equal_p (d->op0, d->op1))
48891 d->one_operand_p = false;
48892 break;
48894 /* The elements of PERM do not suggest that only the first operand
48895 is used, but both operands are identical. Allow easier matching
48896 of the permutation by folding the permutation into the single
48897 input vector. */
48898 /* FALLTHRU */
48900 case 2:
48901 for (i = 0; i < nelt; ++i)
48902 d->perm[i] &= nelt - 1;
48903 d->op0 = d->op1;
48904 break;
48906 case 1:
48907 d->op1 = d->op0;
48908 break;
48911 return (which == 3);
48914 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
48916 static bool
48917 ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
48918 rtx op1, const vec_perm_indices &sel)
48920 struct expand_vec_perm_d d;
48921 unsigned char perm[MAX_VECT_LEN];
48922 unsigned int i, nelt, which;
48923 bool two_args;
48925 d.target = target;
48926 d.op0 = op0;
48927 d.op1 = op1;
48929 d.vmode = vmode;
48930 gcc_assert (VECTOR_MODE_P (d.vmode));
48931 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
48932 d.testing_p = !target;
48934 gcc_assert (sel.length () == nelt);
48935 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
48937 /* Given sufficient ISA support we can just return true here
48938 for selected vector modes. */
48939 switch (d.vmode)
48941 case E_V16SFmode:
48942 case E_V16SImode:
48943 case E_V8DImode:
48944 case E_V8DFmode:
48945 if (!TARGET_AVX512F)
48946 return false;
48947 /* All implementable with a single vperm[it]2 insn. */
48948 if (d.testing_p)
48949 return true;
48950 break;
48951 case E_V32HImode:
48952 if (!TARGET_AVX512BW)
48953 return false;
48954 if (d.testing_p)
48955 /* All implementable with a single vperm[it]2 insn. */
48956 return true;
48957 break;
48958 case E_V64QImode:
48959 if (!TARGET_AVX512BW)
48960 return false;
48961 if (d.testing_p)
48962 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
48963 return true;
48964 break;
48965 case E_V8SImode:
48966 case E_V8SFmode:
48967 case E_V4DFmode:
48968 case E_V4DImode:
48969 if (!TARGET_AVX)
48970 return false;
48971 if (d.testing_p && TARGET_AVX512VL)
48972 /* All implementable with a single vperm[it]2 insn. */
48973 return true;
48974 break;
48975 case E_V16HImode:
48976 if (!TARGET_SSE2)
48977 return false;
48978 if (d.testing_p && TARGET_AVX2)
48979 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48980 return true;
48981 break;
48982 case E_V32QImode:
48983 if (!TARGET_SSE2)
48984 return false;
48985 if (d.testing_p && TARGET_AVX2)
48986 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
48987 return true;
48988 break;
48989 case E_V8HImode:
48990 case E_V16QImode:
48991 if (!TARGET_SSE2)
48992 return false;
48993 /* Fall through. */
48994 case E_V4SImode:
48995 case E_V4SFmode:
48996 if (!TARGET_SSE)
48997 return false;
48998 /* All implementable with a single vpperm insn. */
48999 if (d.testing_p && TARGET_XOP)
49000 return true;
49001 /* All implementable with 2 pshufb + 1 ior. */
49002 if (d.testing_p && TARGET_SSSE3)
49003 return true;
49004 break;
49005 case E_V2DImode:
49006 case E_V2DFmode:
49007 if (!TARGET_SSE)
49008 return false;
49009 /* All implementable with shufpd or unpck[lh]pd. */
49010 if (d.testing_p)
49011 return true;
49012 break;
49013 default:
49014 return false;
49017 for (i = which = 0; i < nelt; ++i)
49019 unsigned char e = sel[i];
49020 gcc_assert (e < 2 * nelt);
49021 d.perm[i] = e;
49022 perm[i] = e;
49023 which |= (e < nelt ? 1 : 2);
49026 if (d.testing_p)
49028 /* For all elements from second vector, fold the elements to first. */
49029 if (which == 2)
49030 for (i = 0; i < nelt; ++i)
49031 d.perm[i] -= nelt;
49033 /* Check whether the mask can be applied to the vector type. */
49034 d.one_operand_p = (which != 3);
49036 /* Implementable with shufps or pshufd. */
49037 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
49038 return true;
49040 /* Otherwise we have to go through the motions and see if we can
49041 figure out how to generate the requested permutation. */
49042 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
49043 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
49044 if (!d.one_operand_p)
49045 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
49047 start_sequence ();
49048 bool ret = ix86_expand_vec_perm_const_1 (&d);
49049 end_sequence ();
49051 return ret;
49054 two_args = canonicalize_perm (&d);
49056 if (ix86_expand_vec_perm_const_1 (&d))
49057 return true;
49059 /* If the selector says both arguments are needed, but the operands are the
49060 same, the above tried to expand with one_operand_p and flattened selector.
49061 If that didn't work, retry without one_operand_p; we succeeded with that
49062 during testing. */
49063 if (two_args && d.one_operand_p)
49065 d.one_operand_p = false;
49066 memcpy (d.perm, perm, sizeof (perm));
49067 return ix86_expand_vec_perm_const_1 (&d);
49070 return false;
49073 void
49074 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
49076 struct expand_vec_perm_d d;
49077 unsigned i, nelt;
49079 d.target = targ;
49080 d.op0 = op0;
49081 d.op1 = op1;
49082 d.vmode = GET_MODE (targ);
49083 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49084 d.one_operand_p = false;
49085 d.testing_p = false;
49087 for (i = 0; i < nelt; ++i)
49088 d.perm[i] = i * 2 + odd;
49090 /* We'll either be able to implement the permutation directly... */
49091 if (expand_vec_perm_1 (&d))
49092 return;
49094 /* ... or we use the special-case patterns. */
49095 expand_vec_perm_even_odd_1 (&d, odd);
49098 static void
49099 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
49101 struct expand_vec_perm_d d;
49102 unsigned i, nelt, base;
49103 bool ok;
49105 d.target = targ;
49106 d.op0 = op0;
49107 d.op1 = op1;
49108 d.vmode = GET_MODE (targ);
49109 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
49110 d.one_operand_p = false;
49111 d.testing_p = false;
49113 base = high_p ? nelt / 2 : 0;
49114 for (i = 0; i < nelt / 2; ++i)
49116 d.perm[i * 2] = i + base;
49117 d.perm[i * 2 + 1] = i + base + nelt;
49120 /* Note that for AVX this isn't one instruction. */
49121 ok = ix86_expand_vec_perm_const_1 (&d);
49122 gcc_assert (ok);
49126 /* Expand a vector operation CODE for a V*QImode in terms of the
49127 same operation on V*HImode. */
49129 void
49130 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
49132 machine_mode qimode = GET_MODE (dest);
49133 machine_mode himode;
49134 rtx (*gen_il) (rtx, rtx, rtx);
49135 rtx (*gen_ih) (rtx, rtx, rtx);
49136 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
49137 struct expand_vec_perm_d d;
49138 bool ok, full_interleave;
49139 bool uns_p = false;
49140 int i;
49142 switch (qimode)
49144 case E_V16QImode:
49145 himode = V8HImode;
49146 gen_il = gen_vec_interleave_lowv16qi;
49147 gen_ih = gen_vec_interleave_highv16qi;
49148 break;
49149 case E_V32QImode:
49150 himode = V16HImode;
49151 gen_il = gen_avx2_interleave_lowv32qi;
49152 gen_ih = gen_avx2_interleave_highv32qi;
49153 break;
49154 case E_V64QImode:
49155 himode = V32HImode;
49156 gen_il = gen_avx512bw_interleave_lowv64qi;
49157 gen_ih = gen_avx512bw_interleave_highv64qi;
49158 break;
49159 default:
49160 gcc_unreachable ();
49163 op2_l = op2_h = op2;
49164 switch (code)
49166 case MULT:
49167 /* Unpack data such that we've got a source byte in each low byte of
49168 each word. We don't care what goes into the high byte of each word.
49169 Rather than trying to get zero in there, most convenient is to let
49170 it be a copy of the low byte. */
49171 op2_l = gen_reg_rtx (qimode);
49172 op2_h = gen_reg_rtx (qimode);
49173 emit_insn (gen_il (op2_l, op2, op2));
49174 emit_insn (gen_ih (op2_h, op2, op2));
49176 op1_l = gen_reg_rtx (qimode);
49177 op1_h = gen_reg_rtx (qimode);
49178 emit_insn (gen_il (op1_l, op1, op1));
49179 emit_insn (gen_ih (op1_h, op1, op1));
49180 full_interleave = qimode == V16QImode;
49181 break;
49183 case ASHIFT:
49184 case LSHIFTRT:
49185 uns_p = true;
49186 /* FALLTHRU */
49187 case ASHIFTRT:
49188 op1_l = gen_reg_rtx (himode);
49189 op1_h = gen_reg_rtx (himode);
49190 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
49191 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
49192 full_interleave = true;
49193 break;
49194 default:
49195 gcc_unreachable ();
49198 /* Perform the operation. */
49199 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
49200 1, OPTAB_DIRECT);
49201 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
49202 1, OPTAB_DIRECT);
49203 gcc_assert (res_l && res_h);
49205 /* Merge the data back into the right place. */
49206 d.target = dest;
49207 d.op0 = gen_lowpart (qimode, res_l);
49208 d.op1 = gen_lowpart (qimode, res_h);
49209 d.vmode = qimode;
49210 d.nelt = GET_MODE_NUNITS (qimode);
49211 d.one_operand_p = false;
49212 d.testing_p = false;
49214 if (full_interleave)
49216 /* For SSE2, we used a full interleave, so the desired
49217 results are in the even elements. */
49218 for (i = 0; i < d.nelt; ++i)
49219 d.perm[i] = i * 2;
49221 else
49223 /* For AVX, the interleave used above was not cross-lane. So the
49224 extraction is of the even elements, but with the second and third quarters swapped.
49225 Happily, that is even one insn shorter than even extraction.
49226 For AVX512BW we have 4 lanes. We extract evens from within a lane,
49227 always first from the first and then from the second source operand;
49228 the index bits above the low 4 bits remain the same.
49229 Thus, for d.nelt == 32 we want permutation
49230 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
49231 and for d.nelt == 64 we want permutation
49232 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
49233 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
49234 for (i = 0; i < d.nelt; ++i)
49235 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
49238 ok = ix86_expand_vec_perm_const_1 (&d);
49239 gcc_assert (ok);
49241 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49242 gen_rtx_fmt_ee (code, qimode, op1, op2));
49245 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
49246 if op is CONST_VECTOR with all odd elements equal to their
49247 preceding element. */
49249 static bool
49250 const_vector_equal_evenodd_p (rtx op)
49252 machine_mode mode = GET_MODE (op);
49253 int i, nunits = GET_MODE_NUNITS (mode);
49254 if (GET_CODE (op) != CONST_VECTOR
49255 || nunits != CONST_VECTOR_NUNITS (op))
49256 return false;
49257 for (i = 0; i < nunits; i += 2)
49258 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
49259 return false;
49260 return true;
49263 void
49264 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
49265 bool uns_p, bool odd_p)
49267 machine_mode mode = GET_MODE (op1);
49268 machine_mode wmode = GET_MODE (dest);
49269 rtx x;
49270 rtx orig_op1 = op1, orig_op2 = op2;
49272 if (!nonimmediate_operand (op1, mode))
49273 op1 = force_reg (mode, op1);
49274 if (!nonimmediate_operand (op2, mode))
49275 op2 = force_reg (mode, op2);
49277 /* We only play even/odd games with vectors of SImode. */
49278 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
49280 /* If we're looking for the odd results, shift those members down to
49281 the even slots. For some cpus this is faster than a PSHUFD. */
49282 if (odd_p)
49284 /* For XOP use vpmacsdqh, but only for smult, as it is only
49285 signed. */
49286 if (TARGET_XOP && mode == V4SImode && !uns_p)
49288 x = force_reg (wmode, CONST0_RTX (wmode));
49289 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
49290 return;
49293 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
49294 if (!const_vector_equal_evenodd_p (orig_op1))
49295 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
49296 x, NULL, 1, OPTAB_DIRECT);
49297 if (!const_vector_equal_evenodd_p (orig_op2))
49298 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
49299 x, NULL, 1, OPTAB_DIRECT);
49300 op1 = gen_lowpart (mode, op1);
49301 op2 = gen_lowpart (mode, op2);
49304 if (mode == V16SImode)
49306 if (uns_p)
49307 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
49308 else
49309 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
49311 else if (mode == V8SImode)
49313 if (uns_p)
49314 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
49315 else
49316 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
49318 else if (uns_p)
49319 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
49320 else if (TARGET_SSE4_1)
49321 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
49322 else
49324 rtx s1, s2, t0, t1, t2;
49326 /* The easiest way to implement this without PMULDQ is to go through
49327 the motions as if we are performing a full 64-bit multiply. With
49328 the exception that we need to do less shuffling of the elements. */
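/* The identity used (a two's complement fact): with AH = (A < 0 ? -1 : 0)
   and BH = (B < 0 ? -1 : 0) standing in for the high halves, and Au/Bu the
   zero-extended 32-bit values, the signed 64-bit product A * B equals
   Au * Bu + ((AH * Bu + BH * Au) << 32) modulo 2^64, so every partial
   product can be formed with the unsigned even multiply.  */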
49330 /* Compute the sign-extension, aka highparts, of the two operands. */
49331 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49332 op1, pc_rtx, pc_rtx);
49333 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
49334 op2, pc_rtx, pc_rtx);
49336 /* Multiply LO(A) * HI(B), and vice-versa. */
49337 t1 = gen_reg_rtx (wmode);
49338 t2 = gen_reg_rtx (wmode);
49339 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
49340 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
49342 /* Multiply LO(A) * LO(B). */
49343 t0 = gen_reg_rtx (wmode);
49344 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
49346 /* Combine and shift the highparts into place. */
49347 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
49348 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
49349 1, OPTAB_DIRECT);
49351 /* Combine high and low parts. */
49352 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
49353 return;
49355 emit_insn (x);
49358 void
49359 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
49360 bool uns_p, bool high_p)
49362 machine_mode wmode = GET_MODE (dest);
49363 machine_mode mode = GET_MODE (op1);
49364 rtx t1, t2, t3, t4, mask;
49366 switch (mode)
49368 case E_V4SImode:
49369 t1 = gen_reg_rtx (mode);
49370 t2 = gen_reg_rtx (mode);
49371 if (TARGET_XOP && !uns_p)
49373 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
49374 shuffle the elements once so that all elements are in the right
49375 place for immediate use: { A C B D }. */
49376 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
49377 const1_rtx, GEN_INT (3)));
49378 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
49379 const1_rtx, GEN_INT (3)));
49381 else
49383 /* Put the elements into place for the multiply. */
49384 ix86_expand_vec_interleave (t1, op1, op1, high_p);
49385 ix86_expand_vec_interleave (t2, op2, op2, high_p);
49386 high_p = false;
49388 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
49389 break;
49391 case E_V8SImode:
49392 /* Shuffle the elements between the lanes. After this we
49393 have { A B E F | C D G H } for each operand. */
49394 t1 = gen_reg_rtx (V4DImode);
49395 t2 = gen_reg_rtx (V4DImode);
49396 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
49397 const0_rtx, const2_rtx,
49398 const1_rtx, GEN_INT (3)));
49399 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
49400 const0_rtx, const2_rtx,
49401 const1_rtx, GEN_INT (3)));
49403 /* Shuffle the elements within the lanes. After this we
49404 have { A A B B | C C D D } or { E E F F | G G H H }. */
49405 t3 = gen_reg_rtx (V8SImode);
49406 t4 = gen_reg_rtx (V8SImode);
49407 mask = GEN_INT (high_p
49408 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
49409 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
49410 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
49411 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
49413 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
49414 break;
49416 case E_V8HImode:
49417 case E_V16HImode:
49418 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
49419 uns_p, OPTAB_DIRECT);
49420 t2 = expand_binop (mode,
49421 uns_p ? umul_highpart_optab : smul_highpart_optab,
49422 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
49423 gcc_assert (t1 && t2);
49425 t3 = gen_reg_rtx (mode);
49426 ix86_expand_vec_interleave (t3, t1, t2, high_p);
49427 emit_move_insn (dest, gen_lowpart (wmode, t3));
49428 break;
49430 case E_V16QImode:
49431 case E_V32QImode:
49432 case E_V32HImode:
49433 case E_V16SImode:
49434 case E_V64QImode:
49435 t1 = gen_reg_rtx (wmode);
49436 t2 = gen_reg_rtx (wmode);
49437 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
49438 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
49440 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
49441 break;
49443 default:
49444 gcc_unreachable ();
49448 void
49449 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
49451 rtx res_1, res_2, res_3, res_4;
49453 res_1 = gen_reg_rtx (V4SImode);
49454 res_2 = gen_reg_rtx (V4SImode);
49455 res_3 = gen_reg_rtx (V2DImode);
49456 res_4 = gen_reg_rtx (V2DImode);
49457 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
49458 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
49460 /* Move the results in element 2 down to element 1; we don't care
49461 what goes in elements 2 and 3. Then we can merge the parts
49462 back together with an interleave.
49464 Note that two other sequences were tried:
49465 (1) Use interleaves at the start instead of psrldq, which allows
49466 us to use a single shufps to merge things back at the end.
49467 (2) Use shufps here to combine the two vectors, then pshufd to
49468 put the elements in the correct order.
49469 In both cases the cost of the reformatting stall was too high
49470 and the overall sequence slower. */
49472 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
49473 const0_rtx, const2_rtx,
49474 const0_rtx, const0_rtx));
49475 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
49476 const0_rtx, const2_rtx,
49477 const0_rtx, const0_rtx));
49478 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
49480 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
49483 void
49484 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
49486 machine_mode mode = GET_MODE (op0);
49487 rtx t1, t2, t3, t4, t5, t6;
49489 if (TARGET_AVX512DQ && mode == V8DImode)
49490 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
49491 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
49492 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
49493 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
49494 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
49495 else if (TARGET_XOP && mode == V2DImode)
49497 /* op1: A,B,C,D, op2: E,F,G,H */
49498 op1 = gen_lowpart (V4SImode, op1);
49499 op2 = gen_lowpart (V4SImode, op2);
49501 t1 = gen_reg_rtx (V4SImode);
49502 t2 = gen_reg_rtx (V4SImode);
49503 t3 = gen_reg_rtx (V2DImode);
49504 t4 = gen_reg_rtx (V2DImode);
49506 /* t1: B,A,D,C */
49507 emit_insn (gen_sse2_pshufd_1 (t1, op1,
49508 GEN_INT (1),
49509 GEN_INT (0),
49510 GEN_INT (3),
49511 GEN_INT (2)));
49513 /* t2: (B*E),(A*F),(D*G),(C*H) */
49514 emit_insn (gen_mulv4si3 (t2, t1, op2));
49516 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
49517 emit_insn (gen_xop_phadddq (t3, t2));
49519 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
49520 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
49522 /* Multiply lower parts and add all */
49523 t5 = gen_reg_rtx (V2DImode);
49524 emit_insn (gen_vec_widen_umult_even_v4si (t5,
49525 gen_lowpart (V4SImode, op1),
49526 gen_lowpart (V4SImode, op2)));
49527 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
49530 else
49532 machine_mode nmode;
49533 rtx (*umul) (rtx, rtx, rtx);
49535 if (mode == V2DImode)
49537 umul = gen_vec_widen_umult_even_v4si;
49538 nmode = V4SImode;
49540 else if (mode == V4DImode)
49542 umul = gen_vec_widen_umult_even_v8si;
49543 nmode = V8SImode;
49545 else if (mode == V8DImode)
49547 umul = gen_vec_widen_umult_even_v16si;
49548 nmode = V16SImode;
49550 else
49551 gcc_unreachable ();
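/* The scalar identity being vectorized: for 64-bit A and B,
   A * B modulo 2^64 equals
   LO(A)*LO(B) + ((HI(A)*LO(B) + HI(B)*LO(A)) << 32),
   with LO/HI the low/high 32-bit halves; each 32x32->64 partial product
   maps to one unsigned widening even multiply.  */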
49554 /* Multiply low parts. */
49555 t1 = gen_reg_rtx (mode);
49556 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
49558 /* Shift input vectors right 32 bits so we can multiply high parts. */
49559 t6 = GEN_INT (32);
49560 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
49561 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
49563 /* Multiply high parts by low parts. */
49564 t4 = gen_reg_rtx (mode);
49565 t5 = gen_reg_rtx (mode);
49566 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
49567 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
49569 /* Combine and shift the highparts back. */
49570 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
49571 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
49573 /* Combine high and low parts. */
49574 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
49577 set_unique_reg_note (get_last_insn (), REG_EQUAL,
49578 gen_rtx_MULT (mode, op1, op2));
49581 /* Return 1 if control transfer instruction INSN
49582 should be encoded with notrack prefix. */
49584 static bool
49585 ix86_notrack_prefixed_insn_p (rtx insn)
49587 if (!insn || !((flag_cf_protection & CF_BRANCH)))
49588 return false;
49590 if (CALL_P (insn))
49592 rtx call = get_call_rtx_from (insn);
49593 gcc_assert (call != NULL_RTX);
49594 rtx addr = XEXP (call, 0);
49596 /* Do not emit 'notrack' if it's not an indirect call. */
49597 if (MEM_P (addr)
49598 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49599 return false;
49600 else
49601 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
49604 if (JUMP_P (insn) && !flag_cet_switch)
49606 rtx target = JUMP_LABEL (insn);
49607 if (target == NULL_RTX || ANY_RETURN_P (target))
49608 return false;
49610 /* Check the jump is a switch table. */
49611 rtx_insn *label = as_a<rtx_insn *> (target);
49612 rtx_insn *table = next_insn (label);
49613 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
49614 return false;
49615 else
49616 return true;
49618 return false;
49621 /* Calculate integer abs() using only SSE2 instructions. */
49623 void
49624 ix86_expand_sse2_abs (rtx target, rtx input)
49626 machine_mode mode = GET_MODE (target);
49627 rtx tmp0, tmp1, x;
49629 switch (mode)
49631 case E_V2DImode:
49632 case E_V4DImode:
49633 /* For 64-bit signed integer X, with SSE4.2 use
49634 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
49635 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
49636 32, and use a logical right shift followed by a negation (arithmetic
49637 right shift is unimplemented for 64-bit elements) to form the sign mask. */
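/* Small worked example of the non-SSE4.2 path: for X = -5 the logical
   shift gives 1, the negation turns that into the all-ones mask,
   (-5 ^ -1) == 4 and 4 - (-1) == 5.  */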
49638 if (TARGET_SSE4_2)
49640 tmp0 = gen_reg_rtx (mode);
49641 tmp1 = gen_reg_rtx (mode);
49642 emit_move_insn (tmp1, CONST0_RTX (mode));
49643 if (mode == E_V2DImode)
49644 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
49645 else
49646 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
49648 else
49650 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
49651 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
49652 - 1), NULL, 0, OPTAB_DIRECT);
49653 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
49656 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49657 NULL, 0, OPTAB_DIRECT);
49658 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49659 target, 0, OPTAB_DIRECT);
49660 break;
49662 case E_V4SImode:
49663 /* For 32-bit signed integer X, the best way to calculate the absolute
49664 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
49665 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
49666 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
49667 NULL, 0, OPTAB_DIRECT);
49668 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
49669 NULL, 0, OPTAB_DIRECT);
49670 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
49671 target, 0, OPTAB_DIRECT);
49672 break;
49674 case E_V8HImode:
49675 /* For 16-bit signed integer X, the best way to calculate the absolute
49676 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
49677 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49679 x = expand_simple_binop (mode, SMAX, tmp0, input,
49680 target, 0, OPTAB_DIRECT);
49681 break;
49683 case E_V16QImode:
49684 /* For 8-bit signed integer X, the best way to calculate the absolute
49685 value of X is min ((unsigned char) X, (unsigned char) (-X)),
49686 as SSE2 provides the PMINUB insn. */
49687 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
49689 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
49690 target, 0, OPTAB_DIRECT);
49691 break;
49693 default:
49694 gcc_unreachable ();
49697 if (x != target)
49698 emit_move_insn (target, x);
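/* Scalar sketch of the identities used above (illustration only, not part
   of the build): for a W-bit signed X, M = X >> (W-1) is 0 or -1 under an
   arithmetic shift, and (X ^ M) - M equals |X|; the 16-bit lanes instead
   use max (X, -X) and the 8-bit lanes umin ((unsigned) X, (unsigned) -X).  */
#if 0
static int
abs_via_sign_mask (int x)
{
  int m = x >> (sizeof (int) * 8 - 1);	/* arithmetic shift: 0 or -1 */
  return (x ^ m) - m;
}
#endif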
49701 /* Expand an extract from a vector register through pextr insn.
49702 Return true if successful. */
49704 bool
49705 ix86_expand_pextr (rtx *operands)
49707 rtx dst = operands[0];
49708 rtx src = operands[1];
49710 unsigned int size = INTVAL (operands[2]);
49711 unsigned int pos = INTVAL (operands[3]);
49713 if (SUBREG_P (dst))
49715 /* Reject non-lowpart subregs. */
49716 if (SUBREG_BYTE (dst) > 0)
49717 return false;
49718 dst = SUBREG_REG (dst);
49721 if (SUBREG_P (src))
49723 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
49724 src = SUBREG_REG (src);
49727 switch (GET_MODE (src))
49729 case E_V16QImode:
49730 case E_V8HImode:
49731 case E_V4SImode:
49732 case E_V2DImode:
49733 case E_V1TImode:
49734 case E_TImode:
49736 machine_mode srcmode, dstmode;
49737 rtx d, pat;
49739 if (!int_mode_for_size (size, 0).exists (&dstmode))
49740 return false;
49742 switch (dstmode)
49744 case E_QImode:
49745 if (!TARGET_SSE4_1)
49746 return false;
49747 srcmode = V16QImode;
49748 break;
49750 case E_HImode:
49751 if (!TARGET_SSE2)
49752 return false;
49753 srcmode = V8HImode;
49754 break;
49756 case E_SImode:
49757 if (!TARGET_SSE4_1)
49758 return false;
49759 srcmode = V4SImode;
49760 break;
49762 case E_DImode:
49763 gcc_assert (TARGET_64BIT);
49764 if (!TARGET_SSE4_1)
49765 return false;
49766 srcmode = V2DImode;
49767 break;
49769 default:
49770 return false;
49773 /* Reject extractions from misaligned positions. */
49774 if (pos & (size-1))
49775 return false;
49777 if (GET_MODE (dst) == dstmode)
49778 d = dst;
49779 else
49780 d = gen_reg_rtx (dstmode);
49782 /* Construct insn pattern. */
49783 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
49784 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
49786 /* Let the rtl optimizers know about the zero extension performed. */
49787 if (dstmode == QImode || dstmode == HImode)
49789 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
49790 d = gen_lowpart (SImode, d);
49793 emit_insn (gen_rtx_SET (d, pat));
49795 if (d != dst)
49796 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49797 return true;
49800 default:
49801 return false;
49805 /* Expand an insert into a vector register through pinsr insn.
49806 Return true if successful. */
49808 bool
49809 ix86_expand_pinsr (rtx *operands)
49811 rtx dst = operands[0];
49812 rtx src = operands[3];
49814 unsigned int size = INTVAL (operands[1]);
49815 unsigned int pos = INTVAL (operands[2]);
49817 if (SUBREG_P (dst))
49819 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
49820 dst = SUBREG_REG (dst);
49823 switch (GET_MODE (dst))
49825 case E_V16QImode:
49826 case E_V8HImode:
49827 case E_V4SImode:
49828 case E_V2DImode:
49829 case E_V1TImode:
49830 case E_TImode:
49832 machine_mode srcmode, dstmode;
49833 rtx (*pinsr)(rtx, rtx, rtx, rtx);
49834 rtx d;
49836 if (!int_mode_for_size (size, 0).exists (&srcmode))
49837 return false;
49839 switch (srcmode)
49841 case E_QImode:
49842 if (!TARGET_SSE4_1)
49843 return false;
49844 dstmode = V16QImode;
49845 pinsr = gen_sse4_1_pinsrb;
49846 break;
49848 case E_HImode:
49849 if (!TARGET_SSE2)
49850 return false;
49851 dstmode = V8HImode;
49852 pinsr = gen_sse2_pinsrw;
49853 break;
49855 case E_SImode:
49856 if (!TARGET_SSE4_1)
49857 return false;
49858 dstmode = V4SImode;
49859 pinsr = gen_sse4_1_pinsrd;
49860 break;
49862 case E_DImode:
49863 gcc_assert (TARGET_64BIT);
49864 if (!TARGET_SSE4_1)
49865 return false;
49866 dstmode = V2DImode;
49867 pinsr = gen_sse4_1_pinsrq;
49868 break;
49870 default:
49871 return false;
49874 /* Reject insertions to misaligned positions. */
49875 if (pos & (size-1))
49876 return false;
49878 if (SUBREG_P (src))
49880 unsigned int srcpos = SUBREG_BYTE (src);
49882 if (srcpos > 0)
49884 rtx extr_ops[4];
49886 extr_ops[0] = gen_reg_rtx (srcmode);
49887 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
49888 extr_ops[2] = GEN_INT (size);
49889 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
49891 if (!ix86_expand_pextr (extr_ops))
49892 return false;
49894 src = extr_ops[0];
49896 else
49897 src = gen_lowpart (srcmode, SUBREG_REG (src));
49900 if (GET_MODE (dst) == dstmode)
49901 d = dst;
49902 else
49903 d = gen_reg_rtx (dstmode);
49905 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
49906 gen_lowpart (srcmode, src),
49907 GEN_INT (1 << (pos / size))));
49908 if (d != dst)
49909 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
49910 return true;
49913 default:
49914 return false;
49918 /* This function returns the calling-ABI-specific va_list type node,
49919 i.e. the va_list type appropriate for FNDECL. */
49921 static tree
49922 ix86_fn_abi_va_list (tree fndecl)
49924 if (!TARGET_64BIT)
49925 return va_list_type_node;
49926 gcc_assert (fndecl != NULL_TREE);
49928 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
49929 return ms_va_list_type_node;
49930 else
49931 return sysv_va_list_type_node;
49934 /* Returns the canonical va_list type specified by TYPE. If there
49935 is no valid TYPE provided, it returns NULL_TREE. */
49937 static tree
49938 ix86_canonical_va_list_type (tree type)
49940 if (TARGET_64BIT)
49942 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
49943 return ms_va_list_type_node;
49945 if ((TREE_CODE (type) == ARRAY_TYPE
49946 && integer_zerop (array_type_nelts (type)))
49947 || POINTER_TYPE_P (type))
49949 tree elem_type = TREE_TYPE (type);
49950 if (TREE_CODE (elem_type) == RECORD_TYPE
49951 && lookup_attribute ("sysv_abi va_list",
49952 TYPE_ATTRIBUTES (elem_type)))
49953 return sysv_va_list_type_node;
49956 return NULL_TREE;
49959 return std_canonical_va_list_type (type);
49962 /* Iterate through the target-specific builtin types for va_list.
49963 IDX denotes the iterator, *PTREE is set to the result type of
49964 the va_list builtin, and *PNAME to its internal name.
49965 Returns zero if there is no element for this index, otherwise
49966 IDX should be increased upon the next call.
49967 Note, do not iterate a base builtin's name like __builtin_va_list.
49968 Used from c_common_nodes_and_builtins. */
49970 static int
49971 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
49973 if (TARGET_64BIT)
49975 switch (idx)
49977 default:
49978 break;
49980 case 0:
49981 *ptree = ms_va_list_type_node;
49982 *pname = "__builtin_ms_va_list";
49983 return 1;
49985 case 1:
49986 *ptree = sysv_va_list_type_node;
49987 *pname = "__builtin_sysv_va_list";
49988 return 1;
49992 return 0;
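/* Illustrative (non-compiled) use of the 64-bit-only builtins enumerated
   above: a varargs function with the ms_abi attribute uses
   __builtin_ms_va_list as its va_list type together with the generic
   va_start/va_arg/va_end builtins.  Sketch under those assumptions.  */
#if 0
int __attribute__ ((ms_abi))
sum_ms_varargs (int n, ...)
{
  __builtin_ms_va_list ap;
  int s = 0;
  __builtin_va_start (ap, n);
  while (n-- > 0)
    s += __builtin_va_arg (ap, int);
  __builtin_va_end (ap);
  return s;
}
#endif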
49995 #undef TARGET_SCHED_DISPATCH
49996 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
49997 #undef TARGET_SCHED_DISPATCH_DO
49998 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
49999 #undef TARGET_SCHED_REASSOCIATION_WIDTH
50000 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
50001 #undef TARGET_SCHED_REORDER
50002 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
50003 #undef TARGET_SCHED_ADJUST_PRIORITY
50004 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
50005 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
50006 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
50007 ix86_dependencies_evaluation_hook
50010 /* Implementation of the reassociation_width target hook, used by the
50011 reassoc pass to identify the level of parallelism in a reassociated
50012 tree. The statement's tree_code is passed in OP; the type of its
50013 arguments is passed in MODE. */
50015 static int
50016 ix86_reassociation_width (unsigned int op, machine_mode mode)
50018 int width = 1;
50019 /* Vector part. */
50020 if (VECTOR_MODE_P (mode))
50022 int div = 1;
50023 if (INTEGRAL_MODE_P (mode))
50024 width = ix86_cost->reassoc_vec_int;
50025 else if (FLOAT_MODE_P (mode))
50026 width = ix86_cost->reassoc_vec_fp;
50028 if (width == 1)
50029 return 1;
50031 /* Integer vector instructions execute in the FP unit
50032 and can execute 3 additions and one multiplication per cycle. */
50033 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
50034 && op != PLUS && op != MINUS)
50035 return 1;
50037 /* Account for targets that split wide vectors into multiple parts. */
50038 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
50039 div = GET_MODE_BITSIZE (mode) / 128;
50040 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
50041 div = GET_MODE_BITSIZE (mode) / 64;
50042 width = (width + div - 1) / div;
50044 /* Scalar part. */
50045 else if (INTEGRAL_MODE_P (mode))
50046 width = ix86_cost->reassoc_int;
50047 else if (FLOAT_MODE_P (mode))
50048 width = ix86_cost->reassoc_fp;
50050 /* Avoid using too many registers in 32bit mode. */
50051 if (!TARGET_64BIT && width > 2)
50052 width = 2;
50053 return width;
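/* Worked example of the scaling above, with an assumed cost-table value:
   on a TARGET_AVX128_OPTIMAL tuning with reassoc_vec_fp == 4 and a 256-bit
   vector mode, div == 256 / 128 == 2, so the reported width is
   (4 + 2 - 1) / 2 == 2; in 32-bit mode the result would then still be
   capped at 2.  */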
50056 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
50057 place emms and femms instructions. */
50059 static machine_mode
50060 ix86_preferred_simd_mode (scalar_mode mode)
50062 if (!TARGET_SSE)
50063 return word_mode;
50065 switch (mode)
50067 case E_QImode:
50068 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50069 return V64QImode;
50070 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50071 return V32QImode;
50072 else
50073 return V16QImode;
50075 case E_HImode:
50076 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
50077 return V32HImode;
50078 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50079 return V16HImode;
50080 else
50081 return V8HImode;
50083 case E_SImode:
50084 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50085 return V16SImode;
50086 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50087 return V8SImode;
50088 else
50089 return V4SImode;
50091 case E_DImode:
50092 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50093 return V8DImode;
50094 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50095 return V4DImode;
50096 else
50097 return V2DImode;
50099 case E_SFmode:
50100 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50101 return V16SFmode;
50102 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50103 return V8SFmode;
50104 else
50105 return V4SFmode;
50107 case E_DFmode:
50108 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50109 return V8DFmode;
50110 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50111 return V4DFmode;
50112 else if (TARGET_SSE2)
50113 return V2DFmode;
50114 /* FALLTHRU */
50116 default:
50117 return word_mode;
50121 /* All CPUs prefer to avoid cross-lane operations, so perform reductions
50122 of the upper against the lower halves down to SSE register size. */
50124 static machine_mode
50125 ix86_split_reduction (machine_mode mode)
50127 /* Reduce lowpart against highpart until we reach SSE reg width to
50128 avoid cross-lane operations. */
50129 switch (mode)
50131 case E_V8DImode:
50132 case E_V4DImode:
50133 return V2DImode;
50134 case E_V16SImode:
50135 case E_V8SImode:
50136 return V4SImode;
50137 case E_V32HImode:
50138 case E_V16HImode:
50139 return V8HImode;
50140 case E_V64QImode:
50141 case E_V32QImode:
50142 return V16QImode;
50143 case E_V16SFmode:
50144 case E_V8SFmode:
50145 return V4SFmode;
50146 case E_V8DFmode:
50147 case E_V4DFmode:
50148 return V2DFmode;
50149 default:
50150 return mode;
50154 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
50155 vectors. If AVX512F is enabled then try vectorizing with 512bit,
50156 256bit and 128bit vectors. */
50158 static void
50159 ix86_autovectorize_vector_sizes (vector_sizes *sizes)
50161 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
50163 sizes->safe_push (64);
50164 sizes->safe_push (32);
50165 sizes->safe_push (16);
50167 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
50169 sizes->safe_push (32);
50170 sizes->safe_push (16);
50174 /* Implementation of targetm.vectorize.get_mask_mode. */
50176 static opt_machine_mode
50177 ix86_get_mask_mode (poly_uint64 nunits, poly_uint64 vector_size)
50179 unsigned elem_size = vector_size / nunits;
50181 /* Scalar mask case. */
50182 if ((TARGET_AVX512F && vector_size == 64)
50183 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
50185 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
50186 return smallest_int_mode_for_size (nunits);
50189 scalar_int_mode elem_mode
50190 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
50192 gcc_assert (elem_size * nunits == vector_size);
50194 return mode_for_vector (elem_mode, nunits);
50199 /* Return class of registers which could be used for pseudo of MODE
50200 and of class RCLASS for spilling instead of memory. Return NO_REGS
50201 if it is not possible or not profitable. */
50203 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50205 static reg_class_t
50206 ix86_spill_class (reg_class_t rclass, machine_mode mode)
50208 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
50209 && TARGET_SSE2
50210 && TARGET_INTER_UNIT_MOVES_TO_VEC
50211 && TARGET_INTER_UNIT_MOVES_FROM_VEC
50212 && (mode == SImode || (TARGET_64BIT && mode == DImode))
50213 && INTEGER_CLASS_P (rclass))
50214 return ALL_SSE_REGS;
50215 return NO_REGS;
50218 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
50219 but returns a lower bound. */
50221 static unsigned int
50222 ix86_max_noce_ifcvt_seq_cost (edge e)
50224 bool predictable_p = predictable_edge_p (e);
50226 enum compiler_param param
50227 = (predictable_p
50228 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
50229 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
50231 /* If we have a parameter set, use that, otherwise take a guess using
50232 BRANCH_COST. */
50233 if (global_options_set.x_param_values[param])
50234 return PARAM_VALUE (param);
50235 else
50236 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
50239 /* Return true if SEQ is a good candidate as a replacement for the
50240 if-convertible sequence described in IF_INFO. */
50242 static bool
50243 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
50245 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
50247 int cmov_cnt = 0;
50248 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
50249 Maybe we should allow even more conditional moves as long as they
50250 are used far enough not to stall the CPU, or also consider
50251 IF_INFO->TEST_BB succ edge probabilities. */
50252 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
50254 rtx set = single_set (insn);
50255 if (!set)
50256 continue;
50257 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
50258 continue;
50259 rtx src = SET_SRC (set);
50260 machine_mode mode = GET_MODE (src);
50261 if (GET_MODE_CLASS (mode) != MODE_INT
50262 && GET_MODE_CLASS (mode) != MODE_FLOAT)
50263 continue;
50264 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
50265 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
50266 continue;
50267 /* insn is CMOV or FCMOV. */
50268 if (++cmov_cnt > 1)
50269 return false;
50272 return default_noce_conversion_profitable_p (seq, if_info);
50275 /* Implement targetm.vectorize.init_cost. */
50277 static void *
50278 ix86_init_cost (struct loop *)
50280 unsigned *cost = XNEWVEC (unsigned, 3);
50281 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
50282 return cost;
50285 /* Implement targetm.vectorize.add_stmt_cost. */
50287 static unsigned
50288 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
50289 struct _stmt_vec_info *stmt_info, int misalign,
50290 enum vect_cost_model_location where)
50292 unsigned *cost = (unsigned *) data;
50293 unsigned retval = 0;
50295 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
50296 int stmt_cost = -1;
50298 bool fp = false;
50299 machine_mode mode = TImode;
50301 if (vectype != NULL)
50303 fp = FLOAT_TYPE_P (vectype);
50304 mode = TYPE_MODE (vectype);
50307 if ((kind == vector_stmt || kind == scalar_stmt)
50308 && stmt_info
50309 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
50311 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
50312 /*machine_mode inner_mode = mode;
50313 if (VECTOR_MODE_P (mode))
50314 inner_mode = GET_MODE_INNER (mode);*/
50316 switch (subcode)
50318 case PLUS_EXPR:
50319 case POINTER_PLUS_EXPR:
50320 case MINUS_EXPR:
50321 if (kind == scalar_stmt)
50323 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50324 stmt_cost = ix86_cost->addss;
50325 else if (X87_FLOAT_MODE_P (mode))
50326 stmt_cost = ix86_cost->fadd;
50327 else
50328 stmt_cost = ix86_cost->add;
50330 else
50331 stmt_cost = ix86_vec_cost (mode,
50332 fp ? ix86_cost->addss
50333 : ix86_cost->sse_op,
50334 true);
50335 break;
50337 case MULT_EXPR:
50338 case WIDEN_MULT_EXPR:
50339 case MULT_HIGHPART_EXPR:
50340 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
50341 break;
50342 case NEGATE_EXPR:
50343 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50344 stmt_cost = ix86_cost->sse_op;
50345 else if (X87_FLOAT_MODE_P (mode))
50346 stmt_cost = ix86_cost->fchs;
50347 else if (VECTOR_MODE_P (mode))
50348 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50349 else
50350 stmt_cost = ix86_cost->add;
50351 break;
50352 case TRUNC_DIV_EXPR:
50353 case CEIL_DIV_EXPR:
50354 case FLOOR_DIV_EXPR:
50355 case ROUND_DIV_EXPR:
50356 case TRUNC_MOD_EXPR:
50357 case CEIL_MOD_EXPR:
50358 case FLOOR_MOD_EXPR:
50359 case RDIV_EXPR:
50360 case ROUND_MOD_EXPR:
50361 case EXACT_DIV_EXPR:
50362 stmt_cost = ix86_division_cost (ix86_cost, mode);
50363 break;
50365 case RSHIFT_EXPR:
50366 case LSHIFT_EXPR:
50367 case LROTATE_EXPR:
50368 case RROTATE_EXPR:
50370 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
50371 stmt_cost = ix86_shift_rotate_cost
50372 (ix86_cost, mode,
50373 TREE_CODE (op2) == INTEGER_CST,
50374 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
50375 true, false, false, NULL, NULL);
50377 break;
50378 case NOP_EXPR:
50379 /* Only sign-conversions are free. */
50380 if (tree_nop_conversion_p
50381 (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
50382 TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
50383 stmt_cost = 0;
50384 break;
50386 case BIT_IOR_EXPR:
50387 case ABS_EXPR:
50388 case ABSU_EXPR:
50389 case MIN_EXPR:
50390 case MAX_EXPR:
50391 case BIT_XOR_EXPR:
50392 case BIT_AND_EXPR:
50393 case BIT_NOT_EXPR:
50394 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
50395 stmt_cost = ix86_cost->sse_op;
50396 else if (VECTOR_MODE_P (mode))
50397 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
50398 else
50399 stmt_cost = ix86_cost->add;
50400 break;
50401 default:
50402 break;
50406 combined_fn cfn;
50407 if ((kind == vector_stmt || kind == scalar_stmt)
50408 && stmt_info
50409 && stmt_info->stmt
50410 && (cfn = gimple_call_combined_fn (stmt_info->stmt)) != CFN_LAST)
50411 switch (cfn)
50413 case CFN_FMA:
50414 stmt_cost = ix86_vec_cost (mode,
50415 mode == SFmode ? ix86_cost->fmass
50416 : ix86_cost->fmasd,
50417 true);
50418 break;
50419 default:
50420 break;
50423 /* If we do elementwise loads into a vector then we are bound by
50424 latency and execution resources for the many scalar loads
50425 (AGU and load ports). Try to account for this by scaling the
50426 construction cost by the number of elements involved. */
50427 if (kind == vec_construct
50428 && stmt_info
50429 && STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
50430 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
50431 && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST)
50433 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50434 stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype);
50436 if (stmt_cost == -1)
50437 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
50439 /* Penalize DFmode vector operations for Bonnell. */
50440 if (TARGET_BONNELL && kind == vector_stmt
50441 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
50442 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
50444 /* Statements in an inner loop relative to the loop being
50445 vectorized are weighted more heavily. The value here is
50446 arbitrary and could potentially be improved with analysis. */
50447 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
50448 count *= 50; /* FIXME. */
50450 retval = (unsigned) (count * stmt_cost);
50452 /* We need to multiply all vector stmt costs by 1.7 (estimated cost)
50453 for Silvermont, as it has an out-of-order integer pipeline and can execute
50454 two scalar instructions per tick, but has an in-order SIMD pipeline. */
50455 if ((TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS
50456 || TARGET_TREMONT || TARGET_INTEL) && stmt_info && stmt_info->stmt)
50458 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
50459 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
50460 retval = (retval * 17) / 10;
50463 cost[where] += retval;
50465 return retval;
50468 /* Implement targetm.vectorize.finish_cost. */
50470 static void
50471 ix86_finish_cost (void *data, unsigned *prologue_cost,
50472 unsigned *body_cost, unsigned *epilogue_cost)
50474 unsigned *cost = (unsigned *) data;
50475 *prologue_cost = cost[vect_prologue];
50476 *body_cost = cost[vect_body];
50477 *epilogue_cost = cost[vect_epilogue];
50480 /* Implement targetm.vectorize.destroy_cost_data. */
50482 static void
50483 ix86_destroy_cost_data (void *data)
50485 free (data);
50488 /* Validate target specific memory model bits in VAL. */
50490 static unsigned HOST_WIDE_INT
50491 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
50493 enum memmodel model = memmodel_from_int (val);
50494 bool strong;
50496 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
50497 |MEMMODEL_MASK)
50498 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
50500 warning (OPT_Winvalid_memory_model,
50501 "unknown architecture specific memory model");
50502 return MEMMODEL_SEQ_CST;
50504 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
50505 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
50507 warning (OPT_Winvalid_memory_model,
50508 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
50509 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
50511 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
50513 warning (OPT_Winvalid_memory_model,
50514 "HLE_RELEASE not used with RELEASE or stronger memory model");
50515 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
50517 return val;
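/* Illustrative user-level use of the HLE bits validated above (sketch,
   not compiled here); this mirrors the documented elided-lock idiom built
   on __atomic_exchange_n / __atomic_store_n with the __ATOMIC_HLE_* bits.  */
#if 0
static int hle_lock;

static void
hle_acquire (void)
{
  while (__atomic_exchange_n (&hle_lock, 1,
			      __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
    ;
}

static void
hle_release (void)
{
  __atomic_store_n (&hle_lock, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
}
#endif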
50520 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
50521 CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
50522 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
50523 or number of vecsize_mangle variants that should be emitted. */
50525 static int
50526 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
50527 struct cgraph_simd_clone *clonei,
50528 tree base_type, int num)
50530 int ret = 1;
50532 if (clonei->simdlen
50533 && (clonei->simdlen < 2
50534 || clonei->simdlen > 1024
50535 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
50537 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50538 "unsupported simdlen %d", clonei->simdlen);
50539 return 0;
50542 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
50543 if (TREE_CODE (ret_type) != VOID_TYPE)
50544 switch (TYPE_MODE (ret_type))
50546 case E_QImode:
50547 case E_HImode:
50548 case E_SImode:
50549 case E_DImode:
50550 case E_SFmode:
50551 case E_DFmode:
50552 /* case E_SCmode: */
50553 /* case E_DCmode: */
50554 break;
50555 default:
50556 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50557 "unsupported return type %qT for simd", ret_type);
50558 return 0;
50561 tree t;
50562 int i;
50564 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
50565 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
50566 switch (TYPE_MODE (TREE_TYPE (t)))
50568 case E_QImode:
50569 case E_HImode:
50570 case E_SImode:
50571 case E_DImode:
50572 case E_SFmode:
50573 case E_DFmode:
50574 /* case E_SCmode: */
50575 /* case E_DCmode: */
50576 break;
50577 default:
50578 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50579 "unsupported argument type %qT for simd", TREE_TYPE (t));
50580 return 0;
50583 if (!TREE_PUBLIC (node->decl))
50585 /* If the function isn't exported, we can pick up just one ISA
50586 for the clones. */
50587 if (TARGET_AVX512F)
50588 clonei->vecsize_mangle = 'e';
50589 else if (TARGET_AVX2)
50590 clonei->vecsize_mangle = 'd';
50591 else if (TARGET_AVX)
50592 clonei->vecsize_mangle = 'c';
50593 else
50594 clonei->vecsize_mangle = 'b';
50595 ret = 1;
50597 else
50599 clonei->vecsize_mangle = "bcde"[num];
50600 ret = 4;
50602 clonei->mask_mode = VOIDmode;
50603 switch (clonei->vecsize_mangle)
50605 case 'b':
50606 clonei->vecsize_int = 128;
50607 clonei->vecsize_float = 128;
50608 break;
50609 case 'c':
50610 clonei->vecsize_int = 128;
50611 clonei->vecsize_float = 256;
50612 break;
50613 case 'd':
50614 clonei->vecsize_int = 256;
50615 clonei->vecsize_float = 256;
50616 break;
50617 case 'e':
50618 clonei->vecsize_int = 512;
50619 clonei->vecsize_float = 512;
50620 if (TYPE_MODE (base_type) == QImode)
50621 clonei->mask_mode = DImode;
50622 else
50623 clonei->mask_mode = SImode;
50624 break;
50626 if (clonei->simdlen == 0)
50628 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
50629 clonei->simdlen = clonei->vecsize_int;
50630 else
50631 clonei->simdlen = clonei->vecsize_float;
50632 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
50634 else if (clonei->simdlen > 16)
50636 /* For compatibility with ICC, use the same upper bounds
50637 for simdlen. In particular, for CTYPE below, use the return type,
50638 unless the function returns void, in which case use the characteristic
50639 type. If it is possible for given SIMDLEN to pass CTYPE value
50640 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
50641 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
50642 emit corresponding clone. */
50643 tree ctype = ret_type;
50644 if (TREE_CODE (ret_type) == VOID_TYPE)
50645 ctype = base_type;
50646 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
50647 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
50648 cnt /= clonei->vecsize_int;
50649 else
50650 cnt /= clonei->vecsize_float;
50651 if (cnt > (TARGET_64BIT ? 16 : 8))
50653 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
50654 "unsupported simdlen %d", clonei->simdlen);
50655 return 0;
50658 return ret;
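/* User-level sketch of the vecsize_mangle letters chosen above: for an
   exported function, clones for all four ISA levels are emitted ('b' SSE,
   'c' AVX, 'd' AVX2, 'e' AVX512F).  For the declaration below (compiled
   with OpenMP SIMD support enabled), the unmasked clone symbols would be
   along the lines of _ZGVbN2v_f, _ZGVcN4v_f, _ZGVdN4v_f and _ZGVeN8v_f
   per the x86 vector-function ABI mangling; names shown for illustration
   only.  */
#if 0
#pragma omp declare simd
double f (double x);
#endif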
50661 /* Add target attribute to SIMD clone NODE if needed. */
50663 static void
50664 ix86_simd_clone_adjust (struct cgraph_node *node)
50666 const char *str = NULL;
50667 gcc_assert (node->decl == cfun->decl);
50668 switch (node->simdclone->vecsize_mangle)
50670 case 'b':
50671 if (!TARGET_SSE2)
50672 str = "sse2";
50673 break;
50674 case 'c':
50675 if (!TARGET_AVX)
50676 str = "avx";
50677 break;
50678 case 'd':
50679 if (!TARGET_AVX2)
50680 str = "avx2";
50681 break;
50682 case 'e':
50683 if (!TARGET_AVX512F)
50684 str = "avx512f";
50685 break;
50686 default:
50687 gcc_unreachable ();
50689 if (str == NULL)
50690 return;
50691 push_cfun (NULL);
50692 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
50693 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
50694 gcc_assert (ok);
50695 pop_cfun ();
50696 ix86_reset_previous_fndecl ();
50697 ix86_set_current_function (node->decl);
50700 /* If SIMD clone NODE can't be used in a vectorized loop
50701 in the current function, return -1; otherwise return the badness of using it
50702 (0 if it is the most desirable from the vecsize_mangle point of view, 1
50703 slightly less desirable, etc.). */
50705 static int
50706 ix86_simd_clone_usable (struct cgraph_node *node)
50708 switch (node->simdclone->vecsize_mangle)
50710 case 'b':
50711 if (!TARGET_SSE2)
50712 return -1;
50713 if (!TARGET_AVX)
50714 return 0;
50715 return TARGET_AVX2 ? 2 : 1;
50716 case 'c':
50717 if (!TARGET_AVX)
50718 return -1;
50719 return TARGET_AVX2 ? 1 : 0;
50720 case 'd':
50721 if (!TARGET_AVX2)
50722 return -1;
50723 return 0;
50724 case 'e':
50725 if (!TARGET_AVX512F)
50726 return -1;
50727 return 0;
50728 default:
50729 gcc_unreachable ();
50733 /* This function adjusts the unroll factor based on
50734 the hardware capabilities. For example, bdver3 has
50735 a loop buffer which makes unrolling of smaller
50736 loops less important. This function decides the
50737 unroll factor using the number of memory references
50738 (value 32 is used) as a heuristic. */
50740 static unsigned
50741 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
50743 basic_block *bbs;
50744 rtx_insn *insn;
50745 unsigned i;
50746 unsigned mem_count = 0;
50748 if (!TARGET_ADJUST_UNROLL)
50749 return nunroll;
50751 /* Count the number of memory references within the loop body.
50752 This value determines the unrolling factor for bdver3 and bdver4
50753 architectures. */
50754 subrtx_iterator::array_type array;
50755 bbs = get_loop_body (loop);
50756 for (i = 0; i < loop->num_nodes; i++)
50757 FOR_BB_INSNS (bbs[i], insn)
50758 if (NONDEBUG_INSN_P (insn))
50759 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
50760 if (const_rtx x = *iter)
50761 if (MEM_P (x))
50763 machine_mode mode = GET_MODE (x);
50764 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
50765 if (n_words > 4)
50766 mem_count += 2;
50767 else
50768 mem_count += 1;
50770 free (bbs);
50772 if (mem_count && mem_count <= 32)
50773 return MIN (nunroll, 32 / mem_count);
50775 return nunroll;
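/* Worked example of the heuristic above: a loop body containing eight
   word-sized memory references gives mem_count == 8, so the unroll factor
   requested by the middle end is capped at MIN (nunroll, 32 / 8) == 4.  */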
50779 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
50781 static bool
50782 ix86_float_exceptions_rounding_supported_p (void)
50784 /* For x87 floating point with standard excess precision handling,
50785 there is no adddf3 pattern (since x87 floating point only has
50786 XFmode operations) so the default hook implementation gets this
50787 wrong. */
50788 return TARGET_80387 || TARGET_SSE_MATH;
50791 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
50793 static void
50794 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
50796 if (!TARGET_80387 && !TARGET_SSE_MATH)
50797 return;
50798 tree exceptions_var = create_tmp_var_raw (integer_type_node);
50799 if (TARGET_80387)
50801 tree fenv_index_type = build_index_type (size_int (6));
50802 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
50803 tree fenv_var = create_tmp_var_raw (fenv_type);
50804 TREE_ADDRESSABLE (fenv_var) = 1;
50805 tree fenv_ptr = build_pointer_type (fenv_type);
50806 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
50807 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
50808 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
50809 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
50810 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
50811 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
50812 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
50813 tree hold_fnclex = build_call_expr (fnclex, 0);
50814 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
50815 NULL_TREE, NULL_TREE);
50816 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
50817 hold_fnclex);
50818 *clear = build_call_expr (fnclex, 0);
50819 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
50820 tree fnstsw_call = build_call_expr (fnstsw, 0);
50821 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
50822 sw_var, fnstsw_call);
50823 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
50824 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
50825 exceptions_var, exceptions_x87);
50826 *update = build2 (COMPOUND_EXPR, integer_type_node,
50827 sw_mod, update_mod);
50828 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
50829 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
50831 if (TARGET_SSE_MATH)
50833 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
50834 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
50835 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
50836 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
50837 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
50838 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
50839 mxcsr_orig_var, stmxcsr_hold_call);
50840 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
50841 mxcsr_orig_var,
50842 build_int_cst (unsigned_type_node, 0x1f80));
50843 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
50844 build_int_cst (unsigned_type_node, 0xffffffc0));
50845 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
50846 mxcsr_mod_var, hold_mod_val);
50847 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50848 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
50849 hold_assign_orig, hold_assign_mod);
50850 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
50851 ldmxcsr_hold_call);
50852 if (*hold)
50853 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
50854 else
50855 *hold = hold_all;
50856 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
50857 if (*clear)
50858 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
50859 ldmxcsr_clear_call);
50860 else
50861 *clear = ldmxcsr_clear_call;
50862 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
50863 tree exceptions_sse = fold_convert (integer_type_node,
50864 stxmcsr_update_call);
50865 if (*update)
50867 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
50868 exceptions_var, exceptions_sse);
50869 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
50870 exceptions_var, exceptions_mod);
50871 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
50872 exceptions_assign);
50874 else
50875 *update = build2 (MODIFY_EXPR, integer_type_node,
50876 exceptions_var, exceptions_sse);
50877 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
50878 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50879 ldmxcsr_update_call);
50881 tree atomic_feraiseexcept
50882 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
50883 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
50884 1, exceptions_var);
50885 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
50886 atomic_feraiseexcept_call);
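/* Rough C-level sketch of the SSE half of the sequence constructed above
   (illustration only; the hook builds GENERIC trees rather than calls, and
   the variable names are hypothetical):

     hold:    mxcsr_orig = __builtin_ia32_stmxcsr ();
	      mxcsr_mod = (mxcsr_orig | 0x1f80) & 0xffffffc0;
	      __builtin_ia32_ldmxcsr (mxcsr_mod);    mask all, clear flags
     clear:   __builtin_ia32_ldmxcsr (mxcsr_mod);
     update:  exceptions |= __builtin_ia32_stmxcsr ();
	      __builtin_ia32_ldmxcsr (mxcsr_orig);
	      __atomic_feraiseexcept (exceptions);  */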
50889 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
50890 /* For i386, a common symbol is local only in non-PIE binaries. For
50891 x86-64, a common symbol is local only in non-PIE binaries, or when the
50892 linker supports copy relocations in PIE binaries. */
50894 static bool
50895 ix86_binds_local_p (const_tree exp)
50897 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
50898 (!flag_pic
50899 || (TARGET_64BIT
50900 && HAVE_LD_PIE_COPYRELOC != 0)));
50902 #endif
50904 /* If MEM is in the form of [base+offset], extract the two parts
50905 of the address into BASE and OFFSET; otherwise return false. */
50907 static bool
50908 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
50910 rtx addr;
50912 gcc_assert (MEM_P (mem));
50914 addr = XEXP (mem, 0);
50916 if (GET_CODE (addr) == CONST)
50917 addr = XEXP (addr, 0);
50919 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
50921 *base = addr;
50922 *offset = const0_rtx;
50923 return true;
50926 if (GET_CODE (addr) == PLUS
50927 && (REG_P (XEXP (addr, 0))
50928 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
50929 && CONST_INT_P (XEXP (addr, 1)))
50931 *base = XEXP (addr, 0);
50932 *offset = XEXP (addr, 1);
50933 return true;
50936 return false;
50939 /* Given OPERANDS of a consecutive load/store pair, check if we can merge
50940 them into a move-multiple. LOAD is true if they are load instructions.
50941 MODE is the mode of the memory operands. */
50943 bool
50944 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
50945 machine_mode mode)
50947 HOST_WIDE_INT offval_1, offval_2, msize;
50948 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
50950 if (load)
50952 mem_1 = operands[1];
50953 mem_2 = operands[3];
50954 reg_1 = operands[0];
50955 reg_2 = operands[2];
50957 else
50959 mem_1 = operands[0];
50960 mem_2 = operands[2];
50961 reg_1 = operands[1];
50962 reg_2 = operands[3];
50965 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
50967 if (REGNO (reg_1) != REGNO (reg_2))
50968 return false;
50970 /* Check if the addresses are in the form of [base+offset]. */
50971 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
50972 return false;
50973 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
50974 return false;
50976 /* Check if the bases are the same. */
50977 if (!rtx_equal_p (base_1, base_2))
50978 return false;
50980 offval_1 = INTVAL (offset_1);
50981 offval_2 = INTVAL (offset_2);
50982 msize = GET_MODE_SIZE (mode);
50983 /* Check if mem_1 is adjacent to mem_2 and mem_1 has lower address. */
50984 if (offval_1 + msize != offval_2)
50985 return false;
50987 return true;
50990 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
50992 static bool
50993 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
50994 optimization_type opt_type)
50996 switch (op)
50998 case asin_optab:
50999 case acos_optab:
51000 case log1p_optab:
51001 case exp_optab:
51002 case exp10_optab:
51003 case exp2_optab:
51004 case expm1_optab:
51005 case ldexp_optab:
51006 case scalb_optab:
51007 case round_optab:
51008 return opt_type == OPTIMIZE_FOR_SPEED;
51010 case rint_optab:
51011 if (SSE_FLOAT_MODE_P (mode1)
51012 && TARGET_SSE_MATH
51013 && !flag_trapping_math
51014 && !TARGET_SSE4_1)
51015 return opt_type == OPTIMIZE_FOR_SPEED;
51016 return true;
51018 case floor_optab:
51019 case ceil_optab:
51020 case btrunc_optab:
51021 if (SSE_FLOAT_MODE_P (mode1)
51022 && TARGET_SSE_MATH
51023 && !flag_trapping_math
51024 && TARGET_SSE4_1)
51025 return true;
51026 return opt_type == OPTIMIZE_FOR_SPEED;
51028 case rsqrt_optab:
51029 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
51031 default:
51032 return true;
51036 /* Address space support.
51038 This is not "far pointers" in the 16-bit sense, but an easy way
51039 to use %fs and %gs segment prefixes. Therefore:
51041 (a) All address spaces have the same modes,
51042 (b) All address spaces have the same address forms,
51043 (c) While %fs and %gs are technically subsets of the generic
51044 address space, they are probably not subsets of each other.
51045 (d) Since we have no access to the segment base register values
51046 without resorting to a system call, we cannot convert a
51047 non-default address space to a default address space.
51048 Therefore we do not claim %fs or %gs are subsets of generic.
51050 Therefore we can (mostly) use the default hooks. */
51052 /* All use of segmentation is assumed to make address 0 valid. */
51054 static bool
51055 ix86_addr_space_zero_address_valid (addr_space_t as)
51057 return as != ADDR_SPACE_GENERIC;
51060 static void
51061 ix86_init_libfuncs (void)
51063 if (TARGET_64BIT)
51065 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
51066 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
51068 else
51070 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
51071 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
51074 #if TARGET_MACHO
51075 darwin_rename_builtins ();
51076 #endif
51079 /* Generate call to __divmoddi4. */
51081 static void
51082 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
51083 rtx op0, rtx op1,
51084 rtx *quot_p, rtx *rem_p)
51086 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
51088 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
51089 mode,
51090 op0, GET_MODE (op0),
51091 op1, GET_MODE (op1),
51092 XEXP (rem, 0), Pmode);
51093 *quot_p = quot;
51094 *rem_p = rem;
51097 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
51098 FPU, assume that the fpcw is set to extended precision; when using
51099 only SSE, rounding is correct; when using both SSE and the FPU,
51100 the rounding precision is indeterminate, since either may be chosen
51101 apparently at random. */
51103 static enum flt_eval_method
51104 ix86_excess_precision (enum excess_precision_type type)
51106 switch (type)
51108 case EXCESS_PRECISION_TYPE_FAST:
51109 /* The fastest type to promote to will always be the native type,
51110 whether that occurs with implicit excess precision or
51111 otherwise. */
51112 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51113 case EXCESS_PRECISION_TYPE_STANDARD:
51114 case EXCESS_PRECISION_TYPE_IMPLICIT:
51115 /* Otherwise, the excess precision we want when we are
51116 in a standards compliant mode, and the implicit precision we
51117 provide would be identical were it not for the unpredictable
51118 cases. */
51119 if (!TARGET_80387)
51120 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51121 else if (!TARGET_MIX_SSE_I387)
51123 if (!TARGET_SSE_MATH)
51124 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
51125 else if (TARGET_SSE2)
51126 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
51129 /* If we are in standards compliant mode, but we know we will
51130 calculate in unpredictable precision, return
51131 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
51132 excess precision if the target can't guarantee it will honor
51133 it. */
51134 return (type == EXCESS_PRECISION_TYPE_STANDARD
51135 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
51136 : FLT_EVAL_METHOD_UNPREDICTABLE);
51137 default:
51138 gcc_unreachable ();
51141 return FLT_EVAL_METHOD_UNPREDICTABLE;
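/* Concrete example of the cases above (sketch): on ia32 with x87 math and
   no SSE math, both the standard and the implicit excess precision promote
   float/double arithmetic to long double
   (FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE, i.e. FLT_EVAL_METHOD == 2),
   while with SSE math on an SSE2 target no promotion beyond the native
   type is needed.  */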
51144 /* Implement PUSH_ROUNDING. On the 386 we have a pushw instruction that
51145 decrements the stack pointer by exactly 2 regardless of alignment; there is no pushb.
51147 But as the CIE data alignment factor on this arch is -4 for 32-bit targets
51148 and -8 for 64-bit targets, we need to make sure all stack pointer adjustments
51149 are a multiple of 4 for 32-bit targets and 8 for 64-bit targets. */
51151 poly_int64
51152 ix86_push_rounding (poly_int64 bytes)
51154 return ROUND_UP (bytes, UNITS_PER_WORD);
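/* Example of the rounding above: a request to push a 2-byte operand is
   rounded up to UNITS_PER_WORD, i.e. 4 bytes for 32-bit and 8 bytes for
   64-bit targets, so every push adjusts the stack pointer by a multiple
   of the CIE data alignment factor.  */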
51157 /* Target-specific selftests. */
51159 #if CHECKING_P
51161 namespace selftest {
51163 /* Verify that hard regs are dumped as expected (in compact mode). */
51165 static void
51166 ix86_test_dumping_hard_regs ()
51168 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
51169 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
51172 /* Test dumping an insn with repeated references to the same SCRATCH,
51173 to verify the rtx_reuse code. */
51175 static void
51176 ix86_test_dumping_memory_blockage ()
51178 set_new_first_and_last_insn (NULL, NULL);
51180 rtx pat = gen_memory_blockage ();
51181 rtx_reuse_manager r;
51182 r.preprocess (pat);
51184 /* Verify that the repeated references to the SCRATCH use
51185 reuse IDs. The first should be prefixed with a reuse ID,
51186 and the second should be dumped as a "reuse_rtx" of that ID.
51187 The expected string assumes Pmode == DImode. */
51188 if (Pmode == DImode)
51189 ASSERT_RTL_DUMP_EQ_WITH_REUSE
51190 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
51191 " (unspec:BLK [\n"
51192 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
51193 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
51196 /* Verify loading an RTL dump; specifically a dump of copying
51197 a param on x86_64 from a hard reg into the frame.
51198 This test is target-specific since the dump contains target-specific
51199 hard reg names. */
51201 static void
51202 ix86_test_loading_dump_fragment_1 ()
51204 rtl_dump_test t (SELFTEST_LOCATION,
51205 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
51207 rtx_insn *insn = get_insn_by_uid (1);
51209 /* The block structure and indentation here is purely for
51210 readability; it mirrors the structure of the rtx. */
51211 tree mem_expr;
51213 rtx pat = PATTERN (insn);
51214 ASSERT_EQ (SET, GET_CODE (pat));
51216 rtx dest = SET_DEST (pat);
51217 ASSERT_EQ (MEM, GET_CODE (dest));
51218 /* Verify the "/c" was parsed. */
51219 ASSERT_TRUE (RTX_FLAG (dest, call));
51220 ASSERT_EQ (SImode, GET_MODE (dest));
51222 rtx addr = XEXP (dest, 0);
51223 ASSERT_EQ (PLUS, GET_CODE (addr));
51224 ASSERT_EQ (DImode, GET_MODE (addr));
51226 rtx lhs = XEXP (addr, 0);
51227 /* Verify that the "frame" REG was consolidated. */
51228 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
51231 rtx rhs = XEXP (addr, 1);
51232 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
51233 ASSERT_EQ (-4, INTVAL (rhs));
51236 /* Verify the "[1 i+0 S4 A32]" was parsed. */
51237 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
51238 /* "i" should have been handled by synthesizing a global int
51239 variable named "i". */
51240 mem_expr = MEM_EXPR (dest);
51241 ASSERT_NE (mem_expr, NULL);
51242 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
51243 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
51244 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
51245 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
51246 /* "+0". */
51247 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
51248 ASSERT_EQ (0, MEM_OFFSET (dest));
51249 /* "S4". */
51250 ASSERT_EQ (4, MEM_SIZE (dest));
51251 /* "A32". */
51252 ASSERT_EQ (32, MEM_ALIGN (dest));
51255 rtx src = SET_SRC (pat);
51256 ASSERT_EQ (REG, GET_CODE (src));
51257 ASSERT_EQ (SImode, GET_MODE (src));
51258 ASSERT_EQ (5, REGNO (src));
51259 tree reg_expr = REG_EXPR (src);
51260 /* "i" here should point to the same var as for the MEM_EXPR. */
51261 ASSERT_EQ (reg_expr, mem_expr);
51266 /* Verify that the RTL loader copes with a call_insn dump.
51267 This test is target-specific since the dump contains a target-specific
51268 hard reg name. */
51270 static void
51271 ix86_test_loading_call_insn ()
51273 /* The test dump includes register "xmm0", which requires TARGET_SSE
51274 to exist. */
51275 if (!TARGET_SSE)
51276 return;
51278 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
51280 rtx_insn *insn = get_insns ();
51281 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
51283 /* "/j". */
51284 ASSERT_TRUE (RTX_FLAG (insn, jump));
51286 rtx pat = PATTERN (insn);
51287 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
51289 /* Verify REG_NOTES. */
51291 /* "(expr_list:REG_CALL_DECL". */
51292 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
51293 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
51294 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
51296 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
51297 rtx_expr_list *note1 = note0->next ();
51298 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
51300 ASSERT_EQ (NULL, note1->next ());
51303 /* Verify CALL_INSN_FUNCTION_USAGE. */
51305 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
51306 rtx_expr_list *usage
51307 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
51308 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
51309 ASSERT_EQ (DFmode, GET_MODE (usage));
51310 ASSERT_EQ (USE, GET_CODE (usage->element ()));
51311 ASSERT_EQ (NULL, usage->next ());
51315 /* Verify that the RTL loader copes with a dump from print_rtx_function.
51316 This test is target-specific since the dump contains target-specific
51317 hard reg names. */
51319 static void
51320 ix86_test_loading_full_dump ()
51322 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
51324 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51326 rtx_insn *insn_1 = get_insn_by_uid (1);
51327 ASSERT_EQ (NOTE, GET_CODE (insn_1));
51329 rtx_insn *insn_7 = get_insn_by_uid (7);
51330 ASSERT_EQ (INSN, GET_CODE (insn_7));
51331 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
51333 rtx_insn *insn_15 = get_insn_by_uid (15);
51334 ASSERT_EQ (INSN, GET_CODE (insn_15));
51335 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
51337 /* Verify crtl->return_rtx. */
51338 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
51339 ASSERT_EQ (0, REGNO (crtl->return_rtx));
51340 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
51343 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
51344 In particular, verify that it correctly loads the 2nd operand.
51345 This test is target-specific since these are machine-specific
51346 operands (and enums). */
51348 static void
51349 ix86_test_loading_unspec ()
51351 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
51353 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
51355 ASSERT_TRUE (cfun);
51357 /* Test of an UNSPEC. */
51358 rtx_insn *insn = get_insns ();
51359 ASSERT_EQ (INSN, GET_CODE (insn));
51360 rtx set = single_set (insn);
51361 ASSERT_NE (NULL, set);
51362 rtx dst = SET_DEST (set);
51363 ASSERT_EQ (MEM, GET_CODE (dst));
51364 rtx src = SET_SRC (set);
51365 ASSERT_EQ (UNSPEC, GET_CODE (src));
51366 ASSERT_EQ (BLKmode, GET_MODE (src));
51367 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
51369 rtx v0 = XVECEXP (src, 0, 0);
51371 /* Verify that the two uses of the first SCRATCH have pointer
51372 equality. */
51373 rtx scratch_a = XEXP (dst, 0);
51374 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
51376 rtx scratch_b = XEXP (v0, 0);
51377 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
51379 ASSERT_EQ (scratch_a, scratch_b);
51381 /* Verify that the two mems are thus treated as equal. */
51382 ASSERT_TRUE (rtx_equal_p (dst, v0));
51384 /* Verify that the insn is recognized. */
51385 ASSERT_NE (-1, recog_memoized (insn));
51387 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
51388 insn = NEXT_INSN (insn);
51389 ASSERT_EQ (INSN, GET_CODE (insn));
51391 set = single_set (insn);
51392 ASSERT_NE (NULL, set);
51394 src = SET_SRC (set);
51395 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
51396 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
51399 /* Run all target-specific selftests. */
51401 static void
51402 ix86_run_selftests (void)
51404 ix86_test_dumping_hard_regs ();
51405 ix86_test_dumping_memory_blockage ();
51407 /* Various tests of loading RTL dumps, here because they contain
51408 ix86-isms (e.g. names of hard regs). */
51409 ix86_test_loading_dump_fragment_1 ();
51410 ix86_test_loading_call_insn ();
51411 ix86_test_loading_full_dump ();
51412 ix86_test_loading_unspec ();
51415 } // namespace selftest
51417 #endif /* CHECKING_P */
51419 /* Initialize the GCC target structure. */
51420 #undef TARGET_RETURN_IN_MEMORY
51421 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
51423 #undef TARGET_LEGITIMIZE_ADDRESS
51424 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
51426 #undef TARGET_ATTRIBUTE_TABLE
51427 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
51428 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
51429 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
51430 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
51431 # undef TARGET_MERGE_DECL_ATTRIBUTES
51432 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
51433 #endif
51435 #undef TARGET_COMP_TYPE_ATTRIBUTES
51436 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
51438 #undef TARGET_INIT_BUILTINS
51439 #define TARGET_INIT_BUILTINS ix86_init_builtins
51440 #undef TARGET_BUILTIN_DECL
51441 #define TARGET_BUILTIN_DECL ix86_builtin_decl
51442 #undef TARGET_EXPAND_BUILTIN
51443 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
51445 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
51446 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
51447 ix86_builtin_vectorized_function
51449 #undef TARGET_VECTORIZE_BUILTIN_GATHER
51450 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
51452 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
51453 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
51455 #undef TARGET_BUILTIN_RECIPROCAL
51456 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
51458 #undef TARGET_ASM_FUNCTION_EPILOGUE
51459 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
51461 #undef TARGET_ENCODE_SECTION_INFO
51462 #ifndef SUBTARGET_ENCODE_SECTION_INFO
51463 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
51464 #else
51465 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
51466 #endif
51468 #undef TARGET_ASM_OPEN_PAREN
51469 #define TARGET_ASM_OPEN_PAREN ""
51470 #undef TARGET_ASM_CLOSE_PAREN
51471 #define TARGET_ASM_CLOSE_PAREN ""
51473 #undef TARGET_ASM_BYTE_OP
51474 #define TARGET_ASM_BYTE_OP ASM_BYTE
51476 #undef TARGET_ASM_ALIGNED_HI_OP
51477 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
51478 #undef TARGET_ASM_ALIGNED_SI_OP
51479 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
51480 #ifdef ASM_QUAD
51481 #undef TARGET_ASM_ALIGNED_DI_OP
51482 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
51483 #endif
51485 #undef TARGET_PROFILE_BEFORE_PROLOGUE
51486 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
51488 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
51489 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
51491 #undef TARGET_ASM_UNALIGNED_HI_OP
51492 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
51493 #undef TARGET_ASM_UNALIGNED_SI_OP
51494 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
51495 #undef TARGET_ASM_UNALIGNED_DI_OP
51496 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
51498 #undef TARGET_PRINT_OPERAND
51499 #define TARGET_PRINT_OPERAND ix86_print_operand
51500 #undef TARGET_PRINT_OPERAND_ADDRESS
51501 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
51502 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
51503 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
51504 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
51505 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  ia32_multipass_dfa_lookahead
#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall

#undef TARGET_MEMMODEL_CHECK
#define TARGET_MEMMODEL_CHECK ix86_memmodel_check

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
#endif
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true

#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address

#undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
#define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p

#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p

#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#else
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P ix86_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM FLAGS_REG
#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

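/* Hooks describing the target's va_list type and varargs handling.  */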
#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION ix86_excess_precision
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
#define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_INIT_PIC_REG
#define TARGET_INIT_PIC_REG ix86_init_pic_reg
#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_WARN_FUNC_RETURN
#define TARGET_WARN_FUNC_RETURN ix86_warn_func_return

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload
#undef TARGET_SECONDARY_MEMORY_NEEDED
#define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
#undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
#define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

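/* Vectorizer hooks, including the vectorization cost model.  */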
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST ix86_vectorize_vec_perm_const
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_SPLIT_REDUCTION
#define TARGET_VECTORIZE_SPLIT_REDUCTION \
  ix86_split_reduction
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

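/* Hooks for the target attribute and function-specific option handling.  */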
#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_POST_STREAM_IN
#define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_COMPUTE_FRAME_LAYOUT
#define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#undef TARGET_CANONICALIZE_COMPARISON
#define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison

#undef TARGET_LOOP_UNROLL_ADJUST
#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust

/* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657.  */
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class

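/* SIMD clone hooks, used to create vector variants of functions.  */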
#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  ix86_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST \
  ix86_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE \
  ix86_simd_clone_usable

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p

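/* Hooks used by the mode switching pass.  */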
#undef TARGET_MODE_EMIT
#define TARGET_MODE_EMIT ix86_emit_mode_set

#undef TARGET_MODE_NEEDED
#define TARGET_MODE_NEEDED ix86_mode_needed

#undef TARGET_MODE_AFTER
#define TARGET_MODE_AFTER ix86_mode_after

#undef TARGET_MODE_ENTRY
#define TARGET_MODE_ENTRY ix86_mode_entry

#undef TARGET_MODE_EXIT
#define TARGET_MODE_EXIT ix86_mode_exit

#undef TARGET_MODE_PRIORITY
#define TARGET_MODE_PRIORITY ix86_mode_priority

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds

#undef TARGET_OFFLOAD_OPTIONS
#define TARGET_OFFLOAD_OPTIONS \
  ix86_offload_options

#undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
#define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p

#undef TARGET_HARD_REGNO_SCRATCH_OK
#define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok

#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1

#undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
#define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p

#undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
#define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS ix86_init_libfuncs

#undef TARGET_EXPAND_DIVMOD_LIBFUNC
#define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc

#undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
#define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost

#undef TARGET_NOCE_CONVERSION_PROFITABLE_P
#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  ix86_hard_regno_call_part_clobbered

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class

#undef TARGET_STATIC_RTX_ALIGNMENT
#define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment

#undef TARGET_EMPTY_RECORD_P
#define TARGET_EMPTY_RECORD_P ix86_is_empty_record

#undef TARGET_WARN_PARAMETER_PASSING_ABI
#define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
#endif /* #if CHECKING_P */

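/* Build the target hook vector; any hook not redefined above keeps its
   default value from target-def.h.  */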
struct gcc_target targetm = TARGET_INITIALIZER;

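/* Garbage-collection root tables for this file, generated by gengtype.  */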
#include "gt-i386.h"